In [1]:
import os
import json
import dill
import boto3
import numpy as np
import pandas as pd
from datetime import datetime

from lime.lime_tabular import LimeTabularExplainer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier as RFC, GradientBoostingClassifier as GBM

import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt

tf.__version__

'2.3.1'

# Model Creation
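This notebook trains a three-input Keras model (title tokens, body tokens, and hand-engineered features), builds a LIME tabular explainer for the hand-engineered features, saves the model, tokenizer, metadata, and explainer locally, and then syncs them to S3.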

In [None]:
datapath = "Dataset"

# os.listdir() returns entries in arbitrary order. Sort the file names so the
# row order of the concatenated frame is the same on every machine and run.
csv_files = sorted(p for p in os.listdir(datapath) if p.endswith(".csv"))
if not csv_files:
    raise FileNotFoundError(f"No .csv files found in {datapath!r}")

# Stack every CSV into a single frame with a fresh 0..n-1 index.
df = pd.concat(
    [pd.read_csv(os.path.join(datapath, p)) for p in csv_files]
).reset_index(drop=True)

# Lower-case the text columns (bracket assignment avoids the attribute-style
# setter, which silently creates an attribute if the column is missing).
df["body"] = df["body"].str.lower()
df["title"] = df["title"].str.lower()

print(df.shape)

In [None]:
# Word-count distribution at the 0.5%, 50%, 90% and 95% quantiles.
word_count_quantiles = np.quantile(df["word_count"], [0.005, 0.5, 0.9, 0.95])
print(word_count_quantiles)

In [None]:
# Binary target: True when a row's score is at or above the median score.
median_score = df["score"].median()
print("Score median: %0.4f" % median_score)
df["target"] = df["score"].ge(median_score)

print("Target Mean: %0.4f" % df["target"].mean())

# Hand-engineered numeric columns used as the model's third input.
FEATURES = [
    "wh_word_count",
    "sentence_count",
    "word_count",
    "example_count",
    "n_linebreaks",
    "title_word_count",
    "title_question_marks",
    "num_question_marks",
    "n_links",
#     "n_tags",
    "n_lists",
]

# Raw text columns travel with the features until they are tokenized.
TEXT_COLUMNS = ["body", "title"]

x = df[FEATURES + TEXT_COLUMNS]
y = df["target"]

In [None]:
from typing import List

def make_tokenizer(text_vecs: List[List[str]], *args, **kwargs):
    """
    Build a keras ``Tokenizer`` and fit it on the given text columns.

    ``text_vecs`` holds parallel sequences of strings. The i-th entries of
    all sequences are joined with a single space into one document before
    fitting. Any extra positional or keyword arguments are forwarded to the
    ``Tokenizer`` constructor.

    Returns the fitted tokenizer.
    """
    corpus = []
    for parts in zip(*text_vecs):
        corpus.append(" ".join(parts))

    tokenizer = keras.preprocessing.text.Tokenizer(*args, **kwargs)
    tokenizer.fit_on_texts(corpus)
    return tokenizer



def make_model(
        max_body_len: int,
        max_title_len: int,
        vocab_size: int,
        num_handmade_feat: int,
        emb_dim: int = 64,
        dropout_rate: float = 0.4):
    """
    Build the (uncompiled) three-input keras model.

    Title and body token sequences share one embedding table; each is passed
    through dropout, a bidirectional GRU (16 units for the title, 96 for the
    body) and dropout again. Both text encodings are concatenated with the
    hand-made feature vector, batch-normalised, and fed through an
    L2-regularised 64-unit ReLU layer to a single sigmoid output.

    Args:
        max_body_len: length of the body token input.
        max_title_len: length of the title token input.
        vocab_size: ``input_dim`` of the shared embedding.
        num_handmade_feat: width of the hand-made feature input.
        emb_dim: ``output_dim`` of the shared embedding.
        dropout_rate: rate used by every dropout layer.

    Returns:
        A ``keras.Model`` whose inputs are ordered
        ``[title_tokens, body_tokens, features_input]``.
    """
    layers = keras.layers

    # Inputs. Layers are created in the same order as before so that
    # keras' auto-generated layer names are unchanged.
    body_tokens = layers.Input((max_body_len,), name="body_tokens")
    title_tokens = layers.Input((max_title_len,), name="title_tokens")
    handmade = layers.Input((num_handmade_feat,), name="features_input")

    # A single embedding table shared by title and body.
    shared_embedding = layers.Embedding(
        input_dim=vocab_size,
        output_dim=emb_dim,
        name="word_embed"
    )

    def encode_text(tokens, gru_units):
        """Embed -> dropout -> bidirectional GRU -> dropout."""
        hidden = shared_embedding(tokens)
        hidden = layers.Dropout(dropout_rate)(hidden)
        hidden = layers.Bidirectional(layers.GRU(gru_units, activation="tanh"))(hidden)
        return layers.Dropout(dropout_rate)(hidden)

    # Title first, then body (matches the original construction order).
    title_vec = encode_text(title_tokens, 16)
    body_vec = encode_text(body_tokens, 96)

    # Merge the text encodings with the hand-made features.
    merged = layers.Concatenate(axis=1)([title_vec, body_vec, handmade])
    merged = layers.BatchNormalization()(merged)

    # Classification head.
    head = layers.Dense(
        64,
        activation="relu",
        kernel_regularizer=keras.regularizers.l2(1e-4)
    )(merged)
    head = layers.Dropout(dropout_rate)(head)

    prediction = layers.Dense(1, activation="sigmoid")(head)
    return keras.Model(inputs=[title_tokens, body_tokens, handmade], outputs=prediction)



In [None]:
# Preview the first rows of the model inputs: the FEATURES columns plus the raw "body" and "title" text.
x.head()

In [None]:
# Preview the first rows of the boolean target (score >= median score).
y.head()

In [None]:
# Seed numpy's global RNG right before splitting; train_test_split draws from
# it when no random_state is passed.
np.random.seed(854)

# Hold out 20% of the rows for evaluation.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2)

# Raw text columns of each split.
train_bodies, train_titles = xtrain["body"], xtrain["title"]
test_bodies, test_titles = xtest["body"], xtest["title"]

print(xtrain.shape, xtest.shape)

In [None]:
# Shared hyper-parameters for tokenization, padding and the model definition.
VOCAB_SIZE = 7000  # passed as `num_words` to the tokenizer and as `vocab_size` to make_model
MAX_BODY_LEN = 180  # pad/truncate length of body token sequences
MAX_TITLE_LEN = 24  # pad/truncate length of title token sequences
NUM_HANDMADE = len(FEATURES)  # number of hand-engineered feature columns
EMB_DIM = 64  # passed as `emb_dim` to make_model (embedding output dimension)


def df_to_inputs(df: pd.DataFrame, tokenizer):
    """
    Turn a dataframe into the three inputs the model expects.

    The "body" and "title" columns are converted to token-id sequences with
    ``tokenizer`` and post-padded / post-truncated to ``MAX_BODY_LEN`` and
    ``MAX_TITLE_LEN`` respectively.

    Returns a tuple ``(titles, bodies, features)`` where ``features`` is a
    copy of the ``FEATURES`` columns of ``df``.
    """
    def _tokenize_and_pad(column, length):
        sequences = tokenizer.texts_to_sequences(df[column])
        return keras.preprocessing.sequence.pad_sequences(
            sequences,
            maxlen=length,
            padding="post",
            truncating="post"
        )

    body_tokens = _tokenize_and_pad("body", MAX_BODY_LEN)
    title_tokens = _tokenize_and_pad("title", MAX_TITLE_LEN)
    handmade = df[FEATURES].copy()

    return title_tokens, body_tokens, handmade


In [None]:
# Fit one tokenizer on the training bodies and titles only.
tokenize = make_tokenizer(
    [train_bodies, train_titles],
    oov_token="<oov>",
    num_words=VOCAB_SIZE,
)

In [None]:
# Drop any state left over from a previous run of this cell.
keras.backend.clear_session()

# Seed TensorFlow as well as numpy: without this, weight initialisation,
# dropout masks and batch shuffling differ on every run, so the reported
# metrics cannot be reproduced. (Some GPU kernels may still introduce small
# run-to-run differences.)
tf.random.set_seed(854)

# Build the three-input model from the shared hyper-parameters.
model = make_model(
    MAX_BODY_LEN,
    MAX_TITLE_LEN,
    VOCAB_SIZE,
    NUM_HANDMADE,
    EMB_DIM,
    dropout_rate=0.5
)

# Binary classification: sigmoid output trained with binary cross-entropy.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)
model.summary()

In [None]:
# Convert both splits into the (titles, bodies, features) triples the model consumes.
train_inputs, test_inputs = (
    df_to_inputs(split, tokenize) for split in (xtrain, xtest)
)

# Peek at the hand-engineered feature frame of the training split.
train_inputs[-1].head()

In [None]:
# Train for 20 epochs; the held-out split is used as validation data.
hist = model.fit(
    train_inputs,
    ytrain,
    validation_data=(test_inputs, ytest),
    epochs=20,
    batch_size=4096,
)

In [None]:
def _evaluate_split(split_inputs, labels):
    """Threshold the model's outputs at 0.5 and score them against `labels`.

    Returns (boolean predictions, accuracy, confusion matrix).
    """
    predicted = model.predict(split_inputs) > 0.5
    accuracy = accuracy_score(y_true=labels, y_pred=predicted)
    matrix = confusion_matrix(y_true=labels, y_pred=predicted)
    return predicted, accuracy, matrix


train_pred, train_acc, train_mat = _evaluate_split(train_inputs, ytrain)

print("Train set performance:")
print("Accuracy: %0.4f" % train_acc)
print("Confusion matrix: \n", train_mat)
print()

test_pred, test_acc, test_mat = _evaluate_split(test_inputs, ytest)

print("Test set performance:")
print("Accuracy: %0.4f" % test_acc)
print("Confusion matrix: \n", test_mat)


fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(13, 8))

# One panel per metric:
# (axis, train-history key, validation-history key, train label, validation label, title)
panels = [
    (axs[0], "loss", "val_loss", "Train Loss", "Val Loss", "Loss Function"),
    (axs[1], "accuracy", "val_accuracy", "Train Acc", "Val Acc", "Accuracy"),
]

for ax, train_key, val_key, train_label, val_label, panel_title in panels:
    ax.plot(hist.history[train_key], color="blue", label=train_label)
    ax.plot(hist.history[val_key], color="red", label=val_label)
    ax.legend()
    ax.set_title(panel_title)

plt.show()

# LIME tabular explainer built from the hand-engineered training features
# (the last element of the training inputs triple).
train_features = train_inputs[-1]
explainer = LimeTabularExplainer(
    train_features.values,
    feature_names=list(train_features.columns),
)

In [None]:

def make_dir(save_dir: str = "models") -> str:
    """
    Create and return a fresh ``<save_dir>/<YYYY-MM-DD>/model_NN`` directory.

    The index ``NN`` starts at the number of entries already present in
    today's directory, as before. If that name is already taken (for example
    because an earlier run directory was deleted, or the folder holds other
    files), the index is incremented until an unused name is found instead
    of failing with ``FileExistsError``.

    Args:
        save_dir: root directory under which the dated folder is created.

    Returns:
        Path of the newly created directory.
    """
    today = datetime.now().strftime("%Y-%m-%d")
    day_dir = os.path.join(save_dir, today)

    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(day_dir, exist_ok=True)

    number = len(os.listdir(day_dir))
    while True:
        store = os.path.join(day_dir, f"model_{number:02d}")
        try:
            # mkdir both claims the name and creates the directory.
            os.mkdir(store)
        except FileExistsError:
            number += 1
        else:
            return store

def save_model(model, tokenizer, explainer=None, meta: dict = None, save_dir: str = "models"):
    """
    Save model, tokenizer, metadata and (optionally) the Lime explainer.

    A new run directory is created with ``make_dir(save_dir)`` and filled with:

    * ``model.h5`` - written by ``model.save``
    * ``tokenizer.json`` - the output of ``tokenizer.to_json()``
    * ``meta.json`` - ``meta`` plus ``num_inputs``, ``body_pad_length``,
      ``title_pad_length`` and ``features``
    * ``explainer.dill`` - only when ``explainer`` is not None

    Args:
        model: object exposing ``save(path)`` and ``inputs``.
        tokenizer: object exposing ``to_json()``.
        explainer: optional object to serialise with ``dill``.
        meta: optional extra metadata. It is copied, so neither the caller's
            dict nor a shared default is mutated.
        save_dir: root directory passed to ``make_dir``.

    Returns:
        True once everything has been written.
    """
    store = make_dir(save_dir)
    model.save(os.path.join(store, "model.h5"))

    with open(os.path.join(store, "tokenizer.json"), "w") as f:
        f.write(tokenizer.to_json())

    # Work on a private copy: the previous mutable default ({}) was shared
    # between calls and the caller's dict was modified in place.
    meta = dict(meta) if meta else {}
    meta.update(
        num_inputs=len(model.inputs),
        body_pad_length=MAX_BODY_LEN,
        title_pad_length=MAX_TITLE_LEN,
        features=FEATURES
    )
    meta_path = os.path.join(store, "meta.json")
    with open(meta_path, "w") as f:
        json.dump(meta, f)

    if explainer is not None:
        # NOTE: dill files execute arbitrary code when loaded; only load
        # explainer.dill from a trusted location.
        exp_path = os.path.join(store, "explainer.dill")
        with open(exp_path, "wb") as f:
            dill.dump(explainer, f)
    return True

In [16]:
# Persist the trained artefacts, recording the held-out accuracy as metadata.
save_model(
    model,
    tokenize,
    explainer,
    meta={"val_accuracy": float(test_acc)},
)

True

In [17]:
# Sync the local models/ directory to the models/ prefix of the models-storage-dsr S3 bucket.
# NOTE(review): presumably relies on the AWS CLI being installed and credentials being configured - confirm.
!aws s3 sync models/ s3://models-storage-dsr/models/

upload: models/2020-12-01/model_01/explainer.dill to s3://models-storage-dsr/models/2020-12-01/model_01/explainer.dill
upload: models/2020-12-01/model_01/meta.json to s3://models-storage-dsr/models/2020-12-01/model_01/meta.json
upload: models/2020-12-01/model_01/model.h5 to s3://models-storage-dsr/models/2020-12-01/model_01/model.h5
upload: models/2020-12-01/model_01/tokenizer.json to s3://models-storage-dsr/models/2020-12-01/model_01/tokenizer.json
