In [34]:
train_path = "../../data/train_preprocessed.json"
test_path = "../../data/test.json"
embeddings_path = "../../data/glove.6B.200d.txt"
lstm_model_path =  "../pretrained_models/lstm_model"
mlp_model_path = "../pretrained_models/mlp_model.h5"

In [2]:
import sys
sys.path.append('../')
from models import LSTM_model, MLP_model
import tensorflow as tf
from utils import embeddings, read_data
import numpy as np
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score, roc_auc_score
from tensorflow.keras.metrics import Precision, Recall
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

In [3]:
# FIRST MODEL
max_words = 5000
embedding_dim = 200

datasets = read_dataset(train_path, test_path, max_words, balance_test=True)
trainx_lstm, trainy_lstm, testx_lstm, testy_lstm, tokenizer, max_seq_length = datasets["lstm_data"]
trainx_mlp, trainy_mlp, testx_mlp, testy_mlp, user_max, book_max = datasets["mlp_data"]

print("Prepared Data")

Prepared Data


In [4]:
embedding_matrix = embeddings(embeddings_path, embedding_dim, tokenizer, max_words)

In [5]:
lstm_model = LSTM_model(max_words, embedding_dim, embedding_matrix, max_seq_length)
lstm_model.load_weights(lstm_model_path)

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', Precision(), Recall()])

In [12]:
# SECOND MODEL

mlp_model = MLP_model(user_max, book_max, user_emb_size=24, book_emb_size=16)
mlp_model.load_weights(mlp_model_path)

mlp_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["accuracy", Precision(), Recall()])

In [19]:
def decrease_data_size(inputs_lstm, labels_lstm, inputs_mlp, labels_mlp, l=None):
    if not l:
        return inputs_lstm, labels_lstm, inputs_mlp, labels_mlp
    user_ids = inputs_mlp["user_id"][:l]
    book_ids = inputs_mlp["book_id"][:l]
    numerics = inputs_mlp["numerics"][:l]
    new_labels_mlp = labels_mlp[:l]
    new_inputs_mlp = {
    "user_id": user_ids,
    "book_id": book_ids,
    "numerics": numerics
    }
    new_inputs_lstm = inputs_lstm[:l]
    new_labels_lstm = labels_lstm[:l]

    return new_inputs_lstm, new_labels_lstm, new_inputs_mlp, new_labels_mlp

tiny_xtrain_lstm, tiny_ytrain_lstm, tiny_xtrain_mlp, tiny_ytrain_mlp = decrease_data_size(trainx_lstm, trainy_lstm, trainx_mlp, trainy_mlp)
tiny_xtest_lstm, tiny_ytest_lstm, tiny_xtest_mlp, tiny_ytest_mlp = decrease_data_size(testx_lstm, testy_lstm, testx_mlp, testy_mlp)

In [20]:
lstm_train_logits = lstm_model.predict(tiny_xtrain_lstm, batch_size=64)
mlp_train_logits = mlp_model.predict(tiny_xtrain_mlp)
classifier_train_input = np.concatenate((lstm_train_logits, mlp_train_logits), axis=1)
classifier_train_labels = tiny_ytrain_lstm.copy()

lstm_test_logits = lstm_model.predict(tiny_xtest_lstm, batch_size=64)
mlp_test_logits = mlp_model.predict(tiny_xtest_mlp)
classifier_test_input = np.concatenate((lstm_test_logits, mlp_test_logits), axis=1)
classifier_test_labels = tiny_ytest_lstm.copy()



In [21]:
classifiers = {"random_forest": RandomForestClassifier(),
                   "ada_boost": AdaBoostClassifier(),
                   "svc": SVC(),
                   "logistic_regression": LogisticRegression(),
                   "knn": KNeighborsClassifier(),
                   "naive_bayesian": GaussianNB(),
                   "xgb": XGBClassifier()}

best = {"model": None, "score": 0}
for model_name in classifiers.keys():
    curr = classifiers[model_name]
    curr.fit(classifier_train_input, classifier_train_labels)
    preds = curr.predict(classifier_test_input)
    score = f1_score(tiny_ytest_lstm, preds)
    if score > best["score"]:
        best["model"] = model_name
        best["score"] = score
    print(f"{model_name}: F1 - {score}")
print("-"*50)
print(f"Best model: {best['model']} | Score: {best['score']}")

random_forest: F1 - 0.7367603550295857
ada_boost: F1 - 0.7536296793938717
svc: F1 - 0.758206977771235
logistic_regression: F1 - 0.7583729371702279
knn: F1 - 0.7361503726274827
naive_bayesian: F1 - 0.7587133368161804
xgb: F1 - 0.756371952344067
Best model: naive_bayesian | Score: 0.7587133368161804


In [23]:
def evaluate_models(lstm_model, mlp_model, classifier, lstm_in, lstm_lab, mlp_in, mlp_lab):

    _, lstm_acc, lstm_precision, lstm_recall = lstm_model.evaluate(lstm_in, lstm_lab, verbose=0, batch_size=256)
    lstm_f1 = 2*lstm_recall*lstm_precision / (lstm_recall+lstm_precision)
    lstm_auc = roc_auc_score(tiny_ytest_lstm, lstm_test_logits)
    _, mlp_acc, mlp_precision, mlp_recall = mlp_model.evaluate(mlp_in, mlp_lab, verbose=0, batch_size=256)
    mlp_f1 = 2*mlp_recall*mlp_precision / (mlp_recall+mlp_precision)
    mlp_auc = roc_auc_score(tiny_ytest_mlp, mlp_test_logits)

    classifier_preds_binary = classifier.predict(classifier_test_input)
    classifier_acc = accuracy_score(tiny_ytest_lstm, classifier_preds_binary)
    classifier_precision = precision_score(tiny_ytest_lstm, classifier_preds_binary)
    classifier_recall = recall_score(tiny_ytest_lstm, classifier_preds_binary)
    classifier_f1 = 2*classifier_recall*classifier_precision / (classifier_recall+classifier_precision)
    classifier_auc = roc_auc_score(tiny_ytest_lstm, classifier_preds_binary)

    print(f"LSTM    : Acc: {lstm_acc:.4f} | Precision: {lstm_precision:.4f} | Recall: {lstm_recall:.4f} | F1: {lstm_f1:.4f} | AUC: {lstm_auc:.4f}")
    print(f"MLP     : Acc: {mlp_acc:.4f} | Precision: {mlp_precision:.4f} | Recall: {mlp_recall:.4f} | F1: {mlp_f1:.4f} | AUC: {mlp_auc:.4f}")
    print(f"Ensemble: Acc: {classifier_acc:.4f} | Precision: {classifier_precision:.4f} | Recall: {classifier_recall:.4f} | F1: {classifier_f1:.4f} | AUC: {classifier_auc:.4f}")

evaluate_models(lstm_model, mlp_model, classifiers[best["model"]], tiny_xtest_lstm, tiny_ytest_lstm, tiny_xtest_mlp, tiny_ytest_mlp)

LSTM    : Acc: 0.7723 | Precision: 0.6984 | Recall: 0.7450 | F1: 0.7209 | AUC: 0.8532
MLP     : Acc: 0.6939 | Precision: 0.5997 | Recall: 0.6751 | F1: 0.6352 | AUC: 0.7609
Ensemble: Acc: 0.8043 | Precision: 0.7392 | Recall: 0.7792 | F1: 0.7587 | AUC: 0.7999713398742909


In [24]:
import pickle

with open('../pretrained_models/classifier.pkl', 'wb') as file:
    pickle.dump(classifier, file)