In [7]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import classification_report, f1_score
from tensorflow import keras

In [2]:
# Load the preprocessed training split and the test split from disk.
train = pd.read_json("../data/train_preprocessed.json")
test = pd.read_json("../data/test.json")

# Separate the "has_spoiler" target from the remaining feature columns.
y_train = train["has_spoiler"]
X_train = train.drop(columns=["has_spoiler"])
y_test = test["has_spoiler"]
X_test = test.drop(columns=["has_spoiler"])

# Model inputs, keyed by the names that Model.call() looks up.
inputs_train = dict(
    user_id=X_train["user_id"].values,
    book_id=X_train["book_id"].values,
    numerics=X_train[["rating", "n_votes", "n_comments"]].values,
)
inputs_test = dict(
    user_id=X_test["user_id"].values,
    book_id=X_test["book_id"].values,
    numerics=X_test[["rating", "n_votes", "n_comments"]].values,
)

# Largest ids present in the training split; passed to Model to size its embeddings.
user_max = X_train["user_id"].max()
book_max = X_train["book_id"].max()

In [3]:
class Model(keras.Model):
    """Binary classifier combining user/book embeddings with numeric features.

    ``user_id`` and ``book_id`` are each embedded and flattened, the numeric
    features go through a small dense block, and the three representations are
    concatenated and fed to a dense head ending in one sigmoid unit.

    Args:
        user_max: Upper bound used to size the user embedding table
            (``input_dim = user_max + 3``).
        book_max: Upper bound used to size the book embedding table
            (``input_dim = book_max + 3``).
        user_emb_size: Dimension of the user embedding vectors.
        book_emb_size: Dimension of the book embedding vectors.
    """

    def __init__(self, user_max, book_max, user_emb_size=16, book_emb_size=12):
        # Zero-argument super() avoids naming the class explicitly; the previous
        # ``super(mModel, self)`` raised NameError because ``mModel`` does not exist.
        super().__init__()

        # create embeddings
        # NOTE(review): the +3 headroom presumably reserves rows for special /
        # unseen ids -- confirm against the preprocessing step.
        self.user_embedding = keras.layers.Embedding(
            input_dim=user_max + 3,
            output_dim=user_emb_size,
            input_length=1,
            name="user_embedding",
        )
        self.book_embedding = keras.layers.Embedding(
            input_dim=book_max + 3,
            output_dim=book_emb_size,
            input_length=1,
            name="book_embedding",
        )

        # Weight-free reshaping/merging layers, built once here instead of being
        # re-instantiated on every call().
        self.user_flatten = keras.layers.Flatten()
        self.book_flatten = keras.layers.Flatten()
        self.concatenate = keras.layers.Concatenate()

        # this block is applied to the numeric features only
        self.numerics = keras.Sequential([
            keras.layers.Dense(32, activation="relu"),
            keras.layers.BatchNormalization(),
        ])

        # this head consumes the concatenation of numeric and user_id/book_id features
        self.dense_layers = keras.Sequential([
            keras.layers.Dense(128, activation="relu"),
            keras.layers.BatchNormalization(),
            keras.layers.Dense(64, activation="relu"),
            keras.layers.BatchNormalization(),
            keras.layers.Dense(1, activation="sigmoid"),
        ])

    def call(self, inputs):
        """Run the forward pass.

        Args:
            inputs: Mapping with the keys ``"user_id"``, ``"book_id"`` and
                ``"numerics"``.

        Returns:
            The output of the dense head, whose final layer is a single
            sigmoid unit.
        """
        user_flattened = self.user_flatten(self.user_embedding(inputs["user_id"]))
        book_flattened = self.book_flatten(self.book_embedding(inputs["book_id"]))

        numerics = self.numerics(inputs["numerics"])  # numeric features only
        concatenated = self.concatenate([user_flattened, book_flattened, numerics])  # all features

        return self.dense_layers(concatenated)

In [4]:
# Build the network; the embedding tables are sized from user_max / book_max.
model = Model(user_max, book_max)

# Adam with a small learning rate, cross-entropy loss, AUC reported during training.
adam = tf.keras.optimizers.Adam(learning_rate=0.00001)
auc_metric = tf.keras.metrics.AUC()
model.compile(
    optimizer=adam,
    loss="binary_crossentropy",
    metrics=[auc_metric],
)

# Fit for 20 epochs in batches of 256, holding out 20% of the training data for validation.
model.fit(
    inputs_train,
    y_train,
    epochs=20,
    batch_size=256,
    validation_split=0.2,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x21a9a11c310>

In [8]:
# EVALUATION CELL

preds = model.predict(inputs_test, batch_size=512, verbose=0)

# Flatten once so every comparison below yields a 1-D label vector;
# `preds` itself is left untouched for later cells.
scores = np.ravel(preds)

# Select the decision threshold with the highest F1 among 100 candidates in [0, 0.5].
# NOTE(review): the threshold is tuned on the same test split that the report is
# computed on, so the reported scores are optimistic -- prefer selecting it on a
# held-out validation split.
thresholds = np.linspace(0, 0.5, 100)
f1_per_threshold = [f1_score(y_test, (scores > th).astype(int)) for th in thresholds]

# argmax returns the first maximum, matching the previous strict ">" comparison,
# and always yields a valid threshold: previously `best_th` stayed None when no
# candidate reached F1 > 0, which broke the `preds > best_th` comparison later.
best_idx = int(np.argmax(f1_per_threshold))
best = f1_per_threshold[best_idx]
best_th = thresholds[best_idx]

In [9]:
# Binarize the predictions at the selected threshold and print the per-class report.
y_pred = (preds > best_th).astype(int)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.94      0.86      0.90    119401
           1       0.27      0.48      0.35     13041

    accuracy                           0.82    132442
   macro avg       0.60      0.67      0.62    132442
weighted avg       0.87      0.82      0.84    132442

