In [1]:
from tensorflow_ import mModel
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, auc, roc_curve, f1_score, confusion_matrix, classification_report
from tensorflow.keras.metrics import Precision, Recall, F1Score
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow import keras

In [2]:
class mModel(keras.Model):
    def __init__(self, user_max, book_max, user_emb_size=8, book_emb_size=8):
        super(mModel, self).__init__()
        self.user_id_input = keras.Input(shape=(1,), name="user_id")
        self.book_id_input = keras.Input(shape=(1,), name="book_id")
        self.numerics_input = keras.Input(shape=(3,), name="numerics")

        self.hidden1 = keras.layers.Dense(256, activation="relu")
        self.user_embedding = keras.layers.Embedding(input_dim=user_max+3, output_dim=user_emb_size, input_length=1, name="user_embedding")
        self.book_embedding = keras.layers.Embedding(input_dim=book_max+3, output_dim=book_emb_size,input_length=1, name="book_embedding")

        self.flatten = keras.layers.Flatten()

        self.hidden2 = keras.layers.Dense(128, activation="relu")
        self.hidden3 = keras.layers.Dense(64, activation="relu")
        self.hidden4 = keras.layers.Dense(1, activation="sigmoid")

        self.dropout1 = keras.layers.Dropout(0.4)
        self.dropout2 = keras.layers.Dropout(0.4)
        self.dropout3 = keras.layers.Dropout(0.4)

        self.concatenate = keras.layers.Concatenate()


    def call(self, inputs):
        user_id_input = inputs["user_id"]
        book_id_input = inputs["book_id"]
        numerics = inputs["numerics"]

        numerics = self.dropout1(self.hidden1(numerics))
        user_embedded = self.user_embedding(user_id_input)
        book_embedded = self.book_embedding(book_id_input)

        user_flattened = self.flatten(user_embedded)
        book_flattened = self.flatten(book_embedded)
        concatenated = self.concatenate([user_flattened, book_flattened, numerics])

        out = self.dropout2(self.hidden2(concatenated))
        out = self.dropout3(self.hidden3(out))
        out = self.hidden4(out)
        return out

In [50]:

class mModel(keras.Model):
    def __init__(self, user_max, book_max, user_emb_size=8, book_emb_size=8):
        super(mModel, self).__init__()

        self.user_embedding = keras.layers.Embedding(input_dim=user_max+3, output_dim=user_emb_size, input_length=1, name="user_embedding")
        self.book_embedding = keras.layers.Embedding(input_dim=book_max+3, output_dim=book_emb_size,input_length=1, name="book_embedding")

        #self.matrix_factorization = keras.layers.Dot(axes=1, normalize=False)  # Matrix factorization layer
        

        self.numerics_ = keras.Sequential([
            keras.layers.Dense(16, activation="relu"),
            keras.layers.BatchNormalization(),
            keras.layers.Dense(32, activation="relu")
        ])

        self.concatenate = keras.layers.Concatenate()
        
        self.dense_layers = keras.Sequential([
            keras.layers.Dense(128, activation="relu", input_shape=(19,)),
            keras.layers.BatchNormalization(),
            keras.layers.Dropout(0.4),
            keras.layers.Dense(256, activation="relu"),
            keras.layers.BatchNormalization(),
            keras.layers.Dropout(0.4),
            keras.layers.Dense(256, activation="relu"),
            keras.layers.BatchNormalization(),
            keras.layers.Dropout(0.4),
            keras.layers.Dense(1, activation="sigmoid")
        ])

        self.out_layer = keras.layers.Dense(1, activation="sigmoid")
        

    def call(self, x):
        user_id_input = x["user_id"]
        book_id_input = x["book_id"]
        numerics = x["numerics"]

        user_embedded = self.user_embedding(user_id_input)
        book_embedded = self.book_embedding(book_id_input)

        user_flattened = keras.layers.Flatten()(user_embedded)
        book_flattened = keras.layers.Flatten()(book_embedded)

        concatenated = self.concatenate([user_flattened, book_flattened, numerics])
        print(concatenated.shape)
        #print(f"conc: {concatenated.shape}")
        out = self.dense_layers(concatenated)
        return out

In [3]:
train = pd.read_json("../data/train_preprocessed.json")
test = pd.read_json("../data/test.json")

In [4]:
false_label = train[train.has_spoiler == 0].sample(55000).reset_index(drop=True)

In [5]:
true_label = train[train.has_spoiler == 1]
train = pd.concat((true_label, false_label)).reset_index(drop=True)

In [6]:
X_train, y_train = train.drop(columns=["has_spoiler"]), train["has_spoiler"]
X_test, y_test = test.drop(columns=["has_spoiler"]), test["has_spoiler"]

In [7]:
inputs_train = {
"user_id": X_train["user_id"].values,
"book_id": X_train["book_id"].values,
"numerics": X_train[["rating", "n_votes", "n_comments"]].values
}

user_max = X_train.user_id.max()
book_max = X_train.book_id.max()

In [8]:
model = mModel(user_max, book_max)

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss="binary_crossentropy", metrics=["acc"])

model.fit(inputs_train, y_train, epochs=10, batch_size=256, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1d99aa52890>

In [None]:
inputs_test = {
"user_id": X_test["user_id"].values,
"book_id": X_test["book_id"].values,
"numerics": X_test[["rating", "n_votes", "n_comments"]].values
}

In [None]:
preds = model.predict(inputs_test)

In [None]:
plt.hist(preds);

In [26]:
b
preds[preds > 0.3].shape

(12231,)

12231

In [29]:
preds[preds > 0.3]

array([0.32332984, 0.47949287, 0.40253657, ..., 0.6086842 , 0.35590684,
       0.5556251 ], dtype=float32)

In [24]:
preds.shape[0] * 0.098

12979.316

In [21]:
y_test.sum() / len(y_test)

0.09846574349526585

In [34]:
cm = confusion_matrix(y_test, (preds > 0.3).astype(int))

In [35]:
cm

array([[111087,   8314],
       [  9124,   3917]], dtype=int64)

In [None]:
sklearn.met

In [20]:
preds.mean()

0.09124283

In [10]:
y_train.shape

(529768,)

In [11]:
y_train.sum()

51930

In [12]:
y_test.sum()

13041

In [40]:
print(classification_report(y_test, (preds>0.3).astype(int)))

              precision    recall  f1-score   support

           0       0.92      0.93      0.93    119401
           1       0.32      0.30      0.31     13041

    accuracy                           0.87    132442
   macro avg       0.62      0.62      0.62    132442
weighted avg       0.86      0.87      0.87    132442



In [38]:
(preds>0.3).astype(int)

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [39]:
y_test

0         0
1         1
2         0
3         0
4         0
         ..
132437    0
132438    0
132439    0
132440    0
132441    0
Name: has_spoiler, Length: 132442, dtype: int64