Here we are going to test our model class to make sure it works as expected.

In [14]:
import tensorflow as tf
from tensorflow.keras import layers, optimizers, regularizers
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer

import pandas as pd

### Load the Data

In [15]:
# Training split: used further down to fit both the vectorizer and the network.
df = pd.read_parquet(path="../data/train.parquet")
print(f"Num of rows: {df.shape[0]}")

Num of rows: 20377


In [16]:
# Held-out validation split: only used for the evaluation cells below.
val_df = pd.read_parquet(path="../data/valid.parquet")
print(f"Num of rows: {val_df.shape[0]}")

Num of rows: 2264


### Define the Model

In [17]:
class NbowModel:
    """Neural bag-of-words binary classifier.

    Couples a scikit-learn ``CountVectorizer`` (raw text -> token counts) with
    a small Keras network (token counts -> sigmoid score) behind a single
    fit / predict / evaluate interface.

    Attributes are documented where they are assigned in ``__init__``.
    """

    def __init__(self, vocab_sz):
        """Create an unfitted vectorizer and a compiled, untrained network.

        Args:
            vocab_sz: Upper bound on the vocabulary size. It is passed to the
                vectorizer as ``max_features`` and used as the initial width
                of the network's input layer.
        """
        # Width of the network input. After ``fit`` this is the size of the
        # vocabulary the vectorizer actually learned, which may be smaller
        # than the value requested here.
        self.vocab_sz = vocab_sz
        # Instantiate the CountVectorizer
        self.cv = CountVectorizer(
            min_df=0.005,
            max_df=0.75,
            stop_words="english",
            strip_accents="ascii",
            max_features=self.vocab_sz,
        )
        # Define the keras model
        self.model = self._build_model(self.vocab_sz)

    @staticmethod
    def _build_model(n_features):
        """Build and compile the network for count vectors of width ``n_features``."""
        inputs = tf.keras.Input(shape=(n_features,), name="Input")
        x = layers.Dropout(0.10)(inputs)
        x = layers.Dense(
            15,
            activation="relu",
            kernel_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4),
        )(x)
        predictions = layers.Dense(
            1,
            activation="sigmoid",
        )(x)
        model = tf.keras.Model(inputs, predictions)
        opt = optimizers.Adam(learning_rate=0.002)
        model.compile(
            loss="binary_crossentropy", optimizer=opt, metrics=["accuracy"]
        )
        return model

    def fit(self, X, y):
        """Fit the vectorizer on ``X`` and train the network on the counts.

        Args:
            X: Iterable of raw text documents.
            y: Binary labels aligned with ``X``.
        """
        res = self.cv.fit_transform(X).toarray()

        # ``max_features`` is only an upper bound: the min_df / max_df pruning
        # can leave fewer terms than requested. The network input must match
        # the real width of the count matrix, otherwise training fails with a
        # shape mismatch, so rebuild the (still untrained) network if needed.
        n_features = res.shape[1]
        if n_features != self.vocab_sz:
            self.vocab_sz = n_features
            self.model = self._build_model(n_features)

        self.model.fit(x=res, y=y, batch_size=32, epochs=10, validation_split=0.2)

    def predict(self, X):
        """Vectorize ``X`` with the fitted vectorizer and return the network's predictions."""
        res = self.cv.transform(X).toarray()
        return self.model.predict(res)

    def eval_acc(self, X, labels, threshold=0.5):
        """Return the accuracy of the thresholded predictions against ``labels``.

        A document is predicted positive when its score is strictly greater
        than ``threshold``.
        """
        return accuracy_score(labels, self.predict(X) > threshold)

    def eval_rocauc(self, X, labels):
        """Return the ROC AUC of the raw prediction scores against ``labels``."""
        return roc_auc_score(labels, self.predict(X))

    @property
    def model_dict(self):
        """Dictionary holding references to the vectorizer and the network.

        The objects are not copied; this is the format ``from_dict`` consumes.
        """
        return {"vectorizer": self.cv, "model": self.model}

    @classmethod
    def from_dict(cls, model_dict):
        """Get Model from dictionary.

        Args:
            model_dict: Mapping with a fitted vectorizer under ``"vectorizer"``
                and a trained network under ``"model"`` (see ``model_dict``).

        Returns:
            An instance that wraps exactly those two objects.
        """
        # Skip __init__ on purpose: it would construct and compile a fresh
        # vectorizer and network only for both to be replaced immediately.
        nbow_model = cls.__new__(cls)
        nbow_model.cv = model_dict["vectorizer"]
        nbow_model.model = model_dict["model"]
        nbow_model.vocab_sz = len(nbow_model.cv.vocabulary_)
        return nbow_model

### Train the Model

In [18]:
# Fix the random seeds before the network is created so that weight
# initialisation, dropout and batch shuffling repeat on every
# Restart & Run All, keeping the reported metrics comparable between runs.
tf.keras.utils.set_random_seed(42)

model = NbowModel(vocab_sz=750)
model.fit(X=df["review"], y=df["labels"])

Epoch 1/10
[1m510/510[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.7876 - loss: 0.4506 - val_accuracy: 0.8810 - val_loss: 0.2956
Epoch 2/10
[1m510/510[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8821 - loss: 0.2972 - val_accuracy: 0.8751 - val_loss: 0.3036
Epoch 3/10
[1m510/510[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8892 - loss: 0.2863 - val_accuracy: 0.8790 - val_loss: 0.2947
Epoch 4/10
[1m510/510[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8874 - loss: 0.2790 - val_accuracy: 0.8795 - val_loss: 0.2950
Epoch 5/10
[1m510/510[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8972 - loss: 0.2659 - val_accuracy: 0.8756 - val_loss: 0.2983
Epoch 6/10
[1m510/510[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8978 - loss: 0.2617 - val_accuracy: 0.8761 - val_loss: 0.3026
Epoch 7/10
[1m510/510[0m 

### Evaluate the model performance

Now we are going to evaluate the model performance and compare it to the baseline.
- Baseline Accuracy: 0.773
- Baseline AUC: 0.5

In [19]:
# Score the freshly trained model on the validation split.
val_reviews, val_labels = val_df["review"], val_df["labels"]
model_acc = model.eval_acc(val_reviews, val_labels)
model_rocauc = model.eval_rocauc(val_reviews, val_labels)

msg = "Model Accuracy: {}\nModel AUC: {}"
rounded_scores = [round(score, 3) for score in (model_acc, model_rocauc)]
print(msg.format(*rounded_scores))

[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Model Accuracy: 0.872
Model AUC: 0.915


Now that we have confirmed that the model is working as expected, we can move on to the next step.

### Saving and loading the model

In [20]:
# The dictionary holds references to the fitted vectorizer and the trained
# network; nothing is copied or written to disk here.
model_dict = model.model_dict

In [21]:
# Rebuild the wrapper from the dictionary and re-score it on the validation
# split; the numbers should match the ones reported for the original model.
model_loaded = NbowModel.from_dict(model_dict)
val_reviews, val_labels = val_df["review"], val_df["labels"]

loaded_acc = model_loaded.eval_acc(val_reviews, val_labels)
print("Model Accuracy:", round(loaded_acc, 3))

loaded_rocauc = model_loaded.eval_rocauc(val_reviews, val_labels)
print("Model AUC:", round(loaded_rocauc, 3))

[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Model Accuracy: 0.872
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
Model AUC: 0.915
