In [24]:
import pandas as pd
from ast import literal_eval

In [25]:
# There are multiple genres per movie
train_df = pd.read_csv("dataset/train_data.csv",usecols=['genres', 'overview'], converters={"genres":literal_eval})
test_df = pd.read_csv("dataset/test_data.csv",usecols=['genres', 'overview'], converters={"genres":literal_eval})
train_df.head()

Unnamed: 0,overview,genres
0,"During her wedding ceremony, Rachel notices Lu...","[Comedy, Drama, Romance]"
1,While doing undercover work in a mental hospit...,"[Adventure, Horror]"
2,Depressed single mom Adele and her son Henry o...,[Drama]
3,Jenny is young. Her life is over. She killed s...,"[Drama, Music]"
4,Raised in a single parent family by his mother...,[Drama]


In [26]:
# Initial train and test split.
from sklearn.model_selection import train_test_split

test_split = 0.1

train_df, val_df = train_test_split(
    train_df,
    test_size=test_split,
    stratify=train_df["genres"].values,
)

In [27]:
# # There are multiple genres per movie
# train_df = pd.read_csv("dataset/train_data.csv",usecols=['genres', 'overview'], converters={"genres":literal_eval})[:1000]
# test_df = pd.read_csv("dataset/test_data.csv",usecols=['genres', 'overview'], converters={"genres":literal_eval})[:100]
# train_df.head()

# # Initial train and test split.
# from sklearn.model_selection import train_test_split

# test_split = 0.1

# train_df, val_df = train_test_split(
#     train_df,
#     test_size=test_split,
# )

In [28]:
import tensorflow as tf


genres = tf.ragged.constant(train_df["genres"].values)
lookup = tf.keras.layers.StringLookup(output_mode="multi_hot", num_oov_indices=0)
lookup.adapt(genres)
vocab = lookup.get_vocabulary()

print("Vocabulary:\n")
print(vocab)

Vocabulary:

['Drama', 'Comedy', 'Thriller', 'Romance', 'Action', 'Horror', 'Crime', 'Documentary', 'Adventure', 'Science Fiction', 'Family', 'Mystery', 'Fantasy', 'Animation', 'Music', 'Foreign', 'History', 'War', 'Western', 'TV Movie']


In [29]:
batch_size = 128

In [30]:
def make_dataset(dataframe, is_train=True):
    labels = tf.ragged.constant(dataframe["genres"].values)
    label_binarized = lookup(labels).numpy()
    dataset = tf.data.Dataset.from_tensor_slices(
        (dataframe["overview"].values, label_binarized)
    )
    dataset = dataset.shuffle(batch_size * 10) if is_train else dataset
    return dataset.batch(batch_size)


train_dataset = make_dataset(train_df, is_train=True)
validation_dataset = make_dataset(val_df, is_train=False)
test_dataset = make_dataset(test_df, is_train=False)

# Models

In [31]:
import keras
from tensorflow.keras import layers
from keras.callbacks import EarlyStopping


def make_mlp_model(text_vectorizer):
    model = keras.Sequential(
        [
            tf.keras.Input(shape=(1,), dtype=tf.string, name='text'),
            text_vectorizer,
            layers.Dense(64, activation="relu"),
            layers.Dense(32, activation="relu"),
            layers.Dense(lookup.vocabulary_size(), activation="sigmoid"),
        ],
    )
    return model


def make_conv_model(text_vectorizer):
    model = keras.Sequential(
        [
            tf.keras.Input(shape=(1,), dtype=tf.string, name='text'),
            text_vectorizer,
            layers.Lambda(lambda x: tf.cast(x, "float32")),
            layers.Reshape((-1, 1)),
            layers.Conv1D(64, 3, activation='sigmoid'),
            layers.Conv1D(32, 3, activation='sigmoid'),
            layers.Flatten(),
            layers.Dense(lookup.vocabulary_size(), activation="sigmoid"),
        ],
        
    )
    return model


def make_lstm_model(text_vectorizer):
    model = tf.keras.Sequential([
        text_vectorizer,
        layers.Embedding(
            input_dim=len(text_vectorizer.get_vocabulary()),
            output_dim=64,
            mask_zero=True),
        layers.Bidirectional(tf.keras.layers.LSTM(64)),
        layers.Dense(64, activation='relu'),
        layers.Dense(lookup.vocabulary_size(), activation="sigmoid"),
    ])
    return model


def make_stacked_lstm_model(text_vectorizer):
    model = tf.keras.Sequential([
        text_vectorizer,
        tf.keras.layers.Embedding(len(text_vectorizer.get_vocabulary()), 64, mask_zero=True),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        layers.Dense(lookup.vocabulary_size(), activation="sigmoid"),
    ])
    return model

# Util Functions

In [32]:
import matplotlib.pyplot as plt


def plot_result(history, item):
    plt.plot(history.history[item], label=item)
    plt.plot(history.history["val_" + item], label="val_" + item)
    plt.xlabel("Epochs")
    plt.ylabel(item)
    plt.title("Train and Validation {} Over Epochs".format(item), fontsize=14)
    plt.legend()
    plt.grid()
    plt.show()

def train_model(model, model_name, epochs=10, plot_metrics=False, optimizer="adam"):
    model.compile(
    loss="binary_crossentropy", 
    optimizer="adam",
    metrics=[tf.keras.metrics.BinaryAccuracy(),  
             tf.keras.metrics.F1Score(average='macro'), 
             tf.keras.metrics.Precision(name='precision'),
             tf.keras.metrics.Recall(name='recall')]
    )   

    early_stopping_monitor = EarlyStopping(
        monitor='val_loss',
        min_delta=0,
        patience=0,
        verbose=0,
        mode='auto',
        baseline=None,
        restore_best_weights=True
    )

    history = model.fit(
        train_dataset, validation_data=validation_dataset, epochs=epochs, callbacks=[early_stopping_monitor], verbose=1
    )

    if plot_metrics:
        plot_result(history, "loss")
        plot_result(history, "binary_accuracy")

    metrics_values = model.evaluate(test_dataset)
    metrics_names = model.metrics_names

    result = {metrics_names[i]: metrics_values[i] for i in range(len(metrics_names))}

    result["model_name"] = model_name

    return result

In [33]:
results = []

In [34]:
text_vectorizer = layers.TextVectorization(output_mode="tf_idf")
text_vectorizer.adapt(train_dataset.map(lambda text, label: text))

model = make_mlp_model(text_vectorizer)

result = train_model(model, "mlp")

results.append(result)

Epoch 1/10
Epoch 2/10


In [35]:
max_seqlen = 150
text_vectorizer = layers.TextVectorization(output_sequence_length=max_seqlen, output_mode="int")
text_vectorizer.adapt(train_dataset.map(lambda text, label: text))

model = make_conv_model(text_vectorizer)

result = train_model(model, "conv")

results.append(result)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


In [36]:
VOCAB_SIZE = 10000
text_vectorizer = layers.TextVectorization(
    max_tokens=VOCAB_SIZE)
text_vectorizer.adapt(train_dataset.map(lambda text, label: text))

model = make_lstm_model(text_vectorizer)

result = train_model(model, "lstm")

results.append(result)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


In [37]:
VOCAB_SIZE = 10000
text_vectorizer = layers.TextVectorization(
    max_tokens=VOCAB_SIZE)
text_vectorizer.adapt(train_dataset.map(lambda text, label: text))

model = make_stacked_lstm_model(text_vectorizer)

result = train_model(model, "stacked_lstm")

results.append(result)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


In [38]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer
from tfhub_maps import *

In [39]:
def build_classifier_model():
  bert_model_name = 'small_bert/bert_en_uncased_L-4_H-512_A-8'

  tfhub_handle_encoder = map_name_to_handle[bert_model_name]
  tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
  preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  encoder_inputs = preprocessing_layer(text_input)
  encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  outputs = encoder(encoder_inputs)
  net = outputs['pooled_output']
  net = tf.keras.layers.Dropout(0.1)(net)
  net = tf.keras.layers.Dense(lookup.vocabulary_size(), activation="sigmoid")(net)
  return tf.keras.Model(text_input, net)

In [40]:
epochs = 10
steps_per_epoch = tf.data.experimental.cardinality(train_dataset).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5

optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

model = build_classifier_model()

result = train_model(model, "bert", optimizer=optimizer)

results.append(result)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


In [41]:
model = build_classifier_model()

result = train_model(model, "berty")

results.append(result)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


In [42]:
pd.DataFrame(results)

Unnamed: 0,loss,binary_accuracy,f1_score,precision,recall,model_name
0,0.227237,0.918105,0.243036,0.69966,0.362516,mlp
1,0.283308,0.897031,0.032038,0.508511,0.084145,conv
2,0.219622,0.915221,0.211336,0.635507,0.419669,lstm
3,0.238806,0.906617,0.13219,0.619048,0.24868,stacked_lstm
4,0.284396,0.89674,0.032038,0.0,0.0,bert
5,0.284039,0.89674,0.032038,0.0,0.0,berty


In [43]:
# import os

# model_dir = "models/"
# model_name = "model"
# model_version = "1"
# model_export_path = f"{model_dir}/{model_name}/{model_version}"

# invert_stringlookup_layer = tf.keras.layers.StringLookup(vocabulary=vocab, invert=True)

# model_for_inference = keras.Sequential([model, 
#                                         layers.Lambda(lambda x: tf.round(x)),
#                                         layers.Lambda(lambda x: tf.map_fn(lambda y: tf.where(y == 1.0)[..., 0] + 1, x, dtype=(tf.int64))),
#                                         invert_stringlookup_layer
#                                         ])

# tf.saved_model.save(
#     model_for_inference,
#     export_dir=model_export_path,
# )

# print(f"SavedModel files: {os.listdir(model_export_path)}")