In [2]:
import pandas as pd
from ast import literal_eval

In [3]:
# There are multiple genres per movie
train_df = pd.read_csv("dataset/train_data.csv",usecols=['genres', 'overview'], converters={"genres":literal_eval})
test_df = pd.read_csv("dataset/test_data.csv",usecols=['genres', 'overview'], converters={"genres":literal_eval})
train_df.head()

Unnamed: 0,genres,overview
0,"[Comedy, Drama, Romance]",Beautiful chanteuse 'Bijou' (Marlene Dietrich)...
1,"[Action, Crime, Drama, Thriller]",In a post-apocalyptic world ravaged by feuding...
2,"[Drama, Romance, TV Movie, Western]","Marty is a 19 year old pioneer woman, recently..."
3,"[Action, Comedy]",A couple of fumbling best friends run a privat...
4,"[Comedy, Romance, TV Movie]",One woman's unexpected race to the altar teach...


In [4]:
# Initial train and test split.
from sklearn.model_selection import train_test_split

test_split = 0.1

train_df, val_df = train_test_split(
    train_df,
    test_size=test_split,
    stratify=train_df["genres"].values,
)

In [5]:
import tensorflow as tf


genres = tf.ragged.constant(train_df["genres"].values)
lookup = tf.keras.layers.StringLookup(output_mode="multi_hot", num_oov_indices=0)
lookup.adapt(genres)
vocab = lookup.get_vocabulary()

print("Vocabulary:\n")
print(vocab)

2023-06-17 20:28:53.858774: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-17 20:28:55.000240: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-06-17 20:28:55.079026: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-06-

Vocabulary:

['Drama', 'Comedy', 'Thriller', 'Romance', 'Action', 'Horror', 'Crime', 'Documentary', 'Adventure', 'Science Fiction', 'Family', 'Mystery', 'Fantasy', 'Animation', 'Music', 'Foreign', 'History', 'War', 'Western', 'TV Movie']


In [6]:
batch_size = 128

In [7]:
def make_dataset(dataframe, is_train=True):
    labels = tf.ragged.constant(dataframe["genres"].values)
    label_binarized = lookup(labels).numpy()
    dataset = tf.data.Dataset.from_tensor_slices(
        (dataframe["overview"].values, label_binarized)
    )
    dataset = dataset.shuffle(batch_size * 10) if is_train else dataset
    return dataset.batch(batch_size)


train_dataset = make_dataset(train_df, is_train=True)
validation_dataset = make_dataset(val_df, is_train=False)
test_dataset = make_dataset(test_df, is_train=False)

# Models

In [8]:
import keras
from tensorflow.keras import layers
from keras.callbacks import EarlyStopping


def make_mlp_model(text_vectorizer):
    model = keras.Sequential(
        [
            tf.keras.Input(shape=(1,), dtype=tf.string, name='text'),
            text_vectorizer,
            layers.Dense(64, activation="relu"),
            layers.Dense(32, activation="relu"),
            layers.Dense(lookup.vocabulary_size(), activation="sigmoid"),
        ]
    )
    return model


def make_conv_model(text_vectorizer):
    model = keras.Sequential(
        [
            tf.keras.Input(shape=(1,), dtype=tf.string, name='text'),
            text_vectorizer,
            layers.Lambda(lambda x: tf.cast(x, "float32")),
            layers.Reshape((-1, 1)),
            layers.Conv1D(64, 3, activation='sigmoid'),
            layers.Conv1D(32, 3, activation='sigmoid'),
            layers.Flatten(),
            layers.Dense(lookup.vocabulary_size(), activation="sigmoid"),
        ]
    )
    return model


def make_lstm_model(text_vectorizer):
    model = tf.keras.Sequential([
        text_vectorizer,
        layers.Embedding(
            input_dim=len(text_vectorizer.get_vocabulary()),
            output_dim=64,
            mask_zero=True),
        layers.Bidirectional(tf.keras.layers.LSTM(64)),
        layers.Dense(64, activation='relu'),
        layers.Dense(lookup.vocabulary_size(), activation="sigmoid"),
    ])
    return model


def make_stacked_lstm_model(text_vectorizer):
    model = tf.keras.Sequential([
        text_vectorizer,
        tf.keras.layers.Embedding(len(text_vectorizer.get_vocabulary()), 64, mask_zero=True),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        layers.Dense(lookup.vocabulary_size(), activation="sigmoid"),
    ])
    return model

# Util Functions

In [9]:
import matplotlib.pyplot as plt


def plot_result(history, item):
    plt.plot(history.history[item], label=item)
    plt.plot(history.history["val_" + item], label="val_" + item)
    plt.xlabel("Epochs")
    plt.ylabel(item)
    plt.title("Train and Validation {} Over Epochs".format(item), fontsize=14)
    plt.legend()
    plt.grid()
    plt.show()

# TF-IDF Vectorization with MLP Model

In [122]:
text_vectorizer = layers.TextVectorization(output_mode="tf_idf")
text_vectorizer.adapt(train_dataset.map(lambda text, label: text))

epochs = 2

model = make_mlp_model(text_vectorizer)

model.compile(
    loss="binary_crossentropy", 
    optimizer="adam",
    metrics=[tf.keras.metrics.BinaryAccuracy(), 
             tf.keras.metrics.CategoricalAccuracy(), 
             tf.keras.metrics.Accuracy(), 
             tf.keras.metrics.AUC(), 
             tf.keras.metrics.F1Score(average='macro'), 
             tf.keras.metrics.Precision(), 
             tf.keras.metrics.Recall()]
)

early_stopping_monitor = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=0,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=True
)

history = model.fit(
    train_dataset, validation_data=validation_dataset, epochs=epochs, callbacks=[early_stopping_monitor], verbose=1
)

# plot_result(history, "loss")
# plot_result(history, "binary_accuracy")

model.evaluate(test_dataset)

Epoch 1/2


Epoch 2/2


[0.22423888742923737,
 0.9180562496185303,
 0.5336887836456299,
 0.0,
 0.8821662068367004,
 0.2619091272354126,
 0.6888936758041382,
 0.3769357204437256]

In [141]:
max_seqlen = 150
text_vectorizer = layers.TextVectorization(output_sequence_length=max_seqlen, output_mode="int")
text_vectorizer.adapt(train_dataset.map(lambda text, label: text))

epochs = 2

model = make_conv_model(text_vectorizer)

model.compile(
    loss="binary_crossentropy", 
    optimizer="adam",
    metrics=[tf.keras.metrics.BinaryAccuracy(), 
             tf.keras.metrics.CategoricalAccuracy(), 
             tf.keras.metrics.Accuracy(), 
             tf.keras.metrics.AUC(), 
             tf.keras.metrics.F1Score(average='macro'), 
             tf.keras.metrics.Precision(), 
             tf.keras.metrics.Recall()]
)

early_stopping_monitor = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=0,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=True
)

history = model.fit(
    train_dataset, validation_data=validation_dataset, epochs=epochs, callbacks=[early_stopping_monitor], verbose=1
)

# plot_result(history, "loss")
# plot_result(history, "binary_accuracy")

model.evaluate(test_dataset)

Epoch 1/2
Epoch 2/2


[0.28439009189605713,
 0.896703839302063,
 0.47164323925971985,
 0.0,
 0.7651023864746094,
 0.032048750668764114,
 0.0,
 0.0]

In [16]:
VOCAB_SIZE = 10000
text_vectorizer = layers.TextVectorization(
    max_tokens=VOCAB_SIZE)
text_vectorizer.adapt(train_dataset.map(lambda text, label: text))

In [18]:
epochs = 2

model = make_lstm_model(text_vectorizer)

model.compile(
    loss="binary_crossentropy", 
    optimizer="adam",
    metrics=[tf.keras.metrics.BinaryAccuracy(), 
             tf.keras.metrics.CategoricalAccuracy(), 
             tf.keras.metrics.Accuracy(), 
             tf.keras.metrics.AUC(), 
             tf.keras.metrics.F1Score(average='macro'), 
             tf.keras.metrics.Precision(), 
             tf.keras.metrics.Recall()]
)

early_stopping_monitor = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=0,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=True
)

history = model.fit(
    train_dataset, validation_data=validation_dataset, epochs=epochs, callbacks=[early_stopping_monitor], verbose=1
)

# plot_result(history, "loss")
# plot_result(history, "binary_accuracy")

model.evaluate(test_dataset)

Epoch 1/2
Epoch 2/2


[0.26637598872184753,
 0.8996485471725464,
 0.4578284025192261,
 0.0,
 0.8152556419372559,
 0.053361546248197556,
 0.5903345942497253,
 0.09314875304698944]

In [20]:
epochs = 2

model = make_stacked_lstm_model(text_vectorizer)

model.compile(
    loss="binary_crossentropy", 
    optimizer="adam",
    metrics=[tf.keras.metrics.BinaryAccuracy(), 
             tf.keras.metrics.CategoricalAccuracy(), 
             tf.keras.metrics.Accuracy(), 
             tf.keras.metrics.AUC(), 
             tf.keras.metrics.F1Score(average='macro'), 
             tf.keras.metrics.Precision(), 
             tf.keras.metrics.Recall()]
)

early_stopping_monitor = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=0,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=True
)

history = model.fit(
    train_dataset, validation_data=validation_dataset, epochs=epochs, callbacks=[early_stopping_monitor], verbose=1
)

# plot_result(history, "loss")
# plot_result(history, "binary_accuracy")

model.evaluate(test_dataset)

Epoch 1/2
Epoch 2/2


[0.27534621953964233,
 0.9007391929626465,
 0.4718855917453766,
 0.0,
 0.7935128808021545,
 0.032402507960796356,
 0.612728476524353,
 0.10617081075906754]

In [10]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [12]:
from tfhub_maps import *

bert_model_name = 'small_bert/bert_en_uncased_L-4_H-512_A-8'

tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

BERT model selected           : https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3
