In [55]:
import pandas as pd
from ast import literal_eval

In [56]:
# There are multiple genres per movie
train_df = pd.read_csv("dataset/train_data.csv",usecols=['genres', 'overview'], converters={"genres":literal_eval})
test_df = pd.read_csv("dataset/test_data.csv",usecols=['genres', 'overview'], converters={"genres":literal_eval})
train_df.head()

Unnamed: 0,genres,overview
0,"[Comedy, Drama, Romance]",Beautiful chanteuse 'Bijou' (Marlene Dietrich)...
1,"[Action, Crime, Drama, Thriller]",In a post-apocalyptic world ravaged by feuding...
2,"[Drama, Romance, TV Movie, Western]","Marty is a 19 year old pioneer woman, recently..."
3,"[Action, Comedy]",A couple of fumbling best friends run a privat...
4,"[Comedy, Romance, TV Movie]",One woman's unexpected race to the altar teach...


In [59]:
# Initial train and test split.
from sklearn.model_selection import train_test_split

test_split = 0.1

train_df, val_df = train_test_split(
    train_df,
    test_size=test_split,
    stratify=train_df["genres"].values,
)

In [63]:
import tensorflow as tf


genres = tf.ragged.constant(train_df["genres"].values)
lookup = tf.keras.layers.StringLookup(output_mode="multi_hot", num_oov_indices=0)
lookup.adapt(genres)
vocab = lookup.get_vocabulary()

print("Vocabulary:\n")
print(vocab)

Vocabulary:

['Drama', 'Comedy', 'Thriller', 'Romance', 'Action', 'Horror', 'Crime', 'Documentary', 'Adventure', 'Science Fiction', 'Family', 'Mystery', 'Fantasy', 'Animation', 'Music', 'Foreign', 'History', 'War', 'Western', 'TV Movie']


In [64]:
batch_size = 128

In [65]:
def make_dataset(dataframe, is_train=True):
    labels = tf.ragged.constant(dataframe["genres"].values)
    label_binarized = lookup(labels).numpy()
    dataset = tf.data.Dataset.from_tensor_slices(
        (dataframe["overview"].values, label_binarized)
    )
    dataset = dataset.shuffle(batch_size * 10) if is_train else dataset
    return dataset.batch(batch_size)


train_dataset = make_dataset(train_df, is_train=True)
validation_dataset = make_dataset(val_df, is_train=False)
test_dataset = make_dataset(test_df, is_train=False)

# Models

In [140]:
import keras
from tensorflow.keras import layers
from keras.callbacks import EarlyStopping


def make_mlp_model(text_vectorizer):
    model = keras.Sequential(
        [
            tf.keras.Input(shape=(1,), dtype=tf.string, name='text'),
            text_vectorizer,
            layers.Dense(64, activation="relu"),
            layers.Dense(32, activation="relu"),
            layers.Dense(lookup.vocabulary_size(), activation="sigmoid"),
        ]
    )
    return model


def make_conv_model(text_vectorizer):
    model = keras.Sequential(
        [
            tf.keras.Input(shape=(1,), dtype=tf.string, name='text'),
            text_vectorizer,
            layers.Lambda(lambda x: tf.cast(x, "float32")),
            layers.Reshape((-1, 1)),
            layers.Conv1D(64, 3, activation='sigmoid'),
            layers.Conv1D(32, 3, activation='sigmoid'),
            layers.Flatten(),
            layers.Dense(lookup.vocabulary_size(), activation="sigmoid"),
        ]
    )
    return model

# Util Functions

In [73]:
import matplotlib.pyplot as plt


def plot_result(history, item):
    plt.plot(history.history[item], label=item)
    plt.plot(history.history["val_" + item], label="val_" + item)
    plt.xlabel("Epochs")
    plt.ylabel(item)
    plt.title("Train and Validation {} Over Epochs".format(item), fontsize=14)
    plt.legend()
    plt.grid()
    plt.show()

# TF-IDF Vectorization with MLP Model

In [72]:
text_vectorizer = layers.TextVectorization(output_mode="tf_idf")
text_vectorizer.adapt(train_dataset.map(lambda text, label: text))

In [122]:
epochs = 2

model = make_mlp_model(text_vectorizer)

model.compile(
    loss="binary_crossentropy", 
    optimizer="adam",
    metrics=[tf.keras.metrics.BinaryAccuracy(), 
             tf.keras.metrics.CategoricalAccuracy(), 
             tf.keras.metrics.Accuracy(), 
             tf.keras.metrics.AUC(), 
             tf.keras.metrics.F1Score(average='macro'), 
             tf.keras.metrics.Precision(), 
             tf.keras.metrics.Recall()]
)

early_stopping_monitor = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=0,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=True
)

history = model.fit(
    train_dataset, validation_data=validation_dataset, epochs=epochs, callbacks=[early_stopping_monitor], verbose=1
)

# plot_result(history, "loss")
# plot_result(history, "binary_accuracy")

model.evaluate(test_dataset)

Epoch 1/2


Epoch 2/2


[0.22423888742923737,
 0.9180562496185303,
 0.5336887836456299,
 0.0,
 0.8821662068367004,
 0.2619091272354126,
 0.6888936758041382,
 0.3769357204437256]

In [138]:
max_seqlen = 150
text_vectorizer = layers.TextVectorization(output_sequence_length=max_seqlen, output_mode="int")
text_vectorizer.adapt(train_dataset.map(lambda text, label: text))

In [141]:
epochs = 2

model = make_conv_model(text_vectorizer)

model.compile(
    loss="binary_crossentropy", 
    optimizer="adam",
    metrics=[tf.keras.metrics.BinaryAccuracy(), 
             tf.keras.metrics.CategoricalAccuracy(), 
             tf.keras.metrics.Accuracy(), 
             tf.keras.metrics.AUC(), 
             tf.keras.metrics.F1Score(average='macro'), 
             tf.keras.metrics.Precision(), 
             tf.keras.metrics.Recall()]
)

early_stopping_monitor = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=0,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=True
)

history = model.fit(
    train_dataset, validation_data=validation_dataset, epochs=epochs, callbacks=[early_stopping_monitor], verbose=1
)

# plot_result(history, "loss")
# plot_result(history, "binary_accuracy")

model.evaluate(test_dataset)

Epoch 1/2
Epoch 2/2


[0.28439009189605713,
 0.896703839302063,
 0.47164323925971985,
 0.0,
 0.7651023864746094,
 0.032048750668764114,
 0.0,
 0.0]

In [129]:
a = next(iter(train_dataset))[0]

In [131]:
text_vectorizer(a).shape

TensorShape([128, 150])