# Movie Genre Classification with LSTM
## Document-level approach

In [None]:
import nltk
import numpy as np
import pandas as pd
import gensim.downloader as api
import tensorflow as tf
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# One seed for everything stochastic in this notebook. It is passed to the
# train/test splits below and also installed as the global NumPy and
# TensorFlow seed, so weight initialisation and dropout masks are repeatable
# across Restart & Run All (as far as the compute backend is deterministic).
RANDOM_STATE = 1212
np.random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

# Load both corpora. Later cells read the 'plot' and 'genre' columns of each.
cmu_data = pd.read_csv('cmu_data_final.csv')
imdb_data = pd.read_csv('imdb_data_final.csv')

# Dataset preparation

In [None]:
# Fetch every NLTK resource that preprocess_text relies on.
nltk.download('stopwords')
nltk.download('wordnet')
# nltk.word_tokenize needs the Punkt tokenizer data, which was never
# downloaded and made a fresh environment fail with LookupError. Recent NLTK
# releases look for 'punkt_tab', older ones for 'punkt'; requesting both is
# harmless because an unknown package id only logs a message.
nltk.download('punkt')
nltk.download('punkt_tab')

# Shared by preprocess_text: a set gives O(1) stopword membership checks.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Turn a raw plot string into a space-separated string of lemmas.

    The text is lower-cased and split with ``nltk.word_tokenize``. Tokens that
    are in ``stop_words`` or are not purely alphanumeric are discarded; the
    remaining tokens are passed through ``lemmatizer.lemmatize`` and joined
    with single spaces.
    """
    kept = []
    for word in nltk.word_tokenize(text.lower()):
        # Skip stopwords first, then punctuation / mixed-symbol tokens.
        if word in stop_words or not word.isalnum():
            continue
        kept.append(lemmatizer.lemmatize(word))
    return ' '.join(kept)

# Add the cleaned 'processed_plot' column to both corpora (CMU first, then
# IMDb); each value is preprocess_text applied to the row's 'plot'.
for corpus in (cmu_data, imdb_data):
    corpus['processed_plot'] = corpus['plot'].apply(preprocess_text)

# Choose either the CMU embeddings or the Google News embeddings

# Word2vec embeddings, trained on the CMU dataset
# word2vec_model = KeyedVectors.load('word2vec_model_from_cmu_utf8.bin')

# Google news word2vec embeddings. `api` is gensim.downloader, imported with
# the other dependencies in the import cell at the top of the notebook
# rather than in the middle of this cell.
word2vec_model = api.load("word2vec-google-news-300")

### CMU

In [None]:
# Tokenize and pad the text sequences
# The tokenizer's vocabulary is built from the CMU plots only; every plot is
# then mapped to word indices and padded/truncated to 300 tokens.
cmu_tokenizer = Tokenizer()
cmu_tokenizer.fit_on_texts(cmu_data['processed_plot'])
cmu_sequences = cmu_tokenizer.texts_to_sequences(cmu_data['processed_plot'])
cmu_word_index = cmu_tokenizer.word_index
cmu_padded_sequences = pad_sequences(cmu_sequences, maxlen=300)

# Label encoding
# 'genre' is a '|'-separated string that is replaced in place by a list. The
# isinstance guard makes this cell safe to re-run: on a second execution the
# values are already lists, and calling .split on them would raise.
cmu_data['genre'] = cmu_data['genre'].apply(
    lambda x: x.split('|') if isinstance(x, str) else x)
cmu_mlb = MultiLabelBinarizer()
# One multi-hot row per movie; column order is cmu_mlb.classes_.
cmu_genres_encoded = cmu_mlb.fit_transform(cmu_data['genre'])

# Train-test split
cmu_X_train, cmu_X_test, cmu_y_train, cmu_y_test = train_test_split(
    cmu_padded_sequences, cmu_genres_encoded, test_size=0.2, random_state=RANDOM_STATE)

### IMDb

In [None]:
# Tokenize and pad the text sequences
# The tokenizer's vocabulary is built from the IMDb plots only; every plot is
# then mapped to word indices and padded/truncated to 300 tokens.
imdb_tokenizer = Tokenizer()
imdb_tokenizer.fit_on_texts(imdb_data['processed_plot'])
imdb_sequences = imdb_tokenizer.texts_to_sequences(imdb_data['processed_plot'])
imdb_word_index = imdb_tokenizer.word_index
imdb_padded_sequences = pad_sequences(imdb_sequences, maxlen=300)

# Label encoding
# 'genre' is a '|'-separated string that is replaced in place by a list. The
# isinstance guard makes this cell safe to re-run: on a second execution the
# values are already lists, and calling .split on them would raise.
imdb_data['genre'] = imdb_data['genre'].apply(
    lambda x: x.split('|') if isinstance(x, str) else x)
imdb_mlb = MultiLabelBinarizer()
# One multi-hot row per movie; column order is imdb_mlb.classes_.
imdb_genres_encoded = imdb_mlb.fit_transform(imdb_data['genre'])

# Train-test split
imdb_X_train, imdb_X_test, imdb_y_train, imdb_y_test = train_test_split(
    imdb_padded_sequences, imdb_genres_encoded, test_size=0.2, random_state=RANDOM_STATE)

# Model preparation

### CMU

In [None]:
# Build the embedding matrix for the CMU vocabulary from the loaded word2vec
# vectors. Row i holds the vector of the word whose tokenizer index is i;
# words missing from the embeddings (and row 0, which no word index maps to)
# stay all-zero.
#
# A single code path serves both embedding sources: if the loaded object
# exposes its vectors under `.wv` (a full Word2Vec model) that attribute is
# used, otherwise the object itself is treated as the keyed vectors. This
# replaces the former commented-out variant that differed only in `.wv`.
cmu_vectors = getattr(word2vec_model, 'wv', word2vec_model)
cmu_embedding_dim = cmu_vectors.vector_size
cmu_embedding_matrix = np.zeros((len(cmu_word_index) + 1, cmu_embedding_dim))
for word, i in cmu_word_index.items():
    if word in cmu_vectors:
        cmu_embedding_matrix[i] = cmu_vectors[word]

In [None]:
# Define and compile the LSTM model
# Frozen pre-trained embeddings feed two stacked LSTM layers; the output layer
# has one unit per CMU genre.
model = Sequential()
model.add(Embedding(len(cmu_word_index) + 1, cmu_embedding_dim, embeddings_initializer=Constant(
    cmu_embedding_matrix), trainable=False))
model.add(LSTM(128, dropout=0.25, recurrent_dropout=0.25, return_sequences=True))
model.add(LSTM(64, dropout=0.25, recurrent_dropout=0.25))
# The targets are multi-hot rows from MultiLabelBinarizer (a movie can have
# several genres), so each genre is an independent yes/no decision: sigmoid
# outputs with binary cross-entropy. Softmax + categorical cross-entropy
# forces the scores to sum to 1 and assumes exactly one true class per sample.
model.add(Dense(len(cmu_mlb.classes_), activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
# NOTE: validation_data is the held-out test split, so the per-epoch
# validation figures are test-set figures.
model.fit(cmu_X_train, cmu_y_train, batch_size=64, epochs=7, validation_data=(
    cmu_X_test, cmu_y_test))

In [None]:
# Make predictions on the CMU test set
# A genre is predicted when its score exceeds 0.35.
cmu_y_pred = model.predict(cmu_X_test)
cmu_y_pred_classes = (cmu_y_pred > 0.35).astype(int)

# Make predictions on the IMDb test set
# imdb_X_test was encoded with imdb_tokenizer, whose word indices do not match
# the rows of this model's embedding layer (indexed by cmu_tokenizer). Feeding
# it directly would look up unrelated or out-of-range embeddings. Instead,
# recover the IMDb test plots and encode them with the CMU tokenizer.
# Repeating the split with the same test_size and random_state on a column of
# the same frame selects the same rows, so the result stays aligned with
# imdb_y_test.
imdb_test_plots = train_test_split(
    imdb_data['processed_plot'], test_size=0.2, random_state=RANDOM_STATE)[1]
assert len(imdb_test_plots) == len(imdb_y_test), 'IMDb test rows out of sync with labels'
# Words unknown to cmu_tokenizer are dropped by texts_to_sequences.
imdb_X_test_cmu_vocab = pad_sequences(
    cmu_tokenizer.texts_to_sequences(imdb_test_plots), maxlen=300)
cmu_y_pred_imdb = model.predict(imdb_X_test_cmu_vocab)
cmu_y_pred_imdb_classes = (cmu_y_pred_imdb > 0.35).astype(int)

### IMDb

In [None]:
# Build the embedding matrix for the IMDb vocabulary from the loaded word2vec
# vectors. Row i holds the vector of the word whose tokenizer index is i;
# words missing from the embeddings (and row 0, which no word index maps to)
# stay all-zero.
#
# A single code path serves both embedding sources: if the loaded object
# exposes its vectors under `.wv` (a full Word2Vec model) that attribute is
# used, otherwise the object itself is treated as the keyed vectors. This
# replaces the former commented-out variant that differed only in `.wv`.
imdb_vectors = getattr(word2vec_model, 'wv', word2vec_model)
imdb_embedding_dim = imdb_vectors.vector_size
imdb_embedding_matrix = np.zeros((len(imdb_word_index) + 1, imdb_embedding_dim))
for word, i in imdb_word_index.items():
    if word in imdb_vectors:
        imdb_embedding_matrix[i] = imdb_vectors[word]

In [None]:
# Define and compile the LSTM model
# Frozen pre-trained embeddings feed two stacked LSTM layers; the output layer
# has one unit per IMDb genre. This rebinds `model`, so predictions from the
# previously trained model must already have been computed.
model = Sequential()
model.add(Embedding(len(imdb_word_index) + 1, imdb_embedding_dim, embeddings_initializer=Constant(
    imdb_embedding_matrix), trainable=False))
model.add(LSTM(128, dropout=0.25, recurrent_dropout=0.25, return_sequences=True))
model.add(LSTM(64, dropout=0.25, recurrent_dropout=0.25))
# The targets are multi-hot rows from MultiLabelBinarizer (a movie can have
# several genres), so each genre is an independent yes/no decision: sigmoid
# outputs with binary cross-entropy. Softmax + categorical cross-entropy
# forces the scores to sum to 1 and assumes exactly one true class per sample.
model.add(Dense(len(imdb_mlb.classes_), activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
# NOTE: validation_data is the held-out test split, so the per-epoch
# validation figures are test-set figures.
model.fit(imdb_X_train, imdb_y_train, batch_size=64, epochs=7, validation_data=(
    imdb_X_test, imdb_y_test))

In [None]:
# Make predictions on the IMDb test set
# A genre is predicted when its score exceeds 0.35.
imdb_y_pred = model.predict(imdb_X_test)
imdb_y_pred_classes = (imdb_y_pred > 0.35).astype(int)

# Make predictions on the CMU test set
# cmu_X_test was encoded with cmu_tokenizer, whose word indices do not match
# the rows of this model's embedding layer (indexed by imdb_tokenizer).
# Feeding it directly would look up unrelated or out-of-range embeddings.
# Instead, recover the CMU test plots and encode them with the IMDb tokenizer.
# Repeating the split with the same test_size and random_state on a column of
# the same frame selects the same rows, so the result stays aligned with
# cmu_y_test.
cmu_test_plots = train_test_split(
    cmu_data['processed_plot'], test_size=0.2, random_state=RANDOM_STATE)[1]
assert len(cmu_test_plots) == len(cmu_y_test), 'CMU test rows out of sync with labels'
# Words unknown to imdb_tokenizer are dropped by texts_to_sequences.
cmu_X_test_imdb_vocab = pad_sequences(
    imdb_tokenizer.texts_to_sequences(cmu_test_plots), maxlen=300)
imdb_y_pred_cmu = model.predict(cmu_X_test_imdb_vocab)
imdb_y_pred_cmu_classes = (imdb_y_pred_cmu > 0.35).astype(int)

# Evaluation

## CMU

### CMU on CMU

In [None]:
# Overall scores of the CMU-trained model on the CMU test split, computed
# with both micro and macro averaging over the genre columns.
precision_micro, recall_micro, f1_micro = (
    scorer(cmu_y_test, cmu_y_pred_classes, average='micro')
    for scorer in (precision_score, recall_score, f1_score)
)
precision_macro, recall_macro, f1_macro = (
    scorer(cmu_y_test, cmu_y_pred_classes, average='macro')
    for scorer in (precision_score, recall_score, f1_score)
)

# Report: micro block, blank line, macro block.
print(
    f'Micro Precision: {precision_micro}',
    f'Micro Recall: {recall_micro}',
    f'Micro F1-score: {f1_micro}',
    '',
    f'Macro Precision: {precision_macro}',
    f'Macro Recall: {recall_macro}',
    f'Macro F1-score: {f1_macro}',
    sep='\n',
)

In [None]:
# Calculate evaluation metrics for each genre
# Column i of cmu_y_test / cmu_y_pred_classes corresponds to
# cmu_mlb.classes_[i] (the binarizer fitted on the CMU genres), so the genre
# names must come from cmu_mlb. Using the IMDb binarizer here would attach
# the wrong names, or index past the last column if it has more classes.
cmu_genre_scores = {}
for i, genre in enumerate(cmu_mlb.classes_):
    # Treat each genre as its own binary problem.
    genre_true = cmu_y_test[:, i]
    genre_pred = cmu_y_pred_classes[:, i]

    cmu_genre_scores[genre] = {'Accuracy': accuracy_score(genre_true, genre_pred),
                               'Precision': precision_score(genre_true, genre_pred),
                               'Recall': recall_score(genre_true, genre_pred),
                               'F1-score': f1_score(genre_true, genre_pred)}

# Print scores for each genre
for genre, scores in cmu_genre_scores.items():
    print(f'Genre: {genre}')
    print(f'Accuracy: {scores["Accuracy"]}')
    print(f'Precision: {scores["Precision"]}')
    print(f'Recall: {scores["Recall"]}')
    print(f'F1-score: {scores["F1-score"]}')
    print()

### CMU on IMDb

In [None]:
# Overall scores of the CMU-trained model on the IMDb test split.
# accuracy_score on multi-label indicator arrays is exact-match (subset)
# accuracy; the other three are micro-averaged over all genre columns.
# NOTE(review): this compares labels encoded by imdb_mlb with predictions in
# cmu_mlb's column order - presumably both binarizers share the same classes;
# confirm.
accuracy = accuracy_score(imdb_y_test, cmu_y_pred_imdb_classes)
precision, recall, f1 = (
    scorer(imdb_y_test, cmu_y_pred_imdb_classes, average='micro')
    for scorer in (precision_score, recall_score, f1_score)
)

print(
    f'Accuracy: {accuracy}',
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1-score: {f1}',
    sep='\n',
)

In [None]:
# Per-genre scores of the CMU-trained model on the IMDb test split. Each
# genre column is scored as a separate binary problem; genre names are taken
# from cmu_mlb, the label space the predictions were produced in.
# NOTE(review): imdb_y_test columns follow imdb_mlb - presumably identical to
# cmu_mlb's classes; confirm.
genre_scores = {}
for col, label in enumerate(cmu_mlb.classes_):
    truth = imdb_y_test[:, col]
    guess = cmu_y_pred_imdb_classes[:, col]
    genre_scores[label] = {
        'Accuracy': accuracy_score(truth, guess),
        'Precision': precision_score(truth, guess),
        'Recall': recall_score(truth, guess),
        'F1-score': f1_score(truth, guess),
    }

# One block per genre, separated by a blank line.
for label, metrics in genre_scores.items():
    print(f'Genre: {label}')
    for metric_name, value in metrics.items():
        print(f'{metric_name}: {value}')
    print()

## IMDb

### IMDb on IMDb

In [None]:
# Overall scores of the IMDb-trained model on the IMDb test split, computed
# with both micro and macro averaging over the genre columns.
precision_micro, recall_micro, f1_micro = (
    scorer(imdb_y_test, imdb_y_pred_classes, average='micro')
    for scorer in (precision_score, recall_score, f1_score)
)
precision_macro, recall_macro, f1_macro = (
    scorer(imdb_y_test, imdb_y_pred_classes, average='macro')
    for scorer in (precision_score, recall_score, f1_score)
)

# Report: micro block, blank line, macro block.
print(
    f'Precision (micro): {precision_micro}',
    f'Recall (micro): {recall_micro}',
    f'F1-score (micro): {f1_micro}',
    '',
    f'Precision (macro): {precision_macro}',
    f'Recall (macro): {recall_macro}',
    f'F1-score (macro): {f1_macro}',
    sep='\n',
)


In [None]:
# Per-genre scores of the IMDb-trained model on the IMDb test split. Each
# genre column (ordered as imdb_mlb.classes_) is scored as a separate binary
# problem.
imdb_genre_scores = {}
for col, label in enumerate(imdb_mlb.classes_):
    truth = imdb_y_test[:, col]
    guess = imdb_y_pred_classes[:, col]
    imdb_genre_scores[label] = {
        'Accuracy': accuracy_score(truth, guess),
        'Precision': precision_score(truth, guess),
        'Recall': recall_score(truth, guess),
        'F1-score': f1_score(truth, guess),
    }

# One block per genre, separated by a blank line.
for label, metrics in imdb_genre_scores.items():
    print(f'Genre: {label}')
    for metric_name, value in metrics.items():
        print(f'{metric_name}: {value}')
    print()

### IMDb on CMU

In [None]:
# Overall scores of the IMDb-trained model on the CMU test split.
# accuracy_score on multi-label indicator arrays is exact-match (subset)
# accuracy; the other three are micro-averaged over all genre columns.
# NOTE(review): this compares labels encoded by cmu_mlb with predictions in
# imdb_mlb's column order - presumably both binarizers share the same
# classes; confirm.
accuracy = accuracy_score(cmu_y_test, imdb_y_pred_cmu_classes)
precision, recall, f1 = (
    scorer(cmu_y_test, imdb_y_pred_cmu_classes, average='micro')
    for scorer in (precision_score, recall_score, f1_score)
)

print(
    f'Accuracy: {accuracy}',
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1-score: {f1}',
    sep='\n',
)

In [None]:
# Per-genre scores of the IMDb-trained model on the CMU test split. Each
# genre column is scored as a separate binary problem; genre names are taken
# from imdb_mlb, the label space the predictions were produced in.
# NOTE(review): this rebinds imdb_genre_scores, replacing the IMDb-on-IMDb
# per-genre results stored under the same name by the earlier cell.
# NOTE(review): cmu_y_test columns follow cmu_mlb - presumably identical to
# imdb_mlb's classes; confirm.
imdb_genre_scores = {}
for col, label in enumerate(imdb_mlb.classes_):
    truth = cmu_y_test[:, col]
    guess = imdb_y_pred_cmu_classes[:, col]
    imdb_genre_scores[label] = {
        'Accuracy': accuracy_score(truth, guess),
        'Precision': precision_score(truth, guess),
        'Recall': recall_score(truth, guess),
        'F1-score': f1_score(truth, guess),
    }

# One block per genre, separated by a blank line.
for label, metrics in imdb_genre_scores.items():
    print(f'Genre: {label}')
    for metric_name, value in metrics.items():
        print(f'{metric_name}: {value}')
    print()