# Movie Genre Classification with LSTM

In [1]:
import pandas as pd
import numpy as np
import nltk
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.initializers import Constant
from gensim.models import KeyedVectors

# Single seed shared by the train/test splits and the framework-level RNGs.
RANDOM_STATE = 1212

# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation, dropout masks and batch shuffling are repeatable after a
# kernel restart (the train/test splits already pass RANDOM_STATE explicitly).
tf.keras.utils.set_random_seed(RANDOM_STATE)

# Load the two datasets; both are expected to provide 'plot' and 'genre'
# columns, which the cells below read. Paths are relative to the working directory.
cmu_data = pd.read_csv('cmu_data_final.csv')
imdb_data = pd.read_csv('imdb_data_final.csv')

# Dataset preparation

In [2]:
# Fetch every NLTK resource the preprocessing step relies on.
# 'punkt' / 'punkt_tab' back nltk.word_tokenize (which of the two is looked up
# depends on the installed NLTK version); without them a fresh environment
# raises LookupError on the first call to preprocess_text.
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Set membership makes the per-token stopword test O(1).
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise a plot summary into a space-separated string of lemmas.

    The text is lower-cased and tokenised with NLTK; English stopwords and
    tokens that are not purely alphanumeric (e.g. punctuation) are dropped,
    and the remaining tokens are lemmatised.

    Args:
        text: Raw plot text. Non-string values (such as the float NaN that
            pandas uses for a missing cell, or None) are treated as an
            empty plot instead of raising AttributeError.

    Returns:
        The processed tokens joined by single spaces; '' for missing input.
    """
    # A missing plot arrives as NaN (a float), which has no .lower().
    if not isinstance(text, str):
        return ''
    tokens = nltk.word_tokenize(text.lower())
    filtered_tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    ]
    return ' '.join(filtered_tokens)

# Add a cleaned 'processed_plot' column to each dataset.
for frame in (cmu_data, imdb_data):
    frame['processed_plot'] = frame['plot'].apply(preprocess_text)

# Pretrained word vectors fetched through gensim's downloader
# ("word2vec-google-news-300"); the same vectors are used for both datasets.
# An earlier variant loaded locally trained vectors with
# KeyedVectors.load('word2vec_model_from_cmu_utf8.bin') instead.
import gensim.downloader as api
word2vec_model = api.load("word2vec-google-news-300")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\David\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\David\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### CMU

In [3]:
# Tokenize and pad the text sequences
cmu_tokenizer = Tokenizer()
cmu_tokenizer.fit_on_texts(cmu_data['processed_plot'])
cmu_sequences = cmu_tokenizer.texts_to_sequences(cmu_data['processed_plot'])
# NOTE: `word_index` is a shared name that the IMDb cell assigns as well;
# cmu_tokenizer.word_index is the unambiguous way to reach the CMU vocabulary.
word_index = cmu_tokenizer.word_index
# Every plot becomes exactly 300 token ids (shorter ones are zero-padded,
# longer ones truncated).
cmu_padded_sequences = pad_sequences(cmu_sequences, maxlen=300)

# Label encoding
# Split only while a cell still holds the raw 'A|B' string, so re-running this
# cell does not fail with "'list' object has no attribute 'split'".
cmu_data['genre'] = cmu_data['genre'].apply(
    lambda x: x.split('|') if isinstance(x, str) else x)
cmu_mlb = MultiLabelBinarizer()
# One 0/1 indicator column per genre, ordered as cmu_mlb.classes_.
cmu_genres_encoded = cmu_mlb.fit_transform(cmu_data['genre'])

# Train-test split
cmu_X_train, cmu_X_test, cmu_y_train, cmu_y_test = train_test_split(
    cmu_padded_sequences, cmu_genres_encoded, test_size=0.2, random_state=RANDOM_STATE)

### IMDb

In [23]:
# Tokenize and pad the text sequences
imdb_tokenizer = Tokenizer()
imdb_tokenizer.fit_on_texts(imdb_data['processed_plot'])
imdb_sequences = imdb_tokenizer.texts_to_sequences(imdb_data['processed_plot'])
# NOTE: `word_index` is a shared name that the CMU cell assigns as well;
# imdb_tokenizer.word_index is the unambiguous way to reach the IMDb vocabulary.
word_index = imdb_tokenizer.word_index
# Every plot becomes exactly 300 token ids (shorter ones are zero-padded,
# longer ones truncated).
imdb_padded_sequences = pad_sequences(imdb_sequences, maxlen=300)

# Label encoding
# Split only while a cell still holds the raw 'A|B' string, so re-running this
# cell does not fail with "'list' object has no attribute 'split'".
imdb_data['genre'] = imdb_data['genre'].apply(
    lambda x: x.split('|') if isinstance(x, str) else x)
imdb_mlb = MultiLabelBinarizer()
# One 0/1 indicator column per genre, ordered as imdb_mlb.classes_.
imdb_genres_encoded = imdb_mlb.fit_transform(imdb_data['genre'])

# Train-test split
imdb_X_train, imdb_X_test, imdb_y_train, imdb_y_test = train_test_split(
    imdb_padded_sequences, imdb_genres_encoded, test_size=0.2, random_state=RANDOM_STATE)

# Model preparation

### CMU

In [4]:
""" # Create the word embedding matrix
cmu_embedding_dim = word2vec_model.vector_size
cmu_embedding_matrix = np.zeros((len(word_index) + 1, cmu_embedding_dim))
for word, i in word_index.items():
    if word in word2vec_model.wv:
        cmu_embedding_matrix[i] = word2vec_model.wv[word] """

# Create the word embedding matrix
# Take the vocabulary from the CMU tokenizer itself. The shared `word_index`
# name is also assigned by the IMDb preparation cell, so after a top-to-bottom
# run it would hold the IMDb vocabulary and the rows built here would not line
# up with the token ids in cmu_X_train. Re-binding the name also keeps any
# later `len(word_index)` use in the CMU cells consistent with this matrix.
word_index = cmu_tokenizer.word_index
cmu_embedding_dim = word2vec_model.vector_size
# Row i holds the pretrained vector of the word whose tokenizer id is i.
# Row 0 (the padding id) and words absent from the pretrained vocabulary
# stay all-zero.
cmu_embedding_matrix = np.zeros((len(word_index) + 1, cmu_embedding_dim))
for word, i in word_index.items():
    if word in word2vec_model:
        cmu_embedding_matrix[i] = word2vec_model[word]

In [5]:
# Define and compile the LSTM model
model = Sequential()
# Size the embedding layer from the matrix it is initialised with rather than
# from the shared `word_index` name (which the IMDb cells also assign), so the
# layer shape always matches the pretrained weights. The vectors stay frozen.
model.add(Embedding(
    cmu_embedding_matrix.shape[0],
    cmu_embedding_dim,
    embeddings_initializer=Constant(cmu_embedding_matrix),
    trainable=False))
model.add(LSTM(192, dropout=0.25, recurrent_dropout=0.25))
# One sigmoid unit per genre: each genre is an independent yes/no decision.
model.add(Dense(len(cmu_mlb.classes_), activation='sigmoid'))

# Compile the model
# NOTE(review): with a multi-unit output Keras likely resolves 'accuracy' to
# categorical (argmax) accuracy, which would not be comparable to the sklearn
# scores computed in the evaluation section - confirm.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
# NOTE: the held-out test split doubles as the validation set here.
model.fit(cmu_X_train, cmu_y_train, batch_size=64, epochs=7, validation_data=(
    cmu_X_test, cmu_y_test))

Epoch 1/7
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 656ms/step - accuracy: 0.3285 - loss: 0.5710 - val_accuracy: 0.4786 - val_loss: 0.4802
Epoch 2/7
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 635ms/step - accuracy: 0.5089 - loss: 0.4744 - val_accuracy: 0.5310 - val_loss: 0.4494
Epoch 3/7
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 635ms/step - accuracy: 0.5513 - loss: 0.4518 - val_accuracy: 0.5397 - val_loss: 0.4484
Epoch 4/7
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 632ms/step - accuracy: 0.5653 - loss: 0.4359 - val_accuracy: 0.6026 - val_loss: 0.4140
Epoch 5/7
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 625ms/step - accuracy: 0.6033 - loss: 0.4129 - val_accuracy: 0.6402 - val_loss: 0.3878
Epoch 6/7
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 625ms/step - accuracy: 0.6185 - loss: 0.4111 - val_accuracy: 0.6341 - val_loss: 0.3922
Epoch 7/7
[1m72/72[0m [32

<keras.src.callbacks.history.History at 0x13ed4315750>

In [6]:
# Make predictions on the test set
cmu_y_pred = model.predict(cmu_X_test)
# Binarise the sigmoid outputs: a genre is predicted (1) only when its
# estimated probability exceeds 0.5, otherwise it is 0. The threshold has a
# large effect on precision/recall and is usually a trade-off between the two.
cmu_y_pred_classes = np.greater(cmu_y_pred, 0.5).astype(int)

[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 123ms/step


### IMDb

In [27]:
""" # Create the word embedding matrix
imdb_embedding_dim = word2vec_model.vector_size
imdb_embedding_matrix = np.zeros((len(word_index) + 1, imdb_embedding_dim))
for word, i in word_index.items():
    if word in word2vec_model.wv:
        imdb_embedding_matrix[i] = word2vec_model.wv[word] """

# Create the word embedding matrix
# Take the vocabulary from the IMDb tokenizer itself. The shared `word_index`
# name is also assigned by the CMU preparation cell, so depending on which
# cell ran last it may hold the CMU vocabulary, and the rows built here would
# then not line up with the token ids in imdb_X_train. Re-binding the name
# also keeps any later `len(word_index)` use in the IMDb cells consistent
# with this matrix.
word_index = imdb_tokenizer.word_index
imdb_embedding_dim = word2vec_model.vector_size
# Row i holds the pretrained vector of the word whose tokenizer id is i.
# Row 0 (the padding id) and words absent from the pretrained vocabulary
# stay all-zero.
imdb_embedding_matrix = np.zeros((len(word_index) + 1, imdb_embedding_dim))
for word, i in word_index.items():
    if word in word2vec_model:
        imdb_embedding_matrix[i] = word2vec_model[word]

In [28]:
# Define and compile the LSTM model
model = Sequential()
# Size the embedding layer from the matrix it is initialised with rather than
# from the shared `word_index` name (which the CMU cells also assign), so the
# layer shape always matches the pretrained weights. The vectors stay frozen.
model.add(Embedding(
    imdb_embedding_matrix.shape[0],
    imdb_embedding_dim,
    embeddings_initializer=Constant(imdb_embedding_matrix),
    trainable=False))
model.add(LSTM(192, dropout=0.25, recurrent_dropout=0.25))
# One sigmoid unit per genre: each genre is an independent yes/no decision.
model.add(Dense(len(imdb_mlb.classes_), activation='sigmoid'))

# Compile the model
# NOTE(review): with a multi-unit output Keras likely resolves 'accuracy' to
# categorical (argmax) accuracy, which would not be comparable to the sklearn
# scores computed in the evaluation section - confirm.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
# NOTE: the held-out test split doubles as the validation set here.
model.fit(imdb_X_train, imdb_y_train, batch_size=64, epochs=7, validation_data=(
    imdb_X_test, imdb_y_test))

Epoch 1/7
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 582ms/step - accuracy: 0.3423 - loss: 0.5653 - val_accuracy: 0.5668 - val_loss: 0.4562
Epoch 2/7
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 574ms/step - accuracy: 0.5531 - loss: 0.4531 - val_accuracy: 0.6017 - val_loss: 0.4187
Epoch 3/7
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 572ms/step - accuracy: 0.6017 - loss: 0.4208 - val_accuracy: 0.6157 - val_loss: 0.4114
Epoch 4/7
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 678ms/step - accuracy: 0.6068 - loss: 0.4091 - val_accuracy: 0.6271 - val_loss: 0.4030
Epoch 5/7
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 762ms/step - accuracy: 0.6291 - loss: 0.4010 - val_accuracy: 0.6498 - val_loss: 0.3804
Epoch 6/7
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 782ms/step - accuracy: 0.6193 - loss: 0.3981 - val_accuracy: 0.6611 - val_loss: 0.3746
Epoch 7/7
[1m72/72[0m [32

<keras.src.callbacks.history.History at 0x213b033f410>

In [29]:
# Make predictions on the test set
imdb_y_pred = model.predict(imdb_X_test)
# Binarise the sigmoid outputs: a genre is predicted (1) only when its
# estimated probability exceeds 0.5, otherwise it is 0. The threshold has a
# large effect on precision/recall and is usually a trade-off between the two.
imdb_y_pred_classes = np.greater(imdb_y_pred, 0.5).astype(int)

[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 107ms/step


# Evaluation

### CMU

In [7]:
# Calculate evaluation metrics
# Accuracy is computed on whole label rows; precision, recall and F1 are
# micro-averaged, i.e. pooled over all (movie, genre) decisions.
accuracy = accuracy_score(cmu_y_test, cmu_y_pred_classes)
precision, recall, f1 = [
    metric(cmu_y_test, cmu_y_pred_classes, average='micro')
    for metric in (precision_score, recall_score, f1_score)
]

print(
    f'Accuracy: {accuracy}',
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1-score: {f1}',
    sep='\n',
)

Accuracy: 0.5318777292576419
Precision: 0.6880630630630631
Recall: 0.5336244541484716
F1-score: 0.6010821446138711


In [8]:
# Calculate evaluation metrics for each genre
genre_scores = {}
for i, genre in enumerate(cmu_mlb.classes_):
    # Column i of both indicator matrices is the binary task
    # "does this movie carry `genre`?".
    y_true_col = cmu_y_test[:, i]
    y_pred_col = cmu_y_pred_classes[:, i]
    genre_scores[genre] = {
        'Accuracy': accuracy_score(y_true_col, y_pred_col),
        'Precision': precision_score(y_true_col, y_pred_col),
        'Recall': recall_score(y_true_col, y_pred_col),
        'F1-score': f1_score(y_true_col, y_pred_col),
    }

# Print scores for each genre
for genre, scores in genre_scores.items():
    print(f'Genre: {genre}')
    for metric_name in ('Accuracy', 'Precision', 'Recall', 'F1-score'):
        print(f'{metric_name}: {scores[metric_name]}')
    print()

Genre: Comedy
Accuracy: 0.8296943231441049
Precision: 0.7027027027027027
Recall: 0.6066666666666667
F1-score: 0.6511627906976745

Genre: Drama
Accuracy: 0.8087336244541484
Precision: 0.6907216494845361
Recall: 0.45733788395904434
F1-score: 0.5503080082135524

Genre: Horror
Accuracy: 0.874235807860262
Precision: 0.6845425867507886
Recall: 0.8314176245210728
F1-score: 0.7508650519031141

Genre: Thriller
Accuracy: 0.7790393013100436
Precision: 0.6610169491525424
Recall: 0.26804123711340205
F1-score: 0.38141809290953543



### IMDb

In [32]:
# Calculate evaluation metrics
# Accuracy is computed on whole label rows; precision, recall and F1 are
# micro-averaged, i.e. pooled over all (movie, genre) decisions.
accuracy = accuracy_score(imdb_y_test, imdb_y_pred_classes)
precision, recall, f1 = [
    metric(imdb_y_test, imdb_y_pred_classes, average='micro')
    for metric in (precision_score, recall_score, f1_score)
]

print(
    f'Accuracy: {accuracy}',
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1-score: {f1}',
    sep='\n',
)

Accuracy: 0.5563318777292576
Precision: 0.7065337763012182
Recall: 0.5572052401746724
F1-score: 0.623046875


In [33]:
# Calculate evaluation metrics for each genre
imdb_genre_scores = {}
for i, genre in enumerate(imdb_mlb.classes_):
    # Column i of both indicator matrices is the binary task
    # "does this movie carry `genre`?".
    y_true_col = imdb_y_test[:, i]
    y_pred_col = imdb_y_pred_classes[:, i]
    imdb_genre_scores[genre] = {
        'Accuracy': accuracy_score(y_true_col, y_pred_col),
        'Precision': precision_score(y_true_col, y_pred_col),
        'Recall': recall_score(y_true_col, y_pred_col),
        'F1-score': f1_score(y_true_col, y_pred_col),
    }

# Print scores for each genre
for genre, scores in imdb_genre_scores.items():
    print(f'Genre: {genre}')
    for metric_name in ('Accuracy', 'Precision', 'Recall', 'F1-score'):
        print(f'{metric_name}: {scores[metric_name]}')
    print()

Genre: Comedy
Accuracy: 0.8078602620087336
Precision: 0.6169590643274854
Recall: 0.7033333333333334
F1-score: 0.6573208722741433

Genre: Drama
Accuracy: 0.8148471615720524
Precision: 0.7454545454545455
Recall: 0.4197952218430034
F1-score: 0.537117903930131

Genre: Horror
Accuracy: 0.879475982532751
Precision: 0.8153846153846154
Recall: 0.6091954022988506
F1-score: 0.6973684210526315

Genre: Thriller
Accuracy: 0.8235807860262009
Precision: 0.7213930348258707
Recall: 0.49828178694158076
F1-score: 0.5894308943089431

