# Movie Genre Classification with LSTM

In [11]:
import pandas as pd
import numpy as np
import nltk
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.initializers import Constant
from gensim.models import KeyedVectors

RANDOM_STATE = 1212

cmu_data = pd.read_csv('cmu_data_final.csv')
imdb_data = pd.read_csv('imdb_data_final.csv')

# Dataset preparation

In [12]:
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Tokenize, remove stopwords, and lemmatize
    tokens = nltk.word_tokenize(text.lower())
    filtered_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words and token.isalnum()]
    return ' '.join(filtered_tokens)

cmu_data['processed_plot'] = cmu_data['plot'].apply(preprocess_text)
imdb_data['processed_plot'] = imdb_data['plot'].apply(preprocess_text)

imdb_data.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\David\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\David\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,movie_name,year,genre,plot,processed_plot
0,"Go West, Young Man",1936,Comedy,"Movie star Mavis Arden, as amorous in private ...",movie star mavis arden amorous private pure pu...
1,Re-Animated,2006,Comedy,Jimmy is the kid everybody ignores and uses. O...,jimmy kid everybody ignores us one day get fre...
2,Blue Blazes,1936,Comedy,"Buster becomes a fireman, but unfortunately no...",buster becomes fireman unfortunately particula...
3,Meet the Baron,1933,Comedy,The famous Baron Munchausen dumps two dimwits ...,famous baron munchausen dump two dimwit africa...
4,I Got the Hook Up,1998,Comedy,Two broke buddies feel lucky when they come up...,two broke buddy feel lucky come upon truckload...


In [13]:
# Word2vec embeddings, trained on the CMU dataset
word2vec_model = KeyedVectors.load('word2vec_model_from_cmu_utf8.bin')

### CMU

In [14]:
# Tokenize and pad the text sequences
cmu_tokenizer = Tokenizer()
cmu_tokenizer.fit_on_texts(cmu_data['processed_plot'])
cmu_sequences = cmu_tokenizer.texts_to_sequences(cmu_data['processed_plot'])
word_index = cmu_tokenizer.word_index
cmu_padded_sequences = pad_sequences(cmu_sequences, maxlen=300)

# Label encoding
cmu_data['genre'] = cmu_data['genre'].apply(lambda x: x.split('|'))
cmu_mlb = MultiLabelBinarizer()
cmu_genres_encoded = cmu_mlb.fit_transform(cmu_data['genre'])

# Train-test split
cmu_X_train, cmu_X_test, cmu_y_train, cmu_y_test = train_test_split(cmu_padded_sequences, cmu_genres_encoded, test_size=0.2, random_state=RANDOM_STATE)

### IMDb

# Model preparation

In [15]:
# Create the word embedding matrix
cmu_embedding_dim = word2vec_model.vector_size
cmu_embedding_matrix = np.zeros((len(word_index) + 1, cmu_embedding_dim))
for word, i in word_index.items():
    if word in word2vec_model.wv:
        cmu_embedding_matrix[i] = word2vec_model.wv[word]

In [16]:
# Define and compile the LSTM model
model = Sequential()
model.add(Embedding(len(word_index) + 1, cmu_embedding_dim, embeddings_initializer=Constant(cmu_embedding_matrix), trainable=True))
model.add(LSTM(192, dropout=0.25, recurrent_dropout=0.25))
model.add(Dense(len(cmu_mlb.classes_), activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(cmu_X_train, cmu_y_train, batch_size=64, epochs=5, validation_data=(cmu_X_test, cmu_y_test))

Epoch 1/5
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 653ms/step - accuracy: 0.3635 - loss: 0.5588 - val_accuracy: 0.5520 - val_loss: 0.4640
Epoch 2/5
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 612ms/step - accuracy: 0.4987 - loss: 0.4766 - val_accuracy: 0.6148 - val_loss: 0.4152
Epoch 3/5
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 600ms/step - accuracy: 0.6160 - loss: 0.4097 - val_accuracy: 0.6288 - val_loss: 0.3995
Epoch 4/5
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 615ms/step - accuracy: 0.6697 - loss: 0.3665 - val_accuracy: 0.6576 - val_loss: 0.3844
Epoch 5/5
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 608ms/step - accuracy: 0.7147 - loss: 0.3205 - val_accuracy: 0.6480 - val_loss: 0.3936


<keras.src.callbacks.history.History at 0x157b4497050>

In [17]:
# Make predictions on the test set
cmu_y_pred = model.predict(cmu_X_test)
cmu_y_pred_classes = (cmu_y_pred > 0.5).astype(int) #0.5 här är att den uppskattade sannolikheten måste vara över 0.5. Annars blir den None (gör stor inpact på precision/recall men är oftast en trade off mellan dom båda)

[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 102ms/step


# Evaluation

In [18]:
# Calculate evaluation metrics
accuracy = accuracy_score(cmu_y_test, cmu_y_pred_classes)
precision = precision_score(cmu_y_test, cmu_y_pred_classes, average='micro')
recall = recall_score(cmu_y_test, cmu_y_pred_classes, average='micro')
f1 = f1_score(cmu_y_test, cmu_y_pred_classes, average='micro')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')

Accuracy: 0.5676855895196506
Precision: 0.6970021413276232
Recall: 0.5685589519650655
F1-score: 0.6262626262626263


In [19]:
# Calculate evaluation metrics for each genre
genre_scores = {}
for i, genre in enumerate(cmu_mlb.classes_):
    genre_accuracy = accuracy_score(cmu_y_test[:, i], cmu_y_pred_classes[:, i])
    genre_precision = precision_score(cmu_y_test[:, i], cmu_y_pred_classes[:, i])
    genre_recall = recall_score(cmu_y_test[:, i], cmu_y_pred_classes[:, i])
    genre_f1 = f1_score(cmu_y_test[:, i], cmu_y_pred_classes[:, i])
    
    genre_scores[genre] = {'Accuracy': genre_accuracy,
                           'Precision': genre_precision,
                           'Recall': genre_recall,
                           'F1-score': genre_f1}

# Print scores for each genre
for genre, scores in genre_scores.items():
    print(f'Genre: {genre}')
    print(f'Accuracy: {scores["Accuracy"]}')
    print(f'Precision: {scores["Precision"]}')
    print(f'Recall: {scores["Recall"]}')
    print(f'F1-score: {scores["F1-score"]}')
    print()

Genre: Comedy
Accuracy: 0.8375545851528384
Precision: 0.7478260869565218
Recall: 0.5733333333333334
F1-score: 0.6490566037735849

Genre: Drama
Accuracy: 0.8034934497816594
Precision: 0.6259259259259259
Recall: 0.5767918088737202
F1-score: 0.6003552397868561

Genre: Horror
Accuracy: 0.8724890829694323
Precision: 0.7237354085603113
Recall: 0.7126436781609196
F1-score: 0.7181467181467182

Genre: Thriller
Accuracy: 0.8078602620087336
Precision: 0.7005649717514124
Recall: 0.4261168384879725
F1-score: 0.5299145299145299

