In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec

In [2]:
# Load the dataset
data = pd.read_csv('cmu_dataset_prepared.csv')

# Preprocess the text
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Tokenize, remove stopwords, and lemmatize
    tokens = nltk.word_tokenize(text.lower())
    filtered_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words and token.isalnum()]
    return ' '.join(filtered_tokens)

data['processed_plot'] = data['plot'].apply(preprocess_text)

#WORD2VEC
# Train Word2Vec model
from nltk.tokenize import word_tokenize

# Tokenize each plot into a list of words
tokenized_plots = [word_tokenize(plot) for plot in data['processed_plot']]

# Train the Word2Vec model on the tokenized plots
word2vec_model = Word2Vec(sentences=tokenized_plots, vector_size=100, window=5)

# Preprocess the genres
data['genres'] = data['genre'].apply(lambda x: x.split('|'))
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(data['genres'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\David\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\David\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# similar words to 'love'
word2vec_model.wv.most_similar('laugh')

[('smile', 0.9594844579696655),
 ('reflection', 0.955476701259613),
 ('whisper', 0.9546499252319336),
 ('shake', 0.9496319890022278),
 ('cyrus', 0.9484403729438782),
 ('garfield', 0.94843989610672),
 ('bell', 0.9467282891273499),
 ('pooh', 0.9463261365890503),
 ('porky', 0.9455065727233887),
 ('there', 0.9441907405853271)]

In [4]:
# Load the pre-trained GloVe embeddings
"""embeddings_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
"""
# Tokenize and pad the text sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['processed_plot'])
sequences = tokenizer.texts_to_sequences(data['processed_plot'])
word_index = tokenizer.word_index
padded_sequences = pad_sequences(sequences, maxlen=300)

# Create the embedding matrix
embedding_matrix = np.zeros((len(word_index) + 1, word2vec_model.vector_size))
for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]

In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Define the LSTM model
embedding_layer = Embedding(48699, 100, trainable=False)
embedding_layer.build((None,))  # if you don't do this, the next step won't work
embedding_layer.set_weights([embedding_matrix])

model = Sequential()
model.add(embedding_layer)
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(len(mlb.classes_), activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, genres_encoded, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train, batch_size=64, epochs=10, validation_data=(X_test, y_test))

Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 280ms/step - accuracy: 0.3230 - loss: 0.5694 - val_accuracy: 0.4283 - val_loss: 0.5075
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 268ms/step - accuracy: 0.4074 - loss: 0.5229 - val_accuracy: 0.4425 - val_loss: 0.5130
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 272ms/step - accuracy: 0.4412 - loss: 0.5097 - val_accuracy: 0.4758 - val_loss: 0.4887
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 263ms/step - accuracy: 0.4564 - loss: 0.4991 - val_accuracy: 0.5000 - val_loss: 0.4796
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 266ms/step - accuracy: 0.4826 - loss: 0.4873 - val_accuracy: 0.5000 - val_loss: 0.4812
Epoch 6/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 254ms/step - accuracy: 0.5003 - loss: 0.4828 - val_accuracy: 0.5250 - val_loss: 0.4675
Epoch 7/10
[1m75/75[

<keras.src.callbacks.history.History at 0x2a5717c39d0>

In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Make predictions on the test set
y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype(int)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred_classes)
precision = precision_score(y_test, y_pred_classes, average='micro')
recall = recall_score(y_test, y_pred_classes, average='micro')
f1 = f1_score(y_test, y_pred_classes, average='micro')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')

[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 71ms/step
Accuracy: 0.43833333333333335
Precision: 0.6666666666666666
Recall: 0.44333333333333336
F1-score: 0.5325325325325326


In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Calculate evaluation metrics for each genre
genre_scores = {}
for i, genre in enumerate(mlb.classes_):
    genre_accuracy = accuracy_score(y_test[:, i], y_pred_classes[:, i])
    genre_precision = precision_score(y_test[:, i], y_pred_classes[:, i])
    genre_recall = recall_score(y_test[:, i], y_pred_classes[:, i])
    genre_f1 = f1_score(y_test[:, i], y_pred_classes[:, i])
    
    genre_scores[genre] = {'Accuracy': genre_accuracy,
                           'Precision': genre_precision,
                           'Recall': genre_recall,
                           'F1-score': genre_f1}

# Print scores for each genre
for genre, scores in genre_scores.items():
    print(f'Genre: {genre}')
    print(f'Accuracy: {scores["Accuracy"]}')
    print(f'Precision: {scores["Precision"]}')
    print(f'Recall: {scores["Recall"]}')
    print(f'F1-score: {scores["F1-score"]}')
    print()


Genre: Comedy
Accuracy: 0.8025
Precision: 0.7030567685589519
Recall: 0.48787878787878786
F1-score: 0.5760286225402504

Genre: Drama
Accuracy: 0.76
Precision: 0.5277777777777778
Recall: 0.3187919463087248
F1-score: 0.39748953974895396

Genre: Horror
Accuracy: 0.8441666666666666
Precision: 0.7068965517241379
Recall: 0.5795053003533569
F1-score: 0.6368932038834951

Genre: Thriller
Accuracy: 0.815
Precision: 0.7133757961783439
Recall: 0.3875432525951557
F1-score: 0.5022421524663677



In [8]:
# Count the number of times each genre is predicted
genre_counts = {}
for i, genre in enumerate(mlb.classes_):
    genre_count = np.sum(y_pred_classes[:, i])
    genre_counts[genre] = genre_count

# Print the counts for each genre
for genre, count in genre_counts.items():
    print(f"Predicted {genre}: {count} times")


Predicted Comedy: 229 times
Predicted Drama: 180 times
Predicted Horror: 232 times
Predicted Thriller: 157 times
