In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [2]:
# Load the dataset
data = pd.read_csv('cmu_dataset_v3.csv')

# Preprocess the text
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Tokenize, remove stopwords, and lemmatize
    tokens = nltk.word_tokenize(text.lower())
    filtered_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words and token.isalnum()]
    return ' '.join(filtered_tokens)

data['processed_plot'] = data['plot'].apply(preprocess_text)

# Preprocess the genres
data['genres'] = data['genre'].apply(lambda x: x.split('|'))
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(data['genres'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\David\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\David\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Load the pre-trained word2vec embeddings
# word2vec_model = KeyedVectors.load_word2vec_format('word2vec_model_from_cmu_utf8.bin', binary=True) # Adjust the path accordingly
# word2vec_model = KeyedVectors.load_word2vec_format('word2vec_model_from_cmu.bin', binary=True, encoding='latin1')
word2vec_model = KeyedVectors.load('word2vec_model_from_cmu_utf8.bin')



# Tokenize and pad the text sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['processed_plot'])
sequences = tokenizer.texts_to_sequences(data['processed_plot'])
word_index = tokenizer.word_index
padded_sequences = pad_sequences(sequences, maxlen=300)

# Create the embedding matrix
embedding_dim = word2vec_model.vector_size  # Adjust the dimensionality according to your word2vec embeddings
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]

import gensim.downloader as api
embeddings = api.load("word2vec-google-news-300")

embedding_dim2 = embeddings.vector_size
embedding_matrix2 = np.zeros((len(word_index) + 1, embedding_dim2))
for word, i in word_index.items():
    if word in embeddings:
        embedding_matrix2[i] = embeddings[word]

In [4]:
from hyperopt import hp, fmin, tpe, Trials
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.initializers import Constant
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Define the search space
space = {
    'batch_size': hp.quniform('batch_size', 35, 70, 1),
    'epochs': hp.choice('epochs', [5, 7, 10]),
    'dropout_rate': hp.uniform('dropout_rate', 0.2, 0.3),
    'lstm_units': hp.quniform('lstm_units', 64, 256, 64),  # Quantized uniform distribution from 32 to 1024 with step size 32
    'optimizer': hp.choice('optimizer', ['Adam', 'Nadam']),  # Optimizer choice
    'trainable': hp.choice('trainable', [True, False]),  # Trainable choice
    'embedding': hp.choice('embedding', [
        {
            'embedding_matrix': embedding_matrix,
            'embedding_dim': embedding_dim
        },
        {
            'embedding_matrix': embedding_matrix2,
            'embedding_dim': embedding_dim2
        }
    ])  # Embedding matrix choice
}

# Define objective function
def objective(params):
    # Define the LSTM model
    model = Sequential()
    model.add(Embedding(len(word_index) + 1, params['embedding']['embedding_dim'], embeddings_initializer=Constant(params['embedding']['embedding_matrix']), trainable=params['trainable']))
    model.add(LSTM(int(params['lstm_units']), dropout=params['dropout_rate'], recurrent_dropout=params['dropout_rate']))
    model.add(Dense(len(mlb.classes_), activation='sigmoid'))
    
    # Compile the model with selected optimizer
    optimizer = params['optimizer']
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    # Train the model
    model.fit(X_train, y_train, batch_size=int(params['batch_size']), epochs=params['epochs'], verbose=0, validation_data=(X_test, y_test))
    
    # Predict the labels
    y_pred = model.predict(X_test)
    y_pred = (y_pred > 0.5).astype(int)
    
    # Compute the micro F1 score
    f1 = f1_score(y_test, y_pred, average='micro')
    
    return -f1  # Minimize negative F1 score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, genres_encoded, test_size=0.2, random_state=42)

# Define and run the hyperparameter optimization
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=20, trials=trials)

# Print the best hyperparameters found
print("Best hyperparameters:", best)


[1m 1/38[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m10s[0m 280ms/step
[1m 2/38[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 78ms/step  
[1m 3/38[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 72ms/step
[1m 4/38[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 69ms/step
[1m 5/38[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 67ms/step
[1m 6/38[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m2s[0m 67ms/step
[1m 7/38[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m2s[0m 67ms/step
[1m 8/38[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m1s[0m 66ms/step
[1m 9/38[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m1s[0m 66ms/step
[1m10/38[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m1s[0m 66ms/step
[1m11/38[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m1s[0m 66ms/step
[1m12/38[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m1s[0m 65ms/step
[1m13/38[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m1s[0m 65ms/step
[1m14/38[0m [32m━━━━━━━[0m[37m━━━━━━━━

In [7]:
# Convert trials to a DataFrame
trial_results = pd.DataFrame(trials.results)

# Add the hyperparameters to the DataFrame
for key in best.keys():
    trial_results[key] = [trial['misc']['vals'][key] for trial in trials.trials]

# Compute the F1 score from the loss
trial_results['f1_score'] = -trial_results['loss']

# Sort by F1 score
trial_results = trial_results.sort_values('f1_score', ascending=False)

# Print the DataFrame
print(trial_results)

        loss status batch_size           dropout_rate embedding epochs  \
11 -0.651489     ok     [52.0]  [0.23676145168153234]       [0]    [0]   
0  -0.649228     ok     [67.0]  [0.25372457621043853]       [0]    [2]   
8  -0.648228     ok     [52.0]  [0.20085442469213008]       [0]    [1]   
18 -0.646141     ok     [49.0]  [0.23072259319920463]       [0]    [1]   
12 -0.645105     ok     [39.0]  [0.26584711934433664]       [0]    [0]   
19 -0.644319     ok     [53.0]   [0.2474274733302211]       [0]    [1]   
14 -0.638735     ok     [45.0]   [0.2846064954751698]       [0]    [2]   
6  -0.634298     ok     [47.0]  [0.25317886970503906]       [0]    [0]   
3  -0.626654     ok     [56.0]   [0.2829457070209219]       [1]    [2]   
15 -0.623674     ok     [48.0]  [0.21264575356609372]       [1]    [0]   
4  -0.621334     ok     [56.0]  [0.24129906103816395]       [1]    [1]   
13 -0.620017     ok     [53.0]  [0.23607286328252586]       [1]    [2]   
17 -0.617988     ok     [45.0]  [0.284

In [6]:
""" from sklearn.model_selection import GridSearchCV
from scikeras.wrappers import KerasClassifier 
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.initializers import Constant
from keras.optimizers import SGD, RMSprop, Adam, Nadam

X_train, X_test, y_train, y_test = train_test_split(padded_sequences, genres_encoded, test_size=0.2, random_state=42)


# Define the function to create the LSTM model
def create_model(dropout_rate=0.0):
    model = Sequential()
    model.add(Embedding(len(word_index) + 1, embedding_dim, embeddings_initializer=Constant(embedding_matrix), trainable=True))
    model.add(LSTM(128, dropout=dropout_rate, recurrent_dropout=dropout_rate))
    model.add(Dense(len(mlb.classes_), activation='sigmoid'))
    return model

# Create the KerasClassifier
model = KerasClassifier(build_fn=create_model, loss='binary_crossentropy', metrics=['accuracy'], epochs=5, batch_size=64, verbose=0)

# Define the grid search parameters
param_grid = dict(dropout_rate=[0.1, 0.2, 0.3])

# Perform grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X_train, y_train)

# Summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

 """

' from sklearn.model_selection import GridSearchCV\nfrom scikeras.wrappers import KerasClassifier \nfrom keras.models import Sequential\nfrom keras.layers import Embedding, LSTM, Dense\nfrom keras.initializers import Constant\nfrom keras.optimizers import SGD, RMSprop, Adam, Nadam\n\nX_train, X_test, y_train, y_test = train_test_split(padded_sequences, genres_encoded, test_size=0.2, random_state=42)\n\n\n# Define the function to create the LSTM model\ndef create_model(dropout_rate=0.0):\n    model = Sequential()\n    model.add(Embedding(len(word_index) + 1, embedding_dim, embeddings_initializer=Constant(embedding_matrix), trainable=True))\n    model.add(LSTM(128, dropout=dropout_rate, recurrent_dropout=dropout_rate))\n    model.add(Dense(len(mlb.classes_), activation=\'sigmoid\'))\n    return model\n\n# Create the KerasClassifier\nmodel = KerasClassifier(build_fn=create_model, loss=\'binary_crossentropy\', metrics=[\'accuracy\'], epochs=5, batch_size=64, verbose=0)\n\n# Define the grid