# Optimizing classification model hyperparameters

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [None]:
# Load the dataset
data = pd.read_csv('cmu_dataset_final.csv')

# Preprocess the text
# Fetch every NLTK resource the preprocessing step relies on. Besides the
# stopword list and WordNet, nltk.word_tokenize needs the Punkt tokenizer
# data: older NLTK releases look for 'punkt', newer ones for 'punkt_tab',
# so both are requested.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize a piece of text into a space-separated string of lemmas.

    The text is lower-cased and tokenized with NLTK; tokens that are in
    ``stop_words`` or are not purely alphanumeric are dropped, and the
    remaining tokens are lemmatized.

    Args:
        text: Raw text. Missing values (``None``, ``NaN`` or any other
            non-string) and blank strings are treated as empty text.

    Returns:
        The surviving lemmas joined by single spaces, or ``''`` when there
        is nothing to process.
    """
    # pandas represents missing CSV cells as float NaN, which has no
    # .lower(); map those (and blank strings) to an empty document instead
    # of crashing the whole .apply() pass.
    if not isinstance(text, str) or not text.strip():
        return ''

    # Tokenize, remove stopwords, and lemmatize
    tokens = nltk.word_tokenize(text.lower())
    kept_lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words and token.isalnum()
    ]
    return ' '.join(kept_lemmas)

# Run the text preprocessing over every plot summary
data['processed_plot'] = data['plot'].map(preprocess_text)

# Preprocess the genres: split each pipe-delimited genre string into a list
# of labels, then encode the label lists as a binary indicator matrix
genre_lists = data['genre'].map(lambda raw_genres: raw_genres.split('|'))
data['genres'] = genre_lists
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(data['genres'])

In [None]:
def _build_embedding_matrix(word_index, vectors):
    """Build an embedding matrix whose row ``i`` holds the vector of word ``i``.

    Args:
        word_index: Mapping of word -> integer row index (here the Keras
            ``Tokenizer.word_index``).
        vectors: Word-vector lookup supporting ``in``, ``[]`` and a
            ``vector_size`` attribute (e.g. a gensim ``KeyedVectors``).

    Returns:
        Array of shape ``(len(word_index) + 1, vectors.vector_size)``. Rows
        of words that are missing from ``vectors`` stay all-zero.
    """
    matrix = np.zeros((len(word_index) + 1, vectors.vector_size))
    for word, i in word_index.items():
        if word in vectors:
            matrix[i] = vectors[word]
    return matrix


# Load the pre-trained word2vec embeddings
word2vec_model = KeyedVectors.load('word2vec_model_from_cmu_utf8.bin')
# The saved object may be a full Word2Vec model (vectors live under `.wv`)
# or bare KeyedVectors (which have no `.wv` in gensim 4); accept either.
cmu_vectors = getattr(word2vec_model, 'wv', word2vec_model)

# Tokenize and pad the text sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['processed_plot'])
sequences = tokenizer.texts_to_sequences(data['processed_plot'])
word_index = tokenizer.word_index
padded_sequences = pad_sequences(sequences, maxlen=300)

# Create the embedding matrix
embedding_dim = cmu_vectors.vector_size
embedding_matrix = _build_embedding_matrix(word_index, cmu_vectors)

# Google news word2vec embeddings
import gensim.downloader as api
embeddings = api.load("word2vec-google-news-300")

embedding_dim2 = embeddings.vector_size
embedding_matrix2 = _build_embedding_matrix(word_index, embeddings)

In [None]:
from hyperopt import hp, fmin, tpe, Trials
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.initializers import Constant
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Define the search space, specify which hyperparameters to tune.
# The two candidate embeddings are listed up front; each option carries its
# weight matrix together with the matching vector width.
embedding_options = [
    {'embedding_matrix': embedding_matrix, 'embedding_dim': embedding_dim},
    {'embedding_matrix': embedding_matrix2, 'embedding_dim': embedding_dim2},
]

space = dict(
    batch_size=hp.quniform('batch_size', 35, 70, 1),
    epochs=hp.choice('epochs', [5, 7, 10]),
    dropout_rate=hp.uniform('dropout_rate', 0.2, 0.3),
    lstm_units=hp.quniform('lstm_units', 64, 256, 64),
    optimizer=hp.choice('optimizer', ['Adam', 'Nadam']),
    trainable=hp.choice('trainable', [True, False]),
    embedding=hp.choice('embedding', embedding_options),
)

# Define objective function
# This builds, trains and evaluates the model with the specified hyperparameters
def objective(params):
    """Train one LSTM classifier and return its negated micro-averaged F1.

    Args:
        params: One sample from the search space, with the keys
            'batch_size', 'epochs', 'dropout_rate', 'lstm_units',
            'optimizer', 'trainable' and 'embedding' (a dict holding
            'embedding_matrix' and 'embedding_dim').

    Returns:
        ``-f1``, where ``f1`` is the micro-averaged F1 score on
        ``(X_test, y_test)`` of the sigmoid outputs thresholded at 0.5.
        The score is negated because ``fmin`` minimizes the objective.
    """
    # Every call builds a fresh model; release the Keras state left over
    # from earlier trials so memory does not keep growing during the search.
    from tensorflow.keras import backend as keras_backend
    keras_backend.clear_session()

    embedding_cfg = params['embedding']

    model = Sequential()
    model.add(Embedding(
        len(word_index) + 1,
        embedding_cfg['embedding_dim'],
        embeddings_initializer=Constant(embedding_cfg['embedding_matrix']),
        trainable=params['trainable'],
    ))
    # hp.quniform yields floats, so unit and batch counts are cast to int.
    model.add(LSTM(
        int(params['lstm_units']),
        dropout=params['dropout_rate'],
        recurrent_dropout=params['dropout_rate'],
    ))
    # One sigmoid output per genre label (multi-label classification).
    model.add(Dense(len(mlb.classes_), activation='sigmoid'))

    optimizer = params['optimizer']
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    model.fit(
        X_train,
        y_train,
        batch_size=int(params['batch_size']),
        epochs=params['epochs'],
        verbose=0,
        validation_data=(X_test, y_test),
    )

    # Keep prediction as quiet as training so the search output stays readable.
    y_pred = model.predict(X_test, verbose=0)
    y_pred = (y_pred > 0.5).astype(int)

    f1 = f1_score(y_test, y_pred, average='micro')

    return -f1

X_train, X_test, y_train, y_test = train_test_split(padded_sequences, genres_encoded, test_size=0.2, random_state=42)

trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=10, trials=trials)

# NOTE: for hp.choice parameters ('epochs', 'optimizer', 'trainable',
# 'embedding') fmin reports the *index* of the chosen option, not its value.
print("Best hyperparameters:", best)

# Decode the indices back into the actual option values.
from hyperopt import space_eval
best_decoded = dict(space_eval(space, best))
# The decoded embedding option contains a full weight matrix; report only
# which option was picked and its vector width.
best_decoded['embedding'] = {
    'option_index': int(best['embedding']),
    'embedding_dim': best_decoded['embedding']['embedding_dim'],
}
print("Best hyperparameters (decoded):", best_decoded)


In [None]:
trial_results = pd.DataFrame(trials.results)

# Add the hyperparameters to the DataFrame.
# hyperopt stores each sampled value wrapped in a list; unwrap single values
# so the columns hold scalars that can be sorted and compared.
# For hp.choice parameters the stored value is the index of the chosen option.
for key in best:
    column = []
    for trial in trials.trials:
        sampled = trial['misc']['vals'][key]
        column.append(sampled[0] if len(sampled) == 1 else sampled)
    trial_results[key] = column

# Compute the F1 score from the loss (the objective returns the negated F1)
trial_results['f1_score'] = -trial_results['loss']

# Sort by F1 score, best trial first
trial_results = trial_results.sort_values('f1_score', ascending=False)

print(trial_results)