In [2]:
import os
print(os.getcwd())


C:\Users\green\OneDrive\Documents\Portfolio Projects\movie-recommendation\notebooks


In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
import random
import os

# For model building and training
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# For hyperparameter tuning
import keras_tuner as kt

# For evaluation
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Set a seed for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Load the data
# Assuming you have the MovieLens 100K dataset files in a folder called 'ml-100k'
ratings_path = '../data/movielens/u.data'
movies_path = '../data/movielens/u.item'
# Load ratings data
column_names = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_csv(ratings_path, sep='\t', names=column_names)

# Load movie titles
movie_titles = pd.read_csv(
    movies_path,
    sep='|',
    encoding='latin-1',
    usecols=[0, 1],
    names=['item_id', 'title']
)

# Merge the datasets
data = pd.merge(ratings, movie_titles, on='item_id')

# Map user IDs and item IDs to indices
user_ids = data['user_id'].unique().tolist()
item_ids = data['item_id'].unique().tolist()

user_id_to_idx = {user_id: idx for idx, user_id in enumerate(user_ids)}
item_id_to_idx = {item_id: idx for idx, item_id in enumerate(item_ids)}

data['user_idx'] = data['user_id'].map(user_id_to_idx)
data['item_idx'] = data['item_id'].map(item_id_to_idx)

num_users = len(user_ids)
num_items = len(item_ids)

# Create explicit feedback (interaction): 1 if rating >= 4, else 0
data['interaction'] = data['rating'].apply(lambda x: 1 if x >= 4 else 0)

# Split the data into training and testing sets
train_data, test_data = train_test_split(
    data[['user_idx', 'item_idx', 'interaction']],
    test_size=0.2,
    random_state=42
)

# Implement negative sampling
def create_negative_samples(df, num_negatives=4):
    users, items, labels = [], [], []
    user_item_set = set(zip(df['user_idx'], df['item_idx']))
    all_items = set(range(num_items))
    for (u, i) in user_item_set:
        # Positive instance
        users.append(u)
        items.append(i)
        labels.append(1)
        # Generate negative instances
        for _ in range(num_negatives):
            negative_item = random.choice(list(all_items - set([i])))
            users.append(u)
            items.append(negative_item)
            labels.append(0)
    return pd.DataFrame({'user_idx': users, 'item_idx': items, 'interaction': labels})

# Create training and testing datasets with negative samples
train_data_ns = create_negative_samples(train_data)
test_data_ns = create_negative_samples(test_data)

# Shuffle the datasets
train_data_ns = train_data_ns.sample(frac=1, random_state=42).reset_index(drop=True)
test_data_ns = test_data_ns.sample(frac=1, random_state=42).reset_index(drop=True)

# Prepare inputs for training and testing
train_user = train_data_ns['user_idx'].values
train_item = train_data_ns['item_idx'].values
train_label = train_data_ns['interaction'].values

test_user = test_data_ns['user_idx'].values
test_item = test_data_ns['item_idx'].values
test_label = test_data_ns['interaction'].values

# Build the model using Keras Tuner for hyperparameter optimization
def build_model(hp):
    # Hyperparameters to tune
    embedding_size = hp.Choice('embedding_size', [32, 64, 128])
    num_layers = hp.Int('num_layers', 1, 3)
    dense_units = hp.Int('dense_units', min_value=64, max_value=256, step=64)
    dropout_rate = hp.Float('dropout_rate', 0.0, 0.5, step=0.1)
    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])

    # User embedding
    user_input = keras.Input(shape=(1,), name='user_input')
    user_embedding = layers.Embedding(
        num_users,
        embedding_size,
        name='user_embedding',
        embeddings_regularizer=keras.regularizers.l2(1e-6)
    )(user_input)
    user_embedding = layers.Flatten()(user_embedding)

    # Item embedding
    item_input = keras.Input(shape=(1,), name='item_input')
    item_embedding = layers.Embedding(
        num_items,
        embedding_size,
        name='item_embedding',
        embeddings_regularizer=keras.regularizers.l2(1e-6)
    )(item_input)
    item_embedding = layers.Flatten()(item_embedding)

    # Concatenate embeddings
    concat = layers.Concatenate()([user_embedding, item_embedding])

    # Add dense layers
    x = concat
    for _ in range(num_layers):
        x = layers.Dense(dense_units, activation='relu')(x)
        if dropout_rate > 0.0:
            x = layers.Dropout(dropout_rate)(x)

    # Output layer
    output = layers.Dense(1, activation='sigmoid')(x)

    # Define the model
    model = keras.Model(inputs=[user_input, item_input], outputs=output)

    # Compile the model
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
    )

    return model

# Set up Keras Tuner
tuner = kt.Hyperband(
    build_model,
    objective=kt.Objective('val_auc', direction='max'),
    max_epochs=10,
    factor=3,
    directory='kt_dir',
    project_name='ncf_hyperband'
)

# Early stopping callback
early_stopping = keras.callbacks.EarlyStopping(monitor='val_auc', patience=2, restore_best_weights=True)

# Run hyperparameter search
tuner.search(
    x=[train_user, train_item],
    y=train_label,
    validation_data=([test_user, test_item], test_label),
    batch_size=1024,
    callbacks=[early_stopping],
    epochs=20
)

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"""
The hyperparameter search is complete. 
Optimal embedding size: {best_hps.get('embedding_size')}
Number of layers: {best_hps.get('num_layers')}
Dense units: {best_hps.get('dense_units')}
Dropout rate: {best_hps.get('dropout_rate')}
Learning rate: {best_hps.get('learning_rate')}
""")

# Build the best model
model = tuner.hypermodel.build(best_hps)

# Train the best model
history = model.fit(
    x=[train_user, train_item],
    y=train_label,
    validation_data=([test_user, test_item], test_label),
    batch_size=1024,
    epochs=20,
    callbacks=[early_stopping]
)

# Evaluate the model
test_loss, test_accuracy, test_auc = model.evaluate([test_user, test_item], test_label)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}, Test AUC: {test_auc:.4f}")

# Compute Precision@K and Recall@K
def precision_recall_at_k(model, test_data, train_data, k=10):
    user_item_test = test_data.groupby('user_idx')['item_idx'].apply(set).to_dict()
    user_item_train = train_data.groupby('user_idx')['item_idx'].apply(set).to_dict()
    precisions, recalls = [], []
    for user_idx in user_item_test.keys():
        true_items = user_item_test[user_idx]
        train_items = user_item_train.get(user_idx, set())
        # Predict scores for all items
        item_array = np.array(range(num_items))
        user_array = np.full(num_items, user_idx)
        predictions = model.predict([user_array, item_array], batch_size=4096).flatten()
        # Exclude items in the training set
        predictions[list(train_items)] = -np.inf
        # Get top K items
        top_k_items = predictions.argsort()[-k:][::-1]
        # Compute precision and recall
        hits = np.isin(top_k_items, list(true_items))
        precision = np.sum(hits) / k
        recall = np.sum(hits) / len(true_items) if len(true_items) > 0 else 0
        precisions.append(precision)
        recalls.append(recall)
    avg_precision = np.mean(precisions)
    avg_recall = np.mean(recalls)
    return avg_precision, avg_recall

precision, recall = precision_recall_at_k(model, test_data_ns, train_data_ns, k=10)
print(f"Precision@10: {precision:.4f}, Recall@10: {recall:.4f}")

# Save the model
model.save('ncf_model.h5')

# Save mappings and data for use in app.py
import pickle

# Save user_id_to_idx mapping
with open('user_id_to_idx.pkl', 'wb') as f:
    pickle.dump(user_id_to_idx, f)

# Save item_ids list (index corresponds to item_idx)
with open('item_ids.pkl', 'wb') as f:
    pickle.dump(item_ids, f)

# Save item_id_to_title mapping
item_id_to_title = dict(zip(movie_titles['item_id'], movie_titles['title']))
with open('item_id_to_title.pkl', 'wb') as f:
    pickle.dump(item_id_to_title, f)

# Save the list of user IDs
with open('user_ids.pkl', 'wb') as f:
    pickle.dump(user_ids, f)

# Save the data DataFrame (if needed for liked/disliked movies)
data.to_pickle('data.pkl')


Trial 30 Complete [00h 00m 12s]
val_auc: 0.8492797613143921

Best val_auc So Far: 0.8580106496810913
Total elapsed time: 00h 03m 01s

The hyperparameter search is complete. 
Optimal embedding size: 128
Number of layers: 2
Dense units: 64
Dropout rate: 0.2
Learning rate: 0.001

Epoch 1/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7923 - auc: 0.7005 - loss: 0.4714 - val_accuracy: 0.8127 - val_auc: 0.8102 - val_loss: 0.3952
Epoch 2/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8145 - auc: 0.8134 - loss: 0.3927 - val_accuracy: 0.8133 - val_auc: 0.8117 - val_loss: 0.3941
Epoch 3/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8158 - auc: 0.8186 - loss: 0.3885 - val_accuracy: 0.8136 - val_auc: 0.8146 - val_loss: 0.3926
Epoch 4/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8199 - auc: 0.8301 - loss: 0.3797 - val_



Precision@10: 0.2466, Recall@10: 0.0321
