# Neural Collaborative Filtering - Movie Recommender

# Libraries and Data Download

In [None]:
# External Libraries #
!pip install tensorflow

# Dataset Download #
!wget https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
!unzip ml-latest-small.zip

--2024-10-04 12:25:31--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’


2024-10-04 12:25:33 (2.41 MB/s) - ‘ml-latest-small.zip’ saved [978202/978202]

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


## Libraries

In [None]:
import re

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras import Model
from tensorflow.keras.layers import Embedding, Input, Dense, Flatten, Multiply, Concatenate
from tensorflow.keras.optimizers import Adam

## Load Data

In [None]:
# Read the movie catalogue and the user ratings from the unzipped archive.
movies, ratings = (
    pd.read_csv(csv_path, encoding='utf8')
    for csv_path in ('ml-latest-small/movies.csv', 'ml-latest-small/ratings.csv')
)

# Data Preprocessing

## Preparing Movies Dataset

In [None]:
def _extract_year(title):
    # Use regex to capture title and year
    match = re.match(r'^(.*)\s\((\d{4})\)$', title)
    if match:
        return match.group(1), int(match.group(2))
    else:
        return title, None

df_movies = movies.copy()

# Extract the release year from the original title and keep only the bare title #
# Building one frame from the list of (title, year) tuples avoids creating a
# separate pd.Series for every row.
title_year = pd.DataFrame(
    df_movies['title'].map(_extract_year).tolist(),
    index=df_movies.index,
    columns=['title', 'year'],
)
df_movies['title'] = title_year['title']
df_movies['year'] = title_year['year']
# Rows with any missing value are removed; this includes movies whose title
# carried no "(YYYY)" suffix and therefore have no year.
df_movies = df_movies.dropna()

# Split genres into individual indicator columns #
# str.get_dummies matches whole '|'-separated tokens and returns the columns in
# sorted order, so the feature layout is identical on every run (iterating a
# set of strings is not guaranteed to be).
genre_flags = df_movies['genres'].str.get_dummies(sep='|')
all_genres = set(genre_flags.columns)  # kept for any later cell that reads it
df_movies = pd.concat([df_movies.drop(columns='genres'), genre_flags], axis=1)

# Scale year column to the [0, 1] range #
year_scaler = MinMaxScaler()
df_movies['year_normalized'] = year_scaler.fit_transform(df_movies[['year']]).ravel()

## Prepare Ratings and Merge with Movies

In [None]:
# Drop unnecessary columns #
# Drop unnecessary columns #
# The explicit copy makes df_ratings an independent frame, so adding a column
# below does not write into a slice of `ratings` (SettingWithCopyWarning).
df_ratings = ratings[['userId', 'movieId', 'rating']].copy()

# Normalize ratings to the [0, 1] range #
ratings_scaler = MinMaxScaler()
df_ratings['rating_normalized'] = ratings_scaler.fit_transform(df_ratings[['rating']]).ravel()

# Merge the two df #
# pd.merge defaults to an inner join, so ratings whose movieId is absent from
# df_movies are dropped here.
df_merged = pd.merge(df_ratings, df_movies, on='movieId')

## Encode user and movie IDs, and drop the remaining columns

In [None]:
# Encode userId and movieId as categorical values #
# Replace the raw ids with category codes, which the embedding layers use as
# row indices. The code columns are added to df_merged itself.
for id_col, code_col in (('userId', 'user_encoded'), ('movieId', 'movie_encoded')):
    df_merged[code_col] = df_merged[id_col].astype('category').cat.codes

# Keep only what the model consumes: the codes, the movie metadata columns and
# the normalized rating.
df_final = df_merged.drop(columns=['userId', 'movieId', 'title', 'year', 'rating'])

# Model

## Split Dataset

In [None]:
# Split data into training, test and validation sets
# Hold out 20% of the rows for testing, then 20% of the remainder for validation.
train_val_data, test_data = train_test_split(df_final, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_val_data, test_size=0.2, random_state=42)

# Training arrays: movie metadata is every column that is neither an id code
# nor the target.
non_metadata_cols = ['user_encoded', 'movie_encoded', 'rating_normalized']
movie_features = train_data.drop(columns=non_metadata_cols).to_numpy()
user_input = train_data['user_encoded'].to_numpy()
movie_input = train_data['movie_encoded'].to_numpy()
# NOTE: this rebinds `ratings`, which until here held the raw ratings DataFrame.
ratings = train_data['rating_normalized'].to_numpy()

## Model Outline

In [None]:
# Hyperparameters
# Hyperparameters
embedding_dim = 64
mlp_hidden_units = [128, 64, 32]

# Seed the Python, NumPy and TensorFlow RNGs so that weight initialisation
# starts from the same state on every run, in line with the seeded split.
tf.keras.utils.set_random_seed(42)

# Embedding table sizes: one row per distinct user / movie code.
num_users = df_final['user_encoded'].nunique()
num_movies = df_final['movie_encoded'].nunique()

# Input layers: one encoded id per sample, plus the movie metadata vector
user_input_layer = Input(shape=(1,), name='user_input')
movie_input_layer = Input(shape=(1,), name='movie_input')
movie_metadata_input = Input(shape=(movie_features.shape[1],), name='movie_metadata_input')

# Embedding layers for GMF
# The creation order of the Embedding layers matters: they get auto-generated
# names and the save cell looks the movie GMF embedding up as 'embedding_1'.
user_embedding_gmf = Embedding(input_dim=num_users, output_dim=embedding_dim)(user_input_layer)
movie_embedding_gmf = Embedding(input_dim=num_movies, output_dim=embedding_dim)(movie_input_layer)

# Embedding layers for MLP (separate tables from the GMF branch)
user_embedding_mlp = Embedding(input_dim=num_users, output_dim=embedding_dim)(user_input_layer)
movie_embedding_mlp = Embedding(input_dim=num_movies, output_dim=embedding_dim)(movie_input_layer)

# Flatten embeddings from (batch, 1, embedding_dim) to (batch, embedding_dim)
user_vec_gmf = Flatten()(user_embedding_gmf)
movie_vec_gmf = Flatten()(movie_embedding_gmf)
user_vec_mlp = Flatten()(user_embedding_mlp)
movie_vec_mlp = Flatten()(movie_embedding_mlp)

# GMF branch (element-wise product)
gmf_output = Multiply()([user_vec_gmf, movie_vec_gmf])

# MLP branch (concatenate embeddings + movie metadata)
mlp_input = Concatenate()([user_vec_mlp, movie_vec_mlp, movie_metadata_input])

# MLP hidden layers, one ReLU Dense layer per entry of mlp_hidden_units
mlp_output = mlp_input
for units in mlp_hidden_units:
    mlp_output = Dense(units, activation='relu')(mlp_output)

# NeuMF (combine GMF and MLP branches)
ncf_output = Concatenate()([gmf_output, mlp_output])

# Final output layer with sigmoid activation (rating prediction between 0 and 1)
final_output = Dense(1, activation='sigmoid', name='prediction')(ncf_output)

# Define the model
ncf_model = Model(inputs=[user_input_layer, movie_input_layer, movie_metadata_input], outputs=final_output)

# Compile the model: MSE loss on the normalized rating, MAE reported as a metric
ncf_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

# Model summary
#ncf_model.summary()

## Training

In [None]:
# Early stopping mechanism
# Stop when the validation loss has not improved for 5 epochs and roll the
# model back to the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True
)

# Validation inputs mirror the training inputs: user codes, movie codes and the
# movie metadata columns, with the normalized rating as target.
val_metadata = val_data.drop(columns=['user_encoded', 'movie_encoded', 'rating_normalized'])
val_inputs = [
    val_data['user_encoded'].values,
    val_data['movie_encoded'].values,
    val_metadata.values,
]
val_targets = val_data['rating_normalized'].values

# Train the model
history = ncf_model.fit(
    x=[user_input, movie_input, movie_features],
    y=ratings,
    validation_data=(val_inputs, val_targets),
    epochs=100,
    batch_size=64,
    callbacks=[early_stopping],
)

Epoch 1/100
[1m1009/1009[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 12ms/step - loss: 0.0320 - mae: 0.1370 - val_loss: 0.0352 - val_mae: 0.1436
Epoch 2/100
[1m1009/1009[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 11ms/step - loss: 0.0173 - mae: 0.0981 - val_loss: 0.0366 - val_mae: 0.1465
Epoch 3/100
[1m1009/1009[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 11ms/step - loss: 0.0077 - mae: 0.0663 - val_loss: 0.0379 - val_mae: 0.1497
Epoch 4/100
[1m1009/1009[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 12ms/step - loss: 0.0045 - mae: 0.0512 - val_loss: 0.0388 - val_mae: 0.1513


## Testing

In [None]:
# Score the held-out test split. The model was compiled with MSE loss and an
# MAE metric, so the displayed result is [loss, mae].
test_metadata = test_data.drop(columns=['user_encoded', 'movie_encoded', 'rating_normalized'])
test_inputs = [
    test_data['user_encoded'].values,
    test_data['movie_encoded'].values,
    test_metadata.values,
]
ncf_model.evaluate(test_inputs, test_data['rating_normalized'].values)

[1m631/631[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.0357 - mae: 0.1450


[0.03570444881916046, 0.14466847479343414]

# Save Model configs

In [None]:
# Save the model after training
ncf_model.save('ncf_model.h5')

# Save the movie embedding (GMF part)
movie_embedding_gmf = ncf_model.get_layer('embedding_1').get_weights()[0]  # Get the trained embedding weights
np.save('movie_embedding_gmf.npy', movie_embedding_gmf)



## Exporting Movies Ref

In [None]:
# Reuse the movie codes the model was trained with. They were assigned on
# df_merged, i.e. only over movies that appear in at least one rating, so
# re-deriving codes from every movieId in df_movies would shift them whenever
# df_movies contains an unrated movie and point at the wrong embedding rows.
movie_code_by_id = (
    df_merged[['movieId', 'movie_encoded']]
    .drop_duplicates()
    .set_index('movieId')['movie_encoded']
)
# Nullable Int64 keeps the codes integral even though unrated movies map to <NA>.
df_movies['movie_encoded'] = df_movies['movieId'].map(movie_code_by_id).astype('Int64')

# Movies without a code have no trained embedding row, so they are left out of
# the exported reference table.
df_movies.dropna(subset=['movie_encoded']).to_csv('movies.csv', index=False)