# Movie Recommendation System - Neural Collaborative Filtering

# Libraries and Data Download

In [1]:
# External Libraries #
!pip install tensorflow

# Dataset Download #
!wget http://files.grouplens.org/datasets/movielens/ml-25m.zip
!unzip ml-25m.zip

!du -sh ml-25m

--2025-05-01 22:04:28--  http://files.grouplens.org/datasets/movielens/ml-25m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 261978986 (250M) [application/zip]
Saving to: ‘ml-25m.zip’


2025-05-01 22:04:31 (92.6 MB/s) - ‘ml-25m.zip’ saved [261978986/261978986]

Archive:  ml-25m.zip
   creating: ml-25m/
  inflating: ml-25m/tags.csv         
  inflating: ml-25m/links.csv        
  inflating: ml-25m/README.txt       
  inflating: ml-25m/ratings.csv      
  inflating: ml-25m/genome-tags.csv  
  inflating: ml-25m/genome-scores.csv  
  inflating: ml-25m/movies.csv       
1.1G	ml-25m


## Libraries

In [2]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Embedding, Input, Dense, Flatten, Multiply, Concatenate
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

## Load Data

In [3]:
movies = pd.read_csv('ml-25m/movies.csv',encoding='utf8')
ratings = pd.read_csv('ml-25m/ratings.csv',encoding='utf8')

# Data Preprocessing

## Preparing Movies Dataset

In [4]:
def _extract_year(title):
    # Use regex to capture title and year
    match = re.match(r'^(.*)\s\((\d{4})\)$', title)
    if match:
        return match.group(1), int(match.group(2))
    else:
        return title, None

df_movies = movies.copy()

# Extract release year from the original title and drop it #
df_movies[['title', 'year']] = df_movies['title'].apply(lambda x: pd.Series(_extract_year(x)))
df_movies.dropna(inplace=True)

# Split genres into individual columns #
all_genres = set('|'.join(df_movies['genres']).split('|'))
for genre in all_genres:
    df_movies[genre] = df_movies['genres'].apply(lambda x: 1 if genre in x else 0)
df_movies.drop('genres', axis=1, inplace=True)

# Scale year column #
from sklearn.preprocessing import MinMaxScaler

year_scaler = MinMaxScaler()

df_movies['year_normalized'] = year_scaler.fit_transform(df_movies[['year']])

## Prepare Ratings and Merge with Movies

In [5]:
# Ensure 'movieId' is category in both DataFrames for efficient merge #
ratings['movieId'] = ratings['movieId'].astype('category')
ratings['userId'] = ratings['userId'].astype('category')
movies['movieId'] = movies['movieId'].astype('category')

# Drop unnecessary columns and copy to prevent SettingWithCopyWarning #
df_ratings = ratings[['userId', 'movieId', 'rating']].copy()

# Normalize ratings #
ratings_scaler = MinMaxScaler()
df_ratings['rating_normalized'] = ratings_scaler.fit_transform(df_ratings[['rating']])

# Drop unused columns from movies to save memory #
df_movies = movies[['movieId', 'title']].copy()

# Perform memory-efficient merge #
df_merged = pd.merge(df_ratings, df_movies, on='movieId', how='left', sort=False)

In [6]:
df_merged.shape

(25000095, 5)

In [7]:
df_merged.head()

Unnamed: 0,userId,movieId,rating,rating_normalized,title
0,1,296,5.0,1.0,Pulp Fiction (1994)
1,1,306,3.5,0.666667,Three Colors: Red (Trois couleurs: Rouge) (1994)
2,1,307,5.0,1.0,Three Colors: Blue (Trois couleurs: Bleu) (1993)
3,1,665,5.0,1.0,Underground (1995)
4,1,899,3.5,0.666667,Singin' in the Rain (1952)


## Encode users and movies id, and drop remaining cols

In [8]:
# Encode userId and movieId as categorical values #
df_merged['user_encoded'] = df_merged['userId'].astype('category').cat.codes
df_merged['movie_encoded'] = df_merged['movieId'].astype('category').cat.codes

df_final = df_merged.drop(['userId', 'movieId', 'title', 'rating'], axis=1)

# Model

## Split Dataset

In [9]:
# Split data into training, test and validation sets
train_val_data, test_data = train_test_split(df_final, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_val_data, test_size=0.2, random_state=42)

# Prepare input features (user, movie, movie metadata) and target variable (rating)
user_input = train_data['user_encoded'].values
movie_input = train_data['movie_encoded'].values
movie_features = train_data.drop(columns=['user_encoded', 'movie_encoded', 'rating_normalized']).values
ratings = train_data['rating_normalized'].values