##Step 1: Download and Load MovieLens Data

In [1]:
# Download the small MovieLens dataset
!wget -q http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

# Unzip it
!unzip -q ml-latest-small.zip


In [2]:
import pandas as pd

# Load CSV files from the folder
movies = pd.read_csv('ml-latest-small/movies.csv')
links = pd.read_csv('ml-latest-small/links.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')

# Check the data
movies.head(), links.head(), ratings.head()


(   movieId                               title  \
 0        1                    Toy Story (1995)   
 1        2                      Jumanji (1995)   
 2        3             Grumpier Old Men (1995)   
 3        4            Waiting to Exhale (1995)   
 4        5  Father of the Bride Part II (1995)   
 
                                         genres  
 0  Adventure|Animation|Children|Comedy|Fantasy  
 1                   Adventure|Children|Fantasy  
 2                               Comedy|Romance  
 3                         Comedy|Drama|Romance  
 4                                       Comedy  ,
    movieId  imdbId   tmdbId
 0        1  114709    862.0
 1        2  113497   8844.0
 2        3  113228  15602.0
 3        4  114885  31357.0
 4        5  113041  11862.0,
    userId  movieId  rating  timestamp
 0       1        1     4.0  964982703
 1       1        3     4.0  964981247
 2       1        6     4.0  964982224
 3       1       47     5.0  964983815
 4       1       50  

##Step 2: Preprocess Movie Data for Content-based Recommendations

In [3]:
# Merge movies with links to get tmdbId (optional)
movies = pd.merge(movies, links[['movieId', 'tmdbId']], on='movieId', how='left')

# Convert genres from '|' separated string to space-separated (for embeddings)
movies['genres'] = movies['genres'].apply(lambda x: x.replace('|', ' '))

# Combine title + genres into a single 'text' column
movies['text'] = movies['title'] + ' ' + movies['genres']

# Check the result
movies[['title', 'genres', 'text']].head()


Unnamed: 0,title,genres,text
0,Toy Story (1995),Adventure Animation Children Comedy Fantasy,Toy Story (1995) Adventure Animation Children ...
1,Jumanji (1995),Adventure Children Fantasy,Jumanji (1995) Adventure Children Fantasy
2,Grumpier Old Men (1995),Comedy Romance,Grumpier Old Men (1995) Comedy Romance
3,Waiting to Exhale (1995),Comedy Drama Romance,Waiting to Exhale (1995) Comedy Drama Romance
4,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II (1995) Comedy


##Step 3: Generate Movie Embeddings

In [4]:
# Install sentence-transformers if not already installed
!pip install -q sentence-transformers

from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
movie_embeddings = embedding_model.encode(movies['text'].tolist(), show_progress_bar=True)


print("Movie embeddings created! Shape:", movie_embeddings.shape)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/305 [00:00<?, ?it/s]

Movie embeddings created! Shape: (9742, 384)


##Step 4: Content-based Recommendation Function

In [5]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Compute cosine similarity between all movie embeddings
cosine_sim = cosine_similarity(movie_embeddings, movie_embeddings)

# Function to get movie recommendations
def recommend_movie(title, movies=movies, cosine_sim=cosine_sim, top_n=5):
    # Find the index of the movie that matches the title
    idx = movies[movies['title'].str.lower() == title.lower()].index[0]

    # Get pairwise similarity scores for this movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort movies based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the top_n most similar movies (excluding itself)
    sim_scores = sim_scores[1:top_n+1]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top_n most similar movie titles
    return movies['title'].iloc[movie_indices].tolist()


In [6]:
recommend_movie("Toy Story (1995)", top_n=5)


['Toy Story 2 (1999)',
 'The Lego Movie (2014)',
 'Toy Story 3 (2010)',
 'Goofy Movie, A (1995)',
 'Turbo (2013)']

#Step 5: Preprocess Ratings Data and Map User/Movie IDs

In [7]:
# Load user ratings
ratings = pd.read_csv('ml-latest-small/ratings.csv')

# Remove duplicates
ratings.drop_duplicates(inplace=True)


In [8]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Map userId and movieId to contiguous indices
user_ids = ratings['userId'].unique()
movie_ids = ratings['movieId'].unique()

user2idx = {uid: i for i, uid in enumerate(user_ids)}
movie2idx = {mid: i for i, mid in enumerate(movie_ids)}

ratings['user_idx'] = ratings['userId'].map(user2idx)
ratings['movie_idx'] = ratings['movieId'].map(movie2idx)

num_users = len(user2idx)
num_movies = len(movie2idx)

# Train/test split
train_df, test_df = train_test_split(ratings, test_size=0.2, random_state=42)

class RatingsDataset(Dataset):
    def __init__(self, df):
        self.users = torch.tensor(df['user_idx'].values, dtype=torch.long)
        self.movies = torch.tensor(df['movie_idx'].values, dtype=torch.long)
        self.ratings = torch.tensor(df['rating'].values, dtype=torch.float)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.movies[idx], self.ratings[idx]

train_dataset = RatingsDataset(train_df)
test_dataset = RatingsDataset(test_df)

train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)


#Step 6: Define NeuMF Model

In [9]:
import torch.nn as nn
import torch.optim as optim

class NeuMF(nn.Module):
    def __init__(self, num_users, num_movies, emb_size=32):
        super(NeuMF, self).__init__()
        # Embeddings
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.movie_emb = nn.Embedding(num_movies, emb_size)

        # MLP layers
        self.fc_layers = nn.Sequential(
            nn.Linear(emb_size*2, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, user, movie):
        u = self.user_emb(user)
        m = self.movie_emb(movie)
        x = torch.cat([u, m], dim=-1)
        return self.fc_layers(x).squeeze()


In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NeuMF(num_users, num_movies).to(device)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)



# Step 7: Create Negative Samples


In [11]:

import random
import numpy as np
from torch.utils.data import TensorDataset

def create_negative_samples(ratings, num_neg=4):
    user_movie_set = set(zip(ratings['userId'], ratings['movieId']))
    all_movies = list(movie2idx.keys())
    users, movies_list, labels = [], [], []

    for user, movie in user_movie_set:
        users.append(user2idx[user])
        movies_list.append(movie2idx[movie])
        labels.append(1)
        for _ in range(num_neg):
            neg_movie = random.choice(all_movies)
            while (user, neg_movie) in user_movie_set:
                neg_movie = random.choice(all_movies)
            users.append(user2idx[user])
            movies_list.append(movie2idx[neg_movie])
            labels.append(0)
    return np.array(users), np.array(movies_list), np.array(labels)

neg_users, neg_movies, neg_labels = create_negative_samples(ratings)
train_user_tensor = torch.tensor(neg_users, dtype=torch.long)
train_movie_tensor = torch.tensor(neg_movies, dtype=torch.long)
train_label_tensor = torch.tensor(neg_labels, dtype=torch.float32)
neg_dataset = TensorDataset(train_user_tensor, train_movie_tensor, train_label_tensor)
neg_loader = DataLoader(neg_dataset, batch_size=256, shuffle=True)


# Step 8: Train NeuMF Model


In [12]:
criterion = nn.BCEWithLogitsLoss()   # better loss for 0/1 implicit feedback
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 5   # Increase for better results

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for users_batch, movies_batch, labels_batch in neg_loader:
        users_batch = users_batch.to(device)
        movies_batch = movies_batch.to(device)
        labels_batch = labels_batch.to(device)

        optimizer.zero_grad()

        # Predict
        outputs = model(users_batch, movies_batch)

        # Compute loss
        loss = criterion(outputs, labels_batch)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}  Loss: {total_loss/len(neg_loader):.4f}")


Epoch 1/5  Loss: 0.4292
Epoch 2/5  Loss: 0.3402
Epoch 3/5  Loss: 0.3227
Epoch 4/5  Loss: 0.3150
Epoch 5/5  Loss: 0.3099



# Step 9: Create Test Negatives


In [13]:

def create_test_negatives(ratings, num_neg=99):
    test_user_movie_set = set(zip(test_df['userId'], test_df['movieId']))
    all_movies = set(movie2idx.keys())
    user_rated = ratings.groupby('userId')['movieId'].apply(set).to_dict()
    test_data = []
    for (user, pos_movie) in test_user_movie_set:
        negatives = list(all_movies - user_rated[user])
        neg_samples = np.random.choice(negatives, size=num_neg, replace=False)
        test_data.append((user, pos_movie, neg_samples))
    return test_data

test_negatives = create_test_negatives(ratings)



# Step 10: Evaluate NeuMF Model


In [14]:

import math

def hit_ratio(ranklist, gtItem):
    return int(gtItem in ranklist)

def ndcg(ranklist, gtItem):
    if gtItem in ranklist:
        index = ranklist.index(gtItem)
        return math.log(2)/math.log(index+2)
    return 0.0

def evaluate_model(model, test_negatives, top_n=10):
    model.eval()
    HR, NDCG = [], []
    for user, pos_movie, neg_movies in test_negatives:
        movies_eval = [pos_movie] + list(neg_movies)
        user_idx_eval = torch.tensor([user2idx[user]]*len(movies_eval)).to(device)
        movie_idx_eval = torch.tensor([movie2idx[m] for m in movies_eval]).to(device)
        with torch.no_grad():
            scores = model(user_idx_eval, movie_idx_eval).cpu().numpy()
        movie_score_dict = {m: s for m, s in zip(movies_eval, scores)}
        top_movies = sorted(movie_score_dict, key=movie_score_dict.get, reverse=True)[:top_n]
        HR.append(hit_ratio(top_movies, pos_movie))
        NDCG.append(ndcg(top_movies, pos_movie))
    print(f"Hit Ratio @ {top_n}: {np.mean(HR):.4f}")
    print(f"NDCG @ {top_n}: {np.mean(NDCG):.4f}")

evaluate_model(model, test_negatives, top_n=10)


Hit Ratio @ 10: 0.6633
NDCG @ 10: 0.4163



# Step 11: Define recommend_ncf() for NeuMF Recommendations


In [15]:

def recommend_ncf(user_id, top_n=5):
    model.eval()

    # Ensure movie_ids is list
    movie_ids_list = list(movie_ids)

    # Create tensors for prediction
    user_idx = torch.tensor([user2idx[user_id]], device=device)
    movie_idx = torch.arange(num_movies, dtype=torch.long, device=device)
    user_idx_expanded = user_idx.repeat(num_movies)

    # Predict scores for all movies
    with torch.no_grad():
        scores = model(user_idx_expanded, movie_idx).cpu().numpy()

    scores = scores.flatten()

    # Get top N movies by predicted score
    top_movies_idx = scores.argsort()[-top_n:][::-1]

    # Convert to movieId values
    top_movie_ids = [int(movie_ids_list[i]) for i in top_movies_idx]

    # Ensure correct type
    movies['movieId'] = movies['movieId'].astype(int)

    # Fetch titles
    top_titles = movies.loc[movies['movieId'].isin(top_movie_ids), 'title'].tolist()

    return top_titles


In [16]:
top_recs = recommend_ncf(user_id=1, top_n=5)
print("Top 5 NeuMF recommendations for User 1:")
for i, title in enumerate(top_recs, 1):
    print(f"{i}. {title}")


Top 5 NeuMF recommendations for User 1:
1. Star Wars: Episode IV - A New Hope (1977)
2. Pulp Fiction (1994)
3. Forrest Gump (1994)
4. Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
5. Lord of the Rings: The Two Towers, The (2002)
