In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Load and preprocess data
df = pd.read_csv("../data/processed/ml1m_cleaned.csv")
df = df[["userId", "movieId", "rating"]].dropna()

# Encode userId and movieId
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

df["userId_enc"] = user_encoder.fit_transform(df["userId"])
df["movieId_enc"] = movie_encoder.fit_transform(df["movieId"])

num_users = df["userId_enc"].nunique()
num_movies = df["movieId_enc"].nunique()
print(f"Users: {num_users}, Movies: {num_movies}")


Users: 6040, Movies: 3706


In [2]:
class MovieLensDataset(Dataset):
    def __init__(self, df):
        self.users = torch.tensor(df["userId_enc"].values, dtype=torch.long)
        self.movies = torch.tensor(df["movieId_enc"].values, dtype=torch.long)
        self.ratings = torch.tensor(df["rating"].values, dtype=torch.float32)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.movies[idx], self.ratings[idx]

dataset = MovieLensDataset(df)
train_loader = DataLoader(dataset, batch_size=2048, shuffle=True)


In [3]:
class MatrixFactorization(nn.Module):
    def __init__(self, num_users, num_movies, embedding_dim=32):
        super(MatrixFactorization, self).__init__()
        self.user_emb = nn.Embedding(num_users, embedding_dim)
        self.movie_emb = nn.Embedding(num_movies, embedding_dim)

    def forward(self, user_ids, movie_ids):
        user_vecs = self.user_emb(user_ids)
        movie_vecs = self.movie_emb(movie_ids)
        return (user_vecs * movie_vecs).sum(dim=1)

model = MatrixFactorization(num_users, num_movies)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)


In [5]:
for epoch in range(50):
    model.train()
    total_loss = 0
    for users, movies, ratings in train_loader:
        preds = model(users, movies)
        loss = criterion(preds, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


Epoch 1, Loss: 577.1751
Epoch 2, Loss: 485.7616
Epoch 3, Loss: 436.0675
Epoch 4, Loss: 406.4029
Epoch 5, Loss: 387.0213
Epoch 6, Loss: 372.7006
Epoch 7, Loss: 360.6970
Epoch 8, Loss: 350.4863
Epoch 9, Loss: 341.6288
Epoch 10, Loss: 333.6368
Epoch 11, Loss: 326.1548
Epoch 12, Loss: 319.0209
Epoch 13, Loss: 312.2430
Epoch 14, Loss: 306.0045
Epoch 15, Loss: 299.7354
Epoch 16, Loss: 294.3895
Epoch 17, Loss: 289.4383
Epoch 18, Loss: 284.8971
Epoch 19, Loss: 280.8277
Epoch 20, Loss: 277.0750
Epoch 21, Loss: 273.7022
Epoch 22, Loss: 270.6592
Epoch 23, Loss: 267.8552
Epoch 24, Loss: 265.2406
Epoch 25, Loss: 262.7798
Epoch 26, Loss: 260.5544
Epoch 27, Loss: 258.5468
Epoch 28, Loss: 256.5220
Epoch 29, Loss: 254.8697
Epoch 30, Loss: 253.3436
Epoch 31, Loss: 251.8554
Epoch 32, Loss: 250.3588
Epoch 33, Loss: 248.9936
Epoch 34, Loss: 247.5965
Epoch 35, Loss: 246.4449
Epoch 36, Loss: 245.4803
Epoch 37, Loss: 244.4725
Epoch 38, Loss: 243.3094
Epoch 39, Loss: 242.5725
Epoch 40, Loss: 241.5237
Epoch 41,

In [13]:
def recommend_movies_for_user(user_id, model, movie_encoder, user_encoder, df_movies, df, top_n=5):
    model.eval()
    user_idx = torch.tensor([user_encoder.transform([user_id])[0]])
    all_movie_idxs = torch.arange(len(movie_encoder.classes_))

    with torch.no_grad():
        scores = model(user_idx.repeat(len(all_movie_idxs)), all_movie_idxs)
        top_movie_indices = torch.topk(scores, top_n).indices

    # These are encoded movie IDs (e.g., 578, 2226, ...)
    top_movie_enc_ids = top_movie_indices.numpy()
    print("Top predicted movieId_enc:", top_movie_enc_ids)

    # Map back to original movie IDs using reverse transform
    top_movie_ids = movie_encoder.inverse_transform(top_movie_enc_ids)
    top_movie_ids_str = list(map(str, top_movie_ids))

    # Ensure movieId in df_movies is string
    df_movies["movieId"] = df_movies["movieId"].astype(str)

    # Filter movie metadata
    return df_movies[df_movies['movieId'].isin(top_movie_ids_str)][["movieId", "title"]]


In [14]:
user_id = df["userId"].iloc[0]
recommendations = recommend_movies_for_user(user_id, model, movie_encoder, user_encoder, df_movies, df)

print(f"\nTop 5 Recommendations for User {user_id}:")
print(recommendations)


Top predicted movieId_enc: [ 564 2039 3371  767 1304]

Top 5 Recommendations for User 1:
     movieId                        title
574      578  Hour of the Pig, The (1993)
804      814    Boy Called Hate, A (1995)
1383    1406         Cérémonie, La (1995)
2157    2226             Ring, The (1927)
3542    3611        Saludos Amigos (1943)


In [9]:
# Load movie titles from MovieLens 1M with correct encoding
df_movies = pd.read_csv(
    "../data/raw/ml-1m/movies.dat", 
    sep="::", 
    names=["movieId", "title", "genres"], 
    engine="python",
    encoding="ISO-8859-1"  # ✅ fix UnicodeDecodeError
)

# Ensure movieId is string for merging
df_movies["movieId"] = df_movies["movieId"].astype(str)


In [12]:
user_id = df["userId"].iloc[0]
recommendations = recommend_movies_for_user(user_id, model, movie_encoder, user_encoder, df_movies)
print(f"Top 5 Recommendations for User {user_id}:")
print(recommendations)

Recommended movieIds: [ 578 2226 3611  814 1406]
Top 5 Recommendations for User 1:
Empty DataFrame
Columns: [movieId, title]
Index: []


In [16]:
import os
# Make sure the directory exists
os.makedirs("../models", exist_ok=True)

# Save model weights
torch.save(model.state_dict(), "../models/mf_model_weights.pth")
