In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import sys
# add root folder to path
folder = "../../"
sys.path.append(folder)
from src.utils import load_data
from src.utils import plot_metrics_grid
from src.utils import load_baseline_rec_result
from src.metrics import evaluate_recommender_system

In [2]:
users,ratings,movies = load_data('../../data/ml-1m')

In [3]:
users

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [4]:
ratings

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [5]:
movies

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


# Define model class

In [6]:


class RecommenderV3(nn.Module):
    def __init__(self, n_users, n_movies, n_dim, n_user_features, n_movie_features):
        super(RecommenderV3, self).__init__()
        
        # User embedding
        self.user_embedding = nn.Embedding(n_users, n_dim)
        self.user_dense = nn.Linear(n_dim + n_user_features, 64)
        
        # Movie embedding
        self.movie_embedding = nn.Embedding(n_movies, n_dim)
        self.movie_dense = nn.Linear(n_dim + n_movie_features, 64)
        
        # Final dense layers
        self.final_dense1 = nn.Linear(128, 64)
        self.dropout1 = nn.Dropout(0.25)
        self.final_dense2 = nn.Linear(64, 64)
        self.dropout2 = nn.Dropout(0.5)
        self.final_output = nn.Linear(64, 1)
        
    def forward(self, user, user_features, movie, movie_features):
        # User branch
        user_embedded = self.user_embedding(user).view(user.size(0), -1)
        user_input = torch.cat([user_embedded, user_features], dim=1)
        user_out = F.relu(self.user_dense(user_input))
        
        # Movie branch
        movie_embedded = self.movie_embedding(movie).view(movie.size(0), -1)
        movie_input = torch.cat([movie_embedded, movie_features], dim=1)
        movie_out = F.relu(self.movie_dense(movie_input))
        
        # Concatenate user and movie branches
        x = torch.cat([user_out, movie_out], dim=1)
        x = F.relu(self.final_dense1(x))
        x = self.dropout1(x)
        x = F.relu(self.final_dense2(x))
        x = self.dropout2(x)
        x = self.final_output(x)
        
        return x

# Prepare data for training

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, MinMaxScaler

# Process user data
user_enc = LabelEncoder()
users['user_id'] = user_enc.fit_transform(users['user_id']) + 1  # Start IDs from 1

# Encode gender
gender_enc = LabelEncoder()
users['gender'] = gender_enc.fit_transform(users['gender'])

# Normalize age
age_scaler = MinMaxScaler()
users['age'] = age_scaler.fit_transform(users[['age']])

# Encode occupation
occupation_enc = LabelEncoder()
users['occupation'] = occupation_enc.fit_transform(users['occupation'])

# Process movie data
movies['genres'] = movies['genres'].str.split('|')
all_genres = sorted(set(g for genres in movies['genres'] for g in genres))
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(movies['genres'])

genre_df = pd.DataFrame(genre_matrix, columns=mlb.classes_)
movies = movies.join(genre_df)

# Parse release year
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)').astype(int)

# Shift movie_id to start from 1 and handle unknown movies
movie_enc = LabelEncoder()
movies['movie_id'] = movie_enc.fit_transform(movies['movie_id']) + 1

# Merge datasets
data = pd.merge(ratings, users, on='user_id', how='left')
data = pd.merge(data, movies, on='movie_id', how='left')

# Select relevant columns
user_features = ['gender', 'age', 'occupation']
movie_features = list(mlb.classes_) + ['year']
data = data[['user_id', 'movie_id', 'rating'] + user_features + movie_features + ['timestamp']]


# Temporal split (80/20)
train_size = int(0.8 * len(data))

data['year'] = data['year'].fillna(-1)
data[list(mlb.classes_)] = data[list(mlb.classes_)].fillna(-1)
train_data = data[:train_size]
val_data = data[train_size:]

n_users = train_data['user_id'].nunique()
n_movies = train_data['movie_id'].nunique()

In [8]:
data.isna().sum()

user_id        0
movie_id       0
rating         0
gender         0
age            0
occupation     0
Action         0
Adventure      0
Animation      0
Children's     0
Comedy         0
Crime          0
Documentary    0
Drama          0
Fantasy        0
Film-Noir      0
Horror         0
Musical        0
Mystery        0
Romance        0
Sci-Fi         0
Thriller       0
War            0
Western        0
year           0
timestamp      0
dtype: int64

In [9]:
user_features = ['gender', 'age', 'occupation']
movie_features = all_genres + ['year']

In [10]:
train_user_ids = set(train_data['user_id'].values.tolist())
train_movie_ids = set(train_data['movie_id'].values.tolist())

In [11]:
import torch
from torch.utils.data import Dataset, DataLoader

class MovieLensDataset(Dataset):
    def __init__(self, data,user_features,movie_features,user_embedding_id_mapper, movie_embedding_id_mapper):
        self.data = data
        self.user_features = user_features
        self.movie_features = movie_features
        self.user_embedding_id_mapper = user_embedding_id_mapper
        self.movie_embedding_id_mapper = movie_embedding_id_mapper
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        user_id = torch.tensor(self.user_embedding_id_mapper.get(row['user_id'],0), dtype=torch.long)
        movie_id = torch.tensor(self.movie_embedding_id_mapper.get(row['movie_id'],0), dtype=torch.long)
        rating = row['rating']
        user_features_tensor = torch.tensor(row[self.user_features].values, dtype=torch.float32)
        movie_features_tensor = torch.tensor(row[self.movie_features].values, dtype=torch.float32)
        return user_id, user_features_tensor, movie_id, movie_features_tensor, rating


In [12]:
user_embedding_id_mapper = {user_id: idx for idx, user_id in enumerate(train_user_ids,start=1)}
movie_embedding_id_mapper = {movie_id: idx for idx, movie_id in enumerate(train_movie_ids,start=1)}

In [13]:
batch_size = 512
# Prepare training and validation datasets
train_dataset = MovieLensDataset(train_data,user_features, movie_features,user_embedding_id_mapper,movie_embedding_id_mapper)
val_dataset = MovieLensDataset(val_data,user_features,movie_features,user_embedding_id_mapper,movie_embedding_id_mapper)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=8,prefetch_factor=8)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4,prefetch_factor=8)

# Training

In [14]:

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm

# Training loop
n_epochs = 10

# Define the model parameters
n_dim = 50
n_user_features = len(user_features)
n_movie_features = len(movie_features)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
n_users = len(train_user_ids) + 1
n_movies = len(train_movie_ids) + 1
model = RecommenderV3(n_users, n_movies, n_dim, n_user_features, n_movie_features).to(device)

# Define the optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.MSELoss()

# Training loop
n_epochs = 10

for epoch in range(n_epochs):
    model.train()
    running_loss = 0.0
    
    train_pbar = tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}/{n_epochs}", leave=False)
    for user_id, user_features, movie_id, movie_features, rating in train_pbar:
        # Move data to the appropriate device
        user_id = user_id.to(device).long()
        user_features = user_features.to(device)
        movie_id = movie_id.to(device).long()
        movie_features = movie_features.to(device)
        rating = rating.to(device).float()
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(user_id, user_features, movie_id, movie_features)
        loss = criterion(outputs.squeeze(), rating)
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * user_id.size(0)
    epoch_loss = running_loss / len(train_dataset)
    print(f'Epoch {epoch+1}/{n_epochs}, Training Loss: {epoch_loss:.4f}')
    
    # Validation
    model.eval()
    val_loss = 0.0
    val_pbar = tqdm(val_dataloader, desc=f"Validation Epoch {epoch+1}/{n_epochs}", leave=False)
    with torch.no_grad():
        for user_id, user_features, movie_id, movie_features, rating in val_pbar:
            # Move data to the appropriate device
            user_id = user_id.to(device).long()
            user_features = user_features.to(device)
            movie_id = movie_id.to(device).long()
            movie_features = movie_features.to(device)
            rating = rating.to(device).float()
            
            outputs = model(user_id, user_features, movie_id, movie_features)
            loss = criterion(outputs.squeeze(), rating)
            val_loss += loss.item() * user_id.size(0)
    
    val_loss /= len(val_dataset)
    print(f'Epoch {epoch+1}/{n_epochs}, Validation Loss: {val_loss:.4f}')

                                                                        

Epoch 1/10, Training Loss: 5.2179


                                                                        

KeyboardInterrupt: 

# Save model

In [None]:
# def save_checkpoint(model, optimizer, epoch, loss, genre_enc, user_enc, movie_enc, file_path='checkpoint.pth'):
#     state = {
#         'epoch': epoch,
#         'model_state_dict': model.state_dict(),
#         'optimizer_state_dict': optimizer.state_dict(),
#         'loss': loss,
#         'genre_enc': genre_enc,
#         'user_enc': user_enc,
#         'movie_enc': movie_enc
#     }
#     torch.save(state, file_path)
#     print(f'Checkpoint saved at {file_path}')

# import os 
# os.makedirs('../../artifacts/dl_model', exist_ok=True)
# save_checkpoint(model, optimizer, n_epochs, val_loss, mlb, user_enc, movie_enc, '../../artifacts/dl_model/recommeder.pth')

In [None]:
def load_checkpoint(file_path, model, optimizer):
    checkpoint = torch.load(file_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']
    genre_enc = checkpoint['genre_enc']
    user_enc = checkpoint['user_enc']
    movie_enc = checkpoint['movie_enc']
    print(f'Checkpoint loaded: Epoch {epoch}, Loss: {loss:.4f}')
    return epoch, loss, genre_enc, user_enc, movie_enc

# Example usage
n_users = data['user_id'].nunique()
n_movies = data['movie_id'].nunique()
n_dim = 50
n_user_features = len(user_features)
n_movie_features = len(movie_features)

model2 = RecommenderV3(n_users, n_movies, n_dim, n_user_features, n_movie_features).cuda()
optimizer = optim.Adam(model2.parameters(), lr=0.0001)
start_epoch, start_loss, genre_enc, user_enc, movie_enc = load_checkpoint('../../artifacts/dl_model/recommeder.pth', model2, optimizer)


# Inference

In [26]:
import torch
import numpy as np
from tqdm import tqdm

def get_recommendations(model, user_id_tensor, user_features_tensor, movie_ids_tensor, movie_features_tensor, top_k=25, batch_size=512):
    model.eval()
    with torch.no_grad():
        # Make batched predictions
        predictions = []
        for i in tqdm(range(0, len(movie_ids_tensor), batch_size)):
            batch_user_id = user_id_tensor[i:i+batch_size]
            batch_user_features = user_features_tensor[i:i+batch_size]
            batch_movie_ids = movie_ids_tensor[i:i+batch_size]
            batch_movie_features = movie_features_tensor[i:i+batch_size]
            
            batch_predictions = model(batch_user_id, batch_user_features, batch_movie_ids, batch_movie_features)
            predictions.append(batch_predictions.cpu().numpy())
        
        # Concatenate predictions from batches
        predictions = np.concatenate(predictions)
    
    predictions = predictions.squeeze()
    # Get top-k movie indices
    top_k_indices = predictions.argsort()[-top_k:][::-1]
    print(top_k_indices)
    recommended_movie_ids = movie_ids_tensor[top_k_indices.copy()].cpu().numpy()
    return recommended_movie_ids.tolist()

# Example user selection
user_id = val_data['user_id'].iloc[0]
user_row = val_data[val_data['user_id'] == user_id].iloc[0]
user_features = [
    user_row['age'],
    user_row['gender'],
    user_row['occupation']
]

# Example: Assume movies_user_didnt_see is already defined
movies_user_didnt_see = val_data['movie_id'].unique()  # Replace with actual list of unseen movies

# Prepare movie features for all movies the user didn't see
movie_ids = movies_user_didnt_see
movie_features = []
for movie_id in movie_ids:
    movie_row = val_data[val_data['movie_id'] == movie_id].iloc[0]
    movie_feature = np.concatenate([
        movie_row[all_genres].values,
        np.array([movie_row['year']])
    ])
    movie_features.append(movie_feature)

movie_ids = [movie_embedding_id_mapper.get(movie_id,0) for movie_id in movie_ids]
# Convert user_id, user_features, movie_ids, and movie_features to tensors
user_id_tensor = torch.tensor([user_embedding_id_mapper.get(user_id,0)] * len(movie_ids), dtype=torch.long).cuda()[:512]
user_features_tensor = torch.tensor([user_features] * len(movie_ids), dtype=torch.float).cuda()[:512]
movie_ids_tensor = torch.tensor(movie_ids, dtype=torch.long).cuda()[:512]
movie_features_tensor = torch.tensor(movie_features, dtype=torch.float).cuda()[:512]

# Generate recommendations for the user
recommended_movie_ids = get_recommendations(model, user_id_tensor, user_features_tensor, movie_ids_tensor, movie_features_tensor)


100%|██████████| 1/1 [00:00<00:00, 1369.79it/s]

[ 46 261  91 271  56  88 144 117 325 270 377 255 256  25 405 484  18  16
 162 159 147  27 423 446 341]





In [None]:
# model2(user_id_tensor, user_features_tensor, movie_ids_tensor, movie_features_tensor)

In [None]:
print(user_id_tensor.dtype)
print(user_features_tensor.dtype)
print(movie_ids_tensor.dtype)
print(movie_features_tensor.dtype)

#print shapes
print(user_id_tensor.shape)
print(user_features_tensor.shape)
print(movie_ids_tensor.shape)
print(movie_features_tensor.shape)


In [None]:
_, _, movie_ids_tensor, _, _ = next(train_dataloader.__iter__())

In [None]:
# print(user_id_tensor.dtype)
# print(user_features_tensor.dtype)
# print(movie_ids_tensor.dtype)
# print(movie_features_tensor.dtype)

# #print shapes
# print(user_id_tensor.shape)
# print(user_features_tensor.shape)
# print(movie_ids_tensor.shape)
# print(movie_features_tensor.shape)


In [17]:
model(user_id_tensor.cuda(), user_features_tensor.cuda(), movie_ids_tensor.cuda(), movie_features_tensor.cuda())

tensor([[2.7674],
        [2.8272],
        [2.8457],
        [2.8328],
        [2.8027],
        [2.8331],
        [2.8339],
        [2.8280],
        [2.8441],
        [2.8023],
        [2.8098],
        [2.8278],
        [2.8179],
        [2.8172],
        [2.8323],
        [2.8342],
        [2.8493],
        [2.8393],
        [2.8494],
        [2.8167],
        [2.8290],
        [2.8273],
        [2.7944],
        [2.8227],
        [2.8339],
        [2.8506],
        [2.8047],
        [2.8480],
        [2.7802],
        [2.7517],
        [2.8313],
        [2.8222],
        [2.8376],
        [2.7960],
        [2.7851],
        [2.8445],
        [2.8411],
        [2.8305],
        [2.8329],
        [2.8277],
        [2.8459],
        [2.8456],
        [2.8206],
        [2.8196],
        [2.8216],
        [2.8169],
        [2.8693],
        [2.8080],
        [2.7707],
        [2.8206],
        [2.8422],
        [2.8380],
        [2.7848],
        [2.7893],
        [2.8185],
        [2

In [None]:
movie_feature

In [None]:
movie_row