# Neural Collaborative Filtering

### Pre-requisites

In [1]:
import warnings
warnings.filterwarnings(action='ignore')

### Import necessary libraries

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

### Load the data

In [3]:
# Read the merged ratings/metadata CSV, then preview five randomly chosen rows.
df = pd.read_csv(filepath_or_buffer='../Datasets/merged_moviecine_tmdb.csv')
df.sample(n=5)

Unnamed: 0,user_id,movie_id,rating,timestamp,title,release_date,genre_names,original_language_full
8789330,171636,4638,2.0,1674530557,Hot Fuzz,2007-02-14,"['Crime', 'Action', 'Comedy']",English
4343493,84433,4368,2.0,1005626863,Dark Wolf,2003-04-15,['Horror'],English
3148319,61157,8665,5.0,1620217004,K-19: The Widowmaker,2002-07-19,"['Drama', 'History', 'Thriller', 'Mystery', 'W...",English
7706255,150309,31225,3.0,1155505369,Paris Is Burning,1991-03-13,['Documentary'],English
1786627,34729,2132,4.0,951554777,Totally Blonde,2001-01-01,['Comedy'],English


### Encoding the data

In [4]:
# Map raw user IDs to contiguous integer codes (stored in the 'user' column).
user_enc = LabelEncoder()
df['user'] = user_enc.fit_transform(df['user_id'])

# Same treatment for movie IDs (stored in the 'movie' column).
movie_enc = LabelEncoder()
df['movie'] = movie_enc.fit_transform(df['movie_id'])

In [5]:
# Number of distinct encoded users and movies; these size the embedding tables.
num_users, num_movies = (df[column].nunique() for column in ('user', 'movie'))

In [6]:
# Display the number of distinct encoded users.
num_users

200924

In [7]:
# Display the number of distinct encoded movies.
num_movies

11438

### Holdout

In [8]:
# Hold out 20% of the interactions; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(
    df[['user', 'movie', 'rating']],
    test_size=0.2,
    random_state=42,
)

### PyTorch Dataset

In [9]:
class BPRDataset(Dataset):
    """Yield (user, positive item, negative item) triples for BPR training.

    Every row of ``df`` is treated as one observed (positive) interaction.
    For each positive, a negative item is drawn uniformly at random from the
    IDs in ``range(num_items)`` that the same user has no row for in ``df``.

    Args:
        df: DataFrame with ``'user'`` and ``'movie'`` columns.
        num_items: Size of the item ID space negatives are sampled from.
    """

    def __init__(self, df, num_items):
        # user -> set of items the user interacted with; used to reject negatives.
        self.user_item_dict = df.groupby('user')['movie'].apply(set).to_dict()
        self.users = df['user'].values
        self.items = df['movie'].values
        self.num_items = num_items

    def __len__(self):
        """Return the number of positive interactions."""
        return len(self.users)

    def _sample_negative(self, user):
        """Draw one item ID in ``range(num_items)`` the user has not interacted with.

        Raises:
            ValueError: if the user has interacted with every item in
                ``range(num_items)``, so no negative exists.
        """
        # Look the seen-set up once instead of on every rejection-loop iteration.
        seen = self.user_item_dict.get(user, frozenset())

        if len(seen) >= self.num_items:
            # Rejection sampling could spin forever here, so check exactly.
            candidates = [item for item in range(self.num_items) if item not in seen]
            if not candidates:
                raise ValueError(
                    f"user {user} has interacted with all {self.num_items} items; "
                    "no negative item can be sampled"
                )
            return candidates[np.random.randint(0, len(candidates))]

        # len(seen) < num_items guarantees at least one unseen ID exists,
        # so this loop terminates.
        while True:
            neg_item = np.random.randint(0, self.num_items)
            if neg_item not in seen:
                return neg_item

    def __getitem__(self, idx):
        """Return ``(user, pos_item, neg_item)`` as 0-dim ``torch.long`` tensors."""
        user = self.users[idx]
        pos_item = self.items[idx]
        neg_item = self._sample_negative(user)

        return torch.tensor(user, dtype=torch.long), \
               torch.tensor(pos_item, dtype=torch.long), \
               torch.tensor(neg_item, dtype=torch.long)
               
# Wrap the training split in the BPR triple dataset and batch it with shuffling.
train_dataset = BPRDataset(df=train_df, num_items=num_movies)
train_loader = DataLoader(dataset=train_dataset, batch_size=1024, shuffle=True)

KeyboardInterrupt: 

### Model Building

In [None]:
class NCF(nn.Module):
    """Two-branch user/item scorer combining a GMF branch and an MLP branch.

    The GMF branch multiplies a user embedding and an item embedding
    element-wise. The MLP branch concatenates a second pair of embeddings and
    reduces them to a single value. A final linear layer maps the concatenation
    of the GMF vector and the MLP value to one score per (user, item) pair.

    Args:
        num_users: Number of rows in the user embedding tables.
        num_items: Number of rows in the item embedding tables.
        embedding_dim: Width of every embedding vector.
    """

    def __init__(self, num_users, num_items, embedding_dim=32):
        super().__init__()

        # GMF part
        self.user_embed_gmf = nn.Embedding(num_users, embedding_dim)
        self.item_embed_gmf = nn.Embedding(num_items, embedding_dim)

        # MLP part (separate embeddings from the GMF branch)
        self.user_embed_mlp = nn.Embedding(num_users, embedding_dim)
        self.item_embed_mlp = nn.Embedding(num_items, embedding_dim)

        self.mlp = nn.Sequential(
            nn.Linear(embedding_dim * 2, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

        # Final layer: embedding_dim GMF features + 1 MLP output value
        self.output_layer = nn.Linear(embedding_dim + 1, 1)

    def forward(self, user, item):
        """Score each (user, item) pair.

        Args:
            user: Integer tensor of user indices.
            item: Integer tensor of item indices, same shape as ``user``.

        Returns:
            Tensor of scores with the trailing size-1 feature axis removed,
            i.e. shape ``(batch,)`` for 1-D inputs, including a batch of one.
        """
        # GMF part: element-wise product of the two embeddings
        gmf_user = self.user_embed_gmf(user)
        gmf_item = self.item_embed_gmf(item)
        gmf_out = gmf_user * gmf_item

        # MLP part: concatenated embeddings reduced to one value per pair
        mlp_user = self.user_embed_mlp(user)
        mlp_item = self.item_embed_mlp(item)
        mlp_input = torch.cat([mlp_user, mlp_item], dim=-1)
        mlp_out = self.mlp(mlp_input)

        # Fuse both branches and project to a single score
        final_input = torch.cat([gmf_out, mlp_out], dim=-1)
        prediction = self.output_layer(final_input)

        # Drop only the feature axis. A bare squeeze() would also drop the
        # batch axis when the batch holds a single pair.
        return prediction.squeeze(-1)
    
    # Define BPR loss function
def bpr_loss(pos_scores, neg_scores):
    """Bayesian Personalized Ranking loss: mean of ``-log(sigmoid(pos - neg))``.

    Args:
        pos_scores: Tensor of scores for observed (positive) items.
        neg_scores: Tensor of scores for sampled negative items, same shape.

    Returns:
        0-dim tensor holding the mean loss over all pairs.
    """
    # logsigmoid is evaluated stably. log(sigmoid(x) + 1e-8) instead saturates
    # at log(1e-8) with a zero gradient once sigmoid(x) underflows, which
    # stalls learning on the worst-ranked pairs.
    return -nn.functional.logsigmoid(pos_scores - neg_scores).mean()


### Model Training

In [None]:
# Run on the GPU when torch can see one, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NCF(num_users, num_movies).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 20
train_losses = []  # mean BPR loss per epoch

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for user, pos_item, neg_item in train_loader:
        # Move the whole triple to the training device.
        user, pos_item, neg_item = (
            batch.to(device) for batch in (user, pos_item, neg_item)
        )

        # Clear stale gradients, then score the positive and negative item
        # for the same users and rank them against each other.
        optimizer.zero_grad()
        loss = bpr_loss(model(user, pos_item), model(user, neg_item))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Average over the number of batches seen this epoch.
    avg_loss = total_loss / len(train_loader)
    train_losses.append(avg_loss)
    print(f"Epoch {epoch+1}/{epochs}, BPR Loss: {avg_loss:.4f}")


### Model Evaluation

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
class RatingDataset(Dataset):
    """Expose the 'user', 'movie' and 'rating' columns of a DataFrame as tensors.

    Indexing returns a ``(user, item, rating)`` tuple; users and items are
    ``torch.long`` and ratings are ``torch.float``.
    """

    def __init__(self, df):
        def column(name, dtype):
            # Copy one DataFrame column into a tensor of the requested dtype.
            return torch.tensor(df[name].values, dtype=dtype)

        self.users = column('user', torch.long)
        self.items = column('movie', torch.long)
        self.ratings = column('rating', torch.float)

    def __len__(self):
        return self.ratings.shape[0]

    def __getitem__(self, idx):
        sample = (self.users[idx], self.items[idx], self.ratings[idx])
        return sample
test_dataset = RatingDataset(test_df)
# Create DataLoader for testing
test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False)
loss_fn1 = nn.MSELoss()

# NOTE(review): the training loop optimises a BPR ranking loss, so the model's
# scores are not fitted to rating values. Read the MSE/RMSE/MAE below with
# that in mind.
model.eval()
all_preds = []
all_targets = []
val_losses = []
val_loss = 0  # running sum of squared errors, normalised after the loop

with torch.no_grad():
    for users, movies, ratings in test_loader:
        users, movies, ratings = users.to(device), movies.to(device), ratings.to(device)

        preds = model(users, movies)
        preds = preds.view(-1)
        ratings = ratings.view(-1)

        loss = loss_fn1(preds, ratings)
        # Weight each batch mean by its size. A plain average of batch means
        # would over-weight the final, smaller batch.
        val_loss += loss.item() * ratings.size(0)

        all_preds.extend(preds.cpu().numpy())
        all_targets.extend(ratings.cpu().numpy())

# Per-sample mean, so this matches the MSE computed over all predictions.
val_loss /= len(test_dataset)
val_losses.append(val_loss)

# Compute RMSE and MAE
rmse = np.sqrt(mean_squared_error(all_targets, all_preds))
mae = mean_absolute_error(all_targets, all_preds)

print(f"\n📊 Evaluation Results:")
print(f"Validation Loss: {val_loss:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE : {mae:.4f}")

In [None]:
import numpy as np
import torch

def evaluate_topk_metrics(model, test_df, all_items, k=10, num_users=1000, device='cpu',
                          num_negatives=99):
    """
    Evaluate HitRate@K, Precision@K, and NDCG@K with sampled negatives.

    For each sampled user, the first 'movie' listed for that user in test_df
    is the true item. It is ranked against ``num_negatives`` items drawn at
    random from ``all_items`` minus that user's test_df items.

    NOTE: only the user's test_df items are excluded from the negative pool.
    Items the user interacted with elsewhere (e.g. in training data) may still
    be drawn as negatives.

    model: trained PyTorch NCF model, called as model(user_tensor, item_tensor)
    test_df: DataFrame with columns ['user', 'movie']
    all_items: set of all item IDs
    k: top-k items to consider
    num_users: maximum number of users to evaluate
    device: 'cuda' or 'cpu'
    num_negatives: negatives ranked against each true item (default 99)

    Returns a dict with keys "HitRate@K", "Precision@K" and "NDCG@K". Users
    with fewer than ``num_negatives`` candidates are skipped. If no user can
    be evaluated, every metric is NaN.
    """
    model.eval()
    hits, ndcgs, precisions = [], [], []

    users = test_df['user'].unique()
    sampled_users = np.random.choice(users, min(num_users, len(users)), replace=False)

    # Group the sampled users' rows once instead of re-scanning the whole
    # frame with a boolean mask for every user.
    sampled_rows = test_df[test_df['user'].isin(sampled_users)]
    items_by_user = {
        user: movies.tolist()
        for user, movies in sampled_rows.groupby('user')['movie']
    }

    for user in sampled_users:
        user_items = items_by_user[user]
        true_item = user_items[0]

        # Sample negatives from items the user has no test row for
        negatives = list(all_items - set(user_items))
        if len(negatives) < num_negatives:
            continue  # Skip if not enough negatives
        sampled_negatives = np.random.choice(
            negatives, size=num_negatives, replace=False
        ).tolist()

        # Combine true item with negatives
        test_items = [true_item] + sampled_negatives
        user_tensor = torch.tensor([int(user)] * len(test_items), dtype=torch.long, device=device)
        item_tensor = torch.tensor(test_items, dtype=torch.long, device=device)

        with torch.no_grad():
            # reshape(-1) keeps a 1-D array even when only one item is scored
            scores = model(user_tensor, item_tensor).reshape(-1).cpu().numpy()

        # Highest score first
        ranked_items = np.array(test_items)[np.argsort(-scores)]

        top_k = ranked_items[:k]
        hit = int(true_item in top_k)
        hits.append(hit)

        if hit:
            rank_position = np.where(top_k == true_item)[0][0] + 1
            ndcgs.append(1 / np.log2(rank_position + 1))
        else:
            ndcgs.append(0)

        # With one relevant item per user, precision@k is hit / k
        precisions.append(hit / k)

    if not hits:
        # Nothing could be evaluated: report NaN explicitly rather than
        # relying on np.mean([]), which also emits a RuntimeWarning.
        nan = float('nan')
        return {"HitRate@K": nan, "Precision@K": nan, "NDCG@K": nan}

    return {
        "HitRate@K": float(np.mean(hits)),
        "Precision@K": float(np.mean(precisions)),
        "NDCG@K": float(np.mean(ndcgs))
    }

In [None]:
all_items = set(df['movie'].unique())

# Step 2: Create a test DataFrame with user-movie pairs
# Take one rating per user from the holdout split `test_df`. Building this from
# the full `df` would mostly pick interactions that were in the training
# split. The result gets its own name so the holdout `test_df` is not
# overwritten and earlier cells stay re-runnable.
topk_test_df = test_df.groupby('user').first().reset_index()[['user', 'movie']]

metrics = evaluate_topk_metrics(model, topk_test_df, all_items, k=10, device=device)
print(f"🎯 Evaluation Results:")
print(f"HitRate@10  : {metrics['HitRate@K']:.4f}")
print(f"Precision@10: {metrics['Precision@K']:.4f}")
print(f"NDCG@10     : {metrics['NDCG@K']:.4f}")

### Output Visualization

In [None]:
plt.figure(figsize=(10, 5))
# Markers are needed because a series with a single point (such as a
# validation loss recorded once) draws nothing as a bare line.
plt.plot(train_losses, marker='o', label='Training Loss')
plt.plot(val_losses, marker='o', label='Validation Loss')
plt.title('Loss Curves')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.show()

### Dumping the model and encoders

In [None]:
import pickle
# Persist the trained weights.
torch.save(model.state_dict(), "ncf_model.pth")

# Pickle both fitted label encoders (movie first, then user).
for encoder_path, encoder in (
    ("movie_encoder.pkl", movie_enc),
    ("user_encoder.pkl", user_enc),
):
    with open(encoder_path, "wb") as f:
        pickle.dump(encoder, f)