# Neural Collaborative Filtering

### Pre-requisites

In [17]:
import warnings
# Suppress every warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation and numerical
# warnings raised by any later cell — consider narrowing it to specific categories.
warnings.filterwarnings(action='ignore')

### Import necessary libraries

In [18]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import pickle

### Load the data

In [19]:
# Ratings merged with TMDB metadata; the path is relative to the notebook's directory.
ratings_csv = '../Datasets/merged_moviecine_tmdb.csv'
df = pd.read_csv(ratings_csv)
# Preview a handful of random rows.
df.sample(5)

Unnamed: 0,user_id,movie_id,rating,timestamp,title,release_date,genre_names,original_language_full
1512553,29457,2176,3.5,1112839937,The Glass House,2001-09-14,"['Drama', 'Thriller']",English
1817397,35340,4470,4.0,1069963591,Das Phantom,1994-09-05,"['Crime', 'Documentary']",German
446383,8869,671,5.0,1162869006,Harry Potter and the Philosopher's Stone,2001-11-16,"['Adventure', 'Fantasy']",English
1612523,31361,41569,5.0,1459354839,The Nomi Song,2004-10-14,"['Documentary', 'Music']",English
3616387,70300,1374,0.5,1329612093,Rocky IV,1985-11-21,['Drama'],English


### Encoding the data

In [20]:
# One encoder per id space: raw user/movie ids are mapped to contiguous
# integer codes, which is what the embedding tables index by.
user_enc, movie_enc = LabelEncoder(), LabelEncoder()

for encoder, raw_col, code_col in (
    (user_enc, 'user_id', 'user'),
    (movie_enc, 'movie_id', 'movie'),
):
    df[code_col] = encoder.fit_transform(df[raw_col])

In [21]:
# Number of distinct encoded users / movies; used to size the embedding tables.
num_users, num_movies = (df[code_col].nunique() for code_col in ('user', 'movie'))

In [22]:
num_users

200924

In [23]:
num_movies

11438

### Holdout

In [24]:
# Random 80/20 split over individual interactions (not per user); seeded for repeatability.
interactions = df[['user', 'movie', 'rating']]
train_df, test_df = train_test_split(interactions, test_size=0.2, random_state=42)

### PyTorch Dataset

In [25]:
class BPRDataset(Dataset):
    """Training triples ``(user, positive item, sampled negative item)`` for BPR.

    Every row of ``df`` is one observed (user, movie) interaction and yields one
    sample. The negative item is drawn uniformly at random from the item ids in
    ``range(num_items)`` that the user has no interaction with *in ``df``*;
    interactions that are not part of ``df`` are unknown to this dataset and can
    therefore still be drawn as negatives.

    Args:
        df: DataFrame with integer ``'user'`` and ``'movie'`` columns.
        num_items: Size of the item id space; negatives come from
            ``range(num_items)``.
    """

    # Random draws attempted before switching to an exhaustive scan of the id space.
    _MAX_REJECTION_DRAWS = 100
    # Shared empty container so a lookup miss does not allocate a new set per sample.
    _NO_ITEMS = frozenset()

    def __init__(self, df, num_items):
        # user -> set of items that user interacted with in ``df``
        self.user_item_dict = df.groupby('user')['movie'].apply(set).to_dict()
        self.users = df['user'].values
        self.items = df['movie'].values
        self.num_items = num_items

    def __len__(self):
        """Number of positive interactions (one sample per row of ``df``)."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return ``(user, pos_item, neg_item)`` as 0-d tensors.

        Raises:
            ValueError: if the user has interacted with every item in
                ``range(num_items)``, so no negative exists. The previous
                unbounded rejection loop would spin forever in that case.
        """
        user = self.users[idx]
        pos_item = self.items[idx]
        seen = self.user_item_dict.get(user, self._NO_ITEMS)

        # Rejection sampling is cheap while the user has seen only a small
        # fraction of the catalogue; the attempt count is bounded.
        for _ in range(self._MAX_REJECTION_DRAWS):
            neg_item = np.random.randint(0, self.num_items)
            if neg_item not in seen:
                break
        else:
            # Every draw collided: enumerate the unseen ids and pick among them.
            unseen = [item for item in range(self.num_items) if item not in seen]
            if not unseen:
                raise ValueError(
                    f"user {user} has interacted with all {self.num_items} items; "
                    "no negative item can be sampled"
                )
            neg_item = unseen[np.random.randint(0, len(unseen))]

        return torch.tensor(user), torch.tensor(pos_item), torch.tensor(neg_item)

# Negatives are resampled every time a row is fetched, so each epoch sees new triples.
train_dataset = BPRDataset(df=train_df, num_items=num_movies)
train_loader = DataLoader(dataset=train_dataset, batch_size=512, shuffle=True)

### Model Building

In [26]:
class NCF(nn.Module):
    """Two-branch neural collaborative filtering scorer.

    * GMF branch: element-wise product of a user and an item embedding.
    * MLP branch: concatenated (separate) user and item embeddings passed
      through a small feed-forward stack that ends in a single unit.

    The GMF vector (``embedding_dim`` values) and the MLP scalar are
    concatenated and mapped to one score by ``output_layer``.

    Args:
        num_users: Number of rows in the user embedding tables.
        num_items: Number of rows in the item embedding tables.
        embedding_dim: Width of every embedding table.
    """

    def __init__(self, num_users, num_items, embedding_dim=64):
        super(NCF, self).__init__()
        # Each branch owns its own embedding tables.
        self.user_embed_gmf = nn.Embedding(num_users, embedding_dim)
        self.item_embed_gmf = nn.Embedding(num_items, embedding_dim)
        self.user_embed_mlp = nn.Embedding(num_users, embedding_dim)
        self.item_embed_mlp = nn.Embedding(num_items, embedding_dim)

        self.mlp = nn.Sequential(
            nn.Linear(embedding_dim * 2, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

        # +1: the MLP branch contributes a single scalar next to the GMF vector.
        self.output_layer = nn.Linear(embedding_dim + 1, 1)

    def forward(self, user, item):
        """Score (user, item) pairs.

        Args:
            user: Integer tensor of user indices.
            item: Integer tensor of item indices, same shape as ``user``.

        Returns:
            Tensor of scores with the same shape as ``user``: only the trailing
            singleton produced by ``output_layer`` is removed, so a batch of
            one stays 1-D instead of collapsing to a 0-d scalar.
        """
        gmf_user = self.user_embed_gmf(user)
        gmf_item = self.item_embed_gmf(item)
        gmf_out = gmf_user * gmf_item

        mlp_user = self.user_embed_mlp(user)
        mlp_item = self.item_embed_mlp(item)
        mlp_input = torch.cat([mlp_user, mlp_item], dim=-1)
        mlp_out = self.mlp(mlp_input)

        final_input = torch.cat([gmf_out, mlp_out], dim=-1)
        prediction = self.output_layer(final_input)
        # squeeze(-1), not squeeze(): a bare squeeze also drops a batch dimension of size 1.
        return prediction.squeeze(-1)

def bpr_loss(pos_scores, neg_scores):
    """Bayesian Personalized Ranking loss: mean of ``-log(sigmoid(pos - neg))``.

    Args:
        pos_scores: Scores of the observed (positive) items.
        neg_scores: Scores of the sampled negative items, same shape.

    Returns:
        0-d tensor holding the mean pairwise loss.
    """
    # logsigmoid evaluates log(sigmoid(x)) without underflow. The former
    # ``log(sigmoid(x) + 1e-8)`` capped the per-pair loss near 18.42 and zeroed
    # the gradient for exactly the pairs that are ranked worst.
    return -nn.functional.logsigmoid(pos_scores - neg_scores).mean()


### Model Training

In [27]:
# Seed the torch and NumPy global RNGs (same seed as the holdout split) so that
# weight initialisation, DataLoader shuffling and the np.random-based negative
# sampling repeat across Restart & Run All.
torch.manual_seed(42)
np.random.seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NCF(num_users, num_movies, embedding_dim=64).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

# Per-epoch histories; plotted in the visualization section.
train_losses, val_losses = [], []

class RatingDataset(Dataset):
    """Explicit-feedback samples ``(user, item, rating)`` held as tensors.

    The ``'user'`` and ``'movie'`` columns of ``df`` are stored as long
    tensors and ``'rating'`` as a float tensor, all converted once up front.
    """

    # (attribute name, source column, tensor dtype)
    _FIELDS = (
        ('users', 'user', torch.long),
        ('items', 'movie', torch.long),
        ('ratings', 'rating', torch.float),
    )

    def __init__(self, df):
        for attr, column, dtype in self._FIELDS:
            setattr(self, attr, torch.tensor(df[column].values, dtype=dtype))

    def __len__(self):
        """Number of rated interactions."""
        return self.ratings.shape[0]

    def __getitem__(self, idx):
        """Return the ``(user, item, rating)`` tensors at position ``idx``."""
        return tuple(column[idx] for column in (self.users, self.items, self.ratings))

# Held-out interactions, iterated in file order (no shuffling) during validation.
test_dataset = RatingDataset(df=test_df)
test_loader = DataLoader(dataset=test_dataset, batch_size=512)

best_val_loss = float('inf')
patience, wait = 3, 0
num_epochs = 5
# Built once instead of once per validation batch.
mse_criterion = nn.MSELoss()

for epoch in range(num_epochs):
    # ---- Training: pairwise BPR objective on (user, positive, sampled negative) ----
    model.train()
    total_loss = 0
    for user, pos_item, neg_item in train_loader:
        user, pos_item, neg_item = user.to(device), pos_item.to(device), neg_item.to(device)
        pos_scores = model(user, pos_item)
        neg_scores = model(user, neg_item)
        loss = bpr_loss(pos_scores, neg_scores)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    train_losses.append(total_loss / len(train_loader))

    # Validation
    # NOTE(review): the model is optimised only with the BPR ranking loss, so its
    # raw scores are not fitted to the rating scale; the MSE/RMSE/MAE against
    # ratings below (and the early stopping / checkpointing driven by them) may
    # not track ranking quality — consider validating with a ranking metric.
    model.eval()
    val_loss, all_preds, all_targets = 0, [], []
    with torch.no_grad():
        for users, items, ratings in test_loader:
            users, items, ratings = users.to(device), items.to(device), ratings.to(device)
            # reshape(-1): if the last batch holds a single sample the model can
            # hand back a 0-d tensor, which cannot be iterated by extend() below
            # and does not match the (1,) target shape.
            preds = model(users, items).reshape(-1)
            loss = mse_criterion(preds, ratings)
            val_loss += loss.item()
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(ratings.cpu().numpy())
    # Mean of per-batch means (a smaller final batch weighs the same as a full one).
    val_loss /= len(test_loader)
    val_losses.append(val_loss)

    rmse = np.sqrt(mean_squared_error(all_targets, all_preds))
    mae = mean_absolute_error(all_targets, all_preds)
    print(f"Epoch {epoch+1}, BPR Loss: {train_losses[-1]:.4f}, Val Loss: {val_loss:.4f}, RMSE: {rmse:.4f}, MAE: {mae:.4f}")

    # Keep the weights of the best epoch on disk; stop after `patience`
    # consecutive epochs without improvement.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        wait = 0
        torch.save(model.state_dict(), "best_ncf_model.pt")
    else:
        wait += 1
        if wait >= patience:
            print("Early stopping.")
            break

Epoch 1, BPR Loss: 0.1087, Val Loss: 6.8527, RMSE: 2.6178, MAE: 2.1071


KeyboardInterrupt: 

In [None]:
import ast

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def _parse_genres(raw):
    """Turn one ``genre_names`` cell into a list of genre strings.

    Accepts the stringified list stored in the CSV (e.g. ``"['Drama', 'Thriller']"``)
    as well as an already-parsed list. Missing, empty or malformed cells give ``[]``.
    ``ast.literal_eval`` only accepts Python literals, so — unlike ``eval`` —
    file contents are never executed as code.
    """
    if isinstance(raw, (list, tuple)):
        return [str(genre) for genre in raw]
    if not isinstance(raw, str) or not raw.strip():
        return []
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        return []
    if isinstance(parsed, (list, tuple)):
        return [str(genre) for genre in parsed]
    return []


# `df` has one row per *rating*; content features only need one row per *movie*.
# Vectorising (and comparing) every rating row would need a dense rows x rows
# similarity matrix, which does not fit in memory at this scale. `df` itself is
# left untouched so the cell can be re-run safely.
# Assumes the metadata columns are identical across the rows of one movie — the
# first occurrence is kept.
movie_features = (
    df[['movie_id', 'genre_names', 'original_language_full']]
    .drop_duplicates(subset='movie_id')
    .reset_index(drop=True)
)
movie_features['genre_text'] = movie_features['genre_names'].apply(
    lambda raw: ' '.join(_parse_genres(raw))
)
movie_features['combined'] = (
    movie_features['genre_text'] + ' ' + movie_features['original_language_full'].fillna('')
)

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movie_features['combined'])

# Row position in `tfidf_matrix` <-> raw movie id.
movie_id_to_idx = dict(zip(movie_features['movie_id'], movie_features.index))
idx_to_movie_id = dict(zip(movie_features.index, movie_features['movie_id']))

def get_similar_movies(movie_id, top_k=10):
    """Return up to ``top_k`` raw movie ids most similar to ``movie_id``.

    Similarity is the cosine between TF-IDF vectors of "genres + language",
    computed on demand for the query row only. The query movie itself is
    excluded explicitly, so ties with identical metadata cannot push it into
    the result.

    Returns:
        List of movie ids, most similar first; ``[]`` when ``movie_id`` is unknown.
    """
    if movie_id not in movie_id_to_idx:
        return []
    idx = movie_id_to_idx[movie_id]
    sims = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).ravel()
    ranked = np.argsort(-sims, kind='stable')
    top_indices = ranked[ranked != idx][:top_k]
    return [idx_to_movie_id[i] for i in top_indices]

In [None]:
class HybridRecommender:
    """Collaborative recommendations with a content-based cold-start fallback.

    Args:
        model: Scoring model called as ``model(user_tensor, item_tensor)``.
        user_enc: Fitted encoder exposing ``classes_`` and ``transform``.
        movie_enc: Fitted encoder exposing ``classes_`` and ``inverse_transform``.
        device: Device the index tensors are moved to before scoring.
    """

    def __init__(self, model, user_enc, movie_enc, device):
        self.model = model
        self.user_enc = user_enc
        self.movie_enc = movie_enc
        self.device = device

    def recommend(self, user_id, movie_id, top_k=10):
        """Return ``top_k`` raw movie ids for ``user_id``.

        When both ids are known to the encoders, every movie is scored for the
        user and the highest-scoring ones are returned. Otherwise — or when
        scoring fails with an ordinary exception — the result comes from
        ``get_similar_movies(movie_id, top_k)``.

        ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not caught, so
        interrupting a long scoring call actually stops it.
        """
        if user_id in self.user_enc.classes_ and movie_id in self.movie_enc.classes_:
            try:
                user_idx = self.user_enc.transform([user_id])[0]
                user_tensor = torch.tensor([user_idx]*len(self.movie_enc.classes_)).to(self.device)
                item_tensor = torch.arange(len(self.movie_enc.classes_)).to(self.device)
                self.model.eval()
                with torch.no_grad():
                    scores = self.model(user_tensor, item_tensor).cpu().numpy()
                # Highest score first.
                top_indices = np.argsort(scores)[::-1][:top_k]
                return self.movie_enc.inverse_transform(top_indices)
            except Exception as exc:
                # Best-effort: keep serving via the fallback, but say why the
                # collaborative path was abandoned instead of hiding the error.
                print(f"Collaborative scoring failed ({exc!r}); falling back to content-based similarity")
        print("‚ö†Ô∏è Cold-start: Using content-based fallback")
        return get_similar_movies(movie_id, top_k)

### Model Evaluation

In [None]:
def evaluate_topk_metrics(model, test_df, all_items, k=10, device='cpu',
                          num_negatives=99, max_users=1000):
    """Sampled leave-one-out top-K evaluation.

    For each evaluated user, one of the user's items in ``test_df`` is ranked
    against ``num_negatives`` items drawn at random from ``all_items`` minus the
    user's ``test_df`` items. Items the user interacted with outside
    ``test_df`` are not known here and may be drawn as negatives.

    Args:
        model: Scoring model called as ``model(user_tensor, item_tensor)``.
        test_df: DataFrame with ``'user'`` and ``'movie'`` columns.
        all_items: Set of all candidate item ids.
        k: Cut-off of the ranked list.
        device: Device the index tensors are moved to.
        num_negatives: Negatives ranked against the positive (previously fixed at 99).
        max_users: Upper bound on randomly sampled users (previously fixed at 1000).

    Returns:
        Dict with ``"HitRate@K"``, ``"Precision@K"``, ``"Recall@K"`` and
        ``"NDCG@K"`` averaged over evaluated users. All values are NaN when no
        user could be evaluated (empty ``test_df`` or too few negatives).
    """
    model.eval()
    hits, ndcgs, precisions, recalls = [], [], [], []
    users = test_df['user'].unique()
    if len(users):
        sampled_users = np.random.choice(users, size=min(max_users, len(users)), replace=False)
    else:
        sampled_users = users

    for user in sampled_users:
        user_movies = set(test_df[test_df['user'] == user]['movie'])
        if not user_movies:
            continue
        true_item = list(user_movies)[0]
        negatives = list(all_items - user_movies)
        # Skip users for whom the candidate list cannot be filled.
        if len(negatives) < num_negatives:
            continue
        sampled_negatives = np.random.choice(negatives, num_negatives, replace=False).tolist()
        test_items = [true_item] + sampled_negatives
        user_tensor = torch.tensor([user] * len(test_items)).to(device)
        item_tensor = torch.tensor(test_items).to(device)
        with torch.no_grad():
            # reshape(-1) keeps the scores 1-D even for a single candidate.
            scores = model(user_tensor, item_tensor).reshape(-1).cpu().numpy()
        ranked_items = np.array(test_items)[np.argsort(-scores)][:k]
        positions = np.where(ranked_items == true_item)[0]
        hit = int(positions.size > 0)
        hits.append(hit)
        # Exactly one relevant item per user: precision = hit/k, recall = hit.
        precisions.append(hit / k)
        recalls.append(hit / 1)
        ndcgs.append(1 / np.log2(positions[0] + 2) if hit else 0)

    metric_values = {
        "HitRate@K": hits,
        "Precision@K": precisions,
        "Recall@K": recalls,
        "NDCG@K": ndcgs,
    }
    if not hits:
        # Nothing evaluated: report NaN explicitly rather than averaging empty lists.
        return {name: float('nan') for name in metric_values}
    return {name: np.mean(values) for name, values in metric_values.items()}

In [None]:
all_items = set(df['movie'].unique())
# One held-out interaction per user. Taken from `test_df`, not the full `df`:
# most rows of `df` were used for training, so sampling positives from it
# would score the model on interactions it has already been fitted to.
test_df_sample = test_df.groupby('user').first().reset_index()[['user', 'movie']]
metrics = evaluate_topk_metrics(model, test_df_sample, all_items, k=10, device=device)

print("\nüéØ Top-K Evaluation:")
print(f"HitRate@10  : {metrics['HitRate@K']:.4f}")
print(f"Precision@10: {metrics['Precision@K']:.4f}")
print(f"Recall@10   : {metrics['Recall@K']:.4f}")
print(f"NDCG@10     : {metrics['NDCG@K']:.4f}")

### Output Visualization

In [None]:
# Per-epoch curves. The training curve is the BPR loss and the validation
# curve is an MSE, so the two share an axis but not a unit.
fig, ax = plt.subplots(figsize=(8, 5))
for history, curve_label in ((train_losses, 'Train Loss'), (val_losses, 'Val Loss')):
    ax.plot(history, label=curve_label)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training vs Validation Loss')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

### Dumping the model and encoders

In [None]:
# Restore the best checkpoint written during training. map_location makes a
# checkpoint saved from a GPU run loadable when the current `device` is the CPU.
model.load_state_dict(torch.load("best_ncf_model.pt", map_location=device))

# Persist the fitted encoders so raw ids can be mapped to embedding indices at
# inference time. Only unpickle these files from a trusted location.
with open("movie_encoder.pkl", "wb") as f:
    pickle.dump(movie_enc, f)
with open("user_encoder.pkl", "wb") as f:
    pickle.dump(user_enc, f)