# Neural Collaborative Filtering 

* Goal: Build a Neural Collaborative Filtering (NCF)-based hybrid recommender system with Pairwise Ranking 
* Use both user and item features
* Use text embedding of product title, descriprion and features
* Model seems to be overfitting, needs some work

In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from sklearn.metrics import ndcg_score
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader

os.chdir("../")
from aux.feature_engineering import calculate_rolling_stats
from aux.train_test_split import (
    global_temporal_split,
    temporal_split_users_in_both_sets,
    temporal_split_users_with_cold_start,
)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
# pd.set_option('display.max_colwidth', None)

# Load Data

In [3]:
dataset_reviews = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_All_Beauty", trust_remote_code=True)
df_reviews = dataset_reviews["full"].to_pandas()

dataset_items = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_All_Beauty", split="full", trust_remote_code=True)
df_items = dataset_items.to_pandas()

# filter out users with low ammount of reviews for now - look at the cold start problem later
min_ammount_reviews = 5
user_review_counts = df_reviews.groupby('user_id').size()
users_with_min_reviews = user_review_counts[user_review_counts >= min_ammount_reviews].index
df_reviews_filtered = df_reviews[df_reviews['user_id'].isin(users_with_min_reviews)]

df = pd.merge(df_reviews_filtered, df_items, on='parent_asin', how='left', suffixes=('_review', '_item'))

# Pre-Process

In [4]:
df['verified_purchase'] = df.verified_purchase.astype('int')
df['store'] = df['store'].fillna('UNKNOWN')
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
df['year'] = df['timestamp'].dt.year

# Feature engineering

In [5]:
df = calculate_rolling_stats(df, 'user_id')  
df = calculate_rolling_stats(df, 'parent_asin')  
df = df.sort_values(by='timestamp')

# Feature Selection

In [6]:
# numerical features
user_review_features = ['user_id', 'rolling_avg_rating_user', 'rolling_review_count_user', 'helpful_vote', 'verified_purchase', 'year']
product_features = ['parent_asin', 'average_rating', 'rolling_avg_rating_product', 'rolling_review_count_product', 'rating_number']
categoricals = ['main_category', 'store']
target = 'rating'

train_test_split_features =  ['timestamp']

columns = train_test_split_features + user_review_features + product_features + categoricals  + [target]
df = df[columns]

In [7]:
user_numeric_cols = [
    "rolling_avg_rating_user",
    "rolling_review_count_user",
    "helpful_vote",
    "verified_purchase",  
    "year"
]

item_numeric_cols = [
    "average_rating",
    "rolling_avg_rating_product",
    "rolling_review_count_product",
    "rating_number"
]

# Train Test Split

In [8]:
train_df, test_df = global_temporal_split(df, split_ratio=0.8, exclude_cold_start_users=False)

# Encode User and Item ids

In [10]:
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

# fit encoders on training data only
user_encoder.fit(train_df['user_id'])
item_encoder.fit(train_df['parent_asin'])  # assuming parent_asin is your item_id

# transform training data
train_user_ids = user_encoder.transform(train_df['user_id'])
train_item_ids = item_encoder.transform(train_df['parent_asin'])

# for test set, handle new users and items
test_user_ids = []
test_item_ids = []

for user_id, item_id in zip(test_df['user_id'], test_df['parent_asin']):
    if user_id in user_encoder.classes_:
        test_user_ids.append(user_encoder.transform([user_id])[0])
    else:
        test_user_ids.append(len(user_encoder.classes_))
    
    if item_id in item_encoder.classes_:
        test_item_ids.append(item_encoder.transform([item_id])[0])
    else:
        test_item_ids.append(len(item_encoder.classes_))


# Load product embedings

In [11]:
train_embeddings = np.load('train_embeddings.npy')
test_embeddings = np.load('test_embeddings.npy')

# User, Item and Product Data

In [12]:
user_data_train = train_df[user_numeric_cols]
user_data_test = test_df[user_numeric_cols]

# process item features and embeddings
base_item_train = train_df[item_numeric_cols]
base_item_test = test_df[item_numeric_cols]

train_emb_df = pd.DataFrame(
    train_embeddings, 
    columns=[f'emb_{i}' for i in range(train_embeddings.shape[1])]
)
test_emb_df = pd.DataFrame(
    test_embeddings, 
    columns=[f'emb_{i}' for i in range(test_embeddings.shape[1])]
)

item_data_train = pd.concat([base_item_train, train_emb_df], axis=1)
item_data_test = pd.concat([base_item_test, test_emb_df], axis=1)

train_ratings = train_df[target]
test_ratings = test_df[target]


In [135]:
#from sklearn.preprocessing import StandardScaler
#
## Scale user features
#user_scaler = StandardScaler()
#user_data_train_scaled = user_scaler.fit_transform(user_data_train)
#user_data_test_scaled = user_scaler.transform(user_data_test)
#
#item_scaler = StandardScaler()
#base_item_train_scaled = item_scaler.fit_transform(base_item_train)
#base_item_test_scaled = item_scaler.transform(base_item_test)
#
## Convert scaled user data back to DataFrame
#    user_data_train_scaled, 
#    columns=user_data_train.columns, 
#    index=user_data_train.index
#)
#user_data_test = pd.DataFrame(
#    user_data_test_scaled, 
#    columns=user_data_test.columns, 
#    index=user_data_test.index
#)
#
#item_data_train = pd.concat([
#    pd.DataFrame(base_item_train_scaled, columns=base_item_train.columns, index=base_item_train.index),
#    train_emb_df
#], axis=1)
#
#item_data_test = pd.concat([
#    pd.DataFrame(base_item_test_scaled, columns=base_item_test.columns, index=base_item_test.index),
#    test_emb_df
#], axis=1)

In [136]:
user_data_train.head()

Unnamed: 0,rolling_avg_rating_user,rolling_review_count_user,helpful_vote,verified_purchase,year
0,0.936376,-0.545577,4.964564,-0.724747,-7.82991
1,-0.27878,-0.545577,7.562062,-0.724747,-7.82991
2,0.936376,-0.486283,0.029319,-0.724747,-7.82991
3,0.936376,-0.42699,0.289068,-0.724747,-7.32
4,0.936376,-0.545577,1.068318,1.379793,-7.32


# Build Datasets

In [137]:
class RankingDataset(Dataset):
    def __init__(self, user_data, item_data, ratings, user_ids, item_ids, num_negative=1):
        #print("\nDataset initialization:")
        #print(f"Raw user_data shape: {user_data.shape}")
        #print(f"Raw item_data shape: {item_data.shape}")
        
        self.user_data = torch.FloatTensor(user_data.values)
        self.item_data = torch.FloatTensor(item_data.values)
        
        #print(f"After conversion to tensor:")
        #print(f"self.user_data shape: {self.user_data.shape}")
        #print(f"self.item_data shape: {self.item_data.shape}")
        
        self.user_ids = torch.LongTensor(user_ids)
        self.item_ids = torch.LongTensor(item_ids)
        self.ratings = ratings.values if hasattr(ratings, 'values') else ratings
        
        # create pairs
        self.pairs = []
        unique_users = np.unique(user_ids)
        
        for user_idx in unique_users:
            # indices of ratings given by this user
            user_indices = np.where(user_ids == user_idx)[0]
            user_ratings = self.ratings[user_indices]
            
            # create positive pairs
            for i in range(len(user_indices)):
                for j in range(i + 1, len(user_indices)):
                    if user_ratings[i] > user_ratings[j]:
                        self.pairs.append((user_idx, user_indices[i], user_indices[j]))
                    elif user_ratings[i] < user_ratings[j]:
                        self.pairs.append((user_idx, user_indices[j], user_indices[i]))
    
    def __len__(self):
        return len(self.pairs)
    
    def __getitem__(self, idx):
        user_idx, pos_item_idx, neg_item_idx = self.pairs[idx]
        #print("\nBatch item shapes:")
        #print(f"Item data shape: {self.item_data.shape}")
        #print(f"Single item features shape: {self.item_data[pos_item_idx].shape}")
        return {
            'user_id': self.user_ids[user_idx],
            'user_features': self.user_data[user_idx],
            'pos_item_id': self.item_ids[pos_item_idx],
            'pos_item_features': self.item_data[pos_item_idx],
            'neg_item_id': self.item_ids[neg_item_idx],
            'neg_item_features': self.item_data[neg_item_idx],
            'pos_rating': self.ratings[pos_item_idx],
            'neg_rating': self.ratings[neg_item_idx]
        }

class EvaluationDataset(Dataset):
    def __init__(self, user_data, item_data, ratings, user_ids, item_ids):
        """
        user_data: shape [n_users, user_feature_dim]
        item_data: shape [n_items, item_feature_dim]
        
        user_ids, item_ids: arrays of length N, each element is an integer ID
          in [0..n_users-1], [0..n_items-1] respectively (after label-encoding).
        """
        self.user_data = torch.FloatTensor(user_data.values)
        self.item_data = torch.FloatTensor(item_data.values)
        
        self.user_ids = torch.LongTensor(user_ids)
        self.item_ids = torch.LongTensor(item_ids)
        
        self.ratings = (
            ratings.values if hasattr(ratings, "values") else ratings
        )

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, idx):

        return {
            "user_id": self.user_ids[idx],
            "user_features": self.user_data[idx],  
            "item_id": self.item_ids[idx],
            "item_features": self.item_data[idx],  
            "rating": self.ratings[idx],
        }
       

In [138]:
print("Before dataset creation:")
print(f"user_data_train shape: {user_data_train.shape}")
print(f"item_data_train shape: {item_data_train.shape}")
print(f"user_data_test shape: {user_data_test.shape}")
print(f"item_data_test shape: {item_data_test.shape}")

Before dataset creation:
user_data_train shape: (11987, 5)
item_data_train shape: (11987, 388)
user_data_test shape: (2997, 5)
item_data_test shape: (2997, 388)


In [139]:
train_item_ids.shape

(11987,)

In [140]:
train_user_ids.shape

(11987,)

In [141]:
train_dataset = RankingDataset(
    user_data=user_data_train,  
    item_data=item_data_train, 
    ratings=train_ratings,
    user_ids=train_user_ids,
    item_ids=train_item_ids
)
# to train
test_dataset = RankingDataset(
    user_data=user_data_test,
    item_data=item_data_test,
    ratings=test_ratings,
    user_ids=test_user_ids,
    item_ids=test_item_ids
)
# for eval has to have a different structure
evaluation_dataset = EvaluationDataset(
    user_data=user_data_test,
    item_data=item_data_test,
    ratings=test_ratings,
    user_ids=test_user_ids,
    item_ids=test_item_ids
)    

# create data loaders
train_loader = DataLoader(
    train_dataset, 
    batch_size=64, 
    shuffle=True
)

test_loader = DataLoader(
    test_dataset, 
    batch_size=64, 
    shuffle=False
)


In [165]:
class RankingNCF(nn.Module):
    def __init__(self, user_features, item_features, hidden_layers=[8]):
        super().__init__()
        
        self.user_linear = nn.Linear(user_features, hidden_layers[0])
        self.user_bn = nn.BatchNorm1d(hidden_layers[0])
        self.user_dropout = nn.Dropout(0.5)
        
        self.item_linear = nn.Linear(item_features, hidden_layers[0])
        self.item_bn = nn.BatchNorm1d(hidden_layers[0])
        self.item_dropout = nn.Dropout(0.5)
        
        self.score = nn.Sequential(
            nn.Linear(hidden_layers[0] * 2, hidden_layers[0]),
            nn.BatchNorm1d(hidden_layers[0]),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_layers[0], 1)
        )

    def forward(self, user_features, item_features):
        # User tower with residual
        user_out = self.user_linear(user_features)
        user_out = self.user_bn(user_out)
        user_out = F.relu(user_out)
        user_out = self.user_dropout(user_out) + user_out  # residual connection
        
        # Item tower with residual
        item_out = self.item_linear(item_features)
        item_out = self.item_bn(item_out)
        item_out = F.relu(item_out)
        item_out = self.item_dropout(item_out) + item_out  # residual connection
        
        combined = torch.cat([user_out, item_out], dim=1)
        return self.score(combined).squeeze()

# Train Model

In [166]:
import torch.nn.functional as F
def train_ranking_model(model, train_loader, val_loader, epochs=10, lr=0.0005):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=0.005)
    
    best_val_loss = float('inf')
    best_model = None
    
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        
        for batch in train_loader:
            user_features = batch['user_features'].to(device)
            pos_item_features = batch['pos_item_features'].to(device)
            neg_item_features = batch['neg_item_features'].to(device)
            
            # forward pass
            pos_scores = model(user_features, pos_item_features)
            neg_scores = model(user_features, neg_item_features)
            
            # BPR loss
            loss = -torch.log(torch.sigmoid(pos_scores - neg_scores)).mean()
            #loss = torch.max(torch.zeros_like(pos_scores), 1 - (pos_scores - neg_scores)).mean()
            
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        avg_train_loss = total_loss / len(train_loader)
        
        model.eval()
        val_loss = 0
        
        with torch.no_grad():
            for batch in val_loader:
                user_features = batch['user_features'].to(device)
                pos_item_features = batch['pos_item_features'].to(device)
                neg_item_features = batch['neg_item_features'].to(device)
                
                pos_scores = model(user_features, pos_item_features)
                neg_scores = model(user_features, neg_item_features)
                
                loss = -torch.log(torch.sigmoid(pos_scores - neg_scores)).mean()
                #loss = torch.max(torch.zeros_like(pos_scores), 1 - (pos_scores - neg_scores)).mean()
                val_loss += loss.item()
        
        avg_val_loss = val_loss / len(val_loader)
        
        print(f'Epoch {epoch+1}/{epochs}:')
        print(f'Training Loss: {avg_train_loss:.4f}')
        print(f'Validation Loss: {avg_val_loss:.4f}')
        print('-' * 50)
        
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_model = model.state_dict().copy()
    
    return best_model

In [167]:
model = RankingNCF(
    user_features=user_data_train.shape[1],  # 5
    item_features=item_data_train.shape[1],  # 388
)

In [168]:
trained_model = train_ranking_model(model, train_loader, test_loader,  epochs=15)
model.load_state_dict(trained_model)

Epoch 1/15:
Training Loss: 0.6965
Validation Loss: 0.6776
--------------------------------------------------
Epoch 2/15:
Training Loss: 0.6708
Validation Loss: 0.6953
--------------------------------------------------
Epoch 3/15:
Training Loss: 0.5433
Validation Loss: 0.6651
--------------------------------------------------
Epoch 4/15:
Training Loss: 0.3820
Validation Loss: 0.6750
--------------------------------------------------
Epoch 5/15:
Training Loss: 0.2750
Validation Loss: 0.6712
--------------------------------------------------
Epoch 6/15:
Training Loss: 0.2213
Validation Loss: 0.6651
--------------------------------------------------
Epoch 7/15:
Training Loss: 0.1936
Validation Loss: 0.6664
--------------------------------------------------
Epoch 8/15:
Training Loss: 0.1725
Validation Loss: 0.7117
--------------------------------------------------
Epoch 9/15:
Training Loss: 0.1651
Validation Loss: 0.7395
--------------------------------------------------
Epoch 10/15:
Traini

<All keys matched successfully>

# Eval Model

In [169]:
class EvaluationDataset(Dataset):
    def __init__(self, user_data, item_data, ratings, user_ids, item_ids):
        """
        user_data: shape [n_users, user_feature_dim]
        item_data: shape [n_items, item_feature_dim]
        
        user_ids, item_ids: arrays of length N, each element is an integer ID
          in [0..n_users-1], [0..n_items-1] respectively (after label-encoding).
        """
        self.user_data = torch.FloatTensor(user_data.values)
        self.item_data = torch.FloatTensor(item_data.values)
        
        self.user_ids = torch.LongTensor(user_ids)
        self.item_ids = torch.LongTensor(item_ids)
        
        self.ratings = (
            ratings.values if hasattr(ratings, "values") else ratings
        )

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, idx):

        return {
            "user_id": self.user_ids[idx],
            "user_data": self.user_data[idx],  
            "item_id": self.item_ids[idx],
            "item_data": self.item_data[idx],  
            "rating": self.ratings[idx],
        }
       

In [170]:
evaluation_dataset = EvaluationDataset(
    user_data=user_data_test,
    item_data=item_data_test,
    ratings=test_ratings,
    user_ids=test_user_ids,
    item_ids=test_item_ids
)    

In [171]:
evaluation_dataset.item_data

tensor([[ 0.2568,  0.7479, -0.2778,  ..., -0.0630,  0.0079,  0.0536],
        [ 1.8829,  0.7479, -0.6557,  ..., -0.0766, -0.0121, -0.0057],
        [ 0.9795,  0.7479, -0.6557,  ..., -0.0645, -0.1007, -0.0193],
        ...,
        [ 0.6181, -0.8336,  0.8556,  ..., -0.0486,  0.0793,  0.0202],
        [ 0.6181, -0.7348,  0.4778,  ..., -0.0486,  0.0793,  0.0202],
        [ 0.6181, -0.5700,  0.1000,  ..., -0.0486,  0.0793,  0.0202]])

In [174]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_user_ids = []
all_item_ids = []
all_ratings = []
all_preds = []

with torch.no_grad():
    for batch in evaluation_dataset:
        user_ids = batch['user_id'].to(device).unsqueeze(0) 
        user_feats = batch['user_data'].to(device).unsqueeze(0) 
        item_ids = batch['item_id'].to(device).unsqueeze(0) 
        item_feats = batch['item_data'].to(device).unsqueeze(0) 

        # predict scores
        scores = model(user_feats, item_feats)
        
        all_user_ids.extend(user_ids.cpu().numpy())
        all_item_ids.extend(item_ids.cpu().numpy())
        all_ratings.append(batch["rating"])
        all_preds.append(scores.cpu().numpy())

test_df = pd.DataFrame({
    "user_id": all_user_ids,
    "item_id": all_item_ids,
    "rating": all_ratings,
    "predicted_score": all_preds
})

In [175]:
ndcg_scores = []
num_users = 0
for user_id in test_df['user_id'].unique():
    true_relevance = test_df[test_df['user_id'] == user_id][target].tolist()
    predicted_scores = test_df[test_df['user_id'] == user_id]['predicted_score'].tolist()
    if len(predicted_scores) > 1:
        user_ndcg = ndcg_score([true_relevance], [predicted_scores], k=10)
        ndcg_scores.append(user_ndcg)
        num_users += 1

average_ndcg = np.mean(ndcg_scores)
print(f"Average NDCG@10 across all users with mora than 1 rating ({num_users}, {num_users/test_df.user_id.unique().shape[0]}% of the test set): {average_ndcg:.4f}")


Average NDCG@10 across all users with mora than 1 rating (381, 0.625615763546798% of the test set): 0.9785


In [176]:
def precision_recall_at_k(group, k=10):
    # sort by predicted_score descending
    group_sorted = group.sort_values("predicted_score", ascending=False)
    
    top_k = group_sorted.head(k)
    
    # number of relevant items in the top K
    relevant_in_top_k = top_k["relevant"].sum()
    
    # total relevant items for this user
    total_relevant = group["relevant"].sum()
    
    precision_k = relevant_in_top_k / k
    recall_k = relevant_in_top_k / total_relevant if total_relevant > 0 else 0.0
    
    return pd.Series({
        "precision@{}".format(k): precision_k,
        "recall@{}".format(k): recall_k
    })

def compute_precision_recall_at_k(df, user_col="user_id", k=10):
    """ 
    Compute mean precision@K and recall@K across all users. 
    """
    metrics_df = (
        df
        .groupby(user_col)
        [df.columns]
        .apply(lambda g: precision_recall_at_k(g, k))
    )

    return metrics_df.mean().to_dict()


test_df["relevant"] = (test_df["rating"] >= 4).astype(int)

test_df_sorted = (
    test_df
    .groupby("user_id", group_keys=False)
    [['user_id', 'relevant', 'predicted_score']] 
    .apply(lambda df: df.sort_values("predicted_score", ascending=False))
)

metrics_k10 = compute_precision_recall_at_k(test_df_sorted, user_col="user_id", k=10)
print(metrics_k10)

{'precision@10': 0.23415435139573013, 'recall@10': 0.8499832743629435}
