In [3]:
# Import packages
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import csv
import ast
import random
from sklearn.preprocessing import MultiLabelBinarizer
from torch_geometric.data import HeteroData
from torch_geometric.nn import HANConv
import torch
import torch.nn as nn



# Generate PyG graph

In [21]:
genome_scores = pd.read_csv('full_data/genome_scores_processed.csv')
movies = pd.read_csv('full_data/movies_warm.csv')
ratings_train = pd.read_csv('full_data/ratings_train.csv')
ratings_test = pd.read_csv('full_data/ratings_test.csv')
ratings_val = pd.read_csv('full_data/ratings_val.csv')

Extract users, movies and tags that are going to serve as our nodes. We need all of the users (i.e., from train, test and val) because HAN needs to generate embeddings for them.

In [22]:
user_ids = np.unique(np.concatenate([
    ratings_train['userId'].unique(),
    ratings_val['userId'].unique(),
    ratings_test['userId'].unique()
]))
movie_ids = movies['movieId'].unique()
unique_genres = {'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}
tags = genome_scores['tagId'].unique()

Now, add all of the edges and their inverses (inverses are needed for metapaths to be intelligible). Recall that in original form, HAN does not handle weighted edges. So, we need a way to turn genome score relation into a binary relation (`has tag / does not have tag'). We therefore need some relevance-threshold for cutting off when a movie has a tag vs when it doesn't. Due to the binary treatment, I imagine the threshold has to be fairly high - especially since the genome scoring seems to associate movies with tags very tightly (every movie has at least 1 tag with >0.99 relevance)


Same for the ratings. There will be an edge if rating is `positive' in the csv (>= 4.0)

In [23]:
relevance_threshold = 0.5
genome_scores_filtered = genome_scores[genome_scores['relevance'] > relevance_threshold]
genome_scores_filtered = genome_scores_filtered[genome_scores_filtered['movieId'].isin(movie_ids)]

For the PyG graph, we will need to assign indeces to nodes in each node type

In [24]:
movie2id = {movie_id: idx for idx, movie_id in enumerate(movie_ids)}
N_nodes = len(movie_ids)
user2id = {user_id: idx for idx, user_id in enumerate(user_ids)}
N_nodes += len(user_ids)
tag2id = {tag_id: idx for idx, tag_id in enumerate(tags)}
N_nodes += len(tags)

Add relevant edges and their inverses:

In [25]:
# Movie-Tag (and Tag-Movie)
genome_scores_filtered['movie_idx'] = genome_scores_filtered['movieId'].map(movie2id)
genome_scores_filtered['tag_idx'] = genome_scores_filtered['tagId'].map(tag2id)

movie_tag_edges_list = list(zip(genome_scores_filtered['movie_idx'], genome_scores_filtered['tag_idx']))
movie_tag_inv_edges_list = list(zip(genome_scores_filtered['tag_idx'], genome_scores_filtered['movie_idx']))

# User-Movie (and Movie-User)
ratings_train_filtered = ratings_train[ratings_train['positive_rating'] == 1]

ratings_train_filtered['user_idx'] = ratings_train_filtered['userId'].map(user2id)
ratings_train_filtered['movie_idx'] = ratings_train_filtered['movieId'].map(movie2id)

user_movie_edges_list = list(zip(ratings_train_filtered['user_idx'], ratings_train_filtered['movie_idx']))

user_movie_inv_edges_list = list(zip(ratings_train_filtered['movie_idx'], ratings_train_filtered['user_idx']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_train_filtered['user_idx'] = ratings_train_filtered['userId'].map(user2id)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_train_filtered['movie_idx'] = ratings_train_filtered['movieId'].map(movie2id)


Read our nodes and lists of edges into the PyG graph:

In [26]:
data = HeteroData()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

data['user'].num_nodes = len(user_ids)
data['movie'].num_nodes = len(movie_ids)
# data['genre'].num_nodes = len(unique_genres)
data['tag'].num_nodes = len(tags)

# Movie—Tag-Movie
src, dst = zip(*movie_tag_edges_list)  # lists of (movie_idx, genre_idx)
data['movie', 'hasTag', 'tag'].edge_index = torch.tensor([src, dst])
data['tag', 'inv_hasTag', 'movie'].edge_index = torch.tensor([dst, src])

# User-Movie-User
src, dst = zip(*user_movie_edges_list)
data['user', 'Likes', 'movie'].edge_index = torch.tensor([src, dst])
data['movie', 'inv_Likes', 'user'].edge_index = torch.tensor([dst, src])

data = data.to(device)

# HAN MODEL

Embedding dimension is 64, as in the paper. 

NOTE! : If you want to use features in movie node embeddings, skip the cell below, uncomment and run the cell after that.

In [27]:
# hidden_dim = 64  

# user_emb  = nn.Embedding(data['user'].num_nodes,  hidden_dim)
# movie_emb = nn.Embedding(data['movie'].num_nodes, hidden_dim)
# tag_emb   = nn.Embedding(data['tag'].num_nodes,   hidden_dim)

# metadata = data.metadata()  

# conv = HANConv(
#     in_channels  = {
#       'user':  hidden_dim,
#       'movie': hidden_dim,
#       'tag':   hidden_dim,
#     },
#     out_channels = hidden_dim,
#     metadata     = metadata,
#     heads        = 4,       # number of attention heads
#     dropout      = 0.5,     # attention‐head dropout
# )

# def forward_han(data):
#     x_dict = {
#         'user':  user_emb(torch.arange(data['user'].num_nodes,  device=device)),
#         'movie': movie_emb(torch.arange(data['movie'].num_nodes, device=device)),
#         'tag':   tag_emb(torch.arange(data['tag'].num_nodes,   device=device)),
#     }

#     x_dict = conv(x_dict, data.edge_index_dict)
#     return x_dict

# x_dict = forward_han(data)


In [10]:
genre_lists = movies['genre_list'].apply(lambda s: ast.literal_eval(s)).tolist()
mlb = MultiLabelBinarizer()
genre_features = mlb.fit_transform(genre_lists)
movie_features_tensor = torch.tensor(genre_features, dtype=torch.float32, device=device)
movie_features_tensor = movie_features_tensor.to(device)

hidden_dim = 64  

user_emb  = nn.Embedding(data['user'].num_nodes,  hidden_dim)
movie_emb = nn.Embedding(data['movie'].num_nodes, hidden_dim)
input_dim_movie = movie_features_tensor.shape[1] + hidden_dim
tag_emb   = nn.Embedding(data['tag'].num_nodes,   hidden_dim)

metadata = data.metadata()  

conv = HANConv(
    in_channels  = {
      'user':  hidden_dim,
      'movie': input_dim_movie,
      'tag':   hidden_dim,
    },
    out_channels = hidden_dim,
    metadata     = metadata,
    heads        = 4,       # number of attention heads
    dropout      = 0.5,     # attention‐head dropout
)

def forward_han(data):
    learned_movie_emb = movie_emb(torch.arange(data['movie'].num_nodes, device=device))
    movie_x = torch.cat([movie_features_tensor, learned_movie_emb], dim=1)
    
    x_dict = {
        'user':  user_emb(torch.arange(data['user'].num_nodes,  device=device)),
        'movie': movie_x,
        'tag':   tag_emb(torch.arange(data['tag'].num_nodes, device=device)),
    }

    x_dict = conv(x_dict, data.edge_index_dict)
    return x_dict

x_dict = forward_han(data)

# TRAIN

I use 30 as negative sample size in train, validation and test. 5 is used in the paper - but they consider link prediction, not recommendation. It seems that the model has to be more discriminatory in the recommendation task, so there is reason for higher negative sample size.

In [28]:
num_epochs = 300
validation_frequency = 5
patience = 5
best_val_loss = float('inf')
patience_counter = 0
num_negatives = 10

We will employ early stopping in our training. Once every several epochs, we will compute validation loss of the current model. Let us define the function that will compute this loss in advance.

We can only use those users and movies from validation that HAN has seen from train because those are the only ones we have embeddings for.

In [29]:
seen_users = set(ratings_train_filtered['userId'].unique())
seen_movies = set(ratings_train_filtered['movieId'].unique())

# filter validation data to only include seen users
ratings_val_filtered = ratings_val[
    (ratings_val['positive_rating'] == 1) &
    (ratings_val['userId'].isin(seen_users)) &
    (ratings_val['movieId'].isin(seen_movies))
]

# Map to indices
ratings_val_filtered['user_idx'] = ratings_val_filtered['userId'].map(user2id)
ratings_val_filtered['movie_idx'] = ratings_val_filtered['movieId'].map(movie2id)

# Create edge list
user_movie_val_edges_list = list(zip(ratings_val_filtered['user_idx'], ratings_val_filtered['movie_idx']))

val_user, val_movie = zip(*user_movie_val_edges_list)

val_user = torch.tensor(val_user, device=device)
val_movie = torch.tensor(val_movie, device=device)

def compute_validation_loss(data, val_user, val_movie, num_neg_samples=1):
    conv.eval()  # evaluation mode

    with torch.no_grad():
        # Get current node embeddings from the model
        x_dict = forward_han(data)

        # Positive scores (actual interactions)
        pos_scores = score_edges(val_user, val_movie, x_dict)

        # Sample negative movies (not interacted with by user)
        neg_user = val_user.repeat(num_negatives, 1).flatten()
        neg_movie = torch.randint(0, data['movie'].num_nodes, (len(neg_user),), device=device)

        neg_scores = score_edges(neg_user, neg_movie, x_dict)

        # Loss = average binary cross-entropy loss
        loss = -torch.log(pos_scores + 1e-15).mean() - torch.log(1 - neg_scores + 1e-15).mean()

    return loss.item()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_val_filtered['user_idx'] = ratings_val_filtered['userId'].map(user2id)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_val_filtered['movie_idx'] = ratings_val_filtered['movieId'].map(movie2id)


In [30]:
import torch.nn.functional as F
optimizer = torch.optim.Adam(list(conv.parameters()) +
                             list(user_emb.parameters()) +
                             list(movie_emb.parameters()), lr=0.01)

def score_edges(u_idx, m_idx, x_dict):
    hu = x_dict['user'][u_idx.long()]
    hm = x_dict['movie'][m_idx.long()]
    return torch.sigmoid((hu * hm).sum(dim=-1))
    
train_losses = []
val_losses = []

for epoch in range(1, num_epochs+1):
    conv.train()
    optimizer.zero_grad()

    # Positive samples
    pos_user, pos_movie = zip(*user_movie_edges_list)  
    pos_user = torch.tensor(pos_user, device=device)
    pos_movie = torch.tensor(pos_movie, device=device)

    # Negative sampling (random movies for each user)
    neg_user = pos_user.repeat(num_negatives, 1).flatten()  
    neg_movie = torch.randint(0, data['movie'].num_nodes, (num_negatives * len(pos_user),), device=device)

    x_dict = forward_han(data)
    pos_scores = score_edges(pos_user, pos_movie, x_dict)
    neg_scores = score_edges(neg_user, neg_movie, x_dict)

    loss = -torch.log(pos_scores + 1e-15).mean() - torch.log(1 - neg_scores + 1e-15).mean()
    loss.backward()
    optimizer.step()

    train_losses.append(loss.item())
    
       # validate
    if epoch % validation_frequency == 0:
        print(f'epoch {epoch}')
        val_loss = compute_validation_loss(data, val_user, val_movie)  # This function computes validation loss
        val_losses.append(val_loss)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(conv.state_dict(), 'best_model.pt')  # Save best model
        else:
            patience_counter += 1
        
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch}")
            break

epoch 5
epoch 10
epoch 15
epoch 20
epoch 25
epoch 30
epoch 35
epoch 40
epoch 45
epoch 50
epoch 55
epoch 60
epoch 65
epoch 70
epoch 75
epoch 80
epoch 85
epoch 90
epoch 95
epoch 100
epoch 105
epoch 110
epoch 115
epoch 120
epoch 125
epoch 130
epoch 135
epoch 140
epoch 145
epoch 150
epoch 155
epoch 160
epoch 165
epoch 170
epoch 175
epoch 180
epoch 185
epoch 190
epoch 195
epoch 200
epoch 205
epoch 210
epoch 215
epoch 220
epoch 225
epoch 230
epoch 235
epoch 240
epoch 245
Early stopping at epoch 245


# TEST

The logic with the test set is the same as with validation.

In [31]:
# filter test data to only include seen users
ratings_test_filtered = ratings_test[
    (ratings_test['positive_rating'] == 1) &
    (ratings_test['userId'].isin(seen_users)) &
    (ratings_test['movieId'].isin(seen_movies))
]

# Map to indices
ratings_test_filtered['user_idx'] = ratings_test_filtered['userId'].map(user2id)
ratings_test_filtered['movie_idx'] = ratings_test_filtered['movieId'].map(movie2id)

# Create edge list
user_movie_test_edges_list = list(zip(ratings_test_filtered['user_idx'], ratings_test_filtered['movie_idx']))

test_user, test_movie = zip(*user_movie_test_edges_list)

test_user = torch.tensor(test_user, device=device)
test_movie = torch.tensor(test_movie, device=device)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_test_filtered['user_idx'] = ratings_test_filtered['userId'].map(user2id)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_test_filtered['movie_idx'] = ratings_test_filtered['movieId'].map(movie2id)


Create evaluation function;
assume input: df user | movie | score | true label (0,1)

In [15]:
# def evaluate_ranking_tensor(data, test_user, test_movie, k=3, num_negatives=num_negatives):
#     conv.eval()
#     with torch.no_grad():
#         x_dict = forward_han(data)  # Get node embeddings

#     hits, mrr = [], []

#     for u, pos_m in zip(test_user, test_movie):
#         u = u.item()
#         pos_m = pos_m.item()

#         # Skip if user or movie out of bounds
#         if u >= data['user'].num_nodes or pos_m >= data['movie'].num_nodes:
#             continue

#         # Sample negative movies
#         neg_movies = set()
#         while len(neg_movies) < num_negatives:
#             neg = random.randint(0, data['movie'].num_nodes - 1)
#             if neg != pos_m:
#                 neg_movies.add(neg)

#         all_movies = [pos_m] + list(neg_movies)
#         all_movies_tensor = torch.tensor(all_movies, device=device)
#         user_tensor = torch.tensor([u] * len(all_movies), device=device)

#         # Get scores
#         scores = score_edges(user_tensor, all_movies_tensor, x_dict).cpu()
#         _, ranking = scores.sort(descending=True)

#         rank = (ranking == 0).nonzero(as_tuple=False).item()  # position of the positive sample

#         hits.append(1 if rank < k else 0)
#         mrr.append(1.0 / (rank + 1))

#     hits_at_k = sum(hits) / len(hits) if hits else 0.0
#     mean_mrr = sum(mrr) / len(mrr) if mrr else 0.0
#     return hits_at_k, mean_mrr

In [32]:
all_movie_idx = pd.Series(movie_ids).map(movie2id).to_numpy()

def build_eval_df(
    test_pos_edges=user_movie_test_edges_list,
    user_movie_edges_list=user_movie_edges_list,
    user_movie_val_edges_list=user_movie_val_edges_list,
    all_movie_ids=all_movie_idx):

    rows = []
    
    # Group the test positives by user
    from collections import defaultdict
    pos_by_user = defaultdict(list)
    for u, m in test_pos_edges:
        pos_by_user[u].append(m)
    
    for user_id, pos_movies in pos_by_user.items():
        # Precompute forbidden set for this user
        forbidden = set(pos_movies) \
                    | {m for (u, m) in user_movie_edges_list    if u == user_id} \
                    | {m for (u, m) in user_movie_val_edges_list if u == user_id}
        
        for pos_m in pos_movies:
            # 1 positive
            batch_movies = [pos_m]
            labels       = [1]
            
            # 9 negatives sampled uniformly
            neg_samples = []
            while len(neg_samples) < 9:
                m = all_movie_ids[torch.randint(len(all_movie_ids), (1,)).item()]
                if m not in forbidden:
                    neg_samples.append(m)
                    forbidden.add(m)  # avoid duplicates
            batch_movies += neg_samples
            labels       += [0] * len(neg_samples)
            
            # Score the batch of 10 edges
            u_tensor  = torch.tensor([user_id] * len(batch_movies), device=device)
            m_tensor  = torch.tensor(batch_movies,      device=device)
            with torch.no_grad():
                scores = score_edges(u_tensor, m_tensor, x_dict).cpu().numpy()
            
            # Accumulate rows
            for m, s, gt in zip(batch_movies, scores, labels):
                rows.append({
                    'user': user_id,
                    'movie': m,
                    'edge_score': float(s),
                    'ground_truth': gt
                })
    
    # Build DataFrame
    df = pd.DataFrame(rows, columns=['user','movie','edge_score','ground_truth'])
    return df


In [33]:
df_for_recs = build_eval_df()

We will use 3 scorings we mentioned in the discussion and 9 negative sampled edges for each positive edge in the test set.

In [34]:
def evaluate_recommendations(df, ks):
    """
    Expects args:
        df: pandas df with columns ['user', 'movie', 'edge_score', 'ground_truth']
        ks (list of ints): Numbers of top recommendations to consider

    Returns NDCG@k, Recall@k, MRR@k for the given k
    """
    # Group by user
    users = df['user'].unique()
    for k in ks:
        ndcg_list = []
        recall_list = []
        rr_list = []
    
        for user in users:
            user_df = df[df['user'] == user]
            # Sort by predicted score
            ranked = user_df.sort_values('edge_score', ascending=False)
            # Top k predictions
            topk = ranked.head(k)
            # Ground truth relevance values
            rel = topk['ground_truth'].values
            # Compute DCG@k
            gains = (2**rel - 1)
            discounts = np.log2(np.arange(2, k + 2))
            dcg = np.sum(gains / discounts)
    
            # Compute IDCG@k (ideal ranking)
            ideal_rel = np.sort(user_df['ground_truth'].values)[::-1][:k]
            ideal_gains = (2**ideal_rel - 1)
            idcg = np.sum(ideal_gains / discounts)
            ndcg = dcg / idcg if idcg > 0 else 0.0
            ndcg_list.append(ndcg)
    
            # Recall@k: number relevant in topk / total relevant
            total_rel = user_df['ground_truth'].sum()
            recall = rel.sum() / total_rel if total_rel > 0 else 0.0
            recall_list.append(recall)
    
            # MRR@k: reciprocal of rank of the top-ranked true edge
            rr = 0.0
            for idx, val in enumerate(rel, start=1):
                if val == 1:
                    rr = 1.0 / idx
                    break
            rr_list.append(rr)
    
        dict_of_results = {
            f'NDCG@{k}': np.mean(ndcg_list),
            f'Recall@{k}': np.mean(recall_list),
            f'MRR@{k}': np.mean(rr_list)
        }
        # Aggregate metrics
        print(f'METRICS FOR {k}:\n{dict_of_results}')

WITH EMBEDDINGS USING GENRES AS FEATURES:

In [20]:
evaluate_recommendations(df=df_for_recs, ks=[3,5,10])

METRICS FOR 3:
{'NDCG@3': 0.6188359648726435, 'Recall@3': 0.24589585919053558, 'MRR@3': 0.7460622678013983}
METRICS FOR 5:
{'NDCG@5': 0.6134050517976642, 'Recall@5': 0.37008681073168465, 'MRR@5': 0.7596879488183835}
METRICS FOR 10:
{'NDCG@10': 0.6301885242251088, 'Recall@10': 0.5555697502992681, 'MRR@10': 0.7646190233146754}


WITH EMBEDDINGS *NOT* USING GENRES AS FEATURES:

In [35]:
evaluate_recommendations(df=df_for_recs, ks=[3,5,10])

METRICS FOR 3:
{'NDCG@3': 0.5755917869833315, 'Recall@3': 0.2290656745774257, 'MRR@3': 0.7119372336763641}
METRICS FOR 5:
{'NDCG@5': 0.5711759024683697, 'Recall@5': 0.3467601615068751, 'MRR@5': 0.7276317819796082}
METRICS FOR 10:
{'NDCG@10': 0.58982281741011, 'Recall@10': 0.5233506631763168, 'MRR@10': 0.7337179924136445}
