In [65]:
!pip install node2vec -q
!pip install networkx



In [66]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import csv
import ast
import random
import networkx as nx
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [67]:
os.chdir(r"C:\Users\maxmo\Dropbox\GDS\graph_data_analytics\2025-wt-a2-what-to-watch")
#os.chdir(r"/efs/users/readm2/graph_data/")

genome_scores = pd.read_csv(r'processed_data\genome_scores_processed_small.csv')
movies_processed = pd.read_csv(r'processed_data\movies_processed.csv')
ratings_train = pd.read_csv(r'processed_data\ratings_train_small.csv')
movies_test_cold_start = pd.read_csv(r'processed_data\movies_test_cold_start.csv')
ratings_test_cold_start = pd.read_csv(r'processed_data\ratings_test_cold_start.csv')
ratings_test = pd.read_csv(r'processed_data\ratings_test_small.csv')
ratings_val_cold_start = pd.read_csv(r'processed_data\ratings_val_cold_start.csv')
ratings_val = pd.read_csv(r'processed_data\ratings_val_small.csv')

In [68]:
user_ids = ratings_train['userId'].unique()
user_ids = np.append(user_ids, ratings_test['userId'].unique())
user_ids = np.append(user_ids, ratings_val['userId'].unique())
movie_ids = movies_processed['movieId'].unique()
# unique_genres = {'Action',
#  'Adventure',
#  'Animation',
#  'Children',
#  'Comedy',
#  'Crime',
#  'Documentary',
#  'Drama',
#  'Fantasy',
#  'Film-Noir',
#  'Horror',
#  'IMAX',
#  'Musical',
#  'Mystery',
#  'Romance',
#  'Sci-Fi',
#  'Thriller',
#  'War',
#  'Western'}
tags = genome_scores['tagId'].unique()

In [69]:
movie2id = {movie_id: idx for idx, movie_id in enumerate(movie_ids)}
N_nodes = len(movie_ids)
user2id = {user_id: idx for idx, user_id in enumerate(user_ids)}
N_nodes += len(user_ids)
tag2id = {tag_id: idx for idx, tag_id in enumerate(tags)}
N_nodes += len(tags)

In [70]:
film_graph = nx.Graph()

# Add movie, user, and tag nodes
film_graph.add_nodes_from(movie2id.values(), type='movie')
film_graph.add_nodes_from(user2id.values(), type='user')
film_graph.add_nodes_from(tag2id.values(), type='tag')

# Example: Add edges between users and movies
for _, row in ratings_train.iterrows():
    user_node = user2id[row['userId']]
    movie_node = movie2id[row['movieId']]
    film_graph.add_edge(user_node, movie_node, value=row['rating'])

In [71]:
from node2vec import Node2Vec

model = Node2Vec(
    film_graph,
    quiet=True,
    weight_key="value",
    dimensions=64,
    walk_length=10,
    num_walks=40,
    p=1,
    q=1, 
)

In [72]:
model = model.fit(window=2, sg=1, min_count=0, epochs=1, workers=4) 
node_embeddings = model.wv.vectors

Code below saves node embeddings - not essential to running


In [73]:
node_embeddings_df = pd.DataFrame(node_embeddings)
node_embeddings_df['node_id'] = list(model.wv.index_to_key)  # Add node IDs as a column
#node_embeddings_df.to_csv('node2vec_embeddings.csv', index=False)

In [74]:
print(f"Graph has {film_graph.number_of_nodes()} nodes and {film_graph.number_of_edges()} edges.")

print(f"Node embeddings shape: {node_embeddings.shape}")
print(f"Embedding of the first node: {node_embeddings[0]}")
nodes_with_embeddings = len(model.wv)
print(f"Number of nodes with embeddings: {nodes_with_embeddings}")
print(f"Percentage of nodes with embeddings: {nodes_with_embeddings / film_graph.number_of_nodes() * 100:.2f}%")

Graph has 13183 nodes and 92477 edges.
Node embeddings shape: (13183, 64)
Embedding of the first node: [ 0.25433433 -0.30022815  0.8645895   0.31263313 -0.65241337 -1.0651687
  0.00119982  0.2600192  -0.57888275 -0.33538854  0.83796704 -0.22136769
 -0.25338152 -0.48594394 -0.08765536  0.3872368  -0.0251541  -0.07511009
  0.393928    0.6824668   0.84505457  0.56635803  0.9122253  -0.02319184
  0.12156586  0.2866141  -0.96687084  0.195786    0.11445902 -0.33671665
 -0.44628543  0.28666538 -0.93275815 -0.41120598 -0.17489849 -0.3704819
  0.41983953  0.10155404  0.66610134  0.08053827  0.3112603   0.03726854
 -0.3255775  -0.88306993  0.5605892  -0.4197607  -0.00270069  0.14203608
  0.16408764  0.35093817 -0.07280838  0.0656449   0.54669076  0.6924971
  0.6865796   0.25357917  0.47973835  0.04599128 -0.41188723  0.23745078
  0.02832458 -0.73771775 -0.14547452 -0.2693728 ]
Number of nodes with embeddings: 13183
Percentage of nodes with embeddings: 100.00%


test set


In [75]:
from sklearn.metrics import mean_squared_error

# Extract embeddings for users and movies
user_embeddings = {user_id: model.wv[user2id[user_id]] for user_id in ratings_test['userId'].unique() if user2id[user_id] in model.wv}
movie_embeddings = {movie_id: model.wv[movie2id[movie_id]] for movie_id in ratings_test['movieId'].unique() if movie2id[movie_id] in model.wv}

# Predict ratings based on cosine similarity
predicted_ratings = []
actual_ratings = []

for _, row in ratings_test.iterrows():
    user_id = row['userId']
    movie_id = row['movieId']
    if user_id in user_embeddings and movie_id in movie_embeddings:
        user_emb = user_embeddings[user_id]
        movie_emb = movie_embeddings[movie_id]
        similarity = np.dot(user_emb, movie_emb) / (np.linalg.norm(user_emb) * np.linalg.norm(movie_emb))
        predicted_ratings.append(similarity)
        actual_ratings.append(row['rating'])

# Scale predicted ratings to match the rating scale (e.g., 0.5 to 5.0)
predicted_ratings = np.interp(predicted_ratings, (min(predicted_ratings), max(predicted_ratings)), (0.5, 5.0))

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))
print(f"RMSE: {rmse}")

RMSE: 2.5835654455050556


In [76]:
def evaluate_ranking_tensor(user_embeddings, movie_embeddings, test_user, test_movie, k=3, num_negatives=30):
    hits, mrr = [], []

    for u, pos_m in zip(test_user, test_movie):
        # Skip if user or movie out of bounds
        if u not in user_embeddings or pos_m not in movie_embeddings:
            continue

        # Sample negative movies
        neg_movies = set()
        while len(neg_movies) < num_negatives:
            neg = random.choice(list(movie_embeddings.keys()))
            if neg != pos_m:
                neg_movies.add(neg)

        all_movies = [pos_m] + list(neg_movies)
        scores = [
            np.dot(user_embeddings[u], movie_embeddings[m]) / (np.linalg.norm(user_embeddings[u]) * np.linalg.norm(movie_embeddings[m]))
            for m in all_movies
        ]

        # Sort scores and find rank of positive movie
        ranking = np.argsort(scores)[::-1]
        rank = np.where(ranking == 0)[0][0]  # position of the positive sample

        hits.append(1 if rank < k else 0)
        mrr.append(1.0 / (rank + 1))

    hits_at_k = sum(hits) / len(hits) if hits else 0.0
    mean_mrr = sum(mrr) / len(mrr) if mrr else 0.0
    return hits_at_k, mean_mrr

In [92]:
evaluate_ranking_tensor(user_embeddings, movie_embeddings, ratings_test['userId'].values, ratings_test['movieId'].values, k=3, num_negatives=30)

(0.09997932887755805, 0.13161921264669313)

In [83]:
seen_users = set(ratings_train['userId'].unique())
seen_movies = set(ratings_train['movieId'].unique())
ratings_test_filtered = ratings_test[
    (ratings_test['userId'].isin(seen_users)) &
    (ratings_test['movieId'].isin(seen_movies))
]

ratings_test_filtered['user_idx'] = ratings_test_filtered['userId'].map(user2id)
ratings_test_filtered['movie_idx'] = ratings_test_filtered['movieId'].map(movie2id)

user_movie_test_edges_list = list(zip(
    ratings_test_filtered['userId'].map(user2id),
    ratings_test_filtered['movieId'].map(movie2id)
))

user_movie_edges_list = list(zip(
    ratings_train['userId'].map(user2id),
    ratings_train['movieId'].map(movie2id)
))

user_movie_val_edges_list = list(zip(
    ratings_val['userId'].map(user2id),
    ratings_val['movieId'].map(movie2id)
))
user_embeddings = {user_id: model.wv[user2id[user_id]] for user_id in ratings_test_filtered['userId'].unique() if user2id[user_id] in model.wv}
movie_embeddings = {movie_id: model.wv[movie2id[movie_id]] for movie_id in ratings_test_filtered['movieId'].unique() if movie2id[movie_id] in model.wv}

In [None]:
all_movie_idx = pd.Series(movie_ids).map(movie2id).to_numpy()
import torch

def score_edges(user_embeddings, movie_embeddings, user_ids, movie_ids):
    """
    Scores edges for user-movie pairs based on cosine similarity.

    Args:
        user_embeddings (dict): Dictionary mapping user IDs to their embeddings.
        movie_embeddings (dict): Dictionary mapping movie IDs to their embeddings.
        user_ids (list): List of user IDs.
        movie_ids (list): List of movie IDs.

    Returns:
        list: List of scores for the given user-movie pairs.
    """
    scores = []
    for user_id, movie_id in zip(user_ids, movie_ids):
        if user_id in user_embeddings and movie_id in movie_embeddings:
            user_emb = user_embeddings[user_id]
            movie_emb = movie_embeddings[movie_id]
            similarity = np.dot(user_emb, movie_emb) / (np.linalg.norm(user_emb) * np.linalg.norm(movie_emb))
            scores.append(similarity)
        else:
            scores.append(None)  # Handle missing embeddings
    return scores

def build_eval_df(test_pos_edges=user_movie_test_edges_list,
    user_movie_edges_list=user_movie_edges_list,
    user_movie_val_edges_list=user_movie_val_edges_list,
    all_movie_ids=all_movie_idx):

    rows = []
    
    # Group the test positives by user
    from collections import defaultdict
    pos_by_user = defaultdict(list)
    for u, m in test_pos_edges:
        pos_by_user[u].append(m)
    
    for user_id, pos_movies in pos_by_user.items():
        # Precompute forbidden set for this user
        forbidden = set(pos_movies) \
                    | {m for (u, m) in user_movie_edges_list    if u == user_id} \
                    | {m for (u, m) in user_movie_val_edges_list if u == user_id}
        
        for pos_m in pos_movies:
            # 1 positive
            batch_movies = [pos_m]
            labels       = [1]
            
            # 9 negatives sampled uniformly
            neg_samples = []
            while len(neg_samples) < 9:
                m = all_movie_ids[torch.randint(len(all_movie_ids), (1,)).item()]
                if m not in forbidden:
                    neg_samples.append(m)
                    forbidden.add(m)  # avoid duplicates
            batch_movies += neg_samples
            labels       += [0] * len(neg_samples)
            
            # Score the batch of 10 edges
            u_tensor  = torch.tensor([user_id] * len(batch_movies), device=device)
            m_tensor  = torch.tensor(batch_movies,      device=device)
            with torch.no_grad():
                scores = score_edges(user_embeddings, movie_embeddings, u_tensor.cpu().numpy(), m_tensor.cpu().numpy())
            
            invalid_count = 0
            # Accumulate rows
            for m, s, gt in zip(batch_movies, scores, labels):
                if s is not None:
                    rows.append({
                        'user': user_id,
                        'movie': m,
                        'edge_score': float(s),
                        'ground_truth': gt
                    })
                else:
                    invalid_count += 1
    
    # Build DataFrame
    df = pd.DataFrame(rows, columns=['user','movie','edge_score','ground_truth'])
    return df

def evaluate_recommendations(df, ks):
    """
    Expects args:
        df: pandas df with columns ['user', 'movie', 'edge_score', 'ground_truth']
        ks (list of ints): Numbers of top recommendations to consider

    Returns NDCG@k, Recall@k, MRR@k for the given k
    """
    # Group by user
    users = df['user'].unique()
    for k in ks:
        ndcg_list = []
        recall_list = []
        rr_list = []
    
        for user in users:
            user_df = df[df['user'] == user]
            # Sort by predicted score
            ranked = user_df.sort_values('edge_score', ascending=False)
            # Top k predictions
            topk = ranked.head(k)
            # Ground truth relevance values
            rel = topk['ground_truth'].values
            # Compute DCG@k
            gains = (2**rel - 1)
            discounts = np.log2(np.arange(2, k + 2))
            dcg = np.sum(gains / discounts)
    
            # Compute IDCG@k (ideal ranking)
            ideal_rel = np.sort(user_df['ground_truth'].values)[::-1][:k]
            ideal_gains = (2**ideal_rel - 1)
            idcg = np.sum(ideal_gains / discounts)
            ndcg = dcg / idcg if idcg > 0 else 0.0
            ndcg_list.append(ndcg)
    
            # Recall@k: number relevant in topk / total relevant
            total_rel = user_df['ground_truth'].sum()
            recall = rel.sum() / total_rel if total_rel > 0 else 0.0
            recall_list.append(recall)
    
            # MRR@k: reciprocal of rank of the top-ranked true edge
            rr = 0.0
            for idx, val in enumerate(rel, start=1):
                if val == 1:
                    rr = 1.0 / idx
                    break
            rr_list.append(rr)
    
        dict_of_results = {
            f'NDCG@{k}': np.mean(ndcg_list),
            f'Recall@{k}': np.mean(recall_list),
            f'MRR@{k}': np.mean(rr_list)
        }
        # Aggregate metrics
        print(f'METRICS FOR {k}:\n{dict_of_results}')

In [85]:
df_for_recs = build_eval_df(
        test_pos_edges=user_movie_test_edges_list,
        user_movie_edges_list=user_movie_edges_list,
        user_movie_val_edges_list=user_movie_val_edges_list,
        all_movie_ids=all_movie_idx)

user_movie_counts = df_for_recs.groupby('user')['movie'].count()
high_k_users = user_movie_counts[user_movie_counts > 9].index
df_high_k = df_for_recs[df_for_recs['user'].isin(high_k_users)]
evaluate_recommendations(df=df_high_k, ks=[3,5,10])

METRICS FOR 3:
{'NDCG@3': 0.125, 'Recall@3': 0.125, 'MRR@3': 0.125}
METRICS FOR 5:
{'NDCG@5': 0.22719117066349184, 'Recall@5': 0.375, 'MRR@5': 0.18125}
METRICS FOR 10:
{'NDCG@10': 0.22719117066349184, 'Recall@10': 0.375, 'MRR@10': 0.18125}


In [80]:
print(df_for_recs.describe())
print(df_for_recs.info())
print(df_for_recs.head())

              user        movie  edge_score  ground_truth
count   245.000000   245.000000  245.000000    245.000000
mean   6989.346939  2977.857143    0.021243      0.065306
std     760.887870  2477.376834    0.115881      0.247571
min    5543.000000     6.000000   -0.369549      0.000000
25%    6233.000000  1101.000000   -0.048966      0.000000
50%    7038.000000  2087.000000    0.024426      0.000000
75%    7666.000000  4776.000000    0.096923      0.000000
max    8159.000000  8965.000000    0.275865      1.000000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 245 entries, 0 to 244
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   user          245 non-null    int64  
 1   movie         245 non-null    int64  
 2   edge_score    245 non-null    float64
 3   ground_truth  245 non-null    int64  
dtypes: float64(1), int64(3)
memory usage: 7.8 KB
None
   user  movie  edge_score  ground_truth
0  5543   4148    0.

In [None]:

# Diagnostics for the output DataFrame
print("\nDiagnostics for df_for_recs:")
print(f"Type: {type(df_for_recs)}")
print(f"Shape: {df_for_recs.shape}")
print(f"Columns: {df_for_recs.columns}")
print(f"Sample rows:\n{df_for_recs.head()}")
print(f"Summary statistics:\n{df_for_recs.describe()}")


Diagnostics for df_for_recs:
Type: <class 'pandas.core.frame.DataFrame'>
Shape: (281, 4)
Columns: Index(['user', 'movie', 'edge_score', 'ground_truth'], dtype='object')
Sample rows:
   user  movie  edge_score  ground_truth
0  5543   1584    0.491174             0
1  5543    745   -0.335031             0
2  5543    153    0.330176             0
3  5543   3481    0.249001             0
4  5583   2761    0.111429             0
Summary statistics:
              user        movie  edge_score  ground_truth
count   281.000000   281.000000  281.000000     281.00000
mean   7005.544484  3281.298932   -0.002311       0.05694
std     732.296368  2478.298637    0.355578       0.23214
min    5543.000000    16.000000   -0.880942       0.00000
25%    6486.000000  1250.000000   -0.268225       0.00000
50%    7060.000000  2541.000000   -0.012683       0.00000
75%    7641.000000  4993.000000    0.283381       0.00000
max    8159.000000  8984.000000    0.800210       1.00000


In [86]:
print(len(seen_movies), len(seen_users))

1259 2727


In [88]:
print(f"Number of users with embeddings: {len(user_embeddings)}")
print(f"Number of movies with embeddings: {len(movie_embeddings)}")
print(f"Number of test edges: {len(user_movie_test_edges_list)}")
print(f"Number of connected components: {nx.number_connected_components(film_graph)}")
print(f"Number of rows in df_for_recs: {len(df_for_recs)}")

Number of users with embeddings: 2727
Number of movies with embeddings: 1258
Number of test edges: 29026
Number of connected components: 9375
Number of rows in df_for_recs: 286


In [None]:
print(len(user_movie_test_edges_list))
print(len(user_movie_edges_list))
print(len(user_movie_val_edges_list))
print(len(all_movie_idx))


29026
92489
31620
13183


NameError: name 'rows' is not defined