In [None]:
!pip install node2vec -q
!pip install networkx

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import csv
import ast
import random
import networkx as nx

In [None]:
# Switch the working directory to the graph-data root. Every relative path
# below is resolved against it, and so is any relative path written later.
graph_data_root = r"/efs/users/readm2/graph_data/"
os.chdir(graph_data_root)

# Movie and tag-genome tables.
movies_processed = pd.read_csv('processed_data/movies_processed.csv')
genome_scores = pd.read_csv('processed_data/genome_scores_processed.csv')
movies_test_cold_start = pd.read_csv('processed_data/movies_test_cold_start.csv')

# Rating splits: train / validation / test.
ratings_train = pd.read_csv('processed_data/ratings_train.csv')
ratings_val = pd.read_csv('processed_data/ratings_val.csv')
ratings_test = pd.read_csv('processed_data/ratings_test.csv')

# Cold-start rating splits.
ratings_val_cold_start = pd.read_csv('processed_data/ratings_val_cold_start.csv')
ratings_test_cold_start = pd.read_csv('processed_data/ratings_test_cold_start.csv')

In [None]:
# Collect every user that appears in the train, test or validation split.
# A user can appear in more than one split, so the per-split unique arrays are
# concatenated and de-duplicated again; otherwise len(user_ids) over-counts
# users and any index built from it has gaps.
# pd.unique keeps first-appearance order (train users first).
user_ids = pd.unique(
    np.concatenate(
        [
            ratings_train['userId'].unique(),
            ratings_test['userId'].unique(),
            ratings_val['userId'].unique(),
        ]
    )
)

# Movies come from the processed movie table.
movie_ids = movies_processed['movieId'].unique()

# Genre set kept for reference only; it is commented out and not used in this cell.
# unique_genres = {'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
#                  'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
#                  'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
#                  'Thriller', 'War', 'Western'}

# Tag ids present in the genome-score table.
tags = genome_scores['tagId'].unique()

In [None]:
def _index_nodes(raw_ids, offset):
    """Map each distinct raw id to a graph node id, numbering from `offset`.

    Duplicated raw ids are collapsed (first occurrence wins), so the returned
    node ids are the contiguous range offset .. offset + n_distinct - 1.
    """
    distinct_ids = dict.fromkeys(raw_ids)  # order-preserving de-duplication
    return {raw_id: offset + position for position, raw_id in enumerate(distinct_ids)}


# Movies, users and tags all become nodes of one graph, so each kind needs its
# own integer range. If every map started at 0, movie k, user k and tag k would
# be the same node: their `type` attributes would overwrite each other and
# their neighbourhoods would merge.
movie2id = _index_nodes(movie_ids, 0)                           # [0, n_movies)
user2id = _index_nodes(user_ids, len(movie2id))                 # [n_movies, n_movies + n_users)
tag2id = _index_nodes(tags, len(movie2id) + len(user2id))       # remaining range

# Total number of distinct nodes across the three kinds.
N_nodes = len(movie2id) + len(user2id) + len(tag2id)

In [None]:
film_graph = nx.Graph()

# Register movie, user and tag nodes, tagging each with its kind.
# add_nodes_from overwrites the `type` attribute when a key is added again, so
# the three id maps are expected to use non-overlapping node ids.
film_graph.add_nodes_from(movie2id.values(), type='movie')
film_graph.add_nodes_from(user2id.values(), type='user')
film_graph.add_nodes_from(tag2id.values(), type='tag')

# One user-movie edge per training rating; the rating is stored in the edge
# attribute `value`. Zipping the columns avoids DataFrame.iterrows(), which
# builds a Series per row (slow on large tables) and upcasts the integer ids
# to float. No tag edges are created in this cell.
rating_edges = (
    (user2id[user_id], movie2id[movie_id], rating)
    for user_id, movie_id, rating in zip(
        ratings_train['userId'].to_numpy(),
        ratings_train['movieId'].to_numpy(),
        ratings_train['rating'].to_numpy(),
    )
)
film_graph.add_weighted_edges_from(rating_edges, weight='value')

In [None]:
from node2vec import Node2Vec

# Random-walk settings gathered in one mapping so they are easy to scan and tune.
walk_settings = {
    "dimensions": 8,        # embedding size
    "walk_length": 4,       # nodes per walk
    "num_walks": 4,         # walks started from each node
    "p": 1,                 # return parameter
    "q": 0.5,               # in-out parameter
    "weight_key": "value",  # edge attribute used as the walk weight
    "workers": 8,           # worker count for walk generation
    "quiet": True,          # suppress progress output
}

# Build the node2vec object over the film graph.
model = Node2Vec(film_graph, **walk_settings)

In [None]:
# Keyword arguments handed to Node2Vec.fit for the embedding training step.
skipgram_settings = dict(window=2, sg=1, min_count=0, epochs=1, workers=4)

# NOTE: this rebinds `model` from the Node2Vec object to whatever fit() returns,
# so re-run the cell that constructs Node2Vec before re-running this one.
model = model.fit(**skipgram_settings)

# Raw embedding matrix taken from the fitted model.
node_emb_node2vec_kc_homophily = model.wv.vectors

The code below saves the node embeddings to a CSV file. It is optional and not essential to running the rest of the notebook.


In [None]:
# One row per embedded node: the embedding columns first, then a `node_id`
# column holding the model's key for that row.
embedding_keys = list(model.wv.index_to_key)
node_embeddings_df = pd.DataFrame(node_emb_node2vec_kc_homophily).assign(node_id=embedding_keys)

# Persist to the current working directory, without the DataFrame index.
node_embeddings_df.to_csv('node2vec_embeddings.csv', index=False)

In [None]:
# Graph size.
graph_node_count = film_graph.number_of_nodes()
graph_edge_count = film_graph.number_of_edges()
print(f"Graph has {graph_node_count} nodes and {graph_edge_count} edges.")

# Embedding matrix overview; row 0 is the first row of the matrix.
first_embedding_row = node_emb_node2vec_kc_homophily[0]
print(f"Node embeddings shape: {node_emb_node2vec_kc_homophily.shape}")
print(f"Embedding of the first node: {first_embedding_row}")

# Share of graph nodes that received an embedding.
nodes_with_embeddings = len(model.wv)
embedding_coverage_pct = nodes_with_embeddings / graph_node_count * 100
print(f"Number of nodes with embeddings: {nodes_with_embeddings}")
print(f"Percentage of nodes with embeddings: {embedding_coverage_pct:.2f}%")

## Evaluation on the test set


In [None]:
from sklearn.metrics import mean_squared_error


def _collect_embeddings(raw_ids, raw_to_node, keyed_vectors):
    """Return {raw_id: embedding} for every raw id whose node has an embedding.

    node2vec converts node ids to strings before training, so the vocabulary
    is keyed by str(node_id). gensim treats an *integer* key as a row position,
    so indexing with the int node id would silently return the vector of a
    different node. Always look up the string form.

    Raw ids with no node id, or whose node is missing from the vocabulary,
    are skipped.
    """
    found = {}
    for raw_id in raw_ids:
        node_id = raw_to_node.get(raw_id)
        if node_id is None:
            continue
        node_key = str(node_id)
        if node_key in keyed_vectors:
            found[raw_id] = keyed_vectors[node_key]
    return found


# Extract embeddings for the users and movies that occur in the test split.
user_embeddings = _collect_embeddings(ratings_test['userId'].unique(), user2id, model.wv)
movie_embeddings = _collect_embeddings(ratings_test['movieId'].unique(), movie2id, model.wv)

# Predict ratings from the cosine similarity of the user and movie embeddings.
predicted_ratings = []
actual_ratings = []

test_triples = zip(
    ratings_test['userId'].to_numpy(),
    ratings_test['movieId'].to_numpy(),
    ratings_test['rating'].to_numpy(),
)
for user_id, movie_id, rating in test_triples:
    user_emb = user_embeddings.get(user_id)
    movie_emb = movie_embeddings.get(movie_id)
    if user_emb is None or movie_emb is None:
        continue  # no embedding for one side: the pair cannot be scored
    norm_product = np.linalg.norm(user_emb) * np.linalg.norm(movie_emb)
    if norm_product == 0:
        continue  # cosine similarity is undefined for a zero vector
    predicted_ratings.append(np.dot(user_emb, movie_emb) / norm_product)
    actual_ratings.append(rating)

if not predicted_ratings:
    raise ValueError(
        "No test rating has both a user embedding and a movie embedding; cannot compute RMSE."
    )

# Rescale similarities linearly onto the rating scale (0.5 to 5.0). The scaling
# uses the min and max of the test predictions themselves, so it is relative to
# this evaluation set.
predicted_ratings = np.interp(
    predicted_ratings,
    (min(predicted_ratings), max(predicted_ratings)),
    (0.5, 5.0),
)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))
print(f"RMSE: {rmse}")