In [None]:
!pip install torch-geometric
!pip install torch-sparse -f https://data.pyg.org/whl/torch-2.5.1+cu121.html

In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import torch
from torch import nn, optim, Tensor
from torch_sparse import SparseTensor
from torch_geometric.nn.conv.gcn_conv import gcn_norm
from torch_geometric.nn.conv import MessagePassing

In [41]:
def create_bipartite_edge_tensors(df, src_index_column, dst_index_column, link_index_col, rating_threshold=1):
    """
    Parses a CSV file containing edges between users and items
    This function provides the basic data to construct a sparse matrix in COO format

    Args:
        df (pd.DataFrame): DataFrame with edge data.
        src_index_column (str): Name of the column representing user IDs.
        dst_index_column (str): Name of the column representing item IDs.
        link_index_col (str): Name of the column for interaction values (in our case the ratings).
        rating_threshold (int, optional): Value to classify edges as positive. Default is 1
        (for our case of MSE we need all the ratings, if we want to use other loss we have to change this value)

    Returns:
        edge_index (torch.Tensor): A 2 x N tensor with source and destination node IDs for N edges.
            (edge_index[0] corresponds to the row indices, edge_index[1] corresponds to the column indices)
        edge_values (torch.Tensor): A 1D tensor with interaction values corresponding to the edges (the ratings).
    """
    # Extract relevant columns
    src = df[src_index_column].values
    dst = df[dst_index_column].values
    link_vals = df[link_index_col].values

    # Apply rating threshold
    mask = link_vals >= rating_threshold

    # Filter edges based on the mask
    src_filtered = src[mask]
    dst_filtered = dst[mask]
    link_vals_filtered = link_vals[mask]

    # Create edge_index and edge_values
    edge_index = torch.tensor([src_filtered, dst_filtered], dtype=torch.long)
    edge_values = torch.tensor(link_vals_filtered, dtype=torch.long)

    return edge_index, edge_values

In [42]:
# R is the user-item interaction matrix
# Adj is the Adjacency matrix

def r_to_adj_matrix(input_edge_index, input_edge_values, num_users, num_movies):
    """
    Converts edge index and values to an adjacency matrix in COO format.

    Args:
        input_edge_index (torch.Tensor): 2 x N tensor with source and destination indices.
        input_edge_values (torch.Tensor): 1D tensor with edge weights/values.
        num_users (int): Number of user nodes.
        num_movies (int): Number of movie nodes.

    Returns:
        adj_mat_coo_indices (torch.Tensor): 2 x M tensor with COO format indices.
        adj_mat_coo_values (torch.Tensor): 1D tensor with corresponding values.
    """
    # Initialize R matrix with sparse representation
    R = torch.sparse_coo_tensor(
        indices=input_edge_index,
        values=input_edge_values,
        size=(num_users, num_movies)
    ).to_dense()

    # Transpose R to create R_transpose
    R_transpose = R.T

    # Create adjacency matrix in dense format
    adj_mat = torch.zeros((num_users + num_movies, num_users + num_movies), dtype=torch.float32)
    adj_mat[:num_users, num_users:] = R
    adj_mat[num_users:, :num_users] = R_transpose

    # Convert adjacency matrix to sparse COO format
    adj_mat_coo = adj_mat.to_sparse()
    adj_mat_coo_indices = adj_mat_coo.indices()
    adj_mat_coo_values = adj_mat_coo.values()

    return adj_mat_coo_indices, adj_mat_coo_values


def adj_to_r_matrix(input_edge_index, input_edge_values, num_users, num_movies):
    """
    Converts an adjacency matrix edge index and values to a user-item interaction matrix (R) in COO format.

    Args:
        input_edge_index (torch.Tensor): 2 x N tensor with source and destination indices.
        input_edge_values (torch.Tensor): 1D tensor with edge weights/values.
        num_users (int): Number of user nodes.
        num_movies (int): Number of movie nodes.

    Returns:
        r_matrix_indices (torch.Tensor): 2 x M tensor with COO format indices for the user-item interaction matrix.
        r_matrix_values (torch.Tensor): 1D tensor with corresponding values for the user-item interaction matrix.
    """
    # Create sparse adjacency matrix
    sparse_adj = SparseTensor(
        row=input_edge_index[0],
        col=input_edge_index[1],
        value=input_edge_values,
        sparse_sizes=(num_users + num_movies, num_users + num_movies)
    )

    # Convert to dense adjacency matrix
    adj_mat = sparse_adj.to_dense()

    # Extract the interaction matrix (R)
    interact_mat = adj_mat[:num_users, num_users:]

    # Convert interaction matrix to sparse COO format
    interact_mat_coo = interact_mat.to_sparse()
    r_matrix_indices = interact_mat_coo.indices()
    r_matrix_values = interact_mat_coo.values()

    return r_matrix_indices, r_matrix_values

In [43]:
# LightGCN model 
class LightGCN(MessagePassing):
   def __init__(self, num_users, num_items, embedding_dim=64, K=3):
       """
       Initialize a LightGCN model for collaborative filtering.

       Parameters:
           num_users: Total number of users in the system
           num_items: Total number of items in the system  
           embedding_dim: Size of the embedding vectors (default: 64)
           K: Number of message passing layers (default: 3)
       """
       super().__init__()
       self.num_users, self.num_items = num_users, num_items
       self.embedding_dim, self.K = embedding_dim, K

       # Initialize embeddings for users and items with random normal distribution
       self.users_emb = nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.embedding_dim)  
       self.items_emb = nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.embedding_dim)
       nn.init.normal_(self.users_emb.weight, std=0.1)
       nn.init.normal_(self.items_emb.weight, std=0.1)

       # Final linear layer to produce ratings
       self.out = nn.Linear(embedding_dim * 2, 1)

   def forward(self, edge_index: Tensor, edge_values: Tensor, num_users, num_movies):
       """
       Forward pass of the LightGCN model.
       
       Args:
           edge_index: Sparse adjacency matrix representing user-item interactions
           edge_values: Values/weights of the interactions
           num_users: Number of users
           num_movies: Number of movies/items
       """
       # Normalize adjacency matrix
       edge_index_norm = gcn_norm(edge_index, add_self_loops=False)

       # Initialize embeddings for layer 0
       emb_0 = torch.cat([self.users_emb.weight, self.items_emb.weight])  
       embs = [emb_0]
       emb_k = emb_0

       # Perform K iterations of message passing
       for i in range(self.K):
           emb_k = self.propagate(edge_index=edge_index_norm[0], x=emb_k, norm=edge_index_norm[1])
           embs.append(emb_k)

       # Aggregate embeddings from all layers
       embs = torch.stack(embs, dim=1)
       emb_final = torch.mean(embs, dim=1)

       # Split aggregated embeddings back into user and item embeddings
       users_emb_final, items_emb_final = torch.split(emb_final, [self.num_users, self.num_items])

       r_mat_edge_index, _ = adj_to_r_matrix(edge_index, edge_values, num_users, num_movies)
       scr, dst = r_mat_edge_index[0], r_mat_edge_index[1]
       users_emb = users_emb_final[scr]
       items_emb = items_emb_final[dst]

       # Generate final predictions
       output = torch.cat([users_emb, items_emb], dim=1)
       output = self.out(output)
       return output

   def message(self, x_j, norm):
       """
       Message passing operation for graph convolution.
       """
       return norm.view(-1, 1) * x_j

In [44]:
from collections import defaultdict
def get_recommendation_metrics(input_edge_index, input_edge_values, pred_ratings, num_movies, k=10, threshold=3.5):
    """
    Calculate various recommendation metrics including Recall, Precision, Gini Index and Coverage.

    Args:
        input_edge_index (Tensor): Edge indices
        input_edge_values (Tensor): True ratings
        pred_ratings (Tensor): Predicted ratings
        k (int): Number of top items to consider
        threshold (float): Rating threshold for relevant items

    Returns:
        tuple: Recall, Precision, Gini Index, Coverage
    """
    with torch.no_grad():
        user_item_rating_list = defaultdict(list)
        item_recommendation_count = defaultdict(int)
        total_recommendations = 0
        unique_items = set()

        # Organize predictions by user
        for i in range(len(input_edge_index[0])):
            src = input_edge_index[0][i].item()
            dst = input_edge_index[1][i].item()
            true_rating = input_edge_values[i].item()
            pred_rating = pred_ratings[i].item()
            user_item_rating_list[src].append((pred_rating, true_rating, dst))
            unique_items.add(dst)

        # Calculate recall and precision, track recommended items
        recall = dict()
        precision = dict()

        for user_id, user_ratings in user_item_rating_list.items():
            user_ratings.sort(key=lambda x: x[0], reverse=True)
            top_k_items = user_ratings[:k]

            # Count recommendations per item
            for _, _, item_id in top_k_items:
                item_recommendation_count[item_id] += 1
                total_recommendations += 1

            n_rel = sum((true_r >= threshold) for (_, true_r, _) in user_ratings)
            n_rec_k = sum((est_r >= threshold) for (est_r, _, _) in top_k_items)
            n_rel_and_rec_k = sum(((true_r >= threshold) and (est_r >= threshold))
                                for (est_r, true_r, _) in top_k_items)

            recall[user_id] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0
            precision[user_id] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Calculate Gini Index
        proportions = []
        for item in unique_items:
            count = item_recommendation_count.get(item, 0)
            proportion = count / total_recommendations if total_recommendations > 0 else 0
            proportions.append(proportion)

        proportions.sort()
        n = len(proportions)
        gini = 0
        for i, proportion in enumerate(proportions):
            gini += (2 * (i + 1) - n - 1) * proportion
        gini = gini / (n - 1) if n > 1 else 0

        # Calculate Coverage
        recommended_items = len([x for x in item_recommendation_count.values() if x > 0])
        coverage = recommended_items / num_movies if num_movies > 0 else 0

        overall_recall = sum(rec for rec in recall.values()) / len(recall)
        overall_precision = sum(prec for prec in precision.values()) / len(precision)

        return overall_recall, overall_precision, gini, coverage

# **100K Dataset**

In [45]:
rating_path = "" # insert here the path of 'ratings.csv' dataset

rating_df = pd.read_csv(rating_path, delimiter='\t', header=None)
rating_df.columns = ['userId', 'movieId', 'rating', 'timestamp']

lbl_user = preprocessing.LabelEncoder( )
lbl_movie = preprocessing.LabelEncoder( )

rating_df.userId = lbl_user.fit_transform(rating_df.userId.values)
rating_df.movieId = lbl_movie.fit_transform(rating_df.movieId.values)

num_users = len(rating_df['userId'].unique())
num_movies = len(rating_df['movieId'].unique())

In [46]:
edge_index, edge_values = create_bipartite_edge_tensors(rating_df, 'userId', 'movieId', 'rating')
print(f"{len(edge_index)} x {len(edge_index[0])}")

edge_index = torch.LongTensor(edge_index)
edge_values = torch.tensor(edge_values)

2 x 100000


  edge_values = torch.tensor(edge_values)


In [47]:
num_interactions = edge_index.shape[1]
all_indices = [i for i in range(num_interactions)]

train_indices, test_indices = train_test_split(all_indices, test_size=0.2, random_state=1)
val_indices, test_indices = train_test_split(test_indices, test_size=0.5, random_state=1)

train_edge_index = edge_index[:, train_indices]
train_edge_value = edge_values[train_indices]

val_edge_index = edge_index[:, val_indices]
val_edge_value = edge_values[val_indices]

test_edge_index = edge_index[:, test_indices]
test_edge_value = edge_values[test_indices]


# Create Adj matrix in COO format
train_edge_index, train_edge_values = r_to_adj_matrix(train_edge_index, train_edge_value, num_users, num_movies)
val_edge_index, val_edge_values = r_to_adj_matrix(val_edge_index, val_edge_value, num_users, num_movies)
test_edge_index, test_edge_values = r_to_adj_matrix(test_edge_index, test_edge_value, num_users, num_movies)

# Create R matrix in COO format (from the Adj matrix in COO format)
r_mat_train_edge_index, r_mat_train_edge_values = adj_to_r_matrix(train_edge_index, train_edge_values, num_users, num_movies)
r_mat_val_edge_index, r_mat_val_edge_values = adj_to_r_matrix(val_edge_index, val_edge_values, num_users, num_movies)
r_mat_test_edge_index, r_mat_test_edge_values = adj_to_r_matrix(test_edge_index, test_edge_values, num_users, num_movies) 

In [53]:
model = LightGCN(num_users, num_movies)

ITERATIONS = 25000
ITERS_PER_EVAL = 200
ITERS_PER_LR_DECAY = 200
N_ELEMENTS_REC = 10

model.train()
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=0.01)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)
loss_function = nn.MSELoss()

In [54]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
model = model.to(device)

train_edge_index = train_edge_index.to(device)
train_edge_values = train_edge_values.to(device)
val_edge_index = val_edge_index.to(device)
val_edge_values = val_edge_values.to(device)
test_edge_index = test_edge_index.to(device)
test_edge_values = test_edge_values.to(device)

r_mat_train_edge_index = r_mat_train_edge_index.to(device)
r_mat_train_edge_values = r_mat_train_edge_values.to(device)
r_mat_val_edge_index = r_mat_val_edge_index.to(device)
r_mat_val_edge_values = r_mat_val_edge_values.to(device)
r_mat_test_edge_index = r_mat_test_edge_index.to(device)
r_mat_test_edge_values = r_mat_test_edge_values.to(device)

Using device: cuda


In [55]:
for iter in range(ITERATIONS):
    # FORWARD PASS
    pred_ratings = model(train_edge_index, train_edge_values, num_users, num_movies)
    train_loss = loss_function(pred_ratings, r_mat_train_edge_values.view(-1, 1))

    # BACKWARD PASS
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    # VALIDATION SET
    if iter % ITERS_PER_EVAL == 0:
        model.eval()
        with torch.no_grad():
            pred_ratings = model(val_edge_index, val_edge_values, num_users, num_movies)
            val_loss = loss_function(pred_ratings, r_mat_val_edge_values.view(-1, 1)).sum()

            recall, precision, gini, coverage = get_recommendation_metrics(
                r_mat_val_edge_index, r_mat_val_edge_values, pred_ratings, num_movies, k=N_ELEMENTS_REC
            )
            print(
                f"Iter {iter}/{ITERATIONS}, Train loss: {train_loss.item():.4f}, "
                f"Val loss: {val_loss.item():.4f}, Recall: {recall:.4f}, Precision: {precision:.4f}, Gini: {gini:.4f}, Coverage: {coverage:.4f}"
            )
            model.train()

    if iter % ITERS_PER_LR_DECAY == 0 and iter != 0:
        scheduler.step()

Iter 0/25000, Train loss: 14.1444, Val loss: 14.2225, Recall: 0.0000, Precision: 0.0000, Gini: 0.6252, Coverage: 0.5922
Iter 200/25000, Train loss: 2.3885, Val loss: 2.3656, Recall: 0.0259, Precision: 0.1499, Gini: 0.7457, Coverage: 0.4673
Iter 400/25000, Train loss: 1.7089, Val loss: 1.6842, Recall: 0.1130, Precision: 0.3223, Gini: 0.7459, Coverage: 0.4667
Iter 600/25000, Train loss: 1.5281, Val loss: 1.5029, Recall: 0.1629, Precision: 0.3767, Gini: 0.7457, Coverage: 0.4679
Iter 800/25000, Train loss: 1.4335, Val loss: 1.4089, Recall: 0.1923, Precision: 0.3993, Gini: 0.7455, Coverage: 0.4679
Iter 1000/25000, Train loss: 1.3715, Val loss: 1.3479, Recall: 0.2159, Precision: 0.4223, Gini: 0.7450, Coverage: 0.4679
Iter 1200/25000, Train loss: 1.3262, Val loss: 1.3039, Recall: 0.2320, Precision: 0.4416, Gini: 0.7449, Coverage: 0.4679
Iter 1400/25000, Train loss: 1.2909, Val loss: 1.2699, Recall: 0.2420, Precision: 0.4513, Gini: 0.7446, Coverage: 0.4667
Iter 1600/25000, Train loss: 1.2624, 

In [56]:
model.eval()
with torch.no_grad():
    pred_ratings = model.forward(test_edge_index, test_edge_values, num_users, num_movies)
    test_loss = loss_function(pred_ratings, r_mat_test_edge_values.view(-1, 1))
    recall, precision, gini, coverage = get_recommendation_metrics(
        r_mat_test_edge_index, r_mat_test_edge_values, pred_ratings, num_movies, k=N_ELEMENTS_REC
    )
    print(f"Test metrics:\nLoss: {test_loss.item():.4f}\nRecall: {recall:.4f}\n"
          f"Precision: {precision:.4f}\nGini Index: {gini:.4f}\nCoverage: {coverage:.4f}")

Test metrics:
Loss: 1.0631
Recall: 0.3539
Precision: 0.5324
Gini Index: 0.7512
Coverage: 0.4608


# **1M Dataset**

In [69]:
rating_path = "" # insert here the path of 'ratings.csv' dataset

rating_df = pd.read_csv(rating_path, delimiter='::', header=None)
rating_df.columns = ['userId', 'movieId', 'rating', 'timestamp']

lbl_user = preprocessing.LabelEncoder( )
lbl_movie = preprocessing.LabelEncoder( )

rating_df.userId = lbl_user.fit_transform(rating_df.userId.values)
rating_df.movieId = lbl_movie.fit_transform(rating_df.movieId.values)

num_users = len(rating_df['userId'].unique())
num_movies = len(rating_df['movieId'].unique())

  rating_df = pd.read_csv(rating_path, delimiter='::', header=None)


In [70]:
edge_index, edge_values = create_bipartite_edge_tensors(rating_df, 'userId', 'movieId', 'rating')
print(f"{len(edge_index)} x {len(edge_index[0])}")

edge_index = torch.LongTensor(edge_index)
edge_values = torch.tensor(edge_values)

2 x 1000209


  edge_values = torch.tensor(edge_values)


In [71]:
num_interactions = edge_index.shape[1]
all_indices = [i for i in range(num_interactions)]

train_indices, test_indices = train_test_split(all_indices, test_size=0.2, random_state=1)
val_indices, test_indices = train_test_split(test_indices, test_size=0.5, random_state=1)


train_edge_index = edge_index[:, train_indices]
train_edge_value = edge_values[train_indices]

val_edge_index = edge_index[:, val_indices]
val_edge_value = edge_values[val_indices]

test_edge_index = edge_index[:, test_indices]
test_edge_value = edge_values[test_indices]


train_edge_index, train_edge_values = r_to_adj_matrix(train_edge_index, train_edge_value, num_users, num_movies)
val_edge_index, val_edge_values = r_to_adj_matrix(val_edge_index, val_edge_value, num_users, num_movies)
test_edge_index, test_edge_values = r_to_adj_matrix(test_edge_index, test_edge_value, num_users, num_movies)

r_mat_train_edge_index, r_mat_train_edge_values = adj_to_r_matrix(train_edge_index, train_edge_values, num_users, num_movies)
r_mat_val_edge_index, r_mat_val_edge_values = adj_to_r_matrix(val_edge_index, val_edge_values, num_users, num_movies)
r_mat_test_edge_index, r_mat_test_edge_values = adj_to_r_matrix(test_edge_index, test_edge_values, num_users, num_movies)

In [77]:
model = LightGCN(num_users, num_movies)

ITERATIONS = 10000
ITERS_PER_EVAL = 200
ITERS_PER_LR_DECAY = 200
N_ELEMENTS_REC = 10

model.train()
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=0.01)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)
loss_function = nn.MSELoss()

In [78]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

model = model.to(device)

train_edge_index = train_edge_index.to(device)
train_edge_values = train_edge_values.to(device)
val_edge_index = val_edge_index.to(device)
val_edge_values = val_edge_values.to(device)
test_edge_index = test_edge_index.to(device)
test_edge_values = test_edge_values.to(device)

r_mat_train_edge_index = r_mat_train_edge_index.to(device)
r_mat_train_edge_values = r_mat_train_edge_values.to(device)
r_mat_val_edge_index = r_mat_val_edge_index.to(device)
r_mat_val_edge_values = r_mat_val_edge_values.to(device)
r_mat_test_edge_index = r_mat_test_edge_index.to(device)
r_mat_test_edge_values = r_mat_test_edge_values.to(device)

Using device: cuda


In [79]:
for iter in range(ITERATIONS):
    # FORWARD PASS
    pred_ratings = model(train_edge_index, train_edge_values, num_users, num_movies)
    train_loss = loss_function(pred_ratings, r_mat_train_edge_values.view(-1, 1))

    # BACKWARD PASS
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    # VALIDATION SET
    if iter % ITERS_PER_EVAL == 0:
        model.eval()
        with torch.no_grad():
            pred_ratings = model(val_edge_index, val_edge_values, num_users, num_movies)
            val_loss = loss_function(pred_ratings, r_mat_val_edge_values.view(-1, 1)).sum()

            recall, precision, gini, coverage = get_recommendation_metrics(
                r_mat_val_edge_index, r_mat_val_edge_values, pred_ratings, num_movies, k=N_ELEMENTS_REC
            )
            print(
                f"Iter {iter}/{ITERATIONS}, Train loss: {train_loss.item():.4f}, "
                f"Val loss: {val_loss.item():.4f}, Recall: {recall:.4f}, Precision: {precision:.4f}, Gini: {gini:.4f}, Coverage: {coverage:.4f}"
            )
            model.train()

    if iter % ITERS_PER_LR_DECAY == 0 and iter != 0:
        scheduler.step()

Iter 0/10000, Train loss: 13.5167, Val loss: 13.5193, Recall: 0.0000, Precision: 0.0000, Gini: 0.7184, Coverage: 0.7407
Iter 200/10000, Train loss: 4.0989, Val loss: 4.0721, Recall: 0.0002, Precision: 0.0054, Gini: 0.8286, Coverage: 0.6317
Iter 400/10000, Train loss: 2.2994, Val loss: 2.2912, Recall: 0.0655, Precision: 0.3476, Gini: 0.8286, Coverage: 0.6309
Iter 600/10000, Train loss: 1.9006, Val loss: 1.8940, Recall: 0.1297, Precision: 0.4678, Gini: 0.8284, Coverage: 0.6311
Iter 800/10000, Train loss: 1.7153, Val loss: 1.7091, Recall: 0.1686, Precision: 0.5194, Gini: 0.8283, Coverage: 0.6311
Iter 1000/10000, Train loss: 1.6030, Val loss: 1.5970, Recall: 0.1948, Precision: 0.5440, Gini: 0.8281, Coverage: 0.6311
Iter 1200/10000, Train loss: 1.5253, Val loss: 1.5194, Recall: 0.2145, Precision: 0.5607, Gini: 0.8280, Coverage: 0.6311
Iter 1400/10000, Train loss: 1.4671, Val loss: 1.4614, Recall: 0.2310, Precision: 0.5751, Gini: 0.8278, Coverage: 0.6311
Iter 1600/10000, Train loss: 1.4212, 

In [80]:
model.eval()
with torch.no_grad():
    pred_ratings = model.forward(test_edge_index, test_edge_values, num_users, num_movies)
    test_loss = loss_function(pred_ratings, r_mat_test_edge_values.view(-1, 1))
    recall, precision, gini, coverage = get_recommendation_metrics(
        r_mat_test_edge_index, r_mat_test_edge_values, pred_ratings, num_movies, k=N_ELEMENTS_REC
    )
    print(f"Test metrics:\nLoss: {test_loss.item():.4f}\nRecall: {recall:.4f}\n"
          f"Precision: {precision:.4f}\nGini Index: {gini:.4f}\nCoverage: {coverage:.4f}")

Test metrics:
Loss: 1.1216
Recall: 0.3820
Precision: 0.6723
Gini Index: 0.8212
Coverage: 0.6371
