# Experimentation and Results

## Objective of the project 

This study seeks to conduct a thorough comparative analysis of these three models, focusing
on their performance with regards to accuracy, computational complexity, scalability, and their
effectiveness in handling data sparsity and dynamically changing environments. By evaluat-
ing these aspects, the research aims to illuminate the operational strengths and weaknesses
of each model, providing clear insights that could guide the development and deployment of
future recommender systems. Through this comparative framework, we aspire to answer which
model, under what conditions, provides the most reliable and robust recommendations, thereby
significantly contributing to the optimization of digital services.

In [13]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter, defaultdict
from surprise import Dataset, Reader, KNNBasic, SVD, CoClustering, accuracy
#from surprise.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from surprise.accuracy import rmse, mae

In [5]:
links_df = pd.read_csv('MovieLens_100k/links.csv')
movies_df = pd.read_csv('MovieLens_100k/movies.csv')
ratings_df = pd.read_csv('MovieLens_100k/ratings.csv')
tags_df = pd.read_csv('MovieLens_100k/tags.csv')

datasets = {
    "Links": links_df,
    "Movies": movies_df,
    "Ratings": ratings_df,
    "Tags": tags_df
}

datasets_info = {name: df.head() for name, df in datasets.items()}
datasets_info

{'Links':    movieId  imdbId   tmdbId
 0        1  114709    862.0
 1        2  113497   8844.0
 2        3  113228  15602.0
 3        4  114885  31357.0
 4        5  113041  11862.0,
 'Movies':    movieId                               title  \
 0        1                    Toy Story (1995)   
 1        2                      Jumanji (1995)   
 2        3             Grumpier Old Men (1995)   
 3        4            Waiting to Exhale (1995)   
 4        5  Father of the Bride Part II (1995)   
 
                                         genres  
 0  Adventure|Animation|Children|Comedy|Fantasy  
 1                   Adventure|Children|Fantasy  
 2                               Comedy|Romance  
 3                         Comedy|Drama|Romance  
 4                                       Comedy  ,
 'Ratings':    userId  movieId  rating  timestamp
 0       1        1     4.0  964982703
 1       1        3     4.0  964981247
 2       1        6     4.0  964982224
 3       1       47     5.0  9

## Dataset structure

In [6]:
# Check for missing values in each dataset
missing_values = {name: df.isnull().sum() for name, df in datasets.items()}

# Print the information about missing values
for name, missing in missing_values.items():
    print(f"Missing values in {name} dataset:\n{missing}\n")

Missing values in Links dataset:
movieId    0
imdbId     0
tmdbId     8
dtype: int64

Missing values in Movies dataset:
movieId    0
title      0
genres     0
dtype: int64

Missing values in Ratings dataset:
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

Missing values in Tags dataset:
userId       0
movieId      0
tag          0
timestamp    0
dtype: int64



In [7]:
# Print the shape of each DataFrame
for name, df in datasets.items():
    print(f"The shape of the {name} DataFrame is: {df.shape}")

The shape of the Links DataFrame is: (9742, 3)
The shape of the Movies DataFrame is: (9742, 3)
The shape of the Ratings DataFrame is: (100836, 4)
The shape of the Tags DataFrame is: (3683, 4)


In [8]:
distribution_of_ratings = ratings_df.groupby('rating').size().reset_index(name='count')
distribution_of_ratings

Unnamed: 0,rating,count
0,0.5,1370
1,1.0,2811
2,1.5,1791
3,2.0,7551
4,2.5,5550
5,3.0,20047
6,3.5,13136
7,4.0,26818
8,4.5,8551
9,5.0,13211


# Hypergraph-based Models: Node2Vec Algorithm 

In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
import hypernetx as hnx
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, regularizers
from collections import defaultdict
from joblib import Parallel, delayed
import time
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt


# Load the dataset
ratings_df = pd.read_csv('MovieLens_100k/ratings.csv')
movies_df = pd.read_csv('MovieLens_100k/movies.csv')

train_df, test_df = train_test_split(ratings_df, test_size=0.20, random_state=42)

# Build the hypergraph
edges = defaultdict(list)
for _, row in train_df.iterrows():
    user_node = f'user_{row["userId"]}'
    movie_node = f'movie_{row["movieId"]}'
    rating = row["rating"]
    hyperedge = f'{movie_node}_rating_{rating}'
    edges[hyperedge].append(user_node)
    edges[hyperedge].append(movie_node)

H = hnx.Hypergraph(edges)
print(f"Hypergraph created with {len(H.nodes)} nodes and {len(H.edges)} edges.")
print(f"Number of nodes in hypergraph: {len(H.nodes)}")
print(f"Sample nodes: {list(H.nodes)[:5]}")


# Create adjacency matrix for hypergraph
def create_hypergraph_adjacency_matrix(hypergraph):
    node_list = list(hypergraph.nodes)
    node_idx = {node: idx for idx, node in enumerate(node_list)}
    n = len(node_list)
    
    data = []
    row = []
    col = []

    for edge in hypergraph.edges:
        edge_nodes = list(hypergraph.edges[edge])
        for i in range(len(edge_nodes)):
            for j in range(i + 1, len(edge_nodes)):
                node_i = node_idx[edge_nodes[i]]
                node_j = node_idx[edge_nodes[j]]
                data.append(1)
                row.append(node_i)
                col.append(node_j)
                data.append(1)
                row.append(node_j)
                col.append(node_i)

    adj_matrix = csr_matrix((data, (row, col)), shape=(n, n))
    print(f"Adjacency matrix created with shape {adj_matrix.shape}")
    return adj_matrix, node_idx

adj_matrix, node_to_idx = create_hypergraph_adjacency_matrix(H)
adj_matrix_sparse = tf.sparse.SparseTensor(indices=np.array([adj_matrix.nonzero()[0], adj_matrix.nonzero()[1]]).T,
                                           values=adj_matrix.data.astype(np.float32),
                                           dense_shape=adj_matrix.shape)
adj_matrix_sparse = tf.sparse.reorder(adj_matrix_sparse)


# Evaluation metrics functions
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold)) for (est, true_r) in user_ratings[:k])
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0
    return precisions, recalls

def compute_mse(predictions):
    """Compute Mean Squared Error (MSE)."""
    mse = np.mean([(true_r - est) ** 2 for (_, _, true_r, est, _) in predictions])
    return mse

def compute_rmse(predictions):
    """Compute Root Mean Squared Error (RMSE)."""
    mse = compute_mse(predictions)
    rmse = np.sqrt(mse)
    return rmse
    
def compute_mae(predictions):
    mae = np.mean([abs(true_r - est) for (_, _, true_r, est, _) in predictions])
    return mae

# Functions to generate sparse and new user data
def get_sparse_data(ratings, frac=0.1):
    sparse_ratings_df = ratings.sample(frac=frac, random_state=42) 
    return sparse_ratings_df

def get_new_user_data(ratings, frac=0.1):
    new_user_ratings_df = ratings[ratings['userId'].isin(ratings['userId'].sample(frac=frac, random_state=42))]
    return new_user_ratings_df

def evaluate_model(test, embeddings, user_mapping, movie_mapping, scenario, algorithm):
    def predict_rating(user, movie):
        if user in user_mapping and movie in movie_mapping:
            user_idx = user_mapping[user]
            movie_idx = movie_mapping[movie]
            if user_idx >= embeddings.shape[0] or movie_idx >= embeddings.shape[0]:
                return 0
            user_emb = embeddings[user_idx]
            movie_emb = embeddings[movie_idx]
            return np.dot(user_emb, movie_emb)
        else:
            return 0

    predictions = []
    for _, row in test.iterrows():
        uid = row['userId']
        mid = row['movieId']
        true_r = row['rating']
        est = predict_rating(uid, mid)
        predictions.append((uid, mid, true_r, est, None))

    mse = compute_mse(predictions)
    rmse = compute_rmse(predictions)
    mae = compute_mae(predictions)
    precisions, recalls = precision_recall_at_k(predictions, k=10)

    avg_precision = np.mean(list(precisions.values()))
    avg_recall = np.mean(list(recalls.values()))

    results = pd.DataFrame({
        'Scenario': [scenario],
        'Algorithm': [algorithm],
        'MSE':[mse],
        'RMSE': [rmse],
        'MAE': [mae],
        'Precision@10': [avg_precision],
        'Recall@10': [avg_recall]
    })
    
    return results

Hypergraph created with 9593 nodes and 26961 edges.
Number of nodes in hypergraph: 9593
Sample nodes: ['user_509.0', 'movie_7347.0', 'user_380.0', 'user_274.0', 'user_474.0']
Adjacency matrix created with shape (9593, 9593)


## Original Dataset Node2Vec 

In [55]:
import pandas as pd
from sklearn.model_selection import train_test_split
import hypernetx as hnx
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, regularizers
from collections import defaultdict
from joblib import Parallel, delayed
import time
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt

class Node2Vec(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim):
        super(Node2Vec, self).__init__()
        self.embedding = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                                          embeddings_initializer=tf.keras.initializers.RandomNormal(stddev=1.0),
                                          embeddings_regularizer=tf.keras.regularizers.l2(1e-5))  # Reduced regularization
        self.dropout = layers.Dropout(0.5)
        self.batch_norm = layers.BatchNormalization()
        self.dense1 = layers.Dense(512, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-5))
        self.dense2 = layers.Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-5))
        self.dense3 = layers.Dense(1, activation='linear')
    
    def call(self, inputs):
        x = self.embedding(inputs)
        x = tf.reduce_mean(x, axis=1)
        x = self.batch_norm(x)
        x = self.dropout(x)
        x = self.dense1(x)
        x = self.dropout(x)
        x = self.dense2(x)
        return self.dense3(x)

def train_node2vec_model(H, train, test, scenario):
    user_ids = train['userId'].unique()
    movie_ids = train['movieId'].unique()

    user_mapping = {user_id: idx for idx, user_id in enumerate(user_ids)}
    movie_mapping = {movie_id: idx + len(user_ids) for idx, movie_id in enumerate(movie_ids)}

    train['user_idx'] = train['userId'].map(user_mapping)
    train['movie_idx'] = train['movieId'].map(movie_mapping)

    test['user_idx'] = test['userId'].map(user_mapping)
    test['movie_idx'] = test['movieId'].map(movie_mapping)

    # Generate random walks from the hypergraph
    def random_walk(hypergraph, start_node, walk_length):
        walk = [start_node]
        while len(walk) < walk_length:
            cur = walk[-1]
            neighbors = list(hypergraph.neighbors(cur))
            if neighbors:
                walk.append(np.random.choice(neighbors))
            else:
                break
        return [str(node) for node in walk]

    def generate_walks(hypergraph, num_walks, walk_length):
        print("Generating random walks...")
        nodes = list(hypergraph.nodes)
        walks = Parallel(n_jobs=-1)(delayed(random_walk)(hypergraph, np.random.choice(nodes), walk_length) for _ in range(num_walks))
        print("Random walks generation completed.")
        return walks

    num_walks = 100  
    walk_length = 50  
    dimensions = 128  
    window_size = 5 
    epochs = 150  
    learning_rate = 0.001  

    walks = generate_walks(H, num_walks, walk_length)

    # Convert walks to integer indices
    node_to_idx = {node: idx for idx, node in enumerate(H.nodes)}
    walks_indices = [[node_to_idx[node] for node in walk if node in node_to_idx] for walk in walks]
    vocab_size = len(node_to_idx)

    X = []
    y = []
    for walk in walks_indices:
        if len(walk) > window_size:
            for i in range(len(walk) - window_size):
                context = walk[i:i + window_size]
                target = walk[i + window_size]
                X.append(context)
                y.append(target)

    X = np.array(X)
    y = np.array(y)

    if X.size == 0 or y.size == 0:
        print("No data generated for training. Check the random walk and context-target extraction steps.")
        return pd.DataFrame({
            'Scenario': [scenario],
            'Algorithm': ['Node2Vec'],
            'MSE': [None],
            'RMSE': [None],
            'MAE': [None],
            'Precision@10': [None],
            'Recall@10': [None],
            'Running Time (s)': [None]
        })
    else:
        dataset = tf.data.Dataset.from_tensor_slices((X, y)).batch(256).shuffle(buffer_size=1024).repeat()
        steps_per_epoch = len(X) // 256
        if steps_per_epoch == 0:
            steps_per_epoch = 1

        lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=learning_rate,
            decay_steps=1000,
            decay_rate=0.96,
            staircase=True)

        model = Node2Vec(vocab_size, dimensions)
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule), loss='mean_squared_error', metrics=['accuracy'])
        print("Starting model training...")
        start_time = time.time()
        history = model.fit(dataset, epochs=epochs, steps_per_epoch=steps_per_epoch)
        end_time = time.time()
        print(f"Training completed in {end_time - start_time:.2f} seconds")

        embeddings = model.embedding.get_weights()[0]

    results = evaluate_model(test, embeddings, user_mapping, movie_mapping, scenario, "Node2Vec")
    results['Running Time (s)'] = end_time - start_time
    
    return results


# Evaluate Node2Vec model for different scenarios
results_node2vec_normal = train_node2vec_model(H, train_df, test_df, "Normal")

Generating random walks...
Random walks generation completed.
Starting model training...
Epoch 1/150
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - accuracy: 0.0000e+00 - loss: 2246440.7500
Epoch 2/150
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.0000e+00 - loss: 2344078.0000
Epoch 3/150
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.0000e+00 - loss: 2053699.5000
Epoch 4/150
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.0000e+00 - loss: 1756091.2500
Epoch 5/150
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.0000e+00 - loss: 1661303.6250
Epoch 6/150
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.0000e+00 - loss: 1549297.2500
Epoch 7/150
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.0000e+00 - loss: 1713258.7500
E

In [56]:
results_node2vec_normal

Unnamed: 0,Scenario,Algorithm,MSE,RMSE,MAE,Precision@10,Recall@10,Running Time (s)
0,Normal,Node2Vec,46.897165,6.84815,4.982268,0.474496,0.11272,59.128732


# Sparse data Node2vec

In [48]:
sparse_train_df = get_sparse_data(train_df, frac=0.1)

# Sparse 
# Build the hypergraph
edges_sparse = defaultdict(list)
for _, row in sparse_train_df.iterrows():
    user_node = f'user_{row["userId"]}'
    movie_node = f'movie_{row["movieId"]}'
    rating = row["rating"]
    hyperedge = f'{movie_node}_rating_{rating}'
    edges_sparse[hyperedge].append(user_node)
    edges_sparse[hyperedge].append(movie_node)

H_sparse = hnx.Hypergraph(edges_sparse)


def train_node2vec_model(H, train, test, scenario):
    user_ids = train['userId'].unique()
    movie_ids = train['movieId'].unique()

    user_mapping = {user_id: idx for idx, user_id in enumerate(user_ids)}
    movie_mapping = {movie_id: idx + len(user_ids) for idx, movie_id in enumerate(movie_ids)}

    train['user_idx'] = train['userId'].map(user_mapping)
    train['movie_idx'] = train['movieId'].map(movie_mapping)

    test['user_idx'] = test['userId'].map(user_mapping)
    test['movie_idx'] = test['movieId'].map(movie_mapping)

    # Generate random walks from the hypergraph
    def random_walk(hypergraph, start_node, walk_length):
        walk = [start_node]
        while len(walk) < walk_length:
            cur = walk[-1]
            neighbors = list(hypergraph.neighbors(cur))
            if neighbors:
                walk.append(np.random.choice(neighbors))
            else:
                break
        return [str(node) for node in walk]

    def generate_walks(hypergraph, num_walks, walk_length):
        print("Generating random walks...")
        nodes = list(hypergraph.nodes)
        walks = Parallel(n_jobs=-1)(delayed(random_walk)(hypergraph, np.random.choice(nodes), walk_length) for _ in range(num_walks))
        print("Random walks generation completed.")
        return walks

    num_walks = 100  
    walk_length = 50  
    dimensions = 128  
    window_size = 5 
    epochs = 150  
    learning_rate = 0.001  

    walks = generate_walks(H_sparse, num_walks, walk_length)

    # Convert walks to integer indices
    node_to_idx = {node: idx for idx, node in enumerate(H_sparse.nodes)}
    walks_indices = [[node_to_idx[node] for node in walk if node in node_to_idx] for walk in walks]
    vocab_size = len(node_to_idx)
    
    X = []
    y = []
    for walk in walks_indices:
        if len(walk) > window_size:
            for i in range(len(walk) - window_size):
                context = walk[i:i + window_size]
                target = walk[i + window_size]
                X.append(context)
                y.append(target)

    X = np.array(X)
    y = np.array(y)

    if X.size == 0 or y.size == 0:
        print("No data generated for training. Check the random walk and context-target extraction steps.")
        return pd.DataFrame({
            'Scenario': [scenario],
            'Algorithm': ['Node2Vec'],
            'MSE': [None],
            'RMSE': [None],
            'MAE': [None],
            'Precision@10': [None],
            'Recall@10': [None],
            'Running Time (s)': [None]
        })
    else:
        dataset = tf.data.Dataset.from_tensor_slices((X, y)).batch(256).shuffle(buffer_size=1024).repeat()
        steps_per_epoch = len(X) // 256
        if steps_per_epoch == 0:
            steps_per_epoch = 1

        lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=learning_rate,
            decay_steps=1000,
            decay_rate=0.96,
            staircase=True)

        model = Node2Vec(vocab_size, dimensions)
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule), loss='mean_squared_error', metrics=['accuracy'])
        print("Starting model training...")
        start_time = time.time()
        history = model.fit(dataset, epochs=epochs, steps_per_epoch=steps_per_epoch)
        end_time = time.time()
        print(f"Training completed in {end_time - start_time:.2f} seconds")

        embeddings = model.embedding.get_weights()[0]

    results = evaluate_model(test, embeddings, user_mapping, movie_mapping, scenario, "Node2Vec")
    results['Running Time (s)'] = end_time - start_time
    
    return results


results_node2vec_sparse = train_node2vec_model(H_sparse, sparse_train_df, test_df, "Sparse")

Generating random walks...
Random walks generation completed.
Starting model training...
Epoch 1/150
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 7.6526e-04 - loss: 1451524.5000
Epoch 2/150
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.0000e+00 - loss: 1294319.2500
Epoch 3/150
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.0000e+00 - loss: 1151162.1250
Epoch 4/150
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.0000e+00 - loss: 994514.1250 
Epoch 5/150
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.0000e+00 - loss: 747250.3750
Epoch 6/150
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.0000e+00 - loss: 751999.0000
Epoch 7/150
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.0000e+00 - loss: 764305.7500
Epoc

In [49]:
results_node2vec_sparse

Unnamed: 0,Scenario,Algorithm,MSE,RMSE,MAE,Precision@10,Recall@10,Running Time (s)
0,Sparse,Node2Vec,48.482455,6.962934,5.095941,0.407286,0.099925,38.807848


# New user Node2Vec

In [50]:
new_user_train_df = get_new_user_data(train_df, frac=0.1)

# Build the hypergraph
edges_new_user = defaultdict(list)
for _, row in new_user_train_df.iterrows():
    user_node = f'user_{row["userId"]}'
    movie_node = f'movie_{row["movieId"]}'
    rating = row["rating"]
    hyperedge = f'{movie_node}_rating_{rating}'
    edges_new_user[hyperedge].append(user_node)
    edges_new_user[hyperedge].append(movie_node)

H_new_user = hnx.Hypergraph(edges_new_user)


def train_node2vec_model(H, train, test, scenario):
    user_ids = train['userId'].unique()
    movie_ids = train['movieId'].unique()

    user_mapping = {user_id: idx for idx, user_id in enumerate(user_ids)}
    movie_mapping = {movie_id: idx + len(user_ids) for idx, movie_id in enumerate(movie_ids)}

    train['user_idx'] = train['userId'].map(user_mapping)
    train['movie_idx'] = train['movieId'].map(movie_mapping)

    test['user_idx'] = test['userId'].map(user_mapping)
    test['movie_idx'] = test['movieId'].map(movie_mapping)

    # Generate random walks from the hypergraph
    def random_walk(hypergraph, start_node, walk_length):
        walk = [start_node]
        while len(walk) < walk_length:
            cur = walk[-1]
            neighbors = list(hypergraph.neighbors(cur))
            if neighbors:
                walk.append(np.random.choice(neighbors))
            else:
                break
        return [str(node) for node in walk]

    def generate_walks(hypergraph, num_walks, walk_length):
        print("Generating random walks...")
        nodes = list(hypergraph.nodes)
        walks = Parallel(n_jobs=-1)(delayed(random_walk)(hypergraph, np.random.choice(nodes), walk_length) for _ in range(num_walks))
        print("Random walks generation completed.")
        return walks

    num_walks = 100  
    walk_length = 50  
    dimensions = 128  
    window_size = 5 
    epochs = 150  
    learning_rate = 0.001  

    walks = generate_walks(H_new_user, num_walks, walk_length)

    # Convert walks to integer indices
    node_to_idx = {node: idx for idx, node in enumerate(H_new_user.nodes)}
    walks_indices = [[node_to_idx[node] for node in walk if node in node_to_idx] for walk in walks]
    vocab_size = len(node_to_idx)
    
    X = []
    y = []
    for walk in walks_indices:
        if len(walk) > window_size:
            for i in range(len(walk) - window_size):
                context = walk[i:i + window_size]
                target = walk[i + window_size]
                X.append(context)
                y.append(target)

    X = np.array(X)
    y = np.array(y)

    if X.size == 0 or y.size == 0:
        print("No data generated for training. Check the random walk and context-target extraction steps.")
        return pd.DataFrame({
            'Scenario': [scenario],
            'Algorithm': ['Node2Vec'],
            'MSE': [None],
            'RMSE': [None],
            'MAE': [None],
            'Precision@10': [None],
            'Recall@10': [None],
            'Running Time (s)': [None]
        })
    else:
        dataset = tf.data.Dataset.from_tensor_slices((X, y)).batch(256).shuffle(buffer_size=1024).repeat()
        steps_per_epoch = len(X) // 256
        if steps_per_epoch == 0:
            steps_per_epoch = 1

        lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=learning_rate,
            decay_steps=1000,
            decay_rate=0.96,
            staircase=True)

        model = Node2Vec(vocab_size, dimensions)
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule), loss='mean_squared_error', metrics=['accuracy'])
        print("Starting model training...")
        start_time = time.time()
        history = model.fit(dataset, epochs=epochs, steps_per_epoch=steps_per_epoch)
        end_time = time.time()
        print(f"Training completed in {end_time - start_time:.2f} seconds")

        embeddings = model.embedding.get_weights()[0]

    results = evaluate_model(test, embeddings, user_mapping, movie_mapping, scenario, "Node2Vec")
    results['Running Time (s)'] = end_time - start_time
    
    return results


results_node2vec_new_user = train_node2vec_model(H_new_user, new_user_train_df, test_df, "New User")

Generating random walks...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['user_idx'] = train['userId'].map(user_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['movie_idx'] = train['movieId'].map(movie_mapping)


Random walks generation completed.
Starting model training...
Epoch 1/150
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - accuracy: 0.0000e+00 - loss: 2410074.2500
Epoch 2/150
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.0000e+00 - loss: 2663334.2500
Epoch 3/150
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.0000e+00 - loss: 2267722.0000
Epoch 4/150
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.0000e+00 - loss: 2033621.7500
Epoch 5/150
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.0000e+00 - loss: 1813715.5000
Epoch 6/150
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.0000e+00 - loss: 1780027.7500
Epoch 7/150
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.0000e+00 - loss: 1920920.2500
Epoch 8/150
[1m17/17[0m [

In [51]:
results_node2vec_new_user

Unnamed: 0,Scenario,Algorithm,MSE,RMSE,MAE,Precision@10,Recall@10,Running Time (s)
0,New User,Node2Vec,47.357681,6.881692,4.988233,0.442582,0.097093,59.448123


In [52]:
# # Combine Node2Vec results into a single DataFrame
results_node2vec_combined = pd.concat([results_node2vec_normal, results_node2vec_sparse, results_node2vec_new_user], ignore_index=True)

results_node2vec_combined

Unnamed: 0,Scenario,Algorithm,MSE,RMSE,MAE,Precision@10,Recall@10,Running Time (s)
0,Normal,Node2Vec,47.26765,6.875147,5.017519,0.473556,0.104442,57.877078
1,Sparse,Node2Vec,48.482455,6.962934,5.095941,0.407286,0.099925,38.807848
2,New User,Node2Vec,47.357681,6.881692,4.988233,0.442582,0.097093,59.448123
