In [1]:
!git clone https://github.com/Hrithik2212/watchfree.git

Cloning into 'watchfree'...
remote: Enumerating objects: 430, done.[K
remote: Counting objects: 100% (430/430), done.[K
remote: Compressing objects: 100% (215/215), done.[K
remote: Total 430 (delta 175), reused 395 (delta 140), pack-reused 0[K
Receiving objects: 100% (430/430), 268.95 KiB | 1.52 MiB/s, done.
Resolving deltas: 100% (175/175), done.


In [11]:
!pip install -q isbnlib

In [12]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!pip install -q kaggle
!kaggle datasets download -d odedgolden/movielens-1m-dataset

Downloading movielens-1m-dataset.zip to /content
 86% 5.00M/5.83M [00:00<00:00, 46.5MB/s]
100% 5.83M/5.83M [00:00<00:00, 52.8MB/s]


In [16]:
!mv /content/movies.dat /content/watchfree/mldev/data/
!mv /content/ratings.dat /content/watchfree/mldev/data/
!mv /content/users.dat /content/watchfree/mldev/data/


In [10]:
import sys
import os
# Append the parent directory to sys.path for relative imports
project_root = dir_path = "/content/watchfree/mldev/"# os.path.dirname(os.getcwd())
sys.path.append(project_root)

import numpy as np
import pandas as pd
import random
import scipy.sparse as sp
import tensorflow as tf
from tensorflow.keras.utils import Progbar
from src.utils import preprocess, metrics
from src import LightGCN

# Suppress warnings for cleaner notebook presentation
import warnings
warnings.simplefilter("ignore")

## Dataset

In [17]:
columns = ["movie_id", "title", "genre"]
movies = pd.read_table("/content/watchfree/mldev/data/movies.dat", names = columns, sep = "::", encoding = "latin1", engine = "python")
movies

Unnamed: 0,movieid,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [18]:
columns = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table("/content/watchfree/mldev/data/ratings.dat", names = columns, sep = "::", encoding = "latin1", engine='python')
ratings

Unnamed: 0,userid,movieid,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [19]:
ratings.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
userid,1000209.0,3024.512,1728.413,1.0,1506.0,3070.0,4476.0,6040.0
movieid,1000209.0,1865.54,1096.041,1.0,1030.0,1835.0,2770.0,3952.0
rating,1000209.0,3.581564,1.117102,1.0,3.0,4.0,4.0,5.0
timestamp,1000209.0,972243700.0,12152560.0,956703932.0,965302637.0,973018006.0,975220939.0,1046455000.0


In [21]:
ratings = ratings.drop('timestamp' , axis=1)

In [22]:
columns = ['User_ID', 'Gender', 'Age', 'Occupation', 'Zip-code']
users = pd.read_table("/content/watchfree/mldev/data/users.dat", names = columns, sep = "::", encoding = "latin1", engine='python')
users

Unnamed: 0,User_ID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


### Train Test Split

In [23]:
# Split data into training and testing sets
train_size = 0.75
train, test = preprocess.stratified_split(ratings, 'user_id', train_size)
print(f'Train Shape: {train.shape}\nTest Shape: {test.shape}')
print(f'Do they have the same users?: {set(train.user_id) == set(test.user_id)}')


Train Shape: (750121, 3)
Test Shape: (250088, 3)
Do they have the same users?: True


In [24]:
# Assuming train and test DataFrames are already defined
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

# Combine train and test data
combined = pd.concat([train, test]).reset_index(drop=True)

# Count unique users and movies
n_users = combined['user_id'].max()
n_movies = combined['movie_id'].max()
print('Number of users:', n_users)
print('Number of movies:', n_movies)

# Create user and item mappings
user2id = {uid: idx for idx, uid in enumerate(combined['user_id'].unique())}
book2id = {book: idx for idx, book in enumerate(combined['movie_id'].unique())}
id2user = {idx: uid for uid, idx in user2id.items()}
id2item = {idx: book for book, idx in book2id.items()}

# Apply mappings to train and test sets
train['user_id_new'] = train['user_id'].map(user2id)
train['movie_id_new'] = train['movie_id'].map(book2id)
test['user_id_new'] = test['user_id'].map(user2id)
test['movie_id_new'] = test['movie_id'].map(book2id)


# Check for NaNs after reindexing
print("NaNs in train_reindex user_id:", train['user_id_new'].isna().sum())
print("NaNs in test_reindex user_id:", test['user_id_new'].isna().sum())

# Create a DataFrame to keep track of which books each user has interacted with
interacted = train.groupby("user_id_new")["movie_id_new"].apply(set).reset_index()
interacted.rename(columns={"movie_id_new": "movie_interacted"}, inplace=True)

Number of users: 6040
Number of movies: 3952
NaNs in train_reindex user_id: 0
NaNs in test_reindex user_id: 0


In [25]:
interacted

Unnamed: 0,user_id_new,movie_interacted
0,0,"{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
1,1,"{2, 9, 16, 40, 41, 42, 43, 44, 45, 46, 47, 48,..."
2,2,"{134, 135, 136, 137, 138, 139, 140, 141, 142, ..."
3,3,"{1, 164, 165, 5, 6, 166, 167, 168, 169, 170, 4..."
4,4,"{2, 5, 20, 32, 33, 56, 61, 66, 72, 76, 105, 10..."
...,...,...
6035,6035,"{1, 2, 2052, 6, 9, 13, 14, 16, 17, 20, 22, 23,..."
6036,6036,"{512, 2561, 513, 1, 6, 520, 522, 524, 1036, 17..."
6037,6037,"{1441, 2371, 548, 132, 1000, 136, 168, 240, 59..."
6038,6038,"{1, 514, 516, 6, 7, 8, 520, 11, 26, 1565, 546,..."


### Adjacency matrix

The adjacency matrix is a data structure the represents a graph by encoding the connections and between nodes. In our case, nodes are both users and movies. Rows and columns consist of ALL the nodes and for every connection (reviewed movie) there is the value 1.

To first create the adjacency matrix we first create a user-item graph where similar to the adjacency matrix, connected users and movies are represented as 1 in a sparse array. Unlike the adjacency matrix, a user-item graph only has users for the columns/rows and items as the other, whereas the adjacency matrix has both users and items concatenated as rows and columns.

In this case, because the graph is undirected (meaning the connections between nodes do not have a specified direction) the adjacency matrix is symmetric. We use this to our advantage by transposing the user-item graph to create the adjacency matrix.

Our adjacency matrix will not include self-connections where each node is connected to itself.

In [26]:
# Create user-item interaction matrix
R = sp.dok_matrix((n_users, n_movies), dtype=np.float32)
for _, row in train.iterrows():
    R[row['user_id_new'], row['movie_id_new']] = 1

# Create adjacency matrix
adj_mat = sp.dok_matrix((n_users + n_movies, n_users + n_movies), dtype=np.float32)
adj_mat[:n_users, n_users:] = R
adj_mat[n_users:, :n_users] = R.T

In [27]:
# Calculate normalized adjacency matrix
D_values = np.array(adj_mat.sum(1))
D_inv_values = np.power(D_values + 1e-9, -0.5).flatten()
D_inv_values[np.isinf(D_inv_values)] = 0.0
D_inv_sq_root = sp.diags(D_inv_values)
norm_adj_mat = D_inv_sq_root.dot(adj_mat).dot(D_inv_sq_root)

In [28]:
# Convert to SparseTensor for TensorFlow
coo = norm_adj_mat.tocoo().astype(np.float32)
indices = np.mat([coo.row, coo.col]).transpose()
A_tilde = tf.SparseTensor(indices, coo.data, coo.shape)


## Model : _LightGCN_

LightGCN keeps neighbor aggregation while removing self-connections, feature transformation, and nonlinear activation, simplifying as well as improving performance.

Neighbor aggregation is done through graph convolutions to learn embeddings that represent nodes. The size of the embeddings can be changed to whatever number. In this notebook, we set the embedding dimension to 64.

In matrix form, graph convolution can be thought of as matrix multiplication. In the implementation we create a graph convolution layer that performs just this, allowing us to stack as many graph convolutions as we want. We have the number of layers as 10 in this notebook.
Custom training

For training, we batch a number of users from the training set and sample a single positive item (movie that has been reviewed) and a single negative item (movie that has not been reviewed) for each user.

![image.png](attachment:c4aa4ba6-d530-46b1-b558-6cf9f8ea58d8.png)

In [30]:
# Model configuration
N_LAYERS = 10
EMBED_DIM = 64
DECAY = 0.0001
EPOCHS = 1
BATCH_SIZE = 1024
LEARNING_RATE = 1e-2

# Initialize LightGCN model
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model = LightGCN.LightGCN(A_tilde, n_users, n_movies, N_LAYERS, EMBED_DIM, DECAY)

In [31]:
%%time
# Custom training loop with negative sampling and gradient updates
for epoch in range(1, EPOCHS + 1):
    print(f'Epoch {epoch}/{EPOCHS}')
    n_batch = len(train) // BATCH_SIZE + (len(train) % BATCH_SIZE != 0)
    bar = Progbar(n_batch)

    for _ in range(n_batch):
        # Sample a batch of users for training
        users = np.random.choice(n_users, BATCH_SIZE, replace=False)

        # Function for negative sampling
        def sample_neg(user_interacted_items):
            while True:
                neg_item = random.randint(0, n_movies - 1)
                if neg_item not in user_interacted_items:
                    return neg_item

        # Sample positive and negative items for each user
        pos_items = [random.choice(list(interacted[interacted['user_id_new'] == u]['movie_interacted'].values[0])) for u in users]
        neg_items = [sample_neg(interacted[interacted['user_id_new'] == u]['movie_interacted'].values[0]) for u in users]

        with tf.GradientTape() as tape:
            # Call LightGCN with user and item embeddings
            new_user_embeddings, new_item_embeddings = model(
                (model.user_embedding, model.item_embedding)
            )

            # Embeddings after convolutions
            user_embeddings = tf.nn.embedding_lookup(new_user_embeddings, users)
            pos_item_embeddings = tf.nn.embedding_lookup(new_item_embeddings, pos_items)
            neg_item_embeddings = tf.nn.embedding_lookup(new_item_embeddings, neg_items)

            # Initial embeddings before convolutions
            old_user_embeddings = tf.nn.embedding_lookup(model.user_embedding, users)
            old_pos_item_embeddings = tf.nn.embedding_lookup(model.item_embedding, pos_items)
            old_neg_item_embeddings = tf.nn.embedding_lookup(model.item_embedding, neg_items)

            # Calculate training loss
            pos_scores = tf.reduce_sum(tf.multiply(user_embeddings, pos_item_embeddings), axis=1)
            neg_scores = tf.reduce_sum(tf.multiply(user_embeddings, neg_item_embeddings), axis=1)
            regularizer = (tf.nn.l2_loss(old_user_embeddings) +
                           tf.nn.l2_loss(old_pos_item_embeddings) +
                           tf.nn.l2_loss(old_neg_item_embeddings)) / BATCH_SIZE
            mf_loss = tf.reduce_mean(tf.nn.softplus(-(pos_scores - neg_scores)))
            emb_loss = DECAY * regularizer
            training_loss = mf_loss + emb_loss

        # Apply gradients
        grads = tape.gradient(training_loss, model.trainable_weights)
        optimizer.apply_gradients(zip(grads, model.trainable_weights))
        bar.add(1, values=[('training loss', float(training_loss))])

    # Evaluate on the test set
    test_loss = 0.0
    n_test_batch = len(test) // BATCH_SIZE + (len(test) % BATCH_SIZE != 0)
    for _ in range(n_test_batch):
        # Sample a batch of users for testing
        users = np.random.choice(n_users, BATCH_SIZE, replace=False)

        # Sample positive items for each user
        pos_items = [random.choice(list(interacted[interacted['user_id_new'] == u]['movie_interacted'].values[0])) for u in users]

        # Calculate test loss
        user_embeddings = tf.nn.embedding_lookup(new_user_embeddings, users)
        pos_item_embeddings = tf.nn.embedding_lookup(new_item_embeddings, pos_items)
        pos_scores = tf.reduce_sum(tf.multiply(user_embeddings, pos_item_embeddings), axis=1)
        test_loss += tf.reduce_mean(tf.nn.softplus(-pos_scores))

    test_loss /= n_test_batch
    print(f'Test loss: {test_loss.numpy()}')


Epoch 1/1
Test loss: 0.07308534532785416
CPU times: user 13min 21s, sys: 27.8 s, total: 13min 48s
Wall time: 15min 20s


In [48]:
# Generate recommendations
users = np.array([user2id[x] for x in test['user_id'].unique()])
recommendations = model.recommend(users, k=10)

# Replace new user and book IDs with original IDs
recommendations['user_id'] = recommendations['user_id'].map(id2user)
recommendations['movie_id'] = recommendations['movie_id'].map(id2item)

# Display the first 5 recommendations
recommendations.head(5)

Unnamed: 0,user_id,book_name,prediction
0,1,2858.0,7.071446
1,1,589.0,5.962182
2,1,593.0,5.944461
3,1,1196.0,5.932372
4,1,1210.0,5.899832


In [47]:
recommendations

Unnamed: 0,user_id,book_name,prediction,userid,movieid
0,0,127,7.071446,1,2858.0
1,0,77,5.962181,1,589.0
2,0,138,5.944461,1,593.0
3,0,86,5.932371,1,1196.0
4,0,82,5.899832,1,1210.0
...,...,...,...,...,...
60395,6039,530,7.944375,6040,858.0
60396,6039,314,7.936084,6040,2396.0
60397,6039,187,7.861442,6040,1580.0
60398,6039,128,7.854582,6040,2628.0


## **Evaluation Metrics**

The performance of our model is evaluated using the test set, which consists of the exact same users in the training set but with movies the users have reviewed that the model has not seen before. A good model will recommend movies that the user has also reviewed in the test set.

### **Precision@k**

Out of the movies that are recommended, what proportion is relevant. Relevant in this case is if the user has reviewed the movie.

A precision@10 of about 0.1 means that about 10% of the recommendations are relevant to the user. In other words, out of the 10 recommendations made, on average a user will have 1 movie that is actually relevant.

### **Recall@k**

Out of all the relevant movies (in the test set), how many are recommended.

A recall@10 of 0.1 means that about 10% of the relevant books were recommended. By definition you can see how even if all the recommendations made were relevant, recall@k is capped by k. A higher k means that more relevant movies can be recommended.

## **Mean Average Precision (MAP)**

Calculate the average precision for each user and average all the average precisions over all users. Penalizes incorrect rankings of movies.

## **Normalized Discounted Cumulative Gain (NDGC)**

Looks at both relevant movies and the ranking order of the relevant movies. Normalized by the total number of users.

In [53]:
# Evaluate model performance
top_k = recommendations.copy()
top_k['rank'] = top_k.groupby('user_id', sort=False).cumcount() + 1

# Calculate evaluation metrics
precision_at_k = metrics.precision_at_k(top_k, test, 'user_id', 'movie_id', 'rank')
recall_at_k = metrics.recall_at_k(top_k, test, 'user_id', 'movie_id', 'rank')
mean_average_precision = metrics.mean_average_precision(top_k, test, 'user_id', 'movie_id', 'rank')
ndcg = metrics.ndcg(top_k, test, 'user_id', 'movie_id', 'rank')

# Display evaluation metrics
print(f'Precision: {precision_at_k:.6f}',
      f'Recall: {recall_at_k:.6f}',
      f'MAP: {mean_average_precision:.6f}',
      f'NDCG: {ndcg:.6f}', sep='\n')

Precision: 0.215679
Recall: 0.069046
MAP: 0.033587
NDCG: 0.231854


In [56]:
top_k

Unnamed: 0,user_id,book_name,prediction,rank
0,1,2858.0,7.071446,1
1,1,589.0,5.962182,2
2,1,593.0,5.944461,3
3,1,1196.0,5.932372,4
4,1,1210.0,5.899832,5
...,...,...,...,...
60395,6040,858.0,7.944376,6
60396,6040,2396.0,7.936085,7
60397,6040,1580.0,7.861442,8
60398,6040,2628.0,7.854582,9
