In [1]:
import sys
import os
import pandas as pd
import numpy as np
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

import pickle

from recommenders.utils.timer import Timer
from recommenders.models.deeprec.models.graphrec.lightgcn import LightGCN
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
from recommenders.evaluation.python_evaluation import map, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.utils.constants import SEED as DEFAULT_SEED
from recommenders.models.deeprec.deeprec_utils import prepare_hparams
from recommenders.utils.notebook_utils import store_metadata

# Log the interpreter and core library versions so a run can be reproduced.
print(
    f"System version: {sys.version}",
    f"Pandas version: {pd.__version__}",
    f"Tensorflow version: {tf.__version__}",
    sep="\n",
)

2025-05-05 20:52:15.621430: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-05 20:52:15.646605: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-05 20:52:15.646631: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-05 20:52:15.646650: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-05 20:52:15.651801: I tensorflow/core/platform/cpu_feature_g

System version: 3.9.18 | packaged by conda-forge | (main, Dec 23 2023, 16:33:10) 
[GCC 12.3.0]
Pandas version: 2.2.3
Tensorflow version: 2.14.0


In [2]:
# Dataset at a glance (approximate figures):
#   - customers:                          111K
#   - avg. transactions per customer:     13.7
#   - avg. items per transaction:         3.2
#   - unique transactions:                2.9M
#   - total interactions:                 ~9M

# Input locations; relative paths are resolved against the working directory.
LIGHT_GCN_YAML = "data/lightgcn.yaml"  # hyperparameter file handed to prepare_hparams
TRANSACTIONS_PATH = "../../../data/processed/model_ready.csv"  # interaction log read with pd.read_csv

In [3]:
# Run configuration shared by the training and evaluation cells.
SEED = DEFAULT_SEED  # use None instead for non-deterministic results
TOP_K = 10  # number of items to recommend (passed as top_k / k downstream)
BATCH_SIZE = 2048  # with spare GPU/Colab RAM this can be pushed to 8192 or even 16384
EPOCHS = 50  # NOTE(review): an earlier note here said "go with 10" — confirm the intended value

In [4]:
# Read the interaction log from disk (about one minute on the full file).
transactions = pd.read_csv(filepath_or_buffer=TRANSACTIONS_PATH)

# Preview the first rows as the cell output.
transactions.head()

Unnamed: 0,userID,itemID,rating,timestamp
0,1,1,0.420415,2025-03-21 06:10:09
1,2,2,0.666342,2025-03-21 06:35:18
2,3,3,0.666342,2025-03-21 06:48:35
3,4,4,0.666342,2025-03-21 06:56:18
4,5,5,0.420415,2025-03-21 07:16:02


In [5]:
# Remove interactions that lack a user or item identifier.
# The row counts are printed explicitly: a bare expression such as
# `transactions.shape` is only displayed when it is the last line of a cell.
rows_before = len(transactions)
transactions = transactions.dropna(subset=['userID', 'itemID'])

# Continue on an independent copy so later filtering never touches `transactions`.
df = transactions.copy()
print(f"Rows: {rows_before:,} -> {len(df):,} after dropping missing userID/itemID")

In [6]:
# Keep only rows whose itemID is neither -1 nor 0 (presumably placeholder IDs
# for unknown items — confirm against the upstream preprocessing), and parse
# `timestamp` to datetime for the time-aware train/test split.
#
# Both steps run in one chained expression: `.assign` returns a fresh frame, so
# the column assignment can never hit SettingWithCopyWarning on a filtered
# slice, and the two sentinel checks collapse into a single mask.
df = (
    df.loc[~df["itemID"].isin([-1, 0])]
    .assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"]))
)

In [7]:
from datetime import timedelta

In [8]:
# Time-aware split: the last TEST_WINDOW_DAYS days of data (measured back from
# the newest timestamp) form the test set; everything strictly earlier is used
# for training.
TEST_WINDOW_DAYS = 18

# pd.Timedelta keeps the date arithmetic inside pandas, so this cell only
# depends on `pd` and not on a separately executed `datetime` import cell.
cutoff = df["timestamp"].max() - pd.Timedelta(days=TEST_WINDOW_DAYS)
train_df = df[df["timestamp"] < cutoff].reset_index(drop=True)
test_df = df[df["timestamp"] >= cutoff].reset_index(drop=True)

# Show the covered date range of each split as a sanity check.
print("Train:", train_df.timestamp.min(), "→", train_df.timestamp.max())
print("Test: ", test_df.timestamp.min(),  "→", test_df.timestamp.max())

Train: 2025-01-01 00:01:28 → 2025-03-13 23:43:54
Test:  2025-03-14 00:04:56 → 2025-03-31 23:56:35


In [9]:
# Restrict the test set to customers who also appear in the training set.
seen_in_train = test_df["userID"].isin(train_df["userID"])
test_df = test_df[seen_in_train]

In [10]:
## Create and Train the Model

In [11]:
# Wrap the train/test interactions in the ImplicitCF container that is handed
# to LightGCN further down; SEED is forwarded for reproducibility.
data = ImplicitCF(
    train=train_df,
    test=test_df,
    seed=SEED,
)

In [12]:
# Build the LightGCN hyperparameters from the YAML file plus the run-specific
# settings given as keyword arguments (presumably these take precedence over
# the YAML values — confirm against prepare_hparams).
yaml_file = LIGHT_GCN_YAML
hparams = prepare_hparams(
    yaml_file,  # was assigned but never used; pass it so the path has one source of truth
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    learning_rate=0.01,
    eval_epoch=5,  # the training log reports recall/ndcg/precision/map at epochs 5, 10, ...
    top_k=TOP_K,
)

In [13]:
# Compatibility shim: guarantee that `np.mat` exists by aliasing it to
# `np.matrix` when this NumPy build does not provide the attribute
# (presumably needed by the LightGCN code instantiated next — confirm).
try:
    getattr(np, "mat")
except AttributeError:
    np.mat = np.matrix
# Instantiate LightGCN from the prepared hyperparameters and interaction data.
model = LightGCN(
    hparams,
    data,
    seed=SEED,
)

Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.


2025-05-05 20:52:26.063893: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-05-05 20:52:26.066984: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-05-05 20:52:26.067090: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [14]:
# Fit the model while measuring wall-clock time with the Timer context manager.
with Timer() as train_time:
    model.fit()

# Report the elapsed time once the timed block has closed.
training_summary = "Took {} seconds for training.".format(train_time.interval)
print(training_summary)

Epoch 1 (train)3.5s: train loss = 0.27127 = (mf)0.27062 + (embed)0.00065
Epoch 2 (train)3.1s: train loss = 0.10881 = (mf)0.10695 + (embed)0.00186
Epoch 3 (train)3.1s: train loss = 0.06119 = (mf)0.05828 + (embed)0.00290
Epoch 4 (train)3.1s: train loss = 0.04185 = (mf)0.03820 + (embed)0.00365
Epoch 5 (train)3.0s + (eval)1.9s: train loss = 0.03271 = (mf)0.02851 + (embed)0.00420, recall = 0.05913, ndcg = 0.05163, precision = 0.01791, map = 0.02729
Epoch 6 (train)3.1s: train loss = 0.02635 = (mf)0.02174 + (embed)0.00461
Epoch 7 (train)3.0s: train loss = 0.02373 = (mf)0.01880 + (embed)0.00492
Epoch 8 (train)3.0s: train loss = 0.02069 = (mf)0.01553 + (embed)0.00515
Epoch 9 (train)3.0s: train loss = 0.01932 = (mf)0.01399 + (embed)0.00533
Epoch 10 (train)3.0s + (eval)1.8s: train loss = 0.01755 = (mf)0.01210 + (embed)0.00545, recall = 0.05133, ndcg = 0.04475, precision = 0.01503, map = 0.02439
Epoch 11 (train)3.0s: train loss = 0.01677 = (mf)0.01123 + (embed)0.00553
Epoch 12 (train)3.0s: train l

In [15]:
# Produce the TOP_K highest-scoring items for test_df. remove_seen=True
# presumably excludes items a user already interacted with during training —
# confirm against the LightGCN.recommend_k_items documentation.
topk_scores = model.recommend_k_items(
    test_df,
    top_k=TOP_K,
    remove_seen=True,
)

# Preview the recommendations as the cell output.
topk_scores.head()

Unnamed: 0,userID,itemID,prediction
0,1,19,8.544017
1,1,262,8.326998
2,1,540,7.442498
3,1,10,6.892238
4,1,41,6.819953


In [16]:
# Score the top-K recommendations against the held-out interactions.
# Note: `map` here is the ranking metric imported from recommenders, which
# shadows Python's built-in `map` in this notebook.
# NOTE(review): test_df is never de-duplicated on (userID, itemID) in this
# notebook — confirm the metrics treat repeated pairs as intended.
eval_map = map(test_df, topk_scores, k=TOP_K)
eval_ndcg = ndcg_at_k(test_df, topk_scores, k=TOP_K)
eval_precision = precision_at_k(test_df, topk_scores, k=TOP_K)
eval_recall = recall_at_k(test_df, topk_scores, k=TOP_K)

# One metric per line, label and value separated by a tab.
metric_lines = [
    "MAP:\t%f" % eval_map,
    "NDCG:\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
]
print("\n".join(metric_lines))

MAP:	0.024021
NDCG:	0.047133
Precision@K:	0.015257
Recall@K:	0.049419


In [17]:

# Record results for tests - ignore this cell
# Each metric is stored under its short name, in the same order as before.
for metric_name, metric_value in (
    ("map", eval_map),
    ("ndcg", eval_ndcg),
    ("precision", eval_precision),
    ("recall", eval_recall),
):
    store_metadata(metric_name, metric_value)

In [18]:
# Export the model's embeddings: infer_embedding is given one output path for
# users and one for items (both under the local data/ directory).
item_file = "data/item_embeddings.csv"
user_file = "data/user_embeddings.csv"
model.infer_embedding(user_file, item_file)