## Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [39]:
import pandas as pd
import numpy as np
from pydantic import BaseModel
import sys
import os
import mlflow
from loguru import logger
from load_dotenv import load_dotenv
import time
import json

sys.path.insert(0, "..")

from src.utils.embedding_id_mapper import IDMapper
from src.algo.cf_u2u import U2UCollaborativeFiltering
from src.eval.utils import create_rec_df, create_label_df, merge_recs_with_target
from src.eval.log_metrics import log_ranking_metrics, log_classification_metrics

In [3]:
_ = load_dotenv(override = True)

## Arguments

In [4]:
class Args(BaseModel):
    testing: bool = False
    log_to_mlflow: bool = True
    experiment_name: str = "first-attempt"
    run_name: str = "002-u2u-cf"
    notebook_persit_dp: str = None
    
    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    top_K: int = 100
    top_k: int = 10

    train_data_fp: str = os.path.abspath("../data_for_ai/interim/train_sample_interactions_16407u.parquet")
    val_data_fp: str = os.path.abspath("../data_for_ai/interim/val_sample_interactions_16407u.parquet")

    def init(self):
        self.notebook_persit_dp = os.path.abspath(f"data/{self.run_name}")

        if not os.environ.get("MLFLOW_TRACKING_URI"):
            self.log_to_mlflow = False
            logger.warning("MLFlow is not enabled. Turn off tracking to Mlflow.")

        if self.log_to_mlflow:
            logger.info(
                f"Setting up Mlflow experiment: {self.experiment_name}, run_name: {self.run_name}"
            )

            mlflow.set_experiment(self.experiment_name)
            mlflow.start_run(run_name=self.run_name)

        if not self.testing:
            os.makedirs(self.notebook_persit_dp, exist_ok=True)
        return self
    
args = Args().init()
print(args.model_dump_json(indent=2))

[32m2025-04-04 10:26:13.756[0m | [1mINFO    [0m | [36m__main__[0m:[36minit[0m:[36m27[0m - [1mSetting up Mlflow experiment: first-attempt, run_name: 002-u2u-cf[0m


{
  "testing": false,
  "log_to_mlflow": true,
  "experiment_name": "first-attempt",
  "run_name": "002-u2u-cf",
  "notebook_persit_dp": "/home/dinhln/Desktop/real_time_recsys/notebooks/data/002-u2u-cf",
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "top_K": 100,
  "top_k": 10,
  "train_data_fp": "/home/dinhln/Desktop/real_time_recsys/data_for_ai/interim/train_sample_interactions_16407u.parquet",
  "val_data_fp": "/home/dinhln/Desktop/real_time_recsys/data_for_ai/interim/val_sample_interactions_16407u.parquet"
}


## Load data

In [5]:
train_df = pd.read_parquet(args.train_data_fp)
val_df = pd.read_parquet(args.val_data_fp)

assert set(val_df[args.user_col].unique()).issubset(set(train_df[args.user_col].unique())), "Validation users must be present in training users."
assert set(val_df[args.item_col].unique()).issubset(set(train_df[args.item_col].unique())), "Validation items must be present in training items."
assert train_df[args.timestamp_col].max() < val_df[args.timestamp_col].min(), "Validation data must be after training data. Otherwise, its a data contamination problem."

In [6]:
train_df.head(3)

Unnamed: 0,user_id,parent_asin,rating,timestamp
3194,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B06XKCPK5W,2.0,2012-06-11 16:41:10
3199,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B000CKVOOY,3.0,2012-08-02 02:04:13
3200,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B006GWO5WK,5.0,2012-09-15 16:34:46


## Convert user_id and item_id to indices

In [7]:
idm_path = os.path.abspath("../data_for_ai/interim/idm_16407u.json")
idm = IDMapper().load(idm_path)
idm.get_user_id(1)

'AE227WAM4NWQPJI33OPN7ZARNNZQ'

In [8]:
train_df = train_df.pipe(idm.map_indices)
val_df = val_df.pipe(idm.map_indices)

assert idm.unknown_item_index not in train_df["item_indice"].values, "Unknown item index must be present in training data."
assert idm.unknown_user_index not in train_df["user_indice"].values, "Unknown user index must be present in training data."
assert idm.unknown_item_index not in val_df["item_indice"].values, "Unknown item index must be present in validation data."
assert idm.unknown_user_index not in val_df["user_indice"].values, "Unknown user index must be present in validation data."

In [9]:
train_df.head(3)

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice
3194,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B06XKCPK5W,2.0,2012-06-11 16:41:10,3931,2905
3199,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B000CKVOOY,3.0,2012-08-02 02:04:13,3931,89
3200,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B006GWO5WK,5.0,2012-09-15 16:34:46,3931,758


In [10]:
assert train_df.groupby(args.user_col)[args.item_col].nunique().min() >= 5, "Each user must have at least five items."
assert train_df.groupby(args.item_col)[args.user_col].nunique().min() >= 10, "Each item must have at least ten users."

## Init model

In [11]:
def init_model(n_users, n_items):
    model = U2UCollaborativeFiltering(n_users, n_items)
    return model

## Overfit 1 batch

In [24]:
train_overfit_df = train_df.loc[lambda df: df["user_indice"].lt(20) & df["item_indice"].lt(1000)]

## Train model

In [12]:
users = train_df["user_indice"].values
items = train_df["item_indice"].values
rating = train_df["rating"].values

n_users = train_df["user_indice"].nunique()
n_items = train_df["item_indice"].nunique()

logger.info(f"Number of users: {n_users}, Number of items: {n_items}")

[32m2025-04-04 10:26:50.807[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mNumber of users: 16407, Number of items: 4817[0m


In [13]:
# Calculate the upper bound loss, the loss that we wanna beat
mean_rating = np.mean(rating)

naive_mse = (
    (
        val_df["rating"] 
        - val_df.assign(naive_pred = mean_rating)["naive_pred"]
    )
    .apply(lambda x: x ** 2)
    .mean()
)
logger.info(f"Naive MSE: {naive_mse} given mean rating: {mean_rating}")

[32m2025-04-04 10:26:52.772[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mNaive MSE: 1.8868801583202892 given mean rating: 4.403243531775936[0m


### Training

In [14]:
start_time = time.time()

model = init_model(n_users, n_items)
model.fit(users, items, rating)

end_time = time.time()
execution_time = end_time - start_time
logger.info(f"Execution time: {execution_time:.2f} seconds")

[32m2025-04-04 10:27:02.530[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mExecution time: 7.89 seconds[0m


# Test predict

In [15]:
val_sample_df = val_df.sample(1)
sample_user = val_sample_df["user_indice"].values
sample_item = val_sample_df["item_indice"].values
sample_rating = val_sample_df["rating"].values

logger.info(f"Sample user: {sample_user}, Sample item: {sample_item}, Sample rating: {sample_rating}")
model.predict(sample_user, sample_item, logging=True)

[32m2025-04-04 10:27:04.423[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mSample user: [11896], Sample item: [1503], Sample rating: [2.][0m
[32m2025-04-04 10:27:04.424[0m | [34m[1mDEBUG   [0m | [36msrc.algo.cf_u2u[0m:[36mforward[0m:[36m81[0m - [34m[1mUser sim: [0.09363362 0.08985716 0.         0.         0.         0.
 0.         0.         0.         0.        ][0m
[32m2025-04-04 10:27:04.424[0m | [34m[1mDEBUG   [0m | [36msrc.algo.cf_u2u[0m:[36mforward[0m:[36m82[0m - [34m[1mItem rating: [5. 5. 1. 5. 5. 5. 5. 5. 5. 5.][0m
[32m2025-04-04 10:27:04.424[0m | [34m[1mDEBUG   [0m | [36msrc.algo.cf_u2u[0m:[36mforward[0m:[36m83[0m - [34m[1mLogit: 4.999999999999999[0m


array([0.99330715])

In [109]:
model.predict([15689], [4816])

array([0.99330715])

In [30]:
model.recommend(sample_user, keep_interacted=False, k=args.top_K)

Recommending items:   0%|          | 0/1 [00:00<?, ?user/s]

Recommending items: 100%|██████████| 1/1 [00:00<00:00,  1.50user/s]


{'user_indice': [13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760,
  13760],
 'recommendation': [4816,
  2454,
  2470,
  2467,
  2463,
  2461,
  2460,
  2457,
  

## Evaluate

### Recommendation metrics

In [43]:
val_user_indices = val_df["user_indice"].unique()
logger.info(f"Number of users in validation set: {len(val_user_indices)}")

[32m2025-04-03 22:44:16.807[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mNumber of users in validation set: 2424[0m


In [44]:
val_user_indices

array([12159,   870, 13529, ...,  5497,  6427,  2423])

In [45]:
recommendations = model.recommend(val_user_indices, keep_interacted=True, k=args.top_K)

Recommending items:   0%|          | 0/2424 [00:00<?, ?user/s]

Recommending items: 100%|██████████| 2424/2424 [23:00<00:00,  1.76user/s]


In [None]:
rec_df = pd.DataFrame(recommendations).pipe(create_rec_df, idm = idm)
rec_df.head(3)

Unnamed: 0,user_indice,recommendation,score,rec_ranking,user_id,parent_asin
0,12159,4816,0.993307,1.0,AGZE3IYHOEGKUTJZSQCSFSQ4IFFQ,B0CGZ1XLPX
1,12159,2696,0.993307,2.0,AGZE3IYHOEGKUTJZSQCSFSQ4IFFQ,B01K9KWA3U
2,12159,2647,0.993307,3.0,AGZE3IYHOEGKUTJZSQCSFSQ4IFFQ,B01I2JJ7BI


In [18]:
rec_df.loc[rec_df["user_id"] == "AHZZM7BCJAF2UEMMBHZCLXBB2SVA"]

Unnamed: 0,user_indice,recommendation,score,rec_ranking,user_id,parent_asin
210000,16403,4816,0.993307,1.0,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B0CGZ1XLPX
210001,16403,1952,0.993307,2.0,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B00SMBFZNG
210002,16403,3851,0.993307,3.0,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B084H2NHNN
210003,16403,1981,0.993307,4.0,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B00TIBFEIA
210004,16403,1978,0.993307,5.0,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B00TB8XZOK
...,...,...,...,...,...,...
210095,16403,2200,0.993307,96.0,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B00YFJT29C
210096,16403,2199,0.993307,97.0,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B00YCC5HNM
210097,16403,3760,0.993307,98.0,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B07XPQF1FJ
210098,16403,2190,0.993307,99.0,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B00Y0959EE


In [24]:
label_df = create_label_df(val_df)
label_df

Unnamed: 0,user_id,parent_asin,rating,rating_rank
12855326,AEMYBWDN67IB5IBTMHLHN76V4QHQ,B091K4WYD1,4.0,1.0
4327569,AEJIJK6DNQRSGOSMSRTSIVHKEWRQ,B07KTYJ769,5.0,1.0
18287739,AHQSVMSTGTE5YW577ATYHRWWA3EQ,B07GZFM1ZM,3.0,1.0
5917697,AFQWFCSD3NNG5LIG6SO7DCUKCIJA,B08F1P3BCC,5.0,1.0
10962783,AGVPCCFOHWKJOO5B6G266N2ZHALQ,B09K4R9KDM,5.0,1.0
...,...,...,...,...
3348993,AFIGGCK7HZAP24TECVJXGOFP5IIA,B09G3MBH6V,1.0,8.0
7915094,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B004FV4ROA,1.0,8.0
3832028,AFKERAMSXU4MWO3H53R7DEFOHUVQ,B08J9NQ6CS,4.0,9.0
2102221,AEN2KQVSR5TWRXNQS3OTFT4EZQCA,B0BRT7XFM5,5.0,9.0


In [22]:
label_df.loc[label_df["user_id"] == "AHZZM7BCJAF2UEMMBHZCLXBB2SVA"]

Unnamed: 0,user_id,parent_asin,rating,rating_rank
19254013,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B075QC3TZY,1.0,1.0


In [25]:
eval_df = merge_recs_with_target(rec_df, label_df, k=args.top_K)
eval_df

Unnamed: 0,user_indice,recommendation,score,rec_ranking,user_id,parent_asin,rating,rating_rank
0,8.0,1196.0,0.993307,1,AE24AB4DW5KYK3F5DYOT5VPW2VLA,B00C9TEBJQ,0,
64,8.0,3352.0,0.993307,2,AE24AB4DW5KYK3F5DYOT5VPW2VLA,B07D5CGJCM,0,
58,8.0,3336.0,0.993307,3,AE24AB4DW5KYK3F5DYOT5VPW2VLA,B07CVVP5X8,0,
26,8.0,1440.0,0.993307,4,AE24AB4DW5KYK3F5DYOT5VPW2VLA,B00G5N0ZZQ,0,
59,8.0,3338.0,0.993307,5,AE24AB4DW5KYK3F5DYOT5VPW2VLA,B07CY7NLNN,0,
...,...,...,...,...,...,...,...,...
245775,16403.0,2199.0,0.993307,97,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B00YCC5HNM,0,
245788,16403.0,3760.0,0.993307,98,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B07XPQF1FJ,0,
245774,16403.0,2190.0,0.993307,99,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B00Y0959EE,0,
245773,16403.0,2188.0,0.993307,100,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B00XTH0WL8,0,


In [28]:
ranking_report = log_ranking_metrics(args, eval_df)


invalid value encountered in divide



In [31]:
ranking_report.as_dict()

{'metrics': [{'metric': 'NDCGKMetric',
   'result': {'k': 10,
    'current': 1     0.000000
    2     0.000000
    3     0.000126
    4     0.000126
    5     0.000286
    6     0.000286
    7     0.000286
    8     0.000286
    9     0.000413
    10    0.000413
    dtype: float64,
    'current_value': 0.0004128194517193025,
    'reference': None,
    'reference_value': None}},
  {'metric': 'RecallTopKMetric',
   'result': {'k': 100, 'current': 0     0.000000
    1     0.000000
    2     0.000206
    3     0.000206
    4     0.000619
            ...   
    95    0.017010
    96    0.017423
    97    0.017423
    98    0.017836
    99    0.018042
    Length: 100, dtype: float64, 'current_value': 0.018041804180418043, 'reference': None, 'reference_value': None}},
  {'metric': 'PrecisionTopKMetric',
   'result': {'k': 100,
    'current': 0     0.000000
    1     0.000000
    2     0.000138
    3     0.000103
    4     0.000165
            ...   
    95    0.000262
    96    0.000264
    9

## Classification metrics

In [33]:
val_user_indices = val_df["user_indice"].values
val_item_indices = val_df["item_indice"].values

In [34]:
classifications = model.predict(val_user_indices, val_item_indices)

In [38]:
min_rel_score = 2
eval_classification_df = val_df.assign(
    classification_proba=classifications,
    label=lambda df: df[args.rating_col].gt(min_rel_score).astype(int),
)
eval_classification_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,classification_proba,label
4668,AGZE3IYHOEGKUTJZSQCSFSQ4IFFQ,B0B787CN26,5.0,2021-10-27 19:43:57.873,12159,4440,0.500000,1
10425,AEANO5BIASSZNFWNXBR2ECHCPJQQ,B0002MQGOA,5.0,2021-02-02 14:20:48.424,870,59,0.993307,1
10426,AEANO5BIASSZNFWNXBR2ECHCPJQQ,B07HZLHPKP,5.0,2021-03-08 13:56:57.795,870,3472,0.988148,1
13265,AHDXCFTV7RS3AM6E2TRPWOG3A33Q,B07QWPVZJY,3.0,2021-12-11 00:34:19.152,13529,3630,0.993307,1
14423,AEFHRRLFCZQ3TWNYCBA7UD3NIXCA,B00D96J8IM,1.0,2021-10-17 20:54:19.325,1481,1262,0.993307,0
...,...,...,...,...,...,...,...,...
33760091,AHIIISHZP6YAVVHMDEBLJ5CWZ7ZA,B0BZ62FQ13,3.0,2021-07-16 17:08:55.044,14144,4693,0.993307,1
34470392,AFTE3G43QHXWD3DJGDCI2DHEWQJQ,B08DMXDPW5,5.0,2021-01-14 01:48:09.423,7343,3923,0.500000,1
35019360,AFENZZDPVUYFVBS47YDOWJCDYBSQ,B09XBT6DS9,4.0,2021-12-05 00:35:40.874,5497,4335,0.993307,1
35323250,AFMBZYPDAXT5VO3ME67HW5Q5TAOQ,B097KBF8JK,5.0,2022-02-18 11:32:46.732,6427,4147,0.500000,1


In [72]:
eval_classification_df.loc[lambda df: (df["classification_proba"] > 0.88) & (df["classification_proba"] > 0.5)]

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,classification_proba,label
10425,AEANO5BIASSZNFWNXBR2ECHCPJQQ,B0002MQGOA,5.0,2021-02-02 14:20:48.424,870,59,0.993307,1
10426,AEANO5BIASSZNFWNXBR2ECHCPJQQ,B07HZLHPKP,5.0,2021-03-08 13:56:57.795,870,3472,0.988148,1
13265,AHDXCFTV7RS3AM6E2TRPWOG3A33Q,B07QWPVZJY,3.0,2021-12-11 00:34:19.152,13529,3630,0.993307,1
14423,AEFHRRLFCZQ3TWNYCBA7UD3NIXCA,B00D96J8IM,1.0,2021-10-17 20:54:19.325,1481,1262,0.993307,0
23297,AGK63CGC7N3MB2QN56EZNDZYNGIQ,B07HZLHPKP,3.0,2021-01-14 20:35:43.045,10201,3472,0.986969,1
...,...,...,...,...,...,...,...,...
32499474,AH5HVRS323QOWLMMWCEX3GJXNKLA,B07ZDXHZ6J,1.0,2021-01-24 15:11:12.439,12657,3784,0.987318,0
32967153,AG3JRUUWEDPQOLULW4QI5LE6LYSQ,B06W2PMP6C,5.0,2021-01-28 17:20:28.043,8393,2881,0.993307,1
33479788,AE2GH2QNHK6AVGUOTECE6FOECE2A,B091K4WYD1,2.0,2021-06-24 18:03:40.116,42,4086,0.993307,0
33760091,AHIIISHZP6YAVVHMDEBLJ5CWZ7ZA,B0BZ62FQ13,3.0,2021-07-16 17:08:55.044,14144,4693,0.993307,1


In [77]:
classification_report = log_classification_metrics(args, eval_classification_df)

In [76]:
classification_report.as_dict()

{'metrics': [{'metric': 'ClassificationQualityMetric',
   'result': {'current': {'accuracy': 0.6498993963782697,
     'precision': 0.8529291274068005,
     'recall': 0.7079224753485209,
     'f1': 0.7736900780379041,
     'roc_auc': 0.5300361887884277,
     'log_loss': 0.671690219793583,
     'tpr': 0.7079224753485209,
     'tnr': 0.33271375464684017,
     'fpr': 0.6672862453531598,
     'fnr': 0.29207752465147907},
    'reference': None,
    'target_name': 'label'}},
  {'metric': 'ClassificationClassBalance', 'result': {}},
  {'metric': 'ClassificationConfusionMatrix',
   'result': {'current_matrix': {'labels': [0, 1],
     'values': [[179, 359], [859, 2082]]},
    'reference_matrix': None,
    'target_names': None}},
  {'metric': 'ClassificationQualityByClass',
   'result': {'columns': {'utility_columns': {'date': None,
      'id': None,
      'target': 'label',
      'prediction': 'classification_proba'},
     'num_feature_names': [],
     'cat_feature_names': [],
     'text_feature

In [82]:
params.model_dump()

{'testing': False,
 'log_to_mlflow': True,
 'experiment_name': 'first-attempt',
 'run_name': '002-u2u-cf',
 'notebook_persit_dp': '/home/dinhln/Desktop/real_time_recsys/notebooks/data/002-u2u-cf',
 'user_col': 'user_id',
 'item_col': 'parent_asin',
 'rating_col': 'rating',
 'timestamp_col': 'timestamp',
 'top_K': 100,
 'top_k': 10,
 'train_data_fp': '/home/dinhln/Desktop/real_time_recsys/data_for_ai/interim/train_sample_interactions_16407u.parquet',
 'val_data_fp': '/home/dinhln/Desktop/real_time_recsys/data_for_ai/interim/val_sample_interactions_16407u.parquet'}

In [88]:
all_params = [args]

if args.log_to_mlflow:
    for params in all_params:
        params_dict = params.model_dump()
        params_ = {f"{params.__repr_name__()}.{k}": v for k, v in params_dict.items() if k != "top_k"}
        mlflow.log_params(params_)

    mlflow.end_run()

🏃 View run 002-u2u-cf at: http://138.2.61.6:5002/#/experiments/1/runs/2a74296daf2f4f0eab643b527b2254a0
🧪 View experiment at: http://138.2.61.6:5002/#/experiments/1


## Persit

In [66]:
rec_df.to_csv(os.path.join(args.notebook_persit_dp, "rec_df.csv"), index = False)