## Setup

In [41]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import pandas as pd
import numpy as np
from pydantic import BaseModel
import sys
import os
import mlflow
from loguru import logger
from load_dotenv import load_dotenv
import time
import json

sys.path.insert(0, "..")

from src.utils.embedding_id_mapper import IDMapper
from src.algo.cf_i2i import I2ICollaborativeFiltering
from src.eval.utils import create_rec_df, create_label_df, merge_recs_with_target
from src.eval.log_metrics import log_ranking_metrics, log_classification_metrics

In [3]:
_ = load_dotenv(override = True)

## Arguments

In [None]:
# Tag this cell as `parameters`
top_n = 10
min_sim_count=4

In [None]:
class Args(BaseModel):
    testing: bool = False
    log_to_mlflow: bool = True
    experiment_name: str = "first-attempt"
    run_name: str = f"003-i2i-cf-min-sim-count={min_sim_count}"
    notebook_persit_dp: str = None
    group_name: str = "i2i-cf"
    
    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    top_K: int = 100
    top_k: int = 10
    min_sim_count: int = min_sim_count

    train_data_fp: str = os.path.abspath("../data_for_ai/interim/train_sample_interactions_16407u.parquet")
    val_data_fp: str = os.path.abspath("../data_for_ai/interim/val_sample_interactions_16407u.parquet")

    def init(self):
        self.notebook_persit_dp = os.path.abspath(f"data/{self.experiment_name}/{self.run_name}")

        if not os.environ.get("MLFLOW_TRACKING_URI"):
            self.log_to_mlflow = False
            logger.warning("MLFlow is not enabled. Turn off tracking to Mlflow.")

        if self.log_to_mlflow:
            logger.info(
                f"Setting up Mlflow experiment: {self.experiment_name}, run_name: {self.run_name}"
            )

            mlflow.set_experiment(self.experiment_name)
            mlflow.start_run(run_name=self.run_name)

        if not self.testing:
            os.makedirs(self.notebook_persit_dp, exist_ok=True)
        return self
    
args = Args().init()
print(args.model_dump_json(indent=2))

[32m2025-04-09 15:34:52.251[0m | [1mINFO    [0m | [36m__main__[0m:[36minit[0m:[36m27[0m - [1mSetting up Mlflow experiment: first-attempt, run_name: 003-i2i-cf-min-sim-count=4[0m


{
  "testing": false,
  "log_to_mlflow": true,
  "experiment_name": "first-attempt",
  "run_name": "003-i2i-cf-min-sim-count=4",
  "notebook_persit_dp": "/home/dinhln/Desktop/real_time_recsys/notebooks/data/first-attempt/003-i2i-cf-min-sim-count=4",
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "top_K": 100,
  "top_k": 10,
  "train_data_fp": "/home/dinhln/Desktop/real_time_recsys/data_for_ai/interim/train_sample_interactions_16407u.parquet",
  "val_data_fp": "/home/dinhln/Desktop/real_time_recsys/data_for_ai/interim/val_sample_interactions_16407u.parquet"
}


## Load data

In [5]:
train_df = pd.read_parquet(args.train_data_fp)
val_df = pd.read_parquet(args.val_data_fp)

assert set(val_df[args.user_col].unique()).issubset(set(train_df[args.user_col].unique())), "Validation users must be present in training users."
assert set(val_df[args.item_col].unique()).issubset(set(train_df[args.item_col].unique())), "Validation items must be present in training items."
assert train_df[args.timestamp_col].max() < val_df[args.timestamp_col].min(), "Validation data must be after training data. Otherwise, its a data contamination problem."

In [6]:
train_df.head(3)

Unnamed: 0,user_id,parent_asin,rating,timestamp
3194,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B06XKCPK5W,2.0,2012-06-11 16:41:10
3199,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B000CKVOOY,3.0,2012-08-02 02:04:13
3200,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B006GWO5WK,5.0,2012-09-15 16:34:46


## Convert user_id and item_id to indices

In [7]:
idm_path = os.path.abspath("../data_for_ai/interim/idm_16407u.json")
idm = IDMapper().load(idm_path)
idm.get_user_id(1)

'AE227WAM4NWQPJI33OPN7ZARNNZQ'

In [8]:
train_df = train_df.pipe(idm.map_indices)
val_df = val_df.pipe(idm.map_indices)

assert idm.unknown_item_index not in train_df["item_indice"].values, "Unknown item index must be present in training data."
assert idm.unknown_user_index not in train_df["user_indice"].values, "Unknown user index must be present in training data."
assert idm.unknown_item_index not in val_df["item_indice"].values, "Unknown item index must be present in validation data."
assert idm.unknown_user_index not in val_df["user_indice"].values, "Unknown user index must be present in validation data."

In [9]:
train_df.head(3)

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice
3194,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B06XKCPK5W,2.0,2012-06-11 16:41:10,3931,2905
3199,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B000CKVOOY,3.0,2012-08-02 02:04:13,3931,89
3200,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B006GWO5WK,5.0,2012-09-15 16:34:46,3931,758


In [10]:
assert train_df.groupby(args.user_col)[args.item_col].nunique().min() >= 5, "Each user must have at least five items."
assert train_df.groupby(args.item_col)[args.user_col].nunique().min() >= 10, "Each item must have at least ten users."

## Init model

In [11]:
def init_model(n_users, n_items):
    model = I2ICollaborativeFiltering(n_users, n_items)
    return model

## Overfit 1 batch

In [12]:
train_overfit_df = train_df.loc[lambda df: df["user_indice"].lt(20) & df["item_indice"].lt(1000)]

## Train model

In [13]:
users = train_df["user_indice"].values
items = train_df["item_indice"].values
rating = train_df["rating"].values

n_users = train_df["user_indice"].nunique()
n_items = train_df["item_indice"].nunique()

logger.info(f"Number of users: {n_users}, Number of items: {n_items}")

[32m2025-04-09 14:52:53.049[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mNumber of users: 16407, Number of items: 4817[0m


In [14]:
# Calculate the upper bound loss, the loss that we wanna beat
mean_rating = np.mean(rating)

naive_mse = (
    (
        val_df["rating"] 
        - val_df.assign(naive_pred = mean_rating)["naive_pred"]
    )
    .apply(lambda x: x ** 2)
    .mean()
)
logger.info(f"Naive MSE: {naive_mse} given mean rating: {mean_rating}")

[32m2025-04-09 14:52:54.812[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mNaive MSE: 1.8868801583202892 given mean rating: 4.403243531775936[0m


### Training

In [15]:
start_time = time.time()

model = init_model(n_users, n_items)
model.fit(users, items, rating)

end_time = time.time()
execution_time = end_time - start_time
logger.info(f"Execution time: {execution_time:.2f} seconds")

[32m2025-04-09 14:52:59.746[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mExecution time: 3.35 seconds[0m


# Test predict

In [43]:
val_sample_df = val_df.sample(1)
sample_user = val_sample_df["user_indice"].values
sample_item = val_sample_df["item_indice"].values
sample_rating = val_sample_df["rating"].values

logger.info(f"Sample user: {sample_user}, Sample item: {sample_item}, Sample rating: {sample_rating}")
model.predict(sample_user, sample_item, logging=True)

[32m2025-04-09 15:00:07.658[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mSample user: [7095], Sample item: [4383], Sample rating: [2.][0m
[32m2025-04-09 15:00:07.659[0m | [34m[1mDEBUG   [0m | [36msrc.algo.cf_i2i[0m:[36mforward[0m:[36m63[0m - [34m[1mItem 4383 has no similar items. Return 0 instead.[0m


array([0.5])

In [46]:
model.predict([7095], [3588], logging=True, min_sim_count=3)

[32m2025-04-09 15:00:39.053[0m | [34m[1mDEBUG   [0m | [36msrc.algo.cf_i2i[0m:[36mforward[0m:[36m82[0m - [34m[1mItem sim: [0.02969833 0.01686759 0.01126714 0.         0.         0.
 0.         0.         0.         0.        ][0m
[32m2025-04-09 15:00:39.054[0m | [34m[1mDEBUG   [0m | [36msrc.algo.cf_i2i[0m:[36mforward[0m:[36m83[0m - [34m[1mUser rating: [5. 5. 5. 5. 5. 5. 5. 1. 1. 5.][0m
[32m2025-04-09 15:00:39.055[0m | [34m[1mDEBUG   [0m | [36msrc.algo.cf_i2i[0m:[36mforward[0m:[36m84[0m - [34m[1mLogit: 5.0[0m


array([0.99330715])

In [45]:
model.recommend(sample_user, keep_interacted=False, k=args.top_K, min_sim_count=3)

Recommending items:   0%|          | 0/1 [00:00<?, ?user/s]

{'user_indice': [7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095,
  7095],
 'recommendation': [4419,
  4435,
  2450,
  3585,
  3586,
  1348,
  162,
  3588,
  2154,
  340,
  3944,
  1768,
  2146,
  4434,
  1104,
  1998,
  770,
  4625,
  4624,
  858,
  3211,
  

## Evaluate

### Recommendation metrics

In [60]:
val_user_indices = val_df["user_indice"].unique()
logger.info(f"Number of users in validation set: {len(val_user_indices)}")

[32m2025-04-09 15:07:40.299[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mNumber of users in validation set: 2424[0m


In [61]:
val_user_indices

array([12159,   870, 13529, ...,  5497,  6427,  2423])

In [None]:
recommendations = model.recommend(val_user_indices, keep_interacted=True, top_n= top_n ,k=args.top_K, min_sim_count=args.min_sim_count)

Recommending items:   0%|          | 0/2424 [00:00<?, ?user/s]

In [63]:
rec_df = pd.DataFrame(recommendations).pipe(create_rec_df, idm = idm)
rec_df.head(3)

Unnamed: 0,user_indice,recommendation,score,rec_ranking,user_id,parent_asin
0,12159,2779,0.992503,1.0,AGZE3IYHOEGKUTJZSQCSFSQ4IFFQ,B01M7VOJFW
1,12159,3719,0.991558,2.0,AGZE3IYHOEGKUTJZSQCSFSQ4IFFQ,B07VJQ5VBV
2,12159,1925,0.991164,3.0,AGZE3IYHOEGKUTJZSQCSFSQ4IFFQ,B00R92CL5E


In [64]:
rec_df.loc[rec_df["user_id"] == "AHZZM7BCJAF2UEMMBHZCLXBB2SVA"]

Unnamed: 0,user_indice,recommendation,score,rec_ranking,user_id,parent_asin
210000,16403,4242,0.993307,1.0,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B09LSJS48Q
210001,16403,3271,0.993307,2.0,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B07C1263TQ
210002,16403,3089,0.993307,3.0,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B075X8471B
210003,16403,4516,0.993307,4.0,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B0BGNG1294
210004,16403,2811,0.993307,5.0,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B01MTF2Z37
...,...,...,...,...,...,...
210095,16403,1532,0.500000,96.0,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B00IF70R7Q
210096,16403,1525,0.500000,97.0,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B00ICDAAXO
210097,16403,1526,0.500000,98.0,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B00ICDABYC
210098,16403,1527,0.500000,99.0,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B00IEM6DBG


In [65]:
label_df = create_label_df(val_df)
label_df

Unnamed: 0,user_id,parent_asin,rating,rating_rank
12855326,AEMYBWDN67IB5IBTMHLHN76V4QHQ,B091K4WYD1,4.0,1.0
4327569,AEJIJK6DNQRSGOSMSRTSIVHKEWRQ,B07KTYJ769,5.0,1.0
18287739,AHQSVMSTGTE5YW577ATYHRWWA3EQ,B07GZFM1ZM,3.0,1.0
5917697,AFQWFCSD3NNG5LIG6SO7DCUKCIJA,B08F1P3BCC,5.0,1.0
10962783,AGVPCCFOHWKJOO5B6G266N2ZHALQ,B09K4R9KDM,5.0,1.0
...,...,...,...,...
3348993,AFIGGCK7HZAP24TECVJXGOFP5IIA,B09G3MBH6V,1.0,8.0
7915094,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B004FV4ROA,1.0,8.0
3832028,AFKERAMSXU4MWO3H53R7DEFOHUVQ,B08J9NQ6CS,4.0,9.0
2102221,AEN2KQVSR5TWRXNQS3OTFT4EZQCA,B0BRT7XFM5,5.0,9.0


In [66]:
label_df.loc[label_df["user_id"] == "AHZZM7BCJAF2UEMMBHZCLXBB2SVA"]

Unnamed: 0,user_id,parent_asin,rating,rating_rank
19254013,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B075QC3TZY,1.0,1.0


In [67]:
eval_df = merge_recs_with_target(rec_df, label_df, k=args.top_K)
eval_df

Unnamed: 0,user_indice,recommendation,score,rec_ranking,user_id,parent_asin,rating,rating_rank
78,8.0,4243.0,0.993307,1,AE24AB4DW5KYK3F5DYOT5VPW2VLA,B09LYWCHJX,0,
13,8.0,785.0,0.993307,2,AE24AB4DW5KYK3F5DYOT5VPW2VLA,B006ZT4VA0,0,
73,8.0,4106.0,0.993307,3,AE24AB4DW5KYK3F5DYOT5VPW2VLA,B092WMCV57,0,
48,8.0,3074.0,0.993307,4,AE24AB4DW5KYK3F5DYOT5VPW2VLA,B075JZNHTD,0,
58,8.0,3286.0,0.993307,5,AE24AB4DW5KYK3F5DYOT5VPW2VLA,B07C5QRJZD,0,
...,...,...,...,...,...,...,...,...
245541,16403.0,1525.0,0.500000,97,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B00ICDAAXO,0,
245542,16403.0,1526.0,0.500000,98,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B00ICDABYC,0,
245543,16403.0,1527.0,0.500000,99,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B00IEM6DBG,0,
245544,16403.0,1528.0,0.500000,100,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B00IEYHMPK,0,


In [68]:
ranking_report = log_ranking_metrics(args, eval_df)

In [56]:
ranking_report.as_dict()

{'metrics': [{'metric': 'NDCGKMetric',
   'result': {'k': 10,
    'current': 1     0.001650
    2     0.001751
    3     0.001877
    4     0.001972
    5     0.002207
    6     0.002269
    7     0.002407
    8     0.002537
    9     0.002661
    10    0.003138
    dtype: float64,
    'current_value': 0.002661314577453662,
    'reference': None,
    'reference_value': None}},
  {'metric': 'RecallTopKMetric',
   'result': {'k': 100, 'current': 0     0.001238
    1     0.001856
    2     0.002063
    3     0.002269
    4     0.002716
            ...   
    95    0.061765
    96    0.062315
    97    0.062522
    98    0.062934
    99    0.063278
    Length: 100, dtype: float64, 'current_value': 0.06327793493635078, 'reference': None, 'reference_value': None}},
  {'metric': 'PrecisionTopKMetric',
   'result': {'k': 100,
    'current': 0     0.001650
    1     0.001238
    2     0.000963
    3     0.000825
    4     0.000908
            ...   
    95    0.000975
    96    0.000974
    97 

## Classification metrics (WIP. Do it later when we have negative sampling samples)

In [None]:
# val_user_indices = val_df["user_indice"].values
# val_item_indices = val_df["item_indice"].values

In [None]:
# classifications = model.predict(val_user_indices, val_item_indices)

In [None]:
# min_rel_score = 2
# eval_classification_df = val_df.assign(
#     classification_proba=classifications,
#     label=lambda df: df[args.rating_col].gt(min_rel_score).astype(int),
# )
# eval_classification_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,classification_proba,label
4668,AGZE3IYHOEGKUTJZSQCSFSQ4IFFQ,B0B787CN26,5.0,2021-10-27 19:43:57.873,12159,4440,0.500000,1
10425,AEANO5BIASSZNFWNXBR2ECHCPJQQ,B0002MQGOA,5.0,2021-02-02 14:20:48.424,870,59,0.993307,1
10426,AEANO5BIASSZNFWNXBR2ECHCPJQQ,B07HZLHPKP,5.0,2021-03-08 13:56:57.795,870,3472,0.988148,1
13265,AHDXCFTV7RS3AM6E2TRPWOG3A33Q,B07QWPVZJY,3.0,2021-12-11 00:34:19.152,13529,3630,0.993307,1
14423,AEFHRRLFCZQ3TWNYCBA7UD3NIXCA,B00D96J8IM,1.0,2021-10-17 20:54:19.325,1481,1262,0.993307,0
...,...,...,...,...,...,...,...,...
33760091,AHIIISHZP6YAVVHMDEBLJ5CWZ7ZA,B0BZ62FQ13,3.0,2021-07-16 17:08:55.044,14144,4693,0.993307,1
34470392,AFTE3G43QHXWD3DJGDCI2DHEWQJQ,B08DMXDPW5,5.0,2021-01-14 01:48:09.423,7343,3923,0.500000,1
35019360,AFENZZDPVUYFVBS47YDOWJCDYBSQ,B09XBT6DS9,4.0,2021-12-05 00:35:40.874,5497,4335,0.993307,1
35323250,AFMBZYPDAXT5VO3ME67HW5Q5TAOQ,B097KBF8JK,5.0,2022-02-18 11:32:46.732,6427,4147,0.500000,1


In [None]:
# eval_classification_df.loc[lambda df: (df["classification_proba"] > 0.88) & (df["classification_proba"] > 0.5)]

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,classification_proba,label
10425,AEANO5BIASSZNFWNXBR2ECHCPJQQ,B0002MQGOA,5.0,2021-02-02 14:20:48.424,870,59,0.993307,1
10426,AEANO5BIASSZNFWNXBR2ECHCPJQQ,B07HZLHPKP,5.0,2021-03-08 13:56:57.795,870,3472,0.988148,1
13265,AHDXCFTV7RS3AM6E2TRPWOG3A33Q,B07QWPVZJY,3.0,2021-12-11 00:34:19.152,13529,3630,0.993307,1
14423,AEFHRRLFCZQ3TWNYCBA7UD3NIXCA,B00D96J8IM,1.0,2021-10-17 20:54:19.325,1481,1262,0.993307,0
23297,AGK63CGC7N3MB2QN56EZNDZYNGIQ,B07HZLHPKP,3.0,2021-01-14 20:35:43.045,10201,3472,0.986969,1
...,...,...,...,...,...,...,...,...
32499474,AH5HVRS323QOWLMMWCEX3GJXNKLA,B07ZDXHZ6J,1.0,2021-01-24 15:11:12.439,12657,3784,0.987318,0
32967153,AG3JRUUWEDPQOLULW4QI5LE6LYSQ,B06W2PMP6C,5.0,2021-01-28 17:20:28.043,8393,2881,0.993307,1
33479788,AE2GH2QNHK6AVGUOTECE6FOECE2A,B091K4WYD1,2.0,2021-06-24 18:03:40.116,42,4086,0.993307,0
33760091,AHIIISHZP6YAVVHMDEBLJ5CWZ7ZA,B0BZ62FQ13,3.0,2021-07-16 17:08:55.044,14144,4693,0.993307,1


In [None]:
# classification_report = log_classification_metrics(args, eval_classification_df)

In [None]:
# classification_report.as_dict()

{'metrics': [{'metric': 'ClassificationQualityMetric',
   'result': {'current': {'accuracy': 0.6498993963782697,
     'precision': 0.8529291274068005,
     'recall': 0.7079224753485209,
     'f1': 0.7736900780379041,
     'roc_auc': 0.5300361887884277,
     'log_loss': 0.671690219793583,
     'tpr': 0.7079224753485209,
     'tnr': 0.33271375464684017,
     'fpr': 0.6672862453531598,
     'fnr': 0.29207752465147907},
    'reference': None,
    'target_name': 'label'}},
  {'metric': 'ClassificationClassBalance', 'result': {}},
  {'metric': 'ClassificationConfusionMatrix',
   'result': {'current_matrix': {'labels': [0, 1],
     'values': [[179, 359], [859, 2082]]},
    'reference_matrix': None,
    'target_names': None}},
  {'metric': 'ClassificationQualityByClass',
   'result': {'columns': {'utility_columns': {'date': None,
      'id': None,
      'target': 'label',
      'prediction': 'classification_proba'},
     'num_feature_names': [],
     'cat_feature_names': [],
     'text_feature

## Persit

In [69]:
all_params = [args]

if args.log_to_mlflow:
    for params in all_params:
        params_dict = params.model_dump()
        params_ = {f"{params.__repr_name__()}.{k}": v for k, v in params_dict.items() if k != "top_k"}
        mlflow.log_params(params_)

    mlflow.end_run()

üèÉ View run 003-i2i-cf-min-sim-count=4 at: http://localhost:5002/#/experiments/9/runs/56184a6cc7cf4471a294d27474060f06
üß™ View experiment at: http://localhost:5002/#/experiments/9


In [70]:
rec_df.to_csv(os.path.join(args.notebook_persit_dp, "rec_df.csv"), index = False)