## Setup

In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules before
# each cell executes, so edits to the local `src` package take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import json
import os
import sys
import time
from typing import Optional

import mlflow
import numpy as np
import pandas as pd
from load_dotenv import load_dotenv
from loguru import logger
from pydantic import BaseModel

sys.path.insert(0, "..")

from src.eval.utils import create_label_df, merge_recs_with_target
from src.eval.log_metrics import log_ranking_metrics, log_classification_metrics

In [3]:
# Load environment variables from a .env file before any configuration reads os.environ
# (Args.init checks MLFLOW_TRACKING_URI).
# NOTE(review): override=True presumably lets .env values replace variables that are
# already set in the process environment — confirm against the load_dotenv package docs.
# The return value is bound to `_` so the cell produces no output.
_ = load_dotenv(override = True)

## Arguments

In [4]:
class Args(BaseModel):
    """Configuration for one run of the popularity-baseline notebook.

    Create an instance and call ``init()`` on it: that derives the output directory,
    decides whether MLflow tracking can be used, and opens the MLflow run.
    """

    testing: bool = False  # when True, init() does not create the output directory
    log_to_mlflow: bool = True  # init() forces this to False if MLFLOW_TRACKING_URI is unset
    experiment_name: str = "first-attempt"
    run_name: str = "001-popular-baseline"
    # Output directory of this run; stays None until init() derives it from
    # experiment_name / run_name (hence Optional rather than plain str).
    notebook_persit_dp: Optional[str] = None

    # Column names of the interaction tables.
    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    group_name: str = "baseline"

    top_K: int = 100  # number of most popular items recommended to every user
    top_k: int = 10  # NOTE(review): presumably the cutoff used by the ranking metrics — confirm in src.eval

    # Input files. The relative paths are resolved against the working directory
    # at the moment this class body is executed.
    train_data_fp: str = os.path.abspath("../data_for_ai/interim/train_sample_interactions_16407u.parquet")
    val_data_fp: str = os.path.abspath("../data_for_ai/interim/val_sample_interactions_16407u.parquet")

    def init(self):
        """Finalize the configuration and start experiment tracking.

        Side effects:
            * sets ``notebook_persit_dp`` to ``data/<experiment_name>/<run_name>``
              (absolute path) and, unless ``testing`` is True, creates that directory;
            * disables ``log_to_mlflow`` when ``MLFLOW_TRACKING_URI`` is not set;
            * when logging is enabled, selects the MLflow experiment and starts a run.

        Returns:
            The same instance, so the call can be chained: ``Args().init()``.
        """
        self.notebook_persit_dp = os.path.abspath(f"data/{self.experiment_name}/{self.run_name}")

        if not os.environ.get("MLFLOW_TRACKING_URI"):
            self.log_to_mlflow = False
            logger.warning("MLFlow is not enabled. Turn off tracking to Mlflow.")

        if self.log_to_mlflow:
            logger.info(
                f"Setting up Mlflow experiment: {self.experiment_name}, run_name: {self.run_name}"
            )

            # Re-executing this cell without a kernel restart leaves the previous run
            # open, and mlflow.start_run() raises while a run is active. Close it first.
            if mlflow.active_run() is not None:
                logger.warning("An MLflow run is already active; ending it before starting a new one.")
                mlflow.end_run()

            mlflow.set_experiment(self.experiment_name)
            mlflow.start_run(run_name=self.run_name)

        if not self.testing:
            os.makedirs(self.notebook_persit_dp, exist_ok=True)
        return self


args = Args().init()
print(args.model_dump_json(indent=2))

[32m2025-06-29 10:37:02.103[0m | [1mINFO    [0m | [36m__main__[0m:[36minit[0m:[36m28[0m - [1mSetting up Mlflow experiment: first-attempt, run_name: 001-popular-baseline[0m


{
  "testing": false,
  "log_to_mlflow": true,
  "experiment_name": "first-attempt",
  "run_name": "001-popular-baseline",
  "notebook_persit_dp": "/home/dinhln/Desktop/real_time_recsys/notebooks/data/first-attempt/001-popular-baseline",
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "group_name": "baseline",
  "top_K": 100,
  "top_k": 10,
  "train_data_fp": "/home/dinhln/Desktop/real_time_recsys/data_for_ai/interim/train_sample_interactions_16407u.parquet",
  "val_data_fp": "/home/dinhln/Desktop/real_time_recsys/data_for_ai/interim/val_sample_interactions_16407u.parquet"
}


## Load data

In [5]:
# Read the training and validation interaction tables from the configured parquet files.
train_df = pd.read_parquet(args.train_data_fp)
val_df = pd.read_parquet(args.val_data_fp)

# Every validation user must already be known from training.
val_users = set(val_df[args.user_col].unique())
train_users = set(train_df[args.user_col].unique())
assert val_users <= train_users, "Validation users must be present in training users."

# Every validation item must already be known from training.
val_items = set(val_df[args.item_col].unique())
train_items = set(train_df[args.item_col].unique())
assert val_items <= train_items, "Validation items must be present in training items."

# The split must be strictly chronological: the last training event precedes the first validation event.
last_train_ts = train_df[args.timestamp_col].max()
first_val_ts = val_df[args.timestamp_col].min()
assert first_val_ts > last_train_ts, "Validation data must be after training data. Otherwise, its a data contamination problem."

In [6]:
# Preview the first three training interactions to check the loaded columns.
train_df.head(3)

Unnamed: 0,user_id,parent_asin,rating,timestamp
3194,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B06XKCPK5W,2.0,2012-06-11 16:41:10
3199,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B000CKVOOY,3.0,2012-08-02 02:04:13
3200,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B006GWO5WK,5.0,2012-09-15 16:34:46


## Get popular df

In [7]:
# Number of training interactions per item (one row per item, count in the "size" column).
item_counts = train_df.groupby(args.item_col, as_index=False).size()

# Popularity score: interaction count relative to the most-interacted item, so the top item scores 1.0.
max_count = item_counts["size"].max()
scored_items = item_counts.assign(score=item_counts["size"] / max_count)

# Dense 1..N ranking by descending score; method="first" breaks ties by row order
# so every item receives a unique rank.
ranked_items = scored_items.assign(
    rec_ranking=scored_items["score"].rank(method="first", ascending=False).astype(int)
)

popular_df = ranked_items.sort_values(by="rec_ranking")

# Keep only the top_K best-ranked items as the recommendation list.
top_popular_df = popular_df.head(args.top_K)

In [8]:
# Display the top_K most-interacted items with their score and rank.
top_popular_df

Unnamed: 0,parent_asin,size,score,rec_ranking
2694,B01K8B8YA8,934,1.000000,1
3089,B075X8471B,862,0.922912,2
2237,B010BWYDYA,597,0.639186,3
2253,B011BRUOMO,586,0.627409,4
3678,B07S764D9V,544,0.582441,5
...,...,...,...,...
2770,B01M3ULMWP,123,0.131692,96
758,B006GWO5WK,122,0.130621,97
3265,B07BX4X77P,122,0.130621,98
4692,B0BZ5KPQZK,122,0.130621,99


In [9]:
# Give every validation user the same list of popular items: the cartesian product of
# the distinct validation users with the top_K popular items. how="cross" expresses this
# directly, without a throw-away join key, and yields no rows (instead of NaN
# recommendations) if the popular list is empty.
rec_df = (
    val_df[[args.user_col]]
    .drop_duplicates()
    .merge(top_popular_df, how="cross")[
        [args.user_col, args.item_col, "rec_ranking", "score"]
    ]
)

rec_df

Unnamed: 0,user_id,parent_asin,rec_ranking,score
0,AGZE3IYHOEGKUTJZSQCSFSQ4IFFQ,B01K8B8YA8,1,1.000000
1,AGZE3IYHOEGKUTJZSQCSFSQ4IFFQ,B075X8471B,2,0.922912
2,AGZE3IYHOEGKUTJZSQCSFSQ4IFFQ,B010BWYDYA,3,0.639186
3,AGZE3IYHOEGKUTJZSQCSFSQ4IFFQ,B011BRUOMO,4,0.627409
4,AGZE3IYHOEGKUTJZSQCSFSQ4IFFQ,B07S764D9V,5,0.582441
...,...,...,...,...
242395,AEMSEOMPBE6KWOZWCX4VE4E2E7HQ,B01M3ULMWP,96,0.131692
242396,AEMSEOMPBE6KWOZWCX4VE4E2E7HQ,B006GWO5WK,97,0.130621
242397,AEMSEOMPBE6KWOZWCX4VE4E2E7HQ,B07BX4X77P,98,0.130621
242398,AEMSEOMPBE6KWOZWCX4VE4E2E7HQ,B0BZ5KPQZK,99,0.130621


## Evaluate

### Recommendation metrics

In [10]:
# Build the ground-truth label frame from the validation interactions.
# NOTE(review): the displayed result has a rating_rank column, presumably each user's
# validation items ranked by rating — confirm in src.eval.utils.create_label_df.
label_df = create_label_df(val_df)

In [11]:
# Display the label frame built from the validation interactions.
label_df

Unnamed: 0,user_id,parent_asin,rating,rating_rank
12855326,AEMYBWDN67IB5IBTMHLHN76V4QHQ,B091K4WYD1,4.0,1.0
4327569,AEJIJK6DNQRSGOSMSRTSIVHKEWRQ,B07KTYJ769,5.0,1.0
18287739,AHQSVMSTGTE5YW577ATYHRWWA3EQ,B07GZFM1ZM,3.0,1.0
5917697,AFQWFCSD3NNG5LIG6SO7DCUKCIJA,B08F1P3BCC,5.0,1.0
10962783,AGVPCCFOHWKJOO5B6G266N2ZHALQ,B09K4R9KDM,5.0,1.0
...,...,...,...,...
3348993,AFIGGCK7HZAP24TECVJXGOFP5IIA,B09G3MBH6V,1.0,8.0
7915094,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B004FV4ROA,1.0,8.0
3832028,AFKERAMSXU4MWO3H53R7DEFOHUVQ,B08J9NQ6CS,4.0,9.0
2102221,AEN2KQVSR5TWRXNQS3OTFT4EZQCA,B0BRT7XFM5,5.0,9.0


In [12]:
# Combine the recommendations with the validation labels, passing top_K as the cutoff k.
# NOTE(review): in the displayed result, recommended items without a validation
# interaction appear with rating 0 and an empty rating_rank — confirm the exact join
# semantics in src.eval.utils.merge_recs_with_target.
eval_df = merge_recs_with_target(
    rec_df, label_df, k=args.top_K
)

In [13]:
# Display the merged recommendation/label frame that is passed to the ranking metrics.
eval_df

Unnamed: 0,user_id,parent_asin,rec_ranking,score,rating,rating_rank
41,AE24AB4DW5KYK3F5DYOT5VPW2VLA,B01K8B8YA8,1,1.000000,0,
56,AE24AB4DW5KYK3F5DYOT5VPW2VLA,B075X8471B,2,0.922912,0,
37,AE24AB4DW5KYK3F5DYOT5VPW2VLA,B010BWYDYA,3,0.639186,0,
38,AE24AB4DW5KYK3F5DYOT5VPW2VLA,B011BRUOMO,4,0.627409,0,
69,AE24AB4DW5KYK3F5DYOT5VPW2VLA,B07S764D9V,5,0.582441,0,
...,...,...,...,...,...,...
245331,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B006GWO5WK,97,0.130621,0,
245380,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B07BX4X77P,98,0.130621,0,
245418,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B0BZ5KPQZK,99,0.130621,0,
245405,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B09N42GVF7,100,0.128480,0,


In [14]:
# Compute the ranking metrics for the merged frame and keep the returned report.
# NOTE(review): the helper receives the whole args object, so it presumably also logs to
# MLflow when args.log_to_mlflow is set — confirm in src.eval.log_metrics.
ranking_report = log_ranking_metrics(args, eval_df)

In [15]:
# Show the report contents as a plain dictionary.
ranking_report.as_dict()

{'metrics': [{'metric': 'NDCGKMetric',
   'result': {'k': 10,
    'current': 1     0.001650
    2     0.001430
    3     0.001497
    4     0.001464
    5     0.001876
    6     0.010217
    7     0.011207
    8     0.011204
    9     0.011203
    10    0.011592
    dtype: float64,
    'current_value': 0.01120267078839776,
    'reference': None,
    'reference_value': None}},
  {'metric': 'RecallTopKMetric',
   'result': {'k': 100, 'current': 0     0.000884
    1     0.001297
    2     0.001503
    3     0.001503
    4     0.002469
            ...   
    95    0.125617
    96    0.125617
    97    0.126030
    98    0.128986
    99    0.129227
    Length: 100, dtype: float64, 'current_value': 0.12922674707946982, 'reference': None, 'reference_value': None}},
  {'metric': 'PrecisionTopKMetric',
   'result': {'k': 100,
    'current': 0     0.001650
    1     0.001238
    2     0.000963
    3     0.000722
    4     0.000908
            ...   
    95    0.001908
    96    0.001888
    97  

### Classification metrics (WIP)

## Persist

In [16]:
# Parameter objects whose fields are recorded on the MLflow run.
all_params = [args]

if args.log_to_mlflow:
    for param_obj in all_params:
        # Prefix each key with the object's class name, e.g. "Args.user_col".
        prefix = param_obj.__repr_name__()
        flat_params = {}
        for field_name, field_value in param_obj.model_dump().items():
            # top_k is left out of the logged parameters.
            # NOTE(review): presumably because it differs from top_K only by case and the
            # two keys could collide in the tracking backend — confirm.
            if field_name == "top_k":
                continue
            flat_params[f"{prefix}.{field_name}"] = field_value
        mlflow.log_params(flat_params)

    # Close the run that Args.init() opened.
    mlflow.end_run()

🏃 View run 001-popular-baseline at: http://138.2.61.6:5002/#/experiments/2/runs/bd7e4dace82b43dda141eeddbdb325dd
🧪 View experiment at: http://138.2.61.6:5002/#/experiments/2


In [17]:
# Save the popular-item list into this run's output directory.
# Args.init() only creates that directory when args.testing is False, so writing
# unconditionally fails in testing mode; skip the write there instead.
if args.testing:
    logger.info("Testing mode: skipping persistence of top_popular_df.csv.")
else:
    top_popular_df.to_csv(os.path.join(args.notebook_persit_dp, "top_popular_df.csv"), index=False)