In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
from typing import Optional

from loguru import logger
from dotenv import load_dotenv
from pydantic import BaseModel
from tqdm.notebook import tqdm

import numpy as np # required for the scikit-learn pipeline to work
import pandas as pd
import plotly.express as px
import mlflow

load_dotenv()

sys.path.insert(0, '..')

from src.viz import blueq_colors

# Controller

In [3]:
class Args(BaseModel):
    """Run configuration for the popularity-baseline notebook.

    Instantiate with ``Args().init()``: ``init`` resolves the persistence
    directory and, when tracking is enabled, opens the MLflow run.
    """

    # NOTE(review): `testing`, `random_seed`, `top_k` and `batch_size` are not read
    # anywhere in this notebook's visible cells; presumably consumed by the
    # src.eval helpers that receive `args` — confirm.
    testing: bool = False
    # Turned off automatically by init() when MLFLOW_TRACKING_URI is not set.
    log_to_mlflow: bool = True
    experiment_name: str = "FSDS RecSys - L5 - Reco Algo"
    run_name: str = '058-baseline-popular'
    # Populated by init(); stays None until init() has run.
    notebook_persist_dp: Optional[str] = None
    random_seed: int = 41

    # Column names of the interaction frames.
    user_col: str = 'user_id'
    item_col: str = 'parent_asin'
    rating_col: str = 'rating'
    timestamp_col: str = 'timestamp'

    # Number of most-popular items recommended to every user.
    top_K: int = 100
    top_k: int = 10

    batch_size: int = 128

    def init(self):
        """Resolve derived settings and start MLflow tracking if enabled.

        Sets ``notebook_persist_dp`` to the absolute path of ``data/<run_name>``
        (relative to the current working directory), disables MLflow logging
        when ``MLFLOW_TRACKING_URI`` is missing from the environment, and
        otherwise selects the experiment and starts a run.

        Returns:
            The same instance, so the call can be chained after the constructor.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")

        if not os.environ.get("MLFLOW_TRACKING_URI"):
            logger.warning(
                "Environment variable MLFLOW_TRACKING_URI is not set. Setting self.log_to_mlflow to false."
            )
            self.log_to_mlflow = False

        if self.log_to_mlflow:
            logger.info(
                f"Setting up MLflow experiment {self.experiment_name} - run {self.run_name}..."
            )
            import mlflow

            mlflow.set_experiment(self.experiment_name)
            # NOTE(review): the run opened here is only closed by the clean-up cell;
            # re-running this cell while a run is still active is expected to fail
            # in mlflow.start_run — confirm against the MLflow version in use.
            mlflow.start_run(run_name=self.run_name)

        return self
    
# Build the configuration, then let init() resolve paths and set up MLflow.
# init() returns the instance itself, so calling it separately is equivalent.
args = Args()
args.init()

# Echo the resolved settings so they are recorded with the notebook output.
print(args.model_dump_json(indent=2))

2024-09-20 20:16:47.122 | INFO     | __main__:init:29 - Setting up MLflow experiment FSDS RecSys - L5 - Reco Algo - run 058-baseline-popular...


{
  "testing": false,
  "log_to_mlflow": true,
  "experiment_name": "FSDS RecSys - L5 - Reco Algo",
  "run_name": "058-baseline-popular",
  "notebook_persist_dp": "/Users/dvq/frostmourne/reco-algo/notebooks/data/058-baseline-popular",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "top_K": 100,
  "top_k": 10,
  "batch_size": 128
}


# Prep data

In [4]:
# Feature frames for the train and validation splits (paths are relative to the notebook directory).
train_path = "../data/train_features_neg_df.parquet"
val_path = "../data/val_features_neg_df.parquet"

train_df = pd.read_parquet(train_path)
val_df = pd.read_parquet(val_path)

In [5]:
# Short aliases for the configured column names, used by the cells below.
user_col, item_col, rating_col, timestamp_col = (
    args.user_col,
    args.item_col,
    args.rating_col,
    args.timestamp_col,
)

# Implement

In [6]:
# Popularity = number of training rows per item.
item_counts = train_df.groupby(item_col, as_index=False).size()

# Scale counts into (0, 1] by dividing by the most frequent item's count.
popularity = item_counts['size'] / item_counts['size'].max()

popular_items_df = (
    item_counts
    .assign(
        score=popularity,
        # method='first' breaks ties by row order, so every item gets a unique rank.
        rec_ranking=popularity.rank(method='first', ascending=False).astype(int),
    )
    .sort_values('rec_ranking')
)

# Keep only the top_K best-ranked items as the recommendation list.
top_popular_items_df = popular_items_df.head(args.top_K)

top_popular_items_df

Unnamed: 0,parent_asin,size,score,rec_ranking
1202,B0086VPUHI,924,1.000000,1
2578,B07YBXFDYN,860,0.930736,2
2132,B01N3ASPNV,840,0.909091,3
1339,B00BGA9WK2,707,0.765152,4
1353,B00BN5T30E,667,0.721861,5
...,...,...,...,...
1392,B00CMQTVUA,204,0.220779,96
1531,B00HRW8PRO,204,0.220779,97
718,B002Z01QO2,203,0.219697,98
1037,B0053BCML6,203,0.219697,99


In [7]:
# Non-personalised baseline: every distinct validation user receives the same
# top-K popular items. how='cross' builds the user x item Cartesian product
# directly, replacing the dummy-key (`tmp`) left merge.
recommendations_df = pd.merge(
    val_df[[user_col]].drop_duplicates(),
    top_popular_items_df,
    how='cross',
)[[user_col, item_col, 'score', 'rec_ranking']]

recommendations_df

Unnamed: 0,user_id,parent_asin,score,rec_ranking
0,AEGX7X6YKSB53B57U4RDNZMHQB2A,B0086VPUHI,1.000000,1
1,AEGX7X6YKSB53B57U4RDNZMHQB2A,B07YBXFDYN,0.930736,2
2,AEGX7X6YKSB53B57U4RDNZMHQB2A,B01N3ASPNV,0.909091,3
3,AEGX7X6YKSB53B57U4RDNZMHQB2A,B00BGA9WK2,0.765152,4
4,AEGX7X6YKSB53B57U4RDNZMHQB2A,B00BN5T30E,0.721861,5
...,...,...,...,...
30895,AECM3QY5M6F34S55J4AGDOUBB5MA,B00CMQTVUA,0.220779,96
30896,AECM3QY5M6F34S55J4AGDOUBB5MA,B00HRW8PRO,0.220779,97
30897,AECM3QY5M6F34S55J4AGDOUBB5MA,B002Z01QO2,0.219697,98
30898,AECM3QY5M6F34S55J4AGDOUBB5MA,B0053BCML6,0.219697,99


# Evaluate

## Ranking metrics

In [8]:
from src.eval import create_label_df, merge_recs_with_target
from src.eval import log_ranking_metrics

In [9]:
# Build the ground-truth frame from the validation split.
# NOTE(review): create_label_df comes from src.eval; judging by the displayed
# output it adds a per-user `rating_rank` column — confirm against the helper.
label_df = create_label_df(
    val_df,
    user_col=user_col,
    item_col=item_col,
    rating_col=rating_col,
    timestamp_col=timestamp_col,
)
label_df

Unnamed: 0,user_id,parent_asin,rating,rating_rank
185,AELH2ZF5QSSIFBF6WXAZLCF7JIWA,B0C6DH316S,2.0,1.0
760,AHTWEK3GO4EZDJ2VIQA76IJETIIQ,B001EYUXWQ,3.0,1.0
106,AGE7WLBNOCG6OC4UZYEBWWNEVM3A,B08N7QBVBJ,3.0,1.0
776,AGLA37EWXJQRZVGWU3FHO4XHHLLA,B07V5CFMY4,4.0,1.0
371,AHO4T4RI3HBDLAMJ52QM6HE6QWQA,B0711ZH7GX,4.0,1.0
...,...,...,...,...
178,AG4RCXKPTC6QRORJLUSBY4SO2IAA,B00Z9TJHEC,0.0,12.0
220,AHY7NSZXW4IUPQ2E4BPUOXUVP3UQ,B00DOD51UY,0.0,13.0
198,AHY7NSZXW4IUPQ2E4BPUOXUVP3UQ,B000UW21A0,0.0,14.0
350,AHY7NSZXW4IUPQ2E4BPUOXUVP3UQ,B01N33LIXR,0.0,15.0


In [10]:
# Join the recommendations with the ground-truth labels, using the full top_K list as k.
eval_df = merge_recs_with_target(
    recommendations_df,
    label_df,
    k=args.top_K,
    user_col=user_col,
    item_col=item_col,
    rating_col=rating_col,
)
eval_df

Unnamed: 0,user_id,parent_asin,score,rec_ranking,rating,rating_rank
22,AE23PXMECDNZRSKAQV6CAIEN67UQ,B0086VPUHI,1.000000,1,0,
90,AE23PXMECDNZRSKAQV6CAIEN67UQ,B07YBXFDYN,0.930736,2,0,
78,AE23PXMECDNZRSKAQV6CAIEN67UQ,B01N3ASPNV,0.909091,3,0,
29,AE23PXMECDNZRSKAQV6CAIEN67UQ,B00BGA9WK2,0.765152,4,0,
33,AE23PXMECDNZRSKAQV6CAIEN67UQ,B00BN5T30E,0.721861,5,0,
...,...,...,...,...,...,...
31521,AHZT6GNY6QZ5RHEDTQZNXAGNFFBA,B00HRW8PRO,0.220779,97,0,
31476,AHZT6GNY6QZ5RHEDTQZNXAGNFFBA,B002Z01QO2,0.219697,98,0,
31484,AHZT6GNY6QZ5RHEDTQZNXAGNFFBA,B0053BCML6,0.219697,99,0,
31557,AHZT6GNY6QZ5RHEDTQZNXAGNFFBA,B07YBX7Y3P,0.219697,100,0,


In [11]:
# Ranking evaluation of the popularity baseline on the merged recommendation/label frame.
# NOTE(review): the helper is defined in src.eval; presumably it reports to MLflow
# depending on args.log_to_mlflow — confirm against its implementation.
log_ranking_metrics(args, eval_df)

## Classification metrics

In [12]:
from evidently.metric_preset import ClassificationPreset
from src.eval import log_classification_metrics

In [13]:
# Score each validation interaction with its item's popularity score and derive
# a binary label from the rating.
eval_classification_df = (
    pd.merge(
        val_df,
        popular_items_df[[args.item_col, 'score']],
        on=[args.item_col],
        how='left',
        # Each item must appear at most once on the popularity side.
        validate="m:1"
    )
    .assign(
        # Items absent from the training data have no popularity row; treat them
        # as zero-popularity instead of passing NaN predictions to the metrics.
        score=lambda df: df['score'].fillna(0.0),
        # Use the configured rating column rather than a hard-coded name.
        label=lambda df: df[args.rating_col].gt(0).astype(int),
    )
)
eval_classification_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,main_category,title,description,categories,price,user_indice,item_indice,item_sequence,score,label
0,AEGX7X6YKSB53B57U4RDNZMHQB2A,B0BL65X86R,5.0,1651558065496,Video Games,$25 PlayStation Store Gift Card [Digital Code],[Redeem against anything on PlayStation Store....,"[Video Games, Online Game Services, PlayStatio...",25.0,2610,1220,"[-1, -1, -1, -1, 205, 1291, 494, 1144, 3, 579]",0.087662,1
1,AG6MENO5OO7LRCH27J47ZODEZN6Q,B00EQNP8F4,0.0,1657815635870,Video Games,Microsoft Xbox LIVE 12 Month Gold Membership (...,[Gaming is better with Xbox Live Gold. Join th...,"[Video Games, Online Game Services, Xbox Live,...",,50,147,"[-1, -1, 2569, 1380, 1344, 1342, 2224, 1328, 5...",0.218615,0
2,AGSNYY5XOKFLVXUZMGXRS7DVL7EQ,B077GG9D5D,4.0,1645878084196,Video Games,DualShock 4 Wireless Controller for PlayStatio...,[The DualShock 4 Wireless Controller features ...,"[Video Games, PlayStation 4, Accessories, Cont...",57.0,377,684,"[2365, 1381, 2109, 1877, 2792, 2754, 2558, 147...",0.601732,1
3,AGXKJPHUFNOZGCDLXQRKW4OSY7JA,B012F20ZY6,0.0,1644874447517,Video Games,New Super Mario Bros. U + New Super Luigi U - ...,[(2 Games on 1 Disc) New Super Mario Bros. U: ...,"[Video Games, Legacy Systems, Nintendo Systems...",60.94,8451,170,"[-1, -1, 1762, 2792, 164, 1144, 772, 923, 266,...",0.035714,0
4,AHRPODFXDQKRO3OUDXTYZZZNGPTQ,B07WZ78VRN,1.0,1629011678308,Computers,8Bitdo SN30 Pro Wireless Bluetooth Controller ...,[],"[Video Games, Mac, Accessories, Controllers, G...",44.99,8412,2055,"[-1, -1, -1, -1, -1, 2076, 1204, 674, 63, 1668]",0.025974,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
789,AHY7NSZXW4IUPQ2E4BPUOXUVP3UQ,B07NQCDWWN,4.0,1648662318824,Video Games,The Legend of Zelda: TriForce Heroes - 3DS,[Three players take on the role of individual ...,"[Video Games, Legacy Systems, Nintendo Systems...",39.49,4352,1701,"[1313, 691, 1944, 2013, 456, 395, 863, 2152, 2...",0.059524,1
790,AH62RYWZBOQXAIHUU2FNRZNMRDGA,B07H3F94ZN,5.0,1649192892568,All Electronics,HD Retrovision PlayStation 2/3 (PS2/PS3) Premi...,[Use these high-quality cables to connect your...,"[Video Games, Legacy Systems, PlayStation Syst...",29.99,4482,2205,"[-1, 2334, 2632, 1798, 226, 2728, 2541, 2777, ...",0.028139,1
791,AGNUHVWLW65C3UFEKDTHNVOFTLVQ,B008UTF3W8,0.0,1643076728682,Video Games,Official Sony Playstation 3 Vertical Stand for...,[The official Vertical Stand is designed to ke...,"[Video Games, Legacy Systems, PlayStation Syst...",93.79,3514,1948,"[-1, -1, -1, 1969, 451, 1778, 1813, 1694, 1841...",0.038961,0
792,AG7ULZ7GACZ675QL2YVIG5XUQWIA,B07X5X5KF9,0.0,1637531154519,Video Games,WB Games Middle Earth: Shadow of Mordor - Play...,"[Exploit the individual fears, weakness and me...","[Video Games, PlayStation 4, Games]",20.6,2378,1402,"[-1, -1, -1, -1, -1, 1433, 1054, 145, 1941, 684]",0.225108,0


In [17]:
# Distribution of the popularity scores used as classification predictions.
eval_classification_df['score'].describe()

count    794.000000
mean       0.134581
std        0.176863
min        0.015152
25%        0.040043
50%        0.072511
75%        0.141775
max        1.000000
Name: score, dtype: float64

In [14]:
# Classification evaluation: binary `label` as target, popularity `score` as prediction.
log_classification_metrics(
    args,
    eval_classification_df,
    target_col='label',
    prediction_col='score',
)

# Clean up

In [15]:
# Parameter objects whose fields should be recorded on the MLflow run.
all_params = [args]

if args.log_to_mlflow:
    for params in all_params:
        # Prefix every field with the model's class name (e.g. "Args.top_K").
        # model_dump() is the pydantic v2 replacement for the deprecated .dict().
        prefix = type(params).__name__
        mlflow.log_params(
            {f"{prefix}.{k}": v for k, v in params.model_dump().items()}
        )

    # Close the run that Args.init() started.
    mlflow.end_run()

2024/09/20 20:17:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run 058-baseline-popular at: http://localhost:5003/#/experiments/1/runs/e9f1942f67b34278a7a4954f9e92562a.
2024/09/20 20:17:09 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5003/#/experiments/1.
