In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# (such as the local `src` package used below) are reloaded before each cell
# executes, picking up edits without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
from typing import Optional
from loguru import logger
from dotenv import load_dotenv
from pydantic import BaseModel
from tqdm.notebook import tqdm

import numpy as np # required for the scikit-learn pipeline to work
import pandas as pd
import plotly.express as px
import mlflow

load_dotenv()

sys.path.insert(0, '..')

from src.viz import blueq_colors

# Controller

In [3]:
class Args(BaseModel):
    """Run configuration for this notebook.

    Holds the MLflow experiment/run identifiers, the DataFrame column names
    used throughout the notebook, and the recommendation list sizes. Call
    :meth:`init` once after construction to resolve derived fields and to
    start the MLflow run.
    """

    testing: bool = False
    # When True (and MLFLOW_TRACKING_URI is set) init() starts an MLflow run.
    log_to_mlflow: bool = True
    experiment_name: str = "FSDS RecSys - L5 - Reco Algo"
    run_name: str = '058-baseline-popular'
    # Absolute output directory for this run; None until init() fills it in.
    notebook_persist_dp: Optional[str] = None
    random_seed: int = 41

    # Column names of the interaction DataFrames.
    user_col: str = 'user_id'
    item_col: str = 'parent_asin'
    rating_col: str = 'rating'
    timestamp_col: str = 'timestamp'

    # top_K: number of most popular items kept as recommendations and passed
    # as `k` when merging recommendations with labels.
    top_K: int = 100
    # top_k: not referenced in the visible cells — presumably read by the
    # src.eval helpers that receive `args`; verify there.
    top_k: int = 10

    batch_size: int = 128

    def init(self):
        """Resolve derived settings and start the MLflow run if enabled.

        Sets ``notebook_persist_dp`` to the absolute path of
        ``data/<run_name>`` (relative to the current working directory),
        disables MLflow logging when ``MLFLOW_TRACKING_URI`` is unset, and
        otherwise selects the experiment and starts a run.

        Returns:
            Args: ``self``, so the call can be chained after construction.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")

        if not os.environ.get("MLFLOW_TRACKING_URI"):
            logger.warning(
                "Environment variable MLFLOW_TRACKING_URI is not set. Setting self.log_to_mlflow to false."
            )
            self.log_to_mlflow = False

        if self.log_to_mlflow:
            logger.info(
                f"Setting up MLflow experiment {self.experiment_name} - run {self.run_name}..."
            )
            import mlflow

            # Re-running this cell in a live kernel leaves the previous run
            # active, and start_run() would then raise. Close it first so the
            # cell stays re-runnable.
            if mlflow.active_run() is not None:
                logger.warning("An MLflow run is already active. Ending it before starting a new one.")
                mlflow.end_run()

            mlflow.set_experiment(self.experiment_name)
            mlflow.start_run(run_name=self.run_name)

        return self
    
# Build the configuration object, then run its setup step (init() mutates the
# instance in place and returns it, so calling it separately is equivalent).
args = Args()
args.init()

# Show the resolved configuration as pretty-printed JSON.
print(args.model_dump_json(indent=2))

[32m2024-09-20 22:03:58.201[0m | [1mINFO    [0m | [36m__main__[0m:[36minit[0m:[36m29[0m - [1mSetting up MLflow experiment FSDS RecSys - L5 - Reco Algo - run 058-baseline-popular...[0m


{
  "testing": false,
  "log_to_mlflow": true,
  "experiment_name": "FSDS RecSys - L5 - Reco Algo",
  "run_name": "058-baseline-popular",
  "notebook_persist_dp": "/Users/dvq/frostmourne/reco-algo/notebooks/data/058-baseline-popular",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "top_K": 100,
  "top_k": 10,
  "batch_size": 128
}


# Prep data

In [4]:
# Load the train and validation interaction frames from parquet.
train_df, val_df = map(
    pd.read_parquet,
    (
        "../data/train_features_neg_df.parquet",
        "../data/val_features_neg_df.parquet",
    ),
)

In [5]:
# Short aliases for the configured column names, used by the cells below.
user_col, item_col, rating_col, timestamp_col = (
    args.user_col,
    args.item_col,
    args.rating_col,
    args.timestamp_col,
)

# Implement

In [6]:
# Popularity baseline: score each item by its number of training rows,
# normalised by the count of the most frequent item, and rank items by that
# score (ties are broken by row order via method='first').
# NOTE(review): the input file name suggests negative samples may be included
# in train_df — confirm that counting all rows (not only rating > 0) is intended.
popular_items_df = (
    train_df
    .groupby(item_col, as_index=False)
    .size()
    .assign(score=lambda counts: counts['size'].div(counts['size'].max()))
    .assign(
        rec_ranking=lambda scored: (
            scored['score'].rank(method='first', ascending=False).astype(int)
        )
    )
    .sort_values('rec_ranking')
)

# Keep only the top_K best-ranked items as the recommendation list.
top_popular_items_df = popular_items_df.iloc[:args.top_K]

top_popular_items_df

Unnamed: 0,parent_asin,size,score,rec_ranking
2143,B0086VPUHI,1499,1.000000,1
4337,B07YBXFDYN,1444,0.963309,2
3554,B01N3ASPNV,1441,0.961308,3
2368,B00BGA9WK2,1288,0.859239,4
2385,B00BN5T30E,1091,0.727819,5
...,...,...,...,...
3407,B01GY35UK6,336,0.224149,96
1699,B004GUTRYK,334,0.222815,97
933,B001EYUQDW,332,0.221481,98
947,B001EYUQVE,332,0.221481,99


In [7]:
# Every distinct validation user receives the same list of popular items, so
# the recommendations are the cross product of users and top items.
# how='cross' replaces the hand-rolled dummy-key join and yields an empty
# frame (rather than NaN rows) if there are no items to recommend.
recommendations_df = (
    val_df[[user_col]]
    .drop_duplicates()
    .merge(top_popular_items_df, how='cross')
    [[user_col, item_col, 'score', 'rec_ranking']]
)

recommendations_df

Unnamed: 0,user_id,parent_asin,score,rec_ranking
0,AEWCUX5UKUYPDZJIOB6XMLCBJ3KA,B0086VPUHI,1.000000,1
1,AEWCUX5UKUYPDZJIOB6XMLCBJ3KA,B07YBXFDYN,0.963309,2
2,AEWCUX5UKUYPDZJIOB6XMLCBJ3KA,B01N3ASPNV,0.961308,3
3,AEWCUX5UKUYPDZJIOB6XMLCBJ3KA,B00BGA9WK2,0.859239,4
4,AEWCUX5UKUYPDZJIOB6XMLCBJ3KA,B00BN5T30E,0.727819,5
...,...,...,...,...
70795,AEJRRYS5EDL5BMOEG4MPCXFFVQ7A,B01GY35UK6,0.224149,96
70796,AEJRRYS5EDL5BMOEG4MPCXFFVQ7A,B004GUTRYK,0.222815,97
70797,AEJRRYS5EDL5BMOEG4MPCXFFVQ7A,B001EYUQDW,0.221481,98
70798,AEJRRYS5EDL5BMOEG4MPCXFFVQ7A,B001EYUQVE,0.221481,99


# Evaluate

## Ranking metrics

In [8]:
from src.eval import create_label_df, merge_recs_with_target
from src.eval import log_ranking_metrics

In [9]:
# Build the label frame from the validation set via the project helper
# (see src.eval.create_label_df for the exact output schema).
label_df = create_label_df(
    val_df,
    user_col=user_col,
    item_col=item_col,
    rating_col=rating_col,
    timestamp_col=timestamp_col,
)
label_df

Unnamed: 0,user_id,parent_asin,rating,rating_rank
463,AFUQQLR2N2LY7XPE4VJ5YF3LDZVA,B07YN82X3B,5.0,1.0
1229,AGVEONQRPWTQGUN64P22EVTGIW2A,B08FC5TTBF,4.0,1.0
224,AGJO7OFBOKRLDTSEL2HHSZSKDQ4Q,B07PZ8NZSZ,1.0,1.0
1527,AF4QBZD2EXOTKIOOH4BOC4HZDHYA,B08NRVRF3J,3.0,1.0
1306,AESEOKCWWKUG7YPP43J2CRWAXQIA,B09GM4283G,5.0,1.0
...,...,...,...,...
672,AFB6FYPPCN33UMUU5536IHXNOHCQ,B005NYK7YC,0.0,18.0
1867,AESD4RLWUKM6JTD6SNNWYLHLLQQA,B00B9775W4,0.0,18.0
470,AG4RCXKPTC6QRORJLUSBY4SO2IAA,B000FUIS40,0.0,18.0
1332,AFB6FYPPCN33UMUU5536IHXNOHCQ,B000U34SZA,0.0,19.0


In [10]:
# Join the recommendations with the labels using the project helper, passing
# the configured top_K as the cut-off `k`.
eval_df = merge_recs_with_target(
    recommendations_df,
    label_df,
    k=args.top_K,
    user_col=user_col,
    item_col=item_col,
    rating_col=rating_col,
)
eval_df

Unnamed: 0,user_id,parent_asin,score,rec_ranking,rating,rating_rank
26,AE2AZ2MNROPF33U6SS53VI22OXJA,B0086VPUHI,1.000000,1,0,
89,AE2AZ2MNROPF33U6SS53VI22OXJA,B07YBXFDYN,0.963309,2,0,
78,AE2AZ2MNROPF33U6SS53VI22OXJA,B01N3ASPNV,0.961308,3,0,
32,AE2AZ2MNROPF33U6SS53VI22OXJA,B00BGA9WK2,0.859239,4,0,
35,AE2AZ2MNROPF33U6SS53VI22OXJA,B00BN5T30E,0.727819,5,0,
...,...,...,...,...,...,...
72372,AHZNHP6OKXRZV2UJMYDPLWCKFKEA,B004GUTRYK,0.222815,97,0,2.0
72361,AHZNHP6OKXRZV2UJMYDPLWCKFKEA,B001EYUQDW,0.221481,98,0,
72363,AHZNHP6OKXRZV2UJMYDPLWCKFKEA,B001EYUQVE,0.221481,99,0,
72374,AHZNHP6OKXRZV2UJMYDPLWCKFKEA,B004I1JTEK,0.220814,100,0,


In [11]:
# Hand the merged recommendations/labels to the project helper; presumably it
# computes the ranking metrics and logs them according to `args` — see src.eval.
ranking_report = log_ranking_metrics(
    args,
    eval_df,
)

## Classification metrics

In [12]:
from evidently.metric_preset import ClassificationPreset
from src.eval import log_classification_metrics

In [13]:
# Attach each validation row's item popularity score and derive a binary
# label (1 when the rating is positive, else 0).
eval_classification_df = (
    val_df
    .merge(
        popular_items_df[[args.item_col, 'score']],
        on=[args.item_col],
        how='left',
        # Each item must appear at most once on the right-hand side.
        validate="m:1",
    )
    .assign(
        # Items never seen in training have no row in popular_items_df; their
        # popularity is zero rather than missing.
        score=lambda df: df['score'].fillna(0.0),
        # Use the configured rating column instead of a hard-coded name.
        label=lambda df: df[args.rating_col].gt(0).astype(int),
    )
)
eval_classification_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,main_category,title,description,categories,price,user_indice,item_indice,item_sequence,score,label
0,AEWCUX5UKUYPDZJIOB6XMLCBJ3KA,B0BLFYF8K2,4.0,1630263342566,Computers,"Logitech G600 MMO Gaming Mouse, RGB Backlit, 2...","[With 20 buttons, the Logitech G600 MMO Gaming...","[Video Games, PC, Accessories, Gaming Mice]",37.99,8046,2332,"[1614, 3373, 970, 912, 455, 572, 1904, 4, 3467...",0.130754,1
1,AFFPVZ3JNCTQIKAK4XK37E2ENWWA,B00HVBPRUO,4.0,1655428133046,Video Games,Gold Wireless Stereo Headset - PlayStation 4,[A Headset for Gamers: Experience everything f...,"[Video Games, PlayStation 4, Accessories, Head...",,19677,621,"[-1, -1, 4649, 2405, 2655, 579, 3089, 3955, 44...",0.393596,1
2,AG7A42N537XQMGC5URJ6VZ2JO2FA,B00J48C36S,0.0,1634619046379,Video Games,Assassin's Creed Unity - Xbox One,"[Paris, 1789. The French Revolution turns a on...","[Video Games, Xbox One, Games]",23.5,3240,662,"[-1, -1, 2006, 3075, 2589, 1270, 421, 1524, 15...",0.138759,0
3,AFBRTNVOROW7UVA66UPX5YCFC6MQ,B07YBXFDYK,3.0,1636189764550,Video Games,The Evil Within 2 - PlayStation 4,"[From Shinji Mikami, The Evil Within 2 takes t...","[Video Games, PlayStation 4, Games]",20.98,12758,4410,"[-1, -1, -1, -1, 4407, 378, 3920, 831, 1594, 2...",0.104069,1
4,AEIS45LEWNPTLJCOLT2NPJ4NFZWQ,B081243BT6,0.0,1637350483061,Cell Phones & Accessories,Orzly Carrying case for Nintendo Switch OLED a...,[],"[Video Games, Nintendo Switch, Accessories, Ca...",29.99,846,2216,"[991, 1867, 3847, 635, 933, 4635, 3781, 154, 4...",0.231488,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1893,AFUWPAK6VCGEL2OVIL2YGZNFQJZQ,B08N6NCR3Q,4.0,1642699950266,Video Games,Thrustmaster T 16000M SPACE SIM DUO STICK (PC),[The THRUSTMASTER T.16000M FCS Space Sim Duo c...,"[Video Games, PC, Accessories, Controllers, Fl...",119.51,19807,1968,"[-1, -1, -1, -1, 628, 2575, 892, 3726, 1213, 249]",0.038025,1
1894,AE5UUBPDQX4MRFFDW7D3IKHQYIEQ,B0C3KJJ6XS,0.0,1657945454164,Computers,Razer Nari Ultimate Wireless 7.1 Surround Soun...,[Razer Nari Ultimate: 2.4GHz wireless PC gamin...,"[Video Games, PC, Accessories, Headsets]",100.42,2133,3530,"[4328, 4344, 3319, 2180, 2315, 3243, 1797, 979...",0.030687,0
1895,AFH63KLSVQQYRNFS7NLQGD3GSP3A,B094YHB1QK,5.0,1652564728981,Video Games,PlayStation DualSense Wireless Controller – Ga...,[Plot a course for astronomical adventures on ...,"[Video Games, PlayStation 5, Accessories, Cont...",74.99,11918,1339,"[-1, 1127, 575, 2328, 2637, 4378, 3947, 4223, ...",0.074716,1
1896,AFPPTJOEUPVXA5C63SNRGID3EQNA,B0BVVTQ5JP,4.0,1635968491390,Computers,Logitech G502 HERO High Performance Wired Gami...,[Logitech updated its iconic G502 gaming mouse...,"[Video Games, PC, Accessories, Gaming Mice]",45.87,11328,1289,"[-1, -1, -1, -1, -1, 3308, 3848, 2264, 2084, 1...",0.082055,1


In [14]:
# Summary statistics of the popularity score on the validation rows.
score_column = 'score'
eval_classification_df[score_column].describe()

count    1898.000000
mean        0.113672
std         0.156000
min         0.010007
25%         0.027352
50%         0.056704
75%         0.129420
max         1.000000
Name: score, dtype: float64

In [15]:
# Treat the popularity score as the prediction for the binary label and pass
# both to the project helper; presumably it computes and logs classification
# metrics according to `args` — see src.eval.
classification_report = log_classification_metrics(
    args,
    eval_classification_df,
    target_col='label',
    prediction_col='score',
)

## Loss

In [16]:
# Naive regression baseline: predict the mean training rating for every
# validation row and report the resulting mean squared error.
naive_prediction = train_df[args.rating_col].mean()

# Vectorised squared error; subtracting the scalar broadcasts over the column.
naive_mse = ((val_df[args.rating_col] - naive_prediction) ** 2).mean()
logger.info(f"Val MSE = {naive_mse:,.2f} given naive_prediction={naive_prediction:,.1f}")

[32m2024-09-20 22:04:01.953[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mVal MSE = 5.12 given naive_prediction=2.2[0m


# Clean up

In [17]:
# Configuration objects whose fields are recorded as MLflow run parameters.
all_params = [args]

if args.log_to_mlflow:
    for params in all_params:
        # model_dump() is the Pydantic v2 replacement for the deprecated .dict().
        params_dict = params.model_dump()
        # Prefix each key with the model's class name to keep parameters from
        # different config objects apart.
        params_ = {f"{params.__repr_name__()}.{k}": v for k, v in params_dict.items()}
        mlflow.log_params(params_)

    # Close the run that Args.init() started.
    mlflow.end_run()

2024/09/20 22:04:01 INFO mlflow.tracking._tracking_service.client: 🏃 View run 058-baseline-popular at: http://localhost:5003/#/experiments/1/runs/203ac3d24df643a28946c9c5c2e0c7ef.
2024/09/20 22:04:01 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5003/#/experiments/1.
