# Baseline Popular Recommender
As with any ML project, we start with a non-ML approach as our baseline. In RecSys projects it is very common for a popularity-based recommender to serve not only as a benchmark but also as an actual component of the system (e.g. in the retrieval stage).

# Set up

In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to the local `src` package are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
from typing import Optional

import mlflow
import pandas as pd
from dotenv import load_dotenv
from loguru import logger
from pydantic import BaseModel

load_dotenv()

sys.path.insert(0, "..")

from src.eval import (
    create_label_df,
    log_classification_metrics,
    log_ranking_metrics,
    merge_recs_with_target,
)
from src.viz import blueq_colors

# Controller

In [3]:
class Args(BaseModel):
    """Run configuration for the baseline popular-recommender notebook.

    Construct the model, then call :meth:`init` to resolve the derived fields
    and, when logging is enabled, open the MLflow run used by later cells.
    """

    # Not read directly in this notebook; `args` is handed whole to the
    # src.eval helpers, which may consult it -- TODO confirm.
    testing: bool = False
    # Whether to record the run in MLflow. `init` forces this to False when
    # MLFLOW_TRACKING_URI is not set.
    log_to_mlflow: bool = True
    experiment_name: str = "FSDS RecSys - L4 - Reco Algo"
    run_name: str = "001-baseline-popular"
    # Absolute path derived from `run_name`; None until `init` fills it in.
    notebook_persist_dp: Optional[str] = None
    random_seed: int = 41

    # Column names of the interaction frames.
    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    # Number of popular items recommended to each user (slate size).
    top_K: int = 100
    # Not read directly in this notebook; presumably the cutoff for @k metrics
    # inside src.eval -- TODO confirm.
    top_k: int = 10

    batch_size: int = 128

    def init(self):
        """Resolve derived settings and start the MLflow run if logging is enabled.

        Side effects:
            * Sets ``notebook_persist_dp`` to the absolute path of
              ``data/<run_name>`` relative to the current working directory.
            * Disables ``log_to_mlflow`` when ``MLFLOW_TRACKING_URI`` is unset.
            * When logging is enabled, selects the experiment and starts a run,
              ending any run that is still active in this process first.

        Returns:
            Args: ``self``, so the call can be chained as ``Args().init()``.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")

        if not os.environ.get("MLFLOW_TRACKING_URI"):
            logger.warning(
                "Environment variable MLFLOW_TRACKING_URI is not set. Setting self.log_to_mlflow to false."
            )
            self.log_to_mlflow = False

        if self.log_to_mlflow:
            logger.info(
                f"Setting up MLflow experiment {self.experiment_name} - run {self.run_name}..."
            )

            # Re-running this cell in a live kernel would otherwise raise,
            # because MLflow refuses to start a run while another is active.
            if mlflow.active_run() is not None:
                logger.warning(
                    "An MLflow run is already active; ending it before starting a new one."
                )
                mlflow.end_run()

            mlflow.set_experiment(self.experiment_name)
            mlflow.start_run(run_name=self.run_name)

        return self


args = Args().init()

print(args.model_dump_json(indent=2))

[32m2024-10-18 16:51:21.121[0m | [1mINFO    [0m | [36m__main__[0m:[36minit[0m:[36m29[0m - [1mSetting up MLflow experiment FSDS RecSys - L4 - Reco Algo - run 001-baseline-popular...[0m
2024/10/18 16:51:21 INFO mlflow.tracking.fluent: Experiment with name 'FSDS RecSys - L4 - Reco Algo' does not exist. Creating a new experiment.


{
  "testing": false,
  "log_to_mlflow": true,
  "experiment_name": "FSDS RecSys - L4 - Reco Algo",
  "run_name": "001-baseline-popular",
  "notebook_persist_dp": "/Users/dvq/frostmourne/reco-algo/notebooks/data/001-baseline-popular",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "top_K": 100,
  "top_k": 10,
  "batch_size": 128
}


# Prep data

In [4]:
# Load the training and validation interaction frames from ../data.
# NOTE(review): the "_neg_" in the file names suggests negative samples were
# added upstream -- confirm against the data-prep notebook.
train_df = pd.read_parquet("../data/train_features_neg_df.parquet")
val_df = pd.read_parquet("../data/val_features_neg_df.parquet")

# Implement

In [5]:
# Popularity = number of training rows per item, divided by the largest count
# so the most popular item scores exactly 1.0.
# NOTE(review): train_df comes from a "*_neg_df" file, so these counts may
# include negative-sample rows -- confirm that is intended.
item_counts_df = train_df.groupby(args.item_col, as_index=False).size()

popular_items_df = item_counts_df.assign(
    score=lambda counts: counts["size"] / counts["size"].max(),
    # method="first" breaks ties by row order, giving every item a unique
    # integer rank; rank 1 is the most popular item.
    rec_ranking=lambda scored: scored["score"]
    .rank(method="first", ascending=False)
    .astype(int),
).sort_values("rec_ranking")

# The recommendation slate is the args.top_K best-ranked items.
top_popular_items_df = popular_items_df.head(args.top_K)

top_popular_items_df

Unnamed: 0,parent_asin,size,score,rec_ranking
3554,B01N3ASPNV,1510,1.000000,1
2143,B0086VPUHI,1483,0.982119,2
4337,B07YBXFDYN,1456,0.964238,3
2368,B00BGA9WK2,1231,0.815232,4
4324,B07YBWT3PK,1080,0.715232,5
...,...,...,...,...
4272,B07WS18ZS3,347,0.229801,96
4330,B07YBX7Y3P,345,0.228477,97
2455,B00DB2BI00,341,0.225828,98
2461,B00DBLBMBQ,338,0.223841,99


In [6]:
# Every validation user receives the same popular-item slate, i.e. the
# Cartesian product of distinct users and top popular items. `how="cross"`
# expresses this directly instead of joining on a throw-away constant key.
recommendations_df = (
    val_df[[args.user_col]]
    .drop_duplicates()
    .merge(top_popular_items_df, how="cross")[
        [args.user_col, args.item_col, "score", "rec_ranking"]
    ]
)

recommendations_df

Unnamed: 0,user_id,parent_asin,score,rec_ranking
0,AEWCUX5UKUYPDZJIOB6XMLCBJ3KA,B01N3ASPNV,1.000000,1
1,AEWCUX5UKUYPDZJIOB6XMLCBJ3KA,B0086VPUHI,0.982119,2
2,AEWCUX5UKUYPDZJIOB6XMLCBJ3KA,B07YBXFDYN,0.964238,3
3,AEWCUX5UKUYPDZJIOB6XMLCBJ3KA,B00BGA9WK2,0.815232,4
4,AEWCUX5UKUYPDZJIOB6XMLCBJ3KA,B07YBWT3PK,0.715232,5
...,...,...,...,...
70795,AFNNVSVXIU446JKQR6Z6BWGYZXIA,B07WS18ZS3,0.229801,96
70796,AFNNVSVXIU446JKQR6Z6BWGYZXIA,B07YBX7Y3P,0.228477,97
70797,AFNNVSVXIU446JKQR6Z6BWGYZXIA,B00DB2BI00,0.225828,98
70798,AFNNVSVXIU446JKQR6Z6BWGYZXIA,B00DBLBMBQ,0.223841,99


# Evaluate

## Ranking metrics

In [7]:
# Build the ground-truth label frame from the validation interactions.
# NOTE(review): judging by the displayed output, create_label_df adds a
# `rating_rank` column -- confirm its exact semantics in src.eval.
label_df = create_label_df(
    val_df,
    user_col=args.user_col,
    item_col=args.item_col,
    rating_col=args.rating_col,
    timestamp_col=args.timestamp_col,
)
label_df

Unnamed: 0,user_id,parent_asin,rating,rating_rank
1757,AGDJVPSADB7OTS2CL4TLYQM4CT4A,B087NNPYP3,5.0,1.0
145,AFJKZEYD2VZSI2NO3JZNMA4XX4RA,B09MRM36JJ,4.0,1.0
170,AFBRTNVOROW7UVA66UPX5YCFC6MQ,B00EP2WNKY,3.0,1.0
1729,AGENEGNVXK2GM4YG35LAYXBO527Q,B071Y568B7,4.0,1.0
87,AEL7SJG67X3FAH6U4KLHBOTF6PZA,B001EYUWDG,5.0,1.0
...,...,...,...,...
393,AFB6FYPPCN33UMUU5536IHXNOHCQ,B0051D8PGM,0.0,18.0
1138,AESD4RLWUKM6JTD6SNNWYLHLLQQA,B00800VJWU,0.0,18.0
1845,AG4RCXKPTC6QRORJLUSBY4SO2IAA,B0050SVMYA,0.0,18.0
1677,AFB6FYPPCN33UMUU5536IHXNOHCQ,B0088TN73M,0.0,19.0


In [8]:
# Combine the per-user recommendations with the ground-truth labels, passing
# the slate size (args.top_K) as k.
# NOTE(review): the displayed output contains a label-only row (NaN score,
# rec_ranking 101), so the helper appears to keep labelled items that are
# missing from the slate -- confirm in src.eval.
eval_df = merge_recs_with_target(
    recommendations_df,
    label_df,
    k=args.top_K,
    user_col=args.user_col,
    item_col=args.item_col,
    rating_col=args.rating_col,
)
eval_df

Unnamed: 0,user_id,parent_asin,score,rec_ranking,rating,rating_rank
80,AE2AZ2MNROPF33U6SS53VI22OXJA,B01N3ASPNV,1.000000,1,0,
26,AE2AZ2MNROPF33U6SS53VI22OXJA,B0086VPUHI,0.982119,2,0,
91,AE2AZ2MNROPF33U6SS53VI22OXJA,B07YBXFDYN,0.964238,3,0,
32,AE2AZ2MNROPF33U6SS53VI22OXJA,B00BGA9WK2,0.815232,4,0,
88,AE2AZ2MNROPF33U6SS53VI22OXJA,B07YBWT3PK,0.715232,5,0,
...,...,...,...,...,...,...
72426,AHZNHP6OKXRZV2UJMYDPLWCKFKEA,B00DB2BI00,0.225828,98,0,
72428,AHZNHP6OKXRZV2UJMYDPLWCKFKEA,B00DBLBMBQ,0.223841,99,0,
72395,AHZNHP6OKXRZV2UJMYDPLWCKFKEA,B0030H8I4K,0.223179,100,0,
72466,AHZNHP6OKXRZV2UJMYDPLWCKFKEA,B07CMKDB2K,,101,3,1.0


In [9]:
# Compute the ranking report from eval_df using the settings in args.
# NOTE(review): the helper's name suggests it also logs the metrics (presumably
# to MLflow when args.log_to_mlflow is set) -- confirm in src.eval.
ranking_report = log_ranking_metrics(args, eval_df)

## Classification metrics

In [10]:
# Attach each validation row's item popularity so the baseline can be evaluated
# as a binary classifier: the score is the prediction, and the label is 1 when
# the rating is positive.
eval_classification_df = pd.merge(
    val_df,
    popular_items_df[[args.item_col, "score"]],
    on=[args.item_col],
    how="left",
    validate="m:1",  # the popularity table must hold at most one row per item
).assign(
    # Items never seen in training find no match in the left join. Their
    # popularity is zero by definition, so use 0.0 rather than leaving NaN.
    score=lambda df: df["score"].fillna(0.0),
    label=lambda df: df[args.rating_col].gt(0).astype(int),
)
eval_classification_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence,score,label
0,AEWCUX5UKUYPDZJIOB6XMLCBJ3KA,B0BLFYF8K2,4.0,1630263342566,7087,3330,Computers,"Logitech G600 MMO Gaming Mouse, RGB Backlit, 2...","[With 20 buttons, the Logitech G600 MMO Gaming...","[Video Games, PC, Accessories, Gaming Mice]",37.99,"[1596, 1047, 3310, 2885, 1862, 1299, 4500, 209...",0.130464,1
1,AFFPVZ3JNCTQIKAK4XK37E2ENWWA,B00HVBPRUO,4.0,1655428133046,4199,3721,Video Games,Gold Wireless Stereo Headset - PlayStation 4,[A Headset for Gamers: Experience everything f...,"[Video Games, PlayStation 4, Accessories, Head...",,"[-1, -1, 3810, 3019, 1158, 1714, 98, 309, 2291...",0.392053,1
2,AHAIICWIZT6PYSS5QJNFYP6ZXLCA,B001EYUPJW,0.0,1628811542081,18270,3274,Video Games,Def Jam Fight for NY - Gamecube (Gold),"[The ultimate hip-hop fueled fighting game, De...","[Video Games, Legacy Systems, Nintendo Systems...",349.97,"[2245, 705, 3089, 4462, 3922, 2808, 3476, 3558...",0.014570,0
3,AF2AAA4CWRVF2IYVE7WB6OOIEMFA,B072C3VM5F,0.0,1635286957988,10410,4179,Video Games,Far Cry 5 Gold Edition - Xbox One [Digital Code],[],"[Video Games, Xbox One, Games]",,"[-1, -1, -1, -1, 1690, 431, 720, 742, 1885, 2711]",0.193377,0
4,AFBRTNVOROW7UVA66UPX5YCFC6MQ,B07YBXFDYK,3.0,1636189764550,9102,2986,Video Games,The Evil Within 2 - PlayStation 4,"[From Shinji Mikami, The Evil Within 2 takes t...","[Video Games, PlayStation 4, Games]",20.98,"[-1, -1, -1, -1, 1853, 195, 4616, 1777, 4654, ...",0.090066,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1893,AGNZODFXG6WNNJUAKR3MI42SDG5A,B0B8RG61GK,0.0,1642617264556,10449,3436,Computers,Fast Charging Cable for Switch/Switch Lite/Swi...,"[Dimensions:, Length: 9.8 Feet/3 Meters]","[Video Games, Legacy Systems, Nintendo Systems...",9.89,"[-1, -1, -1, -1, 2074, 1310, 3877, 184, 2952, ...",0.015232,0
1894,AHA6LZWVG2U4WBXNZRWCESNJXNUA,B002JTX87C,0.0,1646645847748,16655,380,Video Games,Scooby Doo! First Frights NDS,"[Product Description, In Scooby-Doo! First Fri...","[Video Games, Legacy Systems, Nintendo Systems...",43.77,"[1208, 1088, 3365, 4287, 3679, 4048, 2351, 197...",0.027152,0
1895,AFH63KLSVQQYRNFS7NLQGD3GSP3A,B094YHB1QK,5.0,1652564728981,11656,4503,Video Games,PlayStation DualSense Wireless Controller – Ga...,[Plot a course for astronomical adventures on ...,"[Video Games, PlayStation 5, Accessories, Cont...",74.99,"[-1, 2159, 3248, 48, 445, 4343, 584, 907, 4155...",0.075497,1
1896,AFPPTJOEUPVXA5C63SNRGID3EQNA,B0BVVTQ5JP,4.0,1635968491390,1562,4543,Computers,Logitech G502 HERO High Performance Wired Gami...,[Logitech updated its iconic G502 gaming mouse...,"[Video Games, PC, Accessories, Gaming Mice]",45.87,"[-1, -1, -1, -1, -1, 4507, 1351, 132, 4660, 1979]",0.087417,1


In [11]:
# Summary statistics of the popularity scores. describe() on a Series already
# returns a Series, so no transpose is needed.
eval_classification_df["score"].describe()

count    1898.000000
mean        0.110385
std         0.156121
min         0.009934
25%         0.025828
50%         0.054967
75%         0.121854
max         1.000000
Name: score, dtype: float64

In [12]:
# Evaluate the popularity score as a classifier output against the binary label.
# NOTE(review): thresholding and any MLflow logging happen inside
# log_classification_metrics and are not visible here -- confirm in src.eval.
classification_report = log_classification_metrics(
    args, eval_classification_df, target_col="label", prediction_col="score"
)

## Loss
We can estimate an upper bound on the MSE loss that a trained model should reach by measuring the loss of a naive predictor that always outputs the mean training rating for every user-item pair. A model whose validation MSE is not below this value has learned nothing useful.

In [13]:
# Naive predictor: the mean training rating, used for every validation row.
naive_prediction = train_df[args.rating_col].mean()

# Vectorised squared error. The scalar broadcasts across the Series, so there
# is no need for a constant helper column or a per-element Python lambda.
naive_mse = (val_df[args.rating_col] - naive_prediction).pow(2).mean()

logger.info(
    f"Val MSE = {naive_mse:,.2f} given naive_prediction={naive_prediction:,.1f}"
)

[32m2024-10-18 16:51:24.555[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mVal MSE = 5.12 given naive_prediction=2.2[0m


# Clean up

In [14]:
# Record every config object's fields as MLflow params, namespaced by the
# config class name, then close the run that Args.init() opened.
all_params = [args]

if args.log_to_mlflow:
    for params in all_params:
        # model_dump() is the Pydantic v2 replacement for the deprecated
        # .dict(); the class name gives the same prefix as __repr_name__().
        prefix = type(params).__name__
        params_ = {f"{prefix}.{k}": v for k, v in params.model_dump().items()}
        mlflow.log_params(params_)

    mlflow.end_run()

2024/10/18 16:51:24 INFO mlflow.tracking._tracking_service.client: 🏃 View run 001-baseline-popular at: http://localhost:5003/#/experiments/1/runs/60018ffc6a4b4425a10877ab39ecd0f9.
2024/10/18 16:51:24 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5003/#/experiments/1.
