# Baseline Popular Recommender

As with any ML project, we start with a non-ML approach as our baseline. It's very common in RecSys project to have popular recommender as not only a benchmark but also an actual component (retrieval stage).


# Set up


In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys

import pandas as pd
from dotenv import load_dotenv
from loguru import logger

import mlflow

sys.path.insert(0, "..")

from src.cfg import ConfigLoader
from src.eval import (
    create_label_df,
    log_classification_metrics,
    log_ranking_metrics,
    merge_recs_with_target,
)
from src.viz import custom_style_plotly

load_dotenv()
custom_style_plotly()

# Controller


In [None]:
cfg = ConfigLoader("../cfg/common.yaml")
cfg.run.run_name = "001-baseline-popular"
cfg.run.experiment_name = "Retrieve - Binary"
cfg.init()

[32m2025-03-09 16:28:17.153[0m | [1mINFO    [0m | [36msrc.cfg[0m:[36minit[0m:[36m204[0m - [1mSetting up MLflow experiment Retrieve - Binary - run 001-baseline-popular...[0m


{
  "run": {
    "testing": false,
    "log_to_mlflow": true,
    "experiment_name": "Retrieve - Binary",
    "run_name": "001-baseline-popular",
    "run_persist_dir": "/home/dvq/frostmourne/recsys-blog/1-seq-model/notebooks/data/001-baseline-popular",
    "random_seed": 41,
    "device": "cuda"
  },
  "data": {
    "hf_datasets": {
      "name": "McAuley-Lab/Amazon-Reviews-2023",
      "mcauley_variant": "Books"
    },
    "train_fp": "/home/dvq/frostmourne/recsys-blog/1-seq-model/data/train.parquet",
    "val_fp": "/home/dvq/frostmourne/recsys-blog/1-seq-model/data/val.parquet",
    "idm_fp": "/home/dvq/frostmourne/recsys-blog/1-seq-model/data/idm.json",
    "metadata_fp": "/home/dvq/frostmourne/recsys-blog/1-seq-model/data/metadata.parquet",
    "train_features_fp": "/home/dvq/frostmourne/recsys-blog/1-seq-model/data/train_features.parquet",
    "val_features_fp": "/home/dvq/frostmourne/recsys-blog/1-seq-model/data/val_features.parquet",
    "full_features_neg_fp": "/home/dvq/frost

# Prep data


In [4]:
train_df = pd.read_parquet(cfg.data.train_features_neg_fp)
val_df = pd.read_parquet(cfg.data.val_features_neg_fp)

# Implement


In [5]:
popular_items_df = (
    train_df.groupby(cfg.data.item_col, as_index=False)
    .size()
    .assign(
        score=lambda df: df["size"] / df["size"].max(),
        rec_ranking=lambda df: df["score"]
        .rank(method="first", ascending=False)
        .astype(int),
    )
    .sort_values(["rec_ranking"], ascending=[True])
)

top_popular_items_df = popular_items_df.head(cfg.eval.top_k_retrieve)

top_popular_items_df

Unnamed: 0,parent_asin,size,score,rec_ranking
5342,B00L9B7IKE,2312,1.000000,1
5260,B00JO8PEN2,1611,0.696799,2
4340,B006LSZECO,1520,0.657439,3
4924,B00DPM7TIG,1254,0.542388,4
4855,B00CNQ7HAU,1134,0.490484,5
...,...,...,...,...
4529,B008J48RA4,300,0.129758,96
5186,B00IB5BSBG,298,0.128893,97
4757,B00BAXFAVK,294,0.127163,98
4614,B009KP9VIS,293,0.126730,99


In [6]:
recommendations_df = (
    val_df[[cfg.data.user_col]]
    .drop_duplicates()
    .assign(tmp=1)
    .pipe(
        lambda df: pd.merge(
            df, top_popular_items_df.assign(tmp=1), on="tmp", how="left"
        )
    )[[cfg.data.user_col, cfg.data.item_col, "score", "rec_ranking"]]
)

recommendations_df

Unnamed: 0,user_id,parent_asin,score,rec_ranking
0,AE23RLRV25THT7OZM4T4ZJ4BMYCA,B00L9B7IKE,1.000000,1
1,AE23RLRV25THT7OZM4T4ZJ4BMYCA,B00JO8PEN2,0.696799,2
2,AE23RLRV25THT7OZM4T4ZJ4BMYCA,B006LSZECO,0.657439,3
3,AE23RLRV25THT7OZM4T4ZJ4BMYCA,B00DPM7TIG,0.542388,4
4,AE23RLRV25THT7OZM4T4ZJ4BMYCA,B00CNQ7HAU,0.490484,5
...,...,...,...,...
173595,AG2O5XWVSLNQL2WFXOAMUURRLDFA,B008J48RA4,0.129758,96
173596,AG2O5XWVSLNQL2WFXOAMUURRLDFA,B00IB5BSBG,0.128893,97
173597,AG2O5XWVSLNQL2WFXOAMUURRLDFA,B00BAXFAVK,0.127163,98
173598,AG2O5XWVSLNQL2WFXOAMUURRLDFA,B009KP9VIS,0.126730,99


# Evaluate


## Ranking metrics


In [7]:
# In case we wonder what if we measure Popular Recommender ranking metrics on the basis that the label is binary
# to have a fair comparison with our models
if cfg.train.label_format == "binary":
    val_df = val_df.assign(
        **{cfg.data.rating_col: lambda df: df[cfg.data.rating_col].gt(0).astype(float)}
    )

In [8]:
label_df = create_label_df(
    val_df,
    user_col=cfg.data.user_col,
    item_col=cfg.data.item_col,
    rating_col=cfg.data.rating_col,
    timestamp_col=cfg.data.timestamp_col,
)
label_df

Unnamed: 0,user_id,parent_asin,rating,rating_rank
6954,AGTSOB2DSZDRFHJ45WHCNE76AAKA,B07LF2YL9S,1.0,1.0
4958,AHXKQYMDRWCAMI5HMXOEHJZZ3WHQ,B01C1LUFFK,1.0,1.0
4768,AF6NYXCJPRNEDPYM5ADIZEPUM6PA,1842297406,1.0,1.0
3305,AFOETF44NA34J6CR4KRVUNVUXJ3Q,B079DPT9MS,1.0,1.0
3168,AETGEULA6ZYG5LSIEVE3GLB7LORA,B004QX07GY,1.0,1.0
...,...,...,...,...
776,AGJMFI2X7BIY4WXPW6T76SKYGCKQ,B00H6EJSC4,0.0,66.0
4803,AGJMFI2X7BIY4WXPW6T76SKYGCKQ,B01A4AXM3W,0.0,67.0
5205,AGJMFI2X7BIY4WXPW6T76SKYGCKQ,B003IEJZV0,0.0,68.0
3772,AGJMFI2X7BIY4WXPW6T76SKYGCKQ,B019ATLCWG,0.0,69.0


In [9]:
eval_df = merge_recs_with_target(
    recommendations_df,
    label_df,
    k=cfg.eval.top_k_retrieve,
    user_col=cfg.data.user_col,
    item_col=cfg.data.item_col,
    rating_col=cfg.data.rating_col,
)
eval_df

Unnamed: 0,user_id,parent_asin,score,rec_ranking,rating,rating_rank
46,AE23RLRV25THT7OZM4T4ZJ4BMYCA,B00L9B7IKE,1.000000,1,0,
44,AE23RLRV25THT7OZM4T4ZJ4BMYCA,B00JO8PEN2,0.696799,2,0,
10,AE23RLRV25THT7OZM4T4ZJ4BMYCA,B006LSZECO,0.657439,3,0,
28,AE23RLRV25THT7OZM4T4ZJ4BMYCA,B00DPM7TIG,0.542388,4,0,
26,AE23RLRV25THT7OZM4T4ZJ4BMYCA,B00CNQ7HAU,0.490484,5,0,
...,...,...,...,...,...,...
180026,AHZXAMLRISP275TBMPIGCRWUQFYA,B00BAXFAVK,0.127163,98,0,
180021,AHZXAMLRISP275TBMPIGCRWUQFYA,B009KP9VIS,0.126730,99,0,
180009,AHZXAMLRISP275TBMPIGCRWUQFYA,B004TI5N38,0.124135,100,0,
180003,AHZXAMLRISP275TBMPIGCRWUQFYA,0743424425,,101,0,2.0


In [10]:
ranking_report = log_ranking_metrics(cfg, eval_df)

  return (1 + beta_sqr) * precision_arr * recall_arr / (beta_sqr * precision_arr + recall_arr)


In [11]:
cfg.run.run_persist_dir

'/home/dvq/frostmourne/recsys-blog/1-seq-model/notebooks/data/001-baseline-popular'

## Classification metrics


In [12]:
eval_classification_df = pd.merge(
    val_df,
    popular_items_df[[cfg.data.item_col, "score"]],
    on=[cfg.data.item_col],
    how="left",
    validate="m:1",
).assign(label=lambda df: df[cfg.data.rating_col].gt(0).astype(int))
eval_classification_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence,score,label
0,AE23RLRV25THT7OZM4T4ZJ4BMYCA,0062409212,1.0,1646772001708,12,265,"[-1, -1, -1, -1, -1, 4136, 3643, 4566, 6057, 6...",0.019031,1
1,AEKQREM4SYKU6HK2CSMWOYXNMCWQ,B00EA8EO00,1.0,1650134360640,2600,4956,"[-1, -1, -1, -1, 4309, 7318, 7373, 7000, 7377,...",0.007785,1
2,AE4YFHNIXVBWI2V4DRU5UFSTY4RQ,B08ZM7BQ5J,1.0,1630002488344,469,7380,"[5058, 5229, 2515, 1277, 1370, 1371, 6978, 698...",0.016869,1
3,AF32NMJLLXFWLS6VNKOJQU2YIZFA,B000FCKIFU,0.0,1647485428978,5127,3478,"[7348, 7360, 7376, 6787, 5710, 7339, 6657, 676...",0.067907,0
4,AHPDMWKQJAGCVC22GSGIU3YANSJA,B0010SKUG0,0.0,1629083240553,18127,3659,"[-1, -1, -1, -1, -1, 6455, 7070, 4669, 6955, 7...",0.010381,0
...,...,...,...,...,...,...,...,...,...
7135,AH25V4CIGALTU2ARMKENS4HWD76A,B07DNDY87J,1.0,1631661207397,14899,6854,"[5260, 6108, 6172, 5967, 6069, 3849, 6201, 404...",0.111159,1
7136,AGT55MLCXTB5AUDMC7MGG2FVUEGQ,B00AFQC4QC,0.0,1636910759987,13878,4675,"[5133, 6862, 4328, 4147, 5715, 6659, 5966, 502...",0.006488,0
7137,AEFQQOMB5AYGR2FVV6X5OP5Y7VTQ,B00KIZQG96,1.0,1657081631273,1831,5301,"[5155, 5810, 4711, 5130, 4425, 5718, 6407, 617...",0.022491,1
7138,AGKILHN37242OQLPSMAMMZJA6IAQ,B07CRC52VH,1.0,1653587733450,12555,6808,"[6445, 6255, 5255, 5632, 6501, 7192, 6838, 727...",0.012976,1


In [13]:
eval_classification_df["score"].describe().T

count    7140.000000
mean        0.055762
std         0.095717
min         0.005190
25%         0.015138
50%         0.026384
75%         0.054498
max         1.000000
Name: score, dtype: float64

In [14]:
classification_report = log_classification_metrics(
    cfg, eval_classification_df, target_col="label", prediction_col="score"
)

## Loss

We can estimate what kinds of MSE loss should be the upper bound model training based on measuring the loss when asking the model to naively predict the mean rating for every item.


In [15]:
naive_prediction = train_df[cfg.data.rating_col].mean()

naive_mse = (
    (
        val_df[cfg.data.rating_col]
        - val_df.assign(naive_prediction=naive_prediction)["naive_prediction"]
    )
    .apply(lambda s: s * s)
    .mean()
)
logger.info(
    f"Val MSE = {naive_mse:,.2f} given naive_prediction={naive_prediction:,.1f}"
)

[32m2025-03-09 16:28:22.479[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mVal MSE = 3.01 given naive_prediction=2.2[0m


# Clean up


In [16]:
all_params = [cfg]

if cfg.run.log_to_mlflow:
    cfg.log_config_to_mlflow()
    mlflow.end_run()

🏃 View run 001-baseline-popular at: http://localhost:5002/#/experiments/3/runs/d2d94a81252f40cfb647fd5da79e83a1
🧪 View experiment at: http://localhost:5002/#/experiments/3
