# Baseline Popular Recommender
As with any ML project, we start with a non-ML approach as our baseline. In RecSys projects it is very common to use a popularity-based recommender not only as a benchmark but also as an actual component of the system (e.g. the retrieval stage).

# Set up

In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
from typing import Optional

import mlflow
import pandas as pd
from dotenv import load_dotenv
from loguru import logger
from pydantic import BaseModel


sys.path.insert(0, "..")

from src.eval import (
    create_label_df,
    log_ranking_metrics,
    merge_recs_with_target,
)
from src.viz import custom_style_plotly

# Apply the project's Plotly styling helper from src.viz
# (presumably sets the default figure theme — confirm in src/viz).
custom_style_plotly()
# Load variables from a local .env file into the process environment;
# Args.init() later reads MLFLOW_TRACKING_URI from os.environ.
load_dotenv()

# Controller

In [None]:
class Args(BaseModel):
    """Run configuration for the popular-items baseline notebook.

    Construct with defaults (or overrides) and then call :meth:`init` once to
    resolve derived values and, when enabled, open the MLflow run.
    """

    # --- Run control / experiment tracking ---
    testing: bool = False
    # Turned off by init() when MLFLOW_TRACKING_URI is not configured.
    log_to_mlflow: bool = True
    experiment_name: str = "Retriever"
    run_name: str = "001-baseline-popular"
    # Absolute output directory for this run; stays None until init() fills it in.
    notebook_persist_dp: Optional[str] = None
    random_seed: int = 41

    # --- Column names of the interaction data ---
    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    # --- Candidate list sizes ---
    top_k_retrieve: int = 100
    top_k_rerank: int = 10

    batch_size: int = 128

    def init(self) -> "Args":
        """Resolve derived settings and start MLflow tracking if possible.

        Side effects:
            * Sets ``notebook_persist_dp`` to the absolute path of
              ``data/<run_name>`` relative to the current working directory.
            * Disables ``log_to_mlflow`` (with a warning) when the
              ``MLFLOW_TRACKING_URI`` environment variable is missing or empty.
            * When logging is still enabled, selects the experiment and starts
              a run named ``run_name``; the run stays open until
              ``mlflow.end_run()`` is called elsewhere.

        Returns:
            The same instance, so construction and initialisation can be chained.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")

        # Without a tracking URI there is nowhere to log to, so fall back to a
        # purely local run instead of failing later.
        if not os.environ.get("MLFLOW_TRACKING_URI"):
            logger.warning(
                "Environment variable MLFLOW_TRACKING_URI is not set. Setting self.log_to_mlflow to false."
            )
            self.log_to_mlflow = False

        if self.log_to_mlflow:
            logger.info(
                f"Setting up MLflow experiment {self.experiment_name} - run {self.run_name}..."
            )

            mlflow.set_experiment(self.experiment_name)
            mlflow.start_run(run_name=self.run_name)

        return self


# Build the configuration; init() also resolves the persist directory and,
# when logging is enabled, starts the MLflow run as a side effect.
args = Args().init()

# Echo the resolved configuration so it is recorded in the notebook output.
print(args.model_dump_json(indent=2))

[32m2025-03-08 13:57:34.772[0m | [1mINFO    [0m | [36m__main__[0m:[36minit[0m:[36m29[0m - [1mSetting up MLflow experiment Retriever - run 001-baseline-popular...[0m


{
  "testing": false,
  "log_to_mlflow": true,
  "experiment_name": "Retriever",
  "run_name": "001-baseline-popular",
  "notebook_persist_dp": "/home/dvq/frostmourne/recsys-blog/1-seq-model/notebooks/data/001-baseline-popular",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "top_k_retrieve": 100,
  "top_k_rerank": 10,
  "batch_size": 128
}


# Prep data

In [4]:
# Load the train / validation interaction tables. train_df drives the item
# popularity counts; val_df supplies the users and labels used for evaluation.
train_df = pd.read_parquet("../data/train.parquet")
val_df = pd.read_parquet("../data/val.parquet")

# Implement

In [5]:
# Number of training interactions per item (one row per item, count in "size").
item_counts_df = train_df.groupby(args.item_col, as_index=False).size()

# Popularity score: count scaled by the most popular item's count, so the
# top item scores exactly 1.0.
popularity_score = item_counts_df["size"] / item_counts_df["size"].max()

# Unique 1..N ranking, best first; method="first" breaks ties by row order so
# no two items share a rank.
popularity_rank = popularity_score.rank(method="first", ascending=False).astype(int)

popular_items_df = item_counts_df.assign(
    score=popularity_score,
    rec_ranking=popularity_rank,
).sort_values(["rec_ranking"], ascending=[True])

# Keep only as many items as the retrieval stage is configured to return.
top_popular_items_df = popular_items_df.head(args.top_k_retrieve)

top_popular_items_df

Unnamed: 0,parent_asin,size,score,rec_ranking
5342,B00L9B7IKE,1248,1.000000,1
5260,B00JO8PEN2,849,0.680288,2
4340,B006LSZECO,773,0.619391,3
4924,B00DPM7TIG,641,0.513622,4
4855,B00CNQ7HAU,593,0.475160,5
...,...,...,...,...
6253,B01LZFL63S,150,0.120192,96
999,0375842209,149,0.119391,97
6934,B07HQ5S1NW,149,0.119391,98
4529,B008J48RA4,147,0.117788,99


In [6]:
# Every validation user receives the same popular-items list, so pair each
# distinct user with each popular item using a native cross join (instead of
# merging on a throwaway constant key).
recommendations_df = (
    val_df[[args.user_col]]
    .drop_duplicates()
    .merge(top_popular_items_df, how="cross")[
        [args.user_col, args.item_col, "score", "rec_ranking"]
    ]
)

recommendations_df

Unnamed: 0,user_id,parent_asin,score,rec_ranking
0,AHTU4O4QDYDGUXW5ZO7OOYCTSXRA,B00L9B7IKE,1.000000,1
1,AHTU4O4QDYDGUXW5ZO7OOYCTSXRA,B00JO8PEN2,0.680288,2
2,AHTU4O4QDYDGUXW5ZO7OOYCTSXRA,B006LSZECO,0.619391,3
3,AHTU4O4QDYDGUXW5ZO7OOYCTSXRA,B00DPM7TIG,0.513622,4
4,AHTU4O4QDYDGUXW5ZO7OOYCTSXRA,B00CNQ7HAU,0.475160,5
...,...,...,...,...
173595,AFQD5UD2A75CDGBQVN7WPD7KP7GA,B01LZFL63S,0.120192,96
173596,AFQD5UD2A75CDGBQVN7WPD7KP7GA,0375842209,0.119391,97
173597,AFQD5UD2A75CDGBQVN7WPD7KP7GA,B07HQ5S1NW,0.119391,98
173598,AFQD5UD2A75CDGBQVN7WPD7KP7GA,B008J48RA4,0.117788,99


# Evaluate

## Ranking metrics

In [7]:
# Build the ground-truth labels for evaluation from the validation interactions.
# The displayed result carries a `rating_rank` column next to each rating; how
# that rank is derived lives in src.eval.create_label_df — presumably a
# per-user ordering by rating and timestamp, TODO confirm.
label_df = create_label_df(
    val_df,
    user_col=args.user_col,
    item_col=args.item_col,
    rating_col=args.rating_col,
    timestamp_col=args.timestamp_col,
)
label_df

Unnamed: 0,user_id,parent_asin,rating,rating_rank
163441,AFOXOV5G7LEBTDWVR5Q363HDVKCA,0062060627,5.0,1.0
193638,AF64Y2PPVDKBQUZ2AOEWFDON53NA,B01HZFB38U,5.0,1.0
86620,AENZPBHRYWAM7YWB6JPBPSYIHCPA,B07QYY1NN5,5.0,1.0
285220,AGCXACDXM4ZMPHOKP6DBC5R44WAA,B091P62TPV,5.0,1.0
388233,AEEI25ROY2PE3LJVDFU2OWY7D2TA,B08SNVSF47,5.0,1.0
...,...,...,...,...
254690,AGJMFI2X7BIY4WXPW6T76SKYGCKQ,B00B8548LS,5.0,32.0
48297,AERWHGDBA6B3XDADBXDKSBQZOW4A,B07B7MCM7G,2.0,32.0
254687,AGJMFI2X7BIY4WXPW6T76SKYGCKQ,B089KT6VCV,5.0,33.0
254686,AGJMFI2X7BIY4WXPW6T76SKYGCKQ,B085VNJ9SY,5.0,34.0


In [8]:
# Combine the recommendations with the labels, evaluated at k = top_k_retrieve.
# In the displayed output, recommended rows show rating 0 and an empty
# rating_rank — presumably the helper fills in pairs that have no matching
# label; confirm in src.eval.merge_recs_with_target.
eval_df = merge_recs_with_target(
    recommendations_df,
    label_df,
    k=args.top_k_retrieve,
    user_col=args.user_col,
    item_col=args.item_col,
    rating_col=args.rating_col,
)
eval_df

Unnamed: 0,user_id,parent_asin,score,rec_ranking,rating,rating_rank
46,AE23RLRV25THT7OZM4T4ZJ4BMYCA,B00L9B7IKE,1.000000,1,0,
44,AE23RLRV25THT7OZM4T4ZJ4BMYCA,B00JO8PEN2,0.680288,2,0,
11,AE23RLRV25THT7OZM4T4ZJ4BMYCA,B006LSZECO,0.619391,3,0,
28,AE23RLRV25THT7OZM4T4ZJ4BMYCA,B00DPM7TIG,0.513622,4,0,
26,AE23RLRV25THT7OZM4T4ZJ4BMYCA,B00CNQ7HAU,0.475160,5,0,
...,...,...,...,...,...,...
176858,AHZXAMLRISP275TBMPIGCRWUQFYA,0375842209,0.119391,97,0,
176956,AHZXAMLRISP275TBMPIGCRWUQFYA,B07HQ5S1NW,0.119391,98,0,
176875,AHZXAMLRISP275TBMPIGCRWUQFYA,B008J48RA4,0.117788,99,0,
176896,AHZXAMLRISP275TBMPIGCRWUQFYA,B00ICN2Z78,0.117788,100,0,


In [9]:
# Compute the ranking metrics for eval_df. The helper also receives `args`,
# presumably for column names, top-k and the log_to_mlflow flag — TODO confirm.
# NOTE(review): the captured output shows a warning-style echo of an F-beta
# expression (precision * recall / (beta^2 * precision + recall)); this looks
# like a division by zero when both are 0 — verify inside the metrics code.
ranking_report = log_ranking_metrics(args, eval_df)

  return (1 + beta_sqr) * precision_arr * recall_arr / (beta_sqr * precision_arr + recall_arr)


## Loss
We can estimate an upper bound on the MSE loss that model training should stay below by measuring the loss of a naive model that always predicts the mean training rating for every item.

In [10]:
# Naive baseline: always predict the global mean rating observed in training.
naive_prediction = train_df[args.rating_col].mean()

# Mean squared error of that constant prediction on the validation ratings,
# computed with vectorized arithmetic (the scalar broadcasts over the Series).
naive_mse = ((val_df[args.rating_col] - naive_prediction) ** 2).mean()
logger.info(
    f"Val MSE = {naive_mse:,.2f} given naive_prediction={naive_prediction:,.1f}"
)

[32m2025-03-08 13:57:39.128[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mVal MSE = 1.00 given naive_prediction=4.3[0m


# Clean up

In [11]:
# Every config object whose fields should be recorded on the MLflow run.
all_params = [args]

if args.log_to_mlflow:
    for params in all_params:
        # model_dump() is the pydantic v2 replacement for the deprecated .dict().
        params_dict = params.model_dump()
        # Prefix each key with the model's class name so parameters coming from
        # different config objects cannot collide.
        params_ = {f"{params.__repr_name__()}.{k}": v for k, v in params_dict.items()}
        mlflow.log_params(params_)

    # Close the run that Args.init() opened.
    mlflow.end_run()

🏃 View run 001-baseline-popular at: http://localhost:5002/#/experiments/1/runs/e1293423f32d488aba4cd400f9a662de
🧪 View experiment at: http://localhost:5002/#/experiments/1


/tmp/ipykernel_17971/1015503666.py:5: PydanticDeprecatedSince20:

The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/

