In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
from typing import Optional

from dotenv import load_dotenv
from loguru import logger
from pydantic import BaseModel
from tqdm.notebook import tqdm

import numpy as np # required for the scikit-learn pipeline to work
import pandas as pd
import plotly.express as px
import mlflow

load_dotenv()

sys.path.insert(0, '..')

from src.viz import blueq_colors

# Controller

In [3]:
class Args(BaseModel):
    """Run configuration for the popularity-baseline notebook.

    Instantiate with defaults (or overrides) and then call :meth:`init` to
    resolve the derived output directory and, when enabled, open the MLflow run.
    """

    # NOTE(review): `testing`, `random_seed` and `batch_size` are not read by
    # the cells visible in this notebook; they are only logged as params.
    testing: bool = False
    log_to_mlflow: bool = True
    experiment_name: str = "FSDS RecSys - L5 - Reco Algo"
    run_name: str = '041-baseline-popular-movie-lens'
    # Derived in init(); None until then, hence Optional rather than plain str.
    notebook_persist_dp: Optional[str] = None
    random_seed: int = 41

    # Column names of the interaction data frames.
    user_col: str = 'user_id'
    item_col: str = 'movie_id'
    rating_col: str = 'rating'
    timestamp_col: str = 'timestamp'

    # top_K: how many popular items are kept per user (also the recall cutoff).
    # top_k: cutoff used for the NDCG / precision / F-beta / personalization metrics.
    top_K: int = 100
    top_k: int = 10

    batch_size: int = 128

    def init(self):
        """Resolve derived settings and start the MLflow run when logging is enabled.

        Side effects:
            * sets ``notebook_persist_dp`` to the absolute path of
              ``data/<run_name>`` relative to the current working directory;
            * sets ``log_to_mlflow`` to ``False`` (with a warning) when the
              ``MLFLOW_TRACKING_URI`` environment variable is unset or empty;
            * otherwise selects ``experiment_name`` and starts a run named
              ``run_name``.

        Returns:
            Args: ``self``, so the call can be chained onto the constructor.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")

        if not os.environ.get("MLFLOW_TRACKING_URI"):
            logger.warning(
                "Environment variable MLFLOW_TRACKING_URI is not set. Setting self.log_to_mlflow to false."
            )
            self.log_to_mlflow = False

        if self.log_to_mlflow:
            logger.info(
                f"Setting up MLflow experiment {self.experiment_name} - run {self.run_name}..."
            )
            mlflow.set_experiment(self.experiment_name)
            # NOTE: mlflow.start_run raises if a run is already active, so
            # re-running this cell requires mlflow.end_run() first.
            mlflow.start_run(run_name=self.run_name)

        return self
    
# Build the configuration, then resolve its derived fields (init() also opens
# the MLflow run when logging is enabled and returns the same instance).
args = Args()
args.init()

# Echo the effective settings so they are recorded in the notebook output.
print(args.model_dump_json(indent=2))

[32m2024-09-19 05:37:11.511[0m | [1mINFO    [0m | [36m__main__[0m:[36minit[0m:[36m29[0m - [1mSetting up MLflow experiment FSDS RecSys - L5 - Reco Algo - run 041-baseline-popular-movie-lens...[0m


{
  "testing": false,
  "log_to_mlflow": true,
  "experiment_name": "FSDS RecSys - L5 - Reco Algo",
  "run_name": "041-baseline-popular-movie-lens",
  "notebook_persist_dp": "/home/jupyter/frostmourne/reco-algo/notebooks/data/041-baseline-popular-movie-lens",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "movie_id",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "top_K": 100,
  "top_k": 10,
  "batch_size": 128
}


# Prep data

In [4]:
# Load the train / validation interaction splits from the shared data folder.
train_df = pd.read_parquet(path="../data/train.parquet")
val_df = pd.read_parquet(path="../data/val.parquet")

In [5]:
# Short aliases for the configured column names used throughout the notebook.
user_col, item_col, rating_col, timestamp_col = (
    args.user_col,
    args.item_col,
    args.rating_col,
    args.timestamp_col,
)

# Implement

In [6]:
# Popularity baseline: count training interactions per item, rank items by that
# count (1 = most interactions; ties are broken by row order via method='first'),
# and keep the args.top_K best-ranked items.
popular_items_df = (
    train_df
    .groupby(item_col, as_index=False)
    .size()
    .assign(
        rec_ranking=lambda counts: (
            counts['size']
            .rank(method='first', ascending=False)
            .astype(int)
        )
    )
    .sort_values('rec_ranking')
    .head(args.top_K)
)

popular_items_df

Unnamed: 0,movie_id,size,rec_ranking
1602,2858,151,1
172,1210,130,2
476,1580,130,3
1378,260,126,4
2715,589,126,5
...,...,...,...
1466,2699,62,96
1566,2804,62,97
1631,2890,62,98
2409,380,62,99


In [7]:
# Every validation user receives the same popularity-ranked list, so the
# recommendations are the cross product of distinct users and popular items.
# `how='cross'` replaces the hand-rolled dummy-key join and yields no rows
# (rather than NaN-filled rows) if the popular-items frame is empty.
recommendations_df = (
    val_df[[user_col]]
    .drop_duplicates()
    .merge(popular_items_df, how='cross')
    # The interaction count doubles as the recommendation score.
    .rename(columns={'size': 'score'})
    [[user_col, item_col, 'score', 'rec_ranking']]
)

recommendations_df

Unnamed: 0,user_id,movie_id,score,rec_ranking
0,1185,2858,151,1
1,1185,1210,130,2
2,1185,1580,130,3
3,1185,260,126,4
4,1185,589,126,5
...,...,...,...,...
29995,5831,2699,62,96
29996,5831,2804,62,97
29997,5831,2890,62,98
29998,5831,380,62,99


# Evaluate

## Ranking metrics

In [8]:
from src.eval import create_label_df, merge_recs_with_target

In [9]:
# Build the label frame from the validation interactions using the project helper.
label_df = create_label_df(
    val_df,
    user_col=user_col,
    item_col=item_col,
    rating_col=rating_col,
    timestamp_col=timestamp_col,
)
label_df

Unnamed: 0,user_id,movie_id,rating,rating_rank
334600,1974,800,5,1.0
365519,2127,2706,5,1.0
273790,1650,3578,4,1.0
194281,1198,1198,5,1.0
812293,4875,1188,5,1.0
...,...,...,...,...
327728,1941,2456,1,1142.0
328703,1941,1971,1,1143.0
328769,1941,1988,1,1144.0
328068,1941,3840,1,1145.0


In [10]:
# Join the recommendations with the labels so each row carries both the
# recommended rank and the observed rating, using args.top_K as the cutoff.
eval_df = merge_recs_with_target(
    recommendations_df,
    label_df,
    k=args.top_K,
    user_col=user_col,
    item_col=item_col,
    rating_col=rating_col,
)
eval_df

Unnamed: 0,user_id,movie_id,score,rec_ranking,rating,rating_rank
104,1185,2858,151.0,1,5,20.0
18,1185,1210,130.0,2,0,
46,1185,1580,130.0,3,0,
88,1185,260,126.0,4,0,
152,1185,589,126.0,5,0,
...,...,...,...,...,...,...
47416,5831,1206,61.0,100,0,
47463,5831,2792,,101,4,1.0
47482,5831,3512,,101,3,3.0
47484,5831,3623,,101,2,4.0


### Visualize

In [11]:
from evidently.pipeline.column_mapping import ColumnMapping
from evidently.report import Report
from evidently.metrics import PrecisionTopKMetric
from evidently.metrics import RecallTopKMetric
from evidently.metrics import FBetaTopKMetric
from evidently.metrics import NDCGKMetric
from evidently.metrics import PersonalizationMetric
import warnings

# Silence FutureWarning messages coming from Evidently's precision/recall top-k
# module; `module` is a regular expression matched against the module name.
warnings.filterwarnings(
    action='ignore',
    category=FutureWarning,
    module=r'evidently.metrics.recsys.precision_recall_k'
)

from src.viz import color_scheme

In [12]:
# Tell Evidently which columns of eval_df play which role; predictions are
# supplied as ranks (rec_ranking) rather than scores.
column_mapping = ColumnMapping(
    user_id=user_col,
    item_id=item_col,
    target=rating_col,
    prediction='rec_ranking',
    recommendations_type='rank',
)

# Recall is evaluated at the full list length (top_K); the other metrics use
# the shorter top_k cutoff.
report = Report(
    metrics=[
        NDCGKMetric(k=args.top_k),
        RecallTopKMetric(k=args.top_K),
        PrecisionTopKMetric(k=args.top_k),
        FBetaTopKMetric(k=args.top_k),
        PersonalizationMetric(k=args.top_k),
    ],
    options=[color_scheme],
)

report.run(
    current_data=eval_df,
    reference_data=None,
    column_mapping=column_mapping,
)

# Persist the rendered report next to the other artifacts of this run.
os.makedirs(args.notebook_persist_dp, exist_ok=True)
evidently_report_fp = f"{args.notebook_persist_dp}/evidently_report.html"
report.save_html(evidently_report_fp)

if args.log_to_mlflow:
    mlflow.log_artifact(evidently_report_fp)
    for metric_result in report.as_dict()['metrics']:
        metric = metric_result['metric']
        if metric == 'PersonalizationMetric':
            # Personalization is read as a single scalar value.
            metric_value = float(metric_result['result']['current_value'])
            mlflow.log_metric(f"val_{metric}", metric_value)
        else:
            # The remaining metrics expose one value per key of `current`
            # (presumably the cutoff k); each key is logged as the MLflow step.
            result = metric_result['result']['current'].to_dict()
            for kth, metric_value in result.items():
                mlflow.log_metric(f"val_{metric}_at_k_as_step", metric_value, step=kth)

# Clean up

In [13]:
# Log the fields of every configuration object as MLflow params, prefixed with
# the object's repr name to keep keys from different objects apart, then close
# the active MLflow run.
all_params = [args]

if args.log_to_mlflow:
    for params in all_params:
        # model_dump() is the pydantic v2 replacement for the deprecated .dict().
        params_dict = params.model_dump()
        params_ = {f"{params.__repr_name__()}.{k}": v for k, v in params_dict.items()}
        mlflow.log_params(params_)

    mlflow.end_run()

2024/09/19 05:37:16 INFO mlflow.tracking._tracking_service.client: 🏃 View run 041-baseline-popular-movie-lens at: http://localhost:5003/#/experiments/1/runs/57a7324a98cb43bead0d0a5e199439bd.
2024/09/19 05:37:16 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5003/#/experiments/1.
