In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
from loguru import logger

import pandas as pd
import numpy as np
from pydantic import BaseModel
import plotly.express as px
from dotenv import load_dotenv
import mlflow

load_dotenv()

sys.path.insert(0, '..')

from src.viz import blueq_colors

# Controller

In [3]:
class Args(BaseModel):
    testing: bool = False
    log_to_mlflow: bool = True
    experiment_name: str = "FSDS RecSys - L5 - Reco Algo"
    run_name: str = '001-cf-i2i'
    notebook_persist_dp: str = None
    random_seed: int = 41

    user_col: str = 'user_id'
    item_col: str = 'parent_asin'
    rating_col: str = 'rating'
    
    top_K: int = 100
    top_k: int = 10

    batch_size: int = 128

    def init(self):
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        
        if not os.environ.get("MLFLOW_TRACKING_URI"):
            logger.warning(
                f"Environment variable MLFLOW_TRACKING_URI is not set. Setting self.log_to_mlflow to false."
            )
            self.log_to_mlflow = False

        if self.log_to_mlflow:
            logger.info(
                f"Setting up MLflow experiment {self.experiment_name} - run {self.run_name}..."
            )
            import mlflow

            mlflow.set_experiment(self.experiment_name)
            mlflow.start_run(run_name=self.run_name)

        return self
    
args = Args().init()

print(args.model_dump_json(indent=2))

[32m2024-09-14 00:42:16.178[0m | [1mINFO    [0m | [36m__main__[0m:[36minit[0m:[36m28[0m - [1mSetting up MLflow experiment FSDS RecSys - L5 - Reco Algo - run 001-cf-i2i...[0m


{
  "testing": false,
  "log_to_mlflow": true,
  "experiment_name": "FSDS RecSys - L5 - Reco Algo",
  "run_name": "001-cf-i2i",
  "notebook_persist_dp": "/home/dvquys/frostmourne/reco-algo/notebooks/data/001-cf-i2i",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "top_K": 100,
  "top_k": 10,
  "batch_size": 128
}


# Implement

In [4]:
from src.train_utils import mse_loss, train, MetricLogCallback
from src.model import Item2ItemCollaborativeFiltering

# Test implementation

In [5]:
# Mock data
user_ids = [0, 0, 1, 1, 2, 2, 2]
item_ids = [0, 1, 1, 2, 3, 1, 2]
ratings = [1, 4, 4, 5, 3, 2, 4]
n_users = len(set(user_ids))
n_items = len(set(item_ids))

val_user_ids = [0, 1, 2]
val_item_ids = [2, 1, 2]
val_ratings = [2, 4, 5]

print("Mock User IDs:", user_ids)
print("Mock Item IDs:", item_ids)
print("Ratings:", ratings)

model = Item2ItemCollaborativeFiltering(n_users, n_items)

users = [1, 1, 2]
items = [3, 2, 0]
predictions = model.predict(users, items)
print(predictions)

Mock User IDs: [0, 0, 1, 1, 2, 2, 2]
Mock Item IDs: [0, 1, 1, 2, 3, 1, 2]
Ratings: [1, 4, 4, 5, 3, 2, 4]
[3 3 3]


In [6]:
model.fit(user_ids, item_ids, ratings)
predictions = model.predict(users, items)
print(predictions)

[4.6520632 4.        2.       ]


In [7]:
model.user_item_matrix.T

array([[1., 0., 0.],
       [4., 4., 2.],
       [0., 5., 4.],
       [0., 0., 3.]])

In [8]:
model.item_similarity

array([[0.        , 0.66666667, 0.        , 0.        ],
       [0.66666667, 0.        , 0.72881089, 0.33333333],
       [0.        , 0.72881089, 0.        , 0.62469505],
       [0.        , 0.33333333, 0.62469505, 0.        ]])

In [9]:
item = 3
user = 1

# Compute prediction using weighted average of ratings from similar items
sim_scores = model.item_similarity[item]
print(f"{sim_scores=}")

sim_scores=array([0.        , 0.33333333, 0.62469505, 0.        ])


In [10]:
# Only consider items that have been rated by the current user
item_ratings = model.user_item_matrix[user, :]
print(f"Ratings of current user for all items:\n{item_ratings=}")
sim_scores = sim_scores[item_ratings != 0]
print(f"Cosine similarity score of target item towards all other items where current user has rated:\n{sim_scores}")
item_ratings = item_ratings[item_ratings != 0]

Ratings of current user for all items:
item_ratings=array([0., 4., 5., 0.])
Cosine similarity score of target item towards all other items where current user has rated:
[0.33333333 0.62469505]


In [11]:
# Weighted average of ratings
print(f"Weighted average: {np.dot(sim_scores, item_ratings)}")
print(f"Normalization factor: {np.sum(sim_scores)}")
print(f"Predicted rating: {np.dot(sim_scores, item_ratings) / np.sum(sim_scores)}")

Weighted average: 4.456808571105455
Normalization factor: 0.9580283808877577
Predicted rating: 4.652063195638892


# Prep data

In [12]:
train_df = pd.read_parquet("../data/train.parquet")
val_df = pd.read_parquet("../data/val.parquet")

In [13]:
from src.id_mapper import IDMapper

In [14]:
user_ids = train_df['user_id'].values
item_ids = train_df['parent_asin'].values
unique_user_ids = list(set(user_ids))
unique_item_ids = list(set(item_ids))
n_users = len(unique_user_ids)
n_items = len(unique_item_ids)

logger.info(f"{len(unique_user_ids)=:,.0f}, {len(unique_item_ids)=:,.0f}")

[32m2024-09-14 00:42:17.698[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mlen(unique_user_ids)=5,223, len(unique_item_ids)=2,653[0m


In [15]:
idm = IDMapper()
idm.fit(unique_user_ids, unique_item_ids)

In [16]:
user_indices = [idm.get_user_index(user_id) for user_id in user_ids]
item_indices = [idm.get_item_index(item_id) for item_id in item_ids]
ratings = train_df['rating'].values.tolist()

In [17]:
val_user_indices = [idm.get_user_index(user_id) for user_id in val_df['user_id']]
val_item_indices = [idm.get_item_index(item_id) for item_id in val_df['parent_asin']]
val_ratings = val_df['rating'].values.tolist()

# Train

In [18]:
model = Item2ItemCollaborativeFiltering(n_users, n_items)

#### Predict before train

In [19]:
user_id = 'AEHW2B54HDLZ3APBEWXHYLZ6SSYQ'
val_df.loc[lambda df: df['user_id'].eq(user_id)]

Unnamed: 0,user_id,parent_asin,rating,timestamp
34367,AEHW2B54HDLZ3APBEWXHYLZ6SSYQ,B07MYVF61Y,4.0,1654225907045


In [20]:
item_id = 'B07MYVF61Y'
user_indice = idm.get_user_index(user_id)
item_indice = idm.get_item_index(item_id)

model.predict([user_indice], [item_indice])

array([3])

#### Training loop

In [21]:
model.fit(user_indices, item_indices, ratings)

# Evaluate

## MSE

In [22]:
from tqdm.notebook import tqdm

In [23]:
val_predictions = []
for i in tqdm(range(len(val_ratings))):
    user_indice = val_user_indices[i]
    item_indice = val_item_indices[i]
    prediction = model.predict([user_indice], [item_indice])
    prediction = float(prediction[0])
    val_predictions.append(prediction)

  0%|          | 0/4259 [00:00<?, ?it/s]

In [24]:
def mse(predictions, ratings):
    predictions = np.array(predictions)
    ratings = np.array(ratings)
    return np.mean((predictions - ratings) ** 2)

mse_loss = mse(val_predictions, val_ratings)
mse_loss

np.float64(2.922965643119177)

In [25]:
val_predictions[:5]

[3.0, 3.0, 5.0, 3.0, 4.999999999999999]

In [26]:
val_ratings[:5]

[5.0, 5.0, 5.0, 5.0, 2.0]

In [27]:
if args.log_to_mlflow:
    mlflow.log_metric("val_MSE", float(mse_loss))

## Ranking metrics

In [28]:
user_col = args.user_col
item_col = args.item_col
rating_col = args.rating_col

In [29]:
from src.eval import create_label_df, create_rec_df, merge_recs_with_target

In [30]:
recommendations = model.recommend(val_user_indices, k=args.top_K, progress_bar_type='tqdm_notebook')

Generating Recommendations:   0%|          | 0/4259 [00:00<?, ?it/s]

In [31]:
recommendations_df = pd.DataFrame(recommendations).pipe(create_rec_df, idm)
recommendations_df

Unnamed: 0,user_indice,recommendation,score,rec_ranking,user_id,parent_asin
0,2001,0,3.0,1.0,AGS4TR4K5DMBRAFNBYSB2I2RCHHQ,B00KUYZ7I6
1,2001,1,3.0,2.0,AGS4TR4K5DMBRAFNBYSB2I2RCHHQ,B07KL859FF
2,2001,2,3.0,3.0,AGS4TR4K5DMBRAFNBYSB2I2RCHHQ,B00Y8CQCXA
3,2001,3,3.0,4.0,AGS4TR4K5DMBRAFNBYSB2I2RCHHQ,B08HL69H8Y
4,2001,4,3.0,5.0,AGS4TR4K5DMBRAFNBYSB2I2RCHHQ,B00K32USMU
...,...,...,...,...,...,...
425895,3267,95,3.0,96.0,AGT3DOVVP5JRBD3JNUK2OG23PMAA,B07537RVWL
425896,3267,96,3.0,97.0,AGT3DOVVP5JRBD3JNUK2OG23PMAA,B07MDM62W6
425897,3267,97,3.0,98.0,AGT3DOVVP5JRBD3JNUK2OG23PMAA,B08MWP39SJ
425898,3267,98,3.0,99.0,AGT3DOVVP5JRBD3JNUK2OG23PMAA,B08LZGPPBH


In [32]:
label_df = create_label_df(val_df)
label_df

Unnamed: 0,user_id,parent_asin,rating,rating_rank
31871,AEU4444ZVMLQB4ZXKLDCQL33BZPA,B07JH3LSHN,3.0,1.0
13604,AF3LO27R5D3TCPUNE5U6ZQWMRHAA,B09LTYGYY2,5.0,1.0
27506,AGYCROALU32Q553KYALGCZLDPOSQ,B0039QJKZ8,5.0,1.0
29163,AEZ4WKT6DIOZ5ZC2KXIYU4PUXMDA,B08M2K9K67,5.0,1.0
23463,AF2AAA4CWRVF2IYVE7WB6OOIEMFA,B07SJVCKQW,5.0,1.0
...,...,...,...,...
1412,AGVZRX53LPVHDZQC7SEC7JMHOM3Q,B01MG8P418,4.0,10.0
447,AHZGMQN5OMOHJAJ6F2YPOS66RSDQ,B07MZ6PDG9,5.0,10.0
14909,AEYWWBRMNORKNO6RKCNDN5D5ROTA,B07QKVKCT6,5.0,11.0
464,AHZGMQN5OMOHJAJ6F2YPOS66RSDQ,B06ZY6VHDD,4.0,11.0


In [33]:
eval_df = merge_recs_with_target(recommendations_df, label_df, k=args.top_K)
eval_df

Unnamed: 0,user_indice,recommendation,score,rec_ranking,user_id,parent_asin,rating,rating_rank
80,185.0,5.0,5.0,1,AE22LPCN47WUTHSG67R6SKN4A4MQ,B0BF9HZ7Z5,0,
13,185.0,137.0,5.0,2,AE22LPCN47WUTHSG67R6SKN4A4MQ,B00Z9TM72Q,0,
74,185.0,989.0,5.0,3,AE22LPCN47WUTHSG67R6SKN4A4MQ,B0B1QZ1D49,0,
99,185.0,1438.0,5.0,4,AE22LPCN47WUTHSG67R6SKN4A4MQ,B0C6Q5DW7M,0,
12,185.0,1575.0,5.0,5,AE22LPCN47WUTHSG67R6SKN4A4MQ,B00QM7JUMY,0,
...,...,...,...,...,...,...,...,...
429793,2913.0,91.0,3.0,97,AHZZ26USAR7T6VXZ7XIMHB7E3XEQ,B07QBMS119,0,
429812,2913.0,92.0,3.0,98,AHZZ26USAR7T6VXZ7XIMHB7E3XEQ,B08C3WQ25C,0,
429822,2913.0,93.0,3.0,99,AHZZ26USAR7T6VXZ7XIMHB7E3XEQ,B08RMLH663,0,
429771,2913.0,94.0,3.0,100,AHZZ26USAR7T6VXZ7XIMHB7E3XEQ,B015O2T0G8,0,


### Visualize

In [34]:
from evidently.pipeline.column_mapping import ColumnMapping
from evidently.report import Report
from evidently.metrics import PrecisionTopKMetric
from evidently.metrics import RecallTopKMetric
from evidently.metrics import FBetaTopKMetric
from evidently.metrics import NDCGKMetric
from evidently.metrics import PersonalizationMetric
import warnings

warnings.filterwarnings(
    action='ignore',
    category=FutureWarning,
    module=r'evidently.metrics.recsys.precision_recall_k'
)

from src.viz import color_scheme

In [35]:
column_mapping = ColumnMapping(
    recommendations_type='rank',
    target=rating_col,
    prediction='rec_ranking',
    item_id=item_col,
    user_id=user_col
)

report = Report(metrics=[
    NDCGKMetric(k=args.top_k),
    RecallTopKMetric(k=args.top_K),
    PrecisionTopKMetric(k=args.top_k),
    FBetaTopKMetric(k=args.top_k),
    PersonalizationMetric(k=args.top_k),
], options=[color_scheme])

report.run(
    reference_data=None,
    current_data=eval_df,
    column_mapping=column_mapping
)

evidently_report_fp = f"{args.notebook_persist_dp}/evidently_report.html"
os.makedirs(args.notebook_persist_dp, exist_ok=True)
report.save_html(evidently_report_fp)

if args.log_to_mlflow:
    mlflow.log_artifact(evidently_report_fp)
    for metric_result in report.as_dict()['metrics']:
        metric = metric_result['metric']
        if metric == 'PersonalizationMetric':
            metric_value = float(metric_result['result']['current_value'])
            mlflow.log_metric(f"val_{metric}", metric_value)
            continue
        result = metric_result['result']['current'].to_dict()
        for kth, metric_value in result.items():
            mlflow.log_metric(f"val_{metric}_at_k_as_step", metric_value, step=kth)

# Predict

In [36]:
val_predictions[2]

5.0

In [37]:
val_df.iloc[[2]]

Unnamed: 0,user_id,parent_asin,rating,timestamp
6758,AFQAPWVESEJYTNZC23LDPQOH7QBA,B09GM4283G,5.0,1630119475785


In [38]:
user_id = 'AFQAPWVESEJYTNZC23LDPQOH7QBA'

In [39]:
item_id = 'B09GM4283G'
user_indice = idm.get_user_index(user_id)
item_indice = idm.get_item_index(item_id)

model.predict([user_indice], [item_indice])

array([5.])

# Clean up

In [40]:
all_params = [args]

if args.log_to_mlflow:
    for params in all_params:
        params_dict = params.dict()
        params_ = {f"{params.__repr_name__()}.{k}": v for k, v in params_dict.items()}
        mlflow.log_params(params_)

    mlflow.end_run()

2024/09/14 00:43:19 INFO mlflow.tracking._tracking_service.client: 🏃 View run 001-cf-i2i at: http://localhost:5003/#/experiments/1/runs/64231fd0751d4c929364b1cc923d0456.
2024/09/14 00:43:19 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5003/#/experiments/1.
