In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
from loguru import logger

import pandas as pd
import numpy as np
from pydantic import BaseModel
import plotly.express as px
from dotenv import load_dotenv
import mlflow

load_dotenv()

sys.path.insert(0, '..')

from src.viz import blueq_colors

# Controller

In [3]:
class Args(BaseModel):
    testing: bool = False
    log_to_mlflow: bool = True
    experiment_name: str = "FSDS RecSys - L5 - Reco Algo"
    run_name: str = '011-cf-u2u'
    notebook_persist_dp: str = None
    random_seed: int = 41

    user_col: str = 'user_id'
    item_col: str = 'parent_asin'
    rating_col: str = 'rating'
    
    top_K: int = 100
    top_k: int = 10

    batch_size: int = 128

    def init(self):
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        
        if not os.environ.get("MLFLOW_TRACKING_URI"):
            logger.warning(
                f"Environment variable MLFLOW_TRACKING_URI is not set. Setting self.log_to_mlflow to false."
            )
            self.log_to_mlflow = False

        if self.log_to_mlflow:
            logger.info(
                f"Setting up MLflow experiment {self.experiment_name} - run {self.run_name}..."
            )
            import mlflow

            mlflow.set_experiment(self.experiment_name)
            mlflow.start_run(run_name=self.run_name)

        return self
    
args = Args().init()

print(args.model_dump_json(indent=2))

[32m2024-09-15 12:02:36.864[0m | [1mINFO    [0m | [36m__main__[0m:[36minit[0m:[36m31[0m - [1mSetting up MLflow experiment FSDS RecSys - L5 - Reco Algo - run 011-cf-u2u...[0m


{
  "testing": false,
  "log_to_mlflow": true,
  "experiment_name": "FSDS RecSys - L5 - Reco Algo",
  "run_name": "011-cf-u2u",
  "notebook_persist_dp": "/home/dvquys/frostmourne/reco-algo/notebooks/data/011-cf-u2u",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "top_K": 100,
  "top_k": 10,
  "batch_size": 128,
  "learning_task": "rating"
}


# Implement

In [4]:
from src.train_utils import train, MetricLogCallback
from src.model import User2UserCollaborativeFiltering

# Test implementation

In [5]:
# Mock data
user_ids = [0, 0, 1, 1, 2, 2, 2]
item_ids = [0, 1, 1, 2, 3, 1, 2]
ratings = [1, 4, 4, 5, 3, 2, 4]
n_users = len(set(user_ids))
n_items = len(set(item_ids))

val_user_ids = [0, 1, 2]
val_item_ids = [2, 1, 2]
val_ratings = [2, 4, 5]

print("Mock User IDs:", user_ids)
print("Mock Item IDs:", item_ids)
print("Ratings:", ratings)

model = User2UserCollaborativeFiltering(n_users, n_items)

users = [0, 1, 2]
items = [2, 2, 0]
predictions = model.predict(users, items)
print(predictions)

Mock User IDs: [0, 0, 1, 1, 2, 2, 2]
Mock Item IDs: [0, 1, 1, 2, 3, 1, 2]
Ratings: [1, 4, 4, 5, 3, 2, 4]
[3 3 3]


In [6]:
model.fit(user_ids, item_ids, ratings)
predictions = model.predict(users, items)
print(predictions)

[4.62714989 4.         1.        ]


In [7]:
model.user_item_matrix

array([[1., 4., 0., 0.],
       [0., 4., 5., 0.],
       [0., 2., 4., 3.]])

In [8]:
model.user_similarity

array([[0.        , 0.60604322, 0.36030188],
       [0.60604322, 0.        , 0.81202071],
       [0.36030188, 0.81202071, 0.        ]])

In [9]:
user = 0
item = 2

# Compute prediction using weighted average of ratings from similar users
sim_scores = model.user_similarity[user]
print(f"{sim_scores=}")

sim_scores=array([0.        , 0.60604322, 0.36030188])


In [10]:
# Only consider users who have rated the item
user_ratings = model.user_item_matrix[:, item]
print(f"{user_ratings=}")
sim_scores = sim_scores[user_ratings != 0]
print(f"{sim_scores=}")
user_ratings = user_ratings[user_ratings != 0]
print(f"{user_ratings=}")

user_ratings=array([0., 5., 4.])
sim_scores=array([0.60604322, 0.36030188])
user_ratings=array([5., 4.])


In [11]:
# Weighted average of ratings
print(f"Weighted average: {np.dot(sim_scores, user_ratings)}")
print(f"Normalization factor: {np.sum(sim_scores)}")
print(f"Predicted rating: {np.dot(sim_scores, user_ratings) / np.sum(sim_scores)}")

Weighted average: 4.471423593469625
Normalization factor: 0.9663450945516922
Predicted rating: 4.627149885356445


# Prep data

In [12]:
train_df = pd.read_parquet("../data/train.parquet")
val_df = pd.read_parquet("../data/val.parquet")

In [13]:
from src.id_mapper import IDMapper

In [14]:
user_ids = train_df['user_id'].values
item_ids = train_df['parent_asin'].values
unique_user_ids = list(set(user_ids))
unique_item_ids = list(set(item_ids))
n_users = len(unique_user_ids)
n_items = len(unique_item_ids)

logger.info(f"{len(unique_user_ids)=:,.0f}, {len(unique_item_ids)=:,.0f}")

[32m2024-09-15 12:02:38.416[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mlen(unique_user_ids)=12,397, len(unique_item_ids)=5,429[0m


In [15]:
idm = IDMapper()
idm.fit(unique_user_ids, unique_item_ids)

In [16]:
user_indices = [idm.get_user_index(user_id) for user_id in user_ids]
item_indices = [idm.get_item_index(item_id) for item_id in item_ids]
ratings = train_df['rating'].values.tolist()

In [17]:
val_user_indices = [idm.get_user_index(user_id) for user_id in val_df['user_id']]
val_item_indices = [idm.get_item_index(item_id) for item_id in val_df['parent_asin']]
val_ratings = val_df['rating'].values.tolist()

# Train

In [18]:
model = User2UserCollaborativeFiltering(n_users, n_items)

#### Predict before train

In [19]:
user_id = 'AEHW2B54HDLZ3APBEWXHYLZ6SSYQ'
val_df.loc[lambda df: df['user_id'].eq(user_id)]

Unnamed: 0,user_id,parent_asin,rating,timestamp
34367,AEHW2B54HDLZ3APBEWXHYLZ6SSYQ,B07MYVF61Y,4.0,1654225907045
34366,AEHW2B54HDLZ3APBEWXHYLZ6SSYQ,B08WPW7XVG,4.0,1629347738853


In [20]:
item_id = 'B07MYVF61Y'
user_indice = idm.get_user_index(user_id)
item_indice = idm.get_item_index(item_id)

model.predict([user_indice], [item_indice])

array([3])

#### Training loop

In [21]:
model.fit(user_indices, item_indices, ratings)

# Evaluate

## MSE

In [22]:
from tqdm.notebook import tqdm

In [23]:
val_predictions = []
for i in tqdm(range(len(val_ratings))):
    user_indice = val_user_indices[i]
    item_indice = val_item_indices[i]
    prediction = model.predict([user_indice], [item_indice])
    prediction = float(prediction[0])
    val_predictions.append(prediction)

  0%|          | 0/15257 [00:00<?, ?it/s]

In [24]:
def mse(predictions, ratings):
    predictions = np.array(predictions)
    ratings = np.array(ratings)
    return np.mean((predictions - ratings) ** 2)

mse_loss = mse(val_predictions, val_ratings)
mse_loss

np.float64(2.905881666948432)

In [25]:
val_predictions[:5]

[3.0, 3.0, 3.0, 4.505111675558792, 3.0]

In [26]:
val_ratings[:5]

[3.0, 5.0, 5.0, 5.0, 5.0]

In [27]:
if args.log_to_mlflow:
    mlflow.log_metric("val_MSE", float(mse_loss))

## Ranking metrics

In [28]:
user_col = args.user_col
item_col = args.item_col
rating_col = args.rating_col

In [29]:
from src.eval import create_label_df, create_rec_df, merge_recs_with_target

In [30]:
recommendations = model.recommend(val_user_indices, k=args.top_K, progress_bar_type='tqdm_notebook')

Generating Recommendations:   0%|          | 0/15257 [00:00<?, ?it/s]

In [31]:
recommendations_df = pd.DataFrame(recommendations).pipe(create_rec_df, idm)
recommendations_df

Unnamed: 0,user_indice,recommendation,score,rec_ranking,user_id,parent_asin
0,2978,5169,5.0,1.0,AGCL7QDBZ24RZHTSPHSQ4ZXSG3RQ,B07MYVF61Y
1,2978,5140,4.0,16.0,AGCL7QDBZ24RZHTSPHSQ4ZXSG3RQ,B07N11TKK9
2,2978,5031,5.0,2.0,AGCL7QDBZ24RZHTSPHSQ4ZXSG3RQ,B07QFPF8QW
3,2978,4897,4.0,17.0,AGCL7QDBZ24RZHTSPHSQ4ZXSG3RQ,B07VQ9Q7X3
4,2978,4847,5.0,3.0,AGCL7QDBZ24RZHTSPHSQ4ZXSG3RQ,B00F27JGVA
...,...,...,...,...,...,...
1518295,155,3570,3.0,296.0,AEEXD6AIYLX4YWPPQCM7M7PQUHKQ,B08L74DB2M
1518296,155,3569,3.0,297.0,AEEXD6AIYLX4YWPPQCM7M7PQUHKQ,B00004UDVL
1518297,155,3600,3.0,298.0,AEEXD6AIYLX4YWPPQCM7M7PQUHKQ,B07TV9X6LY
1518298,155,3599,3.0,299.0,AEEXD6AIYLX4YWPPQCM7M7PQUHKQ,B085217GYP


In [32]:
label_df = create_label_df(val_df)
label_df

Unnamed: 0,user_id,parent_asin,rating,rating_rank
11013,AE2WIEN3VYPFDPZ3IATSPFQQV2FQ,B094YHB1QK,1.0,1.0
2360,AH4JY7JDGBM5KWZNNS564ZZ7KCEA,B07D53QSMK,5.0,1.0
7596,AH5XFYDNDNYOSEYC7VZ6ORVNS3UA,B07XHMFCJ2,5.0,1.0
7647,AG3N3EMFIFGW66WOIC4MW55IUS6Q,B0BXQH38S6,4.0,1.0
21493,AGJAX3U73M52QLRSR4VYWZXEENMA,B07ZJ6RY1W,5.0,1.0
...,...,...,...,...
501,AETPWUNCW3KZDTKNUQGS5YYKNUEA,B08JTNL426,3.0,35.0
537,AETPWUNCW3KZDTKNUQGS5YYKNUEA,B008CP6MA2,2.0,36.0
531,AETPWUNCW3KZDTKNUQGS5YYKNUEA,B07BZP7HML,2.0,37.0
517,AETPWUNCW3KZDTKNUQGS5YYKNUEA,B008FPMBNG,2.0,38.0


In [33]:
eval_df = merge_recs_with_target(recommendations_df, label_df, k=args.top_K)
eval_df

Unnamed: 0,user_indice,recommendation,score,rec_ranking,user_id,parent_asin,rating,rating_rank
34,174.0,4865.0,5.000000,1,AE22LPCN47WUTHSG67R6SKN4A4MQ,B0785SFKYF,0,
23,174.0,3578.0,5.000000,2,AE22LPCN47WUTHSG67R6SKN4A4MQ,B01LXC1QL0,0,
33,174.0,3453.0,5.000000,3,AE22LPCN47WUTHSG67R6SKN4A4MQ,B0779JXNV9,0,
41,174.0,2775.0,5.000000,4,AE22LPCN47WUTHSG67R6SKN4A4MQ,B07F1HTGV1,0,
69,174.0,3277.0,4.008566,5,AE22LPCN47WUTHSG67R6SKN4A4MQ,B0891DDRYN,0,
...,...,...,...,...,...,...,...,...
1533106,3226.0,111.0,3.000000,97,AHZZZY2XVWEUJUTYPGGL4WXH6CSA,B07SRLXN3T,0,
1533073,3226.0,110.0,3.000000,98,AHZZZY2XVWEUJUTYPGGL4WXH6CSA,B007W5PG5U,0,
1533080,3226.0,109.0,3.000000,99,AHZZZY2XVWEUJUTYPGGL4WXH6CSA,B00P5DOOTU,0,
1533100,3226.0,108.0,3.000000,100,AHZZZY2XVWEUJUTYPGGL4WXH6CSA,B07LGNB9C2,0,


### Visualize

In [34]:
from evidently.pipeline.column_mapping import ColumnMapping
from evidently.report import Report
from evidently.metrics import PrecisionTopKMetric
from evidently.metrics import RecallTopKMetric
from evidently.metrics import FBetaTopKMetric
from evidently.metrics import NDCGKMetric
from evidently.metrics import PersonalizationMetric
import warnings

warnings.filterwarnings(
    action='ignore',
    category=FutureWarning,
    module=r'evidently.metrics.recsys.precision_recall_k'
)

from src.viz import color_scheme

In [35]:
column_mapping = ColumnMapping(
    recommendations_type='rank',
    target=rating_col,
    prediction='rec_ranking',
    item_id=item_col,
    user_id=user_col
)

report = Report(metrics=[
    NDCGKMetric(k=args.top_k),
    RecallTopKMetric(k=args.top_K),
    PrecisionTopKMetric(k=args.top_k),
    FBetaTopKMetric(k=args.top_k),
    PersonalizationMetric(k=args.top_k),
], options=[color_scheme])

report.run(
    reference_data=None,
    current_data=eval_df,
    column_mapping=column_mapping
)

evidently_report_fp = f"{args.notebook_persist_dp}/evidently_report.html"
os.makedirs(args.notebook_persist_dp, exist_ok=True)
report.save_html(evidently_report_fp)

if args.log_to_mlflow:
    mlflow.log_artifact(evidently_report_fp)
    for metric_result in report.as_dict()['metrics']:
        metric = metric_result['metric']
        if metric == 'PersonalizationMetric':
            metric_value = float(metric_result['result']['current_value'])
            mlflow.log_metric(f"val_{metric}", metric_value)
            continue
        result = metric_result['result']['current'].to_dict()
        for kth, metric_value in result.items():
            mlflow.log_metric(f"val_{metric}_at_k_as_step", metric_value, step=kth)

# Predict

In [36]:
val_predictions[2]

3.0

In [37]:
val_df.iloc[[2]]

Unnamed: 0,user_id,parent_asin,rating,timestamp
15518,AF7HTSEWIKYSP5D3ST4EZIUK6PJQ,B08F5T3F9Y,5.0,1644540517651


In [38]:
user_id = 'AFQAPWVESEJYTNZC23LDPQOH7QBA'

In [39]:
item_id = 'B09GM4283G'
user_indice = idm.get_user_index(user_id)
item_indice = idm.get_item_index(item_id)

model.predict([user_indice], [item_indice])

array([4.50511168])

# Clean up

In [40]:
all_params = [args]

if args.log_to_mlflow:
    for params in all_params:
        params_dict = params.dict()
        params_ = {f"{params.__repr_name__()}.{k}": v for k, v in params_dict.items()}
        mlflow.log_params(params_)

    mlflow.end_run()

2024/09/15 12:03:04 INFO mlflow.tracking._tracking_service.client: 🏃 View run 011-cf-u2u at: http://localhost:5003/#/experiments/1/runs/94b583a979654c639742cfb1fb8022e3.
2024/09/15 12:03:04 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5003/#/experiments/1.
