# Ranker that can take different features into account

# Set up

In [3]:
%load_ext autoreload
%autoreload 2
%load_ext tensorboard

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [4]:
import os
import sys
from typing import Optional

import dill
import lightning as L
import numpy as np
import pandas as pd
import torch
from dotenv import load_dotenv
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.loggers import MLFlowLogger
from loguru import logger
from mlflow.exceptions import MlflowException
from mlflow.models.signature import infer_signature
from pydantic import BaseModel
from torch.utils.data import DataLoader

import mlflow

load_dotenv()

sys.path.insert(0, "..")

from cfg.run_cfg import RunCfg
# from src.ann import AnnIndex
from src.utils.data_prep import chunk_transform
from src.algo.ranker.dataset import UserItemBinaryDFDataset
from src.utils.embedding_id_mapper import IDMapper
from src.algo.ranker.inference import RankerInferenceWrapper
from src.algo.ranker.model import Ranker
from src.algo.ranker.trainer import LitRanker
from src.algo.item2vec.trainer import LitSkipGram
from src.algo.item2vec.model import SkipGram



# Controller

In [5]:
# This is a parameter cell used by papermill
# max_epochs is read further down as the default value of Args.max_epochs, so the
# value set here determines the number of training epochs.
max_epochs = 10

In [6]:
class Args(BaseModel):
    """Run configuration for this notebook.

    Fields hold static defaults; ``init()`` fills in the values that depend on
    the environment (persist directory, Qdrant URL, MLflow logger, device).
    """

    # Run identity and experiment tracking
    testing: bool = False
    author: str = "dinh-trieu"
    log_to_mlflow: bool = True
    experiment_name: str = "RecSys MVP - Ranker"
    run_name: str = "004-use-sbert-features-and-llm-tags"
    notebook_persist_dp: Optional[str] = None  # set by init()
    random_seed: int = 41
    device: Optional[str] = None  # resolved by init() when left unset

    # Project-level run configuration (evaluated once, when the class is defined)
    rc: RunCfg = RunCfg().init()

    item_metadata_pipeline_fp: str = "../data_for_ai/interim/item_metadata_pipeline_wo_user_item_manipulate.dill"
    qdrant_url: Optional[str] = None  # set by init() from QDRANT_HOST / QDRANT_PORT
    qdrant_collection_name: str = "item_desc_sbert"

    # Training loop and data loading
    max_epochs: int = max_epochs
    batch_size: int = 128
    tfm_chunk_size: int = 10000
    neg_to_pos_ratio: int = 1

    # Column names in the interaction frames
    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    top_K: int = 100
    top_k: int = 10

    # Model hyper-parameters
    embedding_dim: int = 256
    item_sequence_ts_bucket_size: int = 10
    bucket_embedding_dim: int = 16
    dropout: float = 0.3
    early_stopping_patience: int = 5
    learning_rate: float = 0.001
    l2_reg: float = 1e-5

    # MLflow model registry
    mlf_item2vec_model_name: str = "item2vec"
    mlf_model_name: str = "ranker"
    min_roc_auc: float = 0.7

    best_checkpoint_path: Optional[str] = None  # filled in after training
    use_item_feature: bool = True

    def init(self):
        """Resolve environment-dependent settings and return ``self``.

        Creates the notebook persist directory, builds the Qdrant URL, sets up
        the MLflow logger (or disables MLflow logging when no tracking URI is
        configured) and picks a device when none was given.

        Raises:
            Exception: if the QDRANT_HOST environment variable is not set.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        os.makedirs(self.notebook_persist_dp, exist_ok=True)

        qdrant_host = os.getenv("QDRANT_HOST")
        if not qdrant_host:
            raise Exception("Environment variable QDRANT_HOST is not set.")

        # Only append the port when one is configured; formatting a missing port
        # would otherwise produce the literal URL "<host>:None".
        qdrant_port = os.getenv("QDRANT_PORT")
        self.qdrant_url = (
            f"{qdrant_host}:{qdrant_port}" if qdrant_port else qdrant_host
        )

        mlflow_uri = os.environ.get("MLFLOW_TRACKING_URI")
        if not mlflow_uri:
            logger.warning(
                "Environment variable MLFLOW_TRACKING_URI is not set. Setting self.log_to_mlflow to false."
            )
            self.log_to_mlflow = False

        if self.log_to_mlflow:
            logger.info(
                f"Setting up MLflow experiment {self.experiment_name} - run {self.run_name}..."
            )
            # Stored under an underscore-prefixed name, so it is not one of the
            # declared model fields; it only exists when log_to_mlflow is True.
            self._mlf_logger = MLFlowLogger(
                experiment_name=self.experiment_name,
                run_name=self.run_name,
                tracking_uri=mlflow_uri,
                log_model=True,
            )

        if self.device is None:
            # Preference order: CUDA, then Apple MPS, then CPU.
            if torch.cuda.is_available():
                self.device = "cuda"
            elif torch.backends.mps.is_available():
                self.device = "mps"
            else:
                self.device = "cpu"

        return self


args = Args().init()

# Apply the configured seed: without this, args.random_seed is never used and the
# random draws further down (DataFrame.sample, DataLoader shuffling, weight
# initialisation) differ from run to run.
L.seed_everything(args.random_seed)

print(args.model_dump_json(indent=2))

[32m2025-07-01 00:15:40.248[0m | [34m[1mDEBUG   [0m | [36mcfg.run_cfg[0m:[36minit[0m:[36m31[0m - [34m[1mSetting use_sbert_features=True requires running notebook 016-sentence-transformers[0m
[32m2025-07-01 00:15:40.249[0m | [34m[1mDEBUG   [0m | [36mcfg.run_cfg[0m:[36minit[0m:[36m38[0m - [34m[1mChanging use_item_tags_from_llm requires re-running notebook 002-features-v2 to get the new item_metadata_pipeline.dill file[0m
[32m2025-07-01 00:15:40.252[0m | [1mINFO    [0m | [36m__main__[0m:[36minit[0m:[36m62[0m - [1mSetting up MLflow experiment RecSys MVP - Ranker - run 004-use-sbert-features-and-llm-tags...[0m


{
  "testing": false,
  "author": "dinh-trieu",
  "log_to_mlflow": true,
  "experiment_name": "RecSys MVP - Ranker",
  "run_name": "004-use-sbert-features-and-llm-tags",
  "notebook_persist_dp": "/home/dinhln/Desktop/real_time_recsys/notebooks/data/004-use-sbert-features-and-llm-tags",
  "random_seed": 41,
  "device": "cuda",
  "rc": {
    "use_sbert_features": true,
    "use_item_tags_from_llm": false,
    "item_feature_cols": [
      "main_category",
      "categories",
      "price",
      "parent_asin_rating_cnt_365d",
      "parent_asin_rating_avg_prev_rating_365d",
      "parent_asin_rating_cnt_90d",
      "parent_asin_rating_avg_prev_rating_90d",
      "parent_asin_rating_cnt_30d",
      "parent_asin_rating_avg_prev_rating_30d",
      "parent_asin_rating_cnt_7d",
      "parent_asin_rating_avg_prev_rating_7d"
    ]
  },
  "item_metadata_pipeline_fp": "../data_for_ai/interim/item_metadata_pipeline_wo_user_item_manipulate.dill",
  "qdrant_url": "138.2.61.6:6333",
  "qdrant_collecti

# Implement

In [7]:
def init_model(
    n_users,
    n_items,
    embedding_dim,
    item_sequence_ts_bucket_size,
    bucket_embedding_dim,
    item_feature_size,
    dropout,
    item_embedding=None,
    use_item_feature=False,
):
    """Build and return a ``Ranker`` from the given sizes and options.

    ``n_users``, ``n_items`` and ``embedding_dim`` are forwarded positionally;
    every other argument is forwarded to ``Ranker`` under its own name.
    """
    ranker_options = dict(
        item_sequence_ts_bucket_size=item_sequence_ts_bucket_size,
        bucket_embedding_dim=bucket_embedding_dim,
        use_item_feature=use_item_feature,
        item_feature_size=item_feature_size,
        dropout=dropout,
        item_embedding=item_embedding,
    )
    return Ranker(n_users, n_items, embedding_dim, **ranker_options)

## Load pretrained Item2Vec embeddings

In [8]:
# Load the pretrained Item2Vec (SkipGram) checkpoint and copy its item vectors into a
# new embedding table that has one extra row reserved as the padding index.
# NOTE(review): the saved output of this cell shows a FileNotFoundError for the
# checkpoint path below, and n_items is hard-coded — confirm both before re-running.
n_items = 4817  # This should be the number of unique items in your dataset
assert args.embedding_dim == 256, "Embedding dimension must be 256"
best_trainer = LitSkipGram.load_from_checkpoint(
    "../data_for_ai/interim/best-item2vec-weight.ckpt",
    skipgram_model=SkipGram(n_items, args.embedding_dim).to(args.device),
)
# Detach the learned weights and move them to CPU before copying.
skipgram_item_embedding = best_trainer.skipgram_model.embeddings.weight.data.cpu()
print(f"SkipGram Item embedding shape: {skipgram_item_embedding.shape}")
print(f"SkipGram Item embedding dtype: {skipgram_item_embedding.dtype}")

# Create an embedding layer with n_items + 1 rows, then copy the pretrained weights into it
pretrained_item_embedding = torch.nn.Embedding(
    num_embeddings=n_items + 1,  # +1 for the unknown item (-1 padding)
    embedding_dim=args.embedding_dim,
    padding_idx=n_items,  # Set padding_idx to the last index
)
pretrained_item_embedding.weight.data[:n_items] = skipgram_item_embedding[:n_items]
# Explicitly zero the extra (padding) row.
pretrained_item_embedding.weight.data[n_items] = torch.zeros(
    args.embedding_dim, dtype=skipgram_item_embedding.dtype
)

[32m2025-07-01 00:15:40.338[0m | [1mINFO    [0m | [36msrc.algo.item2vec.model[0m:[36m__init__[0m:[36m12[0m - [1mInitializing item embeddings with num items 4817, embedding dim 256[0m


FileNotFoundError: [Errno 2] No such file or directory: '/home/dinhln/Desktop/real_time_recsys/notebooks/../data_for_ai/interim/best-item2vec-weight.ckpt'

In [None]:
# mlf_client = mlflow.MlflowClient()
# model = mlflow.pyfunc.load_model(
#     model_uri=f"models:/{args.mlf_item2vec_model_name}@champion"
# )
# skipgram_model = model.unwrap_python_model().model
# embedding_0 = skipgram_model.embeddings(torch.tensor(0))
# embedding_dim = embedding_0.size()[0]
# id_mapping = model.unwrap_python_model().id_mapping
# pretrained_item_embedding = skipgram_model.embeddings

In [None]:
# The pretrained table must have the same width the ranker is configured with.
pretrained_dim = pretrained_item_embedding.embedding_dim
assert pretrained_dim == args.embedding_dim, "Mismatch pretrained item_embedding dimension"

## Load vectorized item features

In [None]:
# Load the item-metadata pipeline object; it is applied to the interaction frames
# further down to produce the item feature matrices.
# NOTE(security): dill.load can execute arbitrary code while unpickling — only load
# pipeline files that come from a trusted run.
with open(args.item_metadata_pipeline_fp, "rb") as f:
    item_metadata_pipeline = dill.load(f)

## Load ANN Index

In [None]:
# if args.rc.use_sbert_features:
#     ann_index = AnnIndex(args.qdrant_url, args.qdrant_collection_name)
#     vector = ann_index.get_vector_by_ids([0])[0]
#     sbert_embedding_dim = vector.shape[0]
#     logger.info(f"{sbert_embedding_dim=}")
#     neighbors = ann_index.get_neighbors_by_ids([0])
#     display(neighbors)

# Prep data

In [None]:
# Read the train / validation interaction samples and the ID mapper that goes with them.
train_df = pd.read_parquet("../data_for_ai/interim/train_sample_interactions_16407u_features_neg_seq_without_stats_item_user.parquet")
val_df = pd.read_parquet("../data_for_ai/interim/val_sample_interactions_16407u_features_neg_seq_without_stats_item_user.parquet")
idm_fp = "../data_for_ai/interim/idm_16407u.json"
idm = IDMapper().load(idm_fp)

# Every stored user_indice must equal what the ID mapper returns for that row's user id.
for interactions_df in (train_df, val_df):
    mapped_user_indices = interactions_df[args.user_col].map(idm.get_user_index)
    assert mapped_user_indices.eq(interactions_df["user_indice"]).all(), "Mismatch IDM"

# LLM-generated tags are only required when the run configuration asks for them.
if args.rc.use_item_tags_from_llm:
    has_tags_column = "tags" in train_df.columns
    assert (
        has_tags_column
    ), "There is no column `tags` in train_df, please make sure you have run notebook 002, 020 with RunCfg.use_item_tags_from_llm=True"

4817 items in the dataset


In [None]:
# Preview the first rows of the training interactions.
train_df.head()

Unnamed: 0,user_id,parent_asin,rating,timestamp,timestamp_unix,user_indice,item_indice,item_sequence,item_sequence_ts,item_sequence_ts_bucket,main_category,title,description,categories,price
0,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B00SG3CWGS,0.0,2017-06-10 00:30:32.698,1497054632,2546,4213,"[-1, -1, -1, -1, -1, -1, -1, -1, 218, 2648]","[-1, -1, -1, -1, -1, -1, -1, -1, 1457886402, 1...","[-1, -1, -1, -1, -1, -1, -1, -1, 6, 0]",Cell Phones & Accessories,Garmin Nuvi 67LMT 6-Inch GPS Navigator,"[With bright 6” dual-orientation displays, spo...","[Electronics, GPS, Finders & Accessories, Spor...",199.0
1,AEMPVT2U6BIHQDV52BDEDDKPH4HA,B01BCWKBZI,2.0,2017-08-03 00:40:30.172,1501720830,2416,2467,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",Computers,Samsung T3 Portable SSD - 2TB - USB 3.1 Extern...,[Portability is the key element shared among a...,"[Electronics, Computers & Accessories, Data St...",348.69
2,AF3CKYP3BTJ7MEKU6J64BS57MQBA,B09BW3XJQV,0.0,2018-12-08 16:57:03.101,1544288223,4292,1208,"[-1, -1, -1, -1, 3541, 3089, 4168, 3936, 4066,...","[-1, -1, -1, -1, 1488569087, 1499723220, 15334...","[-1, -1, -1, -1, 6, 6, 5, 5, 4, 4]",Computers,ASUS AC1300 WiFi Router (RT-ACRH13) - Dual Ban...,[Upgrade to AC Wi-Fi for your bandwidth-hungry...,"[Electronics, Computers & Accessories, Network...",
3,AE7IGXXTK7XTWRJGLIAL5BJDTEAQ,B005L38VRU,5.0,2014-09-04 02:03:39.000,1409796219,728,689,"[-1, -1, -1, -1, -1, -1, 193, 3945, 1849, 4407]","[-1, -1, -1, -1, -1, -1, 1327177801, 133520743...","[-1, -1, -1, -1, -1, -1, 6, 6, 6, 5]",All Electronics,Logitech K750 Wireless Solar Keyboard for Mac ...,[Battery hassles are a thing of the past with ...,"[Electronics, Computers & Accessories, Compute...",49.99
4,AFEJ5GRYG2PQD6EWSAKVG56XMKNA,B001W6Q7SU,0.0,2016-09-14 16:29:39.000,1473870579,5481,834,"[-1, -1, -1, -1, -1, -1, -1, 3965, 4617, 2003]","[-1, -1, -1, -1, -1, -1, -1, 1473870313, 14738...","[-1, -1, -1, -1, -1, -1, -1, 0, 0, 0]",All Electronics,PNY Optima 2GB (2x1GB) Dual Channel Kit DDR2 6...,[PNY OPTIMA 2GB (2x1GB) Dual Channel Kit DDR2 ...,"[Electronics, Computers & Accessories, Compute...",65.99


In [None]:
# Print the frame's dimensions, then preview its first rows.
print(train_df.shape)
train_df.head()

In [None]:
# Distinct user / item indices that occur in the training interactions.
user_indices = train_df["user_indice"].unique()
item_indices = train_df["item_indice"].unique()

# Run each split through the item-metadata pipeline in chunks and cast to float32.
train_item_features, val_item_features = (
    chunk_transform(
        split_df, item_metadata_pipeline, chunk_size=args.tfm_chunk_size
    ).astype(np.float32)
    for split_df in (train_df, val_df)
)

logger.info(f"{len(user_indices)=:,.0f}, {len(item_indices)=:,.0f}")

In [None]:
# Display the dimensions of the validation item feature matrix.
val_item_features.shape

# Train

In [None]:
# Wrap both splits in datasets that pair each interaction row with its item features.
rating_dataset = UserItemBinaryDFDataset(
    train_df,
    "user_indice",
    "item_indice",
    args.rating_col,
    args.timestamp_col,
    item_feature=train_item_features,
)
val_rating_dataset = UserItemBinaryDFDataset(
    val_df,
    "user_indice",
    "item_indice",
    args.rating_col,
    args.timestamp_col,
    item_feature=val_item_features,
)

# Training batches are reshuffled every epoch and the last incomplete batch is dropped.
train_loader = DataLoader(
    rating_dataset, batch_size=args.batch_size, shuffle=True, drop_last=True
)
# Validation is iterated in a fixed order and keeps every row: shuffling adds nothing
# to evaluation and makes batch-level results differ between runs.
val_loader = DataLoader(
    val_rating_dataset, batch_size=args.batch_size, shuffle=False, drop_last=False
)

In [None]:
# Size the model from the distinct users / items found in the training split.
# NOTE(review): n_items was set to a hard-coded value in the Item2Vec cell and is
# rebound here; the pretrained embedding table was sized with the earlier value —
# confirm the two numbers agree.
n_items = len(item_indices)
n_users = len(user_indices)
# Number of columns in the transformed item feature matrix.
item_feature_size = train_item_features.shape[1]

# This instance is built without item_embedding, i.e. without the pretrained table.
model = init_model(
    n_users,
    n_items,
    args.embedding_dim,
    args.item_sequence_ts_bucket_size,
    args.bucket_embedding_dim,
    item_feature_size,
    args.dropout,
    use_item_feature=args.use_item_feature,
)
# Display the padding index of the model's item embedding.
model.item_embedding.padding_idx

In [None]:
# Print a single validation batch (if there is one) to inspect its structure.
first_val_batch = next(iter(val_loader), None)
if first_val_batch is not None:
    print(first_val_batch)

#### Predict before train

In [None]:
# Print the validation frame's dimensions, then preview its first rows.
print(val_df.shape)
val_df.head()

In [None]:
# Rebind val_df to the frame held by the validation dataset, so every later use of
# val_df refers to the dataset's copy rather than the frame read from parquet.
# NOTE(review): whether the dataset modifies the frame is not visible here — confirm.
val_df = val_rating_dataset.df
# Show ten random rows.
val_df.sample(10)

In [None]:
# Pick one random validation row's user and show all of that user's rows untruncated.
user_id = val_df.sample(1)[args.user_col].values[0]
is_sampled_user = val_df[args.user_col].eq(user_id)
test_df = val_df.loc[is_sampled_user]
with pd.option_context("display.max_colwidth", None):
    display(test_df)

In [None]:
# Compare the dimensions of the validation and training item feature matrices.
val_item_features.shape, train_item_features.shape

In [None]:
# Smoke-test the untrained model on one row of the sampled user with rating > 0.
# .iloc[0] raises IndexError when the sampled user has no such row.
test_row = test_df.loc[lambda df: df[args.rating_col].gt(0)].iloc[0]
item_id = test_row[args.item_col]
item_sequence = test_row["item_sequence"]
item_sequence_ts_bucket = test_row["item_sequence_ts_bucket"]
# test_row.name is the row's index label; it is used below as a position into
# val_item_features.
# NOTE(review): this assumes val_df's index labels match row positions — confirm.
row_idx = test_row.name
item_feature = val_item_features[row_idx]
logger.info(
    f"Test predicting before training with {args.user_col} = {user_id} and {args.item_col} = {item_id}"
)
# Translate the string ids to indices through the ID mapper.
user_indice = idm.get_user_index(user_id)
item_indice = idm.get_item_index(item_id)
# Wrap every input in a one-element list so each tensor has a leading dimension of 1.
user = torch.tensor([user_indice])
item_sequence = torch.tensor([item_sequence])
item_sequence_ts_bucket = torch.tensor([item_sequence_ts_bucket])
item_feature = torch.tensor([item_feature])
item = torch.tensor([item_indice])

# Predict in eval mode, then switch back to train mode.
model.eval()
model.predict(user, item_sequence, item_sequence_ts_bucket, item_feature, item)
model.train()

#### Training loop

##### Overfit 1 batch

In [None]:
# Build the objects for a one-batch overfit check. The Trainer that would fit them is
# commented out below, so this cell only constructs the objects.
# Stop when val_loss has not improved (decreased) for 10 validation checks.
early_stopping = EarlyStopping(
    monitor="val_loss", patience=10, mode="min", verbose=False
)

# Fresh model without the pretrained item embedding.
model = init_model(
    n_users,
    n_items,
    args.embedding_dim,
    args.item_sequence_ts_bucket_size,
    args.bucket_embedding_dim,
    item_feature_size,
    dropout=args.dropout,
    use_item_feature=args.use_item_feature,
)
# l2_reg is set to 0.0 here rather than args.l2_reg.
lit_model = LitRanker(
    model,
    learning_rate=args.learning_rate,
    l2_reg=0.0,
    log_dir=args.notebook_persist_dp,
    accelerator=args.device,
)

log_dir = f"{args.notebook_persist_dp}/logs/overfit"

# # train model
# trainer = L.Trainer(
#     default_root_dir=log_dir,
#     accelerator=args.device if args.device else "auto",
#     max_epochs=2,
#     overfit_batches=1,
#     callbacks=[early_stopping],
# )
# trainer.fit(
#     model=lit_model,
#     train_dataloaders=train_loader,
#     val_dataloaders=val_loader,

# )
# logger.info(f"Logs available at {trainer.log_dir}")

In [None]:
# Need to make sure port 6006 at local is accessible
# %tensorboard --logdir $trainer.log_dir

##### Fit on all data

In [None]:
# Class balance of the training interactions: rows rated 0 versus rows rated >= 1.
# The second label previously read "rating = 1" although the filter counts every
# rating of 1.0 or higher.
rating_values = train_df[args.rating_col]
n_zero_rating = int(rating_values.eq(0.0).sum())
n_positive_rating = int(rating_values.ge(1.0).sum())
print(f"Number of rows in train_df that has rating = 0: {n_zero_rating}")
print(f"Number of rows in train_df that has rating >= 1: {n_positive_rating}")
print(f"Number of rows in train_df: {train_df.shape[0]}")

In [None]:
# Inspect every training row of one specific user, most recent interaction first.
user_id = "AF5KKBAOVY7J7LGPHAECKUTDQVTA"
user_rows = train_df[train_df[args.user_col] == user_id]
print(f"Number of rows for user {user_id}: {user_rows.shape[0]}")
user_df = user_rows.sort_values(by=args.timestamp_col, ascending=False)
user_df

In [None]:
# Keep one row per item — the most recent one — by sorting on timestamp (newest first)
# and dropping later duplicates of the same item.
all_items_df = train_df.sort_values(
    by=args.timestamp_col, ascending=False
).drop_duplicates(subset=[args.item_col], keep="first")
all_items_indices = all_items_df["item_indice"].values
# Transform the de-duplicated frame so that row i of the feature matrix belongs to
# all_items_indices[i]. Transforming the full, non-de-duplicated frame (as before)
# produced one feature row per interaction and broke that alignment.
all_items_features = item_metadata_pipeline.transform(all_items_df).astype(np.float32)
assert all_items_features.shape[0] == len(
    all_items_indices
), "Item indices and item features must have one row per item"
logger.info(
    f"Mean std over categorical and numerical features: {all_items_features.std(axis=0).mean()}"
)
# if args.rc.use_sbert_features:
#     all_sbert_vectors = ann_index.get_vector_by_ids(all_items_indices.tolist()).astype(
#         np.float32
#     )
#     logger.info(f"Mean std over text features: {all_sbert_vectors.std(axis=0).mean()}")
#     all_items_features = np.hstack([all_items_features, all_sbert_vectors])

In [None]:
# all_items_df = train_df.drop_duplicates(subset=["item_indice"])
# all_items_indices = all_items_df["item_indice"].values
# all_items_features = item_metadata_pipeline.transform(all_items_df).astype(np.float32)
# logger.info(
#     f"Mean std over categorical and numerical features: {all_items_features.std(axis=0).mean()}"
# )
# # if args.rc.use_sbert_features:
# #     all_sbert_vectors = ann_index.get_vector_by_ids(all_items_indices.tolist()).astype(
# #         np.float32
# #     )
# #     logger.info(f"Mean std over text features: {all_sbert_vectors.std(axis=0).mean()}")
# #     all_items_features = np.hstack([all_items_features, all_sbert_vectors])

In [None]:
# papermill_description=fit-model
# Callbacks and model for the full training run.
# Stop when val_roc_auc has not improved (increased) for the configured patience.
early_stopping = EarlyStopping(
    monitor="val_roc_auc", patience=args.early_stopping_patience, mode="max", verbose=False
)

# Keep only the single checkpoint with the highest val_roc_auc.
checkpoint_callback = ModelCheckpoint(
    dirpath=f"{args.notebook_persist_dp}/checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    monitor="val_roc_auc",
    mode="max",
)

# Unlike the earlier instances, this model receives the pretrained item embedding.
model = init_model(
    n_users,
    n_items,
    args.embedding_dim,
    args.item_sequence_ts_bucket_size,
    args.bucket_embedding_dim,
    item_feature_size,
    dropout=args.dropout,
    item_embedding=pretrained_item_embedding,
    use_item_feature=args.use_item_feature,
)
# The item indices / features computed above are handed to the Lightning module
# together with evaluate_ranking=True.
# NOTE(review): how LitRanker uses them is not visible in this notebook — confirm
# that it expects one feature row per entry of all_items_indices.
lit_model = LitRanker(
    model,
    learning_rate=args.learning_rate,
    l2_reg=args.l2_reg,
    log_dir=args.notebook_persist_dp,
    evaluate_ranking=True,
    idm=idm,
    all_items_indices=all_items_indices,
    all_items_features=all_items_features,
    args=args,
    neg_to_pos_ratio=args.neg_to_pos_ratio,
    checkpoint_callback=checkpoint_callback,
    accelerator=args.device,
)

In [None]:
# Root directory handed to the Trainer for this run.
log_dir = f"{args.notebook_persist_dp}/logs/run"

# train model
# The MLflow logger created in Args.init() is used when MLflow logging is enabled;
# otherwise logger=None is passed and the Trainer applies its own default.
trainer = L.Trainer(
    default_root_dir=log_dir,
    max_epochs=args.max_epochs,
    callbacks=[early_stopping, checkpoint_callback],
    accelerator=args.device if args.device else "auto",
    logger=args._mlf_logger if args.log_to_mlflow else None,
)
trainer.fit(
    model=lit_model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

In [None]:
# Print the number of unique items in val_df (val_df was rebound to
# val_rating_dataset.df in an earlier cell).
print(f"Number of unique items in val_df: {val_df['item_indice'].nunique()}")

In [None]:
# logger.info(
#     f"Test predicting after training with {args.user_col} = {user_id} and {args.item_col} = {item_id}"
# )
# model.eval()
# model = model.to(user.device)  # Move model back to align with data device
# model.predict(user, item_sequence, item_sequence_ts_bucket, item_feature, item)
# model.train()

# Load best checkpoint

In [None]:
# Restore the best checkpoint recorded by the checkpoint callback into a new model.
logger.info(f"Loading best checkpoint from {checkpoint_callback.best_model_path}...")
# Remember the path on args so it is included when the run parameters are logged.
args.best_checkpoint_path = checkpoint_callback.best_model_path

# The model is rebuilt with the same sizes as for training, but with dropout=0.
# NOTE(review): the same pretrained_item_embedding object that was passed to the
# training model is passed again here — confirm that sharing it is intended.
# Note that the name best_trainer was used earlier for the Item2Vec module and is
# rebound here.
best_trainer = LitRanker.load_from_checkpoint(
    checkpoint_callback.best_model_path,
    model=init_model(
        n_users,
        n_items,
        args.embedding_dim,
        args.item_sequence_ts_bucket_size,
        args.bucket_embedding_dim,
        item_feature_size,
        dropout=0,
        item_embedding=pretrained_item_embedding,
        use_item_feature=args.use_item_feature,
    ),
)

In [None]:
# best_trainer = LitRanker.load_from_checkpoint(
#     "C:/Users/Trieu/Downloads/best-checkpoint (2).ckpt",
#     model=init_model(
#         n_users,
#         n_items,
#         args.embedding_dim,
#         args.item_sequence_ts_bucket_size,
#         args.bucket_embedding_dim,
#         item_feature_size,
#         dropout=0,
#         item_embedding=pretrained_item_embedding,
#     ),
# )

## testing after train

In [None]:
# user_id = val_df.sample(1)[args.user_col].values[0]
# test_df = val_df.loc[lambda df: df[args.user_col].eq(user_id)]
# # with pd.option_context("display.max_colwidth", None):
# #     display(test_df)
# test_df.shape

In [None]:
# test_row = test_df.loc[lambda df: df[args.rating_col].eq(0)].iloc[0]
# item_id = test_row[args.item_col]
# item_sequence = test_row["item_sequence"]
# item_sequence_ts_bucket = test_row["item_sequence_ts_bucket"]
# row_idx = test_row.name
# item_feature = val_item_features[row_idx]
# logger.info(
#     f"Test predicting before training with {args.user_col} = {user_id} and {args.item_col} = {item_id}"
# )
# user_indice = idm.get_user_index(user_id)
# item_indice = idm.get_item_index(item_id)
# user = torch.tensor([user_indice])
# item_sequence = torch.tensor([item_sequence])
# item_sequence_ts_bucket = torch.tensor([item_sequence_ts_bucket])
# item_feature = torch.tensor([item_feature])
# item = torch.tensor([item_indice])

# # print the information of the user and item and rating we are testing as a row in dataframe
# user_df = pd.DataFrame({
#     args.user_col: [user_id],
#     args.item_col: [item_id],
#     args.rating_col: [test_row[args.rating_col]],
#     "item_sequence": [item_sequence.tolist()],
#     "item_sequence_ts_bucket": [item_sequence_ts_bucket],
#     "item_feature": [item_feature.tolist()],
# })
# user_df


In [None]:
# Take the restored model out of its Lightning wrapper and move it to the device
# that lit_model is on.
best_model = best_trainer.model.to(lit_model.device)

In [None]:
# best_model.eval()
# best_model.predict(user, item_sequence, item_sequence_ts_bucket, item_feature, item)

### Persist artifacts

In [None]:
# Attach the files needed at inference time to the MLflow run of this training.
if args.log_to_mlflow:
    # Persist id_mapping so that at inference we can predict based on item_ids (string) instead of item_index
    run_id = trainer.logger.run_id
    # mlf_client is also used by the champion-alias cell further down.
    mlf_client = trainer.logger.experiment
    mlf_client.log_artifact(run_id, idm_fp)
    # Persist item_feature_metadata pipeline
    mlf_client.log_artifact(run_id, args.item_metadata_pipeline_fp)

    # Persist model architecture
    # The repr of `model` (the instance that was trained) is written to a text file
    # and logged as an artifact.
    model_architecture_fp = f"{args.notebook_persist_dp}/model_architecture.txt"
    with open(model_architecture_fp, "w") as f:
        f.write(repr(model))
    mlf_client.log_artifact(run_id, model_architecture_fp)

### Wrap inference function and register best checkpoint as MLflow model

In [None]:
# Wrap the restored best model in the inference wrapper that is logged to MLflow below.
inferrer = RankerInferenceWrapper(best_model)

In [None]:
def generate_sample_item_features():
    """Return example item feature values taken from the first training row.

    The result maps every column in ``args.rc.item_feature_cols`` to a
    one-element list. Missing values in the row are replaced by 0, and numpy
    array values are flattened into a single ``"__"``-joined string.
    """
    first_row = train_df.iloc[0].fillna(0)

    def _as_single_value(value):
        # Workaround to avoid MLflow Got error: Per-column arrays must each be 1-dimensional
        if isinstance(value, np.ndarray):
            return "__".join(value.tolist())
        return value

    return {
        col: [_as_single_value(first_row[col])] for col in args.rc.item_feature_cols
    }

In [None]:
# Example request used for the MLflow model signature / input example: string ids,
# comma-joined sequence fields and one value per item feature column.
sample_input = {
    args.user_col: [idm.get_user_id(0)],
    "item_sequence": [",".join([idm.get_item_id(0), idm.get_item_id(1)])],
    # NOTE(review): the second timestamp below has one digit fewer than the first —
    # confirm that this is intended.
    "item_sequence_ts": [
        "1095133116,109770848"
    ],  # Here we input unix timestamp seconds instead of timestamp bucket because we need to calculate the bucket
    # **{col: [train_df.iloc[0].fillna(0)[col]] for col in args.item_feature_cols},
    **generate_sample_item_features(),
    args.item_col: [idm.get_item_id(0)],
}
# Matching example response, produced by calling the wrapper directly with indices
# and the first row of the training item feature matrix.
sample_output = inferrer.infer([0], [[0, 1]], [[2, 0]], [train_item_features[0]], [0])
sample_output

In [None]:
# Display the example request built above.
sample_input

In [None]:
# Log the inference wrapper to the training run as a pyfunc model and register it.
if args.log_to_mlflow:
    run_id = trainer.logger.run_id
    sample_output_np = sample_output
    # The signature is inferred from the example request / response pair.
    signature = infer_signature(sample_input, sample_output_np)
    # Artifact names are the bare file names of the two source paths.
    idm_filename = os.path.basename(idm_fp)
    item_metadata_pipeline_filename = os.path.basename(args.item_metadata_pipeline_fp)
    with mlflow.start_run(run_id=run_id):
        # We log the id_mapping to the predict function so that it can accept item_id and automatically convert to item_indice for PyTorch model to use
        inferrer_artifacts = {
            "idm": mlflow.get_artifact_uri(idm_filename),
            "item_metadata_pipeline": mlflow.get_artifact_uri(
                item_metadata_pipeline_filename
            ),
        }
        mlflow.pyfunc.log_model(
            python_model=inferrer,
            artifact_path="inferrer",
            artifacts=inferrer_artifacts,
            model_config={"use_sbert_features": args.rc.use_sbert_features},
            signature=signature,
            input_example=sample_input,
            registered_model_name=args.mlf_model_name,
        )

# Set the newly trained model as champion

In [None]:
# Decide whether the model registered by this run takes over the deployment alias:
# its ROC-AUC must reach args.min_roc_auc and, if a champion already exists, the
# champion's ROC-AUC as well.
if args.log_to_mlflow:
    # Get current champion
    deploy_alias = "champion"
    curr_model_run_id = None

    # Threshold the new run has to reach; raised below to the champion's score if higher.
    min_roc_auc = args.min_roc_auc

    try:
        curr_champion_model = mlf_client.get_model_version_by_alias(
            args.mlf_model_name, deploy_alias
        )
        curr_model_run_id = curr_champion_model.run_id
    except MlflowException as e:
        # Only a missing model / alias means "no champion yet". Any other MLflow
        # failure is re-raised: swallowing it would skip the comparison and could
        # promote this run unchecked.
        if "not found" not in str(e).lower():
            raise
        logger.info(
            f"There is no {deploy_alias} alias for model {args.mlf_model_name}"
        )

    # Compare new vs curr models
    new_mlf_run = trainer.logger.experiment.get_run(trainer.logger.run_id)
    new_metrics = new_mlf_run.data.metrics
    roc_auc = new_metrics["roc_auc"]
    if curr_model_run_id:
        curr_model_run_info = mlf_client.get_run(curr_model_run_id)
        curr_metrics = curr_model_run_info.data.metrics
        if (curr_roc_auc := curr_metrics["roc_auc"]) > min_roc_auc:
            logger.info(
                f"Current {deploy_alias} model has {curr_roc_auc:,.4f} ROC-AUC..."
            )
            min_roc_auc = curr_roc_auc

        top_metrics = ["roc_auc", "val_PersonalizationMetric"]
        # NOTE(review): ModelMetricsComparisonVisualizer is not imported in this
        # notebook's import cell — confirm where it is defined and import it,
        # otherwise this branch raises NameError.
        vizer = ModelMetricsComparisonVisualizer(curr_metrics, new_metrics, top_metrics)
        print("Comparing metrics between new run and current champion:")
        display(vizer.compare_metrics_df())
        vizer.create_metrics_comparison_plot(n_cols=5)
        vizer.plot_diff()

    # Register new champion
    if roc_auc < min_roc_auc:
        logger.info(
            f"Current run has ROC-AUC = {roc_auc:,.4f}, smaller than {min_roc_auc:,.4f}. Skip aliasing this model as the new {deploy_alias}.."
        )
    else:
        logger.info("Aliasing the new model as champion...")
        # Get the model version for current run by assuming it's the most recent registered version
        model_version = (
            mlf_client.get_registered_model(args.mlf_model_name)
            .latest_versions[0]
            .version
        )

        # Use the same alias variable that was used for the lookup above.
        mlf_client.set_registered_model_alias(
            name=args.mlf_model_name, alias=deploy_alias, version=model_version
        )

        mlf_client.set_model_version_tag(
            name=args.mlf_model_name,
            version=model_version,
            key="author",
            value=args.author,
        )

# Clean up

In [None]:
# Log every field of the run configuration as MLflow parameters on the training run.
all_params = [args]

if args.log_to_mlflow:
    # top_K / top_k are logged under distinct names.
    # NOTE(review): presumably because the two keys differ only by case — confirm.
    renamed_keys = {"top_K": "top_big_K", "top_k": "top_small_k"}
    with mlflow.start_run(run_id=run_id):
        for params in all_params:
            # model_dump() is the pydantic v2 replacement for the deprecated .dict();
            # the notebook already uses the v2 API (model_dump_json) elsewhere.
            params_ = {
                f"{params.__repr_name__()}.{renamed_keys.get(k, k)}": v
                for k, v in params.model_dump().items()
            }
            mlflow.log_params(params_)