# Ranker that can takes into accound different features

# Set up

In [1]:
%load_ext autoreload
%autoreload 2
%load_ext tensorboard

In [2]:
import os
import sys

import dill
import lightning as L
import numpy as np
import pandas as pd
import torch
from dotenv import load_dotenv
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.loggers import MLFlowLogger
from loguru import logger
from mlflow.exceptions import MlflowException
from mlflow.models.signature import infer_signature
from pydantic import BaseModel
from torch.utils.data import DataLoader

import mlflow

load_dotenv()

sys.path.insert(0, "..")

from cfg.run_cfg import RunCfg
# from src.ann import AnnIndex
from src.utils.data_prep import chunk_transform
from src.algo.ranker.dataset import UserItemBinaryDFDataset
from src.utils.embedding_id_mapper import IDMapper
from src.algo.ranker.inference import RankerInferenceWrapper
from src.algo.ranker.model import Ranker
from src.algo.ranker.trainer import LitRanker
from src.algo.item2vec.trainer import LitSkipGram
from src.algo.item2vec.model import SkipGram



# Controller

In [3]:
# This is a parameter cell used by papermill
max_epochs = 1

In [None]:
class Args(BaseModel):
    testing: bool = False
    author: str = "dinh-trieu"
    log_to_mlflow: bool = False
    experiment_name: str = "RecSys MVP - Ranker"
    run_name: str = "004-use-sbert-features-and-llm-tags"
    notebook_persist_dp: str = None
    random_seed: int = 41
    device: str = None

    rc: RunCfg = RunCfg().init()

    item_metadata_pipeline_fp: str = "../data_for_ai/interim/item_metadata_pipeline.dill"
    qdrant_url: str = None
    qdrant_collection_name: str = "item_desc_sbert"

    max_epochs: int = max_epochs
    batch_size: int = 128
    tfm_chunk_size: int = 10000
    neg_to_pos_ratio: int = 1

    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    top_K: int = 100
    top_k: int = 10

    embedding_dim: int = 256
    item_sequence_ts_bucket_size: int = 10
    bucket_embedding_dim: int = 16
    dropout: float = 0.3
    early_stopping_patience: int = 5
    learning_rate: float = 0.001
    l2_reg: float = 1e-5

    mlf_item2vec_model_name: str = "item2vec"
    mlf_model_name: str = "ranker"
    min_roc_auc: float = 0.7

    best_checkpoint_path: str = None

    def init(self):
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        os.makedirs(self.notebook_persist_dp, exist_ok=True)

        if not (qdrant_host := os.getenv("QDRANT_HOST")):
            raise Exception(f"Environment variable QDRANT_HOST is not set.")

        qdrant_port = os.getenv("QDRANT_PORT")
        self.qdrant_url = f"{qdrant_host}:{qdrant_port}"

        if not (mlflow_uri := os.environ.get("MLFLOW_TRACKING_URI")):
            logger.warning(
                f"Environment variable MLFLOW_TRACKING_URI is not set. Setting self.log_to_mlflow to false."
            )
            self.log_to_mlflow = False

        if self.log_to_mlflow:
            logger.info(
                f"Setting up MLflow experiment {self.experiment_name} - run {self.run_name}..."
            )
            self._mlf_logger = MLFlowLogger(
                experiment_name=self.experiment_name,
                run_name=self.run_name,
                tracking_uri=mlflow_uri,
                log_model=True,
            )

        if self.device is None:
            self.device = (
                "cuda"
                if torch.cuda.is_available()
                else "mps" if torch.backends.mps.is_available() else "cpu"
            )

        return self


args = Args().init()

print(args.model_dump_json(indent=2))

[32m2025-06-28 09:45:01.710[0m | [34m[1mDEBUG   [0m | [36mcfg.run_cfg[0m:[36minit[0m:[36m31[0m - [34m[1mSetting use_sbert_features=True requires running notebook 016-sentence-transformers[0m
[32m2025-06-28 09:45:01.725[0m | [34m[1mDEBUG   [0m | [36mcfg.run_cfg[0m:[36minit[0m:[36m38[0m - [34m[1mChanging use_item_tags_from_llm requires re-running notebook 002-features-v2 to get the new item_metadata_pipeline.dill file[0m
[32m2025-06-28 09:45:01.730[0m | [1mINFO    [0m | [36m__main__[0m:[36minit[0m:[36m61[0m - [1mSetting up MLflow experiment RecSys MVP - Ranker - run 004-use-sbert-features-and-llm-tags...[0m


{
  "testing": false,
  "author": "dinh-trieu",
  "log_to_mlflow": true,
  "experiment_name": "RecSys MVP - Ranker",
  "run_name": "004-use-sbert-features-and-llm-tags",
  "notebook_persist_dp": "c:\\Users\\Trieu\\OneDrive\\Desktop\\recsys\\real_time_recsys\\notebooks\\data\\004-use-sbert-features-and-llm-tags",
  "random_seed": 41,
  "device": "cpu",
  "rc": {
    "use_sbert_features": true,
    "use_item_tags_from_llm": false,
    "item_feature_cols": [
      "main_category",
      "categories",
      "price",
      "parent_asin_rating_cnt_365d",
      "parent_asin_rating_avg_prev_rating_365d",
      "parent_asin_rating_cnt_90d",
      "parent_asin_rating_avg_prev_rating_90d",
      "parent_asin_rating_cnt_30d",
      "parent_asin_rating_avg_prev_rating_30d",
      "parent_asin_rating_cnt_7d",
      "parent_asin_rating_avg_prev_rating_7d"
    ]
  },
  "item_metadata_pipeline_fp": "../data_for_ai/interim/item_metadata_pipeline.dill",
  "qdrant_url": "138.2.61.6:6333",
  "qdrant_collec

# Implement

In [5]:
def init_model(
    n_users,
    n_items,
    embedding_dim,
    item_sequence_ts_bucket_size,
    bucket_embedding_dim,
    item_feature_size,
    dropout,
    item_embedding=None,
):
    model = Ranker(
        n_users,
        n_items,
        embedding_dim,
        item_sequence_ts_bucket_size=item_sequence_ts_bucket_size,
        bucket_embedding_dim=bucket_embedding_dim,
        item_feature_size=item_feature_size,
        dropout=dropout,
        item_embedding=item_embedding,
    )
    return model

## Load pretrained Item2Vec embeddings

In [6]:
n_items = 4817  # This should be the number of unique items in your dataset
assert args.embedding_dim == 256, "Embedding dimension must be 256"
best_trainer = LitSkipGram.load_from_checkpoint(
    "../data_for_ai/interim/best-item2vec-weight.ckpt",
    skipgram_model=SkipGram(n_items, args.embedding_dim).to(args.device),
)
skipgram_item_embedding = best_trainer.skipgram_model.embeddings.weight.data.cpu()
print(f"SkipGram Item embedding shape: {skipgram_item_embedding.shape}")
print(f"SkipGram Item embedding dtype: {skipgram_item_embedding.dtype}")

pretrained_item_embedding = torch.nn.Embedding.from_pretrained(
    skipgram_item_embedding[:n_items], freeze=False
)

[32m2025-06-28 09:45:02.413[0m | [1mINFO    [0m | [36msrc.algo.item2vec.model[0m:[36m__init__[0m:[36m12[0m - [1mInitializing item embeddings with num items 4817, embedding dim 256[0m


SkipGram Item embedding shape: torch.Size([4818, 256])
SkipGram Item embedding dtype: torch.float32



The loaded checkpoint was produced with Lightning v2.5.2, which is newer than your current Lightning version: v2.5.0



In [7]:
# mlf_client = mlflow.MlflowClient()
# model = mlflow.pyfunc.load_model(
#     model_uri=f"models:/{args.mlf_item2vec_model_name}@champion"
# )
# skipgram_model = model.unwrap_python_model().model
# embedding_0 = skipgram_model.embeddings(torch.tensor(0))
# embedding_dim = embedding_0.size()[0]
# id_mapping = model.unwrap_python_model().id_mapping
# pretrained_item_embedding = skipgram_model.embeddings

In [8]:
assert (
    pretrained_item_embedding.embedding_dim == args.embedding_dim
), "Mismatch pretrained item_embedding dimension"

## Load vectorized item features

In [9]:
with open(args.item_metadata_pipeline_fp, "rb") as f:
    item_metadata_pipeline = dill.load(f)

## Load ANN Index

In [10]:
# if args.rc.use_sbert_features:
#     ann_index = AnnIndex(args.qdrant_url, args.qdrant_collection_name)
#     vector = ann_index.get_vector_by_ids([0])[0]
#     sbert_embedding_dim = vector.shape[0]
#     logger.info(f"{sbert_embedding_dim=}")
#     neighbors = ann_index.get_neighbors_by_ids([0])
#     display(neighbors)

# Prep data

In [11]:
train_df = pd.read_parquet("../data_for_ai/interim/train_sample_interactions_16407u_features_neg_seq.parquet")
val_df = pd.read_parquet("../data_for_ai/interim/val_sample_interactions_16407u_features_neg_seq.parquet")
idm_fp = "../data_for_ai/interim/idm_16407u.json"
idm = IDMapper().load(idm_fp)

assert (
    train_df[args.user_col].map(lambda s: idm.get_user_index(s))
    != train_df["user_indice"]
).sum() == 0, "Mismatch IDM"
assert (
    val_df[args.user_col].map(lambda s: idm.get_user_index(s)) != val_df["user_indice"]
).sum() == 0, "Mismatch IDM"

if args.rc.use_item_tags_from_llm:
    assert (
        "tags" in train_df.columns
    ), "There is no column `tags` in train_df, please make sure you have run notebook 002, 020 with RunCfg.use_item_tags_from_llm=True"

4817 items in the dataset


In [12]:
train_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,timestamp_unix,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,...,user_indice,item_indice,item_sequence,item_sequence_ts,item_sequence_ts_bucket,main_category,title,description,categories,price
0,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B00SG3CWGS,0.0,2017-06-10 00:30:32.698,1497054632,10.0,4.500000,1.0,5.000000,0.0,...,2546,4213,"[-1, -1, -1, -1, -1, -1, -1, -1, 218, 2648]","[-1, -1, -1, -1, -1, -1, -1, -1, 1457886402, 1...","[-1, -1, -1, -1, -1, -1, -1, -1, 6, 0]",Cell Phones & Accessories,Garmin Nuvi 67LMT 6-Inch GPS Navigator,"[With bright 6” dual-orientation displays, spo...","[Electronics, GPS, Finders & Accessories, Spor...",199.0
1,AEMPVT2U6BIHQDV52BDEDDKPH4HA,B01BCWKBZI,2.0,2017-08-03 00:40:30.172,1501720830,16.0,4.187500,3.0,4.333333,2.0,...,2416,2467,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",Computers,Samsung T3 Portable SSD - 2TB - USB 3.1 Extern...,[Portability is the key element shared among a...,"[Electronics, Computers & Accessories, Data St...",348.69
2,AF3CKYP3BTJ7MEKU6J64BS57MQBA,B09BW3XJQV,0.0,2018-12-08 16:57:03.101,1544288223,5.0,3.400000,1.0,4.000000,1.0,...,4292,1208,"[-1, -1, -1, -1, 3541, 3089, 4168, 3936, 4066,...","[-1, -1, -1, -1, 1488569087, 1499723220, 15334...","[-1, -1, -1, -1, 6, 6, 5, 5, 4, 4]",Computers,ASUS AC1300 WiFi Router (RT-ACRH13) - Dual Ban...,[Upgrade to AC Wi-Fi for your bandwidth-hungry...,"[Electronics, Computers & Accessories, Network...",
3,AE7IGXXTK7XTWRJGLIAL5BJDTEAQ,B005L38VRU,5.0,2014-09-04 02:03:39.000,1409796219,5.0,4.600000,1.0,5.000000,0.0,...,728,689,"[-1, -1, -1, -1, -1, -1, 193, 3945, 1849, 4407]","[-1, -1, -1, -1, -1, -1, 1327177801, 133520743...","[-1, -1, -1, -1, -1, -1, 6, 6, 6, 5]",All Electronics,Logitech K750 Wireless Solar Keyboard for Mac ...,[Battery hassles are a thing of the past with ...,"[Electronics, Computers & Accessories, Compute...",49.99
4,AFEJ5GRYG2PQD6EWSAKVG56XMKNA,B001W6Q7SU,0.0,2016-09-14 16:29:39.000,1473870579,0.0,,0.0,,0.0,...,5481,834,"[-1, -1, -1, -1, -1, -1, -1, 3965, 4617, 2003]","[-1, -1, -1, -1, -1, -1, -1, 1473870313, 14738...","[-1, -1, -1, -1, -1, -1, -1, 0, 0, 0]",All Electronics,PNY Optima 2GB (2x1GB) Dual Channel Kit DDR2 6...,[PNY OPTIMA 2GB (2x1GB) Dual Channel Kit DDR2 ...,"[Electronics, Computers & Accessories, Compute...",65.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
254779,AGFRYVIF7CVPOK777KN3PSOSWSMA,B07SBT1FSK,0.0,2013-12-17 03:19:23.000,1387250363,1.0,3.000000,0.0,,0.0,...,9644,3575,"[-1, -1, -1, -1, -1, -1, -1, 1186, 1084, 1309]","[-1, -1, -1, -1, -1, -1, -1, 1375882567, 13761...","[-1, -1, -1, -1, -1, -1, -1, 5, 5, 4]",Camera & Photo,Fotasy 49mm Macro Reverse Adapter Ring for E M...,[This Fotasy 49mm Metal Reverse Ring Adapter w...,"[Electronics, Camera & Photo, Accessories, Len...",6.79
254780,AGMAUSEXCG2JEGI245KGJJYHOWBQ,B08F98WSWH,0.0,2019-04-30 00:21:36.489,1556583696,9.0,4.000000,2.0,1.000000,0.0,...,10458,4240,"[-1, -1, 374, 4660, 2706, 2418, 3820, 4579, 44...","[-1, -1, 1443597405, 1495887973, 1515621456, 1...","[-1, -1, 7, 6, 6, 6, 1, 1, 1, 0]",Computers,SABRENT USB Type C External Stereo Sound Adapt...,[],"[Electronics, Computers & Accessories, Compute...",9.99
254781,AGGEMMEOSRGTGESZ56F7ESETFRHQ,B00U3FPN4U,5.0,2017-07-22 01:23:03.787,1500686583,141.0,4.390071,24.0,4.166667,7.0,...,9715,2003,"[-1, -1, -1, -1, -1, -1, -1, -1, 2475, 1363]","[-1, -1, -1, -1, -1, -1, -1, -1, 1480885209, 1...","[-1, -1, -1, -1, -1, -1, -1, -1, 5, 1]",Amazon Devices,Amazon Fire TV with 4K Ultra HD,[],[],48.29
254782,AGU6EIWIZSV6AIQSAVRDHTIJCHPA,B08XPWDSWW,0.0,2020-10-27 16:09:49.492,1603814989,63.0,4.190476,13.0,4.307692,6.0,...,11488,4251,"[-1, -1, -1, -1, -1, 3839, 2390, 1413, 3394, 4...","[-1, -1, -1, -1, -1, 1402878124, 1547313390, 1...","[-1, -1, -1, -1, -1, 8, 6, 6, 6, 6]",Home Audio & Theater,TOZO T10 Bluetooth 5.3 Wireless Earbuds with W...,[],"[Electronics, Headphones, Earbuds & Accessorie...",21.99


In [13]:
user_indices = train_df["user_indice"].unique()
item_indices = train_df["item_indice"].unique()
# if args.rc.use_sbert_features:
#     all_sbert_vectors = ann_index.get_vector_by_ids(
#         item_indices.tolist(), chunk_size=1000
#     ).astype(np.float32)

train_item_features = chunk_transform(
    train_df, item_metadata_pipeline, chunk_size=args.tfm_chunk_size
)
train_item_features = train_item_features.astype(np.float32)

val_item_features = chunk_transform(
    val_df, item_metadata_pipeline, chunk_size=args.tfm_chunk_size
)
val_item_features = val_item_features.astype(np.float32)

# if args.rc.use_sbert_features:
#     train_sbert_vectors = all_sbert_vectors[train_df["item_indice"].values]
#     train_item_features = np.hstack([train_item_features, train_sbert_vectors])
#     val_sbert_vectors = all_sbert_vectors[val_df["item_indice"].values]
#     val_item_features = np.hstack([val_item_features, val_sbert_vectors])

logger.info(f"{len(user_indices)=:,.0f}, {len(item_indices)=:,.0f}")

Transforming chunks:   0%|          | 0/26 [00:00<?, ?it/s]

Transforming chunks:   0%|          | 0/1 [00:00<?, ?it/s]

[32m2025-06-28 09:45:33.535[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m24[0m - [1mlen(user_indices)=16,407, len(item_indices)=4,817[0m


# Train

In [14]:
rating_dataset = UserItemBinaryDFDataset(
    train_df,
    "user_indice",
    "item_indice",
    args.rating_col,
    args.timestamp_col,
    item_feature=train_item_features,
)
val_rating_dataset = UserItemBinaryDFDataset(
    val_df,
    "user_indice",
    "item_indice",
    args.rating_col,
    args.timestamp_col,
    item_feature=val_item_features,
)

train_loader = DataLoader(
    rating_dataset, batch_size=args.batch_size, shuffle=True, drop_last=True
)
val_loader = DataLoader(
    val_rating_dataset, batch_size=args.batch_size, shuffle=False, drop_last=False
)

In [15]:
n_items = len(item_indices)
n_users = len(user_indices)
item_feature_size = train_item_features.shape[1]

model = init_model(
    n_users,
    n_items,
    args.embedding_dim,
    args.item_sequence_ts_bucket_size,
    args.bucket_embedding_dim,
    item_feature_size,
    args.dropout,
)
model

Ranker(
  (item_embedding): Embedding(4818, 256, padding_idx=4817)
  (user_embedding): Embedding(16407, 256)
  (item_sequence_ts_bucket_embedding): Embedding(11, 16, padding_idx=10)
  (gru): GRU(272, 256, batch_first=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.3, inplace=False)
  (item_feature_tower): Sequential(
    (0): Linear(in_features=626, out_features=256, bias=True)
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
  )
  (fc_rating): Sequential(
    (0): Linear(in_features=1024, out_features=256, bias=True)
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=256, out_features=1, bias=True)
    (5): Sigmoid()
  )
)

#### Predict before train

In [16]:
val_df = val_rating_dataset.df
val_df.sample(10)

Unnamed: 0,user_id,parent_asin,rating,timestamp,timestamp_unix,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,...,user_indice,item_indice,item_sequence,item_sequence_ts,item_sequence_ts_bucket,main_category,title,description,categories,price
976,AE4POORML32YFATNR2JZDZDYN4GA,B0BXXS51NG,0.0,2021-08-08 05:39:14.321,1628401154,0.0,,0.0,,0.0,...,368,3976,"[2943, 4003, 4597, 3451, 3957, 3039, 4653, 296...","[1572314842, 1586199265, 1586201246, 158620130...","[6, 6, 6, 6, 6, 6, 6, 6, 6, 6]",Computers,"AOC I1601FWUX 15.6"" USB-C powered portable mon...","[Works with MacBooks, notebooks, and PCs Aspec...","[Electronics, Computers & Accessories, Monitors]",129.99
1131,AHN6PGMHRJOD42YBCIH6MZNO36KA,B07D48JV9T,1.0,2021-10-04 12:56:21.085,1633352181,1.0,5.0,0.0,,0.0,...,14766,3347,"[1276, 3678, 3330, 1585, 1928, 2694, 2756, 388...","[1433806591, 1441044824, 1442177213, 144933021...","[8, 8, 8, 8, 8, 7, 7, 7, 5, 5]",All Electronics,Mediabridge™ Coaxial Cable (1.5 Feet) with F-M...,[Mediabridge flex Series Mini coaxial digital ...,"[Electronics, Television & Video, Accessories,...",6.84
4542,AEPDFYV25EEWSOXVCGIN2SMK4VOA,B0BM5FP56J,0.0,2021-01-20 00:25:52.127,1611102352,13.0,4.538462,2.0,4.5,0.0,...,2771,388,"[4124, 4430, 3386, 4624, 626, 3704, 1891, 1429...","[1449737967, 1449738032, 1458600623, 148100438...","[8, 8, 7, 7, 7, 7, 7, 7, 6, 0]",Computers,"AMD Ryzen 5 3600 6-Core, 12-Thread Unlocked De...",[AMD CPU 100 100000031box Ryzen 5 3600 6C 12T ...,"[Electronics, Computers & Accessories, Compute...",94.99
3716,AGJRAHBPON3QW2FDHZ6GNC5TALVQ,B0BGS23YKX,0.0,2021-04-05 22:20:54.066,1617661254,19.0,4.842105,4.0,4.75,1.0,...,10139,2939,"[-1, -1, -1, -1, -1, 3552, 3047, 3443, 3472, 156]","[-1, -1, -1, -1, -1, 1557970528, 1560403096, 1...","[-1, -1, -1, -1, -1, 6, 6, 6, 6, 6]",All Electronics,JSAUX USB-C to USB A Cable 3.1A Fast Charging ...,[],"[Electronics, Computers & Accessories, Compute...",11.99
671,AGHR4N4TCYWA6ZXXC3RTS4NB6DOQ,B0BQLQHP74,1.0,2021-05-21 19:20:41.366,1621624841,0.0,,0.0,,0.0,...,9890,4599,"[4618, 4102, 3532, 3747, 4162, 3200, 4147, 447...","[1575571020, 1605536394, 1605536405, 160553643...","[6, 5, 5, 5, 5, 5, 5, 5, 5, 5]",Computers,Monoprice 18-Inch SATA III 6.0 Gbps Cable with...,[Each major revision of the Serial ATA interfa...,"[Electronics, Computers & Accessories, Compute...",4.98
3603,AF24ESCAWYNFS2A47OHOH7ASAK7Q,B07PHQ93TV,1.0,2021-01-26 05:02:54.536,1611637374,98.0,4.561224,34.0,4.647059,6.0,...,4151,3590,"[-1, -1, -1, -1, 3500, 1348, 1479, 4776, 2552,...","[-1, -1, -1, -1, 1446166547, 1450239057, 14502...","[-1, -1, -1, -1, 8, 8, 8, 7, 6, 6]",Amazon Devices,"Fire HD 10 Tablet (10.1"" 1080p full HD display...",[],[],
3907,AEUBO4TJQFIEAZ4SA4KMX2YWY3KQ,B07S6X6RLG,0.0,2022-02-04 22:41:41.746,1644014501,,,,,,...,3368,4382,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",All Electronics,Sony DVPSR210P DVD Player,"[Ultra slim, new mid size design, progressive ...","[Electronics, Television & Video, DVD Players ...",34.99
2721,AHWM64D2NQP6GIH6IBYLXOVBAHAQ,B09P4Q7JK4,1.0,2021-06-26 12:06:40.839,1624709200,5.0,1.8,1.0,1.0,1.0,...,15970,4268,"[1307, 2112, 1302, 2614, 1377, 3743, 3899, 350...","[1378393848, 1381671717, 1393169274, 139932958...","[8, 8, 8, 8, 8, 7, 7, 6, 5, 5]",Computers,NETGEAR Nighthawk Smart WiFi Router (R7000P) -...,"[Built with gaming, streaming, and mobile devi...","[Electronics, Computers & Accessories, Network...",258.98
3316,AEOFRQFCEVJL4AQ2Q375RWUIJIDQ,B01K8B8YA8,0.0,2021-01-12 23:32:36.150,1610494356,12.0,4.333333,1.0,5.0,1.0,...,2649,3716,"[-1, -1, -1, -1, 2092, 1768, 1904, 4530, 3931,...","[-1, -1, -1, -1, 1466351066, 1512923287, 15129...","[-1, -1, -1, -1, 7, 7, 7, 7, 6, 6]",Amazon Devices,Echo Dot (2nd Generation) - Smart speaker with...,[],[],39.99
4369,AEORZ6NOF3GB2WNBHXAWX2YG2DNA,B0C2PVFRWV,1.0,2021-02-01 08:42:58.047,1612168978,20.0,4.5,5.0,4.4,2.0,...,2702,4721,"[4523, 4710, 1043, 1904, 5, 4463, 4568, 4513, ...","[1516529969, 1520654843, 1528515082, 152971382...","[7, 6, 6, 6, 6, 6, 6, 5, 5, 5]",Cell Phones & Accessories,"USB Type C Cable, Anker [2-Pack 6Ft] Premium N...",[Premium Nylon CableThe Durable Sync-and-Charg...,"[Electronics, Computers & Accessories, Compute...",14.99


In [17]:
user_id = val_df.sample(1)[args.user_col].values[0]
test_df = val_df.loc[lambda df: df[args.user_col].eq(user_id)]
with pd.option_context("display.max_colwidth", None):
    display(test_df)

Unnamed: 0,user_id,parent_asin,rating,timestamp,timestamp_unix,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,...,user_indice,item_indice,item_sequence,item_sequence_ts,item_sequence_ts_bucket,main_category,title,description,categories,price
145,AF6LGLIU44ICZKPVEXVRCKMNSBUQ,B00G05A2MU,1.0,2021-05-22 15:33:21.675,1621697601,3.0,5.0,0.0,,0.0,...,4739,1434,"[-1, -1, -1, -1, 3767, 3994, 954, 800, 4236, 3716]","[-1, -1, -1, -1, 1405981979, 1405982542, 1405982582, 1431565588, 1431565878, 1621696857]","[-1, -1, -1, -1, 8, 8, 8, 8, 8, 1]",Computers,"AC Infinity MULTIFAN S3, Quiet 120mm USB Fan, UL-Certified for Receiver DVR Playstation Xbox Computer Cabinet Cooling",[],"[Electronics, Computers & Accessories, Computer Components, Internal Components, Fans & Cooling, Case Fans]",13.99
672,AF6LGLIU44ICZKPVEXVRCKMNSBUQ,B0BGS23YKX,0.0,2021-05-23 21:58:06.165,1621807086,21.0,4.857143,5.0,5.0,2.0,...,4739,2569,"[-1, -1, -1, 3767, 3994, 954, 800, 4236, 3716, 1434]","[-1, -1, -1, 1405981979, 1405982542, 1405982582, 1431565588, 1431565878, 1621696857, 1621697602]","[-1, -1, -1, 8, 8, 8, 8, 8, 3, 3]",All Electronics,"JSAUX USB-C to USB A Cable 3.1A Fast Charging [2-Pack 6.6ft], USB Type C Charger Cord Compatible with Samsung Galaxy S10 S9 S8 S20 Plus A51 A12 A11, Note 10 9 8, PS5 Controller USB C Charger-Green",[],"[Electronics, Computers & Accessories, Computer Accessories & Peripherals, Cables & Accessories, Cables & Interconnects, USB Cables]",11.99
1210,AF6LGLIU44ICZKPVEXVRCKMNSBUQ,B00G05A2MU,0.0,2021-05-22 15:33:21.675,1621697601,3.0,5.0,0.0,,0.0,...,4739,1677,"[-1, -1, -1, -1, 3767, 3994, 954, 800, 4236, 3716]","[-1, -1, -1, -1, 1405981979, 1405982542, 1405982582, 1431565588, 1431565878, 1621696857]","[-1, -1, -1, -1, 8, 8, 8, 8, 8, 1]",Computers,"AC Infinity MULTIFAN S3, Quiet 120mm USB Fan, UL-Certified for Receiver DVR Playstation Xbox Computer Cabinet Cooling",[],"[Electronics, Computers & Accessories, Computer Components, Internal Components, Fans & Cooling, Case Fans]",13.99
3885,AF6LGLIU44ICZKPVEXVRCKMNSBUQ,B0BGS23YKX,1.0,2021-05-23 21:58:06.165,1621807086,21.0,4.857143,5.0,5.0,2.0,...,4739,4518,"[-1, -1, -1, 3767, 3994, 954, 800, 4236, 3716, 1434]","[-1, -1, -1, 1405981979, 1405982542, 1405982582, 1431565588, 1431565878, 1621696857, 1621697602]","[-1, -1, -1, 8, 8, 8, 8, 8, 3, 3]",All Electronics,"JSAUX USB-C to USB A Cable 3.1A Fast Charging [2-Pack 6.6ft], USB Type C Charger Cord Compatible with Samsung Galaxy S10 S9 S8 S20 Plus A51 A12 A11, Note 10 9 8, PS5 Controller USB C Charger-Green",[],"[Electronics, Computers & Accessories, Computer Accessories & Peripherals, Cables & Accessories, Cables & Interconnects, USB Cables]",11.99
4885,AF6LGLIU44ICZKPVEXVRCKMNSBUQ,B07VFBX16H,0.0,2021-05-22 15:20:57.437,1621696857,4.0,5.0,0.0,,0.0,...,4739,446,"[-1, -1, -1, -1, -1, 3767, 3994, 954, 800, 4236]","[-1, -1, -1, -1, -1, 1405981979, 1405982542, 1405982582, 1431565588, 1431565878]","[-1, -1, -1, -1, -1, 8, 8, 8, 8, 8]",All Electronics,"SmartQ C307 USB 3.0 SD Card Reader for SD, SDXC, MicroSD, MicroSDXC, USB C to USB A Adapter, USB 3.0 Ultra Speed USB A to USB C Adapter, Work with Smartphone and Most USB C Devices (USB-A to USB- C)","[SmartQ C307 USB 3.0 SD Card Reader for SD, SDHC, SDXC, MicroSD, MicroSDHC, MicroSDXC, USB C to USB A adapter, USB 3.0 Ultra Speed USB A to USB C Adapter, work with MacBook Pro 2019, MacBook Air 2020, Smartphone and most USB C devices (USB-A to USB- C)]","[Electronics, Computers & Accessories, Computer Accessories & Peripherals, Computer Cable Adapters, USB-to-USB Adapters]",10.99
4971,AF6LGLIU44ICZKPVEXVRCKMNSBUQ,B07VFBX16H,1.0,2021-05-22 15:20:57.437,1621696857,4.0,5.0,0.0,,0.0,...,4739,3716,"[-1, -1, -1, -1, -1, 3767, 3994, 954, 800, 4236]","[-1, -1, -1, -1, -1, 1405981979, 1405982542, 1405982582, 1431565588, 1431565878]","[-1, -1, -1, -1, -1, 8, 8, 8, 8, 8]",All Electronics,"SmartQ C307 USB 3.0 SD Card Reader for SD, SDXC, MicroSD, MicroSDXC, USB C to USB A Adapter, USB 3.0 Ultra Speed USB A to USB C Adapter, Work with Smartphone and Most USB C Devices (USB-A to USB- C)","[SmartQ C307 USB 3.0 SD Card Reader for SD, SDHC, SDXC, MicroSD, MicroSDHC, MicroSDXC, USB C to USB A adapter, USB 3.0 Ultra Speed USB A to USB C Adapter, work with MacBook Pro 2019, MacBook Air 2020, Smartphone and most USB C devices (USB-A to USB- C)]","[Electronics, Computers & Accessories, Computer Accessories & Peripherals, Computer Cable Adapters, USB-to-USB Adapters]",10.99


In [19]:
val_item_features.shape, train_item_features.shape

((6958, 626), (254784, 626))

In [20]:
test_row = test_df.loc[lambda df: df[args.rating_col].gt(0)].iloc[0]
item_id = test_row[args.item_col]
item_sequence = test_row["item_sequence"]
item_sequence_ts_bucket = test_row["item_sequence_ts_bucket"]
row_idx = test_row.name
item_feature = val_item_features[row_idx]
logger.info(
    f"Test predicting before training with {args.user_col} = {user_id} and {args.item_col} = {item_id}"
)
user_indice = idm.get_user_index(user_id)
item_indice = idm.get_item_index(item_id)
user = torch.tensor([user_indice])
item_sequence = torch.tensor([item_sequence])
item_sequence_ts_bucket = torch.tensor([item_sequence_ts_bucket])
item_feature = torch.tensor([item_feature])
item = torch.tensor([item_indice])

model.eval()
model.predict(user, item_sequence, item_sequence_ts_bucket, item_feature, item)
model.train()

[32m2025-06-28 09:49:26.209[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mTest predicting before training with user_id = AF6LGLIU44ICZKPVEXVRCKMNSBUQ and parent_asin = B00G05A2MU[0m

Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at C:\actions-runner\_work\pytorch\pytorch\pytorch\torch\csrc\utils\tensor_new.cpp:257.)



Ranker(
  (item_embedding): Embedding(4818, 256, padding_idx=4817)
  (user_embedding): Embedding(16407, 256)
  (item_sequence_ts_bucket_embedding): Embedding(11, 16, padding_idx=10)
  (gru): GRU(272, 256, batch_first=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.3, inplace=False)
  (item_feature_tower): Sequential(
    (0): Linear(in_features=626, out_features=256, bias=True)
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
  )
  (fc_rating): Sequential(
    (0): Linear(in_features=1024, out_features=256, bias=True)
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=256, out_features=1, bias=True)
    (5): Sigmoid()
  )
)

#### Training loop

##### Overfit 1 batch

In [21]:
early_stopping = EarlyStopping(
    monitor="val_loss", patience=10, mode="min", verbose=False
)

model = init_model(
    n_users,
    n_items,
    args.embedding_dim,
    args.item_sequence_ts_bucket_size,
    args.bucket_embedding_dim,
    item_feature_size,
    dropout=0,
)
lit_model = LitRanker(
    model,
    learning_rate=args.learning_rate,
    l2_reg=0.0,
    log_dir=args.notebook_persist_dp,
    accelerator=args.device,
)

log_dir = f"{args.notebook_persist_dp}/logs/overfit"

# train model
trainer = L.Trainer(
    default_root_dir=log_dir,
    accelerator=args.device if args.device else "auto",
    max_epochs=2,
    overfit_batches=1,
    callbacks=[early_stopping],
)
trainer.fit(
    model=lit_model,
    train_dataloaders=train_loader,
    val_dataloaders=train_loader,

)
logger.info(f"Logs available at {trainer.log_dir}")

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
`Trainer(overfit_batches=1)` was configured so 1 batch will be used.

  | Name               | Type                   | Params | Mode 
----------------------------------------------------------------------
0 | model              | Ranker                 | 6.3 M  | train
1 | val_roc_auc_metric | BinaryAUROC            | 0      | train
2 | val_pr_auc_metric  | BinaryAveragePrecision | 0      | train
----------------------------------------------------------------------
6.3 M     Trainable params
0         Non-trainable params
6.3 M     Total params
25.060    Total estimated model params size (MB)
17        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]


You requested to overfit but enabled val dataloader shuffling. We are turning off the val dataloader shuffling for you.


The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


You requested to overfit but enabled train dataloader shuffling. We are turning off the train dataloader shuffling for you.


The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


The number of training batches (1) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=2` reached.
[32m2025-06-28 09:49:43.067[0m | [1mINFO    [0m | [36msrc.algo.ranker.trainer[0m:[36mon_fit_end[0m:[36m206[0m - [1mLogging classification metrics...[0m
[32m2025-06-28 09:51:16.244[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m38[0m - [1mLogs available at c:\Users\Trieu\OneDrive\Desktop\recsys\real_time_recsys\notebooks\data\004-use-sbert-features-and-llm-tags\logs\overfit\lightning_logs\version_1[0m


In [22]:
# Need to make sure port 6006 at local is accessible
%tensorboard --logdir $trainer.log_dir

##### Fit on all data

In [None]:
all_items_df = train_df.drop_duplicates(subset=["item_indice"])
all_items_indices = all_items_df["item_indice"].values
all_items_features = item_metadata_pipeline.transform(all_items_df).astype(np.float32)
logger.info(
    f"Mean std over categorical and numerical features: {all_items_features.std(axis=0).mean()}"
)
# if args.rc.use_sbert_features:
#     all_sbert_vectors = ann_index.get_vector_by_ids(all_items_indices.tolist()).astype(
#         np.float32
#     )
#     logger.info(f"Mean std over text features: {all_sbert_vectors.std(axis=0).mean()}")
#     all_items_features = np.hstack([all_items_features, all_sbert_vectors])

In [None]:
# papermill_description=fit-model
early_stopping = EarlyStopping(
    monitor="val_roc_auc", patience=args.early_stopping_patience, mode="max", verbose=False
)

checkpoint_callback = ModelCheckpoint(
    dirpath=f"{args.notebook_persist_dp}/checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    monitor="val_roc_auc",
    mode="max",
)

model = init_model(
    n_users,
    n_items,
    args.embedding_dim,
    args.item_sequence_ts_bucket_size,
    args.bucket_embedding_dim,
    item_feature_size,
    dropout=args.dropout,
    item_embedding=pretrained_item_embedding,
)
lit_model = LitRanker(
    model,
    learning_rate=args.learning_rate,
    l2_reg=args.l2_reg,
    log_dir=args.notebook_persist_dp,
    evaluate_ranking=True,
    idm=idm,
    all_items_indices=all_items_indices,
    all_items_features=all_items_features,
    args=args,
    neg_to_pos_ratio=args.neg_to_pos_ratio,
    checkpoint_callback=checkpoint_callback,
    accelerator=args.device,
)

In [None]:
log_dir = f"{args.notebook_persist_dp}/logs/run"

# train model
trainer = L.Trainer(
    default_root_dir=log_dir,
    max_epochs=args.max_epochs,
    callbacks=[early_stopping, checkpoint_callback],
    accelerator=args.device if args.device else "auto",
    logger=args._mlf_logger if args.log_to_mlflow else None,
)
trainer.fit(
    model=lit_model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

In [None]:
logger.info(
    f"Test predicting after training with {args.user_col} = {user_id} and {args.item_col} = {item_id}"
)
model.eval()
model = model.to(user.device)  # Move model back to align with data device
model.predict(user, item_sequence, item_sequence_ts_bucket, item_feature, item)
model.train()

# Load best checkpoint

In [None]:
logger.info(f"Loading best checkpoint from {checkpoint_callback.best_model_path}...")
args.best_checkpoint_path = checkpoint_callback.best_model_path

best_trainer = LitRanker.load_from_checkpoint(
    checkpoint_callback.best_model_path,
    model=init_model(
        n_users,
        n_items,
        args.embedding_dim,
        args.item_sequence_ts_bucket_size,
        args.bucket_embedding_dim,
        item_feature_size,
        dropout=0,
        item_embedding=pretrained_item_embedding,
    ),
)

In [None]:
best_model = best_trainer.model.to(lit_model.device)

In [None]:
best_model.eval()
best_model.predict(user, item_sequence, item_sequence_ts_bucket, item_feature, item)
best_model.train()

### Persist artifacts

In [None]:
if args.log_to_mlflow:
    # Persist id_mapping so that at inference we can predict based on item_ids (string) instead of item_index
    run_id = trainer.logger.run_id
    mlf_client = trainer.logger.experiment
    mlf_client.log_artifact(run_id, idm_fp)
    # Persist item_feature_metadata pipeline
    mlf_client.log_artifact(run_id, args.item_metadata_pipeline_fp)

    # Persist model architecture
    model_architecture_fp = f"{args.notebook_persist_dp}/model_architecture.txt"
    with open(model_architecture_fp, "w") as f:
        f.write(repr(model))
    mlf_client.log_artifact(run_id, model_architecture_fp)

### Wrap inference function and register best checkpoint as MLflow model

In [None]:
inferrer = RankerInferenceWrapper(best_model)

In [None]:
def generate_sample_item_features():
    sample_row = train_df.iloc[0].fillna(0)
    output = dict()
    for col in args.rc.item_feature_cols:
        v = sample_row[col]
        if isinstance(v, np.ndarray):
            v = "__".join(
                sample_row[col].tolist()
            )  # Workaround to avoid MLflow Got error: Per-column arrays must each be 1-dimensional
        output[col] = [v]
    return output

In [None]:
sample_input = {
    args.user_col: [idm.get_user_id(0)],
    "item_sequence": [",".join([idm.get_item_id(0), idm.get_item_id(1)])],
    "item_sequence_ts": [
        "1095133116,109770848"
    ],  # Here we input unix timestamp seconds instead of timestamp bucket because we need to calculate the bucket
    # **{col: [train_df.iloc[0].fillna(0)[col]] for col in args.item_feature_cols},
    **generate_sample_item_features(),
    args.item_col: [idm.get_item_id(0)],
}
sample_output = inferrer.infer([0], [[0, 1]], [[2, 0]], [train_item_features[0]], [0])
sample_output

In [None]:
sample_input

In [None]:
if args.log_to_mlflow:
    run_id = trainer.logger.run_id
    sample_output_np = sample_output
    signature = infer_signature(sample_input, sample_output_np)
    idm_filename = idm_fp.split("/")[-1]
    item_metadata_pipeline_filename = args.item_metadata_pipeline_fp.split("/")[-1]
    with mlflow.start_run(run_id=run_id):
        mlflow.pyfunc.log_model(
            python_model=inferrer,
            artifact_path="inferrer",
            artifacts={
                # We log the id_mapping to the predict function so that it can accept item_id and automatically convert ot item_indice for PyTorch model to use
                "idm": mlflow.get_artifact_uri(idm_filename),
                "item_metadata_pipeline": mlflow.get_artifact_uri(
                    item_metadata_pipeline_filename
                ),
            },
            model_config={"use_sbert_features": args.rc.use_sbert_features},
            signature=signature,
            input_example=sample_input,
            registered_model_name=args.mlf_model_name,
        )

# Set the newly trained model as champion

In [None]:
if args.log_to_mlflow:
    # Get current champion
    deploy_alias = "champion"
    curr_model_run_id = None

    min_roc_auc = args.min_roc_auc

    try:
        curr_champion_model = mlf_client.get_model_version_by_alias(
            args.mlf_model_name, deploy_alias
        )
        curr_model_run_id = curr_champion_model.run_id
    except MlflowException as e:
        if "not found" in str(e).lower():
            logger.info(
                f"There is no {deploy_alias} alias for model {args.mlf_model_name}"
            )

    # Compare new vs curr models
    new_mlf_run = trainer.logger.experiment.get_run(trainer.logger.run_id)
    new_metrics = new_mlf_run.data.metrics
    roc_auc = new_metrics["roc_auc"]
    if curr_model_run_id:
        curr_model_run_info = mlf_client.get_run(curr_model_run_id)
        curr_metrics = curr_model_run_info.data.metrics
        if (curr_roc_auc := curr_metrics["roc_auc"]) > min_roc_auc:
            logger.info(
                f"Current {deploy_alias} model has {curr_roc_auc:,.4f} ROC-AUC..."
            )
            min_roc_auc = curr_roc_auc

        top_metrics = ["roc_auc", "val_PersonalizationMetric"]
        vizer = ModelMetricsComparisonVisualizer(curr_metrics, new_metrics, top_metrics)
        print(f"Comparing metrics between new run and current champion:")
        display(vizer.compare_metrics_df())
        vizer.create_metrics_comparison_plot(n_cols=5)
        vizer.plot_diff()

    # Register new champion
    if roc_auc < min_roc_auc:
        logger.info(
            f"Current run has ROC-AUC = {roc_auc:,.4f}, smaller than {min_roc_auc:,.4f}. Skip aliasing this model as the new {deploy_alias}.."
        )
    else:
        logger.info(f"Aliasing the new model as champion...")
        # Get the model version for current run by assuming it's the most recent registered version
        model_version = (
            mlf_client.get_registered_model(args.mlf_model_name)
            .latest_versions[0]
            .version
        )

        mlf_client.set_registered_model_alias(
            name=args.mlf_model_name, alias="champion", version=model_version
        )

        mlf_client.set_model_version_tag(
            name=args.mlf_model_name,
            version=model_version,
            key="author",
            value=args.author,
        )

# Clean up

In [None]:
all_params = [args]

if args.log_to_mlflow:
    with mlflow.start_run(run_id=run_id):
        for params in all_params:
            params_dict = params.dict()
            params_ = dict()
            for k, v in params_dict.items():
                if k == "top_K":
                    k = "top_big_K"
                if k == "top_k":
                    k = "top_small_k"
                params_[f"{params.__repr_name__()}.{k}"] = v
            mlflow.log_params(params_)