# Sequence modeling for ranking task

# Set up

In [1]:
%load_ext autoreload
%autoreload 2
%load_ext tensorboard

In [2]:
import os
import sys

import lightning as L
import numpy as np
import pandas as pd
import torch
from dotenv import load_dotenv
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.loggers import MLFlowLogger
from loguru import logger
from mlflow.exceptions import MlflowException
from mlflow.models.signature import infer_signature
from pydantic import BaseModel
from torch.utils.data import DataLoader

import mlflow

sys.path.insert(0, "..")

from src.dataset import UserItemRatingDFDataset
from src.eval.compare_runs import ModelMetricsComparisonVisualizer
from src.id_mapper import IDMapper
from src.sequence.inference import SequenceRatingPredictionInferenceWrapper
from src.sequence.model import SequenceRatingPrediction
from src.sequence.trainer import LitSequenceRatingPrediction
from src.sequence.utils import generate_item_sequences
from src.viz import custom_style_plotly

load_dotenv()
custom_style_plotly()

  from .autonotebook import tqdm as notebook_tqdm


# Controller

In [3]:
# This is a parameter cell used by papermill
max_epochs = 100

In [4]:
class Args(BaseModel):
    testing: bool = False
    author: str = "quy.dinh"
    log_to_mlflow: bool = True
    experiment_name: str = "Retriever"
    run_name: str = "003-sequence-model-increase-l2-reg"
    notebook_persist_dp: str = None
    random_seed: int = 41
    device: str = None

    max_epochs: int = max_epochs
    batch_size: int = 128

    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    top_K: int = 100
    top_k: int = 10

    batch_size: int = 128

    embedding_dim: int = 128
    dropout: float = 0.3
    early_stopping_patience: int = 5
    learning_rate: float = 0.003
    l2_reg: float = 1e-4

    mlf_item2vec_model_name: str = "item2vec"
    mlf_model_name: str = "sequence_rating_prediction"
    min_metric_value: float = 0.05

    best_checkpoint_path: str = None

    def init(self):
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        os.makedirs(self.notebook_persist_dp, exist_ok=True)

        if not (mlflow_uri := os.environ.get("MLFLOW_TRACKING_URI")):
            logger.warning(
                "Environment variable MLFLOW_TRACKING_URI is not set. Setting self.log_to_mlflow to false."
            )
            self.log_to_mlflow = False

        if self.log_to_mlflow:
            logger.info(
                f"Setting up MLflow experiment {self.experiment_name} - run {self.run_name}..."
            )
            self._mlf_logger = MLFlowLogger(
                experiment_name=self.experiment_name,
                run_name=self.run_name,
                tracking_uri=mlflow_uri,
                log_model=True,
            )

        if self.device is None:
            self.device = (
                "cuda"
                if torch.cuda.is_available()
                else "mps" if torch.backends.mps.is_available() else "cpu"
            )

        return self


args = Args().init()

print(args.model_dump_json(indent=2))

[32m2025-03-08 20:14:35.913[0m | [1mINFO    [0m | [36m__main__[0m:[36minit[0m:[36m47[0m - [1mSetting up MLflow experiment Retriever - run 003-sequence-model-increase-l2-reg...[0m


{
  "testing": false,
  "author": "quy.dinh",
  "log_to_mlflow": true,
  "experiment_name": "Retriever",
  "run_name": "003-sequence-model-increase-l2-reg",
  "notebook_persist_dp": "/home/dvq/frostmourne/recsys-blog/1-seq-model/notebooks/data/003-sequence-model-increase-l2-reg",
  "random_seed": 41,
  "device": "cuda",
  "max_epochs": 100,
  "batch_size": 128,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "top_K": 100,
  "top_k": 10,
  "embedding_dim": 128,
  "dropout": 0.3,
  "early_stopping_patience": 5,
  "learning_rate": 0.003,
  "l2_reg": 0.0001,
  "mlf_item2vec_model_name": "item2vec",
  "mlf_model_name": "sequence_rating_prediction",
  "min_metric_value": 0.05,
  "best_checkpoint_path": null
}


# Implement

In [5]:
def init_model(n_users, n_items, embedding_dim, dropout, item_embedding=None):
    model = SequenceRatingPrediction(
        n_users, n_items, embedding_dim, dropout=dropout, item_embedding=item_embedding
    )
    return model

# Test implementation

In [6]:
embedding_dim = 8
batch_size = 2

# Mock data
user_indices = [0, 0, 1, 2, 2]
item_indices = [0, 1, 2, 3, 4]
timestamps = [0, 1, 2, 3, 4]
ratings = [0, 4, 5, 3, 0]
item_sequences = [
    [-1, -1, 2, 3],
    [-1, -1, 2, 3],
    [-1, -1, 1, 3],
    [-1, -1, 2, 1],
    [-1, -1, 2, 1],
]

n_users = len(set(user_indices))
n_items = len(set(item_indices))

train_df = pd.DataFrame(
    {
        "user_indice": user_indices,
        "item_indice": item_indices,
        args.timestamp_col: timestamps,
        args.rating_col: ratings,
        "item_sequence": item_sequences,
    }
)

model = init_model(n_users, n_items, embedding_dim, args.dropout)

# Example forward pass
model.eval()
user = torch.tensor([0])
item_sequence = torch.tensor([[-1, -1, -1, 0, 1]])
target_item = torch.tensor([2])
predictions = model.predict(user, item_sequence, target_item)
print(predictions)
model.train()

tensor([[0.5768]], grad_fn=<SigmoidBackward0>)


SequenceRatingPrediction(
  (item_embedding): Embedding(6, 8, padding_idx=5)
  (user_embedding): Embedding(3, 8)
  (relu): ReLU()
  (dropout): Dropout(p=0.3, inplace=False)
  (fc_rating): Sequential(
    (0): Linear(in_features=24, out_features=8, bias=True)
    (1): BatchNorm1d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=8, out_features=1, bias=True)
    (5): Sigmoid()
  )
)

In [7]:
rating_dataset = UserItemRatingDFDataset(
    train_df, "user_indice", "item_indice", args.rating_col, args.timestamp_col
)

train_loader = DataLoader(
    rating_dataset, batch_size=batch_size, shuffle=False, drop_last=True
)

In [8]:
for batch_input in train_loader:
    print(batch_input)

{'user': tensor([0, 0]), 'item': tensor([0, 1]), 'rating': tensor([0.0000, 0.8000]), 'item_sequence': tensor([[-1, -1,  2,  3],
        [-1, -1,  2,  3]]), 'item_sequence_ts_bucket': tensor([], size=(2, 0), dtype=torch.int64), 'item_feature': tensor([], size=(2, 0))}
{'user': tensor([1, 2]), 'item': tensor([2, 3]), 'rating': tensor([1.0000, 0.6000]), 'item_sequence': tensor([[-1, -1,  1,  3],
        [-1, -1,  2,  1]]), 'item_sequence_ts_bucket': tensor([], size=(2, 0), dtype=torch.int64), 'item_feature': tensor([], size=(2, 0))}


In [9]:
# model
lit_model = LitSequenceRatingPrediction(model, log_dir=args.notebook_persist_dp)

# train model
trainer = L.Trainer(
    default_root_dir=f"{args.notebook_persist_dp}/test",
    max_epochs=2,
    accelerator=args.device if args.device else "auto",
)
trainer.fit(
    model=lit_model, train_dataloaders=train_loader, val_dataloaders=train_loader
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 4070 SUPER') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                     | Params | Mode 
-----------------------------------------------------------
0 | model | SequenceRatingPrediction | 297    | train
-----------------------------------------------------------
297       Trainable params
0         Non-trainable params
297       Total params
0.001     Total estimated model params size (MB)
10        Modules in train mode
0         Modules in eval mode


Sanity Checking DataLoader 0:   0%|                                                                                                                                                               | 0/2 [00:00<?, ?it/s]

/home/dvq/frostmourne/recsys-blog/1-seq-model/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


                                                                                                                                                                                                                        

/home/dvq/frostmourne/recsys-blog/1-seq-model/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/home/dvq/frostmourne/recsys-blog/1-seq-model/.venv/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (2) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 0: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 29.28it/s, v_num=0, train_loss_step=0.134]
[Aidation: |                                                                                                                                                                                     | 0/? [00:00<?, ?it/s]
[Aidation:   0%|                                                                                                                                                                                 | 0/2 [00:00<?, ?it/s]
[Aidation DataLoader 0:   0%|                                                                                                                                                                    | 0/2 [00:00<?, ?it/s]
[Aidation DataLoader 0:  50%|█████████████████████████████████████████████████████████████████████████████▌                        

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 155.29it/s, v_num=0, train_loss_step=0.108, val_loss=0.142, train_loss_epoch=0.157]


In [10]:
users = torch.tensor([0, 0, 0, 0])
item_sequences = torch.tensor(
    [[-1, -1, 2, 3], [-1, -1, 2, 3], [-1, -1, 1, 3], [-1, -1, 2, 1]]
)
items = torch.tensor([0, 1, 2, 3])
predictions = model.predict(users, item_sequences, items)
print(predictions)

tensor([[0.4451],
        [0.4518],
        [0.5116],
        [0.5322]], grad_fn=<SigmoidBackward0>)


In [11]:
def create_predict_df(
    train_df,
    val_user_indices,
    val_timestamp,
    rating_col,
    timestamp_col,
    sequence_length=10,
):
    predict_df = pd.DataFrame(
        {
            "user_indice": val_user_indices,
            "item_indice": -1,  # placeholder
            "timestamp": val_timestamp,
            "source": "predict",
        }
    )

    predict_df = (
        pd.concat(
            [
                train_df.loc[lambda df: df[rating_col].gt(0)][
                    ["user_indice", "item_indice", timestamp_col]
                ].assign(source="train"),
                predict_df,
            ],
            axis=0,
        )
        .pipe(
            generate_item_sequences,
            "user_indice",
            "item_indice",
            timestamp_col,
            sequence_length=sequence_length,
            padding=True,
            padding_value=-1,
        )
        .loc[lambda df: df["source"].eq("predict")]
        .assign(item_sequence=lambda df: df["item_sequence"].apply(np.array))
    )

    return predict_df


predict_df = create_predict_df(
    train_df,
    user_indices,
    timestamps[-1],
    args.rating_col,
    args.timestamp_col,
    sequence_length=10,
)

predict_df

Unnamed: 0,user_indice,item_indice,timestamp,source,item_sequence
0,0,-1,4,predict,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1]"
1,0,-1,4,predict,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1]"
2,1,-1,4,predict,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 2]"
3,2,-1,4,predict,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 3]"
4,2,-1,4,predict,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 3]"


In [12]:
recommendations = model.recommend(
    torch.tensor(predict_df["user_indice"].values),
    torch.tensor(predict_df["item_sequence"].values.tolist()),
    k=2,
    batch_size=4,
)
recommendations

  torch.tensor(predict_df["item_sequence"].values.tolist()),
Generating recommendations: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 2499.59it/s]


{'user_indice': [0, 0, 0, 0, 1, 1, 2, 2, 2, 2],
 'recommendation': [2, 1, 2, 1, 2, 1, 2, 1, 2, 1],
 'score': [0.5766147375106812,
  0.5584030151367188,
  0.5766147375106812,
  0.5584030151367188,
  0.5423680543899536,
  0.5048094987869263,
  0.5723859667778015,
  0.542466402053833,
  0.5723859667778015,
  0.542466402053833]}

# Prep data

In [13]:
train_df = pd.read_parquet("../data/train_features.parquet")
val_df = pd.read_parquet("../data/val_features.parquet")
idm_fp = "../data/idm.json"
idm = IDMapper().load(idm_fp)

assert (
    train_df[args.user_col].map(lambda s: idm.get_user_index(s))
    != train_df["user_indice"]
).sum() == 0, "Mismatch IDM"
assert (
    val_df[args.user_col].map(lambda s: idm.get_user_index(s)) != val_df["user_indice"]
).sum() == 0, "Mismatch IDM"

In [14]:
user_indices = train_df["user_indice"].unique()
item_indices = train_df["item_indice"].unique()

logger.info(f"{len(user_indices)=:,.0f}, {len(item_indices)=:,.0f}")

[32m2025-03-08 20:14:36.629[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mlen(user_indices)=19,734, len(item_indices)=7,388[0m


In [15]:
train_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
0,AE224PFXAEAT66IXX43GRJSWHXCA,0399159312,2.0,1373291889000,6822,4732,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
1,AE224PFXAEAT66IXX43GRJSWHXCA,B000FA5TTW,1.0,1382077065000,6822,1581,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
2,AE224PFXAEAT66IXX43GRJSWHXCA,030758836X,1.0,1424138603000,6822,2712,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
3,AE224PFXAEAT66IXX43GRJSWHXCA,B00MSRW6SM,4.0,1437924147000,6822,4217,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 473..."
4,AE224PFXAEAT66IXX43GRJSWHXCA,B00A18VD7A,1.0,1464603674000,6822,6558,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 4732.0, 1..."
...,...,...,...,...,...,...,...
194419,AHZZZ6UASY7CGOTGP5BH5637FMPA,B017GFRJZK,5.0,1508089337653,6916,6998,"[-1.0, -1.0, -1.0, -1.0, -1.0, 4088.0, 2670.0,..."
194420,AHZZZ6UASY7CGOTGP5BH5637FMPA,B00RPM9MJ6,4.0,1521230143557,6916,5013,"[-1.0, -1.0, -1.0, -1.0, 4088.0, 2670.0, 6456...."
194421,AHZZZ6UASY7CGOTGP5BH5637FMPA,B00RU7SNP0,4.0,1534867184329,6916,1099,"[-1.0, -1.0, -1.0, 4088.0, 2670.0, 6456.0, 141..."
194422,AHZZZ6UASY7CGOTGP5BH5637FMPA,B00HGSVGSY,4.0,1534867223318,6916,685,"[-1.0, -1.0, 4088.0, 2670.0, 6456.0, 1416.0, 3..."


# Train

In [16]:
rating_dataset = UserItemRatingDFDataset(
    train_df, "user_indice", "item_indice", args.rating_col, args.timestamp_col
)
val_rating_dataset = UserItemRatingDFDataset(
    val_df, "user_indice", "item_indice", args.rating_col, args.timestamp_col
)

train_loader = DataLoader(
    rating_dataset, batch_size=args.batch_size, shuffle=True, drop_last=True
)
val_loader = DataLoader(
    val_rating_dataset, batch_size=args.batch_size, shuffle=False, drop_last=False
)

In [17]:
n_items = len(item_indices)
n_users = len(user_indices)

model = init_model(n_users, n_items, args.embedding_dim, args.dropout)

#### Predict before train

In [18]:
model.item_embedding

Embedding(7389, 128, padding_idx=7388)

In [19]:
val_df = val_rating_dataset.df
val_df.sample(10)

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
592,AEQR5XHD4TWDAUTLD2P4DS6XWNFA,1416550534,0.2,1654100190659,13448,698,"[3318, 4316, 3017, 665, 3984, 4741, 784, 6691,..."
1804,AG6LX4UC2CFVAC6ADFV5ZRY3SR6A,B01L1CEZ6K,1.0,1636320979844,14013,4203,"[-1, -1, -1, -1, -1, 4209, 5610, 6382, 316, 1282]"
777,AEXPZ52OOPPTAVUPNVCAJR5HHR7Q,B07S7Q1ZGC,0.8,1656538333273,17918,720,"[-1, -1, 1254, 4938, 4998, 1392, 2556, 2600, 1..."
1415,AFPKBQSE6CQKNORE6ZZNYAVUHKBQ,B000FC12XW,0.6,1630241506654,14750,525,"[-1, -1, -1, -1, 5981, 6678, 5237, 2662, 4519,..."
1991,AGFAO4KF3ZSLI2DGXSCEY3SQFA2Q,038549565X,1.0,1657371830840,11278,7129,"[-1, -1, -1, -1, -1, 1781, 394, 4558, 1126, 1730]"
2206,AGKFRC3Q6AIE6FOJDDP5AXTUZQ5A,B07R5YVHL9,0.8,1653403855051,11573,4010,"[805, 1843, 3720, 650, 694, 4621, 3326, 1674, ..."
3316,AHQWXH5DBAPDWSJIY5ZTWH7CWU4Q,1250081831,1.0,1636561292992,4729,2982,"[6341, 6571, 1288, 3715, 518, 6587, 697, 152, ..."
3447,AHV56DPHQX7E2GXWVN5ORR4TVDVQ,B07CKQV7F8,1.0,1633819274452,13219,3581,"[3315, 5372, 4203, 4593, 7369, 3572, 6720, 306..."
1124,AFG2YMPWAFRUTQT4EQJHW2UUWSHQ,B0844QT2FT,1.0,1635428560673,18922,800,"[4928, 3564, 308, 4861, 2926, 6689, 5062, 1547..."
2198,AGKFRC3Q6AIE6FOJDDP5AXTUZQ5A,B004J4WN12,0.8,1648047942893,11573,3720,"[3651, 213, 13, 6366, 3883, 3698, 436, 1928, 8..."


In [20]:
user_id = val_df.sample(1)[args.user_col].values[0]
# user_id = "AH4AOFTTDPHPAFAAVFMAF25H2LIQ"
test_df = val_df.loc[lambda df: df[args.user_col].eq(user_id)]
with pd.option_context("display.max_colwidth", None):
    display(test_df)

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
2112,AGJCRK2ZEVQ4KLPESEEBQBNSEDEQ,B06XKB5J4M,0.8,1633105161090,9803,3942,"[6955, 893, 2655, 901, 6642, 1873, 6813, 4314, 4887, 2779]"
2113,AGJCRK2ZEVQ4KLPESEEBQBNSEDEQ,B01N7WS2JG,0.6,1640120438619,9803,1886,"[893, 2655, 901, 6642, 1873, 6813, 4314, 4887, 2779, 3942]"
2114,AGJCRK2ZEVQ4KLPESEEBQBNSEDEQ,B0096SOUF8,1.0,1642998968336,9803,6999,"[2655, 901, 6642, 1873, 6813, 4314, 4887, 2779, 3942, 1886]"
2115,AGJCRK2ZEVQ4KLPESEEBQBNSEDEQ,B01MV4ZPBN,1.0,1644217368428,9803,4093,"[901, 6642, 1873, 6813, 4314, 4887, 2779, 3942, 1886, 6999]"
2116,AGJCRK2ZEVQ4KLPESEEBQBNSEDEQ,B06ZZH686T,1.0,1645258659617,9803,1777,"[6642, 1873, 6813, 4314, 4887, 2779, 3942, 1886, 6999, 4093]"
2117,AGJCRK2ZEVQ4KLPESEEBQBNSEDEQ,B07SBQDLS4,0.8,1645650559515,9803,5561,"[1873, 6813, 4314, 4887, 2779, 3942, 1886, 6999, 4093, 1777]"
2118,AGJCRK2ZEVQ4KLPESEEBQBNSEDEQ,B000P0JM5O,1.0,1648353469251,9803,32,"[6813, 4314, 4887, 2779, 3942, 1886, 6999, 4093, 1777, 5561]"
2119,AGJCRK2ZEVQ4KLPESEEBQBNSEDEQ,B00KPVB4MM,1.0,1648596357023,9803,7120,"[4314, 4887, 2779, 3942, 1886, 6999, 4093, 1777, 5561, 32]"
2120,AGJCRK2ZEVQ4KLPESEEBQBNSEDEQ,B078G2SWRJ,1.0,1649113753396,9803,4186,"[4887, 2779, 3942, 1886, 6999, 4093, 1777, 5561, 32, 7120]"
2121,AGJCRK2ZEVQ4KLPESEEBQBNSEDEQ,B003YFJ5KY,0.6,1649784930654,9803,3038,"[2779, 3942, 1886, 6999, 4093, 1777, 5561, 32, 7120, 4186]"


In [21]:
test_row = test_df.loc[lambda df: df[args.rating_col].gt(0)].iloc[0]
item_id = test_row[args.item_col]
item_sequence = test_row["item_sequence"]
logger.info(
    f"Test predicting before training with {args.user_col} = {user_id} and {args.item_col} = {item_id}"
)
user_indice = idm.get_user_index(user_id)
item_indice = idm.get_item_index(item_id)
user = torch.tensor([user_indice])
item_sequence = torch.tensor([item_sequence])
item = torch.tensor([item_indice])

model.eval()
model.predict(user, item_sequence, item)
model.train()

[32m2025-03-08 20:14:36.829[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mTest predicting before training with user_id = AGJCRK2ZEVQ4KLPESEEBQBNSEDEQ and parent_asin = B06XKB5J4M[0m


SequenceRatingPrediction(
  (item_embedding): Embedding(7389, 128, padding_idx=7388)
  (user_embedding): Embedding(19734, 128)
  (relu): ReLU()
  (dropout): Dropout(p=0.3, inplace=False)
  (fc_rating): Sequential(
    (0): Linear(in_features=384, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=128, out_features=1, bias=True)
    (5): Sigmoid()
  )
)

#### Training loop

##### Overfit 1 batch

In [22]:
early_stopping = EarlyStopping(
    monitor="val_loss", patience=10, mode="min", verbose=False
)

model = init_model(n_users, n_items, args.embedding_dim, dropout=0)
lit_model = LitSequenceRatingPrediction(
    model,
    learning_rate=args.learning_rate,
    l2_reg=0.0,
    log_dir=args.notebook_persist_dp,
    accelerator=args.device,
)

log_dir = f"{args.notebook_persist_dp}/logs/overfit"

# train model
trainer = L.Trainer(
    default_root_dir=log_dir,
    accelerator=args.device if args.device else "auto",
    max_epochs=100,
    overfit_batches=1,
    callbacks=[early_stopping],
)
trainer.fit(
    model=lit_model,
    train_dataloaders=train_loader,
    val_dataloaders=train_loader,
)
logger.info(f"Logs available at {trainer.log_dir}")

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
`Trainer(overfit_batches=1)` was configured so 1 batch will be used.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                     | Params | Mode 
-----------------------------------------------------------
0 | model | SequenceRatingPrediction | 3.5 M  | train
-----------------------------------------------------------
3.5 M     Trainable params
0         Non-trainable params
3.5 M     Total params
14.086    Total estimated model params size (MB)
10        Modules in train mode
0         Modules in eval mode


                                                                                                                                                                                                                        

/home/dvq/frostmourne/recsys-blog/1-seq-model/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:252: You requested to overfit but enabled val dataloader shuffling. We are turning off the val dataloader shuffling for you.
/home/dvq/frostmourne/recsys-blog/1-seq-model/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/home/dvq/frostmourne/recsys-blog/1-seq-model/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:252: You requested to overfit but enabled train dataloader shuffling. We are turning off the train dataloader shuffling for you.
/home/dvq/frostmourne/recsys-blog/1-seq-model/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.

Epoch 0: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 114.38it/s, v_num=0, train_loss_step=0.118]
[Aidation: |                                                                                                                                                                                     | 0/? [00:00<?, ?it/s]
[Aidation:   0%|                                                                                                                                                                                 | 0/1 [00:00<?, ?it/s]
[Aidation DataLoader 0:   0%|                                                                                                                                                                    | 0/1 [00:00<?, ?it/s]
[Aidation DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████

`Trainer.fit` stopped: `max_epochs=100` reached.


Epoch 99: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 14.33it/s, v_num=0, train_loss_step=3.08e-5, val_loss=3.27e-5, train_loss_epoch=3.08e-5]


[32m2025-03-08 20:14:43.964[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m29[0m - [1mLogs available at /home/dvq/frostmourne/recsys-blog/1-seq-model/notebooks/data/003-sequence-model-increase-l2-reg/logs/overfit/lightning_logs/version_0[0m


In [23]:
# Need to make sure port 6006 at local is accessible
%tensorboard --logdir $trainer.log_dir

##### Fit on all data

In [24]:
# papermill_description=fit-model
early_stopping = EarlyStopping(
    monitor="val_loss", patience=args.early_stopping_patience, mode="min", verbose=False
)

checkpoint_callback = ModelCheckpoint(
    dirpath=f"{args.notebook_persist_dp}/checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    monitor="val_loss",
    mode="min",
)

model = init_model(
    n_users,
    n_items,
    args.embedding_dim,
    dropout=args.dropout
)
lit_model = LitSequenceRatingPrediction(
    model,
    learning_rate=args.learning_rate,
    l2_reg=args.l2_reg,
    log_dir=args.notebook_persist_dp,
    evaluate_ranking=True,
    idm=idm,
    args=args,
    accelerator=args.device,
    checkpoint_callback=checkpoint_callback,
)

log_dir = f"{args.notebook_persist_dp}/logs/run"

# train model
trainer = L.Trainer(
    default_root_dir=log_dir,
    max_epochs=args.max_epochs,
    callbacks=[early_stopping, checkpoint_callback],
    accelerator=args.device if args.device else "auto",
    logger=args._mlf_logger if args.log_to_mlflow else None,
)
trainer.fit(
    model=lit_model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                     | Params | Mode 
-----------------------------------------------------------
0 | model | SequenceRatingPrediction | 3.5 M  | train
-----------------------------------------------------------
3.5 M     Trainable params
0         Non-trainable params
3.5 M     Total params
14.086    Total estimated model params size (MB)
10        Modules in train mode
0         Modules in eval mode


                                                                                                                                                                                                                        

/home/dvq/frostmourne/recsys-blog/1-seq-model/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/home/dvq/frostmourne/recsys-blog/1-seq-model/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1518/1518 [00:08<00:00, 174.89it/s, v_num=cb7b, train_loss_step=0.0354]
[Aidation: |                                                                                                                                                                                     | 0/? [00:00<?, ?it/s]
[Aidation:   0%|                                                                                                                                                                                | 0/28 [00:00<?, ?it/s]
[Aidation DataLoader 0:   0%|                                                                                                                                                                   | 0/28 [00:00<?, ?it/s]
[Aidation DataLoader 0:   4%|█████▌                                                                                                

[32m2025-03-08 20:17:34.030[0m | [1mINFO    [0m | [36msrc.sequence.trainer[0m:[36mon_fit_end[0m:[36m121[0m - [1mLoading best model from /home/dvq/frostmourne/recsys-blog/1-seq-model/notebooks/data/003-sequence-model-increase-l2-reg/checkpoints/best-checkpoint.ckpt...[0m
[32m2025-03-08 20:17:34.185[0m | [1mINFO    [0m | [36msrc.sequence.trainer[0m:[36mon_fit_end[0m:[36m129[0m - [1mLogging ranking metrics...[0m
Generating recommendations: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 434/434 [00:00<00:00, 604.44it/s]


🏃 View run 003-sequence-model-increase-l2-reg at: http://localhost:5002/#/experiments/1/runs/f1e1fefd7166418a96d9dc38f3adcb7b
🧪 View experiment at: http://localhost:5002/#/experiments/1


In [25]:
logger.info(
    f"Test predicting after training with {args.user_col} = {user_id} and {args.item_col} = {item_id}"
)
model.eval()
model = model.to(user.device)
model.predict(user, item_sequence, item)
model.train()

[32m2025-03-08 20:17:39.489[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mTest predicting after training with user_id = AGJCRK2ZEVQ4KLPESEEBQBNSEDEQ and parent_asin = B06XKB5J4M[0m


SequenceRatingPrediction(
  (item_embedding): Embedding(7389, 128, padding_idx=7388)
  (user_embedding): Embedding(19734, 128)
  (relu): ReLU()
  (dropout): Dropout(p=0.3, inplace=False)
  (fc_rating): Sequential(
    (0): Linear(in_features=384, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=128, out_features=1, bias=True)
    (5): Sigmoid()
  )
)

# Load best checkpoint

In [26]:
logger.info(f"Loading best checkpoint from {checkpoint_callback.best_model_path}...")
args.best_checkpoint_path = checkpoint_callback.best_model_path

best_trainer = LitSequenceRatingPrediction.load_from_checkpoint(
    checkpoint_callback.best_model_path,
    model=init_model(n_users, n_items, args.embedding_dim, dropout=0),
)

[32m2025-03-08 20:17:39.521[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mLoading best checkpoint from /home/dvq/frostmourne/recsys-blog/1-seq-model/notebooks/data/003-sequence-model-increase-l2-reg/checkpoints/best-checkpoint.ckpt...[0m


In [27]:
best_model = best_trainer.model.to(lit_model.device)

In [28]:
best_model.eval()
best_model.predict(user, item_sequence, item)
best_model.train()

SequenceRatingPrediction(
  (item_embedding): Embedding(7389, 128, padding_idx=7388)
  (user_embedding): Embedding(19734, 128)
  (relu): ReLU()
  (dropout): Dropout(p=0, inplace=False)
  (fc_rating): Sequential(
    (0): Linear(in_features=384, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0, inplace=False)
    (4): Linear(in_features=128, out_features=1, bias=True)
    (5): Sigmoid()
  )
)

### Persist id mapping

In [29]:
if args.log_to_mlflow:
    # Persist id_mapping so that at inference we can predict based on item_ids (string) instead of item_index
    run_id = trainer.logger.run_id
    mlf_client = trainer.logger.experiment
    mlf_client.log_artifact(run_id, idm_fp)

### Wrap inference function and register best checkpoint as MLflow model

In [30]:
inferrer = SequenceRatingPredictionInferenceWrapper(best_model)

In [31]:
sample_input = {
    "user_ids": [idm.get_user_id(0)],
    "item_sequences": [[idm.get_item_id(0), idm.get_item_id(1)]],
    "item_ids": [idm.get_item_id(0)],
}
sample_output = inferrer.infer([0], [[0, 1]], [0])
sample_output

array([0.53312916], dtype=float32)

In [32]:
if args.log_to_mlflow:
    run_id = trainer.logger.run_id
    sample_output_np = sample_output
    signature = infer_signature(sample_input, sample_output_np)
    idm_filename = idm_fp.split("/")[-1]
    with mlflow.start_run(run_id=run_id):
        mlflow.pyfunc.log_model(
            python_model=inferrer,
            artifact_path="inferrer",
            # We log the id_mapping to the predict function so that it can accept item_id and automatically convert ot item_indice for PyTorch model to use
            artifacts={"idm": mlflow.get_artifact_uri(idm_filename)},
            signature=signature,
            input_example=sample_input,
            registered_model_name=args.mlf_model_name,
        )

2025/03/08 20:17:39 INFO mlflow.pyfunc: Validating input example against model signature
Downloading artifacts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 82.82it/s]
Registered model 'sequence_rating_prediction' already exists. Creating a new version of this model...
2025/03/08 20:17:42 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: sequence_rating_prediction, version 3


🏃 View run 003-sequence-model-increase-l2-reg at: http://localhost:5002/#/experiments/1/runs/f1e1fefd7166418a96d9dc38f3adcb7b
🧪 View experiment at: http://localhost:5002/#/experiments/1


Created version '3' of model 'sequence_rating_prediction'.


# Set the newly trained model as champion

In [33]:
if args.log_to_mlflow:
    key_metric = 'val_RecallTopKMetric_at_k_as_step'
    # Get current champion
    deploy_alias = "champion"
    curr_model_run_id = None

    min_metric_value = args.min_metric_value

    try:
        curr_champion_model = mlf_client.get_model_version_by_alias(
            args.mlf_model_name, deploy_alias
        )
        curr_model_run_id = curr_champion_model.run_id
    except MlflowException as e:
        if "not found" in str(e).lower():
            logger.info(
                f"There is no {deploy_alias} alias for model {args.mlf_model_name}"
            )

    # Compare new vs curr models
    new_mlf_run = trainer.logger.experiment.get_run(trainer.logger.run_id)
    new_metrics = new_mlf_run.data.metrics
    metric_value = new_metrics[key_metric]
    if curr_model_run_id:
        curr_model_run_info = mlf_client.get_run(curr_model_run_id)
        curr_metrics = curr_model_run_info.data.metrics
        if (curr_metric_value := curr_metrics[key_metric]) > min_metric_value:
            logger.info(
                f"Current {deploy_alias} model has {curr_metric_value:,.4f} ROC-AUC. Setting it to the deploy baseline..."
            )
            min_metric_value = curr_metric_value

        top_metrics = [key_metric]
        vizer = ModelMetricsComparisonVisualizer(curr_metrics, new_metrics, top_metrics)
        print("Comparing metrics between new run and current champion:")
        display(vizer.compare_metrics_df())
        vizer.create_metrics_comparison_plot(n_cols=5)
        vizer.plot_diff()

    # Register new champion
    if metric_value < min_metric_value:
        logger.info(
            f"Current run has {key_metric} = {metric_value:,.4f}, smaller than {min_metric_value:,.4f}. Skip aliasing this model as the new {deploy_alias}.."
        )
    else:
        logger.info("Aliasing the new model as champion...")
        # Get the model version for current run by assuming it's the most recent registered version
        model_version = (
            mlf_client.get_registered_model(args.mlf_model_name)
            .latest_versions[0]
            .version
        )

        mlf_client.set_registered_model_alias(
            name=args.mlf_model_name, alias="champion", version=model_version
        )

        mlf_client.set_model_version_tag(
            name=args.mlf_model_name,
            version=model_version,
            key="author",
            value=args.author,
        )

[32m2025-03-08 20:17:42.481[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1mThere is no champion alias for model sequence_rating_prediction[0m
[32m2025-03-08 20:17:42.487[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m42[0m - [1mCurrent run has val_RecallTopKMetric_at_k_as_step = 0.0369, smaller than 0.0500. Skip aliasing this model as the new champion..[0m


# Clean up

In [34]:
all_params = [args]

if args.log_to_mlflow:
    with mlflow.start_run(run_id=run_id):
        for params in all_params:
            params_dict = params.dict()
            params_ = dict()
            for k, v in params_dict.items():
                if k == "top_K":
                    k = "top_big_K"
                if k == "top_k":
                    k = "top_small_k"
                params_[f"{params.__repr_name__()}.{k}"] = v
            mlflow.log_params(params_)

🏃 View run 003-sequence-model-increase-l2-reg at: http://localhost:5002/#/experiments/1/runs/f1e1fefd7166418a96d9dc38f3adcb7b
🧪 View experiment at: http://localhost:5002/#/experiments/1


/tmp/ipykernel_79371/747004171.py:6: PydanticDeprecatedSince20:

The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/

