## Setup

In [1]:
%load_ext autoreload
%autoreload 2
%load_ext tensorboard

In [2]:
import pandas as pd
import numpy as np
from pydantic import BaseModel
import sys
import os
from lightning.pytorch.loggers import MLFlowLogger
from loguru import logger
from load_dotenv import load_dotenv
import time
import json
import torch
from torch.utils.data import DataLoader
import lightning as L
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.callbacks import ModelCheckpoint
import mlflow

sys.path.insert(0, "..")

from src.utils.embedding_id_mapper import IDMapper
from src.algo.sequence.model import SequenceRatingPrediction
from src.algo.sequence.dataset import UserItemBinaryRatingDFDataset
from src.algo.sequence.trainer import SeqModellingLitModule
from src.eval.utils import create_rec_df, create_label_df, merge_recs_with_target
from src.eval.log_metrics import log_ranking_metrics, log_classification_metrics

In [3]:
load_dotenv(override = True)

True

In [4]:
class Args(BaseModel):
    testing: bool = False
    log_to_mlflow: bool = True
    experiment_name: str = "first-attempt"
    notebook_persit_dp: str = None
    
    run_name: str = None

    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"
    group_name: str = "seq-modelling"

    top_K: int = 100
    top_k: int = 10

    batch_size: int = 128
    learning_rate: float = 0.001
    l2_reg: float = 5e-6
    early_stopping_patience: int = 50
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    max_epochs: int = 100

    # TwoTower specific
    dropout: float = 0.2
    embedding_dim: int = 128


    train_data_fp: str = os.path.abspath("../data_for_ai/interim/train_sample_interactions_16407u_neg_seq.parquet")
    val_data_fp: str = os.path.abspath("../data_for_ai/interim/val_sample_interactions_16407u_neg_seq.parquet")

    best_checkpoint_path: str = None
    def init(self):
        self.run_name: str = f"006-sequence-modelling-attn-{self.embedding_dim}-dim-bce-prelu"
        self.notebook_persit_dp = os.path.abspath(f"data/{self.experiment_name}/{self.run_name}")

        if not (mlflow_uri := os.environ.get("MLFLOW_TRACKING_URI")):
            self.log_to_mlflow = False
            logger.warning("MLFlow is not enabled. Turn off tracking to Mlflow.")

        if self.log_to_mlflow:
            logger.info(
                f"Setting up Mlflow experiment: {self.experiment_name}, run_name: {self.run_name}"
            )

            self._mlf_logger = MLFlowLogger(
                experiment_name=self.experiment_name,
                run_name=self.run_name,
                tracking_uri=mlflow_uri,
                log_model=True,
            )

        if not self.testing:
            os.makedirs(self.notebook_persit_dp, exist_ok=True)
        return self
    
args = Args().init()
print(args.model_dump_json(indent=2))

[32m2025-05-21 13:57:05.585[0m | [1mINFO    [0m | [36m__main__[0m:[36minit[0m:[36m43[0m - [1mSetting up Mlflow experiment: first-attempt, run_name: 006-sequence-modelling-attn-128-dim-bce-prelu[0m


{
  "testing": false,
  "log_to_mlflow": true,
  "experiment_name": "first-attempt",
  "notebook_persit_dp": "/home/dinhln/Desktop/real_time_recsys/notebooks/data/first-attempt/006-sequence-modelling-attn-128-dim-bce-prelu",
  "run_name": "006-sequence-modelling-attn-128-dim-bce-prelu",
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "group_name": "seq-modelling",
  "top_K": 100,
  "top_k": 10,
  "batch_size": 128,
  "learning_rate": 0.001,
  "l2_reg": 5e-6,
  "early_stopping_patience": 50,
  "device": "cuda",
  "max_epochs": 100,
  "dropout": 0.2,
  "embedding_dim": 128,
  "train_data_fp": "/home/dinhln/Desktop/real_time_recsys/data_for_ai/interim/train_sample_interactions_16407u_neg_seq.parquet",
  "val_data_fp": "/home/dinhln/Desktop/real_time_recsys/data_for_ai/interim/val_sample_interactions_16407u_neg_seq.parquet",
  "best_checkpoint_path": null
}


## Init model

In [5]:
def init_model(n_users, n_items, embedding_dim, dropout, item_embedding=None):
    return SequenceRatingPrediction(
        item_embedding=item_embedding,
        num_users=n_users,
        num_items=n_items,
        embedding_dim=embedding_dim,
        dropout=dropout,
    )

## Test implementation

In [6]:
embedding_dim = 16
batch_size = 2

# Mock data
user_indices = [0, 0, 1, 2, 2]
item_indices = [0, 1, 2, 3, 4]
timestamps = [0, 1, 2, 3, 4]
ratings = [0, 3, 1, 3, 0]
item_sequences = [
    [-1, -1, 2, 3],
    [-1, -1, 2, 3],
    [-1, -1, 1, 3],
    [-1, -1, 2, 1],
    [-1, -1, 2, 1],
]

n_users = len(set(user_indices))
n_items = len(set(item_indices))

train_df = pd.DataFrame(
    {
        "user_indice": user_indices,
        "item_indice": item_indices,
        args.timestamp_col: timestamps,
        args.rating_col: ratings,
        "item_sequence": item_sequences,
    }
)

model = init_model(n_users, n_items, embedding_dim, args.dropout)

# Example forward pass
model.eval()
user = torch.tensor([0])
item_sequence = torch.tensor([[-1, -1, -1, 0, 1]])
target_item = torch.tensor([2])
predictions = model(user, item_sequence, target_item)
print(predictions)
model.train()

tensor([[0.1964]], grad_fn=<MaskedFillBackward0>)



enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.activation_relu_or_gelu was not True



SequenceRatingPrediction(
  (item_embedding): Embedding(7, 16, padding_idx=6)
  (user_embedding): Embedding(3, 16)
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=16, out_features=16, bias=True)
    )
    (linear1): Linear(in_features=16, out_features=16, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
    (linear2): Linear(in_features=16, out_features=16, bias=True)
    (norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.2, inplace=False)
    (dropout2): Dropout(p=0.2, inplace=False)
    (activation): PReLU(num_parameters=1)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=16, out_features=16, bias=True)
        )
        (line

In [7]:
rating_dataset = UserItemBinaryRatingDFDataset(
    train_df, "user_indice", "item_indice", args.rating_col, args.timestamp_col,"item_sequence"
)

train_loader = DataLoader(
    rating_dataset, batch_size=batch_size, shuffle=False, drop_last=True
)

In [8]:
for batch_input in train_loader:
    print(batch_input)

{'user': tensor([0, 0]), 'item': tensor([0, 1]), 'rating': tensor([0., 1.]), 'item_sequence': tensor([[-1, -1,  2,  3],
        [-1, -1,  2,  3]], dtype=torch.int32)}
{'user': tensor([1, 2]), 'item': tensor([2, 3]), 'rating': tensor([1., 1.]), 'item_sequence': tensor([[-1, -1,  1,  3],
        [-1, -1,  2,  1]], dtype=torch.int32)}


In [9]:
# model
lit_model = SeqModellingLitModule(model, log_dir=args.notebook_persit_dp)

# train model
trainer = L.Trainer(
    default_root_dir=f"{args.notebook_persit_dp}/test",
    max_epochs=100,
    accelerator=args.device if args.device else "auto",
)
trainer.fit(
    model=lit_model, train_dataloaders=train_loader, val_dataloaders=train_loader
)

You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3050 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type                     | Params | Mode 
------------------------------------------------------------------------
0 | model              | SequenceRatingPrediction | 4.2 K  | train
1 | val_roc_auc_metric | BinaryAUROC              | 0      | train
2 | val_pr_auc_metric  | BinaryAveragePrecision   | 0      

Sanity Checking: |          | 0/? [00:00<?, ?it/s]


The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


The number of training batches (2) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=100` reached.


In [10]:
model.eval()
user = torch.tensor([0])
item_sequence = torch.tensor([[-1, -1, -1, 0, 1]])
target_item = torch.tensor([2])
predictions = model.predict(user, item_sequence, target_item)
print(predictions)

tensor([[0.9936]], grad_fn=<SigmoidBackward0>)


## Training loop

In [11]:
train_df = pd.read_parquet(args.train_data_fp)
val_df = pd.read_parquet(args.val_data_fp)

assert set(val_df[args.user_col].unique()).issubset(set(train_df[args.user_col].unique())), "Validation users must be present in training users."

assert set(val_df[args.item_col].unique()).issubset(set(train_df[args.item_col].unique())), "Validation items must be present in training items."
assert train_df[args.timestamp_col].max() < val_df[args.timestamp_col].min(), "Validation data must be after training data. Otherwise, its a data contamination problem."

In [12]:
train_df.head(3)

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
0,AFZ4EK2LJ655XQKTEUELCARO6RYA,B00002EQCW,4.0,2003-01-23 03:28:15,8071,4,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
1,AFY2C4YOUP2SSMM43HD2L3FIEFZA,B00008SCFL,5.0,2003-11-25 18:12:09,7935,36,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
2,AHF3TGIOSTD2UCHF3MO4MIHFJ5NQ,B07KQWX947,5.0,2004-06-18 02:02:57,13705,3514,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."


## Convert user_id and item_id into indices

In [13]:
# idm_path = os.path.abspath("../data_for_ai/interim/idm_16407u.json")
# idm = IDMapper().load(idm_path)
# idm.get_user_id(1)

In [14]:
# train_df = train_df.pipe(idm.map_indices)
# val_df = val_df.pipe(idm.map_indices)

# assert idm.unknown_item_index not in train_df["item_indice"].values, "Unknown item index must be present in training data."
# assert idm.unknown_user_index not in train_df["user_indice"].values, "Unknown user index must be present in training data."
# assert idm.unknown_item_index not in val_df["item_indice"].values, "Unknown item index must be present in validation data."
# assert idm.unknown_user_index not in val_df["user_indice"].values, "Unknown user index must be present in validation data."

In [15]:
# train_df.head(3)

In [16]:
# assert train_df.groupby(args.user_col)[args.item_col].nunique().min() >= 5, "Each user must have at least five items."
# assert train_df.groupby(args.item_col)[args.user_col].nunique().min() >= 10, "Each item must have at least ten users."

## Training loop

In [17]:
rating_dataset = UserItemBinaryRatingDFDataset(
    train_df, "user_indice", "item_indice", args.rating_col, args.timestamp_col, "item_sequence"
)
val_rating_dataset = UserItemBinaryRatingDFDataset(
    val_df, "user_indice", "item_indice", args.rating_col, args.timestamp_col, "item_sequence"
)

train_loader = DataLoader(
    rating_dataset, batch_size=args.batch_size, shuffle=True, drop_last=True
)
val_loader = DataLoader(
    val_rating_dataset, batch_size=args.batch_size, shuffle=False, drop_last=False
)

In [18]:
item_indices = train_df[args.item_col].unique()
user_indices = train_df[args.user_col].unique()
n_items = len(item_indices)
n_users = len(user_indices)

logger.info(f"Number of users: {n_users}, Number of items: {n_items}")
model = init_model(n_users, n_items, args.embedding_dim, args.dropout)

[32m2025-05-21 13:57:15.342[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mNumber of users: 16407, Number of items: 4817[0m

enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.activation_relu_or_gelu was not True



In [19]:
idm_path = os.path.abspath("../data_for_ai/interim/idm_16407u.json")
idm = IDMapper().load(idm_path)
idm.get_user_id(1)

'AE227WAM4NWQPJI33OPN7ZARNNZQ'

## Overfir 1 batch

In [20]:
early_stopping = EarlyStopping(
    monitor="val_loss", patience=5, mode="min", verbose=False
)

model = init_model(n_users, n_items, args.embedding_dim, args.dropout)
lit_model = SeqModellingLitModule(
    model,
    learning_rate=args.learning_rate,
    l2_reg=args.l2_reg,
    log_dir=args.notebook_persit_dp,
    accelerator=args.device,
    idm= idm
)

log_dir = f"{args.notebook_persit_dp}/logs/overfit"

# train model
trainer = L.Trainer(
    default_root_dir=log_dir,
    accelerator=args.device if args.device else "auto",
    max_epochs=50,
    overfit_batches=1,
    callbacks=[early_stopping],
)
trainer.fit(
    model=lit_model,
    train_dataloaders=train_loader,
    val_dataloaders=train_loader,
)
logger.info(f"Logs available at {trainer.log_dir}")

You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
`Trainer(overfit_batches=1)` was configured so 1 batch will be used.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type                     | Params | Mode 
------------------------------------------------------------------------
0 | model              | SequenceRatingPrediction | 3.0 M  | train
1 | val_roc_auc_metric | BinaryAUROC              | 0      | train
2 | val_pr_auc_metric  | BinaryAveragePrecision   | 0      | train
------------------------------------------------------------------------
3.0 M     Trainable params
0         Non-trainable params
3.0 M     Total params
11.829    Total estimated model params size (MB)
35        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]


You requested to overfit but enabled val dataloader shuffling. We are turning off the val dataloader shuffling for you.


The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


No negative samples in targets, false positive value should be meaningless. Returning zero tensor in false positive score


You requested to overfit but enabled train dataloader shuffling. We are turning off the train dataloader shuffling for you.


The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


The number of training batches (1) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=50` reached.
[32m2025-05-21 13:57:21.006[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [1mLogs available at /home/dinhln/Desktop/real_time_recsys/notebooks/data/first-attempt/006-sequence-modelling-attn-128-dim-bce-prelu/logs/overfit/lightning_logs/version_59[0m


In [21]:
for i in train_loader:
    print(i)
    break

{'user': tensor([10519,  2753, 15007,  5989, 15516,  1724,  3544,  2040, 13607,  9981,
         9324, 14952,  1422,  8790,  6454,  7855,  7116,  6549,   699, 11753,
         2451, 12682, 12035,  4839, 15821, 10611,  3470,  3620, 12991, 15144,
        15802, 10977, 10930,  2546,  7764,  2565, 12774,  7209,  2206,  9818,
        13101, 10258, 11163,  8071,    53, 12020,  6924,  4372, 16284, 15186,
        14699, 12280, 11647,  8472,   193,  7051,  3393,  8252,   621, 10885,
         2305,  3784,  8115, 11791, 15789,  3954, 14597,   269,  1328,  5257,
         3104,  1730,  1508,  5223,  6213, 10334,  8806,  8232, 13295,  5178,
         2078,  2402, 14237,  6009,   647,  1831, 11801, 14727,  3262,  7832,
        14509,  8830,  9971, 13990,  2965,  3056, 11489,  6624, 11543,  1692,
        13940, 13967, 12796,  7580, 11404, 12843,  6719, 14669,   650,  4669,
         3500, 16178, 13827, 10904,  6978,  7295,  4360,   657, 11383,  7597,
        11892,  5095, 11858,  4447, 12841, 13034, 14629

## Run on all data

In [22]:
early_stopping = EarlyStopping(
    monitor="val_roc_auc", patience=args.early_stopping_patience, mode="max", verbose=False, min_delta=0.001
)

checkpoint_callback = ModelCheckpoint(
    dirpath=f"{args.notebook_persit_dp}/checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    monitor="val_roc_auc",
    mode="max",
)

model = init_model(n_users, n_items, args.embedding_dim, args.dropout)

print(f"Model: {model}")
lit_model = SeqModellingLitModule(
    model,
    learning_rate=args.learning_rate,
    l2_reg=args.l2_reg,
    log_dir=args.notebook_persit_dp,
    accelerator=args.device,
    idm= idm
)

log_dir = f"{args.notebook_persit_dp}/logs/run"

# train model
trainer = L.Trainer(
    default_root_dir=log_dir,
    accelerator=args.device if args.device else "auto",
    max_epochs=args.max_epochs,
    callbacks=[early_stopping, checkpoint_callback],
    logger=args._mlf_logger if args.log_to_mlflow else None,
    
)
trainer.fit(
    model=lit_model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
    
)

# Change the library as a workaround for the issue in the latest Lightning release
#https://github.com/Lightning-AI/pytorch-lightning/pull/20669/commits/429f732a0528c558e701da7ec01e51c1e2e4f32e

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Model: SequenceRatingPrediction(
  (item_embedding): Embedding(4819, 128, padding_idx=4818)
  (user_embedding): Embedding(16407, 128)
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
    )
    (linear1): Linear(in_features=128, out_features=128, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
    (linear2): Linear(in_features=128, out_features=128, bias=True)
    (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.2, inplace=False)
    (dropout2): Dropout(p=0.2, inplace=False)
    (activation): PReLU(num_parameters=1)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=


Checkpoint directory /home/dinhln/Desktop/real_time_recsys/notebooks/data/first-attempt/006-sequence-modelling-attn-128-dim-bce-prelu/checkpoints exists and is not empty.

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type                     | Params | Mode 
------------------------------------------------------------------------
0 | model              | SequenceRatingPrediction | 3.0 M  | train
1 | val_roc_auc_metric | BinaryAUROC              | 0      | train
2 | val_pr_auc_metric  | BinaryAveragePrecision   | 0      | train
------------------------------------------------------------------------
3.0 M     Trainable params
0         Non-trainable params
3.0 M     Total params
11.829    Total estimated model params size (MB)
35        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]


The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.




No negative samples in targets, false positive value should be meaningless. Returning zero tensor in false positive score


The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

🏃 View run 006-sequence-modelling-attn-128-dim-bce-prelu at: http://138.2.61.6:5002/#/experiments/2/runs/478d1654ba75416bbeca6619131216c6
🧪 View experiment at: http://138.2.61.6:5002/#/experiments/2


## Log metrics

In [23]:
logger.info(f"Loading best checkpoint from {checkpoint_callback.best_model_path}...")
args.best_checkpoint_path = checkpoint_callback.best_model_path

best_trainer = SeqModellingLitModule.load_from_checkpoint(
    checkpoint_path=checkpoint_callback.best_model_path,
    model=init_model(n_users, n_items, args.embedding_dim, args.dropout),
)

[32m2025-05-21 14:35:56.826[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mLoading best checkpoint from /home/dinhln/Desktop/real_time_recsys/notebooks/data/first-attempt/006-sequence-modelling-attn-128-dim-bce-prelu/checkpoints/best-checkpoint-v20.ckpt...[0m

enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.activation_relu_or_gelu was not True



In [24]:
best_model = best_trainer.model.to(args.device)
best_model.eval()

SequenceRatingPrediction(
  (item_embedding): Embedding(4819, 128, padding_idx=4818)
  (user_embedding): Embedding(16407, 128)
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
    )
    (linear1): Linear(in_features=128, out_features=128, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
    (linear2): Linear(in_features=128, out_features=128, bias=True)
    (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.2, inplace=False)
    (dropout2): Dropout(p=0.2, inplace=False)
    (activation): PReLU(num_parameters=1)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
 

In [25]:
val_recs_df = val_df.sort_values(by=args.timestamp_col).drop_duplicates(subset=[args.user_col], keep="first")

In [26]:
mlflow.start_run(run_id = trainer.logger.run_id)

<ActiveRun: >

### Classification metrics

In [27]:
val_user_indices = val_df["user_indice"].values
val_item_indices = val_df["item_indice"].values
val_item_sequences = val_df["item_sequence"].values.tolist()

In [28]:
users = torch.tensor(val_user_indices, device=args.device)
item_sequences = torch.tensor(val_item_sequences, device=args.device)
items = torch.tensor(val_item_indices, device=args.device)
classifications = best_model.predict(users, item_sequences, items)


Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at /pytorch/torch/csrc/utils/tensor_new.cpp:254.)



In [29]:
classifications.shape

torch.Size([6958, 1])

In [30]:
eval_classification_df = val_df.assign(
    classification_proba=classifications.cpu().detach().numpy(),
    label=lambda df: df[args.rating_col].gt(0).astype(int),
)

In [31]:
eval_classification_df.head(3)

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence,classification_proba,label
0,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B004FV4ROA,1.0,2020-12-27 00:30:31.146,11295,528,"[1715, 2537, 3743, 506, 4490, 3479, 3908, 2723...",0.498414,1
1,AEHS7YR7BGGWMZS24H5UR5IP46HQ,B08F1P3BCC,2.0,2020-12-27 01:44:52.242,1784,3925,"[-1, -1, -1, -1, -1, 3382, 4330, 423, 3167, 2677]",0.498414,1
2,AGAVHCK42EGMVS7DGPRX6HBCUCNQ,B09Q3NR84W,5.0,2020-12-27 02:25:48.357,9042,4273,"[-1, -1, -1, -1, 3104, 1416, 3743, 2694, 3612,...",0.498414,1


In [32]:
classification_report = log_classification_metrics(
    args,
    eval_classification_df,
    target_col="label",
    prediction_col="classification_proba",
)


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



### Ranking metrics

In [33]:
val_recs_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
0,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B004FV4ROA,1.0,2020-12-27 00:30:31.146,11295,528,"[1715, 2537, 3743, 506, 4490, 3479, 3908, 2723..."
1,AEHS7YR7BGGWMZS24H5UR5IP46HQ,B08F1P3BCC,2.0,2020-12-27 01:44:52.242,1784,3925,"[-1, -1, -1, -1, -1, 3382, 4330, 423, 3167, 2677]"
2,AGAVHCK42EGMVS7DGPRX6HBCUCNQ,B09Q3NR84W,5.0,2020-12-27 02:25:48.357,9042,4273,"[-1, -1, -1, -1, 3104, 1416, 3743, 2694, 3612,..."
3,AEFVBMCJAFNULDI5V2CKKTBCPURA,B07N1L5HX1,5.0,2020-12-27 02:32:15.171,1542,3550,"[-1, -1, -1, -1, -1, 1320, 2162, 2472, 2694, 3..."
4,AGLXMKHBLTBNT3X2CLBAPW6QUTQA,B0BB6Y5N3M,5.0,2020-12-27 03:37:22.772,10418,4471,"[341, 3803, 4431, 1067, 4530, 4018, 2688, 4365..."
...,...,...,...,...,...,...,...
6496,AGGDNWGN3NDJ2DI5CBSFOMUAM6XA,B076XFGK32,0.0,2022-02-18 19:43:25.492,9711,3115,"[-1, -1, -1, -1, 1019, 754, 2059, 413, 4262, 3..."
3474,AEKUF6AOVWDWFYOKPWO2CV72PEDQ,B07QN33986,5.0,2022-02-19 01:32:51.519,2171,3626,"[-1, -1, 2627, 4216, 4743, 1945, 2355, 1831, 9..."
3475,AFBTD25HPE4BE4LUFV3DTI2E2N2A,B07TMJ8S5Z,5.0,2022-02-19 16:49:57.966,5159,3699,"[-1, -1, -1, -1, 2260, 3517, 3609, 3495, 3625,..."
3477,AHLN6GKTKZE22AON34YAQXTGK63A,B0C682GZ5X,5.0,2022-02-19 17:28:55.519,14550,4772,"[-1, -1, -1, -1, -1, 1812, 4165, 4575, 4807, 374]"


In [34]:
recommendations = best_model.recommend(
    torch.tensor(val_recs_df["user_indice"].values, device=args.device),
    torch.tensor(val_recs_df["item_sequence"].values.tolist(), device=args.device),
    k=args.top_K,
    batch_size=1)

Generating recommendations:   0%|          | 0/2424 [00:00<?, ?it/s]

In [35]:
recommendations_df = pd.DataFrame(recommendations).pipe(
    create_rec_df, idm, args.user_col, args.item_col
)
recommendations_df

Unnamed: 0,user_indice,recommendation,score,rec_ranking,user_id,parent_asin
0,11295,4817,0.498414,1.0,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,unknown_item
1,11295,0,0.498414,2.0,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,0972683275
2,11295,1,0.498414,3.0,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,1449410243
3,11295,2,0.498414,4.0,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B000001OM5
4,11295,3,0.498414,5.0,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B00000K2YR
...,...,...,...,...,...,...
242395,2446,94,0.498414,96.0,AEMYBWDN67IB5IBTMHLHN76V4QHQ,B000EH0NLK
242396,2446,95,0.498414,97.0,AEMYBWDN67IB5IBTMHLHN76V4QHQ,B000ENRQ3M
242397,2446,96,0.498414,98.0,AEMYBWDN67IB5IBTMHLHN76V4QHQ,B000ER5G6C
242398,2446,97,0.498414,99.0,AEMYBWDN67IB5IBTMHLHN76V4QHQ,B000EZL1EA


In [36]:
label_df = create_label_df(
    val_df,
    user_col=args.user_col,
    item_col=args.item_col,
    rating_col=args.rating_col,
    timestamp_col=args.timestamp_col,
)
label_df

Unnamed: 0,user_id,parent_asin,rating,rating_rank
3478,AEMYBWDN67IB5IBTMHLHN76V4QHQ,B091K4WYD1,4.0,1.0
1781,AHZ6GFHFM6Z7CRPSXRIYQ5Z7GERQ,B07JMQP6T6,5.0,1.0
1780,AFQZQHAMZHP54BLVW3AZG2NDKAQA,B01N27P7ME,3.0,1.0
1779,AH7L2ZE36P7Q7ZDTDE2FIWWBU7ZA,B0B5J7MLTS,5.0,1.0
1777,AGOAZS3ZJNV74POYA7OW2JBZYAQQ,B0B2Y5WYRG,5.0,1.0
...,...,...,...,...
6099,AFKERAMSXU4MWO3H53R7DEFOHUVQ,B0BSF17PM2,0.0,17.0
5244,AEN2KQVSR5TWRXNQS3OTFT4EZQCA,B07D4Z36V8,0.0,18.0
4648,AFKERAMSXU4MWO3H53R7DEFOHUVQ,B003XRES32,0.0,18.0
4949,AFKERAMSXU4MWO3H53R7DEFOHUVQ,B0051VVOB2,0.0,19.0


In [37]:
eval_df = merge_recs_with_target(
    recommendations_df,
    label_df,
    k=args.top_K,
    user_col=args.user_col,
    item_col=args.item_col,
    rating_col=args.rating_col,
)
eval_df

Unnamed: 0,user_indice,recommendation,score,rec_ranking,user_id,parent_asin,rating,rating_rank
103,8.0,4817.0,0.498414,1,AE24AB4DW5KYK3F5DYOT5VPW2VLA,unknown_item,0,
0,8.0,0.0,0.498414,2,AE24AB4DW5KYK3F5DYOT5VPW2VLA,0972683275,0,
1,8.0,1.0,0.498414,3,AE24AB4DW5KYK3F5DYOT5VPW2VLA,1449410243,0,
2,8.0,2.0,0.498414,4,AE24AB4DW5KYK3F5DYOT5VPW2VLA,B000001OM5,0,
3,8.0,3.0,0.498414,5,AE24AB4DW5KYK3F5DYOT5VPW2VLA,B00000K2YR,0,
...,...,...,...,...,...,...,...,...
249274,16403.0,96.0,0.498414,98,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B000ER5G6C,0,
249275,16403.0,97.0,0.498414,99,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B000EZL1EA,0,
249276,16403.0,98.0,0.498414,100,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B000F5K82A,0,
249277,,,,101,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B075QC3TZY,1,1.0


In [38]:
ranking_report = log_ranking_metrics(args, eval_df)


invalid value encountered in divide



In [39]:
mlflow.end_run()

🏃 View run 006-sequence-modelling-attn-128-dim-bce-prelu at: http://138.2.61.6:5002/#/experiments/2/runs/478d1654ba75416bbeca6619131216c6
🧪 View experiment at: http://138.2.61.6:5002/#/experiments/2


## Clean up

In [40]:
all_params = [args]

if args.log_to_mlflow:
    run_id = trainer.logger.run_id

    with mlflow.start_run(run_id=run_id):
        for params in all_params:
            params_dict = params.model_dump()
            params_ = dict()
            for k, v in params_dict.items():
                if k == "top_K":
                    k = "top_big_K"
                if k == "top_k":
                    k = "top_small_k"
                params_[f"{params.__repr_name__()}.{k}"] = v
            mlflow.log_params(params_)

🏃 View run 006-sequence-modelling-attn-128-dim-bce-prelu at: http://138.2.61.6:5002/#/experiments/2/runs/478d1654ba75416bbeca6619131216c6
🧪 View experiment at: http://138.2.61.6:5002/#/experiments/2
