## Setup

In [1]:
%load_ext autoreload
%autoreload 2
%load_ext tensorboard

In [2]:
import pandas as pd
import numpy as np
from pydantic import BaseModel
import sys
import os
from lightning.pytorch.loggers import MLFlowLogger
from loguru import logger
from load_dotenv import load_dotenv
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import lightning as L
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.callbacks import ModelCheckpoint
import mlflow

sys.path.insert(0, "..")

from src.utils.embedding_id_mapper import IDMapper
from src.algo.sequence.model import SequenceRatingPrediction
from src.algo.sequence.dataset import UserItemBinaryRatingDFDataset
from src.algo.sequence.trainer import SeqModellingLitModule
from src.algo.item2vec.trainer import LitSkipGram
from src.algo.item2vec.model import SkipGram
from src.eval.utils import create_rec_df, create_label_df, merge_recs_with_target
from src.eval.log_metrics import log_ranking_metrics, log_classification_metrics

In [3]:
load_dotenv(override = True)

True

In [None]:
class Args(BaseModel):
    testing: bool = False
    log_to_mlflow: bool = True
    experiment_name: str = "pruning-sequence-modelling"
    notebook_persit_dp: str = None
    
    run_name: str = None

    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"
    group_name: str = "seq-modelling"

    top_K: int = 100
    top_k: int = 10

    batch_size: int = 512
    learning_rate: float = 0.001
    l2_reg: float = 1e-6
    early_stopping_patience: int = 10
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    max_epochs: int = 100

    # TwoTower specific
    dropout: float = 0.3
    embedding_dim: int = 256
    use_user_embedding: bool = True
    user_embedding_dim: int = 32

    use_metadata: bool = True
    metadata_embedding_dim: int = 384
    metadata_fc_dim: int = 128


    train_data_fp: str = os.path.abspath("../data_for_ai/interim/train_sample_interactions_16407u_neg_seq.parquet")
    val_data_fp: str = os.path.abspath("../data_for_ai/interim/val_sample_interactions_16407u_neg_seq.parquet")

    best_checkpoint_path: str = None
    def init(self):
        self.run_name: str = f"attn-{self.embedding_dim}-dim-bce-prelu"
        self.notebook_persit_dp = os.path.abspath(f"data/{self.experiment_name}/{self.run_name}")

        if not (mlflow_uri := os.environ.get("MLFLOW_TRACKING_URI")):
            self.log_to_mlflow = False
            logger.warning("MLFlow is not enabled. Turn off tracking to Mlflow.")

        if self.log_to_mlflow:
            logger.info(
                f"Setting up Mlflow experiment: {self.experiment_name}, run_name: {self.run_name}"
            )

            self._mlf_logger = MLFlowLogger(
                experiment_name=self.experiment_name,
                run_name=self.run_name,
                tracking_uri=mlflow_uri,
                log_model=False,
            )

        if not self.testing:
            os.makedirs(self.notebook_persit_dp, exist_ok=True)
        return self
    
args = Args().init()
print(args.model_dump_json(indent=2))

[32m2025-06-25 23:10:59.313[0m | [1mINFO    [0m | [36m__main__[0m:[36minit[0m:[36m49[0m - [1mSetting up Mlflow experiment: pruning-sequence-modelling, run_name: attn-256-dim-bce-prelu[0m


{
  "testing": false,
  "log_to_mlflow": true,
  "experiment_name": "pruning-sequence-modelling",
  "notebook_persit_dp": "c:\\Users\\Trieu\\OneDrive\\Desktop\\recsys\\real_time_recsys\\notebooks\\data\\pruning-sequence-modelling\\attn-256-dim-bce-prelu",
  "run_name": "attn-256-dim-bce-prelu",
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "group_name": "seq-modelling",
  "top_K": 100,
  "top_k": 10,
  "batch_size": 512,
  "learning_rate": 0.001,
  "l2_reg": 1e-6,
  "early_stopping_patience": 10,
  "device": "cpu",
  "max_epochs": 1,
  "dropout": 0.3,
  "embedding_dim": 256,
  "use_user_embedding": true,
  "user_embedding_dim": 32,
  "use_metadata": true,
  "metadata_embedding_dim": 384,
  "metadata_fc_dim": 64,
  "train_data_fp": "c:\\Users\\Trieu\\OneDrive\\Desktop\\recsys\\real_time_recsys\\data_for_ai\\interim\\train_sample_interactions_16407u_neg_seq.parquet",
  "val_data_fp": "c:\\Users\\Trieu\\OneDrive\\Desktop\

## Init model

In [5]:
# Load metadata embedding
metadata_embedding_matrix = np.load("../data_for_ai/interim/metadata_title_embedding.npy")  # shape (4817, 898)
metadata_embedding_tensor = torch.tensor(metadata_embedding_matrix, dtype=torch.float32)
metadata_embedding_layer = nn.Embedding.from_pretrained(
    embeddings=metadata_embedding_tensor,
    freeze=True,)

In [6]:
print(f"Metadata embedding shape: {metadata_embedding_tensor.shape}")

Metadata embedding shape: torch.Size([4817, 384])


In [7]:
def init_model(n_users, n_items, embedding_dim, dropout, item_embedding=None,
               user_embedding_dim=None,use_user_embedding=False,
               use_metadata=False, metadata_embedding=None,
               metadata_embedding_dim=898, metadata_fc_dim=256):
    return SequenceRatingPrediction(
        item_embedding=item_embedding,
        num_users=n_users,
        num_items=n_items,
        embedding_dim=embedding_dim,
        dropout=dropout,
        user_embedding_dim=user_embedding_dim,
        use_user_embedding=use_user_embedding,
        use_metadata=use_metadata,
        metadata_embedding=metadata_embedding,
        metadata_embedding_dim=metadata_embedding_dim,
        metadata_fc_dim=metadata_fc_dim,
    )

## Test implementation

In [8]:
embedding_dim = 32
user_embedding_dim = 64
batch_size = 2

# Mock data
user_indices = [0, 0, 1, 2, 2]
item_indices = [0, 1, 2, 3, 4]
timestamps = [0, 1, 2, 3, 4]
ratings = [0, 3, 1, 3, 0]
# item_sequences = [
#     [2, 3, -1, -1],
#     [2, 4, -1, -1],
#     [1, 3, -1, -1],
#     [2, 1, -1, -1],
#     [4, 1, -1, -1],
# ]

item_sequences = [
    [-1, -1, 2, 3],
    [-1, -1, 2, 4],
    [-1, -1, 1, 3],
    [-1, -1, 2, 1],
    [-1, -1, 4, 1],
]


n_users = len(set(user_indices))

n_items = len(set(item_indices))

train_df = pd.DataFrame(
    {
        "user_indice": user_indices,
        "item_indice": item_indices,
        args.timestamp_col: timestamps,
        args.rating_col: ratings,
        "item_sequence": item_sequences,
    }
)
# Mock metadata embedding (giả sử mỗi item có 32 chiều metadata embedding)
mock_metadata_embedding_matrix = np.random.randn(n_items, embedding_dim).astype(np.float32)
mock_metadata_embedding_layer = nn.Embedding.from_pretrained(
    embeddings=torch.tensor(mock_metadata_embedding_matrix),
    freeze=True)

model = init_model(n_users, 
                   n_items, 
                   embedding_dim, 
                   args.dropout,
                   user_embedding_dim=user_embedding_dim,
                   use_user_embedding=args.use_user_embedding,
                   use_metadata=args.use_metadata,
                   metadata_embedding=mock_metadata_embedding_layer,
                   metadata_embedding_dim=embedding_dim,
                   metadata_fc_dim=16,)

# Example forward pass
model.eval()
user = torch.tensor([0])
item_sequence = torch.tensor([[-0, 1, -1, -1]])
target_item = torch.tensor([2])
target_metadata = torch.tensor([2])  # giả sử metadata index trùng với item index
predictions = model(user, item_sequence, target_item, target_metadata)
print(predictions)
model.train()


enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.activation_relu_or_gelu was not True

[32m2025-06-25 23:11:03.308[0m | [1mINFO    [0m | [36msrc.algo.sequence.model[0m:[36m__init__[0m:[36m139[0m - [1mStart token used: 4, Padding token used: 5[0m


tensor([[0.4281]], grad_fn=<MaskedFillBackward0>)


SequenceRatingPrediction(
  (item_embedding): Embedding(6, 32, padding_idx=5)
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=32, out_features=32, bias=True)
    )
    (linear1): Linear(in_features=32, out_features=32, bias=True)
    (dropout): Dropout(p=0.3, inplace=False)
    (linear2): Linear(in_features=32, out_features=32, bias=True)
    (norm1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.3, inplace=False)
    (dropout2): Dropout(p=0.3, inplace=False)
    (activation): PReLU(num_parameters=1)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=32, out_features=32, bias=True)
        )
        (linear1): Linear(in_features=32, out_feat

In [9]:
rating_dataset = UserItemBinaryRatingDFDataset(
    train_df, "user_indice", "item_indice", args.rating_col, args.timestamp_col,"item_sequence"
)

train_loader = DataLoader(
    rating_dataset, batch_size=batch_size, shuffle=False, drop_last=True
)

In [10]:
for batch_input in train_loader:
    print(batch_input)

{'user': tensor([0, 0]), 'item': tensor([0, 1]), 'rating': tensor([0., 1.]), 'item_sequence': tensor([[-1, -1,  2,  3],
        [-1, -1,  2,  4]], dtype=torch.int32)}
{'user': tensor([1, 2]), 'item': tensor([2, 3]), 'rating': tensor([1., 1.]), 'item_sequence': tensor([[-1, -1,  1,  3],
        [-1, -1,  2,  1]], dtype=torch.int32)}


In [11]:
# model
lit_model = SeqModellingLitModule(model, log_dir=args.notebook_persit_dp)

# train model
trainer = L.Trainer(
    default_root_dir=f"{args.notebook_persit_dp}/test",
    max_epochs=100,
    accelerator=args.device if args.device else "auto",
)
# trainer.fit(
#     model=lit_model, train_dataloaders=train_loader, val_dataloaders=train_loader
# )

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [12]:
model.eval()
user = torch.tensor([0])
item_sequence = torch.tensor([[-1, -1, -1, 0, 1]])
target_item = torch.tensor([2])
target_metadata = torch.tensor([2])  # giả sử metadata index trùng với item index
predictions = model.predict(user, item_sequence, target_item,target_metadata)
print(predictions)

tensor([[0.5759]], grad_fn=<SigmoidBackward0>)


## Training loop

In [13]:
train_df = pd.read_parquet(args.train_data_fp)
val_df = pd.read_parquet(args.val_data_fp)

assert set(val_df[args.user_col].unique()).issubset(set(train_df[args.user_col].unique())), "Validation users must be present in training users."

assert set(val_df[args.item_col].unique()).issubset(set(train_df[args.item_col].unique())), "Validation items must be present in training items."
assert train_df[args.timestamp_col].max() < val_df[args.timestamp_col].min(), "Validation data must be after training data. Otherwise, its a data contamination problem."

In [14]:
train_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
151343,AEEV5YWQKPBTLFWHKOBBULYA2RDQ,B009RUZ7TS,0.0,2014-07-17 19:15:55.000,1412,4220,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 455..."
40958,AF7KZV4NJ5GBDVFTB7PEEUN4U53A,B0BBMLD8QT,5.0,2015-07-29 20:38:06.000,4871,4476,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
218918,AFVQ4K4KZPLQ3E2VFYSGX6HFXGNQ,B0BB6R89VF,0.0,2017-12-13 20:35:02.334,7616,1218,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 129..."
43115,AFCLWJMGYFCOJQR7T4454OF5A5WA,B00ENFP224,5.0,2015-09-06 12:09:59.000,5250,1355,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
233421,AFP4PHJ6Q2RRXLDPSDSH6VXJRUTA,B07CMXS5FP,0.0,2018-11-23 09:44:21.734,6792,838,"[-1.0, -1.0, -1.0, 1055.0, 3572.0, 3865.0, 176..."
...,...,...,...,...,...,...,...
250960,AGQHC7YNLYP4QV2PSBD6URSMJSVA,B07H65KP63,0.0,2020-02-08 04:09:50.457,11001,3568,"[-1.0, -1.0, -1.0, -1.0, 3585.0, 1866.0, 4040...."
217058,AHD65JAOVTTPDNJWOLSSGS3QVK6Q,B07DKMJ61N,0.0,2017-11-02 15:25:18.351,13410,4239,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
61324,AF32PWYNLPCVAU4UX35IEAZOFA3Q,B011BRUOMO,5.0,2016-07-18 05:42:21.000,4264,2253,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
132003,AGM65FYYAPHOLESGIDMFMPUQIYNA,B0016BVDIK,0.0,2010-12-16 19:59:19.000,10445,4250,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."


In [15]:
with pd.option_context("display.max_colwidth", None):
    display(train_df.loc[train_df["user_id"] == "AEEV5YWQKPBTLFWHKOBBULYA2RDQ"].sort_values(by=args.timestamp_col, ascending=False))

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
172167,AEEV5YWQKPBTLFWHKOBBULYA2RDQ,B07C1RSV9C,0.0,2015-08-05 16:31:49,1412,934,"[-1.0, -1.0, -1.0, 4559.0, 4443.0, 3164.0, 1047.0, 4685.0, 107.0, 3295.0]"
41296,AEEV5YWQKPBTLFWHKOBBULYA2RDQ,B07C1RSV9C,5.0,2015-08-05 16:31:49,1412,3276,"[-1.0, -1.0, -1.0, 4559.0, 4443.0, 3164.0, 1047.0, 4685.0, 107.0, 3295.0]"
151346,AEEV5YWQKPBTLFWHKOBBULYA2RDQ,B07CB22VVJ,0.0,2014-07-17 19:19:28,1412,4599,"[-1.0, -1.0, -1.0, -1.0, 4559.0, 4443.0, 3164.0, 1047.0, 4685.0, 107.0]"
20475,AEEV5YWQKPBTLFWHKOBBULYA2RDQ,B07CB22VVJ,5.0,2014-07-17 19:19:28,1412,3295,"[-1.0, -1.0, -1.0, -1.0, 4559.0, 4443.0, 3164.0, 1047.0, 4685.0, 107.0]"
20474,AEEV5YWQKPBTLFWHKOBBULYA2RDQ,B000I23TTE,5.0,2014-07-17 19:16:43,1412,107,"[-1.0, -1.0, -1.0, -1.0, -1.0, 4559.0, 4443.0, 3164.0, 1047.0, 4685.0]"
151345,AEEV5YWQKPBTLFWHKOBBULYA2RDQ,B000I23TTE,0.0,2014-07-17 19:16:43,1412,4659,"[-1.0, -1.0, -1.0, -1.0, -1.0, 4559.0, 4443.0, 3164.0, 1047.0, 4685.0]"
151344,AEEV5YWQKPBTLFWHKOBBULYA2RDQ,B0BYSP9676,0.0,2014-07-17 19:16:20,1412,3678,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 4559.0, 4443.0, 3164.0, 1047.0]"
20473,AEEV5YWQKPBTLFWHKOBBULYA2RDQ,B0BYSP9676,5.0,2014-07-17 19:16:20,1412,4685,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 4559.0, 4443.0, 3164.0, 1047.0]"
151343,AEEV5YWQKPBTLFWHKOBBULYA2RDQ,B009RUZ7TS,0.0,2014-07-17 19:15:55,1412,4220,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 4559.0, 4443.0, 3164.0]"
20472,AEEV5YWQKPBTLFWHKOBBULYA2RDQ,B009RUZ7TS,5.0,2014-07-17 19:15:55,1412,1047,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 4559.0, 4443.0, 3164.0]"


## Convert user_id and item_id into indices

In [16]:
# idm_path = os.path.abspath("../data_for_ai/interim/idm_16407u.json")
# idm = IDMapper().load(idm_path)
# idm.get_user_id(1)

In [17]:
# train_df = train_df.pipe(idm.map_indices)
# val_df = val_df.pipe(idm.map_indices)

# assert idm.unknown_item_index not in train_df["item_indice"].values, "Unknown item index must be present in training data."
# assert idm.unknown_user_index not in train_df["user_indice"].values, "Unknown user index must be present in training data."
# assert idm.unknown_item_index not in val_df["item_indice"].values, "Unknown item index must be present in validation data."
# assert idm.unknown_user_index not in val_df["user_indice"].values, "Unknown user index must be present in validation data."

In [18]:
# train_df.head(3)

In [19]:
# assert train_df.groupby(args.user_col)[args.item_col].nunique().min() >= 5, "Each user must have at least five items."
# assert train_df.groupby(args.item_col)[args.user_col].nunique().min() >= 10, "Each item must have at least ten users."

## Training loop

In [20]:
rating_dataset = UserItemBinaryRatingDFDataset(
    train_df, "user_indice", "item_indice", args.rating_col, args.timestamp_col, "item_sequence"
)
val_rating_dataset = UserItemBinaryRatingDFDataset(
    val_df, "user_indice", "item_indice", args.rating_col, args.timestamp_col, "item_sequence"
)

train_loader = DataLoader(
    rating_dataset, batch_size=args.batch_size, shuffle=True, drop_last=True
)
val_loader = DataLoader(
    val_rating_dataset, batch_size=args.batch_size, shuffle=False, drop_last=False
)

## Load the weight from SkipGram Model

In [21]:
item_indices = train_df[args.item_col].unique()
user_indices = train_df[args.user_col].unique()
n_items = len(item_indices)
n_users = len(user_indices)

logger.info(f"Number of users: {n_users}, Number of items: {n_items}")

assert args.embedding_dim == 256, "Embedding dimension must be 256"
best_trainer = LitSkipGram.load_from_checkpoint(
    "../data_for_ai/interim/best-item2vec-weight.ckpt",
    skipgram_model=SkipGram(n_items, args.embedding_dim).to(args.device),
)
skipgram_item_embedding = best_trainer.skipgram_model.embeddings.weight.data.cpu()
print(f"SkipGram Item embedding shape: {skipgram_item_embedding.shape}")
print(f"SkipGram Item embedding dtype: {skipgram_item_embedding.dtype}")
# convert skipgram_item_embedding into torch.nn.Embedding and take the first n_items rows
skipgram_item_embedding = torch.nn.Embedding.from_pretrained(
    skipgram_item_embedding[:n_items], freeze=False
)

model = init_model(n_users, 
                   n_items, 
                   args.embedding_dim, 
                   args.dropout, 
                   item_embedding=skipgram_item_embedding,
                   user_embedding_dim=args.user_embedding_dim,
                   use_user_embedding=args.use_user_embedding,)

[32m2025-06-25 23:11:09.292[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mNumber of users: 16407, Number of items: 4817[0m
[32m2025-06-25 23:11:09.292[0m | [1mINFO    [0m | [36msrc.algo.item2vec.model[0m:[36m__init__[0m:[36m12[0m - [1mInitializing item embeddings with num items 4817, embedding dim 256[0m

The loaded checkpoint was produced with Lightning v2.5.2, which is newer than your current Lightning version: v2.5.0


enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.activation_relu_or_gelu was not True

[32m2025-06-25 23:11:10.946[0m | [1mINFO    [0m | [36msrc.algo.sequence.model[0m:[36m__init__[0m:[36m139[0m - [1mStart token used: 4816, Padding token used: 4817[0m


SkipGram Item embedding shape: torch.Size([4818, 256])
SkipGram Item embedding dtype: torch.float32


In [22]:
idm_path = os.path.abspath("../data_for_ai/interim/idm_16407u.json")
idm = IDMapper().load(idm_path)
idm.get_user_id(1)

4817 items in the dataset


'AE227WAM4NWQPJI33OPN7ZARNNZQ'

## Overfit 1 batch

In [23]:
early_stopping = EarlyStopping(
    monitor="val_loss", patience=5, mode="min", verbose=False
)

model = init_model(n_users, n_items, args.embedding_dim, args.dropout,item_embedding=skipgram_item_embedding,
                   user_embedding_dim=args.user_embedding_dim,
                   use_user_embedding=args.use_user_embedding,
                   use_metadata=args.use_metadata,
                   metadata_embedding=metadata_embedding_layer,
                   metadata_embedding_dim=args.metadata_embedding_dim,
                   metadata_fc_dim=args.metadata_fc_dim,)

lit_model = SeqModellingLitModule(
    model,
    learning_rate=args.learning_rate,
    l2_reg=args.l2_reg,
    log_dir=args.notebook_persit_dp,
    accelerator=args.device,
    idm= idm
)

log_dir = f"{args.notebook_persit_dp}/logs/overfit"

# train model
trainer = L.Trainer(
    default_root_dir=log_dir,
    accelerator=args.device if args.device else "auto",
    max_epochs=100,
    overfit_batches=1,
    callbacks=[early_stopping],
)
# trainer.fit(
#     model=lit_model,
#     train_dataloaders=train_loader,
#     val_dataloaders=train_loader,
# )
# logger.info(f"Logs available at {trainer.log_dir}")

[32m2025-06-25 23:11:12.454[0m | [1mINFO    [0m | [36msrc.algo.sequence.model[0m:[36m__init__[0m:[36m139[0m - [1mStart token used: 4816, Padding token used: 4817[0m
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
`Trainer(overfit_batches=1)` was configured so 1 batch will be used.


In [24]:
for i in train_loader:
    print(i)
    break

{'user': tensor([15643,  5011,  1608,  4922,  5019, 14584, 14314, 13948, 13286, 11142,
         6569,  5527, 11506,  3068, 15823,  2199,  8283,  5215,  5425,  3310,
         9266, 10769,  8809, 14871, 15064, 14930, 15141,  2710, 13912,  6380,
         9480, 10621,  8964, 11753, 16104, 14836, 15977, 15699,  3112, 10274,
         2460,  7407, 14882,  3879,  7809, 10917, 15700, 15773,  8969, 13008,
        13461, 16229,  7816,  4601,  6203,  4348, 11473, 15228, 13129,  2763,
         1303,  6898,  6831,  5788, 11677,  7654,  2409, 15448, 16306,  6895,
        12442, 13932,  3547,  5166,  8253,  2854, 16276,  1984, 16361, 14002,
         6481,  9698,  1069,  4666, 14172, 11283, 15737, 14952,  6466, 12450,
         6862, 14018, 12269,  6984,  6671,  9372,  2628,  6028,  8945,  9932,
         4313,  7728, 13738, 10288,  5007, 12346,  4168, 13771, 11787,  7608,
        12712,  9770, 15713, 11183, 11895, 10785,  4069, 10561, 14258,  2442,
        15107,  3978, 15969, 10347,  8257,  3190, 14296

## Run on all data

In [25]:
import lightning
print(lightning.__version__)

2.5.0


In [None]:
early_stopping = EarlyStopping(
    monitor="val_roc_auc", patience=args.early_stopping_patience, mode="max", verbose=False, min_delta=0.001
)

checkpoint_callback = ModelCheckpoint(
    dirpath=f"{args.notebook_persit_dp}/checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    monitor="val_roc_auc",
    mode="max",
)

model = init_model(n_users, n_items, 
                   args.embedding_dim, 
                   args.dropout, 
                   item_embedding=skipgram_item_embedding,
                   user_embedding_dim=args.user_embedding_dim,
                   use_user_embedding=args.use_user_embedding,
                   use_metadata=args.use_metadata,
                   metadata_embedding=metadata_embedding_layer,
                   metadata_embedding_dim=args.metadata_embedding_dim,
                   metadata_fc_dim=args.metadata_fc_dim,)

print(f"Model: {model}")
lit_model = SeqModellingLitModule(
    model,
    learning_rate=args.learning_rate,
    l2_reg=args.l2_reg,
    log_dir=args.notebook_persit_dp,
    accelerator=args.device,
    idm= idm
)

log_dir = f"{args.notebook_persit_dp}/logs/run"

# train model
trainer = L.Trainer(
    default_root_dir=log_dir,
    accelerator=args.device if args.device else "auto",
    max_epochs=args.max_epochs,
    callbacks=[early_stopping, checkpoint_callback],
    logger=args._mlf_logger if args.log_to_mlflow else None,
    # limit_train_batches=0.1    
)

trainer.fit(
    model=lit_model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
    
)

# Change the library as a workaround for the issue in the latest Lightning release
#https://github.com/Lightning-AI/pytorch-lightning/pull/20669/commits/429f732a0528c558e701da7ec01e51c1e2e4f32e

[32m2025-06-25 23:11:14.193[0m | [1mINFO    [0m | [36msrc.algo.sequence.model[0m:[36m__init__[0m:[36m139[0m - [1mStart token used: 4816, Padding token used: 4817[0m
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Model: SequenceRatingPrediction(
  (item_embedding): Embedding(4818, 256, padding_idx=4817)
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
    )
    (linear1): Linear(in_features=256, out_features=256, bias=True)
    (dropout): Dropout(p=0.3, inplace=False)
    (linear2): Linear(in_features=256, out_features=256, bias=True)
    (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.3, inplace=False)
    (dropout2): Dropout(p=0.3, inplace=False)
    (activation): PReLU(num_parameters=1)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(


Checkpoint directory C:\Users\Trieu\OneDrive\Desktop\recsys\real_time_recsys\notebooks\data\pruning-sequence-modelling\attn-256-dim-bce-prelu\checkpoints exists and is not empty.


  | Name               | Type                     | Params | Mode 
------------------------------------------------------------------------
0 | model              | SequenceRatingPrediction | 4.7 M  | train
1 | val_roc_auc_metric | BinaryAUROC              | 0      | train
2 | val_pr_auc_metric  | BinaryAveragePrecision   | 0      | train
------------------------------------------------------------------------
2.9 M     Trainable params
1.8 M     Non-trainable params
4.7 M     Total params
18.848    Total estimated model params size (MB)
38        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]


The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.




The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


The number of training batches (49) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


🏃 View run attn-256-dim-bce-prelu at: http://138.2.61.6:5002/#/experiments/11/runs/bb63143d05c54db19c3941ce7e05885c
🧪 View experiment at: http://138.2.61.6:5002/#/experiments/11


## Log metrics

In [27]:
logger.info(f"Loading best checkpoint from {checkpoint_callback.best_model_path}...")
args.best_checkpoint_path = checkpoint_callback.best_model_path

best_trainer = SeqModellingLitModule.load_from_checkpoint(
    checkpoint_path=checkpoint_callback.best_model_path,
    model=init_model(n_users, 
                     n_items, 
                     args.embedding_dim, 
                     args.dropout, 
                     item_embedding=skipgram_item_embedding,
                     user_embedding_dim=args.user_embedding_dim,
                     use_user_embedding=args.use_user_embedding,
                     use_metadata=args.use_metadata,
                     metadata_embedding=metadata_embedding_layer,
                     metadata_embedding_dim=args.metadata_embedding_dim,
                     metadata_fc_dim=args.metadata_fc_dim,),
)

[32m2025-06-25 23:11:40.298[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mLoading best checkpoint from C:\Users\Trieu\OneDrive\Desktop\recsys\real_time_recsys\notebooks\data\pruning-sequence-modelling\attn-256-dim-bce-prelu\checkpoints\best-checkpoint-v1.ckpt...[0m
[32m2025-06-25 23:11:40.340[0m | [1mINFO    [0m | [36msrc.algo.sequence.model[0m:[36m__init__[0m:[36m139[0m - [1mStart token used: 4816, Padding token used: 4817[0m


In [28]:
best_model = best_trainer.model.to(args.device)
best_model.eval()

SequenceRatingPrediction(
  (item_embedding): Embedding(4818, 256, padding_idx=4817)
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
    )
    (linear1): Linear(in_features=256, out_features=256, bias=True)
    (dropout): Dropout(p=0.3, inplace=False)
    (linear2): Linear(in_features=256, out_features=256, bias=True)
    (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.3, inplace=False)
    (dropout2): Dropout(p=0.3, inplace=False)
    (activation): PReLU(num_parameters=1)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_feat

In [29]:
val_recs_df = val_df.sort_values(by=args.timestamp_col).drop_duplicates(subset=[args.user_col], keep="first")

In [30]:
mlflow.start_run(run_id = trainer.logger.run_id)

<ActiveRun: >

### Classification metrics

In [31]:
val_user_indices = val_df["user_indice"].values
val_item_indices = val_df["item_indice"].values
val_item_sequences = val_df["item_sequence"].values.tolist()

In [32]:
users = torch.tensor(val_user_indices, device=args.device)
item_sequences = torch.tensor(val_item_sequences, device=args.device)
items = torch.tensor(val_item_indices, device=args.device)
classifications = best_model.predict(users, item_sequences, items)


Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at C:\actions-runner\_work\pytorch\pytorch\pytorch\torch\csrc\utils\tensor_new.cpp:257.)



In [33]:
classifications.shape

torch.Size([6958, 1])

In [34]:
eval_classification_df = val_df.assign(
    classification_proba=classifications.cpu().detach().numpy(),
    label=lambda df: df[args.rating_col].gt(0).astype(int),
)

In [35]:
eval_classification_df.head(3)

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence,classification_proba,label
260331,AGMJWWTZ6HMM2FBRDLFW2CWMV5DQ,B00E0ISVLI,0.0,2021-07-18 15:44:29.739,10483,2563,"[-1, 2906, 3011, 4674, 4593, 4755, 3810, 3921,...",0.505045,0
259198,AE3XVOCHEO5MTDIAIET5BZS26AJA,B07GPGVYGX,0.0,2021-03-12 03:28:00.854,254,3381,"[-1, -1, -1, -1, 1188, 1510, 4399, 3089, 2290,...",0.500309,0
258841,AESPJW3GNHXNJNW5CYV7PTEX44MQ,B07GZFM1ZM,0.0,2021-02-09 16:08:20.512,3190,921,"[-1, -1, -1, -1, -1, 2569, 2742, 2855, 2351, 346]",0.486175,0


In [36]:
classification_report = log_classification_metrics(
    args,
    eval_classification_df,
    target_col="label",
    prediction_col="classification_proba",
)

### Ranking metrics

In [37]:
val_recs_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
258263,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B004FV4ROA,0.0,2020-12-27 00:30:31.146,11295,3051,"[1715, 2537, 3743, 506, 4490, 3479, 3908, 2723..."
258264,AEHS7YR7BGGWMZS24H5UR5IP46HQ,B08F1P3BCC,0.0,2020-12-27 01:44:52.242,1784,3316,"[-1, -1, -1, -1, -1, 3382, 4330, 423, 3167, 2677]"
258265,AGAVHCK42EGMVS7DGPRX6HBCUCNQ,B09Q3NR84W,0.0,2020-12-27 02:25:48.357,9042,32,"[-1, -1, -1, -1, 3104, 1416, 3743, 2694, 3612,..."
127395,AEFVBMCJAFNULDI5V2CKKTBCPURA,B07N1L5HX1,5.0,2020-12-27 02:32:15.171,1542,3550,"[-1, -1, -1, -1, -1, 1320, 2162, 2472, 2694, 3..."
127396,AGLXMKHBLTBNT3X2CLBAPW6QUTQA,B0BB6Y5N3M,5.0,2020-12-27 03:37:22.772,10418,4471,"[341, 3803, 4431, 1067, 4530, 4018, 2688, 4365..."
...,...,...,...,...,...,...,...
261735,AGGDNWGN3NDJ2DI5CBSFOMUAM6XA,B083YHS7SV,0.0,2022-02-18 19:43:25.492,9711,2513,"[-1, -1, -1, -1, 1019, 754, 2059, 413, 4262, 3..."
130866,AEKUF6AOVWDWFYOKPWO2CV72PEDQ,B07QN33986,5.0,2022-02-19 01:32:51.519,2171,3626,"[-1, -1, 2627, 4216, 4743, 1945, 2355, 1831, 9..."
130867,AFBTD25HPE4BE4LUFV3DTI2E2N2A,B07TMJ8S5Z,5.0,2022-02-19 16:49:57.966,5159,3699,"[-1, -1, -1, -1, 2260, 3517, 3609, 3495, 3625,..."
261740,AHLN6GKTKZE22AON34YAQXTGK63A,B0C682GZ5X,0.0,2022-02-19 17:28:55.519,14550,2383,"[-1, -1, -1, -1, -1, 1812, 4165, 4575, 4807, 374]"


In [38]:
recommendations = best_model.recommend(
    torch.tensor(val_recs_df["user_indice"].values, device=args.device),
    torch.tensor(val_recs_df["item_sequence"].values.tolist(), device=args.device),
    k=args.top_K,
    batch_size=32)

Generating recommendations:   0%|          | 0/364964 [00:00<?, ?it/s]

IndexError: index out of range in self

In [None]:
recommendations_df = pd.DataFrame(recommendations).pipe(
    create_rec_df, idm, args.user_col, args.item_col
)
recommendations_df

Unnamed: 0,user_indice,recommendation,score,rec_ranking,user_id,parent_asin
0,11295,3032,0.834206,1.0,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B074F3M2W8
1,11295,3188,0.830017,2.0,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B0791TX5P5
2,11295,3472,0.829827,3.0,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B07HZLHPKP
3,11295,3915,0.829290,4.0,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B08D7JPKLZ
4,11295,3983,0.825969,5.0,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B08N5TC2Z3
...,...,...,...,...,...,...
242395,2446,3200,0.825623,96.0,AEMYBWDN67IB5IBTMHLHN76V4QHQ,B0795DP124
242396,2446,2998,0.824843,97.0,AEMYBWDN67IB5IBTMHLHN76V4QHQ,B072KGYYKX
242397,2446,3476,0.824502,98.0,AEMYBWDN67IB5IBTMHLHN76V4QHQ,B07J2FGZSM
242398,2446,3659,0.823378,99.0,AEMYBWDN67IB5IBTMHLHN76V4QHQ,B07RR6HQKX


In [None]:
label_df = create_label_df(
    val_df,
    user_col=args.user_col,
    item_col=args.item_col,
    rating_col=args.rating_col,
    timestamp_col=args.timestamp_col,
)
label_df

Unnamed: 0,user_id,parent_asin,rating,rating_rank
128912,AG2EMAD6UILFF4ITMMKH2NEFTYHA,B0BZJGGX2T,5.0,1.0
127943,AHPI7N36W4JJYOAA6MBAGWTDF3FA,B072MKFNV6,4.0,1.0
129406,AGOBLEZGF5OSPDVTIMA3DPWAENGA,B07H65KP63,5.0,1.0
130173,AFXRWTDGQJIDOTGMCIZKH5QPK5KA,B00PKTU83U,5.0,1.0
128534,AHWDVLU5PTXMLD6PSJ2Z5Q3JP4OA,B0719SNR5N,3.0,1.0
...,...,...,...,...
386737,AEN2KQVSR5TWRXNQS3OTFT4EZQCA,B0BRT7XFM5,0.0,27.0
388052,AFKERAMSXU4MWO3H53R7DEFOHUVQ,B07G5YXYFL,0.0,27.0
388051,AFKERAMSXU4MWO3H53R7DEFOHUVQ,B07G5YXYFL,0.0,28.0
387885,AFKERAMSXU4MWO3H53R7DEFOHUVQ,B07ZZVX1F2,0.0,29.0


In [None]:
eval_df = merge_recs_with_target(
    recommendations_df,
    label_df,
    k=args.top_K,
    user_col=args.user_col,
    item_col=args.item_col,
    rating_col=args.rating_col,
)
eval_df

Unnamed: 0,user_indice,recommendation,score,rec_ranking,user_id,parent_asin,rating,rating_rank
26,8.0,3251.0,0.912598,1,AE24AB4DW5KYK3F5DYOT5VPW2VLA,B07BPKL2D2,0,
96,8.0,4669.0,0.908320,2,AE24AB4DW5KYK3F5DYOT5VPW2VLA,B0BXP3P132,0,
72,8.0,4365.0,0.907467,3,AE24AB4DW5KYK3F5DYOT5VPW2VLA,B09ZMQWGCG,0,
102,8.0,4785.0,0.904956,4,AE24AB4DW5KYK3F5DYOT5VPW2VLA,B0C6PRR41P,0,
27,8.0,3300.0,0.904170,5,AE24AB4DW5KYK3F5DYOT5VPW2VLA,B07CG2PGY6,0,
...,...,...,...,...,...,...,...,...
252360,16403.0,3550.0,0.787922,99,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B07N1L5HX1,0,
252390,16403.0,3903.0,0.787658,100,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B08BZSCHZ3,0,
252328,,,,101,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B075QC3TZY,1,1.0
252329,,,,101,AHZZM7BCJAF2UEMMBHZCLXBB2SVA,B075QC3TZY,0,2.0


In [None]:
ranking_report = log_ranking_metrics(args, eval_df)

In [None]:
mlflow.end_run()

🏃 View run 006-sequence-modelling-attn-256-dim-bce-prelu at: http://138.2.61.6:5002/#/experiments/2/runs/2a81df7b8a4d4fbaba79e8ff0d416664
🧪 View experiment at: http://138.2.61.6:5002/#/experiments/2


## Clean up

In [None]:
all_params = [args]

if args.log_to_mlflow:
    run_id = trainer.logger.run_id

    with mlflow.start_run(run_id=run_id):
        for params in all_params:
            params_dict = params.model_dump()
            params_ = dict()
            for k, v in params_dict.items():
                if k == "top_K":
                    k = "top_big_K"
                if k == "top_k":
                    k = "top_small_k"
                params_[f"{params.__repr_name__()}.{k}"] = v
            mlflow.log_params(params_)

🏃 View run 006-sequence-modelling-attn-256-dim-bce-prelu at: http://138.2.61.6:5002/#/experiments/2/runs/2a81df7b8a4d4fbaba79e8ff0d416664
🧪 View experiment at: http://138.2.61.6:5002/#/experiments/2
