In [1]:
# Reload edited project modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2
# Enable the TensorBoard notebook extension.
%load_ext tensorboard

In [2]:
import pandas as pd
import numpy as np
from pydantic import BaseModel
import sys
import os
from lightning.pytorch.loggers import MLFlowLogger
from loguru import logger
from load_dotenv import load_dotenv
import time
import json
import torch
from torch.utils.data import DataLoader
import lightning as L
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.callbacks import ModelCheckpoint
import mlflow

sys.path.insert(0, "..")

from src.utils.embedding_id_mapper import IDMapper
from src.algo.gSASRec.model import SASRec
from src.algo.gSASRec.dataset import SASRecDataset
from src.algo.gSASRec.trainer import SASRecLitModule
from src.eval.utils import create_rec_df, create_label_df, merge_recs_with_target
from src.eval.log_metrics import log_ranking_metrics, log_classification_metrics

In [3]:
# Load environment variables from a .env file before the run configuration reads os.environ.
# NOTE(review): override=True presumably lets .env values replace already-set variables — confirm
# against the `load_dotenv` package in use. The recorded run returned False.
load_dotenv(override = True)

False

In [None]:
class Args(BaseModel):
    """Run configuration for the SASRec training notebook.

    The field defaults hold the experiment identifiers, column names,
    hyper-parameters and input file paths used by the cells below. Call
    :meth:`init` once after construction to resolve the output directory and,
    when tracking is enabled, create the MLflow logger.
    """

    # --- run bookkeeping ---
    testing: bool = False
    log_to_mlflow: bool = True
    experiment_name: str = "first-attempt"
    run_name: str = "018-sasrec"
    notebook_persit_dp: str = None  # resolved to an absolute path by init()

    # --- column names of the interaction frames ---
    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"
    group_name: str = "seq-modelling"

    # --- evaluation cut-offs ---
    top_K: int = 100
    top_k: int = 10

    # --- optimisation ---
    batch_size: int = 512
    lr: float = 0.0001
    l2_emb: float = 0.001
    early_stopping_patience: int = 10
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    num_epochs: int = 200

    # --- SASRec specific ---
    max_len: int = 10
    dropout: float = 0.3
    hidden_units: int = 128
    num_blocks: int = 1
    num_heads: int = 2
    num_workers: int = 3
    # seq_length: int = 10

    # --- input data (resolved relative to the notebook's working directory) ---
    train_data_fp: str = os.path.abspath("../data_for_ai/interim/train_sample_interactions_16407u_neg_seq.parquet")
    val_data_fp: str = os.path.abspath("../data_for_ai/interim/val_sample_interactions_16407u_neg_seq.parquet")

    def init(self):
        """Resolve derived settings and side effects, then return ``self``.

        * Sets ``notebook_persit_dp`` to ``data/<experiment_name>/<run_name>``
          as an absolute path.
        * Turns ``log_to_mlflow`` off (with a warning) when the
          ``MLFLOW_TRACKING_URI`` environment variable is missing or empty.
        * Creates ``_mlf_logger`` when MLflow logging is still enabled.
        * Creates the output directory unless ``testing`` is set.
        """
        self.notebook_persit_dp = os.path.abspath(f"data/{self.experiment_name}/{self.run_name}")

        tracking_uri = os.environ.get("MLFLOW_TRACKING_URI")
        if not tracking_uri:
            # Without a tracking server there is nowhere to log to.
            self.log_to_mlflow = False
            logger.warning("MLFlow is not enabled. Turn off tracking to Mlflow.")

        if self.log_to_mlflow:
            logger.info(
                f"Setting up Mlflow experiment: {self.experiment_name}, run_name: {self.run_name}"
            )
            # Only set when logging is enabled; readers must check log_to_mlflow first.
            self._mlf_logger = MLFlowLogger(
                experiment_name=self.experiment_name,
                run_name=self.run_name,
                tracking_uri=tracking_uri,
                log_model=True,
            )

        if not self.testing:
            os.makedirs(self.notebook_persit_dp, exist_ok=True)
        return self


args = Args().init()
print(args.model_dump_json(indent=2))



{
  "testing": false,
  "log_to_mlflow": false,
  "experiment_name": "first-attempt",
  "run_name": "018-sasrec",
  "notebook_persit_dp": "c:\\Users\\Trieu\\OneDrive\\Desktop\\recsys\\real_time_recsys\\notebooks\\data\\first-attempt\\018-sasrec",
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "group_name": "seq-modelling",
  "top_K": 100,
  "top_k": 10,
  "batch_size": 512,
  "lr": 0.001,
  "l2_emb": 0.001,
  "early_stopping_patience": 10,
  "device": "cpu",
  "num_epochs": 100,
  "max_len": 10,
  "dropout": 0.3,
  "hidden_units": 128,
  "num_blocks": 1,
  "num_heads": 2,
  "num_workers": 3,
  "train_data_fp": "c:\\Users\\Trieu\\OneDrive\\Desktop\\recsys\\real_time_recsys\\data_for_ai\\interim\\train_sample_interactions_16407u_neg_seq.parquet",
  "val_data_fp": "c:\\Users\\Trieu\\OneDrive\\Desktop\\recsys\\real_time_recsys\\data_for_ai\\interim\\val_sample_interactions_16407u_neg_seq.parquet"
}


In [5]:
def _load_interactions(parquet_fp):
    """Read one interaction split, binarise its rating column and drop rows whose
    ``item_sequence`` consists solely of -1 entries."""
    frame = pd.read_parquet(parquet_fp)
    # Any strictly positive rating becomes 1, everything else 0.
    frame[args.rating_col] = frame[args.rating_col].apply(lambda rating: 1 if rating > 0 else 0)
    # Keep a row only when its sequence holds at least one entry other than -1
    # (-1 appears to be the padding value — confirm against the preprocessing step).
    has_history = frame['item_sequence'].apply(lambda history: any(item != -1 for item in history))
    return frame[has_history]


train_df = _load_interactions(args.train_data_fp)
val_df = _load_interactions(args.val_data_fp)

# Every validation user and item must also occur in the training split.
train_users = set(train_df[args.user_col].unique())
val_users = set(val_df[args.user_col].unique())
assert val_users <= train_users, "Validation users must be present in training users."

train_items = set(train_df[args.item_col].unique())
val_items = set(val_df[args.item_col].unique())
assert val_items <= train_items, "Validation items must be present in training items."

# The split is temporal: all training rows strictly precede all validation rows.
latest_train_ts = train_df[args.timestamp_col].max()
earliest_val_ts = val_df[args.timestamp_col].min()
assert latest_train_ts < earliest_val_ts, "Validation data must be after training data. Otherwise, its a data contamination problem."

In [6]:
# Inspect the filtered validation split.
val_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
0,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B004FV4ROA,1,2020-12-27 00:30:31.146,11295,528,"[1898, 3479, 3908, 1570, 91, 2723, 2962, 106, ..."
1,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B07KFQFDNB,0,2020-12-27 00:30:31.146,11295,3503,"[3479, 3908, 1570, 91, 2723, 2962, 106, 3557, ..."
2,AEHS7YR7BGGWMZS24H5UR5IP46HQ,B08F1P3BCC,1,2020-12-27 01:44:52.242,1784,3925,"[4319, 3382, 4330, 1173, 1330, 423, 2868, 3167..."
3,AEHS7YR7BGGWMZS24H5UR5IP46HQ,B00HXT8EKE,0,2020-12-27 01:44:52.242,1784,1507,"[3382, 4330, 1173, 1330, 423, 2868, 3167, 1071..."
4,AGAVHCK42EGMVS7DGPRX6HBCUCNQ,B09Q3NR84W,1,2020-12-27 02:25:48.357,9042,4273,"[1311, 1416, 455, 3743, 1823, 2694, 3612, 3462..."
...,...,...,...,...,...,...,...
6953,AEEQZRQBOFHFBFPYBX2BZ5WOI33A,B01A08E70K,0,2022-02-19 16:56:53.030,1396,2441,"[3451, 3827, 1839, 1347, 2504, 2694, 4546, 427..."
6954,AHLN6GKTKZE22AON34YAQXTGK63A,B0C682GZ5X,1,2022-02-19 17:28:55.519,14550,4772,"[2950, 1812, 4735, 4165, 4575, 2440, 607, 4807..."
6955,AHLN6GKTKZE22AON34YAQXTGK63A,B09SWWCN6Q,0,2022-02-19 17:28:55.519,14550,4303,"[1812, 4735, 4165, 4575, 2440, 607, 4807, 374,..."
6956,AEMYBWDN67IB5IBTMHLHN76V4QHQ,B091K4WYD1,1,2022-02-19 22:08:53.253,2446,4086,"[644, 3602, 4569, 1865, 3030, 3653, 3803, 3998..."


In [7]:
# Inspect the filtered training split.
train_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
1,AFZ4EK2LJ655XQKTEUELCARO6RYA,B095JX15XF,0,2003-01-23 03:28:15.000,8071,4132,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
3,AFY2C4YOUP2SSMM43HD2L3FIEFZA,B00OQVZDJM,0,2003-11-25 18:12:09.000,7935,1859,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
5,AHF3TGIOSTD2UCHF3MO4MIHFJ5NQ,B07KQWX947,1,2004-06-18 02:02:57.000,13705,3514,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
7,AH5Z47PJ5RTSUL2RLCO2QITGIT4Q,B07W4GJGCM,0,2004-09-13 20:18:44.000,12730,3734,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
9,AEX3L4NKDESOCGWOFNF63GRFGXCA,B0067HY7HW,0,2004-10-22 14:26:12.000,3735,746,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
...,...,...,...,...,...,...,...
254779,AES2U6KIAORYLTBPENQWMDVALTDQ,B07ZZVX1F2,1,2020-12-26 21:37:58.968,3109,3800,"[-1.0, 2237.0, 1691.0, 2694.0, 1633.0, 934.0, ..."
254780,AGU6SDEIMLBQZII2FVFJ6YIUZRKQ,B0BSF5LM3J,1,2020-12-26 22:29:54.459,11489,4622,"[107.0, 3997.0, 2858.0, 1680.0, 2919.0, 4109.0..."
254781,AGU6SDEIMLBQZII2FVFJ6YIUZRKQ,B0BZJ9BYZ3,0,2020-12-26 22:29:54.459,11489,4696,"[3997.0, 2858.0, 1680.0, 2919.0, 4109.0, 3695...."
254782,AG2HB7HEYSIAGYBEFFL666KVYTHA,B0895KGSY1,0,2020-12-26 23:06:03.454,8251,3896,"[-1.0, -1.0, 2531.0, 382.0, 2756.0, 3373.0, 34..."


In [8]:
def init_model(n_user, n_items, dropout, hidden_units, num_blocks, num_heads):
    """
    Build a fresh SASRec network.

    Args:
        n_user: forwarded to SASRec as ``user_num``.
        n_items: forwarded to SASRec as ``item_num``.
        dropout: forwarded to SASRec as ``dropout_rate``.
        hidden_units: forwarded to SASRec unchanged.
        num_blocks: forwarded to SASRec unchanged.
        num_heads: forwarded to SASRec unchanged.

    Returns:
        The newly constructed SASRec instance.
    """
    # Translate the notebook's argument names into the constructor's keyword names.
    sasrec_kwargs = {
        "user_num": n_user,
        "item_num": n_items,
        "dropout_rate": dropout,
        "hidden_units": hidden_units,
        "num_blocks": num_blocks,
        "num_heads": num_heads,
    }
    return SASRec(**sasrec_kwargs)

In [9]:
# Vocabulary sizes are the numbers of distinct user / item ids in the training split.
item_indices = train_df[args.item_col].unique()
user_indices = train_df[args.user_col].unique()
n_items = len(item_indices)
n_users = len(user_indices)

logger.info(f"Number of users: {n_users}, Number of items: {n_items}")
model = init_model(n_users, n_items, args.dropout, args.hidden_units, args.num_blocks, args.num_heads)

# Sanity check: list the item-embedding rows that contain NaN right after initialisation.
emb_weights = model.item_emb.weight.data
# nonzero() yields a (k, 1) tensor; flatten() always gives shape (k,), so tolist()
# returns a list even when exactly one row matches (squeeze() would collapse that
# case to a 0-dim tensor and tolist() would return a bare int).
nan_rows = torch.isnan(emb_weights).any(dim=1).nonzero().flatten().tolist()
print("Các row NaN ngay sau init:", nan_rows)

[32m2025-05-03 00:02:32.936[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mNumber of users: 16407, Number of items: 4817[0m


Các row NaN ngay sau init: []


In [10]:
# Smoke test: one forward pass on a hand-written example
# (one user id, one 10-item sequence, one target item).
user = torch.tensor([7411])
seq = torch.tensor([[1782, 1975, 3089, 3719, 4721, 3443, 4178, 2953, 684, 3401]])
target_item = torch.tensor([[474]])
predictions = model(user, seq, target_item)
predictions

tensor([[-0.1111]], grad_fn=<SumBackward1>)

In [11]:
# Show the item count and padding index the model was built with,
# then look up the embedding vector of a single item id.
print(f"item_num = {model.item_num}")
print(f"Padding token index: {model.item_emb.padding_idx}")
model.item_emb(torch.tensor([3089]))  

item_num = 4817
Padding token index: 4817


tensor([[ 2.3936e-03, -7.6499e-03,  1.1814e-02, -2.4329e-02,  1.4673e-02,
          2.5769e-02, -1.1656e-03, -3.3041e-02,  6.1734e-03,  7.9613e-03,
         -1.7432e-02,  8.7700e-03, -1.6371e-02,  2.4884e-02, -1.8612e-02,
         -1.2904e-02, -8.6961e-03,  7.3918e-03, -3.0689e-02, -2.4997e-02,
          3.1686e-02,  1.3092e-02,  5.7486e-03,  2.5254e-02,  2.7055e-02,
         -2.6505e-02,  1.0796e-02, -2.6389e-02, -5.0969e-03,  2.1111e-02,
         -5.5239e-03,  8.4614e-03, -3.1410e-02, -5.0082e-03, -8.6538e-03,
          1.9261e-02,  3.0144e-02,  3.4273e-02,  2.8078e-02,  1.1149e-02,
          2.0210e-03, -9.4555e-03, -1.4229e-02,  1.1223e-04, -3.1423e-02,
          5.0647e-03, -1.9311e-02, -4.5509e-04, -3.1216e-02,  2.7416e-02,
          1.9963e-02,  1.6873e-02,  2.3036e-02, -4.5613e-03,  8.3995e-03,
         -7.6507e-03,  3.2092e-02,  1.3936e-02,  1.2433e-02, -2.3799e-02,
         -1.0306e-02,  2.6887e-02,  1.0682e-02, -1.9861e-02,  2.8178e-02,
          4.6828e-03,  3.4133e-02, -3.

In [12]:
# batch_size = 4
# for i in range(0,10000):
#     user = torch.tensor([[0]])
#     seq = torch.randint(0, model.item_num, (batch_size, args.max_len))
#     seq[:, :np.random.randint(1,10)] = model.item_num 
#     target_item = torch.tensor([[4000]])
#     # print(f"seq: {seq}")
#     predictions = model(user, seq, target_item)
#     # if prediction is returned by nan values, then print the seq
#     if torch.isnan(predictions).any():
#         print("nan prediction")
#         print(seq)
#         break

In [13]:
model

SASRec(
  (item_emb): Embedding(4818, 128, padding_idx=4817)
  (pos_emb): Embedding(10, 128)
  (emb_dropout): Dropout(p=0.3, inplace=False)
  (attention_layernorms): ModuleList(
    (0): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  )
  (attention_layers): ModuleList(
    (0): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
    )
  )
  (forward_layernorms): ModuleList(
    (0): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  )
  (forward_layers): ModuleList(
    (0): PointWiseFeedForward(
      (conv1): Conv1d(128, 128, kernel_size=(1,), stride=(1,))
      (dropout1): Dropout(p=0.3, inplace=False)
      (relu): ReLU()
      (conv2): Conv1d(128, 128, kernel_size=(1,), stride=(1,))
      (dropout2): Dropout(p=0.3, inplace=False)
    )
  )
  (final_layer): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [14]:
# Wrap both splits in SASRecDataset using the pre-computed index columns.
rating_dataset = SASRecDataset(
    train_df, "user_indice", "item_sequence", "item_indice", "rating", args.max_len, n_items, args.timestamp_col,
)
val_rating_dataset = SASRecDataset(
    val_df, "user_indice", "item_sequence", "item_indice", "rating", args.max_len, n_items, args.timestamp_col,
)

# DataLoader raises ValueError for persistent_workers=True when num_workers == 0,
# so only keep workers alive when worker processes are actually requested.
use_persistent_workers = args.num_workers > 0

# Training batches are shuffled and the last incomplete batch is dropped.
train_loader = DataLoader(
    rating_dataset,
    batch_size=args.batch_size,
    shuffle=True,
    drop_last=True,
    num_workers=args.num_workers,
    persistent_workers=use_persistent_workers,
)
# Validation keeps the original row order and every row.
val_loader = DataLoader(
    val_rating_dataset,
    batch_size=args.batch_size,
    shuffle=False,
    drop_last=False,
    num_workers=args.num_workers,
    persistent_workers=use_persistent_workers,
)

In [15]:
# for i in train_loader:
#     print(i["user"])
#     print(i["sequence"])
#     print(i["item"])
#     print(i["rating"])
#     break

In [16]:
# Load the saved ID mapper and, as a spot check, look up the user id stored for index 1.
idm_path = os.path.abspath("../data_for_ai/interim/idm_16407u.json")
idm = IDMapper().load(idm_path)
idm.get_user_id(1)

'AE227WAM4NWQPJI33OPN7ZARNNZQ'

In [17]:
# import torch
# import torch.nn as nn
# import torch.optim as optim

# # 1. Hyper-params & setup
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# num_epochs = 10
# lr = 1e-4          # lower further if NaNs persist
# weight_decay = 1e-4
# grad_clip_norm = 1.0

# # 2. Dataset & DataLoader
# # exist above 

# # 3. Model, Loss, Optimizer, Scheduler
# model = init_model(n_users, n_items, args.dropout, args.hidden_units, args.num_blocks, args.num_heads)
# model = model.to(device)

# criterion = nn.BCEWithLogitsLoss()
# optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=1)

# # 4. Anomaly detection (enable only when debugging)
# # torch.autograd.detect_anomaly(check_nan=True)

# # for epoch in range(1, num_epochs + 1):
#     ##### Training #####
# epoch = 0
# model.train()
# total_train_loss = 0.0
# for batch_idx, batch in enumerate(train_loader, 1):
#     print(f"Epoch {epoch} ─ batch {batch_idx}/{len(train_loader)}")
#     # if batch_idx == 10:
#     #     break
#     users    = batch["user"].to(device)
#     items    = batch["item"].to(device)
#     seqs     = batch["sequence"].long().to(device)
#     labels   = batch["rating"].float().to(device)

#     # Zero gradients
#     optimizer.zero_grad()

#     # Forward
#     logits = model(users, seqs, items).view_as(labels)

#     # Loss
#     loss = criterion(logits, labels)
#     total_train_loss += loss.item()
#     # print(f"Epoch {epoch} ─ batch {batch_idx}/{len(train_loader)} ─ loss: {loss.item():.4f}")
#     # Backward + gradient clipping
#     try:
#         loss.backward()
#     except RuntimeError as e:
#         print(f"🚨 Backward failed: {e}")
#         # inspect all gradients
#         for name, p in model.named_parameters():
#             if p.grad is not None:
#                 has_nan = torch.isnan(p.grad).any().item()
#                 print(f"  grad for {name}: contains_nan={has_nan}, max_abs={p.grad.abs().max().item():.4e}")
#         raise  # still re-raise the exception so execution stops
#     torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_norm)

#     # (optional) log max grad for each layer
#     for name, p in model.named_parameters():
#         if p.grad is not None:
#             if torch.isnan(p.grad).any():
#                 print(f"[NaN grad] {name}")
#             # print(f"{name} grad_norm={p.grad.norm():.4f}")

#     # Step optimizer
#     optimizer.step()

# avg_train_loss = total_train_loss / len(train_loader)
# print(f"Epoch {epoch} ─ train_loss: {avg_train_loss:.4f}")

# ##### Validation #####
# # model.eval()
# # total_val_loss = 0.0
# # with torch.no_grad():
# #     for batch in val_loader:
# #         users  = batch["user"].to(device)
# #         items  = batch["item"].to(device)
# #         seqs   = batch["sequence"].long().to(device)
# #         labels = batch["rating"].float().to(device)

# #         logits = model(users, seqs, items).view_as(labels)
# #         loss = criterion(logits, labels)
# #         total_val_loss += loss.item()

# # avg_val_loss = total_val_loss / len(val_loader)
# # print(f"Epoch {epoch} ─ val_loss:   {avg_val_loss:.4f}")

# # Scheduler step
# # scheduler.step(avg_val_loss)

# print("Training finished.")


## Check for NaN cases in the data

In [18]:
# model = init_model(n_users, n_items, args.dropout, args.hidden_units, args.num_blocks, args.num_heads)
# # Extract the attention layer from the model
# attention_layer = model.attention_layers[0]

# for batch_idx, batch in enumerate(train_loader):
#     user_ids, seq, target_item, labels = batch

#     # Get the embeddings for the sequence
#     sequence_embeddings = model.item_emb(batch["sequence"])
    

#     # Pass the embeddings through the attention layer
#     attention_output, _ = attention_layer(sequence_embeddings, sequence_embeddings, sequence_embeddings)
#     # if any NaN values are found in the attention output, print the batch and the attention output
#     if torch.isnan(attention_output).any():
#         print(f"Batch {batch_idx} - Attention output contains NaN values.")
#         print(batch["sequence"])
#         print("Sample seq:", batch["sequence"][0])
#         print("Item emb weight stats:", model.item_emb.weight.min(), model.item_emb.weight.max())
#         print(attention_output)
#         break
#     # Example: Perform some operation with the attention output
#     predictions = model.final_layer(attention_output[:, -1, :])  # Use the last position for predictions

#     # Check for NaN values in predictions
#     if torch.isnan(predictions).any():
#         print(f"Batch {batch_idx} - Predictions contain NaN values.")
#         print(batch["sequence"])
#         print("Sample seq:", batch["sequence"][0])
#         print("Item emb weight stats:", model.item_emb.weight.min(), model.item_emb.weight.max())
#         print(predictions)
#         break


## Overfit a single batch

In [None]:
# early_stopping = EarlyStopping(
#     monitor="val_loss", patience=5, mode="min", verbose=False
# )
# # create log_dir if it does not exist
# if not os.path.exists(args.notebook_persit_dp):
#     os.makedirs(args.notebook_persit_dp, exist_ok=True)

# model = init_model(n_users, n_items, args.dropout, args.hidden_units, args.num_blocks, args.num_heads)
# lit_model = SASRecLitModule(
#     model,
#     log_dir=args.notebook_persit_dp,
#     accelerator=args.device,
#     lr=args.lr,
#     l2_emb=args.l2_emb,
#     idm= idm
# )

# log_dir = f"{args.notebook_persit_dp}/logs/overfit"
# # create log_dir if it does not exist
# if not os.path.exists(log_dir):
#     os.makedirs(log_dir, exist_ok=True)

# # train model
# trainer = L.Trainer(
#     default_root_dir=log_dir,
#     accelerator=args.device if args.device else "auto",
#     max_epochs=args.num_epochs,
#     overfit_batches=1,
#     callbacks=[early_stopping],
# )
# trainer.fit(
#     model=lit_model,
#     train_dataloaders=train_loader,
#     val_dataloaders=train_loader,
# )
# logger.info(f"Logs available at {trainer.log_dir}")

You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
`Trainer(overfit_batches=1)` was configured so 1 batch will be used.

  | Name  | Type   | Params | Mode 
-----------------------------------------
0 | model | SASRec | 717 K  | train
-----------------------------------------
717 K     Trainable params
0         Non-trainable params
717 K     Total params
2.871     Total estimated model params size (MB)
20        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]


You requested to overfit but enabled val dataloader shuffling. We are turning off the val dataloader shuffling for you.


You requested to overfit but enabled train dataloader shuffling. We are turning off the train dataloader shuffling for you.


The number of training batches (1) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[32m2025-05-03 00:05:26.821[0m | [1mINFO    [0m | [36msrc.algo.gSASRec.trainer[0m:[36mon_fit_end[0m:[36m127[0m - [1mLogging classification metrics...[0m
[32m2025-05-03 00:06:29.767[0m | [1mINFO    [0m | [36msrc.algo.gSASRec.trainer[0m:[36mon_fit_end[0m:[36m130[0m - [1mLogging ranking metrics...[0m

invalid value encountered in divide

[32m2025-05-03 00:07:05.837[0m | [1mINFO    [0m | [36msrc.algo.gSASRec.trainer[0m:[36mon_fit_end[0m:[36m133[0m - [1mEvidently metrics are available at: c:\Users\Trieu\OneDrive\Desktop\recsys\real_time_recsys\notebooks\data\first-attempt\018-sasrec[0m
[32m2025-05-03 00:07:05.837[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mLogs available at c:\Users\Trieu\OneDrive\Desktop\recsys\real_time_recsys\notebooks\data\first-attempt\018-sasrec\logs\overfit\lightning_logs\version_86[0m


{'metrics': [{'metric': 'NDCGKMetric', 'result': {'k': 10, 'current': 1     0.0
2     0.0
3     0.0
4     0.0
5     0.0
6     0.0
7     0.0
8     0.0
9     0.0
10    0.0
dtype: float64, 'current_value': 0.0, 'reference': None, 'reference_value': None}}, {'metric': 'RecallTopKMetric', 'result': {'k': 100, 'current': 0     0.000000
1     0.000000
2     0.000000
3     0.000000
4     0.000000
        ...   
95    0.000103
96    0.000118
97    0.000118
98    0.000118
99    0.000118
Length: 100, dtype: float64, 'current_value': 0.00011809644216692368, 'reference': None, 'reference_value': None}}, {'metric': 'PrecisionTopKMetric', 'result': {'k': 100, 'current': 0     0.000000
1     0.000000
2     0.000000
3     0.000000
4     0.000000
        ...   
95    0.000008
96    0.000008
97    0.000008
98    0.000008
99    0.000008
Length: 100, dtype: float64, 'current_value': 7.923447309075395e-06, 'reference': None, 'reference_value': None}}, {'metric': 'FBetaTopKMetric', 'result': {'k': 10, 'curre

In [None]:
# torch.autograd.set_detect_anomaly(True)

# Both callbacks watch the same quantity: the logged validation loss, lower is better.
early_stopping = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=args.early_stopping_patience,
    min_delta=0.0025,
    verbose=False,
)
checkpoint_callback = ModelCheckpoint(
    monitor="val_loss",
    mode="min",
    save_top_k=1,
    dirpath=f"{args.notebook_persit_dp}/checkpoints",
    filename="best-checkpoint",
)

# Start from a freshly initialised network, cast to float64.
model = init_model(
    n_users, n_items, args.dropout, args.hidden_units, args.num_blocks, args.num_heads
).double()
lit_model = SASRecLitModule(
    model,
    log_dir=args.notebook_persit_dp,
    accelerator=args.device,
    lr=args.lr,
    l2_emb=args.l2_emb,
    idm=idm,
)

# Lightning writes its own logs under this directory; create it up front.
log_dir = f"{args.notebook_persit_dp}/logs/run"
os.makedirs(log_dir, exist_ok=True)

# Debug toggles left disabled: max_epochs=1, detect_anomaly=True,
# gradient_clip_val=1.0 with gradient_clip_algorithm="norm".
trainer = L.Trainer(
    default_root_dir=log_dir,
    accelerator=args.device or "auto",
    max_epochs=args.num_epochs,
    callbacks=[early_stopping, checkpoint_callback],
    logger=args._mlf_logger if args.log_to_mlflow else None,
)
trainer.fit(
    model=lit_model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)


# Change the library as a workaround for the issue in the latest Lightning release
#https://github.com/Lightning-AI/pytorch-lightning/pull/20669/commits/429f732a0528c558e701da7ec01e51c1e2e4f32e

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

Checkpoint directory C:\Users\Trieu\OneDrive\Desktop\recsys\real_time_recsys\notebooks\data\first-attempt\018-sasrec\checkpoints exists and is not empty.


  | Name  | Type   | Params | Mode 
-----------------------------------------
0 | model | SASRec | 717 K  | train
-----------------------------------------
717 K     Trainable params
0         Non-trainable params
717 K     Total params
2.871     Total estimated model params size (MB)
20        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]


Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

In [None]:
all_params = [args]

# "top_K" and "top_k" are logged under distinct, unambiguous names.
# NOTE(review): presumably because keys differing only by case can collide in the
# tracking store — confirm.
key_aliases = {"top_K": "top_big_K", "top_k": "top_small_k"}

if args.log_to_mlflow:
    # Re-open the run created by the Lightning logger so the params land on the same run.
    run_id = trainer.logger.run_id

    with mlflow.start_run(run_id=run_id):
        for params in all_params:
            # Prefix each key with the config class name, e.g. "Args.lr".
            prefix = params.__repr_name__()
            params_ = {
                f"{prefix}.{key_aliases.get(field, field)}": value
                for field, value in params.model_dump().items()
            }
            mlflow.log_params(params_)