In [1]:
%load_ext autoreload
%autoreload 2
%load_ext tensorboard

In [2]:
import pandas as pd
import numpy as np
from pydantic import BaseModel
import sys
import os
from lightning.pytorch.loggers import MLFlowLogger
from loguru import logger
from load_dotenv import load_dotenv
import time
import json
import torch
from torch.utils.data import DataLoader
import lightning as L
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.callbacks import ModelCheckpoint
import mlflow

sys.path.insert(0, "..")

from src.utils.embedding_id_mapper import IDMapper
from src.algo.gSASRec.model import SASRec
from src.algo.gSASRec.dataset import SASRecDataset
from src.algo.gSASRec.trainer import SASRecLitModule
from src.eval.utils import create_rec_df, create_label_df, merge_recs_with_target
from src.eval.log_metrics import log_ranking_metrics, log_classification_metrics

In [3]:
load_dotenv(override = True)

True

In [None]:
class Args(BaseModel):
    testing: bool = False
    log_to_mlflow: bool = True
    experiment_name: str = "first-attempt"
    # run name contains the hyperparameters "lr bs dropout num_heads" used for the run
    run_name: str = f"050-sasrec lr=0.0005 dropout=0.5 num_heads=4" 
    notebook_persit_dp: str = None
    
    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"
    group_name: str = "seq-modelling"

    top_K: int = 100
    top_k: int = 10

    batch_size: int = 256
    lr: float = 0.0005
    l2_emb: float = 0.0001
    early_stopping_patience: int = 5
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    num_epochs: int = 100

    # SASrec specific
    max_len: int = 50
    dropout: float = 0.5
    hidden_units: int = 128
    num_blocks: int = 1
    num_heads: int = 4
    num_workers: int = 3
    seq_length: int = 50
    
    train_data_fp: str = os.path.abspath("../data_for_ai/interim/train_sample_interactions_16407u_neg_seq.parquet")
    val_data_fp: str = os.path.abspath("../data_for_ai/interim/val_sample_interactions_16407u_neg_seq.parquet")

    def init(self):
        self.notebook_persit_dp = os.path.abspath(f"data/{self.experiment_name}/{self.run_name}")

        if not (mlflow_uri := os.environ.get("MLFLOW_TRACKING_URI")):
            self.log_to_mlflow = False
            logger.warning("MLFlow is not enabled. Turn off tracking to Mlflow.")

        if self.log_to_mlflow:
            logger.info(
                f"Setting up Mlflow experiment: {self.experiment_name}, run_name: {self.run_name}"
            )

            self._mlf_logger = MLFlowLogger(
                experiment_name=self.experiment_name,
                run_name=self.run_name,
                tracking_uri=mlflow_uri,
                log_model=True,
            )

        if not self.testing:
            os.makedirs(self.notebook_persit_dp, exist_ok=True)
        return self
    
args = Args().init()
print(args.model_dump_json(indent=2))

[32m2025-05-11 22:26:35.622[0m | [1mINFO    [0m | [36m__main__[0m:[36minit[0m:[36m45[0m - [1mSetting up Mlflow experiment: first-attempt, run_name: 050-sasrec lr=0.00006 dropout=0.2 num_heads=4[0m


{
  "testing": false,
  "log_to_mlflow": true,
  "experiment_name": "first-attempt",
  "run_name": "050-sasrec lr=0.00006 dropout=0.2 num_heads=4",
  "notebook_persit_dp": "c:\\Users\\Trieu\\OneDrive\\Desktop\\recsys\\real_time_recsys\\notebooks\\data\\first-attempt\\050-sasrec lr=0.00006 dropout=0.2 num_heads=4",
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "group_name": "seq-modelling",
  "top_K": 100,
  "top_k": 10,
  "batch_size": 256,
  "lr": 0.00006,
  "l2_emb": 0.0,
  "early_stopping_patience": 10,
  "device": "cpu",
  "num_epochs": 100,
  "max_len": 10,
  "dropout": 0.2,
  "hidden_units": 128,
  "num_blocks": 1,
  "num_heads": 4,
  "num_workers": 3,
  "seq_length": 50,
  "train_data_fp": "c:\\Users\\Trieu\\OneDrive\\Desktop\\recsys\\real_time_recsys\\data_for_ai\\interim\\train_sample_interactions_16407u_neg_seq.parquet",
  "val_data_fp": "c:\\Users\\Trieu\\OneDrive\\Desktop\\recsys\\real_time_recsys\\data_for

In [5]:
train_df = pd.read_parquet(args.train_data_fp)
train_df[args.rating_col] = train_df[args.rating_col].apply(lambda x: 1 if x > 0 else 0)    
# train_df = train_df[train_df['item_sequence'].apply(lambda seq: not all(item == -1 for item in seq))]        

val_df = pd.read_parquet(args.val_data_fp)
val_df[args.rating_col] = val_df[args.rating_col].apply(lambda x: 1 if x > 0 else 0)
# val_df = val_df[val_df['item_sequence'].apply(lambda seq: not all(item == -1 for item in seq))]

assert set(val_df[args.user_col].unique()).issubset(set(train_df[args.user_col].unique())), "Validation users must be present in training users."

assert set(val_df[args.item_col].unique()).issubset(set(train_df[args.item_col].unique())), "Validation items must be present in training items."
assert train_df[args.timestamp_col].max() < val_df[args.timestamp_col].min(), "Validation data must be after training data. Otherwise, its a data contamination problem."

In [6]:
val_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
0,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B004FV4ROA,1,2020-12-27 00:30:31.146,11295,528,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
1,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B07KFQFDNB,0,2020-12-27 00:30:31.146,11295,3503,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
2,AEHS7YR7BGGWMZS24H5UR5IP46HQ,B08F1P3BCC,1,2020-12-27 01:44:52.242,1784,3925,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
3,AEHS7YR7BGGWMZS24H5UR5IP46HQ,B00HXT8EKE,0,2020-12-27 01:44:52.242,1784,1507,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
4,AGAVHCK42EGMVS7DGPRX6HBCUCNQ,B09Q3NR84W,1,2020-12-27 02:25:48.357,9042,4273,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
...,...,...,...,...,...,...,...
6953,AEEQZRQBOFHFBFPYBX2BZ5WOI33A,B00007KDX6,1,2022-02-19 16:56:53.030,1396,32,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
6954,AHLN6GKTKZE22AON34YAQXTGK63A,B09SWWCN6Q,0,2022-02-19 17:28:55.519,14550,4303,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
6955,AHLN6GKTKZE22AON34YAQXTGK63A,B0C682GZ5X,1,2022-02-19 17:28:55.519,14550,4772,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
6956,AEMYBWDN67IB5IBTMHLHN76V4QHQ,B091K4WYD1,1,2022-02-19 22:08:53.253,2446,4086,"[528, 395, 3226, 2286, 4734, 856, 631, 890, 45..."


In [7]:
train_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
0,AFZ4EK2LJ655XQKTEUELCARO6RYA,B00002EQCW,1,2003-01-23 03:28:15.000,8071,4,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
1,AFZ4EK2LJ655XQKTEUELCARO6RYA,B095JX15XF,0,2003-01-23 03:28:15.000,8071,4132,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
2,AFY2C4YOUP2SSMM43HD2L3FIEFZA,B00008SCFL,1,2003-11-25 18:12:09.000,7935,36,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
3,AFY2C4YOUP2SSMM43HD2L3FIEFZA,B00OQVZDJM,0,2003-11-25 18:12:09.000,7935,1859,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
4,AHF3TGIOSTD2UCHF3MO4MIHFJ5NQ,B07KQWX947,1,2004-06-18 02:02:57.000,13705,3514,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
...,...,...,...,...,...,...,...
254779,AES2U6KIAORYLTBPENQWMDVALTDQ,B07ZZVX1F2,1,2020-12-26 21:37:58.968,3109,3800,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
254780,AGU6SDEIMLBQZII2FVFJ6YIUZRKQ,B0BZJ9BYZ3,0,2020-12-26 22:29:54.459,11489,4696,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
254781,AGU6SDEIMLBQZII2FVFJ6YIUZRKQ,B0BSF5LM3J,1,2020-12-26 22:29:54.459,11489,4622,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
254782,AG2HB7HEYSIAGYBEFFL666KVYTHA,B0895KGSY1,0,2020-12-26 23:06:03.454,8251,3896,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."


In [8]:
one_user_train_df = train_df[train_df["user_id"] == "AGSP5XAQPQBUUXZHEZSC65FD7NOQ"][["timestamp","parent_asin","rating","item_sequence"]]
# change type of item_sequence column into int and save as csv
one_user_train_df["item_sequence"] = one_user_train_df["item_sequence"].apply(lambda x: [int(i) for i in x])
# sort the dataframe by timestamp
one_user_train_df = one_user_train_df.sort_values(by="timestamp")
# save as csv
one_user_train_df.to_csv("one_user_train_df.csv", index=False)

In [9]:
one_user_val_df = val_df[val_df["user_id"] == "AGSP5XAQPQBUUXZHEZSC65FD7NOQ"][["timestamp","parent_asin","rating","item_sequence"]]
# change type of item_sequence column into int and save as csv
one_user_val_df["item_sequence"] = one_user_val_df["item_sequence"].apply(lambda x: [int(i) for i in x])
# sort the dataframe by timestamp
one_user_val_df = one_user_val_df.sort_values(by="timestamp")
# save as csv
one_user_val_df.to_csv("one_user_val_df.csv", index=False)

In [10]:
# Add this code to analyze your sequence data
def analyze_sequence_data(df, seq_col='item_sequence', rating_col='rating'):
    # 1. Sequence length distribution
    seq_lengths = df[seq_col].apply(lambda x: sum(1 for i in x if i != -1))
    
    # 2. Rating distribution
    rating_dist = df[rating_col].value_counts(normalize=True)
    
    # 3. Item frequency in sequences
    all_items = [item for seq in df[seq_col].tolist() for item in seq if item != -1]
    item_counts = pd.Series(all_items).value_counts()
    
    # 4. Calculate item popularity skew
    top_10_pct = item_counts.head(int(len(item_counts)*0.1)).sum() / item_counts.sum()
    
    return {
        "avg_seq_length": seq_lengths.mean(),
        "median_seq_length": seq_lengths.median(),
        "short_seqs_pct": (seq_lengths <= 2).mean() * 100,  # % sequences with ≤2 items
        "max_seq_length": seq_lengths.max(),
        "rating_distribution": rating_dist.to_dict(),
        "unique_items_in_sequences": len(set(all_items)),
        "top_10pct_items_coverage": top_10_pct * 100,  # % of interactions covered by top 10% popular items
    }

# Run the analysis on train and validation sets
train_stats = analyze_sequence_data(train_df)
val_stats = analyze_sequence_data(val_df)
# Print the statistics
print("Training data statistics:", train_stats)
print("Validation data statistics:", val_stats)

# print("Training data statistics:")
# print(json.dumps(train_stats, indent=2))
# print("\nValidation data statistics:")
# print(json.dumps(val_stats, indent=2))

Training data statistics: {'avg_seq_length': 9.692512088671188, 'median_seq_length': 7.0, 'short_seqs_pct': 19.31871703089676, 'max_seq_length': 50, 'rating_distribution': {1: 0.5, 0: 0.5}, 'unique_items_in_sequences': 4817, 'top_10pct_items_coverage': 38.00919782449624}
Validation data statistics: {'avg_seq_length': 20.355849382006323, 'median_seq_length': 16.0, 'short_seqs_pct': 0.0, 'max_seq_length': 50, 'rating_distribution': {1: 0.5, 0: 0.5}, 'unique_items_in_sequences': 4779, 'top_10pct_items_coverage': 39.69965263068712}


In [11]:
def init_model(n_user, n_items, dropout, hidden_units, num_blocks, num_heads, seq_length):
    """
    Initialize the model with the given parameters.
    """
    model = SASRec(
        user_num = n_user,
        item_num = n_items,
        dropout_rate = dropout,
        hidden_units = hidden_units,
        num_blocks = num_blocks,
        num_heads = num_heads,
        sequence_length = seq_length,
    )
    return model

In [12]:
item_indices = train_df[args.item_col].unique()
user_indices = train_df[args.user_col].unique()
n_items = len(item_indices)
n_users = len(user_indices)

logger.info(f"Number of users: {n_users}, Number of items: {n_items}")
model = init_model(n_users, n_items, args.dropout, args.hidden_units, args.num_blocks, args.num_heads, args.seq_length)
emb_weights = model.item_emb.weight.data
nan_rows = torch.isnan(emb_weights).any(dim=1).nonzero().squeeze().tolist()
print("Các row NaN ngay sau init:", nan_rows)

[32m2025-05-11 22:26:44.662[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mNumber of users: 16407, Number of items: 4817[0m


Các row NaN ngay sau init: []


In [13]:
# user = torch.tensor([7411])
# seq = torch.tensor([[1782, 1975, 3089, 3719, 4721, 3443, 4178, 2953, 684, 3401]])
# target_item = torch.tensor([[474]])
# predictions = model(user, seq, target_item)
# predictions

In [14]:
print(f"item_num = {model.item_num}")
print(f"Padding token index: {model.item_emb.padding_idx}")
model.item_emb(torch.tensor([3089]))  

item_num = 4817
Padding token index: 4817


tensor([[-0.0103,  0.0065,  0.0075,  0.0237,  0.0049,  0.0029,  0.0139,  0.0252,
          0.0081, -0.0062,  0.0020, -0.0315, -0.0343,  0.0042, -0.0158,  0.0296,
         -0.0159, -0.0278, -0.0075, -0.0069, -0.0115,  0.0147, -0.0008,  0.0308,
         -0.0011,  0.0205,  0.0137, -0.0200, -0.0156, -0.0093, -0.0026, -0.0121,
          0.0036,  0.0191, -0.0027,  0.0071, -0.0120,  0.0513, -0.0123, -0.0188,
          0.0141,  0.0460, -0.0023, -0.0069,  0.0172,  0.0159,  0.0003, -0.0241,
         -0.0308,  0.0469,  0.0203,  0.0354,  0.0001,  0.0184,  0.0469,  0.0094,
          0.0031,  0.0338,  0.0093, -0.0141,  0.0088,  0.0001,  0.0164,  0.0127,
          0.0229,  0.0028, -0.0058,  0.0062, -0.0145,  0.0137, -0.0051, -0.0308,
         -0.0158, -0.0099, -0.0373, -0.0178, -0.0207, -0.0090, -0.0080, -0.0176,
         -0.0122, -0.0028,  0.0011, -0.0195,  0.0016, -0.0017, -0.0239,  0.0210,
         -0.0323, -0.0191, -0.0384, -0.0082, -0.0156,  0.0078,  0.0141, -0.0006,
         -0.0170, -0.0156,  

In [15]:
# batch_size = 4
# for i in range(0,10000):
#     user = torch.tensor([[0]])
#     seq = torch.randint(0, model.item_num, (batch_size, args.max_len))
#     seq[:, :np.random.randint(1,10)] = model.item_num 
#     target_item = torch.tensor([[4000]])
#     # print(f"seq: {seq}")
#     predictions = model(user, seq, target_item)
#     # if prediction is returned by nan values, then print the seq
#     if torch.isnan(predictions).any():
#         print("nan prediction")
#         print(seq)
#         break

In [16]:
model

SASRec(
  (item_emb): Embedding(4818, 128, padding_idx=4817)
  (pos_emb): Embedding(50, 128)
  (emb_dropout): Dropout(p=0.2, inplace=False)
  (attention_layernorms): ModuleList(
    (0): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  )
  (attention_layers): ModuleList(
    (0): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
    )
  )
  (forward_layernorms): ModuleList(
    (0): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  )
  (forward_layers): ModuleList(
    (0): PointWiseFeedForward(
      (conv1): Conv1d(128, 128, kernel_size=(1,), stride=(1,))
      (dropout1): Dropout(p=0.2, inplace=False)
      (relu): ReLU()
      (conv2): Conv1d(128, 128, kernel_size=(1,), stride=(1,))
      (dropout2): Dropout(p=0.2, inplace=False)
    )
  )
  (final_layer): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [17]:
# small_train_df = train_df.sample(frac=0.2, random_state=42)

rating_dataset = SASRecDataset(
    train_df, "user_indice", "item_sequence", "item_indice", "rating",args.max_len, n_items, args.timestamp_col, 
)
val_rating_dataset = SASRecDataset(
    val_df, "user_indice", "item_sequence", "item_indice", "rating", args.max_len, n_items, args.timestamp_col, 
)

train_loader = DataLoader(
    rating_dataset, batch_size=args.batch_size, shuffle=True, drop_last=True, num_workers=args.num_workers, persistent_workers=True
)
val_loader = DataLoader(
    val_rating_dataset, batch_size=args.batch_size, shuffle=False, drop_last=False, num_workers=args.num_workers, persistent_workers=True
)

In [18]:
for i in train_loader:
    print(i["user"].shape)
    print(i["sequence"].shape)
    print(i["item"].shape)
    print(i["rating"].shape)
    break

torch.Size([256])
torch.Size([256, 50])
torch.Size([256])
torch.Size([256])


In [19]:
idm_path = os.path.abspath("../data_for_ai/interim/idm_16407u.json")
idm = IDMapper().load(idm_path)
idm.get_user_id(1)

'AE227WAM4NWQPJI33OPN7ZARNNZQ'

In [20]:
# import torch
# import torch.nn as nn
# import torch.optim as optim

# # 1. Hyper-params & setup
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# num_epochs = 10
# lr = 1e-4          # giảm thêm nếu vẫn NaN
# weight_decay = 1e-4
# grad_clip_norm = 1.0

# # 2. Dataset & DataLoader
# # exist above 

# # 3. Model, Loss, Optimizer, Scheduler
# model = init_model(n_users, n_items, args.dropout, args.hidden_units, args.num_blocks, args.num_heads)
# model = model.to(device)

# criterion = nn.BCEWithLogitsLoss()
# optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=1)

# # 4. Anomaly detection (chỉ bật khi debug)
# # torch.autograd.detect_anomaly(check_nan=True)

# # for epoch in range(1, num_epochs + 1):
#     ##### Training #####
# epoch = 0
# model.train()
# total_train_loss = 0.0
# for batch_idx, batch in enumerate(train_loader, 1):
#     print(f"Epoch {epoch} ─ batch {batch_idx}/{len(train_loader)}")
#     # if batch_idx == 10:
#     #     break
#     users    = batch["user"].to(device)
#     items    = batch["item"].to(device)
#     seqs     = batch["sequence"].long().to(device)
#     labels   = batch["rating"].float().to(device)

#     # Zero gradients
#     optimizer.zero_grad()

#     # Forward
#     logits = model(users, seqs, items).view_as(labels)

#     # Loss
#     loss = criterion(logits, labels)
#     total_train_loss += loss.item()
#     # print(f"Epoch {epoch} ─ batch {batch_idx}/{len(train_loader)} ─ loss: {loss.item():.4f}")
#     # Backward + gradient clipping
#     try:
#         loss.backward()
#     except RuntimeError as e:
#         print(f"🚨 Backward failed: {e}")
#         # inspect tất cả gradients
#         for name, p in model.named_parameters():
#             if p.grad is not None:
#                 has_nan = torch.isnan(p.grad).any().item()
#                 print(f"  grad for {name}: contains_nan={has_nan}, max_abs={p.grad.abs().max().item():.4e}")
#         raise  # vẫn ném exception để dừng
#     torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_norm)

#     # (optional) log max grad for each layer
#     for name, p in model.named_parameters():
#         if p.grad is not None:
#             if torch.isnan(p.grad).any():
#                 print(f"[NaN grad] {name}")
#             # print(f"{name} grad_norm={p.grad.norm():.4f}")

#     # Step optimizer
#     optimizer.step()

# avg_train_loss = total_train_loss / len(train_loader)
# print(f"Epoch {epoch} ─ train_loss: {avg_train_loss:.4f}")

# ##### Validation #####
# # model.eval()
# # total_val_loss = 0.0
# # with torch.no_grad():
# #     for batch in val_loader:
# #         users  = batch["user"].to(device)
# #         items  = batch["item"].to(device)
# #         seqs   = batch["sequence"].long().to(device)
# #         labels = batch["rating"].float().to(device)

# #         logits = model(users, seqs, items).view_as(labels)
# #         loss = criterion(logits, labels)
# #         total_val_loss += loss.item()

# # avg_val_loss = total_val_loss / len(val_loader)
# # print(f"Epoch {epoch} ─ val_loss:   {avg_val_loss:.4f}")

# # Scheduler step
# # scheduler.step(avg_val_loss)

# print("Training finished.")


## check nan cases in data

In [21]:
# model = init_model(n_users, n_items, args.dropout, args.hidden_units, args.num_blocks, args.num_heads)
# # Extract the attention layer from the model
# attention_layer = model.attention_layers[0]

# for batch_idx, batch in enumerate(train_loader):
#     user_ids, seq, target_item, labels = batch

#     # Get the embeddings for the sequence
#     sequence_embeddings = model.item_emb(batch["sequence"])
    

#     # Pass the embeddings through the attention layer
#     attention_output, _ = attention_layer(sequence_embeddings, sequence_embeddings, sequence_embeddings)
#     # if any NaN values are found in the attention output, print the batch and the attention output
#     if torch.isnan(attention_output).any():
#         print(f"Batch {batch_idx} - Attention output contains NaN values.")
#         print(batch["sequence"])
#         print("Sample seq:", batch["sequence"][0])
#         print("Item emb weight stats:", model.item_emb.weight.min(), model.item_emb.weight.max())
#         print(attention_output)
#         break
#     # Example: Perform some operation with the attention output
#     predictions = model.final_layer(attention_output[:, -1, :])  # Use the last position for predictions

#     # Check for NaN values in predictions
#     if torch.isnan(predictions).any():
#         print(f"Batch {batch_idx} - Predictions contain NaN values.")
#         print(batch["sequence"])
#         print("Sample seq:", batch["sequence"][0])
#         print("Item emb weight stats:", model.item_emb.weight.min(), model.item_emb.weight.max())
#         print(predictions)
#         break


## overfit 1 batch

In [22]:
# early_stopping = EarlyStopping(
#     monitor="val_loss", patience=5, mode="min", verbose=False
# )
# # create log_dir if it does not exist
# if not os.path.exists(args.notebook_persit_dp):
#     os.makedirs(args.notebook_persit_dp, exist_ok=True)

# model = init_model(n_users, n_items, args.dropout, args.hidden_units, args.num_blocks, args.num_heads, args.seq_length)
# lit_model = SASRecLitModule(
#     model,
#     log_dir=args.notebook_persit_dp,
#     accelerator=args.device,
#     lr=args.lr,
#     l2_emb=args.l2_emb,
#     idm= idm
# )

# log_dir = f"{args.notebook_persit_dp}/logs/overfit"
# # create log_dir if it does not exist
# if not os.path.exists(log_dir):
#     os.makedirs(log_dir, exist_ok=True)

# # train model
# trainer = L.Trainer(
#     default_root_dir=log_dir,
#     accelerator=args.device if args.device else "auto",
#     max_epochs=10,
#     # max_epochs=args.num_epochs,
#     overfit_batches=1,
#     callbacks=[early_stopping],
# )
# trainer.fit(
#     model=lit_model,
#     train_dataloaders=train_loader,
#     val_dataloaders=train_loader,
# )
# logger.info(f"Logs available at {trainer.log_dir}")

In [None]:
# torch.autograd.set_detect_anomaly(True)

early_stopping = EarlyStopping(
    monitor="val_loss", patience=args.early_stopping_patience, mode="min", verbose=False, min_delta=0.0025
)

checkpoint_callback = ModelCheckpoint(
    dirpath=f"{args.notebook_persit_dp}/checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    monitor="val_loss",
    mode="min",
)

model = init_model(n_users, n_items, args.dropout, args.hidden_units, args.num_blocks, args.num_heads, args.seq_length)
# model = model.double()
lit_model = SASRecLitModule(
    model,
    log_dir=args.notebook_persit_dp,
    accelerator=args.device,
    lr=args.lr,
    l2_emb=args.l2_emb,
    idm= idm
)

log_dir = f"{args.notebook_persit_dp}/logs/run"
# create log_dir if it does not exist
if not os.path.exists(log_dir):
    os.makedirs(log_dir, exist_ok=True)
    
# train model
trainer = L.Trainer(
    default_root_dir=log_dir,
    accelerator=args.device if args.device else "auto",
    # max_epochs=1,
    # detect_anomaly=True,
    max_epochs=args.num_epochs,
    # gradient_clip_val=1.0,     
    # gradient_clip_algorithm="norm",
    callbacks=[early_stopping, checkpoint_callback],
    logger=args._mlf_logger if args.log_to_mlflow else None,
)
trainer.fit(
    model=lit_model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)


# Change the library as a workaround for the issue in the latest Lightning release
#https://github.com/Lightning-AI/pytorch-lightning/pull/20669/commits/429f732a0528c558e701da7ec01e51c1e2e4f32e

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type   | Params | Mode 
-----------------------------------------
0 | model | SASRec | 722 K  | train
-----------------------------------------
722 K     Trainable params
0         Non-trainable params
722 K     Total params
2.891     Total estimated model params size (MB)
20        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Val Loss: 0.6974566578865051
Val Loss: 0.6997570991516113


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Val Loss: 0.6962968111038208
Val Loss: 0.6970601677894592
Val Loss: 0.6930683255195618
Val Loss: 0.7009881138801575
Val Loss: 0.6959498524665833
Val Loss: 0.6952573657035828
Val Loss: 0.6927010416984558
Val Loss: 0.7005616426467896
Val Loss: 0.6954341530799866
Val Loss: 0.6972519159317017
Val Loss: 0.6979076862335205
Val Loss: 0.691720724105835
Val Loss: 0.6986429691314697
Val Loss: 0.6956564784049988
Val Loss: 0.7043942213058472
Val Loss: 0.6992948651313782
Val Loss: 0.6967430114746094
Val Loss: 0.7009515762329102
Val Loss: 0.6960301399230957
Val Loss: 0.7002236247062683
Val Loss: 0.704680323600769
Val Loss: 0.6964830756187439
Val Loss: 0.6953856348991394
Val Loss: 0.699321448802948
Val Loss: 0.6958011388778687
Val Loss: 0.6973512172698975
Val Loss: 0.6872048377990723
Val Loss: 0.7017459869384766


`Trainer.fit` stopped: `max_epochs=1` reached.


Recommendations_df:    user_indice recommendation     score
0        11295           4172   0.63765
1        11295            422  0.620351
2        11295             13  0.614194
3        11295           4218  0.597724
4        11295            744  0.597677
Recommendations_df:      user_indice item_indice     score  rec_ranking  \
0          11295        4172   0.63765          1.0   
1          11295         422  0.620351          2.0   
2          11295          13  0.614194          3.0   
3          11295        4218  0.597724          4.0   
4          11295         744  0.597677          5.0   
..           ...         ...       ...          ...   
995         1134        2464  0.551014         96.0   
996         1134         881  0.550933         97.0   
997         1134        1078  0.550834         98.0   
998         1134        3893  0.550776         99.0   
999         1134        1197  0.550762        100.0   

                          user_id parent_asin  
0    AGSP5X


invalid value encountered in divide



{'metrics': [{'metric': 'NDCGKMetric', 'result': {'k': 10, 'current': 1     0.0
2     0.0
3     0.0
4     0.0
5     0.0
6     0.0
7     0.0
8     0.0
9     0.0
10    0.0
dtype: float64, 'current_value': 0.0, 'reference': None, 'reference_value': None}}, {'metric': 'RecallTopKMetric', 'result': {'k': 100, 'current': 0     0.000000
1     0.000000
2     0.000000
3     0.000000
4     0.000000
        ...   
95    0.000206
96    0.000206
97    0.000206
98    0.000206
99    0.000206
Length: 100, dtype: float64, 'current_value': 0.00020627062706270627, 'reference': None, 'reference_value': None}}, {'metric': 'PrecisionTopKMetric', 'result': {'k': 100, 'current': 0     0.000000
1     0.000000
2     0.000000
3     0.000000
4     0.000000
        ...   
95    0.000004
96    0.000004
97    0.000004
98    0.000004
99    0.000004
Length: 100, dtype: float64, 'current_value': 4.125412541254125e-06, 'reference': None, 'reference_value': None}}, {'metric': 'FBetaTopKMetric', 'result': {'k': 10, 'curre

[32m2025-05-11 22:35:08.944[0m | [1mINFO    [0m | [36msrc.algo.gSASRec.trainer[0m:[36mon_fit_end[0m:[36m135[0m - [1mEvidently metrics are available at: c:\Users\Trieu\OneDrive\Desktop\recsys\real_time_recsys\notebooks\data\first-attempt\050-sasrec lr=0.00006 dropout=0.2 num_heads=4[0m


🏃 View run 050-sasrec lr=0.00006 dropout=0.2 num_heads=4 at: http://138.2.61.6:5002/#/experiments/2/runs/f00e9203599d48bfaf1072c0b79aad64
🧪 View experiment at: http://138.2.61.6:5002/#/experiments/2


In [43]:
all_params = [args]

if args.log_to_mlflow:
    run_id = trainer.logger.run_id

    with mlflow.start_run(run_id=run_id):
        for params in all_params:
            params_dict = params.model_dump()
            params_ = dict()
            for k, v in params_dict.items():
                if k == "top_K":
                    k = "top_big_K"
                if k == "top_k":
                    k = "top_small_k"
                params_[f"{params.__repr_name__()}.{k}"] = v
            mlflow.log_params(params_)

AttributeError: 'TensorBoardLogger' object has no attribute 'run_id'

In [21]:
args = Args().init()  # Load lại các tham số từ class Args
model = init_model(n_users, n_items, args.dropout, args.hidden_units, args.num_blocks, args.num_heads)

# Đường dẫn đến file checkpoint
checkpoint_path = f"C:/Users/Trieu/OneDrive/Desktop/recsys/real_time_recsys/notebooks/data/first-attempt/050-sasrec/checkpoints/best_checkpoint.ckpt"

# Load checkpoint
checkpoint = torch.load(checkpoint_path, map_location=torch.device(args.device))
print(checkpoint.keys())
print(checkpoint['state_dict'].keys())

# Tạo một state_dict mới, loại bỏ tiền tố "model."
model_state_dict = {k.replace("model.", ""): v for k, v in checkpoint['state_dict'].items() if k.startswith("model.")}

# Load state_dict đã điều chỉnh vào mô hình SASRec
model.load_state_dict(model_state_dict)
model.eval()  # Chuyển sang chế độ đánh giá

[32m2025-05-10 20:48:05.748[0m | [1mINFO    [0m | [36m__main__[0m:[36minit[0m:[36m45[0m - [1mSetting up Mlflow experiment: first-attempt, run_name: 050-sasrec lr=0.00006 dropout=0.2 num_heads=4[0m


dict_keys(['epoch', 'global_step', 'pytorch-lightning_version', 'state_dict', 'loops', 'callbacks', 'optimizer_states', 'lr_schedulers'])
odict_keys(['model.item_emb.weight', 'model.pos_emb.weight', 'model.attention_layernorms.0.weight', 'model.attention_layernorms.0.bias', 'model.attention_layers.0.in_proj_weight', 'model.attention_layers.0.in_proj_bias', 'model.attention_layers.0.out_proj.weight', 'model.attention_layers.0.out_proj.bias', 'model.forward_layernorms.0.weight', 'model.forward_layernorms.0.bias', 'model.forward_layers.0.conv1.weight', 'model.forward_layers.0.conv1.bias', 'model.forward_layers.0.conv2.weight', 'model.forward_layers.0.conv2.bias', 'model.final_layer.weight', 'model.final_layer.bias'])


SASRec(
  (item_emb): Embedding(4818, 128, padding_idx=4817)
  (pos_emb): Embedding(10, 128)
  (emb_dropout): Dropout(p=0.2, inplace=False)
  (attention_layernorms): ModuleList(
    (0): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  )
  (attention_layers): ModuleList(
    (0): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
    )
  )
  (forward_layernorms): ModuleList(
    (0): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  )
  (forward_layers): ModuleList(
    (0): PointWiseFeedForward(
      (conv1): Conv1d(128, 128, kernel_size=(1,), stride=(1,))
      (dropout1): Dropout(p=0.2, inplace=False)
      (relu): ReLU()
      (conv2): Conv1d(128, 128, kernel_size=(1,), stride=(1,))
      (dropout2): Dropout(p=0.2, inplace=False)
    )
  )
  (final_layer): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [56]:
# Ví dụ: chọn một batch từ val_loader
# create a random index to get a sample from a batch
sample_index = np.random.randint(0, 512)
sample_batch = next(iter(train_loader))
user = sample_batch["user"].to(args.device)
seq = sample_batch["sequence"].to(args.device)
target_item = sample_batch["item"].to(args.device)
rating = sample_batch["rating"].to(args.device)
# print shape
# print(f"User: {user.shape}, Seq: {seq.shape}, Target item: {target_item.shape} ")
# print(f"User: {user}, Seq: {seq}, Target item: {target_item}, Rating: {rating}")
# user_indice = torch.tensor([11295])  # ID người dùng đã được ánh xạ
# sequence = torch.tensor([[4817, 1898, 3479, 3908, 1570, 91, 2723, 2962, 106, 3557]])  # Lịch sử tương tác
# target = torch.tensor([528])  # Item mục tiêu cần dự đoán

In [57]:
with torch.no_grad():
    # Chuyển dữ liệu sang device phù hợp (CPU/GPU)
    user = user.to(args.device)
    seq = seq.to(args.device)
    target = target_item.to(args.device)
    
    # Dự đoán
    logits = model(user, seq, target)
    prediction = torch.sigmoid(logits)  # Áp dụng sigmoid để có xác suất
# print(f"Logits: {logits}")
# print(f"Prediction score: {prediction}")
# print(f"rating: {rating}")
# cal the loss
criterion = torch.nn.BCEWithLogitsLoss()
loss = criterion(logits.unsqueeze(0), rating.unsqueeze(0).float())
print(f"Loss: {loss.item()}")

Loss: 0.6237632632255554


In [63]:
# print a random sample of the sequence
sample_id = np.random.randint(0, 256)
print(f"Sample seq: {seq[sample_id]}")
print(f"Sample user: {user[sample_id]}")
print(f"Sample target item: {target_item[sample_id]}")
print(f"Sample rating: {rating[sample_id]}")
print(f"Sample prediction: {prediction[sample_id]}")

Sample seq: tensor([4817, 4817, 2615, 4678, 1936, 1192, 4734, 4264, 3723, 2168])
Sample user: 4468
Sample target item: 4516
Sample rating: 1.0
Sample prediction: 0.463296115398407
