## Setup

In [20]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import json
import os
import sys
import time
from typing import Optional

import lightning as L
import mlflow
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.loggers import MLFlowLogger
from load_dotenv import load_dotenv
from loguru import logger
from pydantic import BaseModel
from torch.utils.data import DataLoader

sys.path.insert(0, "..")

from src.utils.embedding_id_mapper import IDMapper
from src.algo.sequence.model import SequenceRatingPrediction
from src.algo.sequence.dataset import UserItemRatingDFDataset
from src.algo.sequence.trainer import SeqModellingLitModule
from src.eval.utils import create_rec_df, create_label_df, merge_recs_with_target
from src.eval.log_metrics import log_ranking_metrics, log_classification_metrics

In [4]:
# Populate os.environ before Args.init() reads MLFLOW_TRACKING_URI from it.
# NOTE(review): override=True presumably lets .env values replace variables that
# are already set — confirm against the load_dotenv package in use.
load_dotenv(override = True)

True

## Args

In [5]:
class Args(BaseModel):
    """Run configuration for this notebook.

    Groups the experiment identifiers, dataset locations, column names,
    recommendation cut-offs and model/training hyperparameters in one place.
    Call :meth:`init` after construction to resolve the output directory and
    set up MLflow logging.
    """

    testing: bool = False
    log_to_mlflow: bool = True
    experiment_name: str = "first-attempt"
    run_name: str = "006-sequence-modelling"
    # Output directory for this run; stays None until init() resolves it.
    notebook_persit_dp: Optional[str] = None

    hf_dataset_name: str = "McAuley-Lab/Amazon-Reviews-2023"
    # Load the 0-core split to demo real-world problems: cold-start, sparse data.
    amz_rating_hf_dataset_path: str = "0core_rating_only_Electronics"
    amz_metadata_hf_dataset_path: str = "raw_meta_Electronics"

    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"
    group_name: str = "seq-modelling"

    # NOTE(review): presumably candidate-pool size vs. final list size — confirm.
    top_K: int = 100
    top_k: int = 10

    batch_size: int = 64
    learning_rate: float = 0.001
    l2_reg: float = 1e-4
    early_stopping_patience: int = 10
    # Evaluated once, when the class is defined.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    max_epochs: int = 100

    # Sequence-model hyperparameters
    dropout: float = 0.2
    embedding_dim: int = 128

    train_data_fp: str = os.path.abspath("../data_for_ai/interim/train_sample_interactions_16407u_neg_seq.parquet")
    val_data_fp: str = os.path.abspath("../data_for_ai/interim/val_sample_interactions_16407u_neg_seq.parquet")

    def init(self):
        """Finish environment-dependent setup and return ``self``.

        * Resolves ``notebook_persit_dp`` to the absolute form of
          ``data/<experiment_name>/<run_name>`` (relative to the current
          working directory).
        * Turns ``log_to_mlflow`` off, with a warning, when the
          ``MLFLOW_TRACKING_URI`` environment variable is unset or empty;
          otherwise stores an ``MLFlowLogger`` on ``self._mlf_logger``.
        * Creates the output directory unless ``testing`` is true.

        Returns:
            This instance, so construction and setup can be chained.
        """
        self.notebook_persit_dp = os.path.abspath(f"data/{self.experiment_name}/{self.run_name}")

        # Define the attribute on every path so callers can compare it to None
        # instead of hitting AttributeError when MLflow logging is disabled.
        self._mlf_logger = None

        if not (mlflow_uri := os.environ.get("MLFLOW_TRACKING_URI")):
            self.log_to_mlflow = False
            logger.warning("MLFlow is not enabled. Turn off tracking to Mlflow.")

        if self.log_to_mlflow:
            logger.info(
                f"Setting up Mlflow experiment: {self.experiment_name}, run_name: {self.run_name}"
            )

            self._mlf_logger = MLFlowLogger(
                experiment_name=self.experiment_name,
                run_name=self.run_name,
                tracking_uri=mlflow_uri,
                log_model=True,
            )

        if not self.testing:
            os.makedirs(self.notebook_persit_dp, exist_ok=True)
        return self


args = Args().init()
print(args.model_dump_json(indent=2))

[32m2025-05-02 16:58:15.435[0m | [1mINFO    [0m | [36m__main__[0m:[36minit[0m:[36m43[0m - [1mSetting up Mlflow experiment: first-attempt, run_name: 006-sequence-modelling[0m


{
  "testing": false,
  "log_to_mlflow": true,
  "experiment_name": "first-attempt",
  "run_name": "006-sequence-modelling",
  "notebook_persit_dp": "/home/dinhln/Desktop/real_time_recsys/notebooks/data/first-attempt/006-sequence-modelling",
  "hf_dataset_name": "McAuley-Lab/Amazon-Reviews-2023",
  "amz_rating_hf_dataset_path": "0core_rating_only_Electronics",
  "amz_metadata_hf_dataset_path": "raw_meta_Electronics",
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "group_name": "seq-modelling",
  "top_K": 100,
  "top_k": 10,
  "batch_size": 64,
  "learning_rate": 0.001,
  "l2_reg": 0.0001,
  "early_stopping_patience": 10,
  "device": "cuda",
  "max_epochs": 100,
  "dropout": 0.2,
  "embedding_dim": 128,
  "train_data_fp": "/home/dinhln/Desktop/real_time_recsys/data_for_ai/interim/train_sample_interactions_16407u_neg_seq.parquet",
  "val_data_fp": "/home/dinhln/Desktop/real_time_recsys/data_for_ai/interim/val_sample_inter

## Load metadata

In [24]:
# Fetch the "full" split of the product-metadata config named in Args and
# convert it to a pandas DataFrame for the merges further down.
# NOTE(review): trust_remote_code=True allows the dataset repository's loading
# script to execute locally — keep only for sources you trust.
amz_metadata_raw = load_dataset(
    args.hf_dataset_name,
    args.amz_metadata_hf_dataset_path,
    split="full",
    trust_remote_code=True,
)
metadata_df = amz_metadata_raw.to_pandas()

## Load dataset

In [25]:
# Read the train and validation interaction frames from the parquet paths in Args.
train_df, val_df = (
    pd.read_parquet(parquet_fp)
    for parquet_fp in (args.train_data_fp, args.val_data_fp)
)

## Load ID mapper (idm)

In [26]:
# Location of the serialized ID mapper, resolved against the current working directory.
idm_path = os.path.abspath("../data_for_ai/interim/idm_16407u.json")
# NOTE(review): relies on IDMapper.load() returning the populated mapper — confirm.
idm = IDMapper().load(idm_path)
# Spot check that the mapper is usable: look up the raw user ID stored for index 1.
idm.get_user_id(1)

'AE227WAM4NWQPJI33OPN7ZARNNZQ'

## Load pretrained model

In [27]:
def init_model(n_user, n_items, embedding_dim, dropout):
    """
    Build an untrained SequenceRatingPrediction network.

    Args:
        n_user: forwarded as ``num_users``.
        n_items: forwarded as ``num_items``.
        embedding_dim: forwarded as ``embedding_dim``.
        dropout: forwarded as ``dropout``.

    Returns:
        The newly constructed SequenceRatingPrediction instance.
    """
    hyperparams = dict(
        num_users=n_user,
        num_items=n_items,
        embedding_dim=embedding_dim,
        dropout=dropout,
    )
    return SequenceRatingPrediction(**hyperparams)

In [28]:
# Size the embedding tables from the distinct users/items seen in the training frame.
# NOTE(review): these counts presumably have to match the checkpoint loaded
# afterwards — confirm they agree with the ID mapper.
user_indices = train_df[args.user_col].unique()
item_indices = train_df[args.item_col].unique()
n_users, n_items = len(user_indices), len(item_indices)

model = init_model(
    n_user=n_users,
    n_items=n_items,
    embedding_dim=args.embedding_dim,
    dropout=args.dropout,
)

In [29]:
# Derive the checkpoint location from the run's output directory instead of a
# machine-specific absolute path, so the notebook runs from any checkout.
checkpoint_path = os.path.join(
    args.notebook_persit_dp, "checkpoints", "best-checkpoint-v1.ckpt"
)

# Restore the trained weights into the freshly built `model` via the Lightning wrapper.
best_model = SeqModellingLitModule.load_from_checkpoint(model=model, checkpoint_path=checkpoint_path)

In [30]:
# Unwrap the Lightning module to reach the underlying network and move it to the
# target device. getattr keeps this cell re-runnable: on a second run `best_model`
# is already the bare network (no `.model` attribute) and is used as-is instead
# of raising AttributeError.
best_model = getattr(best_model, "model", best_model).to(args.device)
best_model.eval()

SequenceRatingPrediction(
  (item_embedding): Embedding(4818, 128, padding_idx=4817)
  (user_embedding): Embedding(16407, 128)
  (gru): GRU(128, 128, batch_first=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.2, inplace=False)
  (fc_rating): Sequential(
    (0): Linear(in_features=384, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=128, out_features=1, bias=True)
  )
)

## Sample user

In [31]:
# Fixed seed so Restart & Run All inspects the same user every time.
SAMPLE_SEED = 42
# Position of the row fed to the model within the sampled user's rows
# (negative values count from the end). Rows after it are kept for comparison.
row = -5

# Raw user ID of one randomly drawn training interaction.
sample_user_id = train_df[args.user_col].sample(1, random_state=SAMPLE_SEED).values[0]

# All training rows of that user.
# NOTE(review): assumes rows are already in chronological order — confirm upstream.
test_df = train_df.loc[lambda df: df[args.user_col] == sample_user_id]

n_rows = len(test_df)
if not -n_rows <= row < n_rows:
    raise ValueError(
        f"User {sample_user_id} has only {n_rows} rows; cannot select row {row}."
    )
# Normalise to a non-negative position so the slice below is correct for every
# valid `row` (with a negative offset, `row + 1` becomes 0 when row == -1 and
# would select the whole frame instead of nothing).
row_pos = row if row >= 0 else n_rows + row

test_sample_df = test_df.iloc[row_pos]

# Model inputs with a leading batch dimension of 1, placed on the target device.
sample_user = torch.tensor(test_sample_df["user_indice"]).int().unsqueeze(0).to(args.device)
sample_item_sequence = torch.tensor(test_sample_df["item_sequence"]).int().unsqueeze(0).to(args.device)

# Item sequences of the rows that follow the selected one.
next_item_sequence = test_df["item_sequence"].iloc[row_pos + 1:].to_list()

In [32]:
# Keep only the metadata columns that are shown next to the item IDs below.
metadata_df = metadata_df.loc[:, ["main_category", "title", "parent_asin", "categories"]]


In [33]:
# Display the item sequences of the rows that follow the sampled row.
next_item_sequence 

[array([3526., 2524., 3443.,  191., 1736., 4099., 1146., 4575., 3766.,
        3912.]),
 array([2524., 3443.,  191., 1736., 4099., 1146., 4575., 3766., 3912.,
        3831.]),
 array([3443.,  191., 1736., 4099., 1146., 4575., 3766., 3912., 3831.,
         819.]),
 array([ 191., 1736., 4099., 1146., 4575., 3766., 3912., 3831.,  819.,
        3012.])]

In [34]:
# Ask the trained network for the top 20 items for the sampled user and history.
recs = best_model.recommend(
    users=sample_user,
    item_sequences=sample_item_sequence,
    k=20,
    batch_size=1,
)

Generating recommendations:   0%|          | 0/1 [00:00<?, ?it/s]

In [35]:
# Tabulate the recommendations, translate each recommended index to its raw item
# ID through the ID mapper, then left-join the product metadata on that ID.
recs_df = (
    pd.DataFrame(recs)
    .assign(item_id=lambda df: df["recommendation"].apply(idm.get_item_id))
    .merge(metadata_df, left_on="item_id", right_on="parent_asin", how="left")
)

In [36]:
# Strip the batch dimension and drop -1 entries.
# NOTE(review): -1 is presumably the padding marker in the stored sequences — confirm.
item_seq = [idx for idx in sample_item_sequence.tolist()[0] if idx != -1]

# Construct a DataFrame from the item sequence: map indices to raw item IDs,
# then left-join the product metadata on that ID.
item_seq_df = (
    pd.DataFrame(item_seq, columns=["item_id"])
    .assign(item_id=lambda df: df["item_id"].apply(idm.get_item_id))
    .merge(metadata_df, left_on="item_id", right_on="parent_asin", how="left")
)


In [37]:
# Take the last element of every follow-up sequence as an integer index.
next_item_sequence_ = list(map(lambda seq: int(seq[-1]), next_item_sequence))

# Map those indices to raw item IDs and left-join the product metadata.
true_seq_df = (
    pd.DataFrame(next_item_sequence_, columns=["item_id"])
    .assign(item_id=lambda df: df["item_id"].apply(idm.get_item_id))
    .merge(metadata_df, left_on="item_id", right_on="parent_asin", how="left")
)

# Lift the column-width limit so long titles are shown in full.
with pd.option_context("display.max_colwidth", None):
    print("True Sequence:")
    display(true_seq_df)


True Sequence:


Unnamed: 0,item_id,main_category,title,parent_asin,categories
0,B08CZ4KZHN,Computers,"4K HDMI Cable 6ft (2-Pack) - Atevon High Speed 18Gbps HDMI 2.0 Cable - HDCP 2.2-4K HDR, 3D, 2160P, 1080P, Ethernet - 28AWG Braided HDMI Cord - Audio Return Compatible with TV, PC, Blu-ray Player",B08CZ4KZHN,"[Electronics, Television & Video, Accessories, Cables, HDMI Cables]"
1,B082SW81WD,Cell Phones & Accessories,"Soundcore Liberty Air True Wireless Earphones with Charging Case, Bluetooth 5, 28 Hour Playtime, Touch Control Earbuds, Graphene Enhanced Sound, Noise Cancelling Microphones and Secure Fit (White)",B082SW81WD,"[Electronics, Headphones, Earbuds & Accessories, Headphones & Earbuds, Earbud Headphones]"
2,B007GMPZ0A,Computers,TP-Link TL-WDN4800 N900 Dual Band Wireless PCI Express Adapter with,B007GMPZ0A,"[Electronics, Computers & Accessories, Computer Components, Internal Components, Network Cards]"
3,B073JYC4XM,Computers,"SanDisk 128GB Ultra MicroSDXC UHS-I Memory Card with Adapter - 100MB/s, C10, U1, Full HD, A1, Micro SD Card - SDSQUAR-128G-GN6MA",B073JYC4XM,"[Electronics, Computers & Accessories, Computer Accessories & Peripherals, Memory Cards, Micro SD Cards]"


In [38]:
# Show the input history and the model's recommendations one after the other,
# with the column-width limit lifted so long titles are not truncated.
with pd.option_context("display.max_colwidth", None):
    print("Item Sequence:")
    display(item_seq_df)

    print("\nRecommendations:")
    display(recs_df)

Item Sequence:


Unnamed: 0,item_id,main_category,title,parent_asin,categories
0,B0C3GSXJDJ,All Electronics,"Alphasonik ASE300BT Bluetooth Headphones, V4.0 Wireless Sport Headphones, Sweatproof Running Headset with Built in Mic for Workout Exercise IPX5 SplashProof, Ergonomically Designed for Extra Comfort",B0C3GSXJDJ,"[Electronics, Headphones, Earbuds & Accessories, Headphones & Earbuds, Earbud Headphones]"
1,B07L92L2WW,All Electronics,NVIDIA Shield TV | 4K HDR Streaming Media Player,B07L92L2WW,"[Electronics, Television & Video, Streaming Media Players]"
2,B01DE4CZJM,Musical Instruments,"Hosa CMS-105 3.5 mm TRS to 1/4"" TRS Stereo Interconnect Cable, 5 Feet",B01DE4CZJM,"[Electronics, Home Audio, Home Audio Accessories, Cables, Stereo Jack Cables]"
3,B07GZFM1ZM,Amazon Devices,Fire TV Stick 4K streaming device with Alexa Voice Remote (includes TV controls) | Dolby Vision,B07GZFM1ZM,[]
4,B0018P7WZ2,All Electronics,EIGELIU Headphone Jack Adapter Cable Car Charger Dongle AUX Audio Jack Earphone Extender Jack Stereo Cable (White),B0018P7WZ2,"[Electronics, Portable Audio & Video, MP3 & MP4 Player Accessories, Bluetooth & FM Transmitters, FM Transmitters]"
5,B00M14VAD4,Amazon Devices,TotalMount Fire TV Mounting System - Not Compatible with the New Fire TV,B00M14VAD4,"[Electronics, Accessories & Supplies, Mounts]"
6,B092JDKT8F,All Electronics,"Tribit Bluetooth Speaker, XSound Go Speaker with 16W Loud Sound & Deeper Bass, 24H Playtime, IPX7 Waterproof, Bluetooth 5.0 TWS Pairing Portable Wireless Speaker for Home, Outdoor (Upgraded)",B092JDKT8F,"[Electronics, Portable Audio & Video, Portable Speakers & Docks, Portable Bluetooth Speakers]"
7,B00BH5W848,Computers,"ViewSonic VX2370SMH-LED 23"" IPS 1080p Frameless LED Monitor HDMI, DVI, VGA",B00BH5W848,"[Electronics, Computers & Accessories, Monitors]"
8,B0BN74ZJDK,Computers,SABRENT 10 Port 60W USB 3.0 Hub with Individual Power Switches and LEDs Includes 60W 12V/5A Power Adapter (HB-BU10),B0BN74ZJDK,"[Electronics, Computers & Accessories, Computer Accessories & Peripherals, USB Hubs]"
9,B07XZLW68F,Computers,"PNY CS900 250GB 3D NAND 2.5"" SATA III Internal Solid State Drive (SSD) - (SSD7CS900-250-RB)",B07XZLW68F,"[Electronics, Computers & Accessories, Data Storage, Internal Solid State Drives]"



Recommendations:


Unnamed: 0,user_indice,recommendation,score,item_id,main_category,title,parent_asin,categories
0,16035,3188,0.982723,B0791TX5P5,Amazon Devices,"Fire TV Stick streaming device with Alexa built in, includes Alexa Voice Remote, HD, latest release",B0791TX5P5,[]
1,16035,3454,0.982309,B07H65KP63,Amazon Devices,"Echo Dot (3rd Gen, 2018 release) - Smart speaker with Alexa - Charcoal",B07H65KP63,[]
2,16035,3533,0.981896,B07LG5WBTS,Computers,"Micro Center 64GB Class 10 MicroSDXC Flash Memory Card with Adapter for Mobile Device Storage Phone, Tablet, Drone & Full HD Video Recording - 80MB/s UHS-I, C10, U1 (1 Pack)",B07LG5WBTS,"[Electronics, Computers & Accessories, Computer Accessories & Peripherals, Memory Cards, Micro SD Cards]"
3,16035,3233,0.981633,B07BF9DLSJ,Computers,"tomtoc 360° Protective Laptop Sleeve for 15.6 Inch Acer Aspire 5/Nitro 5, Lenovo IdeaPad L3, HP Pavilion/ ENVY, Dell Inspiron 16, ASUS Chromebook/VivoBook, Water-resistant Notebook Accessory Bag Case",B07BF9DLSJ,"[Electronics, Computers & Accessories, Laptop Accessories, Bags, Cases & Sleeves, Sleeves]"
4,16035,4721,0.981332,B0C2PVFRWV,Cell Phones & Accessories,"USB Type C Cable, Anker [2-Pack 6Ft] Premium Nylon USB-C to USB-A Fast Charging Type C Cable, for Samsung Galaxy S10 / S9 / S8 / Note 8, LG V20 / G5 / G6 and More(Silver)",B0C2PVFRWV,"[Electronics, Computers & Accessories, Computer Accessories & Peripherals, Cables & Accessories, Cables & Interconnects, USB Cables]"
5,16035,4204,0.98129,B09G9STJF6,Computers,"ROOFULL External CD DVD Drive USB 3.0 Premium Portable USB CD ROM DVD +/-RW Optical Drive Player Reader Writer Burner for Apple Mac MacBook Pro/Air, iMac, Windows 10/11 Laptop PC, White (Updated)",B09G9STJF6,"[Electronics, Computers & Accessories, Computer Components, External Components, Optical Drives, CD & DVD Drives]"
6,16035,3915,0.981151,B08D7JPKLZ,Amazon Devices,Echo Dot (3rd Gen) - Smart speaker with clock and Alexa - Sandstone,B08D7JPKLZ,[]
7,16035,2823,0.981056,B01MXXQKGM,,UGREEN USB Switch Selector 2 Computers Sharing 4 USB Devices USB 2.0 Peripheral Switcher Box Hub for Mouse Keyboard Scanner Printer PCs with One-Button Swapping and 2 Pack USB A to A Cable,B01MXXQKGM,"[Electronics, Computers & Accessories, Computer Components, External Components, KVM Switches]"
8,16035,3976,0.981025,B08MRTZWJ5,Amazon Devices,Ring Charging Station for Quick Release Battery Packs,B08MRTZWJ5,[]
9,16035,3590,0.980921,B07PHQ93TV,Amazon Devices,"Fire HD 10 Tablet (10.1"" 1080p full HD display, 64 GB) – Black (2019 Release)",B07PHQ93TV,[]


## Sample item sequence

In [6]:
# `pd` is already imported in the imports cell at the top of the notebook, so the
# duplicate import is dropped. Re-read the validation split from disk so this
# section starts from the stored data.
val_df = pd.read_parquet(args.val_data_fp)

In [10]:
# Raw ID of the user whose validation interactions are inspected.
INSPECT_USER_ID = "AGSP5XAQPQBUUXZHEZSC65FD7NOQ"

# Filter on the configured user column (rather than a hard-coded name) and show
# the rows with the column-width limit lifted so the sequences are fully visible.
with pd.option_context('display.max_colwidth', None):
    print("Val DataFrame:")
    display(val_df.loc[lambda df: df[args.user_col] == INSPECT_USER_ID])

Val DataFrame:


Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
0,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B004FV4ROA,1.0,2020-12-27 00:30:31.146,11295,528,"[1898, 3479, 3908, 1570, 91, 2723, 2962, 106, 3557, 4172]"
1,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B07KFQFDNB,0.0,2020-12-27 00:30:31.146,11295,3503,"[3479, 3908, 1570, 91, 2723, 2962, 106, 3557, 4172, 528]"
286,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B084H2NHNN,4.0,2021-01-06 11:29:25.850,11295,3851,"[3908, 1570, 91, 2723, 2962, 106, 3557, 4172, 528, 3503]"
287,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B00DGJSDQ0,0.0,2021-01-06 11:29:25.850,11295,1270,"[1570, 91, 2723, 2962, 106, 3557, 4172, 528, 3503, 3851]"
1342,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B00P28VN38,5.0,2021-02-18 01:09:31.028,11295,1873,"[91, 2723, 2962, 106, 3557, 4172, 528, 3503, 3851, 1270]"
1343,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B01AUR77FI,0.0,2021-02-18 01:09:31.028,11295,2455,"[2723, 2962, 106, 3557, 4172, 528, 3503, 3851, 1270, 1873]"
1736,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B08YB2CN2M,4.0,2021-03-06 07:34:00.477,11295,4062,"[2962, 106, 3557, 4172, 528, 3503, 3851, 1270, 1873, 2455]"
1737,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B07KF6WSL8,0.0,2021-03-06 07:34:00.477,11295,3502,"[106, 3557, 4172, 528, 3503, 3851, 1270, 1873, 2455, 4062]"
2810,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B07J2FGZSM,0.0,2021-04-30 04:21:05.032,11295,3476,"[3557, 4172, 528, 3503, 3851, 1270, 1873, 2455, 4062, 3502]"
2811,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B01690F5KO,5.0,2021-04-30 04:21:05.032,11295,2351,"[4172, 528, 3503, 3851, 1270, 1873, 2455, 4062, 3502, 3476]"
