## Setup

In [20]:
%load_ext autoreload
%autoreload 2
%load_ext tensorboard

In [23]:
import pandas as pd
import numpy as np
from pydantic import BaseModel
import sys
import os
from lightning.pytorch.loggers import MLFlowLogger
from loguru import logger
from load_dotenv import load_dotenv
import time
import json
import torch
from torch.utils.data import DataLoader
import lightning as L
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.callbacks import ModelCheckpoint
import mlflow

sys.path.insert(0, "..")

from src.utils.embedding_id_mapper import IDMapper
from src.algo.two_tower.model import TwoTowerRating
from src.algo.two_tower.dataset import UserItemRatingDFDataset
from src.algo.two_tower.trainer import TwoTowerLitModule
from src.eval.utils import create_rec_df, create_label_df, merge_recs_with_target
from src.eval.log_metrics import log_ranking_metrics, log_classification_metrics

In [2]:
load_dotenv(override = True)

True

In [None]:
hidden_dim = 128
embedding_dim: int = 128

In [None]:
class Args(BaseModel):
    testing: bool = False
    log_to_mlflow: bool = True
    experiment_name: str = "first-attempt"
    run_name: str = f"005-two-tower-{embedding_dim}-{hidden_dim}"
    notebook_persit_dp: str = None
    
    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"
    group_name: str = "two-tower"

    top_K: int = 100
    top_k: int = 10

    batch_size: int = 64
    embedding_dim: int = embedding_dim
    learning_rate: float = 0.001
    l2_reg: float = 1e-4
    early_stopping_patience: int = 10
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    max_epochs: int = 100

    # TwoTower specific
    hidden_dim: int = hidden_dim
    dropout: float = 0.2

    train_data_fp: str = os.path.abspath("../data_for_ai/interim/train_sample_interactions_16407u_neg.parquet")
    val_data_fp: str = os.path.abspath("../data_for_ai/interim/val_sample_interactions_16407u_neg.parquet")

    def init(self):
        self.notebook_persit_dp = os.path.abspath(f"data/{self.experiment_name}/{self.run_name}")

        if not (mlflow_uri := os.environ.get("MLFLOW_TRACKING_URI")):
            self.log_to_mlflow = False
            logger.warning("MLFlow is not enabled. Turn off tracking to Mlflow.")

        if self.log_to_mlflow:
            logger.info(
                f"Setting up Mlflow experiment: {self.experiment_name}, run_name: {self.run_name}"
            )

            self._mlf_logger = MLFlowLogger(
                experiment_name=self.experiment_name,
                run_name=self.run_name,
                tracking_uri=mlflow_uri,
                log_model=True,
            )

        if not self.testing:
            os.makedirs(self.notebook_persit_dp, exist_ok=True)
        return self
    
args = Args().init()
print(args.model_dump_json(indent=2))

[32m2025-04-25 03:12:33.242[0m | [1mINFO    [0m | [36m__main__[0m:[36minit[0m:[36m40[0m - [1mSetting up Mlflow experiment: first-attempt, run_name: 005-two-tower-dim256[0m


{
  "testing": false,
  "log_to_mlflow": true,
  "experiment_name": "first-attempt",
  "run_name": "005-two-tower-dim256",
  "notebook_persit_dp": "/home/dinhln/Desktop/real_time_recsys/notebooks/data/first-attempt/005-two-tower-dim256",
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "group_name": "two-tower",
  "top_K": 100,
  "top_k": 10,
  "batch_size": 64,
  "embedding_dim": 128,
  "learning_rate": 0.001,
  "l2_reg": 0.0001,
  "early_stopping_patience": 10,
  "device": "cuda",
  "max_epochs": 100,
  "hidden_dim": 256,
  "dropout": 0.2,
  "train_data_fp": "/home/dinhln/Desktop/real_time_recsys/data_for_ai/interim/train_sample_interactions_16407u_neg.parquet",
  "val_data_fp": "/home/dinhln/Desktop/real_time_recsys/data_for_ai/interim/val_sample_interactions_16407u_neg.parquet"
}


In [4]:
train_df = pd.read_parquet(args.train_data_fp)
val_df = pd.read_parquet(args.val_data_fp)

assert set(val_df[args.user_col].unique()).issubset(set(train_df[args.user_col].unique())), "Validation users must be present in training users."

assert set(val_df[args.item_col].unique()).issubset(set(train_df[args.item_col].unique())), "Validation items must be present in training items."
assert train_df[args.timestamp_col].max() < val_df[args.timestamp_col].min(), "Validation data must be after training data. Otherwise, its a data contamination problem."

In [5]:
train_df.head(3)

Unnamed: 0,user_id,parent_asin,rating,timestamp
151343,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B00PKKM2HO,0.0,2017-06-10 00:30:32.698
40958,AEMPVT2U6BIHQDV52BDEDDKPH4HA,B01BCWKBZI,2.0,2017-08-03 00:40:30.172
218918,AF3CKYP3BTJ7MEKU6J64BS57MQBA,B002HWRJBM,0.0,2018-12-08 16:57:03.101


## Convert user_id and item_id into indices

In [6]:
idm_path = os.path.abspath("../data_for_ai/interim/idm_16407u.json")
idm = IDMapper().load(idm_path)
idm.get_user_id(1)

'AE227WAM4NWQPJI33OPN7ZARNNZQ'

In [7]:
train_df = train_df.pipe(idm.map_indices)
val_df = val_df.pipe(idm.map_indices)

assert idm.unknown_item_index not in train_df["item_indice"].values, "Unknown item index must be present in training data."
assert idm.unknown_user_index not in train_df["user_indice"].values, "Unknown user index must be present in training data."
assert idm.unknown_item_index not in val_df["item_indice"].values, "Unknown item index must be present in validation data."
assert idm.unknown_user_index not in val_df["user_indice"].values, "Unknown user index must be present in validation data."

In [8]:
train_df.head(3)

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice
151343,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B00PKKM2HO,0.0,2017-06-10 00:30:32.698,2546,1890
40958,AEMPVT2U6BIHQDV52BDEDDKPH4HA,B01BCWKBZI,2.0,2017-08-03 00:40:30.172,2416,2467
218918,AF3CKYP3BTJ7MEKU6J64BS57MQBA,B002HWRJBM,0.0,2018-12-08 16:57:03.101,4292,311


In [9]:
assert train_df.groupby(args.user_col)[args.item_col].nunique().min() >= 5, "Each user must have at least five items."
assert train_df.groupby(args.item_col)[args.user_col].nunique().min() >= 10, "Each item must have at least ten users."

## Init model

In [10]:
def init_model(n_user, n_items, embedding_dim, hidden_dim, dropout):
    """
    Initialize the model with the given parameters.
    """
    model = TwoTowerRating(
        num_users = n_user,
        num_items = n_items,
        embedding_dim = embedding_dim,
        hidden_units_dim = hidden_dim,
        dropout = dropout,
    )
    return model

## Test

In [None]:
# embedding_dim = 8
# batch_size = 2
# hidden_dim = 8
# dropout = 0.2

# # Mock data
# user_indices = [0, 0, 1, 2, 2]
# item_indices = [0, 1, 2, 3, 4]
# timestamps = [0, 1, 2, 3, 4]
# ratings = [0, 4, 5, 3, 0]

# n_users = len(set(user_indices))
# n_items = len(set(item_indices))

# train_test_df = pd.DataFrame(
#     {
#         "user_indice": user_indices,
#         "item_indice": item_indices,
#         args.timestamp_col: timestamps,
#         args.rating_col: ratings,
#     }
# )

# model = init_model(n_users, n_items, embedding_dim, hidden_dim, dropout)

# # Example forward pass
# model.eval()
# user = torch.tensor([0])
# target_item = torch.tensor([2])
# predictions = model.predict(user, target_item)
# print(predictions)

tensor([0.4973], grad_fn=<SigmoidBackward0>)


In [None]:
# dataset = UserItemRatingDFDataset(
#     df = train_test_df,
#     user_col = "user_indice",
#     item_col = "item_indice",
#     rating_col = args.rating_col,
#     timestamp_col = args.timestamp_col,
# )

# train_loader = DataLoader(
#     dataset,
#     batch_size = batch_size,
#     shuffle = False,
#     drop_last= True
# )

In [None]:
# for batch_input in train_loader:
#     print(batch_input)

{'user': tensor([0, 0]), 'item': tensor([0, 1]), 'rating': tensor([0., 4.])}
{'user': tensor([1, 2]), 'item': tensor([2, 3]), 'rating': tensor([5., 3.])}


In [None]:
# # model
# lit_model = TwoTowerLitModule(model, log_dir=args.notebook_persit_dp, top_K = 3,top_k=3, idm = idm)
# # train model
# trainer = L.Trainer(
#     default_root_dir=f"{args.notebook_persit_dp}/test",
#     max_epochs=150,
#     accelerator="auto",
#     log_every_n_steps=1,
# )
# trainer.fit(
#     model=lit_model, train_dataloaders=train_loader, val_dataloaders=train_loader
# )

## Training loop

In [15]:
rating_dataset = UserItemRatingDFDataset(
    train_df, "user_indice", "item_indice", args.rating_col, args.timestamp_col
)
val_rating_dataset = UserItemRatingDFDataset(
    val_df, "user_indice", "item_indice", args.rating_col, args.timestamp_col
)

train_loader = DataLoader(
    rating_dataset, batch_size=args.batch_size, shuffle=True, drop_last=True
)
val_loader = DataLoader(
    val_rating_dataset, batch_size=args.batch_size, shuffle=False, drop_last=False
)

In [16]:
for i in train_loader:
    print(i)
    break

{'user': tensor([ 7483,  4021,  1296, 15419, 12057,  6812,  4095,  5706,  2092,  6885,
        16036, 13288,  3382,  4662,  3900,  8156, 12989,  4953,  6052,  7323,
        10355,  1383, 11643,  7412, 15915, 10093, 13050,  3121,  6619,  7057,
         2491, 15353,  3951, 12849,  7152, 15301, 11705,  1249,  1280,    60,
        15688, 16358, 13393, 10492, 15331,  9958,  4044,  2939,  1796, 12420,
         2733, 13908,  4242,   948,  4870,  2173,  6143,  7029, 11332,  2015,
        11615,  3939,  6709,  6255]), 'item': tensor([2069, 2290, 3469,  193,  667, 1844,  161, 2135, 2576, 1263, 3870, 3078,
        3188, 2179, 1925, 3032, 4240, 3125, 2130, 3612,  445, 4357, 4200, 4190,
        3313, 2849, 2097, 1903, 4123, 4121,  615, 3590,  164, 3303, 3923, 1844,
        2224, 4503, 3414, 2949, 1232, 3188, 2779, 2232,  445, 3411, 1217, 4039,
         139, 3130, 2003, 3373, 2576, 4404, 2725, 4710, 3839, 4616, 1706, 2575,
        4546,   30, 3411, 1067]), 'rating': tensor([0., 5., 0., 0., 0., 5., 0

In [17]:
item_indices = train_df[args.item_col].unique()
user_indices = train_df[args.user_col].unique()
n_items = len(item_indices)
n_users = len(user_indices)

logger.info(f"Number of users: {n_users}, Number of items: {n_items}")
model = init_model(n_users, n_items, args.embedding_dim, args.hidden_dim, args.dropout)

[32m2025-04-25 03:12:40.735[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mNumber of users: 16407, Number of items: 4817[0m


## Overfir 1 batch

In [None]:
# early_stopping = EarlyStopping(
#     monitor="val_loss", patience=5, mode="min", verbose=False
# )

# model = init_model(n_users, n_items, args.embedding_dim, args.hidden_dim, args.dropout)
# lit_model = TwoTowerLitModule(
#     model,
#     learning_rate=args.learning_rate,
#     l2_reg=args.l2_reg,
#     log_dir=args.notebook_persit_dp,
#     accelerator=args.device,
#     idm= idm
# )

# log_dir = f"{args.notebook_persit_dp}/logs/overfit"

# # train model
# trainer = L.Trainer(
#     default_root_dir=log_dir,
#     accelerator=args.device if args.device else "auto",
#     max_epochs=300,
#     overfit_batches=1,
#     callbacks=[early_stopping],
# )
# trainer.fit(
#     model=lit_model,
#     train_dataloaders=train_loader,
#     val_dataloaders=train_loader,
# )
# logger.info(f"Logs available at {trainer.log_dir}")

## Run on all data

In [19]:
early_stopping = EarlyStopping(
    monitor="val_loss", patience=5, mode="min", verbose=False, min_delta=0.025
)

checkpoint_callback = ModelCheckpoint(
    dirpath=f"{args.notebook_persit_dp}/checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    monitor="val_loss",
    mode="min",
)

model = init_model(n_users, n_items, args.embedding_dim, args.hidden_dim, args.dropout)
lit_model = TwoTowerLitModule(
    model,
    learning_rate=args.learning_rate,
    l2_reg=args.l2_reg,
    log_dir=args.notebook_persit_dp,
    accelerator=args.device,
    idm= idm
)

log_dir = f"{args.notebook_persit_dp}/logs/run"

# train model
trainer = L.Trainer(
    default_root_dir=log_dir,
    accelerator=args.device if args.device else "auto",
    max_epochs=args.max_epochs,
    callbacks=[early_stopping, checkpoint_callback],
    logger=args._mlf_logger if args.log_to_mlflow else None,
)
trainer.fit(
    model=lit_model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

# Change the library as a workaround for the issue in the latest Lightning release
#https://github.com/Lightning-AI/pytorch-lightning/pull/20669/commits/429f732a0528c558e701da7ec01e51c1e2e4f32e

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type           | Params | Mode 
-------------------------------------------------
0 | model | TwoTowerRating | 2.8 M  | train
-------------------------------------------------
2.8 M     Trainable params
0         Non-trainable params
2.8 M     Total params
11.131    Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]


The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[32m2025-04-25 03:28:20.444[0m | [1mINFO    [0m | [36msrc.algo.two_tower.trainer[0m:[36mon_fit_end[0m:[36m118[0m - [1mLogging classification metrics...[0m
[32m2025-04-25 03:28:21.456[0m | [1mINFO    [0m | [36msrc.algo.two_tower.trainer[0m:[36mon_fit_end[0m:[36m121[0m - [1mLogging ranking metrics...[0m


Generating recommendations:   0%|          | 0/606 [00:00<?, ?it/s]

[32m2025-04-25 03:28:26.522[0m | [1mINFO    [0m | [36msrc.algo.two_tower.trainer[0m:[36mon_fit_end[0m:[36m124[0m - [1mEvidently metrics are available at: /home/dinhln/Desktop/real_time_recsys/notebooks/data/first-attempt/005-two-tower-dim256[0m


🏃 View run 005-two-tower-dim256 at: http://localhost:5002/#/experiments/9/runs/1aa40b57c3814b78a556bf41b83fc40c
🧪 View experiment at: http://localhost:5002/#/experiments/9


## Clean up

In [25]:
all_params = [args]

if args.log_to_mlflow:
    run_id = trainer.logger.run_id

    with mlflow.start_run(run_id=run_id):
        for params in all_params:
            params_dict = params.model_dump()
            params_ = dict()
            for k, v in params_dict.items():
                if k == "top_K":
                    k = "top_big_K"
                if k == "top_k":
                    k = "top_small_k"
                params_[f"{params.__repr_name__()}.{k}"] = v
            mlflow.log_params(params_)

🏃 View run 005-two-tower-dim256 at: http://localhost:5002/#/experiments/9/runs/1aa40b57c3814b78a556bf41b83fc40c
🧪 View experiment at: http://localhost:5002/#/experiments/9
