In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
from loguru import logger
from typing import Literal

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.sparse as sparse
import torch.optim as optim

import pandas as pd
import plotly.express as px
from pydantic import BaseModel
from dotenv import load_dotenv
import mlflow

load_dotenv()

sys.path.insert(0, '..')

from src.viz import blueq_colors

# Controller

In [3]:
class Args(BaseModel):
    testing: bool = False
    log_to_mlflow: bool = True
    experiment_name: str = "FSDS RecSys - L5 - Reco Algo"
    run_name: str = '028-sequence-relu-and-dropout'
    notebook_persist_dp: str = None
    random_seed: int = 41

    user_col: str = 'user_id'
    item_col: str = 'parent_asin'
    rating_col: str = 'rating'
    timestamp_col: str = 'timestamp'
    
    top_K: int = 100
    top_k: int = 10

    batch_size: int = 128

    learning_task: Literal["rating", "ranking"] = "ranking"
    num_negative_samples: int = 5
    embedding_dim: int = 128
    max_input_sequence_length: int = 5
    learning_rate: float = 0.001

    def init(self):
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        
        if not os.environ.get("MLFLOW_TRACKING_URI"):
            logger.warning(
                f"Environment variable MLFLOW_TRACKING_URI is not set. Setting self.log_to_mlflow to false."
            )
            self.log_to_mlflow = False

        if self.log_to_mlflow:
            logger.info(
                f"Setting up MLflow experiment {self.experiment_name} - run {self.run_name}..."
            )
            import mlflow

            mlflow.set_experiment(self.experiment_name)
            mlflow.start_run(run_name=self.run_name)

        return self
    
args = Args().init()

print(args.model_dump_json(indent=2))

[32m2024-09-18 06:27:24.939[0m | [1mINFO    [0m | [36m__main__[0m:[36minit[0m:[36m35[0m - [1mSetting up MLflow experiment FSDS RecSys - L5 - Reco Algo - run 028-sequence-relu-and-dropout...[0m


{
  "testing": false,
  "log_to_mlflow": true,
  "experiment_name": "FSDS RecSys - L5 - Reco Algo",
  "run_name": "028-sequence-relu-and-dropout",
  "notebook_persist_dp": "/home/jupyter/frostmourne/reco-algo/notebooks/data/028-sequence-relu-and-dropout",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "top_K": 100,
  "top_k": 10,
  "batch_size": 128,
  "learning_task": "ranking",
  "num_negative_samples": 5,
  "embedding_dim": 128,
  "max_input_sequence_length": 5,
  "learning_rate": 0.001
}


# Implement

In [4]:
from src.train_utils import train, MetricLogCallback, MLflowLogCallback
# from src.model import GRUPairwiseRanking as SequencePairwiseRanking, GRURatingPrediction as SequenceRatingPrediction
from src.model import SequenceUserPairwiseRanking as SequencePairwiseRanking, SequenceUserRatingPrediction as SequenceRatingPrediction
from torch.utils.data import DataLoader
from src.dataset_loader import ItemSequenceDataset, ItemSequencePairwiseDataset

In [5]:
def init_model():
    if args.learning_task == 'ranking':
        model = SequencePairwiseRanking(n_users, n_items, embedding_dim, max_input_sequence_length=max_input_sequence_length, device=device)
    else:
        model = SequenceRatingPrediction(n_users, n_items, embedding_dim, max_input_sequence_length=max_input_sequence_length, device=device)
    return model

In [6]:
device = (
    "cuda"
    if torch.cuda.is_available()
    # else "mps"
    # if torch.backends.mps.is_available()
    else "cpu"
)
# device = 'cpu'
logger.info(f"Using {device} device")

[32m2024-09-18 06:27:25.757[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mUsing cuda device[0m


# Test implementation

In [7]:
embedding_dim = 8
max_input_sequence_length = 5
batch_size = 4

# Mock data
user_ids = [0, 0, 1, 2, 2]
item_ids = [0, 1, 2, 3, 4]
timestamps = [0, 1, 2, 3, 4]
ratings = [1, 4, 5, 3, 2]
val_timestamp = 5
n_users = len(set(user_ids))
n_items = len(set(item_ids))

val_user_ids = [0, 1, 2]
val_item_ids = [2, 1, 2]
val_timestamps = [6, 7, 8]
val_ratings = [2, 4, 5]

model = init_model()

# Example forward pass
user = torch.tensor([0])
input_sequence = torch.tensor([[0, 1]]) # simulate batch_size = 1
target_item = torch.tensor([2])
predictions = model.predict(user, input_sequence, target_item)
print(predictions)

tensor([[0.2703]], device='cuda:0', grad_fn=<AddmmBackward0>)


In [8]:
full_df = pd.DataFrame({
    "user_indice": user_ids + val_user_ids,
    "item_indice": item_ids + val_item_ids,
    args.rating_col: ratings + val_ratings,
    args.timestamp_col: timestamps + val_timestamps,
})
full_df

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,0,0,1,0
1,0,1,4,1
2,1,2,5,2
3,2,3,3,3
4,2,4,2,4
5,0,2,2,6
6,1,1,4,7
7,2,2,5,8


In [9]:
import random
import numpy as np

# Display mock dataset
print("Mock User IDs:", user_ids)
print("Mock Item IDs:", item_ids)
print("Ratings:", ratings)

if args.learning_task == 'ranking':
    rating_dataset = ItemSequencePairwiseDataset(
        full_df,
        "user_indice",
        "item_indice",
        args.rating_col,
        args.timestamp_col,
        val_timestamp=val_timestamp,
        is_train=True,
        max_input_sequence_length=args.max_input_sequence_length,
        num_negative_samples=2,
    )
    val_rating_dataset = ItemSequencePairwiseDataset(
        full_df,
        "user_indice",
        "item_indice",
        args.rating_col,
        args.timestamp_col,
        val_timestamp=val_timestamp,
        is_train=False,
        max_input_sequence_length=args.max_input_sequence_length,
        num_negative_samples=2,
    )
else:
    rating_dataset = ItemSequenceDataset(
        full_df,
        'user_indice',
        'item_indice',
        'rating',
        'timestamp',
        val_timestamp=val_timestamp,
        is_train=True
    )    
    val_rating_dataset = ItemSequenceDataset(
        full_df,
        'user_indice',
        'item_indice',
        'rating',
        'timestamp',
        val_timestamp=val_timestamp,
        is_train=False
    )

dataloader = DataLoader(rating_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_rating_dataset, batch_size=batch_size, shuffle=True)

Mock User IDs: [0, 0, 1, 2, 2]
Mock Item IDs: [0, 1, 2, 3, 4]
Ratings: [1, 4, 5, 3, 2]


In [10]:
for batch_input in dataloader:
    print(batch_input)

{'user_indice': tensor([1, 2, 0, 0]), 'item_sequence': tensor([[-1, -1, -1, -1, -1],
        [-1, -1, -1, -1, -1],
        [-1, -1, -1, -1,  0],
        [-1, -1, -1, -1, -1]]), 'target': tensor([2, 3, 1, 0]), 'rating': tensor([5., 3., 4., 1.]), 'neg_items': tensor([[4, 0],
        [1, 0],
        [0, 3],
        [4, 3]]), 'labels': tensor([[5., 5.],
        [3., 3.],
        [3., 4.],
        [1., 1.]]), 'pos_item_metadata': tensor([], size=(4, 0)), 'neg_item_metadata': tensor([], size=(4, 0))}
{'user_indice': tensor([2]), 'item_sequence': tensor([[-1, -1, -1, -1,  3]]), 'target': tensor([4]), 'rating': tensor([2.]), 'neg_items': tensor([[1, 0]]), 'labels': tensor([[2., 2.]]), 'pos_item_metadata': tensor([], size=(1, 0)), 'neg_item_metadata': tensor([], size=(1, 0))}


In [11]:
# Training loop
n_epochs = 50

train(model, dataloader, val_dataloader, epochs=n_epochs, patience=2, print_steps=1, lr=0.001, device=device, progress_bar_type='tqdm_notebook')

Epochs:   0%|          | 0/50 [00:00<?, ?it/s]

Training Epoch 1:   0%|          | 0/2 [00:00<?, ?it/s]

[32m2024-09-18 06:27:28.943[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[36mtrain[0m:[36m161[0m - [1mStep 1, Global Loss: 0.9770[0m
[32m2024-09-18 06:27:28.944[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[36mtrain[0m:[36m164[0m - [1mStep 1, Learning Rate: 0.001000[0m
[32m2024-09-18 06:27:28.945[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[36mtrain[0m:[36m167[0m - [1mStep 1, Gradient Norms: {'grad_norm_item_embedding.weight': 0.34048643708229065, 'grad_norm_user_embedding.weight': 0.1401926875114441, 'grad_norm_fc_rating.0.weight': 2.2439517974853516, 'grad_norm_fc_rating.0.bias': 0.4760269820690155, 'grad_norm_fc_rating.3.weight': 0.9475918412208557, 'grad_norm_fc_rating.3.bias': 0.0, 'total_grad_norm': 2.5090708754661515}[0m
[32m2024-09-18 06:27:28.957[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[36mtrain[0m:[36m161[0m - [1mStep 2, Global Loss: 1.1417[0m
[32m2024-09-18 06:27:28.958[0m | [1mINFO    [0m | [36msrc.train_utils[0m:

Training Epoch 2:   0%|          | 0/2 [00:00<?, ?it/s]

[32m2024-09-18 06:27:28.998[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[36mtrain[0m:[36m161[0m - [1mStep 3, Global Loss: 0.9837[0m
[32m2024-09-18 06:27:28.998[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[36mtrain[0m:[36m164[0m - [1mStep 3, Learning Rate: 0.001000[0m
[32m2024-09-18 06:27:28.999[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[36mtrain[0m:[36m167[0m - [1mStep 3, Gradient Norms: {'grad_norm_item_embedding.weight': 0.3038548529148102, 'grad_norm_user_embedding.weight': 0.28274211287498474, 'grad_norm_fc_rating.0.weight': 3.140876054763794, 'grad_norm_fc_rating.0.bias': 0.3895033299922943, 'grad_norm_fc_rating.3.weight': 1.5145330429077148, 'grad_norm_fc_rating.3.bias': 0.0, 'total_grad_norm': 3.5331142703790173}[0m
[32m2024-09-18 06:27:29.008[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[36mtrain[0m:[36m161[0m - [1mStep 4, Global Loss: 0.7313[0m
[32m2024-09-18 06:27:29.009[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[

Training Epoch 3:   0%|          | 0/2 [00:00<?, ?it/s]

[32m2024-09-18 06:27:29.045[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[36mtrain[0m:[36m161[0m - [1mStep 5, Global Loss: 0.8093[0m
[32m2024-09-18 06:27:29.046[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[36mtrain[0m:[36m164[0m - [1mStep 5, Learning Rate: 0.001000[0m
[32m2024-09-18 06:27:29.047[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[36mtrain[0m:[36m167[0m - [1mStep 5, Gradient Norms: {'grad_norm_item_embedding.weight': 0.21851514279842377, 'grad_norm_user_embedding.weight': 0.24283650517463684, 'grad_norm_fc_rating.0.weight': 2.4859731197357178, 'grad_norm_fc_rating.0.bias': 0.6722594499588013, 'grad_norm_fc_rating.3.weight': 0.6974145174026489, 'grad_norm_fc_rating.3.bias': 0.0, 'total_grad_norm': 2.687954717823398}[0m
[32m2024-09-18 06:27:29.056[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[36mtrain[0m:[36m161[0m - [1mStep 6, Global Loss: 2.0070[0m
[32m2024-09-18 06:27:29.057[0m | [1mINFO    [0m | [36msrc.train_utils[0m:

# Prep data

In [12]:
from src.id_mapper import IDMapper
from src.train_utils import map_indice

In [13]:
train_df = pd.read_parquet("../data/train.parquet")
val_df = pd.read_parquet("../data/val.parquet")
val_timestamp = 1628643414042  # https://amazon-reviews-2023.github.io/data_processing/5core.html
full_df = pd.concat([train_df, val_df], axis=0)

In [14]:
user_col = 'user_id'
item_col = 'parent_asin'
timestamp_col = 'timestamp' 
rating_col = 'rating'

In [15]:
user_ids = full_df['user_id'].values
item_ids = full_df['parent_asin'].values
unique_user_ids = list(set(user_ids))
unique_item_ids = list(set(item_ids))

logger.info(f"{len(unique_user_ids)=:,.0f}, {len(unique_item_ids)=:,.0f}")

[32m2024-09-18 06:27:29.298[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mlen(unique_user_ids)=2,000, len(unique_item_ids)=24,723[0m


In [16]:
idm = IDMapper()
idm.fit(unique_user_ids, unique_item_ids)

In [17]:
train_df = train_df.pipe(map_indice, idm)
val_df = val_df.pipe(map_indice, idm)
full_df = full_df.pipe(map_indice, idm)

if args.learning_task == 'rating':
    user_indices = [idm.get_user_index(user_id) for user_id in user_ids]
    item_indices = [idm.get_item_index(item_id) for item_id in item_ids]
    ratings = train_df[rating_col].values.tolist()

    val_user_indices = [idm.get_user_index(user_id) for user_id in val_df[user_col]]
    val_item_indices = [idm.get_item_index(item_id) for item_id in val_df[item_col]]
    val_ratings = val_df[rating_col].values.tolist()

# Train

In [18]:
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice
23,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,0920668372,5.0,1430056169000,925,9969
24,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,1589255208,5.0,1443926150000,925,5390
25,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,2764322836,5.0,1463967052000,925,9390
26,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,2764330898,5.0,1489085694000,925,23852
27,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,0062380761,5.0,1526591330983,925,1043
...,...,...,...,...,...,...
422078,AF7F5V4G3SWPRIKQEATNV7WACR6A,0062915320,5.0,1638675622205,1247,13203
422346,AF2T4ZDAXUTFGFFRDG5GA5BWQXRA,1733090312,4.0,1630014011916,311,18514
422347,AF2T4ZDAXUTFGFFRDG5GA5BWQXRA,1501128035,4.0,1630014353678,311,10699
423566,AG3A7NFV7ZKBXWF6FV3VMF6CK3BA,1101930926,5.0,1637012094603,974,20780


In [19]:
batch_size = args.batch_size

if args.learning_task == 'ranking':
    rating_dataset = ItemSequencePairwiseDataset(
        full_df,
        "user_indice",
        "item_indice",
        args.rating_col,
        args.timestamp_col,
        val_timestamp=val_timestamp,
        is_train=True,
        max_input_sequence_length=args.max_input_sequence_length,
        num_negative_samples=args.num_negative_samples,
    )
    val_rating_dataset = ItemSequencePairwiseDataset(
        full_df,
        "user_indice",
        "item_indice",
        args.rating_col,
        args.timestamp_col,
        val_timestamp=val_timestamp,
        is_train=False,
        max_input_sequence_length=args.max_input_sequence_length,
        num_negative_samples=args.num_negative_samples,
    )
else:
    rating_dataset = ItemSequenceDataset(
        full_df,
        'user_id',
        'item_id',
        'rating',
        'timestamp',
        val_timestamp=val_timestamp,
        is_train=True
    )    
    val_rating_dataset = ItemSequenceDataset(
        full_df,
        'user_id',
        'item_id',
        'rating',
        'timestamp',
        val_timestamp=val_timestamp,
        is_train=False
    )

dataloader = DataLoader(rating_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_rating_dataset, batch_size=batch_size, shuffle=True)

In [20]:
embedding_dim = args.embedding_dim
max_input_sequence_length = args.max_input_sequence_length
n_items = len(unique_item_ids)
n_users = len(unique_user_ids)

model = init_model()

#### Predict before train

In [21]:
val_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice
4,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,0451450523,2.0,1635710722120,925,21664
293,AFG6YQ3GOY7TVFKQ3SKDVS6Q6RDQ,B08CV9SPDQ,4.0,1635609140286,1566,4725
294,AFG6YQ3GOY7TVFKQ3SKDVS6Q6RDQ,B07R3QYGHY,4.0,1657998389024,1566,12891
763,AFBXVB2GIANS2DHWDK3HXISL2WEA,1291332162,5.0,1651000430747,496,13967
1205,AGSGLHB6G6QSTSIXWRD6ZZ7V5VZA,B0C8GJYMNH,5.0,1656800368338,1559,15772
...,...,...,...,...,...,...
422078,AF7F5V4G3SWPRIKQEATNV7WACR6A,0062915320,5.0,1638675622205,1247,13203
422346,AF2T4ZDAXUTFGFFRDG5GA5BWQXRA,1733090312,4.0,1630014011916,311,18514
422347,AF2T4ZDAXUTFGFFRDG5GA5BWQXRA,1501128035,4.0,1630014353678,311,10699
423566,AG3A7NFV7ZKBXWF6FV3VMF6CK3BA,1101930926,5.0,1637012094603,974,20780


In [22]:
user_id = 'AHXBL3QDWZGJYH7A5CMPFNUPMF7Q'
val_df.loc[lambda df: df['user_id'].eq(user_id)]

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice
4,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,451450523,2.0,1635710722120,925,21664


In [23]:
timestamp = 1635710722120
full_df.loc[lambda df: df['user_id'].eq(user_id) & df['timestamp'].lt(timestamp)].sort_values(timestamp_col)

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice
23,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,920668372,5.0,1430056169000,925,9969
24,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,1589255208,5.0,1443926150000,925,5390
25,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,2764322836,5.0,1463967052000,925,9390
26,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,2764330898,5.0,1489085694000,925,23852
27,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,62380761,5.0,1526591330983,925,1043
28,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,385467974,1.0,1613173700911,925,12480
29,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,765357151,2.0,1622466348378,925,11136


In [24]:
item_id = '0451450523'
user_indice = idm.get_user_index(user_id)
item_indice = idm.get_item_index(item_id)
target = [idm.get_item_index(item_id)]
item_sequence = rating_dataset.get_item_sequence(user_indice, timestamp).reshape(1, -1)
item_sequence

array([[ 9390, 23852,  1043, 12480, 11136]])

In [25]:
model.predict([user_indice], item_sequence, target)

tensor([[-0.2168]], device='cuda:0', grad_fn=<AddmmBackward0>)

#### Training loop

In [None]:
n_epochs = 50

metric_log_cb = MetricLogCallback()
mlflow_log_cb = MLflowLogCallback()

train(
    model,
    dataloader,
    val_dataloader,
    epochs=n_epochs,
    patience=3,
    print_steps=100,
    lr=args.learning_rate,
    gradient_clipping=False,
    device=device,
    progress_bar_type='tqdm_notebook',
    callbacks=[metric_log_cb.process_payload, mlflow_log_cb.process_payload]
)

Epochs:   0%|          | 0/50 [00:00<?, ?it/s]

Training Epoch 1:   0%|          | 0/237 [00:00<?, ?it/s]

[32m2024-09-18 06:28:06.152[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[36mtrain[0m:[36m161[0m - [1mStep 100, Global Loss: 1.0143[0m
[32m2024-09-18 06:28:06.154[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[36mtrain[0m:[36m164[0m - [1mStep 100, Learning Rate: 0.001000[0m
[32m2024-09-18 06:28:06.155[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[36mtrain[0m:[36m167[0m - [1mStep 100, Gradient Norms: {'grad_norm_item_embedding.weight': 0.03459254279732704, 'grad_norm_user_embedding.weight': 0.023554233834147453, 'grad_norm_fc_rating.0.weight': 1.3074274063110352, 'grad_norm_fc_rating.0.bias': 0.05893808230757713, 'grad_norm_fc_rating.3.weight': 0.8521343469619751, 'grad_norm_fc_rating.3.bias': 0.0, 'total_grad_norm': 1.5622818284612754}[0m
[32m2024-09-18 06:28:42.342[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[36mtrain[0m:[36m161[0m - [1mStep 200, Global Loss: 1.0086[0m
[32m2024-09-18 06:28:42.343[0m | [1mINFO    [0m | [36msrc.train_

Training Epoch 2:   0%|          | 0/237 [00:00<?, ?it/s]

[32m2024-09-18 06:29:21.224[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[36mtrain[0m:[36m161[0m - [1mStep 300, Global Loss: 0.9445[0m
[32m2024-09-18 06:29:21.225[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[36mtrain[0m:[36m164[0m - [1mStep 300, Learning Rate: 0.001000[0m
[32m2024-09-18 06:29:21.226[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[36mtrain[0m:[36m167[0m - [1mStep 300, Gradient Norms: {'grad_norm_item_embedding.weight': 0.0350610613822937, 'grad_norm_user_embedding.weight': 0.02117985486984253, 'grad_norm_fc_rating.0.weight': 0.9963933825492859, 'grad_norm_fc_rating.0.bias': 0.05379411205649376, 'grad_norm_fc_rating.3.weight': 0.8388879895210266, 'grad_norm_fc_rating.3.bias': 0.0, 'total_grad_norm': 1.3042639696473044}[0m
[32m2024-09-18 06:29:57.139[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[36mtrain[0m:[36m161[0m - [1mStep 400, Global Loss: 0.9499[0m
[32m2024-09-18 06:29:57.140[0m | [1mINFO    [0m | [36msrc.train_ut

Training Epoch 3:   0%|          | 0/237 [00:00<?, ?it/s]

[32m2024-09-18 06:30:35.972[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[36mtrain[0m:[36m161[0m - [1mStep 500, Global Loss: 0.8902[0m
[32m2024-09-18 06:30:35.973[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[36mtrain[0m:[36m164[0m - [1mStep 500, Learning Rate: 0.001000[0m
[32m2024-09-18 06:30:35.974[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[36mtrain[0m:[36m167[0m - [1mStep 500, Gradient Norms: {'grad_norm_item_embedding.weight': 0.04008409008383751, 'grad_norm_user_embedding.weight': 0.024294305592775345, 'grad_norm_fc_rating.0.weight': 1.041199803352356, 'grad_norm_fc_rating.0.bias': 0.06197928637266159, 'grad_norm_fc_rating.3.weight': 0.8244413733482361, 'grad_norm_fc_rating.3.bias': 0.0, 'total_grad_norm': 1.3303529562077347}[0m
[32m2024-09-18 06:31:12.603[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[36mtrain[0m:[36m161[0m - [1mStep 600, Global Loss: 0.8864[0m
[32m2024-09-18 06:31:12.605[0m | [1mINFO    [0m | [36msrc.train_u

Training Epoch 4:   0%|          | 0/237 [00:00<?, ?it/s]

[32m2024-09-18 06:32:27.593[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[36mtrain[0m:[36m161[0m - [1mStep 800, Global Loss: 0.8061[0m
[32m2024-09-18 06:32:27.594[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[36mtrain[0m:[36m164[0m - [1mStep 800, Learning Rate: 0.001000[0m
[32m2024-09-18 06:32:27.595[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[36mtrain[0m:[36m167[0m - [1mStep 800, Gradient Norms: {'grad_norm_item_embedding.weight': 0.05636896565556526, 'grad_norm_user_embedding.weight': 0.03619137033820152, 'grad_norm_fc_rating.0.weight': 1.2539727687835693, 'grad_norm_fc_rating.0.bias': 0.06464296579360962, 'grad_norm_fc_rating.3.weight': 0.6900045275688171, 'grad_norm_fc_rating.3.bias': 0.0, 'total_grad_norm': 1.4343012032062257}[0m
[32m2024-09-18 06:33:03.900[0m | [1mINFO    [0m | [36msrc.train_utils[0m:[36mtrain[0m:[36m161[0m - [1mStep 900, Global Loss: 0.8162[0m
[32m2024-09-18 06:33:03.902[0m | [1mINFO    [0m | [36msrc.train_u

Training Epoch 5:   0%|          | 0/237 [00:00<?, ?it/s]

# Visualize training

In [None]:
step_metrics = [p for p in metric_log_cb.payloads if 'step' in p]
epoch_metrics = [p for p in metric_log_cb.payloads if 'epoch' in p]
step_metrics_df = pd.DataFrame(step_metrics)
step_metrics_df

In [None]:
epoch_metrics_df = pd.DataFrame(epoch_metrics)
epoch_metrics_df = epoch_metrics_df.pipe(lambda df: pd.melt(df, id_vars=["epoch"], var_name="loss_type", value_name="value"))
epoch_metrics_df

In [None]:
from src.eval import plot_metric, plot_train_vs_val_loss

plot_metric(step_metrics_df, col='global_loss')
plot_metric(step_metrics_df, col='learning_rate')
plot_metric(step_metrics_df, col='total_grad_norm')

In [None]:
plot_train_vs_val_loss(epoch_metrics_df, height=500)

# Evaluate

## Ranking metrics

In [None]:
user_col = args.user_col
item_col = args.item_col
rating_col = args.rating_col

In [None]:
from src.eval import create_label_df, create_rec_df, merge_recs_with_target

In [None]:
val_user_indices = val_df['user_indice'].unique()
val_timestamps = np.array([val_timestamp] * len(val_user_indices))
val_item_sequences = np.array([rating_dataset.get_item_sequence(val_user_indices[i], val_timestamps[i]) for i in range(len(val_user_indices))])

In [None]:
user = val_user_indices[0]
user

In [None]:
input_seq = val_item_sequences[0]
input_seq

In [None]:
model.predict([user], np.array([input_seq]), [22042])

In [None]:
recommendations = model.recommend(
    val_user_indices,
    user_item_sequences=val_item_sequences,
    k=args.top_K,
    batch_size=4,
    progress_bar_type='tqdm_notebook'
)

In [None]:
recommendations_df = pd.DataFrame(recommendations).pipe(create_rec_df, idm)
recommendations_df

In [None]:
label_df = create_label_df(val_df)
label_df

In [None]:
eval_df = merge_recs_with_target(recommendations_df, label_df, k=args.top_K)
eval_df

In [None]:
recommendations_df.sample(10)

In [None]:
recommendations_df.loc[lambda df: df['user_id'].eq('AELCDBVDWEXMFQSCLNEMM2KGDZDQ')]

In [None]:
recommendations_df.loc[lambda df: df['user_id'].eq('AH5ZE3FDA4UALJY4VU6RNF74DLDQ')]

### Visualize

In [None]:
from evidently.pipeline.column_mapping import ColumnMapping
from evidently.report import Report
from evidently.metrics import PrecisionTopKMetric
from evidently.metrics import RecallTopKMetric
from evidently.metrics import FBetaTopKMetric
from evidently.metrics import NDCGKMetric
from evidently.metrics import PersonalizationMetric
import warnings

warnings.filterwarnings(
    action='ignore',
    category=FutureWarning,
    module=r'evidently.metrics.recsys.precision_recall_k'
)

from src.viz import color_scheme

In [None]:
column_mapping = ColumnMapping(
    recommendations_type='rank',
    target=rating_col,
    prediction='rec_ranking',
    item_id=item_col,
    user_id=user_col
)

report = Report(metrics=[
    NDCGKMetric(k=args.top_k),
    RecallTopKMetric(k=args.top_K),
    PrecisionTopKMetric(k=args.top_k),
    FBetaTopKMetric(k=args.top_k),
    PersonalizationMetric(k=args.top_k),
], options=[color_scheme])

report.run(
    reference_data=None,
    current_data=eval_df,
    column_mapping=column_mapping
)

evidently_report_fp = f"{args.notebook_persist_dp}/evidently_report.html"
os.makedirs(args.notebook_persist_dp, exist_ok=True)
report.save_html(evidently_report_fp)

if args.log_to_mlflow:
    mlflow.log_artifact(evidently_report_fp)
    for metric_result in report.as_dict()['metrics']:
        metric = metric_result['metric']
        if metric == 'PersonalizationMetric':
            metric_value = float(metric_result['result']['current_value'])
            mlflow.log_metric(f"val_{metric}", metric_value)
            continue
        result = metric_result['result']['current'].to_dict()
        for kth, metric_value in result.items():
            mlflow.log_metric(f"val_{metric}_at_k_as_step", metric_value, step=kth)

# Predict

In [None]:
full_df

In [None]:
user_id = 'AHXBL3QDWZGJYH7A5CMPFNUPMF7Q'
val_df.loc[lambda df: df['user_id'].eq(user_id)]

In [None]:
timestamp = 1635710722120
full_df.loc[lambda df: df['user_id'].eq(user_id) & df['timestamp'].lt(timestamp)].sort_values(timestamp_col)

In [None]:
item_id = '0451450523'
user_indice = idm.get_user_index(user_id)
item_indice = idm.get_item_index(item_id)
target = [idm.get_item_index(item_id)]
item_sequence = rating_dataset.get_item_sequence(user_indice, timestamp).reshape(1, -1)
item_sequence

In [None]:
model.predict([user_indice], item_sequence, target)

# Clean up

In [None]:
all_params = [args]

if args.log_to_mlflow:
    for params in all_params:
        params_dict = params.dict()
        params_ = {f"{params.__repr_name__()}.{k}": v for k, v in params_dict.items()}
        mlflow.log_params(params_)

    mlflow.end_run()