In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to project code are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import json
import os
import sys
import time
from typing import Optional

import lightning as L
import mlflow
import numpy as np
import pandas as pd
import torch
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.loggers import MLFlowLogger
from load_dotenv import load_dotenv
from loguru import logger
from pydantic import BaseModel
from torch.utils.data import DataLoader


# Put the parent of the working directory first on the import path so the
# `src.*` imports below resolve.
sys.path.insert(0, "..")


from src.utils.embedding_id_mapper import IDMapper 
from src.utils.sequence import generate_item_sequences

In [12]:
class Args(BaseModel):
    """Notebook configuration: column names, input file paths and sequence length.

    The path defaults are computed once, when this class body is executed, so
    the relative ``../data_for_ai`` locations are resolved against the working
    directory at that moment.
    """

    # Not read by any cell visible in this notebook. Declared Optional because
    # its default is None; with a plain `str` annotation an explicitly passed
    # None would be rejected by validation.
    notebook_persit_dp: Optional[str] = None

    # Column names of the interaction frames.
    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    # Input parquet files holding the train / validation interactions.
    train_data_fp: str = os.path.abspath("../data_for_ai/interim/train_sample_interactions_16407u_neg.parquet")
    val_data_fp: str = os.path.abspath("../data_for_ai/interim/val_sample_interactions_16407u_neg.parquet")

    # Number of items kept in each generated item sequence.
    sequence_length: int = 10


args = Args()
# Echo the effective configuration so it is recorded with the notebook output.
print(args.model_dump_json(indent=2))

{
  "notebook_persit_dp": null,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "train_data_fp": "/home/dinhln/Desktop/real_time_recsys/data_for_ai/interim/train_sample_interactions_16407u_neg.parquet",
  "val_data_fp": "/home/dinhln/Desktop/real_time_recsys/data_for_ai/interim/val_sample_interactions_16407u_neg.parquet",
  "sequence_length": 10
}


## Read df

In [4]:
# Load the train and validation interaction frames from the parquet files
# configured in `args`.
train_df, val_df = (
    pd.read_parquet(parquet_fp)
    for parquet_fp in (args.train_data_fp, args.val_data_fp)
)

In [5]:
# Preview the loaded training interactions (pandas truncates the rendering).
train_df

Unnamed: 0,user_id,parent_asin,rating,timestamp
151343,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B00PKKM2HO,0.0,2017-06-10 00:30:32.698
40958,AEMPVT2U6BIHQDV52BDEDDKPH4HA,B01BCWKBZI,2.0,2017-08-03 00:40:30.172
218918,AF3CKYP3BTJ7MEKU6J64BS57MQBA,B002HWRJBM,0.0,2018-12-08 16:57:03.101
43115,AE7IGXXTK7XTWRJGLIAL5BJDTEAQ,B005L38VRU,5.0,2014-09-04 02:03:39.000
233421,AFEJ5GRYG2PQD6EWSAKVG56XMKNA,B00JO80LUI,0.0,2016-09-14 16:29:39.000
...,...,...,...,...
250960,AGFRYVIF7CVPOK777KN3PSOSWSMA,B086QGXBRW,0.0,2013-12-17 03:19:23.000
217058,AGMAUSEXCG2JEGI245KGJJYHOWBQ,B0BGS23YKX,0.0,2019-04-30 00:21:36.489
61324,AGGEMMEOSRGTGESZ56F7ESETFRHQ,B00U3FPN4U,5.0,2017-07-22 01:23:03.787
132003,AGU6EIWIZSV6AIQSAVRDHTIJCHPA,B003PEUA30,0.0,2020-10-27 16:09:49.492


## Convert user_id and item_id to indices

In [6]:
# Location of the persisted ID mapper JSON, resolved against the working directory.
idm_path = os.path.abspath("../data_for_ai/interim/idm_16407u.json")
# NOTE(review): presumably `load` returns the populated mapper — confirm in IDMapper.
idm = IDMapper().load(idm_path)
# Sanity check: look up the raw user id stored for index 1.
idm.get_user_id(1)

'AE227WAM4NWQPJI33OPN7ZARNNZQ'

In [7]:
# Run both frames through the ID mapper; the resulting frames carry the
# `user_indice` / `item_indice` columns used by the rest of the notebook.
train_df = train_df.pipe(idm.map_indices)
val_df = val_df.pipe(idm.map_indices)

# Every user and item is expected to be known to the mapper, so the mapper's
# "unknown" placeholder indices must not appear in either split.
for split_name, split_df in (("training", train_df), ("validation", val_df)):
    assert idm.unknown_item_index not in split_df["item_indice"].values, (
        f"Unknown item index must not be present in {split_name} data."
    )
    assert idm.unknown_user_index not in split_df["user_indice"].values, (
        f"Unknown user index must not be present in {split_name} data."
    )

In [8]:
# Confirm the mapped frame now has the `user_indice` / `item_indice` columns.
train_df.head()

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice
151343,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B00PKKM2HO,0.0,2017-06-10 00:30:32.698,2546,1890
40958,AEMPVT2U6BIHQDV52BDEDDKPH4HA,B01BCWKBZI,2.0,2017-08-03 00:40:30.172,2416,2467
218918,AF3CKYP3BTJ7MEKU6J64BS57MQBA,B002HWRJBM,0.0,2018-12-08 16:57:03.101,4292,311
43115,AE7IGXXTK7XTWRJGLIAL5BJDTEAQ,B005L38VRU,5.0,2014-09-04 02:03:39.000,728,689
233421,AFEJ5GRYG2PQD6EWSAKVG56XMKNA,B00JO80LUI,0.0,2016-09-14 16:29:39.000,5481,1611


## Split positive and negative interactions

In [9]:
# Positive interactions are the rows with a rating above zero. Each row is
# tagged with the split it came from so the splits can be separated again later.
is_pos_train = train_df[args.rating_col].gt(0)
is_pos_val = val_df[args.rating_col].gt(0)

full_pos_df = pd.concat(
    [
        train_df[is_pos_train].assign(subset="train"),
        val_df[is_pos_val].assign(subset="val"),
    ]
)

full_pos_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,subset
40958,AEMPVT2U6BIHQDV52BDEDDKPH4HA,B01BCWKBZI,2.0,2017-08-03 00:40:30.172,2416,2467,train
43115,AE7IGXXTK7XTWRJGLIAL5BJDTEAQ,B005L38VRU,5.0,2014-09-04 02:03:39.000,728,689,train
34704,AHWPCIWRQOHHGGYOAYMTOEFT7ZZA,B07JGHZ45W,5.0,2019-10-11 22:52:29.818,15980,3481,train
36611,AHJPRWJT4ZPVYCOP6E3LNTI7TIMQ,B07G8BR51Z,5.0,2017-06-28 03:09:04.752,14314,3420,train
92387,AGNDNXH7IFQBIGJEEI2JGNPRPKWA,B00AYJFXIQ,4.0,2016-01-24 03:38:47.000,10582,1107,train
...,...,...,...,...,...,...,...
129536,AFYJ3P7ZXZ5CSOTMUJY7SZPFIJUA,B0058I7CVE,5.0,2021-11-10 01:42:01.117,7997,643,val
129909,AH4JN2DNC7W7TOZMRYB5HNQ3O3PQ,B08F1P3BCC,5.0,2021-01-25 19:24:23.259,12534,3925,val
130708,AGPDIY2JHE7EVMJTAV3SCLYDORQA,B0723D3FVL,4.0,2021-05-26 16:44:51.699,10867,2986,val
130349,AFQTOVETKDBVDIHKIPEZMPMMLCMQ,B0C2HWSXNL,5.0,2021-01-14 13:43:17.216,7022,4718,val


In [10]:
# Negative interactions are the rows whose rating is exactly zero. Each row is
# tagged with the split it came from so the splits can be separated again later.
is_neg_train = train_df[args.rating_col].eq(0)
is_neg_val = val_df[args.rating_col].eq(0)

full_neg_df = pd.concat(
    [
        train_df[is_neg_train].assign(subset="train"),
        val_df[is_neg_val].assign(subset="val"),
    ]
)
full_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,subset
151343,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B00PKKM2HO,0.0,2017-06-10 00:30:32.698,2546,1890,train
218918,AF3CKYP3BTJ7MEKU6J64BS57MQBA,B002HWRJBM,0.0,2018-12-08 16:57:03.101,4292,311,train
233421,AFEJ5GRYG2PQD6EWSAKVG56XMKNA,B00JO80LUI,0.0,2016-09-14 16:29:39.000,5481,1611,train
165637,AEFFQVREXZFOMKC3KVCQAMUCCG5A,B071JN3KYN,0.0,2020-02-15 01:30:45.402,1472,2961,train
245604,AHH5OK6ASEFSCNUUJ4IHODKIKPMQ,B005ARQV6U,0.0,2014-10-28 01:24:45.000,13981,648,train
...,...,...,...,...,...,...,...
258666,AEBFLM5NVXHRN772RCAF2YEZ2I4Q,B00E0ISVLI,0.0,2021-11-17 23:58:45.042,975,1317,val
260871,AGWMOYFARDDTF2N4PDEURJ5HFKWA,B08BJM6LCG,0.0,2021-07-31 19:01:36.396,11821,3901,val
259138,AH6UU3WI2MLHL7ETZC4BYSCZUAUA,B001LL5JDA,0.0,2022-02-12 20:17:22.732,12847,235,val
261331,AHW4A2IHDBOQ3RAKPRE34TGEQRSA,B0BS2ZMHCL,0.0,2021-10-01 09:04:57.543,15908,4616,val


## Generate seq

In [13]:
# Attach an `item_sequence` column to every positive interaction. Judging by
# the rendered rows further down, the sequence holds the item indices the user
# interacted with before the current row, left-padded with -1 up to
# `sequence_length`.
# NOTE(review): rows whose sequence is all padding render as floats (-1.0)
# while the others render as ints — confirm the helper's dtype handling before
# these sequences are used as embedding indices.
pos_df_with_sequences = generate_item_sequences(
    full_pos_df,
    user_col=args.user_col,
    # Sequences are built from the mapped integer indices, not the raw ASINs.
    item_col="item_indice",
    timestamp_col=args.timestamp_col,
    sequence_length=args.sequence_length,
    padding=True,
    padding_value=-1,
)

In [None]:
# Preview the positives together with their generated `item_sequence` column.
pos_df_with_sequences


Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,subset,item_sequence
49934,AFZ4EK2LJ655XQKTEUELCARO6RYA,B00002EQCW,4.0,2003-01-23 03:28:15.000,8071,4,train,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
82123,AFY2C4YOUP2SSMM43HD2L3FIEFZA,B00008SCFL,5.0,2003-11-25 18:12:09.000,7935,36,train,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
95499,AHF3TGIOSTD2UCHF3MO4MIHFJ5NQ,B07KQWX947,5.0,2004-06-18 02:02:57.000,13705,3514,train,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
122364,AH5Z47PJ5RTSUL2RLCO2QITGIT4Q,B00008SCFL,4.0,2004-09-13 20:18:44.000,12730,36,train,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
97449,AEX3L4NKDESOCGWOFNF63GRFGXCA,B00WUI8JN0,5.0,2004-10-22 14:26:12.000,3735,2103,train,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
...,...,...,...,...,...,...,...,...
127737,AEKUF6AOVWDWFYOKPWO2CV72PEDQ,B07QN33986,5.0,2022-02-19 01:32:51.519,2171,3626,val,"[-1, -1, 2627, 4216, 4743, 1945, 2355, 1831, 9..."
129601,AFBTD25HPE4BE4LUFV3DTI2E2N2A,B07TMJ8S5Z,5.0,2022-02-19 16:49:57.966,5159,3699,val,"[-1, -1, -1, -1, 2260, 3517, 3609, 3495, 3625,..."
129976,AEEQZRQBOFHFBFPYBX2BZ5WOI33A,B00007KDX6,5.0,2022-02-19 16:56:53.030,1396,32,val,"[-1, 99, 1265, 3514, 2271, 3451, 3827, 2504, 4..."
130454,AHLN6GKTKZE22AON34YAQXTGK63A,B0C682GZ5X,5.0,2022-02-19 17:28:55.519,14550,4772,val,"[-1, -1, -1, -1, -1, 1812, 4165, 4575, 4807, 374]"


## Merge neg_df with pos_df_with_sequence

In [27]:
# Negative rows have no sequence yet: borrow the one computed for the positive
# interaction of the same user at the same timestamp. The lookup is
# de-duplicated on the join keys so the merge cannot multiply negative rows.
seq_lookup = pos_df_with_sequences[["user_indice", "timestamp", "item_sequence"]].drop_duplicates(
    subset=["user_indice", "timestamp"]
)

neg_df_with_sequences = full_neg_df.merge(
    seq_lookup,
    on=["user_indice", "timestamp"],
    how="left",
    # Make the intended cardinality explicit; pandas raises if the lookup ever
    # contains duplicate keys.
    validate="many_to_one",
)

# A many-to-one left join must keep exactly one output row per negative.
assert len(neg_df_with_sequences) == len(full_neg_df), "Merge changed the number of negative rows."

# A left join leaves `item_sequence` null for negatives without a matching
# positive. Surface that instead of silently writing null sequences downstream.
n_missing_seq = int(neg_df_with_sequences["item_sequence"].isna().sum())
if n_missing_seq:
    logger.warning(f"{n_missing_seq} negative rows have no matching positive sequence (item_sequence is null).")

In [28]:
# Preview the negatives with the sequences borrowed from the positives.
neg_df_with_sequences

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,subset,item_sequence
0,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B00PKKM2HO,0.0,2017-06-10 00:30:32.698,2546,1890,train,"[-1, -1, -1, -1, -1, -1, -1, -1, 218, 2648]"
1,AF3CKYP3BTJ7MEKU6J64BS57MQBA,B002HWRJBM,0.0,2018-12-08 16:57:03.101,4292,311,train,"[-1, -1, -1, -1, 3541, 3089, 4168, 3936, 4066,..."
2,AFEJ5GRYG2PQD6EWSAKVG56XMKNA,B00JO80LUI,0.0,2016-09-14 16:29:39.000,5481,1611,train,"[-1, -1, -1, -1, -1, -1, -1, 3965, 4617, 2003]"
3,AEFFQVREXZFOMKC3KVCQAMUCCG5A,B071JN3KYN,0.0,2020-02-15 01:30:45.402,1472,2961,train,"[3269, 992, 4443, 3645, 3784, 883, 472, 3316, ..."
4,AHH5OK6ASEFSCNUUJ4IHODKIKPMQ,B005ARQV6U,0.0,2014-10-28 01:24:45.000,13981,648,train,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 3595]"
...,...,...,...,...,...,...,...,...
130866,AEBFLM5NVXHRN772RCAF2YEZ2I4Q,B00E0ISVLI,0.0,2021-11-17 23:58:45.042,975,1317,val,"[1686, 3426, 3805, 1513, 4530, 1926, 2392, 472..."
130867,AGWMOYFARDDTF2N4PDEURJ5HFKWA,B08BJM6LCG,0.0,2021-07-31 19:01:36.396,11821,3901,val,"[3241, 3571, 1244, 4393, 2842, 2003, 4242, 347..."
130868,AH6UU3WI2MLHL7ETZC4BYSCZUAUA,B001LL5JDA,0.0,2022-02-12 20:17:22.732,12847,235,val,"[-1, -1, -1, -1, 1197, 1492, 426, 3868, 4627, ..."
130869,AHW4A2IHDBOQ3RAKPRE34TGEQRSA,B0BS2ZMHCL,0.0,2021-10-01 09:04:57.543,15908,4616,val,"[-1, -1, -1, -1, -1, 4275, 2526, 3834, 3768, 4..."


## Spot-check one user's sequences

In [29]:
# Raw user id whose positive and negative rows are compared in the next two cells.
user_id = 'AENOXSRSNC5VGY3JQKZQ5DD7HIUA'

In [30]:
# Show the selected user's ten earliest negative rows with the full,
# untruncated sequence column.
with pd.option_context("display.max_colwidth", None):
    display(
        neg_df_with_sequences.loc[neg_df_with_sequences["user_id"].eq(user_id)]
        .sort_values("timestamp")
        .head(10)
    )

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,subset,item_sequence
38594,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B0BJW2XLMR,0.0,2016-03-13 16:26:42.000,2546,4541,train,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
96681,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B07FLXV8TT,0.0,2017-06-10 00:29:53.692,2546,3392,train,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 218]"
0,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B00PKKM2HO,0.0,2017-06-10 00:30:32.698,2546,1890,train,"[-1, -1, -1, -1, -1, -1, -1, -1, 218, 2648]"
61551,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B011BRUOMO,0.0,2017-06-10 00:31:25.121,2546,2253,train,"[-1, -1, -1, -1, -1, -1, -1, 218, 2648, 1950]"
117226,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B001TZUS98,0.0,2018-12-15 15:32:45.209,2546,266,train,"[-1, -1, -1, -1, -1, -1, 218, 2648, 1950, 2163]"
5621,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B09WDS7KLW,0.0,2019-04-18 19:37:20.377,2546,4325,train,"[-1, -1, -1, -1, -1, 218, 2648, 1950, 2163, 4516]"
27847,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B0C2HWSXNL,0.0,2019-04-18 19:50:49.402,2546,4718,train,"[-1, -1, -1, -1, 218, 2648, 1950, 2163, 4516, 2019]"
11325,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B08F1P3BCC,0.0,2019-04-18 19:55:04.974,2546,3925,train,"[-1, -1, -1, 218, 2648, 1950, 2163, 4516, 2019, 3091]"
31185,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B09ZF4LT72,0.0,2019-04-18 19:57:27.771,2546,4360,train,"[-1, -1, 218, 2648, 1950, 2163, 4516, 2019, 3091, 4108]"
56028,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B01597ENBA,0.0,2020-12-17 00:28:33.753,2546,2318,train,"[-1, 218, 2648, 1950, 2163, 4516, 2019, 3091, 4108, 4292]"


In [31]:
# Show the selected user's ten earliest positive rows with the full,
# untruncated sequence column, for comparison with the negatives.
with pd.option_context("display.max_colwidth", None):
    display(
        pos_df_with_sequences.loc[pos_df_with_sequences["user_id"].eq(user_id)]
        .sort_values("timestamp")
        .head(10)
    )

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,subset,item_sequence
20470,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B001F42MKG,5.0,2016-03-13 16:26:42.000,2546,218,train,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
20471,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B01I4TE612,5.0,2017-06-10 00:29:53.692,2546,2648,train,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 218]"
20472,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B00SG3CWGS,5.0,2017-06-10 00:30:32.698,2546,1950,train,"[-1, -1, -1, -1, -1, -1, -1, -1, 218, 2648]"
20473,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B00XIVH2LI,5.0,2017-06-10 00:31:25.121,2546,2163,train,"[-1, -1, -1, -1, -1, -1, -1, 218, 2648, 1950]"
20474,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B0BGNG1294,5.0,2018-12-15 15:32:45.209,2546,4516,train,"[-1, -1, -1, -1, -1, -1, 218, 2648, 1950, 2163]"
20475,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B00UPLKF4A,5.0,2019-04-18 19:37:20.377,2546,2019,train,"[-1, -1, -1, -1, -1, 218, 2648, 1950, 2163, 4516]"
20476,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B075XN1NZC,5.0,2019-04-18 19:50:49.402,2546,3091,train,"[-1, -1, -1, -1, 218, 2648, 1950, 2163, 4516, 2019]"
20477,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B09315SB39,5.0,2019-04-18 19:55:04.974,2546,4108,train,"[-1, -1, -1, 218, 2648, 1950, 2163, 4516, 2019, 3091]"
20478,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B09S6Y5BRG,5.0,2019-04-18 19:57:27.771,2546,4292,train,"[-1, -1, 218, 2648, 1950, 2163, 4516, 2019, 3091, 4108]"
20479,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B0791TX5P5,5.0,2020-12-17 00:28:33.753,2546,3188,train,"[-1, 218, 2648, 1950, 2163, 4516, 2019, 3091, 4108, 4292]"


## Concat and split train and val

In [32]:
# Stack positives and negatives (each now carrying `item_sequence`) into one
# frame. The original row indices are kept, so index labels may repeat.
full_seq_df = pd.concat([pos_df_with_sequences, neg_df_with_sequences])

In [38]:
# Separate the combined frame back into its splits using the `subset` tag,
# then drop the tag since it is no longer needed.
train_seq_df, val_seq_df = (
    full_seq_df.loc[full_seq_df["subset"].eq(split_name)].drop(columns="subset")
    for split_name in ("train", "val")
)

In [41]:
# Persist each split next to its input file, appending `_seq` to the file
# stem. Deriving the output path from the configured input keeps the two in
# sync: with the default `args` the same files as before are written, and a
# changed input path no longer overwrites the 16407u outputs.
for split_df, input_fp in (
    (train_seq_df, args.train_data_fp),
    (val_seq_df, args.val_data_fp),
):
    stem, extension = os.path.splitext(input_fp)
    # The index is not meaningful after the concat above, so it is not written.
    split_df.to_parquet(f"{stem}_seq{extension}", index=False)