## Controller

In [9]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
import os
import sys
from datetime import timedelta
from typing import Optional

import pandas as pd
from feast import FeatureStore
from loguru import logger
from pydantic import BaseModel

sys.path.insert(0, "..")
from src.utils.embedding_id_mapper import IDMapper
from src.utils.sequence import generate_item_sequences

## Args

In [11]:
class Args(BaseModel):
    """Run configuration for this notebook.

    Construct it, then call :meth:`init` to resolve and create the persist
    directory before the configuration is used.
    """

    testing: bool = False
    # Run identifier; init() uses it to build the persist directory name.
    run_name: str = "000-data-prep"
    # Stays None until init() fills it in, hence Optional rather than plain str.
    notebook_persist_dp: Optional[str] = None
    random_seed: int = 41

    # Column names of the interaction tables.
    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    # Resolved to absolute paths once, against the working directory in effect
    # when this class body is executed.
    train_data_fp: str = os.path.abspath("../data_for_ai/interim/train_sample_interactions_16407u_neg_seq.parquet")
    val_data_fp: str = os.path.abspath("../data_for_ai/interim/val_sample_interactions_16407u_neg_seq.parquet")

    # Forwarded as `sequence_length` when item sequences are generated.
    sequence_length: int = 50

    def init(self) -> "Args":
        """Resolve ``notebook_persist_dp`` to ``data/<run_name>`` and create it.

        Returns:
            This same instance, so the call can be chained onto the constructor.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        # exist_ok=True makes re-running the cell safe when the directory exists.
        os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self


# Build the configuration, create its persist directory, then echo it as JSON.
args = Args()
args = args.init()

config_json = args.model_dump_json(indent=2)
print(config_json)

{
  "testing": false,
  "run_name": "000-data-prep",
  "notebook_persist_dp": "c:\\Users\\Trieu\\OneDrive\\Desktop\\recsys\\real_time_recsys\\notebooks\\data\\000-data-prep",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "train_data_fp": "c:\\Users\\Trieu\\OneDrive\\Desktop\\recsys\\real_time_recsys\\data_for_ai\\interim\\train_sample_interactions_16407u_neg_seq.parquet",
  "val_data_fp": "c:\\Users\\Trieu\\OneDrive\\Desktop\\recsys\\real_time_recsys\\data_for_ai\\interim\\val_sample_interactions_16407u_neg_seq.parquet",
  "sequence_length": 50
}


## Test

In [12]:
# Toy interaction table: two users, five events in total.
data = {
    "user_indices": [0, 0, 1, 1, 1],
    "item_indices": [0, 1, 2, 3, 4],
    "timestamp": [0, 0, 2, 3, 4],
    "ratings": [1, 4, 5, 3, 2],
}

df = pd.DataFrame(data)

# Generate the item sequences
df_with_sequences_test = generate_item_sequences(
    df,
    user_col="user_indices",
    item_col="item_indices",
    timestamp_col="timestamp",
    sequence_length=3,
    padding=True,
    padding_value=-1,
)

# Turn the visual check into real invariants: no rows are added or dropped, and
# with padding enabled every sequence has exactly `sequence_length` entries.
assert len(df_with_sequences_test) == len(df), "Sequence generation must keep one row per input interaction."
assert df_with_sequences_test["item_sequence"].map(len).eq(3).all(), "Every padded item sequence must have length 3."

df_with_sequences_test

Unnamed: 0,user_indices,item_indices,timestamp,ratings,item_sequence
0,0,0,0,1,"[-1.0, -1.0, -1.0]"
1,0,1,0,4,"[-1, -1, 0]"
2,1,2,2,5,"[-1.0, -1.0, -1.0]"
3,1,3,3,3,"[-1, -1, 2]"
4,1,4,4,2,"[-1, 2, 3]"


## Load data

In [13]:
# Read the train and validation interaction tables from the configured paths
# (train first, then validation).
train_neg_df, val_neg_df = (
    pd.read_parquet(parquet_fp)
    for parquet_fp in (args.train_data_fp, args.val_data_fp)
)

In [14]:
# Label every row with the split it came from, then stack both splits row-wise.
full_neg_df = pd.concat(
    [
        split_df.assign(source=split_name)
        for split_name, split_df in (("train", train_neg_df), ("val", val_neg_df))
    ],
    axis=0,
)
full_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence,source
0,AFZ4EK2LJ655XQKTEUELCARO6RYA,B00002EQCW,4.0,2003-01-23 03:28:15.000,8071,4,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",train
1,AFY2C4YOUP2SSMM43HD2L3FIEFZA,B00008SCFL,5.0,2003-11-25 18:12:09.000,7935,36,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",train
2,AHF3TGIOSTD2UCHF3MO4MIHFJ5NQ,B07KQWX947,5.0,2004-06-18 02:02:57.000,13705,3514,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",train
3,AH5Z47PJ5RTSUL2RLCO2QITGIT4Q,B00008SCFL,4.0,2004-09-13 20:18:44.000,12730,36,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",train
4,AEX3L4NKDESOCGWOFNF63GRFGXCA,B00WUI8JN0,5.0,2004-10-22 14:26:12.000,3735,2103,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",train
...,...,...,...,...,...,...,...,...
6953,AEBFLM5NVXHRN772RCAF2YEZ2I4Q,B00E0ISVLI,0.0,2021-11-17 23:58:45.042,975,1317,"[1686, 3426, 3805, 1513, 4530, 1926, 2392, 472...",val
6954,AGWMOYFARDDTF2N4PDEURJ5HFKWA,B08BJM6LCG,0.0,2021-07-31 19:01:36.396,11821,3901,"[3241, 3571, 1244, 4393, 2842, 2003, 4242, 347...",val
6955,AH6UU3WI2MLHL7ETZC4BYSCZUAUA,B001LL5JDA,0.0,2022-02-12 20:17:22.732,12847,235,"[-1, -1, -1, -1, 1197, 1492, 426, 3868, 4627, ...",val
6956,AHW4A2IHDBOQ3RAKPRE34TGEQRSA,B0BS2ZMHCL,0.0,2021-10-01 09:04:57.543,15908,4616,"[-1, -1, -1, -1, -1, 4275, 2526, 3834, 3768, 4...",val


## Convert user_id and item_id to indices

In [15]:
# Absolute path of the persisted ID-mapper JSON, resolved against the working directory.
idm_path = os.path.abspath("../data_for_ai/interim/idm_16407u.json")
# `idm` is whatever IDMapper.load returns — presumably the populated mapper; verify in src/utils.
idm = IDMapper().load(idm_path)
# Smoke check: look up index 1 (the cell output shows a raw user id string).
idm.get_user_id(1)

'AE227WAM4NWQPJI33OPN7ZARNNZQ'

In [16]:
# Run both splits through the ID mapper. map_indices presumably (re)computes the
# `user_indice` / `item_indice` columns checked below — confirm in IDMapper.
train_neg_df = train_neg_df.pipe(idm.map_indices)
val_neg_df = val_neg_df.pipe(idm.map_indices)

# No row may have fallen back to the mapper's "unknown" index. The messages now
# describe the condition that is actually enforced ("not in").
assert idm.unknown_item_index not in train_neg_df["item_indice"].values, "Unknown item index must not be present in training data."
assert idm.unknown_user_index not in train_neg_df["user_indice"].values, "Unknown user index must not be present in training data."
assert idm.unknown_item_index not in val_neg_df["item_indice"].values, "Unknown item index must not be present in validation data."
assert idm.unknown_user_index not in val_neg_df["user_indice"].values, "Unknown user index must not be present in validation data."

In [17]:
# Peek at the first five mapped training rows.
train_neg_df.head(n=5)

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
0,AFZ4EK2LJ655XQKTEUELCARO6RYA,B00002EQCW,4.0,2003-01-23 03:28:15,8071,4,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
1,AFY2C4YOUP2SSMM43HD2L3FIEFZA,B00008SCFL,5.0,2003-11-25 18:12:09,7935,36,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
2,AHF3TGIOSTD2UCHF3MO4MIHFJ5NQ,B07KQWX947,5.0,2004-06-18 02:02:57,13705,3514,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
3,AH5Z47PJ5RTSUL2RLCO2QITGIT4Q,B00008SCFL,4.0,2004-09-13 20:18:44,12730,36,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
4,AEX3L4NKDESOCGWOFNF63GRFGXCA,B00WUI8JN0,5.0,2004-10-22 14:26:12,3735,2103,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."


## Generate sequence

In [18]:
# Stack the mapped splits, tagging each row with its origin so they can be
# separated again after sequence generation.
# NOTE(review): the original index labels of both frames are kept, so labels
# repeat across train and val — confirm generate_item_sequences does not rely
# on a unique index.
full_neg_df = pd.concat(
    [train_neg_df.assign(source="train"), val_neg_df.assign(source="val")],
    axis=0,
)

# (The bare `full_neg_df` expression that sat here was dropped: it was not the
# last statement of the cell, so it displayed nothing.)
assert len(full_neg_df) == len(train_neg_df) + len(val_neg_df), "The length of the concatenated DataFrame should equal the sum of the lengths of the individual DataFrames."

In [19]:
# Build item sequences over the combined train + val interactions, keyed on the
# configured user / timestamp columns and the mapped item index.
sequence_options = dict(
    user_col=args.user_col,
    item_col="item_indice",
    timestamp_col=args.timestamp_col,
    sequence_length=args.sequence_length,
    padding=True,
    padding_value=-1,
)
df_with_sequences = generate_item_sequences(full_neg_df, **sequence_options)

In [20]:
# Show the sequences without column-width truncation, for this display only.
preview_cols = [args.user_col, "item_indice", "item_sequence"]
with pd.option_context("display.max_colwidth", None):
    display(df_with_sequences.loc[:, preview_cols])

Unnamed: 0,user_id,item_indice,item_sequence
0,AFZ4EK2LJ655XQKTEUELCARO6RYA,4,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
192731,AFZ4EK2LJ655XQKTEUELCARO6RYA,4132,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4]"
1,AFY2C4YOUP2SSMM43HD2L3FIEFZA,36,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
180543,AFY2C4YOUP2SSMM43HD2L3FIEFZA,1859,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 36]"
2,AHF3TGIOSTD2UCHF3MO4MIHFJ5NQ,3514,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
...,...,...,...
3476,AEEQZRQBOFHFBFPYBX2BZ5WOI33A,32,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 99, 2596, 1265, 1273, 3514, 3394, 2271, 4084, 3451, 3801, 3827, 1839, 2504, 1347, 4546, 2694, 4270, 369, 2441]"
4694,AHLN6GKTKZE22AON34YAQXTGK63A,4303,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1812, 2950, 4165, 4735, 2440, 4575, 4807, 607, 3091, 374]"
3477,AHLN6GKTKZE22AON34YAQXTGK63A,4772,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1812, 2950, 4165, 4735, 2440, 4575, 4807, 607, 3091, 374, 4303]"
3478,AEMYBWDN67IB5IBTMHLHN76V4QHQ,4086,"[528, 395, 3226, 2286, 4734, 856, 631, 890, 4516, 4364, 285, 1218, 2220, 811, 4727, 1518, 4033, 4616, 1659, 3023, 4422, 1924, 1442, 4777, 3057, 1183, 1631, 4040, 1355, 2995, 3810, 2677, 2237, 1610, 3194, 2694, 1617, 3695, 4429, 3438, 3602, 644, 1865, 4569, 3653, 3030, 3803, 3998, 4720, 285]"


In [21]:
# Check sample user
user_id = df_with_sequences.sample(n=1)[args.user_col].values[0]

(
    df_with_sequences.loc[lambda df: df[args.user_col].eq(user_id)]
    .sort_values(args.timestamp_col)[
        [args.user_col, args.timestamp_col, "item_indice", "item_sequence"]
    ]
    .head(10)
)

Unnamed: 0,user_id,timestamp,item_indice,item_sequence
51204,AGQ66KEBB4CKRKLYZ65FRI6HB5IQ,2016-01-22 22:17:21,4095,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
179156,AGQ66KEBB4CKRKLYZ65FRI6HB5IQ,2016-01-22 22:17:21,1997,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
218587,AGQ66KEBB4CKRKLYZ65FRI6HB5IQ,2016-07-24 18:31:17,3813,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
61737,AGQ66KEBB4CKRKLYZ65FRI6HB5IQ,2016-07-24 18:31:17,4232,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
61738,AGQ66KEBB4CKRKLYZ65FRI6HB5IQ,2016-07-24 18:35:19,3789,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
166214,AGQ66KEBB4CKRKLYZ65FRI6HB5IQ,2016-07-24 18:35:19,2355,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
61739,AGQ66KEBB4CKRKLYZ65FRI6HB5IQ,2016-07-24 18:36:17,2810,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
248289,AGQ66KEBB4CKRKLYZ65FRI6HB5IQ,2016-07-24 18:36:17,1208,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
199120,AGQ66KEBB4CKRKLYZ65FRI6HB5IQ,2016-07-24 18:37:08,1513,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
61740,AGQ66KEBB4CKRKLYZ65FRI6HB5IQ,2016-07-24 18:37:08,1677,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."


## Persist

In [22]:
# Separate the combined frame back into its splits via the `source` tag, then
# drop the tag since it was only a helper column.
is_train_row = df_with_sequences["source"] == "train"
is_val_row = df_with_sequences["source"] == "val"

train_neg_df = df_with_sequences[is_train_row].drop(columns=["source"])
val_neg_df = df_with_sequences[is_val_row].drop(columns=["source"])

In [23]:
# Peek at the first three validation rows after the split.
val_neg_df.head(n=3)

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
0,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B004FV4ROA,1.0,2020-12-27 00:30:31.146,11295,528,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
3852,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B07KFQFDNB,0.0,2020-12-27 00:30:31.146,11295,3503,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
1,AEHS7YR7BGGWMZS24H5UR5IP46HQ,B08F1P3BCC,2.0,2020-12-27 01:44:52.242,1784,3925,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."


In [24]:
# Write to the paths held in `args` instead of repeating the literals, so the
# read and write locations cannot drift apart.
# NOTE: these are the same files this notebook loads, so the inputs are
# overwritten in place with the regenerated sequences.
train_neg_df.to_parquet(args.train_data_fp, index=False)
val_neg_df.to_parquet(args.val_data_fp, index=False)