## Controller

In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before running each cell so edits to imported code are picked up.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
from datetime import timedelta
from typing import Optional

import pandas as pd
from feast import FeatureStore
from loguru import logger
from pydantic import BaseModel

sys.path.insert(0, "..")
from src.utils.embedding_id_mapper import IDMapper
from src.utils.sequence import generate_item_sequences

## Args

In [3]:
class Args(BaseModel):
    """Run configuration for this data-preparation notebook.

    Construct the model and then call ``init()``: that resolves
    ``notebook_persist_dp`` from ``run_name`` and creates the directory.
    """

    testing: bool = False
    run_name: str = "000-data-prep"
    # Filled in by init(); it is None until then, so the field must be Optional.
    notebook_persist_dp: Optional[str] = None
    random_seed: int = 41

    # Column names used in the interaction tables.
    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    # Input splits; the defaults are resolved to absolute paths when the class is defined.
    train_data_fp: str = os.path.abspath("../data_for_ai/interim/train_sample_interactions_16407u.parquet")
    val_data_fp: str = os.path.abspath("../data_for_ai/interim/val_sample_interactions_16407u.parquet")

    # Length passed to generate_item_sequences as sequence_length.
    sequence_length: int = 10

    def init(self):
        """Resolve and create the persist directory for this run.

        Returns:
            The same ``Args`` instance, so the call can be chained after construction.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self


args = Args().init()

print(args.model_dump_json(indent=2))

{
  "testing": false,
  "run_name": "000-data-prep",
  "notebook_persist_dp": "/home/dinhln/Desktop/real_time_recsys/notebooks/data/000-data-prep",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "train_data_fp": "/home/dinhln/Desktop/real_time_recsys/data_for_ai/interim/train_sample_interactions_16407u.parquet",
  "val_data_fp": "/home/dinhln/Desktop/real_time_recsys/data_for_ai/interim/val_sample_interactions_16407u.parquet",
  "sequence_length": 10
}


## Test

In [4]:
# Sample DataFrame: two users, including tied timestamps and a repeated item,
# small enough that the expected sequences can be checked by hand.
data = {
    "user_indices": [0, 0, 1, 1, 1],
    "item_indices": [0, 1, 2, 2, 4],
    "timestamp": [0, 0, 2, 2, 4],
    "ratings": [1, 4, 0, 3, 2],
}

df = pd.DataFrame(data)

# Generate the item sequences
df_with_sequences_test = generate_item_sequences(
    df,
    user_col="user_indices",
    item_col="item_indices",
    timestamp_col="timestamp",
    sequence_length=3,
    padding=True,
    padding_value=-1,
)

# Pin the expected result so this cell fails loudly if the helper changes.
# Each row's sequence holds the user's earlier items, left-padded with -1.
# The recorded output shows all-padding rows as floats (-1.0) and the others
# as ints, so values are cast to int before comparing.
expected_sequences = [
    [-1, -1, -1],
    [-1, -1, 0],
    [-1, -1, -1],
    [-1, -1, 2],
    [-1, 2, 2],
]
actual_sequences = [
    [int(item) for item in sequence]
    for sequence in df_with_sequences_test.sort_index()["item_sequence"]
]
assert actual_sequences == expected_sequences, (
    f"Unexpected item sequences: {actual_sequences}"
)

df_with_sequences_test

Unnamed: 0,user_indices,item_indices,timestamp,ratings,item_sequence
0,0,0,0,1,"[-1.0, -1.0, -1.0]"
1,0,1,0,4,"[-1, -1, 0]"
2,1,2,2,0,"[-1.0, -1.0, -1.0]"
3,1,2,2,3,"[-1, -1, 2]"
4,1,4,4,2,"[-1, 2, 2]"


## Load data

In [5]:
# Read both interaction splits from the parquet files configured in args.
train_df, val_df = (
    pd.read_parquet(split_fp)
    for split_fp in (args.train_data_fp, args.val_data_fp)
)

In [6]:
# Stack the raw splits row-wise, tagging every row with the split it came from.
full_df = pd.concat(
    [
        split.assign(source=split_name)
        for split_name, split in (("train", train_df), ("val", val_df))
    ],
    axis=0,
)
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,source
3194,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B06XKCPK5W,2.0,2012-06-11 16:41:10.000,train
3199,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B000CKVOOY,3.0,2012-08-02 02:04:13.000,train
3200,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B006GWO5WK,5.0,2012-09-15 16:34:46.000,train
3204,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B008LURQ76,5.0,2013-01-03 23:08:45.000,train
3208,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B00AQRUW4Q,4.0,2013-05-06 01:24:39.000,train
...,...,...,...,...,...
33760091,AHIIISHZP6YAVVHMDEBLJ5CWZ7ZA,B0BZ62FQ13,3.0,2021-07-16 17:08:55.044,val
34470392,AFTE3G43QHXWD3DJGDCI2DHEWQJQ,B08DMXDPW5,5.0,2021-01-14 01:48:09.423,val
35019360,AFENZZDPVUYFVBS47YDOWJCDYBSQ,B09XBT6DS9,4.0,2021-12-05 00:35:40.874,val
35323250,AFMBZYPDAXT5VO3ME67HW5Q5TAOQ,B097KBF8JK,5.0,2022-02-18 11:32:46.732,val


## Convert user_id and item_id to indices

In [7]:
# Load the saved ID mapper from its JSON file (path resolved relative to the
# current working directory).
idm_path = os.path.abspath("../data_for_ai/interim/idm_16407u.json")
idm = IDMapper().load(idm_path)
# Sanity check: look up the user ID stored for index 1.
idm.get_user_id(1)

'AE227WAM4NWQPJI33OPN7ZARNNZQ'

In [8]:
# Run both splits through the ID mapper; the mapped frames carry the
# `user_indice` and `item_indice` columns used by the checks below.
train_df = train_df.pipe(idm.map_indices)
val_df = val_df.pipe(idm.map_indices)

# The mapper's "unknown" placeholder index must never appear after mapping.
# The messages state the violated expectation (the index must NOT be present).
assert idm.unknown_item_index not in train_df["item_indice"].values, "Unknown item index must not be present in training data."
assert idm.unknown_user_index not in train_df["user_indice"].values, "Unknown user index must not be present in training data."
assert idm.unknown_item_index not in val_df["item_indice"].values, "Unknown item index must not be present in validation data."
assert idm.unknown_user_index not in val_df["user_indice"].values, "Unknown user index must not be present in validation data."

In [9]:
# Peek at the mapped training frame to confirm the index columns were added.
train_df.head()

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice
3194,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B06XKCPK5W,2.0,2012-06-11 16:41:10,3931,2905
3199,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B000CKVOOY,3.0,2012-08-02 02:04:13,3931,89
3200,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B006GWO5WK,5.0,2012-09-15 16:34:46,3931,758
3204,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B008LURQ76,5.0,2013-01-03 23:08:45,3931,959
3208,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B00AQRUW4Q,4.0,2013-05-06 01:24:39,3931,1096


## Generate sequence

In [10]:
# Rebuild the combined frame from the index-mapped splits, tagging each row
# with its split so the two can be separated again after sequence generation.
full_df = pd.concat(
    [train_df.assign(source="train"), val_df.assign(source="val")],
    axis=0,
)

# Concatenation must neither drop nor duplicate rows.
assert len(full_df) == len(train_df) + len(val_df), "The length of the concatenated DataFrame should equal the sum of the lengths of the individual DataFrames."

In [11]:
# Show the combined, index-mapped frame, including the `source` column.
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,source
3194,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B06XKCPK5W,2.0,2012-06-11 16:41:10.000,3931,2905,train
3199,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B000CKVOOY,3.0,2012-08-02 02:04:13.000,3931,89,train
3200,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B006GWO5WK,5.0,2012-09-15 16:34:46.000,3931,758,train
3204,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B008LURQ76,5.0,2013-01-03 23:08:45.000,3931,959,train
3208,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B00AQRUW4Q,4.0,2013-05-06 01:24:39.000,3931,1096,train
...,...,...,...,...,...,...,...
33760091,AHIIISHZP6YAVVHMDEBLJ5CWZ7ZA,B0BZ62FQ13,3.0,2021-07-16 17:08:55.044,14144,4693,val
34470392,AFTE3G43QHXWD3DJGDCI2DHEWQJQ,B08DMXDPW5,5.0,2021-01-14 01:48:09.423,7343,3923,val
35019360,AFENZZDPVUYFVBS47YDOWJCDYBSQ,B09XBT6DS9,4.0,2021-12-05 00:35:40.874,5497,4335,val
35323250,AFMBZYPDAXT5VO3ME67HW5Q5TAOQ,B097KBF8JK,5.0,2022-02-18 11:32:46.732,6427,4147,val


In [12]:
# Attach an `item_sequence` column to every interaction. Train and val rows
# are processed together in one frame; short histories are padded with -1.
sequence_kwargs = dict(
    user_col=args.user_col,
    item_col="item_indice",
    timestamp_col=args.timestamp_col,
    sequence_length=args.sequence_length,
    padding=True,
    padding_value=-1,
)
df_with_sequences = generate_item_sequences(full_df, **sequence_kwargs)

In [13]:
# Show the sequences in full by lifting pandas' column-width limit for this display only.
sequence_preview_cols = [args.user_col, "item_indice", "item_sequence"]
with pd.option_context("display.max_colwidth", None):
    display(df_with_sequences.loc[:, sequence_preview_cols])

Unnamed: 0,user_id,item_indice,item_sequence
7302338,AFZ4EK2LJ655XQKTEUELCARO6RYA,4,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
13436267,AFY2C4YOUP2SSMM43HD2L3FIEFZA,36,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
17186750,AHF3TGIOSTD2UCHF3MO4MIHFJ5NQ,3514,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
29786537,AH5Z47PJ5RTSUL2RLCO2QITGIT4Q,36,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
17898861,AEX3L4NKDESOCGWOFNF63GRFGXCA,2103,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
...,...,...,...
1363677,AEKUF6AOVWDWFYOKPWO2CV72PEDQ,3626,"[-1, -1, 2627, 4216, 4743, 1945, 2355, 1831, 951, 4461]"
10443069,AFBTD25HPE4BE4LUFV3DTI2E2N2A,3699,"[-1, -1, -1, -1, 2260, 3517, 3609, 3495, 3625, 4079]"
12945774,AEEQZRQBOFHFBFPYBX2BZ5WOI33A,32,"[-1, 99, 1265, 3514, 2271, 3451, 3827, 2504, 4546, 4270]"
19198673,AHLN6GKTKZE22AON34YAQXTGK63A,4772,"[-1, -1, -1, -1, -1, 1812, 4165, 4575, 4807, 374]"


In [14]:
# Check sample user
# Seed the draw with the configured random seed so a fresh run inspects the
# same user and the displayed output is reproducible.
user_id = df_with_sequences.sample(n=1, random_state=args.random_seed)[
    args.user_col
].values[0]

# List that user's interactions in time order to eyeball how the sequence
# changes from one interaction to the next.
(
    df_with_sequences.loc[lambda df: df[args.user_col].eq(user_id)]
    .sort_values(args.timestamp_col)[
        [args.user_col, args.timestamp_col, "item_indice", "item_sequence"]
    ]
    .head(10)
)

Unnamed: 0,user_id,timestamp,item_indice,item_sequence
25123930,AFCWT6AMPVX6OE4IOHDWZFIYNRJQ,2015-02-24 12:46:16.000,1083,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
25123931,AFCWT6AMPVX6OE4IOHDWZFIYNRJQ,2017-02-07 06:10:05.000,2237,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1083]"
25123932,AFCWT6AMPVX6OE4IOHDWZFIYNRJQ,2018-02-03 10:24:08.139,4098,"[-1, -1, -1, -1, -1, -1, -1, -1, 1083, 2237]"
25123933,AFCWT6AMPVX6OE4IOHDWZFIYNRJQ,2020-11-22 22:42:11.874,4570,"[-1, -1, -1, -1, -1, -1, -1, 1083, 2237, 4098]"
25123934,AFCWT6AMPVX6OE4IOHDWZFIYNRJQ,2020-11-26 02:24:03.833,949,"[-1, -1, -1, -1, -1, -1, 1083, 2237, 4098, 4570]"


## Persist

In [15]:
# Separate the sequenced frame back into its original splits and drop the
# helper `source` column, which was only needed to tell the splits apart.
# NOTE(review): the `_neg` in these names suggests negative sampling, but none
# happens in the visible cells — consider renaming.
train_neg_df = df_with_sequences[df_with_sequences["source"] == "train"].drop(
    columns="source"
)
val_neg_df = df_with_sequences[df_with_sequences["source"] == "val"].drop(
    columns="source"
)

In [16]:
# Preview the first validation rows, which now carry an `item_sequence` column.
val_neg_df.head(3)

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
7915094,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B004FV4ROA,1.0,2020-12-27 00:30:31.146,11295,528,"[1715, 2537, 3743, 506, 4490, 3479, 3908, 2723..."
2464556,AEHS7YR7BGGWMZS24H5UR5IP46HQ,B08F1P3BCC,2.0,2020-12-27 01:44:52.242,1784,3925,"[-1, -1, -1, -1, -1, 3382, 4330, 423, 3167, 2677]"
3011999,AGAVHCK42EGMVS7DGPRX6HBCUCNQ,B09Q3NR84W,5.0,2020-12-27 02:25:48.357,9042,4273,"[-1, -1, -1, -1, 3104, 1416, 3743, 2694, 3612,..."


In [17]:
# Write each split next to its input file with a `_seq` suffix before the
# extension. Deriving the paths from args keeps the outputs in step with the
# configured inputs and independent of the current working directory.
train_seq_fp = "{}_seq{}".format(*os.path.splitext(args.train_data_fp))
val_seq_fp = "{}_seq{}".format(*os.path.splitext(args.val_data_fp))

train_neg_df.to_parquet(train_seq_fp, index=False)
val_neg_df.to_parquet(val_seq_fp, index=False)