## Controller

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell execution and edits to local modules are picked up.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
from datetime import timedelta
from typing import Optional

import pandas as pd
from feast import FeatureStore
from loguru import logger
from pydantic import BaseModel

sys.path.insert(0, "..")
from src.utils.embedding_id_mapper import IDMapper
from src.utils.sequence import generate_item_sequences

## Args

In [None]:
class Args(BaseModel):
    """Run configuration for this notebook.

    Construct the model and then call :meth:`init` to resolve and create the
    per-run output directory.
    """

    testing: bool = False
    run_name: str = "000-data-prep"
    # Filled in by `init()`; it is None until then, hence Optional.
    notebook_persist_dp: Optional[str] = None
    random_seed: int = 41

    # Column names used in the interaction frames.
    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    # Absolute paths of the parquet files read in the "Load data" section.
    # NOTE(review): these are the same `*_neg_seq.parquet` files that the
    # `to_parquet` calls later in this notebook write, so the notebook reads the
    # files it produces. Confirm whether the inputs should instead be the
    # `*_neg.parquet` files.
    train_data_fp: str = os.path.abspath("../data_for_ai/interim/train_sample_interactions_16407u_neg_seq.parquet")
    val_data_fp: str = os.path.abspath("../data_for_ai/interim/val_sample_interactions_16407u_neg_seq.parquet")

    # Passed as `sequence_length` to `generate_item_sequences`.
    sequence_length: int = 50

    def init(self):
        """Resolve `notebook_persist_dp` and create that directory.

        The directory is `data/<run_name>` relative to the current working
        directory, stored as an absolute path. Creating it is a no-op when it
        already exists.

        Returns:
            Args: this same instance, so the call can be chained after the
            constructor.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self


args = Args().init()

print(args.model_dump_json(indent=2))

{
  "testing": false,
  "run_name": "000-data-prep",
  "notebook_persist_dp": "c:\\Users\\Trieu\\OneDrive\\Desktop\\recsys\\real_time_recsys\\notebooks\\data\\000-data-prep",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "train_data_fp": "c:\\Users\\Trieu\\OneDrive\\Desktop\\recsys\\real_time_recsys\\data_for_ai\\interim\\train_sample_interactions_16407u_neg.parquet",
  "val_data_fp": "c:\\Users\\Trieu\\OneDrive\\Desktop\\recsys\\real_time_recsys\\data_for_ai\\interim\\val_sample_interactions_16407u_neg.parquet",
  "sequence_length": 100
}


## Test

In [7]:
# Toy interaction table: two users with five interactions in total.
data = dict(
    user_indices=[0, 0, 1, 1, 1],
    item_indices=[0, 1, 2, 3, 4],
    timestamp=[0, 0, 2, 3, 4],
    ratings=[1, 4, 5, 3, 2],
)
df = pd.DataFrame(data)

# Run the helper on the toy table with a short window (length 3, padded with -1)
# so the resulting `item_sequence` column can be checked by eye.
df_with_sequences_test = generate_item_sequences(
    df,
    user_col="user_indices",
    item_col="item_indices",
    timestamp_col="timestamp",
    sequence_length=3,
    padding=True,
    padding_value=-1,
)

df_with_sequences_test

Unnamed: 0,user_indices,item_indices,timestamp,ratings,item_sequence
0,0,0,0,1,"[-1.0, -1.0, -1.0]"
1,0,1,0,4,"[-1, -1, 0]"
2,1,2,2,5,"[-1.0, -1.0, -1.0]"
3,1,3,3,3,"[-1, -1, 2]"
4,1,4,4,2,"[-1, 2, 3]"


## Load data

In [8]:
# Read the train and validation interaction frames from the paths configured on `args`.
train_neg_df, val_neg_df = (
    pd.read_parquet(fp) for fp in (args.train_data_fp, args.val_data_fp)
)

FileNotFoundError: [Errno 2] No such file or directory: 'c:\\Users\\Trieu\\OneDrive\\Desktop\\recsys\\real_time_recsys\\data_for_ai\\interim\\train_sample_interactions_16407u_neg.parquet'

In [None]:
# Tag each split with its origin, then stack them row-wise for a combined preview.
labelled_parts = [
    frame.assign(source=label)
    for label, frame in (("train", train_neg_df), ("val", val_neg_df))
]
full_neg_df = pd.concat(labelled_parts, axis=0)
full_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,source
151343,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B00PKKM2HO,0.0,2017-06-10 00:30:32.698,train
40958,AEMPVT2U6BIHQDV52BDEDDKPH4HA,B01BCWKBZI,2.0,2017-08-03 00:40:30.172,train
218918,AF3CKYP3BTJ7MEKU6J64BS57MQBA,B002HWRJBM,0.0,2018-12-08 16:57:03.101,train
43115,AE7IGXXTK7XTWRJGLIAL5BJDTEAQ,B005L38VRU,5.0,2014-09-04 02:03:39.000,train
233421,AFEJ5GRYG2PQD6EWSAKVG56XMKNA,B00JO80LUI,0.0,2016-09-14 16:29:39.000,train
...,...,...,...,...,...
130708,AGPDIY2JHE7EVMJTAV3SCLYDORQA,B0723D3FVL,4.0,2021-05-26 16:44:51.699,val
261331,AHW4A2IHDBOQ3RAKPRE34TGEQRSA,B0BS2ZMHCL,0.0,2021-10-01 09:04:57.543,val
130349,AFQTOVETKDBVDIHKIPEZMPMMLCMQ,B0C2HWSXNL,5.0,2021-01-14 13:43:17.216,val
130673,AEZKOL32LQNPV2K5R3U25Q3GGQQA,B07WQKKS8V,5.0,2021-06-25 21:29:34.271,val


## Convert user_id and item_id to indices

In [None]:
# Load the persisted ID mapper from the interim data directory.
# NOTE(review): this path is hard-coded here rather than configured on `Args`.
idm_path = os.path.abspath("../data_for_ai/interim/idm_16407u.json")
idm = IDMapper().load(idm_path)
# Spot-check the mapper by looking up the user id stored for index 1.
idm.get_user_id(1)

'AE227WAM4NWQPJI33OPN7ZARNNZQ'

In [None]:
# Run both splits through the ID mapper, which adds the `user_indice` and
# `item_indice` columns used below.
train_neg_df = train_neg_df.pipe(idm.map_indices)
val_neg_df = val_neg_df.pipe(idm.map_indices)

# No row in either split may have been mapped to the mapper's "unknown" index.
# The messages state the condition being asserted (absence, not presence).
assert idm.unknown_item_index not in train_neg_df["item_indice"].values, "Unknown item index must not be present in training data."
assert idm.unknown_user_index not in train_neg_df["user_indice"].values, "Unknown user index must not be present in training data."
assert idm.unknown_item_index not in val_neg_df["item_indice"].values, "Unknown item index must not be present in validation data."
assert idm.unknown_user_index not in val_neg_df["user_indice"].values, "Unknown user index must not be present in validation data."

In [None]:
# Preview the first training rows after index mapping.
train_neg_df.head()

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice
151343,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B00PKKM2HO,0.0,2017-06-10 00:30:32.698,2546,1890
40958,AEMPVT2U6BIHQDV52BDEDDKPH4HA,B01BCWKBZI,2.0,2017-08-03 00:40:30.172,2416,2467
218918,AF3CKYP3BTJ7MEKU6J64BS57MQBA,B002HWRJBM,0.0,2018-12-08 16:57:03.101,4292,311
43115,AE7IGXXTK7XTWRJGLIAL5BJDTEAQ,B005L38VRU,5.0,2014-09-04 02:03:39.000,728,689
233421,AFEJ5GRYG2PQD6EWSAKVG56XMKNA,B00JO80LUI,0.0,2016-09-14 16:29:39.000,5481,1611


## Generate sequence

In [None]:
# Stack the index-mapped splits into one frame. The `source` column records which
# split each row came from so the frame can be separated again after sequence
# generation.
full_neg_df = pd.concat(
    [train_neg_df.assign(source="train"), val_neg_df.assign(source="val")],
    axis=0,
)

# Concatenation must neither drop nor duplicate rows.
assert len(full_neg_df) == len(train_neg_df) + len(val_neg_df), "The length of the concatenated DataFrame should equal the sum of the lengths of the individual DataFrames."

In [None]:
# Add an `item_sequence` column to the combined train+val frame, using the raw
# user id column for `user_col`, `item_indice` for the items, a window of
# `args.sequence_length`, and -1 as the padding value.
# NOTE(review): `full_neg_df` contains rows with rating 0.0 (presumably the
# negative samples), and the displayed sequences appear to include their items.
# Confirm that negatives are meant to be part of the item history.
df_with_sequences = generate_item_sequences(
    full_neg_df,
    user_col=args.user_col,
    item_col="item_indice",
    timestamp_col=args.timestamp_col,
    sequence_length=args.sequence_length,
    padding=True,
    padding_value=-1,
)

In [None]:
# Temporarily lift pandas' column-width limit so each `item_sequence` is shown in full.
with pd.option_context("display.max_colwidth", None):
    display(df_with_sequences[[args.user_col, "item_indice", "item_sequence"]])

Unnamed: 0,user_id,item_indice,item_sequence
49934,AFZ4EK2LJ655XQKTEUELCARO6RYA,4,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
180805,AFZ4EK2LJ655XQKTEUELCARO6RYA,4132,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 4]"
82123,AFY2C4YOUP2SSMM43HD2L3FIEFZA,36,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
212994,AFY2C4YOUP2SSMM43HD2L3FIEFZA,1859,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 36]"
226370,AHF3TGIOSTD2UCHF3MO4MIHFJ5NQ,4773,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
...,...,...,...
260847,AEEQZRQBOFHFBFPYBX2BZ5WOI33A,2441,"[3451, 3827, 1839, 1347, 2504, 2694, 4546, 4270, 369, 32]"
130454,AHLN6GKTKZE22AON34YAQXTGK63A,4772,"[2950, 1812, 4735, 4165, 4575, 2440, 607, 4807, 374, 3091]"
261325,AHLN6GKTKZE22AON34YAQXTGK63A,4303,"[1812, 4735, 4165, 4575, 2440, 607, 4807, 374, 3091, 4772]"
129956,AEMYBWDN67IB5IBTMHLHN76V4QHQ,4086,"[644, 3602, 4569, 1865, 3030, 3653, 3803, 3998, 285, 4720]"


In [None]:
# Check sample user
user_id = df_with_sequences.sample(n=1)[args.user_col].values[0]

(
    df_with_sequences.loc[lambda df: df[args.user_col].eq(user_id)]
    .sort_values(args.timestamp_col)[
        [args.user_col, args.timestamp_col, "item_indice", "item_sequence"]
    ]
    .head(10)
)

Unnamed: 0,user_id,timestamp,item_indice,item_sequence
212849,AFKBPVV3ZM2JWHPT4SE3PSVTUPLQ,2016-12-13 00:32:32.000,3084,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
81978,AFKBPVV3ZM2JWHPT4SE3PSVTUPLQ,2016-12-13 00:32:32.000,1059,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 3084]"
81979,AFKBPVV3ZM2JWHPT4SE3PSVTUPLQ,2017-12-23 04:02:53.547,2702,"[-1, -1, -1, -1, -1, -1, -1, -1, 3084, 1059]"
212850,AFKBPVV3ZM2JWHPT4SE3PSVTUPLQ,2017-12-23 04:02:53.547,4692,"[-1, -1, -1, -1, -1, -1, -1, 3084, 1059, 2702]"
81980,AFKBPVV3ZM2JWHPT4SE3PSVTUPLQ,2018-10-14 05:18:39.988,4644,"[-1, -1, -1, -1, -1, -1, 3084, 1059, 2702, 4692]"
212851,AFKBPVV3ZM2JWHPT4SE3PSVTUPLQ,2018-10-14 05:18:39.988,1610,"[-1, -1, -1, -1, -1, 3084, 1059, 2702, 4692, 4..."
212852,AFKBPVV3ZM2JWHPT4SE3PSVTUPLQ,2020-03-12 01:30:32.440,1000,"[-1, -1, -1, -1, 3084, 1059, 2702, 4692, 4644,..."
81981,AFKBPVV3ZM2JWHPT4SE3PSVTUPLQ,2020-03-12 01:30:32.440,3726,"[-1, -1, -1, 3084, 1059, 2702, 4692, 4644, 161..."
81982,AFKBPVV3ZM2JWHPT4SE3PSVTUPLQ,2020-03-28 22:52:44.756,3054,"[-1, -1, 3084, 1059, 2702, 4692, 4644, 1610, 1..."
212853,AFKBPVV3ZM2JWHPT4SE3PSVTUPLQ,2020-03-28 22:52:44.756,603,"[-1, 3084, 1059, 2702, 4692, 4644, 1610, 1000,..."


## Persist

In [None]:
# Separate the combined frame back into its two splits using the `source` tag,
# then drop that helper column from each result.
is_train = df_with_sequences["source"].eq("train")
is_val = df_with_sequences["source"].eq("val")

train_neg_df = df_with_sequences.loc[is_train].drop(columns="source")
val_neg_df = df_with_sequences.loc[is_val].drop(columns="source")

In [None]:
# Preview the first validation rows, now including the `item_sequence` column.
val_neg_df.head(3)

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
129191,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B004FV4ROA,1.0,2020-12-27 00:30:31.146,11295,528,"[1898, 3479, 3908, 1570, 91, 2723, 2962, 106, ..."
260062,AGSP5XAQPQBUUXZHEZSC65FD7NOQ,B07KFQFDNB,0.0,2020-12-27 00:30:31.146,11295,3503,"[3479, 3908, 1570, 91, 2723, 2962, 106, 3557, ..."
128040,AEHS7YR7BGGWMZS24H5UR5IP46HQ,B08F1P3BCC,2.0,2020-12-27 01:44:52.242,1784,3925,"[4319, 3382, 4330, 1173, 1330, 423, 2868, 3167..."


In [None]:
# Write the sequence-augmented splits to the interim data directory, without the index.
# NOTE(review): these target files are the same ones `Args.train_data_fp` and
# `Args.val_data_fp` default to, so this notebook overwrites the files it loads.
# Confirm which files are the intended inputs.
train_neg_df.to_parquet("../data_for_ai/interim/train_sample_interactions_16407u_neg_seq.parquet", index=False)
val_neg_df.to_parquet("../data_for_ai/interim/val_sample_interactions_16407u_neg_seq.parquet", index=False)