## Controller

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
from datetime import timedelta

import pandas as pd
from feast import FeatureStore
from loguru import logger
from pydantic import BaseModel

sys.path.insert(0, "..")
from src.utils.embedding_id_mapper import IDMapper 
from src.negative_sampling import generate_negative_samples

## Args

In [3]:
class Args(BaseModel):
    testing: bool = False
    run_name: str = "000-data-prep"
    notebook_persist_dp: str = None
    random_seed: int = 41

    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    train_data_fp: str = os.path.abspath("../data_for_ai/interim/train_sample_interactions_16407u_seq.parquet")
    val_data_fp: str = os.path.abspath("../data_for_ai/interim/val_sample_interactions_16407u_seq.parquet")

    neg_to_pos_ratio: int = 1

    def init(self):
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self


args = Args().init()

print(args.model_dump_json(indent=2))

{
  "testing": false,
  "run_name": "000-data-prep",
  "notebook_persist_dp": "/home/dinhln/Desktop/real_time_recsys/notebooks/data/000-data-prep",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "train_data_fp": "/home/dinhln/Desktop/real_time_recsys/data_for_ai/interim/train_sample_interactions_16407u_seq.parquet",
  "val_data_fp": "/home/dinhln/Desktop/real_time_recsys/data_for_ai/interim/val_sample_interactions_16407u_seq.parquet",
  "neg_to_pos_ratio": 1
}


## Test beforhand


In [4]:
# Sample input: List of (user_id, item_id) interactions
interactions = [
    (1, 101, 1, 1),
    (1, 102, 2, 2),
    (1, 103, 3, 4),
    (2, 101, 4, 1),
    (2, 104, 5, 2),
    (3, 105, 1, 1),
    (3, 106, 2, 5),
    # Add more interactions as needed
]

# Convert the list to a DataFrame for easier manipulation
df = pd.DataFrame(
    interactions,
    columns=["user_indice", "item_indice", args.rating_col, args.timestamp_col],
)

df

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,101,1,1
1,1,102,2,2
2,1,103,3,4
3,2,101,4,1
4,2,104,5,2
5,3,105,1,1
6,3,106,2,5


In [5]:
neg_df = generate_negative_samples(df, neg_to_pos_ratio=1)

  0%|          | 0/7 [00:00<?, ?it/s]

In [6]:
neg_df

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,106,0,1
1,1,105,0,2
2,1,106,0,4
3,2,102,0,1
4,2,103,0,2
5,3,103,0,1
6,3,103,0,5


## Load data

In [7]:
train_df = pd.read_parquet(args.train_data_fp)
val_df = pd.read_parquet(args.val_data_fp)
idm_path = os.path.abspath("../data_for_ai/interim/idm_16407u.json")

In [8]:
assert val_df[args.timestamp_col].min() > train_df[args.timestamp_col].max()
val_timestamp = train_df[args.timestamp_col].max() + timedelta(seconds=1)
logger.info(f"{val_timestamp=}")

[32m2025-06-17 17:10:10.928[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mval_timestamp=Timestamp('2020-12-26 23:06:04.454000')[0m


In [9]:
full_df = pd.concat([
    train_df.assign(subset="train"), val_df.assign(subset="val")], axis=0)
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence,subset
0,AFZ4EK2LJ655XQKTEUELCARO6RYA,B00002EQCW,4.0,2003-01-23 03:28:15.000,8071,4,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",train
1,AFY2C4YOUP2SSMM43HD2L3FIEFZA,B00008SCFL,5.0,2003-11-25 18:12:09.000,7935,36,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",train
2,AHF3TGIOSTD2UCHF3MO4MIHFJ5NQ,B07KQWX947,5.0,2004-06-18 02:02:57.000,13705,3514,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",train
3,AH5Z47PJ5RTSUL2RLCO2QITGIT4Q,B00008SCFL,4.0,2004-09-13 20:18:44.000,12730,36,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",train
4,AEX3L4NKDESOCGWOFNF63GRFGXCA,B00WUI8JN0,5.0,2004-10-22 14:26:12.000,3735,2103,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",train
...,...,...,...,...,...,...,...,...
3474,AEKUF6AOVWDWFYOKPWO2CV72PEDQ,B07QN33986,5.0,2022-02-19 01:32:51.519,2171,3626,"[2627, 4216, 4743, 1945, 2355, 1831, 951, 4461...",val
3475,AFBTD25HPE4BE4LUFV3DTI2E2N2A,B07TMJ8S5Z,5.0,2022-02-19 16:49:57.966,5159,3699,"[2260, 3517, 3609, 3495, 3625, 4079, -1, -1, -...",val
3476,AEEQZRQBOFHFBFPYBX2BZ5WOI33A,B00007KDX6,5.0,2022-02-19 16:56:53.030,1396,32,"[99, 1265, 3514, 2271, 3451, 3827, 2504, 4546,...",val
3477,AHLN6GKTKZE22AON34YAQXTGK63A,B0C682GZ5X,5.0,2022-02-19 17:28:55.519,14550,4772,"[1812, 4165, 4575, 4807, 374, -1, -1, -1, -1, -1]",val


In [10]:
neg_df = generate_negative_samples(
    full_df,
    args.user_col,
    args.item_col,
    seed = args.random_seed,
    neg_to_pos_ratio=args.neg_to_pos_ratio,
    features= ["subset", "item_sequence", "user_indice", "item_indice"],
)

  0%|          | 0/130871 [00:00<?, ?it/s]

In [11]:
neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,subset,item_sequence,user_indice,item_indice
0,AFZ4EK2LJ655XQKTEUELCARO6RYA,B00RE1UL52,0,2003-01-23 03:28:15.000,train,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",8071,4
1,AFY2C4YOUP2SSMM43HD2L3FIEFZA,B002SBB0SU,0,2003-11-25 18:12:09.000,train,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",7935,36
2,AHF3TGIOSTD2UCHF3MO4MIHFJ5NQ,B011BRUOMO,0,2004-06-18 02:02:57.000,train,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",13705,3514
3,AH5Z47PJ5RTSUL2RLCO2QITGIT4Q,B087D9VRGC,0,2004-09-13 20:18:44.000,train,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",12730,36
4,AEX3L4NKDESOCGWOFNF63GRFGXCA,B006TZM6XO,0,2004-10-22 14:26:12.000,train,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",3735,2103
...,...,...,...,...,...,...,...,...
3474,AEKUF6AOVWDWFYOKPWO2CV72PEDQ,B0791TX5P5,0,2022-02-19 01:32:51.519,val,"[2627, 4216, 4743, 1945, 2355, 1831, 951, 4461...",2171,3626
3475,AFBTD25HPE4BE4LUFV3DTI2E2N2A,B09RD36X6L,0,2022-02-19 16:49:57.966,val,"[2260, 3517, 3609, 3495, 3625, 4079, -1, -1, -...",5159,3699
3476,AEEQZRQBOFHFBFPYBX2BZ5WOI33A,B011U6ZIPO,0,2022-02-19 16:56:53.030,val,"[99, 1265, 3514, 2271, 3451, 3827, 2504, 4546,...",1396,32
3477,AHLN6GKTKZE22AON34YAQXTGK63A,B00FB50SBU,0,2022-02-19 17:28:55.519,val,"[1812, 4165, 4575, 4807, 374, -1, -1, -1, -1, -1]",14550,4772


In [16]:
full_neg_df = (
    pd.concat([full_df, neg_df], axis=0)
    .reset_index(drop=True)
    .sample(frac=1, replace = False, random_state=args.random_seed))

In [17]:
full_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence,subset
151343,AEEV5YWQKPBTLFWHKOBBULYA2RDQ,B09V1FT19S,0.0,2014-07-17 19:15:55.000,1412,1047,"[4559.0, 4443.0, 3164.0, -1.0, -1.0, -1.0, -1....",train
40958,AF7KZV4NJ5GBDVFTB7PEEUN4U53A,B0BBMLD8QT,5.0,2015-07-29 20:38:06.000,4871,4476,"[1924.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -...",train
218918,AFVQ4K4KZPLQ3E2VFYSGX6HFXGNQ,B00CIOA89E,0.0,2017-12-13 20:35:02.334,7616,4465,"[1293.0, 1728.0, 445.0, -1.0, -1.0, -1.0, -1.0...",train
43115,AFCLWJMGYFCOJQR7T4454OF5A5WA,B00ENFP224,5.0,2015-09-06 12:09:59.000,5250,1355,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",train
233421,AFP4PHJ6Q2RRXLDPSDSH6VXJRUTA,B081K9C3Q1,0.0,2018-11-23 09:44:21.734,6792,3320,"[1055.0, 3572.0, 3865.0, 1761.0, 1591.0, 3889....",train
...,...,...,...,...,...,...,...,...
250960,AGQHC7YNLYP4QV2PSBD6URSMJSVA,B07193SG3D,0.0,2020-02-08 04:09:50.457,11001,3454,"[3585.0, 1866.0, 4040.0, 4539.0, 2253.0, 4568....",train
217058,AHD65JAOVTTPDNJWOLSSGS3QVK6Q,B08KZ1TZYB,0.0,2017-11-02 15:25:18.351,13410,3365,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",train
61324,AF32PWYNLPCVAU4UX35IEAZOFA3Q,B011BRUOMO,5.0,2016-07-18 05:42:21.000,4264,2253,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",train
132003,AGM65FYYAPHOLESGIDMFMPUQIYNA,B01N9K9XOF,0.0,2010-12-16 19:59:19.000,10445,183,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",train


In [18]:
# split back to train and val
train_neg_df = full_neg_df[full_neg_df["subset"] == "train"].drop(columns=["subset"])
val_neg_df = full_neg_df[full_neg_df["subset"] == "val"].drop(columns=["subset"])

## Persit

In [20]:
train_neg_df.to_parquet("../data_for_ai/interim/train_sample_interactions_16407u_neg_seq.parquet")
val_neg_df.to_parquet("../data_for_ai/interim/val_sample_interactions_16407u_neg_seq.parquet")