## Controller

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
from datetime import timedelta

import pandas as pd
from feast import FeatureStore
from loguru import logger
from pydantic import BaseModel

sys.path.insert(0, "..")
from src.utils.embedding_id_mapper import IDMapper 
from src.negative_sampling import generate_negative_samples

## Args

In [3]:
class Args(BaseModel):
    testing: bool = False
    run_name: str = "000-data-prep"
    notebook_persist_dp: str = None
    random_seed: int = 41

    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    train_data_fp: str = os.path.abspath("../data_for_ai/interim/train_sample_interactions_16407u_seq.parquet")
    val_data_fp: str = os.path.abspath("../data_for_ai/interim/val_sample_interactions_16407u_seq.parquet")

    neg_to_pos_ratio: int = 2

    def init(self):
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self


args = Args().init()

print(args.model_dump_json(indent=2))

{
  "testing": false,
  "run_name": "000-data-prep",
  "notebook_persist_dp": "/home/dinhln/Desktop/real_time_recsys/notebooks/data/000-data-prep",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "train_data_fp": "/home/dinhln/Desktop/real_time_recsys/data_for_ai/interim/train_sample_interactions_16407u_seq.parquet",
  "val_data_fp": "/home/dinhln/Desktop/real_time_recsys/data_for_ai/interim/val_sample_interactions_16407u_seq.parquet",
  "neg_to_pos_ratio": 2
}


## Test beforhand


In [4]:
# Sample input: List of (user_id, item_id) interactions
interactions = [
    (1, 101, 1, 1),
    (1, 102, 2, 2),
    (1, 103, 3, 4),
    (2, 101, 4, 1),
    (2, 104, 5, 2),
    (3, 105, 1, 1),
    (3, 106, 2, 5),
    # Add more interactions as needed
]

# Convert the list to a DataFrame for easier manipulation
df = pd.DataFrame(
    interactions,
    columns=["user_indice", "item_indice", args.rating_col, args.timestamp_col],
)

df

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,101,1,1
1,1,102,2,2
2,1,103,3,4
3,2,101,4,1
4,2,104,5,2
5,3,105,1,1
6,3,106,2,5


In [5]:
neg_df = generate_negative_samples(df, neg_to_pos_ratio=1)

  0%|          | 0/7 [00:00<?, ?it/s]

In [6]:
neg_df

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,106,0,1
1,1,105,0,2
2,1,104,0,4
3,2,103,0,1
4,2,103,0,2
5,3,101,0,1
6,3,103,0,5


## Load data

In [7]:
train_df = pd.read_parquet(args.train_data_fp)
val_df = pd.read_parquet(args.val_data_fp)
idm_path = os.path.abspath("../data_for_ai/interim/idm_16407u.json")

In [8]:
assert val_df[args.timestamp_col].min() > train_df[args.timestamp_col].max()
val_timestamp = train_df[args.timestamp_col].max() + timedelta(seconds=1)
logger.info(f"{val_timestamp=}")

[32m2025-06-17 16:47:51.675[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mval_timestamp=Timestamp('2020-12-26 23:06:04.454000')[0m


In [9]:
full_df = pd.concat([
    train_df.assign(subset="train"), val_df.assign(subset="val")], axis=0)
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence,subset
0,AFZ4EK2LJ655XQKTEUELCARO6RYA,B00002EQCW,4.0,2003-01-23 03:28:15.000,8071,4,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",train
1,AFY2C4YOUP2SSMM43HD2L3FIEFZA,B00008SCFL,5.0,2003-11-25 18:12:09.000,7935,36,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",train
2,AHF3TGIOSTD2UCHF3MO4MIHFJ5NQ,B07KQWX947,5.0,2004-06-18 02:02:57.000,13705,3514,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",train
3,AH5Z47PJ5RTSUL2RLCO2QITGIT4Q,B00008SCFL,4.0,2004-09-13 20:18:44.000,12730,36,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",train
4,AEX3L4NKDESOCGWOFNF63GRFGXCA,B00WUI8JN0,5.0,2004-10-22 14:26:12.000,3735,2103,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",train
...,...,...,...,...,...,...,...,...
3474,AEKUF6AOVWDWFYOKPWO2CV72PEDQ,B07QN33986,5.0,2022-02-19 01:32:51.519,2171,3626,"[2627, 4216, 4743, 1945, 2355, 1831, 951, 4461...",val
3475,AFBTD25HPE4BE4LUFV3DTI2E2N2A,B07TMJ8S5Z,5.0,2022-02-19 16:49:57.966,5159,3699,"[2260, 3517, 3609, 3495, 3625, 4079, -1, -1, -...",val
3476,AEEQZRQBOFHFBFPYBX2BZ5WOI33A,B00007KDX6,5.0,2022-02-19 16:56:53.030,1396,32,"[99, 1265, 3514, 2271, 3451, 3827, 2504, 4546,...",val
3477,AHLN6GKTKZE22AON34YAQXTGK63A,B0C682GZ5X,5.0,2022-02-19 17:28:55.519,14550,4772,"[1812, 4165, 4575, 4807, 374, -1, -1, -1, -1, -1]",val


In [10]:
neg_df = generate_negative_samples(
    full_df,
    args.user_col,
    args.item_col,
    seed = args.random_seed,
    neg_to_pos_ratio=args.neg_to_pos_ratio,
    features= ["subset", "item_sequence", "user_indice", "item_indice"],
)

  0%|          | 0/130871 [00:00<?, ?it/s]

In [16]:
neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,subset,item_sequence,user_indice,item_indice
0,AFZ4EK2LJ655XQKTEUELCARO6RYA,B09XWXHRJP,0,2003-01-23 03:28:15.000,train,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",8071,4
0,AFZ4EK2LJ655XQKTEUELCARO6RYA,B01JFSW0UK,0,2003-01-23 03:28:15.000,train,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",8071,4
1,AFY2C4YOUP2SSMM43HD2L3FIEFZA,B0CCQHH8Z6,0,2003-11-25 18:12:09.000,train,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",7935,36
1,AFY2C4YOUP2SSMM43HD2L3FIEFZA,B00P28VN38,0,2003-11-25 18:12:09.000,train,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",7935,36
2,AHF3TGIOSTD2UCHF3MO4MIHFJ5NQ,B06XKPQ6YZ,0,2004-06-18 02:02:57.000,train,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",13705,3514
...,...,...,...,...,...,...,...,...
3476,AEEQZRQBOFHFBFPYBX2BZ5WOI33A,B07925B8PD,0,2022-02-19 16:56:53.030,val,"[99, 1265, 3514, 2271, 3451, 3827, 2504, 4546,...",1396,32
3477,AHLN6GKTKZE22AON34YAQXTGK63A,B08N1DF9BX,0,2022-02-19 17:28:55.519,val,"[1812, 4165, 4575, 4807, 374, -1, -1, -1, -1, -1]",14550,4772
3477,AHLN6GKTKZE22AON34YAQXTGK63A,B0C1CPTMG9,0,2022-02-19 17:28:55.519,val,"[1812, 4165, 4575, 4807, 374, -1, -1, -1, -1, -1]",14550,4772
3478,AEMYBWDN67IB5IBTMHLHN76V4QHQ,B00G05A2MU,0,2022-02-19 22:08:53.253,val,"[2677, 1610, 2694, 3695, 4429, 3602, 4569, 365...",2446,4086


In [17]:
full_neg_df = (
    pd.concat([full_df, neg_df], axis=0)
    .reset_index(drop=True)
    .sample(frac=1, replace = False, random_state=args.random_seed))

In [18]:
full_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence,subset
266890,AETCSNOCHP5YL3C67YWXHB54RCNA,B00KXTZ3BE,0.0,2016-11-18 05:32:16.000,3257,4179,"[142.0, 467.0, 1465.0, -1.0, -1.0, -1.0, -1.0,...",train
183685,AHUDTHSVE5XAFVPOHHBSPL7T7TNA,B07BSVLHYD,0.0,2014-11-23 14:06:52.000,15687,1113,"[3375.0, 3642.0, 954.0, 4266.0, 928.0, 626.0, ...",train
308523,AGYLXGPK4SKKQZ2OV3LXYGHFYM2A,B01195G5FS,0.0,2017-12-31 06:21:06.077,12067,3896,"[4458.0, 1570.0, 4749.0, 3372.0, 2006.0, -1.0,...",train
99169,AENO3VFOX37KYSKS6FQOXWSHR74Q,B078CQSHDV,5.0,2018-08-06 20:46:04.598,2540,3168,"[650.0, 2326.0, 3089.0, 2787.0, 2393.0, 2717.0...",train
64744,AHGMLXGDYDPBY3MIJZPHUAOVZFVQ,B08NW1RXPF,5.0,2016-09-14 05:18:06.000,13900,3988,"[820.0, 2217.0, 4615.0, -1.0, -1.0, -1.0, -1.0...",train
...,...,...,...,...,...,...,...,...
250960,AFV5AGWZ6DBOVMW46FMMI2NSQCVA,B0C4T2HF5W,0.0,2016-06-23 01:00:28.000,7553,1867,"[843.0, 1601.0, 4169.0, 1691.0, 4546.0, 1814.0...",train
217058,AFSFLSMTEC4M3I6EBPF72B4PLEPQ,B0BBMN8HMM,0.0,2015-09-05 23:31:51.000,7208,504,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",train
323468,AEKDV6VVNKWG2WIXFOVCJBRORELQ,B09SXP5VB5,0.0,2018-05-27 21:31:44.724,2106,4692,"[667.0, 856.0, 984.0, 3151.0, 1499.0, 3774.0, ...",train
132003,AEXSP5TFL7PDBHBCWOSLKXXMCMNA,B0BD7FN8K9,0.0,2009-12-29 14:04:58.000,3839,591,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",train


In [19]:
# split back to train and val
train_neg_df = full_neg_df[full_neg_df["subset"] == "train"].drop(columns=["subset"])
val_neg_df = full_neg_df[full_neg_df["subset"] == "val"].drop(columns=["subset"])

## Persit

In [20]:
train_neg_df.to_parquet("../data_for_ai/interim/train_sample_interactions_16407u_neg_2.parquet")
val_neg_df.to_parquet("../data_for_ai/interim/val_sample_interactions_16407u_neg_2.parquet")