## Controller

In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules (e.g. the project code under ../src) are picked up before each cell
# runs, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import os
import sys
from datetime import timedelta
from typing import Optional

import pandas as pd
from feast import FeatureStore
from loguru import logger
from pydantic import BaseModel

sys.path.insert(0, "..")
from src.negative_sampling import generate_negative_samples
from src.utils.embedding_id_mapper import IDMapper

## Args

In [13]:
class Args(BaseModel):
    """Run configuration for this negative-sampling data-prep notebook.

    Construct with the defaults (or override individual fields) and then call
    :meth:`init` to resolve and create the per-run output directory.
    """

    # Test-run flag; not referenced by any cell visible in this notebook.
    testing: bool = False
    # Run identifier; `init` uses it as the name of the persist directory.
    run_name: str = "000-data-prep"
    # Absolute path of the per-run output directory. It stays None until
    # `init` fills it in, hence Optional: with a plain `str` annotation an
    # un-initialised instance serialises this field as null and that dump can
    # no longer be validated back into an Args.
    notebook_persist_dp: Optional[str] = None
    # Seed handed to the negative sampler and to the final shuffle.
    random_seed: int = 41

    # Column names of the interaction frames.
    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    # Input parquet files. The relative defaults are turned into absolute
    # paths against the working directory at the moment this class body runs.
    train_data_fp: str = os.path.abspath("../data_for_ai/interim/train_sample_interactions_16407u.parquet")
    val_data_fp: str = os.path.abspath("../data_for_ai/interim/val_sample_interactions_16407u.parquet")

    # Ratio of negatives to positives passed to the negative sampler.
    neg_to_pos_ratio: int = 1

    def init(self):
        """Resolve and create the run's persist directory.

        Sets ``notebook_persist_dp`` to the absolute form of
        ``data/<run_name>`` (relative to the current working directory) and
        creates that directory if it does not exist yet.

        Returns:
            The same instance, so the call can be chained onto the constructor.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self


# Build the configuration, then let `init` fill in and create the output
# directory; `init` returns the very object it was called on.
args = Args()
args.init()

# Echo the resolved settings so the run is self-describing.
print(args.model_dump_json(indent=2))

{
  "testing": false,
  "run_name": "000-data-prep",
  "notebook_persist_dp": "/home/dinhln/Desktop/real_time_recsys/notebooks/data/000-data-prep",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "train_data_fp": "/home/dinhln/Desktop/real_time_recsys/data_for_ai/interim/train_sample_interactions_16407u.parquet",
  "val_data_fp": "/home/dinhln/Desktop/real_time_recsys/data_for_ai/interim/val_sample_interactions_16407u.parquet",
  "neg_to_pos_ratio": 1
}


## Test beforehand


In [6]:
# Toy interaction log for a quick smoke test of the sampler. Each tuple is
# (user index, item index, rating, timestamp); extend the list as needed.
interactions = [
    # user 1
    (1, 101, 1, 1), (1, 102, 2, 2), (1, 103, 3, 4),
    # user 2
    (2, 101, 4, 1), (2, 104, 5, 2),
    # user 3
    (3, 105, 1, 1), (3, 106, 2, 5),
]

# Turn the tuples into a DataFrame; the rating and timestamp columns take the
# names configured in `args`.
df = pd.DataFrame.from_records(
    interactions,
    columns=["user_indice", "item_indice", args.rating_col, args.timestamp_col],
)

df

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,101,1,1
1,1,102,2,2
2,1,103,3,4
3,2,101,4,1
4,2,104,5,2
5,3,105,1,1
6,3,106,2,5


In [11]:
# Smoke-test the sampler on the toy frame. The seed is passed explicitly, as
# in the full-data call further down, instead of relying on whatever default
# the sampler uses, so the toy negatives are reproducible across kernel
# restarts.
neg_df = generate_negative_samples(
    df, seed=args.random_seed, neg_to_pos_ratio=1
)

  0%|          | 0/7 [00:00<?, ?it/s]

In [12]:
# Display the negatives sampled for the toy frame.
neg_df

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,104,0,1
1,1,105,0,2
2,1,105,0,4
3,2,103,0,1
4,2,106,0,2
5,3,101,0,1
6,3,102,0,5


## Load data

In [15]:
# Read the train and validation interaction samples named in `args`
# (train first, then validation).
train_df, val_df = (
    pd.read_parquet(fp) for fp in (args.train_data_fp, args.val_data_fp)
)
# Presumably the location of the saved ID-mapper JSON; it is not consumed by
# any cell shown in this notebook.
idm_path = os.path.abspath("../data_for_ai/interim/idm_16407u.json")

In [16]:
# Sanity check on the temporal split: the earliest validation interaction
# must be strictly later than the latest training interaction.
train_end = train_df[args.timestamp_col].max()
assert train_end < val_df[args.timestamp_col].min()
# One second past the end of the training window, logged for reference.
val_timestamp = train_end + timedelta(seconds=1)
logger.info(f"{val_timestamp=}")

[32m2025-04-16 22:29:27.181[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mval_timestamp=Timestamp('2020-12-26 23:06:04.454000')[0m


In [17]:
# Stack both splits into one frame, tagging every row with the split it came
# from so the frame can be separated again after negative sampling.
full_df = pd.concat(
    [
        frame.assign(subset=label)
        for label, frame in (("train", train_df), ("val", val_df))
    ],
    axis=0,
)
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,subset
3194,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B06XKCPK5W,2.0,2012-06-11 16:41:10.000,train
3199,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B000CKVOOY,3.0,2012-08-02 02:04:13.000,train
3200,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B006GWO5WK,5.0,2012-09-15 16:34:46.000,train
3204,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B008LURQ76,5.0,2013-01-03 23:08:45.000,train
3208,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B00AQRUW4Q,4.0,2013-05-06 01:24:39.000,train
...,...,...,...,...,...
33760091,AHIIISHZP6YAVVHMDEBLJ5CWZ7ZA,B0BZ62FQ13,3.0,2021-07-16 17:08:55.044,val
34470392,AFTE3G43QHXWD3DJGDCI2DHEWQJQ,B08DMXDPW5,5.0,2021-01-14 01:48:09.423,val
35019360,AFENZZDPVUYFVBS47YDOWJCDYBSQ,B09XBT6DS9,4.0,2021-12-05 00:35:40.874,val
35323250,AFMBZYPDAXT5VO3ME67HW5Q5TAOQ,B097KBF8JK,5.0,2022-02-18 11:32:46.732,val


In [21]:
# Sample negatives for the combined train+val frame using the configured
# user/item columns, seed and negative-to-positive ratio.
# NOTE: this rebinds `neg_df`, replacing the toy result from the smoke test.
neg_df = generate_negative_samples(
    full_df,
    args.user_col,
    args.item_col,
    seed = args.random_seed,
    neg_to_pos_ratio=args.neg_to_pos_ratio,
    # Presumably the extra columns copied onto each sampled row; the recorded
    # output shows `subset` on the negatives. Confirm in
    # src/negative_sampling.
    features= ["subset"]
)

  0%|          | 0/130871 [00:00<?, ?it/s]

In [22]:
# Display the negatives sampled for the combined train+val frame.
neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,subset
3194,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B003XN3BZC,0,2012-06-11 16:41:10.000,train
3199,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B01MY48T0O,0,2012-08-02 02:04:13.000,train
3200,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B07X6KGF9R,0,2012-09-15 16:34:46.000,train
3204,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B00U43F99A,0,2013-01-03 23:08:45.000,train
3208,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B07Y2L7CYF,0,2013-05-06 01:24:39.000,train
...,...,...,...,...,...
33760091,AHIIISHZP6YAVVHMDEBLJ5CWZ7ZA,B00LWHUBPO,0,2021-07-16 17:08:55.044,val
34470392,AFTE3G43QHXWD3DJGDCI2DHEWQJQ,B07961C64Q,0,2021-01-14 01:48:09.423,val
35019360,AFENZZDPVUYFVBS47YDOWJCDYBSQ,B008GAT8EU,0,2021-12-05 00:35:40.874,val
35323250,AFMBZYPDAXT5VO3ME67HW5Q5TAOQ,B09G3MBH6V,0,2022-02-18 11:32:46.732,val


In [23]:
# Append the sampled negatives to the observed interactions, renumber the
# rows from zero, then shuffle the whole frame with the configured seed.
full_neg_df = pd.concat([full_df, neg_df], axis=0, ignore_index=True).sample(
    frac=1, replace=False, random_state=args.random_seed
)

In [24]:
# Display the shuffled mix of observed interactions and sampled negatives.
full_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,subset
151343,AENOXSRSNC5VGY3JQKZQ5DD7HIUA,B00PKKM2HO,0.0,2017-06-10 00:30:32.698,train
40958,AEMPVT2U6BIHQDV52BDEDDKPH4HA,B01BCWKBZI,2.0,2017-08-03 00:40:30.172,train
218918,AF3CKYP3BTJ7MEKU6J64BS57MQBA,B002HWRJBM,0.0,2018-12-08 16:57:03.101,train
43115,AE7IGXXTK7XTWRJGLIAL5BJDTEAQ,B005L38VRU,5.0,2014-09-04 02:03:39.000,train
233421,AFEJ5GRYG2PQD6EWSAKVG56XMKNA,B00JO80LUI,0.0,2016-09-14 16:29:39.000,train
...,...,...,...,...,...
250960,AGFRYVIF7CVPOK777KN3PSOSWSMA,B086QGXBRW,0.0,2013-12-17 03:19:23.000,train
217058,AGMAUSEXCG2JEGI245KGJJYHOWBQ,B0BGS23YKX,0.0,2019-04-30 00:21:36.489,train
61324,AGGEMMEOSRGTGESZ56F7ESETFRHQ,B00U3FPN4U,5.0,2017-07-22 01:23:03.787,train
132003,AGU6EIWIZSV6AIQSAVRDHTIJCHPA,B003PEUA30,0.0,2020-10-27 16:09:49.492,train


In [25]:
# Separate the shuffled frame back into its two splits; the helper `subset`
# label is dropped once the rows have been routed.
train_neg_df = full_neg_df.loc[full_neg_df["subset"].eq("train")].drop(columns="subset")
val_neg_df = full_neg_df.loc[full_neg_df["subset"].eq("val")].drop(columns="subset")

## Persist

In [28]:
# Write each augmented split next to its source file, inserting a `_neg`
# suffix before the extension. Deriving the targets from the configured input
# paths (instead of repeating the file names as literals) keeps outputs in
# step with the inputs; with the default config these are the same
# `*_16407u_neg.parquet` files as before.
train_neg_fp = "{}_neg{}".format(*os.path.splitext(args.train_data_fp))
val_neg_fp = "{}_neg{}".format(*os.path.splitext(args.val_data_fp))

train_neg_df.to_parquet(train_neg_fp)
val_neg_df.to_parquet(val_neg_fp)