In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to imported code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2


In [6]:
import os
import sys
from functools import partial

import numpy as np
import plotly.express as px
from loguru import logger
from pydantic import BaseModel, model_validator
from load_dotenv import load_dotenv
import pandas as pd

from sqlalchemy import create_engine
from feast import FeatureStore
import pandas as pd

sys.path.insert(0, "..")

from src.utils.split_time_based import train_test_split_timebased
from src.utils.embedding_id_mapper import IDMapper
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

_ = load_dotenv(override=True)

## Controller

In [5]:
class Args(BaseModel):
    """Run configuration for the data-preparation notebook.

    Holds the run metadata, the column names of the interaction table, the
    sampling thresholds and the input/output locations. The two ``*_persit_path``
    fields start as ``None`` and are filled in by :meth:`init`.
    """

    run_name: str = "000-data-prep"
    run_description: str = "Splitting data into train, val, test sets, then sampling data for quick iteration"
    testing: bool = False  # when True, init() does not create any directory
    # NOTE(review): the two path fields default to None although annotated `str`;
    # they only hold a string after init() has been called.
    sample_data_persit_path: str = None    # path of the sampled data: train, test and val
    notebook_persit_path: str = None    # path of the notebook
    random_seed: int = 41

    # Column names of the interaction table
    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    # Sampling target and minimum-interaction thresholds
    sample_users: int = 5000
    min_user_interactions: int = 5
    min_item_interactions: int = 10

    val_num_days: int = 15
    test_num_days: int = 30

    rating_dataset_path: str = os.path.abspath("../data_for_ai/raw/amz_raw_rating.parquet")

    def init(self):
        """Resolve the output locations and create them on disk.

        Sets ``sample_data_persit_path`` and ``notebook_persit_path`` to absolute
        paths (relative to the current working directory) and, unless
        ``testing`` is True, creates both directories if they are missing.

        Returns:
            The same ``Args`` instance, so the call can be chained.
        """
        # Plain literal: the path has no placeholders, so no f-string is needed.
        self.sample_data_persit_path = os.path.abspath("../data_for_ai/interim")
        self.notebook_persit_path = os.path.abspath(f"./data/{self.run_name}")
        if not self.testing:
            os.makedirs(self.sample_data_persit_path, exist_ok=True)
            os.makedirs(self.notebook_persit_path, exist_ok=True)

        return self


# Build the run configuration, resolve its output paths, then echo it for the record.
args = Args()
args.init()

print(args.model_dump_json(indent=2))

{
  "run_name": "000-data-prep",
  "run_description": "Splitting data into train, val, test sets, then sampling data for quick iteration",
  "testing": false,
  "sample_data_persit_path": "/home/dinhln/Desktop/real_time_recsys/data_for_ai/interim",
  "notebook_persit_path": "/home/dinhln/Desktop/real_time_recsys/notebooks/data/000-data-prep",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "sample_users": 5000,
  "min_user_interactions": 5,
  "min_item_interactions": 10,
  "val_num_days": 15,
  "test_num_days": 30,
  "rating_dataset_path": "/home/dinhln/Desktop/real_time_recsys/data_for_ai/raw/amz_raw_rating.parquet"
}


## Load data from a specific period in order to train the model

In notebook 002-simulate-oltp, we saw that the period from March 2020 to September 2020 is a good choice: users and items interact actively during this period, and it lets us preserve recency. We will therefore load the data from this period to train the model.

In [7]:
# Load the raw ratings table from the parquet file configured in `args`
full_df = pd.read_parquet(args.rating_dataset_path)
print(f"DataFrame shape: {full_df.shape}")


DataFrame shape: (43334103, 4)


In [8]:
# Preview the first rows of the loaded ratings table
full_df.head()

Unnamed: 0,user_id,parent_asin,rating,timestamp
0,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,B01G8JO5F2,5.0,2018-04-07 09:23:37.534
1,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,B07N69T6TM,1.0,2020-06-20 18:42:29.731
2,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,B083NRGZMM,3.0,2022-07-18 22:58:37.948
3,AGGZ357AO26RQZVRLGU4D4N52DZQ,B001OC5JKY,5.0,2010-11-20 18:41:35.000
4,AG2L7H23R5LLKDKLBEF2Q3L2MVDA,B07CJYMRWM,5.0,2023-02-17 02:39:41.238


In [9]:
# Split train, val, test
# Time-based train/val/test split. Column names come from the run configuration
# so this call stays consistent with every other cell that reads `args`.
# NOTE(review): args.val_num_days / args.test_num_days are not passed here —
# confirm whether train_test_split_timebased should receive them.
train_df, val_df, test_df = train_test_split_timebased(
    full_df,
    user_id_col=args.user_col,
    item_id_col=args.item_col,
    timestamp_col=args.timestamp_col,
)

[32m2025-03-26 22:56:01.702[0m | [1mINFO    [0m | [36msrc.utils.split_time_based[0m:[36mtrain_test_split_timebased[0m:[36m26[0m - [1mRemoving users from val and test sets...[0m
[32m2025-03-26 22:56:25.051[0m | [1mINFO    [0m | [36msrc.utils.split_time_based[0m:[36mtrain_test_split_timebased[0m:[36m37[0m - [1mRemoved 25251 users from val set[0m
[32m2025-03-26 22:56:25.054[0m | [1mINFO    [0m | [36msrc.utils.split_time_based[0m:[36mtrain_test_split_timebased[0m:[36m40[0m - [1mRemoved 43696 users from test set[0m
[32m2025-03-26 22:56:25.055[0m | [1mINFO    [0m | [36msrc.utils.split_time_based[0m:[36mtrain_test_split_timebased[0m:[36m43[0m - [1mTrain set has 18201042 users[0m
[32m2025-03-26 22:56:25.057[0m | [1mINFO    [0m | [36msrc.utils.split_time_based[0m:[36mtrain_test_split_timebased[0m:[36m44[0m - [1mVal set has 8908 users[0m
[32m2025-03-26 22:56:25.060[0m | [1mINFO    [0m | [36msrc.utils.split_time_based[0m:[36mtrain

In [10]:
# Chronology check: each split must end strictly before the next one begins.
train_end = train_df[args.timestamp_col].max()
val_start = val_df[args.timestamp_col].min()
assert train_end < val_start, "There are overlapping timestamps between train and validation datasets."

val_end = val_df[args.timestamp_col].max()
test_start = test_df[args.timestamp_col].min()
assert val_end < test_start, "There are overlapping timestamps between validation and test datasets."

In [11]:
# Log the (rows, columns) shape of each split
logger.info(f"Train: {train_df.shape}, Val: {val_df.shape}, Test: {test_df.shape}")

[32m2025-03-26 22:56:36.187[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mTrain: (43225743, 4), Val: (10406, 4), Test: (13108, 4)[0m


## Sampling data

Simply sampling X users at random does not guarantee that the resulting dataset satisfies the **richness** conditions (a minimum number of interactions per user and per item). Instead, we take an iterative approach: we gradually drop random users from the dataset while keeping an eye on those conditions and on our sampling target.

In [12]:
def remove_random_users(df, k=10, seed=None):
    """Return `df` without the interactions of `k` randomly chosen users.

    Args:
        df: Interaction frame containing the `args.user_col` column.
        k: Number of distinct users to drop. Must not exceed the number of
            distinct users in `df`.
        seed: Seed for the draw. Defaults to `args.random_seed`, so repeated
            calls on the same set of users remove the same users.

    Returns:
        A new DataFrame holding only the rows of the users that were kept.

    Raises:
        ValueError: if `k` is larger than the number of distinct users
            (raised by the sampling call, since `replace=False`).
    """
    users = df[args.user_col].unique()
    # Use a private generator instead of re-seeding NumPy's global RNG: the draw is
    # identical to `np.random.seed(seed); np.random.choice(...)`, but the global
    # random state seen by the rest of the notebook is left untouched.
    rng = np.random.RandomState(args.random_seed if seed is None else seed)
    to_remove_users = rng.choice(users, size=k, replace=False)
    return df.loc[~df[args.user_col].isin(to_remove_users)]


def get_unqualified(df, col: str, threshold: int):
    """Return the values of `col` that occur in fewer than `threshold` rows of `df`.

    The result is the pandas Index of the offending group keys.
    """
    counts = df.groupby(col).size()
    too_few = counts < threshold
    return counts[too_few].index


# Pre-bound helper: users with fewer than args.min_user_interactions rows
get_unqualified_users = partial(
    get_unqualified, col=args.user_col, threshold=args.min_user_interactions
)
# Pre-bound helper: items with fewer than args.min_item_interactions rows
get_unqualified_items = partial(
    get_unqualified, col=args.item_col, threshold=args.min_item_interactions
)

In [14]:
# --- Sampling parameters ---------------------------------------------------
buffer_perc = 0.2  # stop once the user count is within this margin above args.sample_users
perc_users_removed_each_round = 0.01  # fraction of the remaining users dropped per outer round
min_val_records = 3000  # stop as soon as the sampled validation set has fewer rows than this
min_test_records = 3000  # stop as soon as the sampled test set has fewer rows than this
debug = True  # log every pruning iteration
keep_random_removing = True
r = 1  # outer (random-removal) round counter

# Every step below derives a new frame through `.loc` filtering and never writes
# into `sample_df` in place, so the full training frame does not need to be copied.
sample_df = train_df

while keep_random_removing:
    # 1) Drop a random slice of the remaining users. Always remove at least one user
    #    while any remain, otherwise a 0-user round could repeat forever once fewer
    #    than 1 / perc_users_removed_each_round users are left.
    num_remaining_users = sample_df[args.user_col].nunique()
    num_users_removed_each_round = min(
        num_remaining_users,
        max(1, int(perc_users_removed_each_round * num_remaining_users)),
    )
    print(
        f"\n\nRandomly removing {num_users_removed_each_round} users - Round {r} started"
    )
    sample_df = remove_random_users(sample_df, k=num_users_removed_each_round)

    # 2) Prune to a fixed point: removing sparse users can push items below their
    #    threshold and vice versa, so repeat until a pass removes nothing.
    keep_removing = True
    i = 1

    while keep_removing:
        if debug:
            logger.info(f"Sampling round {i} started")
        keep_removing = False

        uu = get_unqualified_users(sample_df)
        if debug:
            logger.info(f"{len(uu)=}")
        if len(uu):
            sample_df = sample_df.loc[~sample_df[args.user_col].isin(uu)]
            if debug:
                logger.info(f"After removing uu: {len(sample_df)=}")
            assert len(get_unqualified_users(sample_df)) == 0
            keep_removing = True

        ui = get_unqualified_items(sample_df)
        if debug:
            logger.info(f"{len(ui)=}")
        if len(ui):
            sample_df = sample_df.loc[~sample_df[args.item_col].isin(ui)]
            if debug:
                logger.info(f"After removing ui: {len(sample_df)=}")
            assert len(get_unqualified_items(sample_df)) == 0
            keep_removing = True
        i += 1

    # 3) Stop when the number of users has reached the target (plus buffer).
    sample_users = sample_df[args.user_col].unique()
    sample_items = sample_df[args.item_col].unique()
    num_users = len(sample_users)
    logger.info(f"After randomly removing users - round {r}: {num_users=}")
    if num_users > args.sample_users * (1 + buffer_perc):
        logger.info(
            f"Number of users {num_users} are still greater than expected, keep removing..."
        )
    else:
        logger.info(
            f"Number of users {num_users} are falling below expected threshold, stop and use `sample_df` as final output..."
        )
        keep_random_removing = False

    # 4) Restrict val/test to users and items still present in the training sample,
    #    and stop early if either evaluation set becomes too small.
    val_sample_df = val_df.loc[
        val_df[args.user_col].isin(sample_users)
        & val_df[args.item_col].isin(sample_items)
    ]
    test_sample_df = test_df.loc[
        test_df[args.user_col].isin(sample_users)
        & test_df[args.item_col].isin(sample_items)
    ]
    if (num_val_records := val_sample_df.shape[0]) < min_val_records:
        logger.info(
            f"Number of val_df records {num_val_records:,.0f} are falling below expected threshold, stop and use `sample_df` as final output..."
        )
        keep_random_removing = False
    if (num_test_records := test_sample_df.shape[0]) < min_test_records:
        logger.info(
            f"Number of test_df records {num_test_records:,.0f} are falling below expected threshold, stop and use `sample_df` as final output..."
        )
        keep_random_removing = False

    r += 1

# Final user/item sets of the training sample
sample_users = sample_df[args.user_col].unique()
sample_items = sample_df[args.item_col].unique()
logger.info(f"Final sample sizes: {len(sample_users)=:,.0f}, {len(sample_items)=:,.0f}")




Randomly removing 182010 users - Round 1 started


[32m2025-03-26 23:05:17.145[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 1 started[0m
[32m2025-03-26 23:06:04.279[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=16192492[0m
[32m2025-03-26 23:06:13.077[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=17467346[0m
[32m2025-03-26 23:06:21.695[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=936952[0m
[32m2025-03-26 23:06:24.617[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=15134535[0m
[32m2025-03-26 23:06:26.588[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-26 23:06:31.253[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=328781[0m
[32m2025-03-26 23:06:32.635[0m 

In [None]:
# Chronology check on the sampled splits: train < val < test, with no overlap.
sample_train_end = sample_df[args.timestamp_col].max()
sample_val_start = val_sample_df[args.timestamp_col].min()
assert sample_train_end < sample_val_start, "There are overlapping timestamps between train and validation datasets."

sample_val_end = val_sample_df[args.timestamp_col].max()
sample_test_start = test_sample_df[args.timestamp_col].min()
assert sample_val_end < sample_test_start, "There are overlapping timestamps between validation and test datasets."

In [None]:
# Every user and item in the sampled val/test sets must also exist in the training sample.
assert val_sample_df[args.user_col].isin(sample_users).all(), "Validation DataFrame contains unexpected users."
assert test_sample_df[args.user_col].isin(sample_users).all(), "Test DataFrame contains unexpected users."
assert val_sample_df[args.item_col].isin(sample_items).all(), "Validation DataFrame contains unexpected items."
assert test_sample_df[args.item_col].isin(sample_items).all(), "Test DataFrame contains unexpected items."

In [None]:
# Distribution of the number of interactions per user in the training sample.
# A title is set so the figure can be read on its own.
px.histogram(
    sample_df.groupby(args.user_col).size(),
    title="Interactions per user (training sample)",
)

In [None]:
# Distribution of the number of interactions per item in the training sample.
# A title is set so the figure can be read on its own.
px.histogram(
    sample_df.groupby(args.item_col).size(),
    title="Interactions per item (training sample)",
)

In [None]:
# Inspect the sampled training interactions
sample_df

In [None]:
# Inspect the sampled validation interactions
val_sample_df

In [None]:
# Inspect the sampled test interactions
test_sample_df

In [None]:
# Row counts of every subset before and after sampling, keyed by subset name.
subsets = ["train", "val", "test"]
original_length = dict(zip(subsets, (len(train_df), len(val_df), len(test_df))))
sampled_length = dict(
    zip(subsets, (len(sample_df), len(val_sample_df), len(test_sample_df)))
)


In [None]:
# Row counts per subset before sampling
original_length

In [None]:
# Row counts per subset after sampling
sampled_length

In [None]:
# Side-by-side comparison of original vs. sampled row counts: one panel per subset,
# all in a single row (the column count follows `subsets` instead of a hard-coded 3).
fig = make_subplots(rows=1, cols=len(subsets))

for i, subset in enumerate(subsets):
    row, col = 1, i + 1
    n_original = original_length[subset]
    n_sampled = sampled_length[subset]

    # Bar for the original (pre-sampling) row count
    fig.add_trace(
        go.Bar(
            name="original",
            x=[subset],
            y=[n_original],
            marker_color="lightblue",
            showlegend=(i == 0),  # one legend entry is enough for all panels
            texttemplate="%{y:.2}",
        ),
        row=row,
        col=col,
    )

    # Bar for the sampled row count
    fig.add_trace(
        go.Bar(
            name="sample",
            x=[subset],
            y=[n_sampled],
            marker_color="lightgreen",
            showlegend=(i == 0),
            texttemplate="%{y:.2}",
        ),
        row=row,
        col=col,
    )

    # Relative change in size; guarded so an empty original subset does not raise.
    difference = (n_sampled - n_original) / n_original if n_original else float("nan")
    fig.add_annotation(
        x=subset,
        y=n_sampled * 1.10,  # positioned just above the sampled bar
        text=f"Δ={difference:.2%}",
        showarrow=False,
        font=dict(color="black", size=14),
        row=row,
        col=col,
    )

fig.update_layout(showlegend=True)

fig.show()

In [None]:
# Persist the sampled train/val/test interactions as parquet files;
# the file names carry the configured user-sample size (args.sample_users).
sample_df.to_parquet(f"{args.sample_data_persit_path}/train_sample_interactions_{args.sample_users}u.parquet")
val_sample_df.to_parquet(f"{args.sample_data_persit_path}/val_sample_interactions_{args.sample_users}u.parquet")
test_sample_df.to_parquet(f"{args.sample_data_persit_path}/test_sample_interactions_{args.sample_users}u.parquet")

Remember to version your data with DVC.

In [None]:
# Reload the persisted training sample from disk
train_sample_df = pd.read_parquet(f"{args.sample_data_persit_path}/train_sample_interactions_{args.sample_users}u.parquet")

In [25]:
def plot_interactions_over_time(df):
    """Show a line chart of the number of interactions per calendar day.

    Args:
        df: Interaction frame whose `args.timestamp_col` column is datetime-like
            (the `.dt` accessor is used on it).

    The input frame is not modified; the figure is displayed with `fig.show()`
    and nothing is returned.
    """
    # Truncate timestamps to dates under the *configured* column name. The keyword
    # was previously hard-coded as `timestamp`, which only matched the groupby
    # below when args.timestamp_col happened to be "timestamp".
    daily_df = df.assign(**{args.timestamp_col: df[args.timestamp_col].dt.date})
    plot_df = daily_df.groupby(args.timestamp_col).size()

    fig = px.line(
        x=plot_df.index,
        y=plot_df.values,
        labels={"x": "Date", "y": "Number of Interactions"},
        title="Interactions Over Time",
        height=500,
    )

    # Thousands separators on the y axis
    fig.update_layout(yaxis=dict(showticklabels=True, tickformat=","))

    fig.show()

In [27]:
# Inspect the reloaded training sample
train_sample_df

Unnamed: 0,timestamp,user_id,parent_asin,rating
2931,2020-11-21 11:31:14.232,AH6U3RG4SKWXF4KNH3RC6VD5P4QQ,B0953YFR2M,5.0
2992,2021-05-25 12:17:57.423,AGBOFSSHGILKH73MJZUUOTRCD4CA,B0BW9DSR52,5.0
3303,2020-07-19 13:47:10.212,AEEDFUQ7SXVEZ4VBHTED5D6MZ7QA,B08YX4HGRY,5.0
3375,2020-10-19 19:13:42.773,AHR6RGZTLOMBD7EBF3OK43JWMGNQ,B09NTXBJDM,4.0
3547,2021-05-17 23:23:05.132,AGZMKHWSCB3UXDGFUPFRZSL4EAWQ,B08XD3WW2H,4.0
...,...,...,...,...
24565423,2020-07-13 10:51:57.066,AFGEK77LF27ECZWRR5J2TZGEOJ7A,B00KDSGIPK,5.0
24565424,2020-07-13 10:55:37.739,AFGEK77LF27ECZWRR5J2TZGEOJ7A,B017250D16,4.0
24565425,2020-07-13 11:08:07.474,AFGEK77LF27ECZWRR5J2TZGEOJ7A,B076TCPKJT,3.0
24565426,2020-09-02 18:58:47.052,AFGEK77LF27ECZWRR5J2TZGEOJ7A,B01MDKA8EH,4.0


In [None]:
# Build up idm
# Sorted to make sure that even rerun we get same idm mapping
unique_user_ids = sorted(train_sample_df[args.user_col].unique())
unique_item_ids = sorted(train_sample_df[args.item_col].unique())
logger.info(f"Number of unique users: {len(unique_user_ids):,.0f}")
logger.info(f"Number of unique items: {len(unique_item_ids):,.0f}")
idm = IDMapper()
idm.fit(unique_user_ids, unique_item_ids)

In [None]:
# Persist the ID mapper, then reload it from the same file so the following cells
# work with exactly what was written to disk. The path is built once and reused
# for both the save and the load.
idm_persist_fp = f"{args.notebook_persit_path}/idm_{args.sample_users}u.json"
idm.save(idm_persist_fp)
idm = IDMapper().load(idm_persist_fp)

In [None]:
# Number of entries in the mapper's item-to-index table
len(idm.item_to_index)

In [None]:
# After the save/load round-trip, every ID key must be a plain `str`.
# Each message names the mapping actually being checked (they were swapped before).
for item_id in idm.item_to_index.keys():
    assert type(item_id) is str, "Type of item id should be string"
for user_id in idm.user_to_index.keys():
    assert type(user_id) is str, "Type of user id should be string"