In [None]:
%load_ext autoreload
%autoreload 2

In [6]:
import os
import sys
from functools import partial

import numpy as np
import plotly.express as px
from loguru import logger
from pydantic import BaseModel, model_validator
from load_dotenv import load_dotenv
import pandas as pd

from sqlalchemy import create_engine
from feast import FeatureStore
import pandas as pd

sys.path.insert(0, "..")

from src.utils.split_time_based import train_test_split_timebased
from src.utils.embedding_id_mapper import IDMapper
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

_ = load_dotenv(override=True)

## Controller

In [98]:
class Args(BaseModel):
    """Configuration for this data-prep run.

    Groups the run metadata, the column names of the ratings table, the
    sampling thresholds, the split sizes and the input path. The two
    ``*_persit_path`` fields start as ``None`` and are filled in by ``init()``.
    """

    # Run metadata
    run_name: str = "000-data-prep"
    run_description: str = "Splitting data into train, val, test sets, then sampling data for quick iteration"
    testing: bool = False
    # Directory for the sampled train/val/test data; resolved by init().
    sample_data_persit_path: str = None
    # Directory for this notebook's own artifacts; resolved by init().
    notebook_persit_path: str = None
    random_seed: int = 41

    # Column names of the ratings table
    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    # Sampling target and minimum interaction counts
    sample_users: int = 5000
    min_user_interactions: int = 5
    min_item_interactions: int = 10

    # Split sizes forwarded to train_test_split_timebased
    val_num_days: int = 420
    test_num_days: int = 540

    # Input parquet file, resolved relative to the working directory at class-definition time.
    rating_dataset_path: str = os.path.abspath("../data_for_ai/raw/amz_raw_rating.parquet")

    def init(self):
        """Resolve the output directories and create them unless ``testing`` is set.

        Returns:
            The same instance, so the call can be chained: ``Args().init()``.
        """
        interim_dir = os.path.abspath("../data_for_ai/interim")
        notebook_dir = os.path.abspath(f"./data/{self.run_name}")
        self.sample_data_persit_path = interim_dir
        self.notebook_persit_path = notebook_dir

        # In testing mode nothing is written to disk.
        if self.testing:
            return self

        for directory in (interim_dir, notebook_dir):
            os.makedirs(directory, exist_ok=True)
        return self


# Build the run configuration; init() resolves the output paths and creates
# the directories on disk unless testing=True.
args = Args().init()

# Echo the resolved configuration as JSON so the run is self-describing.
print(args.model_dump_json(indent=2))

{
  "run_name": "000-data-prep",
  "run_description": "Splitting data into train, val, test sets, then sampling data for quick iteration",
  "testing": false,
  "sample_data_persit_path": "/home/dinhln/Desktop/real_time_recsys/data_for_ai/interim",
  "notebook_persit_path": "/home/dinhln/Desktop/real_time_recsys/notebooks/data/000-data-prep",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "sample_users": 5000,
  "min_user_interactions": 5,
  "min_item_interactions": 10,
  "val_num_days": 420,
  "test_num_days": 540,
  "rating_dataset_path": "/home/dinhln/Desktop/real_time_recsys/data_for_ai/raw/amz_raw_rating.parquet"
}


## Load data from a specific period in order to train the model

In notebook 002-simulate-oltp, we saw that the period from March 2020 to September 2020 is a good choice: users and items interact actively during this period, and the data is still recent. So, we will load data from this period to train the model.

In [65]:
# Load the raw ratings table from the parquet file configured in args.rating_dataset_path
full_df = pd.read_parquet(args.rating_dataset_path)
print(f"DataFrame shape: {full_df.shape}")


DataFrame shape: (43334103, 4)


In [60]:
# Preview the first rows of the raw ratings table.
full_df.head()

Unnamed: 0,user_id,parent_asin,rating,timestamp
0,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,B01G8JO5F2,5.0,2018-04-07 09:23:37.534
1,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,B07N69T6TM,1.0,2020-06-20 18:42:29.731
2,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,B083NRGZMM,3.0,2022-07-18 22:58:37.948
3,AGGZ357AO26RQZVRLGU4D4N52DZQ,B001OC5JKY,5.0,2010-11-20 18:41:35.000
4,AG2L7H23R5LLKDKLBEF2Q3L2MVDA,B07CJYMRWM,5.0,2023-02-17 02:39:41.238


In [31]:
def remove_random_users(df, k=10, user_col=None, seed=None):
    """Drop all rows belonging to ``k`` randomly chosen users.

    The draw uses a private ``RandomState`` seeded with ``seed``. This selects
    the same users as ``np.random.seed(seed)`` followed by ``np.random.choice``,
    but leaves NumPy's global random state untouched.

    Args:
        df: Interaction table containing the user column.
        k: Number of distinct users to remove.
        user_col: Name of the user-id column. Defaults to ``args.user_col``.
        seed: Seed for the draw. Defaults to ``args.random_seed``.

    Returns:
        The rows of ``df`` whose user was not selected for removal.

    Raises:
        ValueError: If ``k`` exceeds the number of distinct users, because the
            draw is made without replacement.
    """
    # Fall back to the notebook-level configuration only when not overridden.
    user_col = args.user_col if user_col is None else user_col
    seed = args.random_seed if seed is None else seed

    users = df[user_col].unique()
    rng = np.random.RandomState(seed)
    users_to_drop = rng.choice(users, size=k, replace=False)
    return df.loc[~df[user_col].isin(users_to_drop)]


def get_unqualified(df, col: str, threshold: int):
    """Return the values of ``col`` that occur in fewer than ``threshold`` rows.

    Args:
        df: Table to inspect.
        col: Column whose values are counted.
        threshold: Minimum row count a value needs in order to qualify.

    Returns:
        A pandas Index holding the values with a row count below ``threshold``.
    """
    counts = df.groupby(col).size()
    below_threshold = counts < threshold
    return counts[below_threshold].index

In [19]:
# Pre-bind the grouping column and a fixed threshold of 5 for the 5-core
# filtering of the full dataset.
# NOTE(review): the threshold is the literal 5 for both users and items,
# independent of args.min_user_interactions / args.min_item_interactions.
# Both names are re-bound later in the notebook with the args-driven
# thresholds, so their meaning depends on which cell ran last.
get_unqualified_users = partial(
    get_unqualified, col=args.user_col, threshold=5
)
get_unqualified_items = partial(
    get_unqualified, col=args.item_col, threshold=5
)

In [20]:
# Identify the users and items of the full dataset that fall below the
# threshold bound into get_unqualified_users / get_unqualified_items.
# Both are computed on the unfiltered full_df.
uu = get_unqualified_users(full_df)
ui = get_unqualified_items(full_df)
print(f"Number of unqualified users: {len(uu)}")
print(f"Number of unqualified items: {len(ui)}")

Number of unqualified users: 16420548
Number of unqualified items: 987017


In [21]:
# Keep only rows whose user AND item were not flagged as unqualified.
# NOTE(review): uu and ui were both computed on full_df and the filter is
# applied once, so counts may drop below the threshold again after this
# step; an iterative pruning is what would guarantee a strict k-core.
is_qualified_user = ~full_df[args.user_col].isin(uu)
is_qualified_item = ~full_df[args.item_col].isin(ui)
full_5core_df = full_df.loc[is_qualified_user & is_qualified_item]
print(f"5-core DataFrame shape: {full_5core_df.shape}")

5-core DataFrame shape: (16879366, 4)


In [22]:
# Report how many interaction rows the 5-core filtering step discarded.
num_rows_dropped = len(full_df) - len(full_5core_df)
logger.info(f"Rows dropped by 5-core filtering: {num_rows_dropped:,}")

[32m2025-03-26 23:40:31.668[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1m26454737 [0m


In [99]:
# Split the filtered interactions into train / val / test with the
# project's time-based splitter. Column names come from the run
# configuration so this call stays consistent with the rest of the notebook.
# NOTE(review): the exact meaning of val_num_days / test_num_days is defined
# in src.utils.split_time_based — confirm there.
train_df, val_df, test_df = train_test_split_timebased(
    full_5core_df,
    user_id_col=args.user_col,
    item_id_col=args.item_col,
    timestamp_col=args.timestamp_col,
    val_num_days=args.val_num_days,
    test_num_days=args.test_num_days,
)

[32m2025-03-27 01:23:18.987[0m | [1mINFO    [0m | [36msrc.utils.split_time_based[0m:[36mtrain_test_split_timebased[0m:[36m26[0m - [1mRemoving users from val and test sets...[0m
[32m2025-03-27 01:23:24.875[0m | [1mINFO    [0m | [36msrc.utils.split_time_based[0m:[36mtrain_test_split_timebased[0m:[36m37[0m - [1mRemoved 141738 users from val set[0m
[32m2025-03-27 01:23:25.016[0m | [1mINFO    [0m | [36msrc.utils.split_time_based[0m:[36mtrain_test_split_timebased[0m:[36m40[0m - [1mRemoved 265886 users from test set[0m
[32m2025-03-27 01:23:25.017[0m | [1mINFO    [0m | [36msrc.utils.split_time_based[0m:[36mtrain_test_split_timebased[0m:[36m43[0m - [1mTrain set has 1762078 users[0m
[32m2025-03-27 01:23:25.236[0m | [1mINFO    [0m | [36msrc.utils.split_time_based[0m:[36mtrain_test_split_timebased[0m:[36m44[0m - [1mVal set has 680644 users[0m
[32m2025-03-27 01:23:25.424[0m | [1mINFO    [0m | [36msrc.utils.split_time_based[0m:[36mtr

In [100]:
# Chronology check: every train timestamp must precede every validation
# timestamp, and every validation timestamp must precede every test timestamp.
train_end = train_df[args.timestamp_col].max()
val_start = val_df[args.timestamp_col].min()
val_end = val_df[args.timestamp_col].max()
test_start = test_df[args.timestamp_col].min()

assert train_end < val_start, "There are overlapping timestamps between train and validation datasets."
assert val_end < test_start, "There are overlapping timestamps between validation and test datasets."

In [101]:
# Log the (rows, columns) shape of each split.
logger.info(f"Train: {train_df.shape}, Val: {val_df.shape}, Test: {test_df.shape}")

[32m2025-03-27 01:23:27.019[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mTrain: (13043980, 4), Val: (1375948, 4), Test: (749578, 4)[0m


## Sampling data

Randomly picking X users does not guarantee that the resulting dataset satisfies the **richness** condition. Instead, we take an iterative approach: we gradually drop random users from the dataset while keeping an eye on the conditions and our sampling target.

In [102]:
# Re-bind the helpers with the thresholds from the run configuration
# (args.min_user_interactions / args.min_item_interactions).
# NOTE: these shadow the bindings of the same names created earlier with a
# literal threshold of 5; from here on the args-driven thresholds apply.
get_unqualified_users = partial(
    get_unqualified, col=args.user_col, threshold=args.min_user_interactions
)
get_unqualified_items = partial(
    get_unqualified, col=args.item_col, threshold=args.min_item_interactions
)

In [103]:
# Iteratively shrink the training set towards args.sample_users users:
# each outer round drops a random fraction of users, then prunes users/items
# that fell below the minimum interaction thresholds until none remain.

# Tolerated overshoot: the outer loop continues while
# num_users > args.sample_users * (1 + buffer_perc).
buffer_perc = 0.2
# Fraction of the remaining distinct users dropped at random each round.
perc_users_removed_each_round = 0.05
# When True, progress of every pruning pass is logged.
debug = True
# Outer-loop flag; cleared as soon as any stop condition below is met.
keep_random_removing = True
# Outer round counter (used only in the progress message).
r = 1

# Start from a copy of train_df; train_df is not reassigned in this cell.
sample_df = train_df.copy()

while keep_random_removing:
    # Number of users to drop this round, relative to the current user count.
    num_users_removed_each_round = int(
        perc_users_removed_each_round * sample_df[args.user_col].nunique()
    )
    print(
        f"\n\nRandomly removing {num_users_removed_each_round} users - Round {r} started"
    )
    sample_df = remove_random_users(sample_df, k=num_users_removed_each_round)

    # Inner loop: removing users can push items below their threshold and
    # vice versa, so repeat until a full pass removes nothing.
    keep_removing = True
    i = 1

    while keep_removing:
        if debug:
            logger.info(f"Sampling round {i} started")
        keep_removing = False
        uu = get_unqualified_users(sample_df)
        if debug:
            logger.info(f"{len(uu)=}")
        if len(uu):
            sample_df = sample_df.loc[lambda df: ~df[args.user_col].isin(uu)]
            if debug:
                logger.info(f"After removing uu: {len(sample_df)=}")
            # Sanity check: recomputes the per-user counts on the filtered frame.
            assert len(get_unqualified_users(sample_df)) == 0
            keep_removing = True
        # Item counts are evaluated after any user removal in this pass.
        ui = get_unqualified_items(sample_df)
        if debug:
            logger.info(f"{len(ui)=}")
        if len(ui):
            sample_df = sample_df.loc[lambda df: ~df[args.item_col].isin(ui)]
            if debug:
                logger.info(f"After removing ui: {len(sample_df)=}")
            # Sanity check: recomputes the per-item counts on the filtered frame.
            assert len(get_unqualified_items(sample_df)) == 0
            keep_removing = True
        i += 1
    
    # Users and items that survived this round.
    sample_users = sample_df[args.user_col].unique()
    sample_items = sample_df[args.item_col].unique()
    num_users = len(sample_users)
    logger.info(f"After randomly removing users - round {r}: {num_users=}")
    
    # NOTE(review): once fewer than 20000 rows remain, the removal fraction is
    # set to 0.1, i.e. higher than the initial 0.05 — confirm this is intended.
    if len(sample_df) < 20000:
        perc_users_removed_each_round = 0.1

    # Stop condition 1: the user count reached the target plus buffer.
    if num_users > args.sample_users * (1 + buffer_perc):
        logger.info(
            f"Number of users {num_users} are still greater than expected, keep removing..."
        )
    else:
        logger.info(
            f"Number of users {num_users} are falling below expected threshold, stop and use `sample_df` as final output..."
        )
        keep_random_removing = False
    
    # Restrict val/test to rows whose user AND item both appear in the sample.
    val_sample_df = val_df.loc[
                lambda df: df[args.user_col].isin(sample_users)
                & df[args.item_col].isin(sample_items)
            ]
    test_sample_df = test_df.loc[
                lambda df: df[args.user_col].isin(sample_users)
                & df[args.item_col].isin(sample_items)
            ]
    # Stop conditions 2 and 3: too few val or test rows (< 2000) remain.
    # The check runs after this round's removal, so the final sample may
    # already be below the limit when the loop stops.
    if (num_val_records := val_sample_df.shape[0]) < 2000:
        logger.info(
            f"Number of val_df records {num_val_records:,.0f} are falling below expected threshold, stop and use `sample_df` as final output..."
        )
        keep_random_removing = False
    if (num_test_records := test_sample_df.shape[0]) < 2000:
        logger.info(
            f"Number of test_df records {num_test_records:,.0f} are falling below expected threshold, stop and use `sample_df` as final output..."
        )
        keep_random_removing = False

    r += 1

# Recompute the final user/item sets from the final sample_df for the summary log.
sample_users = sample_df[args.user_col].unique()
sample_items = sample_df[args.item_col].unique()
logger.info(f"Final sample sizes: {len(sample_users)=:,.0f}, {len(sample_items)=:,.0f}")




Randomly removing 88103 users - Round 1 started


[32m2025-03-27 01:23:39.852[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 1 started[0m
[32m2025-03-27 01:23:43.850[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=503157[0m
[32m2025-03-27 01:23:44.995[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=10878907[0m
[32m2025-03-27 01:23:49.717[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=318409[0m
[32m2025-03-27 01:23:51.873[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=9688502[0m
[32m2025-03-27 01:23:53.200[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:23:55.868[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=183917[0m
[32m2025-03-27 01:23:56.932[0m | 



Randomly removing 48554 users - Round 2 started


[32m2025-03-27 01:24:40.536[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 1 started[0m
[32m2025-03-27 01:24:42.507[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:24:43.672[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=5242[0m
[32m2025-03-27 01:24:44.679[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=8367279[0m
[32m2025-03-27 01:24:45.645[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:24:47.560[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=7120[0m
[32m2025-03-27 01:24:48.092[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=8338891[0m
[32m2025-03-27 01:24:50.925[0m | [1mINFO   



Randomly removing 45734 users - Round 3 started


[32m2025-03-27 01:25:24.717[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 1 started[0m
[32m2025-03-27 01:25:26.480[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:25:27.369[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=5128[0m
[32m2025-03-27 01:25:28.245[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=7867836[0m
[32m2025-03-27 01:25:29.089[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:25:30.857[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=6896[0m
[32m2025-03-27 01:25:31.344[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=7840354[0m
[32m2025-03-27 01:25:33.982[0m | [1mINFO   



Randomly removing 43065 users - Round 4 started


[32m2025-03-27 01:25:59.302[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 1 started[0m
[32m2025-03-27 01:26:00.966[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:26:01.746[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=4835[0m
[32m2025-03-27 01:26:02.528[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=7401291[0m
[32m2025-03-27 01:26:03.332[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:26:04.946[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=6501[0m
[32m2025-03-27 01:26:05.392[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=7375387[0m
[32m2025-03-27 01:26:07.763[0m | [1mINFO   



Randomly removing 40552 users - Round 5 started


[32m2025-03-27 01:26:31.015[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 1 started[0m
[32m2025-03-27 01:26:32.620[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:26:33.360[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=4850[0m
[32m2025-03-27 01:26:34.078[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=6958461[0m
[32m2025-03-27 01:26:34.791[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:26:36.348[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=6952[0m
[32m2025-03-27 01:26:36.788[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=6930753[0m
[32m2025-03-27 01:26:39.063[0m | [1mINFO   



Randomly removing 38143 users - Round 6 started


[32m2025-03-27 01:27:06.717[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 1 started[0m
[32m2025-03-27 01:27:08.225[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:27:08.956[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=4679[0m
[32m2025-03-27 01:27:09.602[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=6538916[0m
[32m2025-03-27 01:27:10.280[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:27:11.755[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=6560[0m
[32m2025-03-27 01:27:12.163[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=6512774[0m
[32m2025-03-27 01:27:14.362[0m | [1mINFO   



Randomly removing 35874 users - Round 7 started


[32m2025-03-27 01:27:35.514[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 1 started[0m
[32m2025-03-27 01:27:36.881[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:27:37.538[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=4509[0m
[32m2025-03-27 01:27:38.142[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=6143967[0m
[32m2025-03-27 01:27:38.809[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:27:40.203[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=6262[0m
[32m2025-03-27 01:27:40.781[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=6119029[0m
[32m2025-03-27 01:27:42.831[0m | [1mINFO   



Randomly removing 33735 users - Round 8 started


[32m2025-03-27 01:28:02.707[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 1 started[0m
[32m2025-03-27 01:28:04.008[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:28:04.667[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=4268[0m
[32m2025-03-27 01:28:05.191[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=5770175[0m
[32m2025-03-27 01:28:05.743[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:28:06.977[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=5882[0m
[32m2025-03-27 01:28:07.448[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=5746740[0m
[32m2025-03-27 01:28:09.297[0m | [1mINFO   



Randomly removing 31718 users - Round 9 started


[32m2025-03-27 01:28:29.911[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 1 started[0m
[32m2025-03-27 01:28:31.375[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:28:31.968[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=4276[0m
[32m2025-03-27 01:28:32.506[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=5412757[0m
[32m2025-03-27 01:28:33.075[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:28:34.268[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=5981[0m
[32m2025-03-27 01:28:34.757[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=5388928[0m
[32m2025-03-27 01:28:36.479[0m | [1mINFO   



Randomly removing 29794 users - Round 10 started


[32m2025-03-27 01:28:58.650[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 1 started[0m
[32m2025-03-27 01:28:59.828[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:29:00.356[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=4036[0m
[32m2025-03-27 01:29:00.798[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=5076756[0m
[32m2025-03-27 01:29:01.458[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:29:02.545[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=5605[0m
[32m2025-03-27 01:29:02.951[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=5054434[0m
[32m2025-03-27 01:29:04.601[0m | [1mINFO   



Randomly removing 27991 users - Round 11 started


[32m2025-03-27 01:29:22.974[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 1 started[0m
[32m2025-03-27 01:29:24.134[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:29:24.730[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=3894[0m
[32m2025-03-27 01:29:25.179[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=4762437[0m
[32m2025-03-27 01:29:25.756[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:29:26.757[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=5374[0m
[32m2025-03-27 01:29:27.133[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=4741050[0m
[32m2025-03-27 01:29:28.761[0m | [1mINFO   



Randomly removing 26291 users - Round 12 started


[32m2025-03-27 01:29:44.237[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 1 started[0m
[32m2025-03-27 01:29:45.191[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:29:45.629[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=3798[0m
[32m2025-03-27 01:29:45.998[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=4463795[0m
[32m2025-03-27 01:29:46.404[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:29:47.320[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=5158[0m
[32m2025-03-27 01:29:47.655[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=4443239[0m
[32m2025-03-27 01:29:49.020[0m | [1mINFO   



Randomly removing 24683 users - Round 13 started


[32m2025-03-27 01:30:06.789[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 1 started[0m
[32m2025-03-27 01:30:07.655[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:30:08.083[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=3746[0m
[32m2025-03-27 01:30:08.411[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=4183338[0m
[32m2025-03-27 01:30:08.805[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:30:09.678[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=5220[0m
[32m2025-03-27 01:30:09.985[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=4162537[0m
[32m2025-03-27 01:30:11.231[0m | [1mINFO   



Randomly removing 23155 users - Round 14 started


[32m2025-03-27 01:30:23.902[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 1 started[0m
[32m2025-03-27 01:30:24.696[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:30:25.135[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=3596[0m
[32m2025-03-27 01:30:25.505[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=3913688[0m
[32m2025-03-27 01:30:26.034[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:30:26.955[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=5047[0m
[32m2025-03-27 01:30:27.236[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=3893589[0m
[32m2025-03-27 01:30:28.552[0m | [1mINFO   



Randomly removing 21712 users - Round 15 started


[32m2025-03-27 01:30:40.562[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 1 started[0m
[32m2025-03-27 01:30:41.308[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:30:41.654[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=3521[0m
[32m2025-03-27 01:30:41.918[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=3659933[0m
[32m2025-03-27 01:30:42.256[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:30:43.003[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=4830[0m
[32m2025-03-27 01:30:43.247[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=3640710[0m
[32m2025-03-27 01:30:44.310[0m | [1mINFO   



Randomly removing 20351 users - Round 16 started


[32m2025-03-27 01:30:55.695[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 1 started[0m
[32m2025-03-27 01:30:56.407[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:30:56.727[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=3287[0m
[32m2025-03-27 01:30:56.960[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=3423192[0m
[32m2025-03-27 01:30:57.256[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:30:57.972[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=4576[0m
[32m2025-03-27 01:30:58.194[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=3404974[0m
[32m2025-03-27 01:30:59.201[0m | [1mINFO   



Randomly removing 19071 users - Round 17 started


[32m2025-03-27 01:31:14.354[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 1 started[0m
[32m2025-03-27 01:31:15.011[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:31:15.299[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=3070[0m
[32m2025-03-27 01:31:15.657[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=3201611[0m
[32m2025-03-27 01:31:15.927[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:31:16.576[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=4365[0m
[32m2025-03-27 01:31:16.772[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=3184227[0m
[32m2025-03-27 01:31:17.694[0m | [1mINFO   



Randomly removing 17866 users - Round 18 started


[32m2025-03-27 01:31:27.253[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 1 started[0m
[32m2025-03-27 01:31:27.864[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:31:28.137[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=2986[0m
[32m2025-03-27 01:31:28.468[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=2992965[0m
[32m2025-03-27 01:31:28.732[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:31:29.343[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=4251[0m
[32m2025-03-27 01:31:29.542[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=2976045[0m
[32m2025-03-27 01:31:30.393[0m | [1mINFO   



Randomly removing 16730 users - Round 19 started


[32m2025-03-27 01:31:43.369[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 1 started[0m
[32m2025-03-27 01:31:43.939[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:31:44.174[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=2823[0m
[32m2025-03-27 01:31:44.459[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=2796753[0m
[32m2025-03-27 01:31:44.708[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:31:45.268[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=4119[0m
[32m2025-03-27 01:31:45.445[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=2780366[0m
[32m2025-03-27 01:31:46.275[0m | [1mINFO   



Randomly removing 15660 users - Round 20 started


[32m2025-03-27 01:31:54.382[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 1 started[0m
[32m2025-03-27 01:31:54.918[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:31:55.143[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=2634[0m
[32m2025-03-27 01:31:55.396[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=2613534[0m
[32m2025-03-27 01:31:55.609[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:31:56.138[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=3720[0m
[32m2025-03-27 01:31:56.297[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=2598722[0m
[32m2025-03-27 01:31:57.030[0m | [1mINFO   



Randomly removing 14660 users - Round 21 started


[32m2025-03-27 01:32:04.549[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 1 started[0m
[32m2025-03-27 01:32:05.045[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:32:05.251[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=2582[0m
[32m2025-03-27 01:32:05.483[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=2439755[0m
[32m2025-03-27 01:32:05.686[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:32:06.161[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=3781[0m
[32m2025-03-27 01:32:06.310[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=2424707[0m
[32m2025-03-27 01:32:06.980[0m | [1mINFO   



Randomly removing 13703 users - Round 22 started


[32m2025-03-27 01:32:15.563[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 1 started[0m
[32m2025-03-27 01:32:16.019[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:32:16.220[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=2457[0m
[32m2025-03-27 01:32:16.423[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=2275634[0m
[32m2025-03-27 01:32:16.611[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:32:17.061[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=3531[0m
[32m2025-03-27 01:32:17.203[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=2261593[0m
[32m2025-03-27 01:32:17.833[0m | [1mINFO   



Randomly removing 12810 users - Round 23 started


[32m2025-03-27 01:32:24.278[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 1 started[0m
[32m2025-03-27 01:32:24.692[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:32:24.888[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=2393[0m
[32m2025-03-27 01:32:25.076[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=2122172[0m
[32m2025-03-27 01:32:25.267[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:32:25.678[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=3382[0m
[32m2025-03-27 01:32:25.808[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=2108726[0m
[32m2025-03-27 01:32:26.380[0m | [1mINFO   



Randomly removing 11973 users - Round 24 started


[32m2025-03-27 01:32:32.596[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 1 started[0m
[32m2025-03-27 01:32:32.984[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:32:33.151[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=2305[0m
[32m2025-03-27 01:32:33.316[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=1977145[0m
[32m2025-03-27 01:32:33.480[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:32:33.880[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=3284[0m
[32m2025-03-27 01:32:34.001[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=1964101[0m
[32m2025-03-27 01:32:34.535[0m | [1mINFO   



Randomly removing 11178 users - Round 25 started


[32m2025-03-27 01:32:40.240[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 1 started[0m
[32m2025-03-27 01:32:40.591[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:32:40.746[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=2112[0m
[32m2025-03-27 01:32:40.889[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=1842305[0m
[32m2025-03-27 01:32:41.037[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:32:41.395[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=3049[0m
[32m2025-03-27 01:32:41.550[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=1830191[0m
[32m2025-03-27 01:32:42.061[0m | [1mINFO   



Randomly removing 10434 users - Round 26 started


[32m2025-03-27 01:32:48.577[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 1 started[0m
[32m2025-03-27 01:32:48.950[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:32:49.096[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=2087[0m
[32m2025-03-27 01:32:49.228[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=1715032[0m
[32m2025-03-27 01:32:49.359[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:32:49.711[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=3071[0m
[32m2025-03-27 01:32:49.855[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=1702820[0m
[32m2025-03-27 01:32:50.343[0m | [1mINFO   



Randomly removing 9728 users - Round 27 started


[32m2025-03-27 01:32:55.828[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:32:55.958[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=1975[0m
[32m2025-03-27 01:32:56.075[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=1595181[0m
[32m2025-03-27 01:32:56.205[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:32:56.528[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=2961[0m
[32m2025-03-27 01:32:56.654[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=1583409[0m
[32m2025-03-27 01:32:57.074[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=305[0m
[32m2025-03-27 01:32:57.198[0m | [1mINFO    [0m | [36m



Randomly removing 9064 users - Round 28 started


[32m2025-03-27 01:33:02.179[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:33:02.306[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=1914[0m
[32m2025-03-27 01:33:02.412[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=1482994[0m
[32m2025-03-27 01:33:02.526[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:33:02.817[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=2842[0m
[32m2025-03-27 01:33:02.932[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=1471696[0m
[32m2025-03-27 01:33:03.340[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=330[0m
[32m2025-03-27 01:33:03.464[0m | [1mINFO    [0m | [36m



Randomly removing 8435 users - Round 29 started


[32m2025-03-27 01:33:08.876[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:33:08.983[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=1773[0m
[32m2025-03-27 01:33:09.075[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=1376942[0m
[32m2025-03-27 01:33:09.173[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:33:09.441[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=2627[0m
[32m2025-03-27 01:33:09.540[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=1366495[0m
[32m2025-03-27 01:33:09.895[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=293[0m
[32m2025-03-27 01:33:10.000[0m | [1mINFO    [0m | [36m



Randomly removing 7853 users - Round 30 started


[32m2025-03-27 01:33:15.750[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:33:15.847[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=1787[0m
[32m2025-03-27 01:33:15.935[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=1277459[0m
[32m2025-03-27 01:33:16.030[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:33:16.283[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=2588[0m
[32m2025-03-27 01:33:16.415[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=1267180[0m
[32m2025-03-27 01:33:16.759[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=301[0m
[32m2025-03-27 01:33:16.860[0m | [1mINFO    [0m | [36m



Randomly removing 7301 users - Round 31 started


[32m2025-03-27 01:33:20.739[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:33:20.834[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=1645[0m
[32m2025-03-27 01:33:20.914[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=1183811[0m
[32m2025-03-27 01:33:20.999[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:33:21.235[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=2469[0m
[32m2025-03-27 01:33:21.324[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=1174003[0m
[32m2025-03-27 01:33:21.642[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=288[0m
[32m2025-03-27 01:33:21.730[0m | [1mINFO    [0m | [36m



Randomly removing 6784 users - Round 32 started


[32m2025-03-27 01:33:26.759[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:33:26.844[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=1511[0m
[32m2025-03-27 01:33:26.951[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=1096696[0m
[32m2025-03-27 01:33:27.034[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:33:27.246[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=2348[0m
[32m2025-03-27 01:33:27.318[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=1087377[0m
[32m2025-03-27 01:33:27.597[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=294[0m
[32m2025-03-27 01:33:27.679[0m | [1mINFO    [0m | [36m



Randomly removing 6296 users - Round 33 started


[32m2025-03-27 01:33:31.088[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:33:31.169[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=1448[0m
[32m2025-03-27 01:33:31.264[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=1014966[0m
[32m2025-03-27 01:33:31.330[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:33:31.527[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=2284[0m
[32m2025-03-27 01:33:31.593[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=1005898[0m
[32m2025-03-27 01:33:31.858[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=268[0m
[32m2025-03-27 01:33:31.930[0m | [1mINFO    [0m | [36m



Randomly removing 5838 users - Round 34 started


[32m2025-03-27 01:33:36.281[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:33:36.352[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=1373[0m
[32m2025-03-27 01:33:36.433[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=938249[0m
[32m2025-03-27 01:33:36.497[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:33:36.677[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=2139[0m
[32m2025-03-27 01:33:36.735[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=929748[0m
[32m2025-03-27 01:33:36.970[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=241[0m
[32m2025-03-27 01:33:37.033[0m | [1mINFO    [0m | [36m__



Randomly removing 5411 users - Round 35 started


[32m2025-03-27 01:33:41.642[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:33:41.710[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=1337[0m
[32m2025-03-27 01:33:41.788[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=865906[0m
[32m2025-03-27 01:33:41.845[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:33:42.009[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=2045[0m
[32m2025-03-27 01:33:42.062[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=857794[0m
[32m2025-03-27 01:33:42.289[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=264[0m
[32m2025-03-27 01:33:42.351[0m | [1mINFO    [0m | [36m__



Randomly removing 5010 users - Round 36 started


[32m2025-03-27 01:33:45.794[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:33:45.853[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=1209[0m
[32m2025-03-27 01:33:45.920[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=799535[0m
[32m2025-03-27 01:33:45.977[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:33:46.124[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=1853[0m
[32m2025-03-27 01:33:46.175[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=792187[0m
[32m2025-03-27 01:33:46.376[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=239[0m
[32m2025-03-27 01:33:46.428[0m | [1mINFO    [0m | [36m__



Randomly removing 4642 users - Round 37 started


[32m2025-03-27 01:33:49.570[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:33:49.623[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=1220[0m
[32m2025-03-27 01:33:49.683[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=736756[0m
[32m2025-03-27 01:33:49.731[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:33:49.873[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=1829[0m
[32m2025-03-27 01:33:49.919[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=729498[0m
[32m2025-03-27 01:33:50.104[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=239[0m
[32m2025-03-27 01:33:50.151[0m | [1mINFO    [0m | [36m__



Randomly removing 4294 users - Round 38 started


[32m2025-03-27 01:33:53.520[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=0[0m
[32m2025-03-27 01:33:53.575[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=1044[0m
[32m2025-03-27 01:33:53.625[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=679962[0m
[32m2025-03-27 01:33:53.670[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:33:53.797[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=1665[0m
[32m2025-03-27 01:33:53.836[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=673349[0m
[32m2025-03-27 01:33:54.009[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=249[0m
[32m2025-03-27 01:33:54.055[0m | [1mINFO    [0m | [36m__



Randomly removing 3967 users - Round 39 started


[32m2025-03-27 01:33:56.692[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=1007[0m
[32m2025-03-27 01:33:56.738[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=626158[0m
[32m2025-03-27 01:33:56.779[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:33:56.896[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=1613[0m
[32m2025-03-27 01:33:56.935[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=619752[0m
[32m2025-03-27 01:33:57.092[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=222[0m
[32m2025-03-27 01:33:57.131[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=617766[0m
[32m2025-03-27 01:33:57.175[



Randomly removing 3662 users - Round 40 started


[32m2025-03-27 01:33:59.885[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=964[0m
[32m2025-03-27 01:33:59.926[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=576031[0m
[32m2025-03-27 01:33:59.964[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:34:00.074[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=1531[0m
[32m2025-03-27 01:34:00.118[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=569955[0m
[32m2025-03-27 01:34:00.260[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=215[0m
[32m2025-03-27 01:34:00.296[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=568028[0m
[32m2025-03-27 01:34:00.334[0



Randomly removing 3377 users - Round 41 started


[32m2025-03-27 01:34:02.552[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=529005[0m
[32m2025-03-27 01:34:02.593[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:34:02.690[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=1508[0m
[32m2025-03-27 01:34:02.731[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=523023[0m
[32m2025-03-27 01:34:02.861[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=198[0m
[32m2025-03-27 01:34:02.892[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=521251[0m
[32m2025-03-27 01:34:02.927[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 3 started[0m
[32m2025-03-27 01



Randomly removing 3108 users - Round 42 started


[32m2025-03-27 01:34:04.977[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=484851[0m
[32m2025-03-27 01:34:05.012[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:34:05.106[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=1445[0m
[32m2025-03-27 01:34:05.143[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=479123[0m
[32m2025-03-27 01:34:05.259[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=221[0m
[32m2025-03-27 01:34:05.289[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=477146[0m
[32m2025-03-27 01:34:05.324[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 3 started[0m
[32m2025-03-27 01



Randomly removing 2857 users - Round 43 started


[32m2025-03-27 01:34:07.257[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:34:07.345[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=1364[0m
[32m2025-03-27 01:34:07.380[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=437942[0m
[32m2025-03-27 01:34:07.485[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=182[0m
[32m2025-03-27 01:34:07.523[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=436318[0m
[32m2025-03-27 01:34:07.550[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 3 started[0m
[32m2025-03-27 01:34:07.634[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=313[0m
[32m2025-03-27 01:34:07.662[0m | [1mINFO    



Randomly removing 2625 users - Round 44 started


[32m2025-03-27 01:34:09.571[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 2 started[0m
[32m2025-03-27 01:34:09.653[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=1320[0m
[32m2025-03-27 01:34:09.680[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=400035[0m
[32m2025-03-27 01:34:09.778[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=208[0m
[32m2025-03-27 01:34:09.803[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=398179[0m
[32m2025-03-27 01:34:09.829[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 3 started[0m
[32m2025-03-27 01:34:09.905[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=360[0m
[32m2025-03-27 01:34:09.937[0m | [1mINFO    



Randomly removing 2401 users - Round 45 started


[32m2025-03-27 01:34:12.137[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=1148[0m
[32m2025-03-27 01:34:12.162[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=364497[0m
[32m2025-03-27 01:34:12.250[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=184[0m
[32m2025-03-27 01:34:12.280[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=362851[0m
[32m2025-03-27 01:34:12.302[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 3 started[0m
[32m2025-03-27 01:34:12.374[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=346[0m
[32m2025-03-27 01:34:12.405[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=361478[0m
[32m2025-03-27 01:34:12.488[0



Randomly removing 2197 users - Round 46 started


[32m2025-03-27 01:34:14.719[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=1148[0m
[32m2025-03-27 01:34:14.739[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=331158[0m
[32m2025-03-27 01:34:14.844[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=187[0m
[32m2025-03-27 01:34:14.872[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=329489[0m
[32m2025-03-27 01:34:14.895[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 3 started[0m
[32m2025-03-27 01:34:14.958[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=329[0m
[32m2025-03-27 01:34:14.979[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=328177[0m
[32m2025-03-27 01:34:15.061[0



Randomly removing 2007 users - Round 47 started


[32m2025-03-27 01:34:16.221[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=301366[0m
[32m2025-03-27 01:34:16.299[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=174[0m
[32m2025-03-27 01:34:16.322[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=299808[0m
[32m2025-03-27 01:34:16.340[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 3 started[0m
[32m2025-03-27 01:34:16.397[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=325[0m
[32m2025-03-27 01:34:16.419[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=298517[0m
[32m2025-03-27 01:34:16.494[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=51[0m
[32m2025-03-27 01:34:16.511[0m 



Randomly removing 1832 users - Round 48 started


[32m2025-03-27 01:34:17.777[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=175[0m
[32m2025-03-27 01:34:17.799[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=271478[0m
[32m2025-03-27 01:34:17.817[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 3 started[0m
[32m2025-03-27 01:34:17.869[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=304[0m
[32m2025-03-27 01:34:17.887[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=270269[0m
[32m2025-03-27 01:34:17.954[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=55[0m
[32m2025-03-27 01:34:17.970[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=269777[0m
[32m2025-03-27 01:34:17.987[0m 



Randomly removing 1666 users - Round 49 started


[32m2025-03-27 01:34:19.160[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=187[0m
[32m2025-03-27 01:34:19.181[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=244355[0m
[32m2025-03-27 01:34:19.198[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 3 started[0m
[32m2025-03-27 01:34:19.244[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=335[0m
[32m2025-03-27 01:34:19.258[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=243019[0m
[32m2025-03-27 01:34:19.315[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=54[0m
[32m2025-03-27 01:34:19.330[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=242533[0m
[32m2025-03-27 01:34:19.345[0m 



Randomly removing 1507 users - Round 50 started


[32m2025-03-27 01:34:20.649[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=137[0m
[32m2025-03-27 01:34:20.664[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=221335[0m
[32m2025-03-27 01:34:20.680[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 3 started[0m
[32m2025-03-27 01:34:20.724[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=270[0m
[32m2025-03-27 01:34:20.738[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=220260[0m
[32m2025-03-27 01:34:20.791[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=56[0m
[32m2025-03-27 01:34:20.804[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=219758[0m
[32m2025-03-27 01:34:20.817[0m 



Randomly removing 1369 users - Round 51 started


[32m2025-03-27 01:34:21.814[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=198799[0m
[32m2025-03-27 01:34:21.826[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 3 started[0m
[32m2025-03-27 01:34:21.867[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=279[0m
[32m2025-03-27 01:34:21.878[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=197686[0m
[32m2025-03-27 01:34:21.923[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=57[0m
[32m2025-03-27 01:34:21.935[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=197173[0m
[32m2025-03-27 01:34:21.947[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 4 started[0m
[32m2025-03-27 01:3



Randomly removing 1235 users - Round 52 started


[32m2025-03-27 01:34:23.236[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=211[0m
[32m2025-03-27 01:34:23.253[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=177833[0m
[32m2025-03-27 01:34:23.306[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=31[0m
[32m2025-03-27 01:34:23.318[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=177554[0m
[32m2025-03-27 01:34:23.330[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 4 started[0m
[32m2025-03-27 01:34:23.361[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=58[0m
[32m2025-03-27 01:34:23.371[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=177322[0m
[32m2025-03-27 01:34:23.412[0m |



Randomly removing 1120 users - Round 53 started


[32m2025-03-27 01:34:24.328[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=159926[0m
[32m2025-03-27 01:34:24.371[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=53[0m
[32m2025-03-27 01:34:24.381[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=159452[0m
[32m2025-03-27 01:34:24.392[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 4 started[0m
[32m2025-03-27 01:34:24.420[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=96[0m
[32m2025-03-27 01:34:24.434[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=159068[0m
[32m2025-03-27 01:34:24.468[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=18[0m
[32m2025-03-27 01:34:24.479[0m | 



Randomly removing 1011 users - Round 54 started


[32m2025-03-27 01:34:25.312[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=39[0m
[32m2025-03-27 01:34:25.325[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=143307[0m
[32m2025-03-27 01:34:25.334[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 4 started[0m
[32m2025-03-27 01:34:25.358[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=86[0m
[32m2025-03-27 01:34:25.370[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=142963[0m
[32m2025-03-27 01:34:25.402[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=10[0m
[32m2025-03-27 01:34:25.413[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=142874[0m
[32m2025-03-27 01:34:25.422[0m | 



Randomly removing 913 users - Round 55 started


[32m2025-03-27 01:34:26.226[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=48[0m
[32m2025-03-27 01:34:26.239[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=128098[0m
[32m2025-03-27 01:34:26.248[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSampling round 4 started[0m
[32m2025-03-27 01:34:26.274[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mlen(uu)=89[0m
[32m2025-03-27 01:34:26.284[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAfter removing uu: len(sample_df)=127742[0m
[32m2025-03-27 01:34:26.317[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mlen(ui)=17[0m
[32m2025-03-27 01:34:26.330[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mAfter removing ui: len(sample_df)=127589[0m
[32m2025-03-27 01:34:26.342[0m | 

In [104]:
# Chronological sanity check: every split must end strictly before the next one starts.
for earlier_split, later_split, overlap_msg in (
    (sample_df, val_sample_df, "There are overlapping timestamps between train and validation datasets."),
    (val_sample_df, test_sample_df, "There are overlapping timestamps between validation and test datasets."),
):
    latest_earlier = earlier_split[args.timestamp_col].max()
    earliest_later = later_split[args.timestamp_col].min()
    assert latest_earlier < earliest_later, overlap_msg

In [105]:
# The evaluation splits may only reference users/items that were kept by the sampling step.
assert val_sample_df[args.user_col].isin(sample_users).all(), "Validation DataFrame contains unexpected users."
assert test_sample_df[args.user_col].isin(sample_users).all(), "Test DataFrame contains unexpected users."
assert val_sample_df[args.item_col].isin(sample_items).all(), "Validation DataFrame contains unexpected items."
assert test_sample_df[args.item_col].isin(sample_items).all(), "Test DataFrame contains unexpected items."

In [106]:
# Distribution of the number of interactions each user has in the sampled train split.
# A title and an axis label are set so the figure is readable on its own
# (the wide-form default axis label would otherwise be the generic "value").
px.histogram(
    sample_df.groupby(args.user_col).size(),
    title="Interactions per user (sampled train split)",
    labels={"value": "Number of interactions per user"},
)

In [107]:
# Distribution of the number of interactions each item has in the sampled train split.
# A title and an axis label are set so the figure is readable on its own
# (the wide-form default axis label would otherwise be the generic "value").
px.histogram(
    sample_df.groupby(args.item_col).size(),
    title="Interactions per item (sampled train split)",
    labels={"value": "Number of interactions per item"},
)

In [108]:
# Preview the sampled train interactions (this frame is persisted below as the train split).
sample_df

Unnamed: 0,user_id,parent_asin,rating,timestamp
3194,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B06XKCPK5W,2.0,2012-06-11 16:41:10
3199,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B000CKVOOY,3.0,2012-08-02 02:04:13
3200,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B006GWO5WK,5.0,2012-09-15 16:34:46
3204,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B008LURQ76,5.0,2013-01-03 23:08:45
3208,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B00AQRUW4Q,4.0,2013-05-06 01:24:39
...,...,...,...,...
40882304,AFB4DWWKZBQFS22FAWDEP37EL2FA,B00KAF5RQ2,5.0,2016-02-22 17:44:10
40882305,AFB4DWWKZBQFS22FAWDEP37EL2FA,B001F6TXME,5.0,2016-02-22 17:44:40
40882306,AFB4DWWKZBQFS22FAWDEP37EL2FA,B007VGGIB6,5.0,2016-02-22 17:45:10
40882307,AFB4DWWKZBQFS22FAWDEP37EL2FA,B00WUID73W,5.0,2016-02-22 17:45:37


In [109]:
# Preview the sampled validation interactions.
val_sample_df

Unnamed: 0,user_id,parent_asin,rating,timestamp
4668,AGZE3IYHOEGKUTJZSQCSFSQ4IFFQ,B0B787CN26,5.0,2021-10-27 19:43:57.873
10425,AEANO5BIASSZNFWNXBR2ECHCPJQQ,B0002MQGOA,5.0,2021-02-02 14:20:48.424
10426,AEANO5BIASSZNFWNXBR2ECHCPJQQ,B07HZLHPKP,5.0,2021-03-08 13:56:57.795
13265,AHDXCFTV7RS3AM6E2TRPWOG3A33Q,B07QWPVZJY,3.0,2021-12-11 00:34:19.152
14423,AEFHRRLFCZQ3TWNYCBA7UD3NIXCA,B00D96J8IM,1.0,2021-10-17 20:54:19.325
...,...,...,...,...
33760091,AHIIISHZP6YAVVHMDEBLJ5CWZ7ZA,B0BZ62FQ13,3.0,2021-07-16 17:08:55.044
34470392,AFTE3G43QHXWD3DJGDCI2DHEWQJQ,B08DMXDPW5,5.0,2021-01-14 01:48:09.423
35019360,AFENZZDPVUYFVBS47YDOWJCDYBSQ,B09XBT6DS9,4.0,2021-12-05 00:35:40.874
35323250,AFMBZYPDAXT5VO3ME67HW5Q5TAOQ,B097KBF8JK,5.0,2022-02-18 11:32:46.732


In [110]:
# Preview the sampled test interactions.
test_sample_df

Unnamed: 0,user_id,parent_asin,rating,timestamp
13270,AHDXCFTV7RS3AM6E2TRPWOG3A33Q,B0BHMVBV9M,2.0,2022-08-27 16:17:48.228
13271,AHDXCFTV7RS3AM6E2TRPWOG3A33Q,B08F1P3BCC,4.0,2023-01-16 04:19:10.669
29821,AHZ6GFHFM6Z7CRPSXRIYQ5Z7GERQ,B08YF1VBYD,4.0,2022-05-28 18:44:43.983
38367,AHAI4X3YAVRMXXUR6USAT5L5WG3A,B0BMK6DC5W,1.0,2022-09-16 13:15:24.402
41950,AGBF2BZRN6M65YBFZCF54ENDRRAA,B0BM73T3K6,5.0,2022-08-26 12:13:41.632
...,...,...,...,...
33756582,AEUXNGJ4HXZXHHU5OF3BPR6ZCLNQ,B0BHGRJDCK,5.0,2022-04-18 15:02:10.839
33756584,AEUXNGJ4HXZXHHU5OF3BPR6ZCLNQ,B08CKZ36N7,5.0,2023-01-28 23:46:05.518
35019361,AFENZZDPVUYFVBS47YDOWJCDYBSQ,B09PRD4T26,5.0,2023-03-13 00:48:18.717
35323251,AFMBZYPDAXT5VO3ME67HW5Q5TAOQ,B09PB85B9K,5.0,2022-07-05 23:12:38.472


In [111]:
# Row counts per split, before and after sampling, keyed by split name.
subsets = ["train", "val", "test"]
original_length = dict(zip(subsets, map(len, (train_df, val_df, test_df))))
sampled_length = dict(zip(subsets, map(len, (sample_df, val_sample_df, test_sample_df))))


In [112]:
# Row counts of the full (unsampled) train/val/test splits.
original_length

{'train': 13043980, 'val': 1375948, 'test': 749578}

In [113]:
# Row counts of the sampled train/val/test splits.
sampled_length

{'train': 127392, 'val': 3479, 'test': 1822}

In [114]:
# One subplot per split, each comparing the original row count with the sampled row count.
fig = make_subplots(rows=1, cols=len(subsets))

for i, subset in enumerate(subsets):
    # Single-row grid; plotly subplot coordinates are 1-indexed.
    row = 1
    col = i + 1

    # Bar for the original (unsampled) split size.
    fig.add_trace(
        go.Bar(
            name="original",
            x=[subset],
            y=[original_length[subset]],
            marker_color="lightblue",
            showlegend=(i == 0),  # one legend entry is enough for all subplots
            texttemplate="%{y:.2}",
        ),
        row=row,
        col=col,
    )

    # Bar for the sampled split size.
    fig.add_trace(
        go.Bar(
            name="sample",
            x=[subset],
            y=[sampled_length[subset]],
            marker_color="lightgreen",
            showlegend=(i == 0),
            texttemplate="%{y:.2}",
        ),
        row=row,
        col=col,
    )

    # Relative change in row count caused by sampling (negative when rows were removed).
    difference = (sampled_length[subset] - original_length[subset]) / original_length[subset]

    # Anchor the label on top of the TALLER of the two bars. Positioning it from the
    # sampled count alone puts it inside/under the much larger "original" bar.
    tallest_bar = max(original_length[subset], sampled_length[subset])
    fig.add_annotation(
        x=subset,
        y=tallest_bar,
        yanchor="bottom",  # text sits above the anchor point, i.e. above the bar
        yshift=4,  # small pixel gap between the bar and the label
        text=f"Δ={difference:.2%}",
        showarrow=False,
        font=dict(color="black", size=14),
        row=row,
        col=col,
    )

fig.update_layout(showlegend=True)

fig.show()

In [115]:
# Number of distinct users in the sampled train split; used below as a suffix
# in the names of the persisted parquet and mapper files.
num_users = sample_df[args.user_col].nunique()
num_users

16407

In [116]:
# Persist the sampled splits as parquet files; the user count is embedded in each file name.
sampled_split_by_path = {
    f"{args.sample_data_persit_path}/train_sample_interactions_{num_users}u.parquet": sample_df,
    f"{args.sample_data_persit_path}/val_sample_interactions_{num_users}u.parquet": val_sample_df,
    f"{args.sample_data_persit_path}/test_sample_interactions_{num_users}u.parquet": test_sample_df,
}
for parquet_path, split_df in sampled_split_by_path.items():
    split_df.to_parquet(parquet_path)

Remember to version the persisted sample data files with DVC.

In [117]:
# Read the train split back from the parquet file written above, so the following
# cells work from the persisted artifact rather than the in-memory frame.
train_sample_df = pd.read_parquet(f"{args.sample_data_persit_path}/train_sample_interactions_{num_users}u.parquet")

In [118]:
def plot_interactions_over_time(df):
    """Plot the number of interactions per calendar day as a line chart.

    Args:
        df: Interaction frame containing the column named by ``args.timestamp_col``.
            That column must support the ``.dt`` accessor (datetime-like values).

    The figure is shown via ``fig.show()``; nothing is returned.
    """
    # Truncate timestamps to dates in the SAME column that is grouped on below.
    # The column name is taken from args instead of being hard-coded as "timestamp",
    # so the daily aggregation still works when a different timestamp column is configured.
    daily_df = df.assign(**{args.timestamp_col: df[args.timestamp_col].dt.date})
    plot_df = daily_df.groupby(args.timestamp_col).size()

    fig = px.line(
        x=plot_df.index,
        y=plot_df.values,
        labels={"x": "Date", "y": "Number of Interactions"},
        title="Interactions Over Time",
        height=500,
    )

    # Show y tick labels with thousands separators.
    fig.update_layout(yaxis=dict(showticklabels=True, tickformat=","))

    fig.show()

In [119]:
# Preview the train split as reloaded from parquet.
train_sample_df

Unnamed: 0,user_id,parent_asin,rating,timestamp
3194,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B06XKCPK5W,2.0,2012-06-11 16:41:10
3199,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B000CKVOOY,3.0,2012-08-02 02:04:13
3200,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B006GWO5WK,5.0,2012-09-15 16:34:46
3204,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B008LURQ76,5.0,2013-01-03 23:08:45
3208,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B00AQRUW4Q,4.0,2013-05-06 01:24:39
...,...,...,...,...
40882304,AFB4DWWKZBQFS22FAWDEP37EL2FA,B00KAF5RQ2,5.0,2016-02-22 17:44:10
40882305,AFB4DWWKZBQFS22FAWDEP37EL2FA,B001F6TXME,5.0,2016-02-22 17:44:40
40882306,AFB4DWWKZBQFS22FAWDEP37EL2FA,B007VGGIB6,5.0,2016-02-22 17:45:10
40882307,AFB4DWWKZBQFS22FAWDEP37EL2FA,B00WUID73W,5.0,2016-02-22 17:45:37


In [120]:
# Fit the ID mapper on the users and items present in the train sample.
# The IDs are sorted first so that re-running the notebook yields the same mapping.
unique_user_ids = list(train_sample_df[args.user_col].unique())
unique_user_ids.sort()
unique_item_ids = list(train_sample_df[args.item_col].unique())
unique_item_ids.sort()

logger.info(f"Number of unique users: {len(unique_user_ids):,.0f}")
logger.info(f"Number of unique items: {len(unique_item_ids):,.0f}")

idm = IDMapper()
idm.fit(unique_user_ids, unique_item_ids)

[32m2025-03-27 01:37:15.000[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mNumber of unique users: 16,407[0m
[32m2025-03-27 01:37:15.000[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mNumber of unique items: 4,817[0m


In [121]:
# Round-trip the mapper through disk: write it to JSON, then rebind `idm` to the
# object obtained from loading that same file.
idm_persist_fp = f"{args.notebook_persit_path}/idm_{num_users}u.json"
idm.save(idm_persist_fp)
idm = IDMapper().load(idm_persist_fp)

In [122]:
# Number of entries in the reloaded mapper's item-to-index table.
len(idm.item_to_index)

4817

In [123]:
# After the save/load round trip, every key of both lookup tables must be a string.
# Each message names the table actually being checked (item keys -> "item id",
# user keys -> "user id").
for item_id in idm.item_to_index:
    assert isinstance(item_id, str), "Type of item id should be string"
for user_id in idm.user_to_index:
    assert isinstance(user_id, str), "Type of user id should be string"