## Prepare data

Our goal is to get a small sample of the dataset to work with that is good enough to iterate on. A small sample makes it faster to iterate, hence enable easier debugging. If we can not make our model work on a reasonably small dataset then it's not likely to work on bigger one either.

We define a sufficiently small training dataset as:
- **Richness**: Every user should have at least 5 interactions and each item should have at least 10 interactions
- **Enough samples**: There should be at least 1000 users in the training dataset and about 1000 interactions in the validation set

In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
import os
import sys
from functools import partial

import numpy as np
import plotly.express as px
from loguru import logger
from pydantic import BaseModel, model_validator
from load_dotenv import load_dotenv
import pandas as pd

from sqlalchemy import create_engine
from feast import FeatureStore

sys.path.insert(0, "..")

from src.utils.split_time_based import train_test_split_timebased
from src.utils.embedding_id_mapper import IDMapper
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

_ = load_dotenv(override=True)

## Controler

In [3]:
class Args(BaseModel):
    run_name: str = "data_preparation"
    run_description: str = "Splitting data into train, val, test sets, then sampling data for quick iteration"
    testing: bool = False
    sample_data_persit_path: str = None    # path of the sampled data: train, test and val
    notebook_persit_path: str = None    # path of the notebook
    random_seed: int = 41

    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    sample_users: int = 5000
    min_user_interactions: int = 5
    min_item_interactions: int = 10

    val_num_days: int = 15
    test_num_days: int = 30

    # Database credentials
    user: str = None
    password: str = None
    db: str = None
    host: str = None
    port: int = None
    oltp_shema: str = None
    transtion_table_name: str = "transactions"

    @model_validator(mode="before")
    def load_env_vars(cls, values):
        # Load environment variables if not explicitly set
        values["user"] = values.get("user") or os.getenv("POSTGRES_USER")
        values["password"] = values.get("password") or os.getenv("POSTGRES_PASSWORD")
        values["db"] = values.get("db") or os.getenv("POSTGRES_DB")
        values["host"] = values.get("host") or os.getenv("POSTGRES_HOST")
        values["port"] = values.get("port") or os.getenv("POSTGRES_PORT")
        values["oltp_shema"] = values.get("oltp_shema") or os.getenv("POSTGRES_OLTP_SCHEMA")
        return values

    def init(self):
        self.sample_data_persit_path = os.path.abspath(f"../data_for_ai/interim")
        self.notebook_persit_path = os.path.abspath(f"./data/{self.run_name}")
        if not self.testing:
            os.makedirs(self.sample_data_persit_path, exist_ok=True)
            os.makedirs(self.notebook_persit_path, exist_ok=True)

        return self


args = Args().init()

print(args.model_dump_json(indent=2))

{
  "run_name": "data_preparation",
  "run_description": "Splitting data into train, val, test sets, then sampling data for quick iteration",
  "testing": false,
  "sample_data_persit_path": "/home/ubuntu/local-data/data_for_ai/interim",
  "notebook_persit_path": "/home/ubuntu/local-data/real_time_recsys/data/data_preparation",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "sample_users": 5000,
  "min_user_interactions": 5,
  "min_item_interactions": 10,
  "val_num_days": 15,
  "test_num_days": 30,
  "user": "resys-user",
  "password": "hehehe",
  "db": "amazon_rating",
  "host": "0.0.0.0",
  "port": 5432,
  "oltp_shema": "oltp",
  "transtion_table_name": "transactions"
}


## Load data from a specific period in order to train the model

In notebook 002-simulate-oltp, we can see that the time period from March 2020 to Sep 2020 is the good choice. There are active interactions between users and items in this period and wen can keep the recency. So, we will load data from this period to train the model.

In [4]:
import pandas as pd
from sqlalchemy import create_engine
from tqdm import tqdm

# Create connection
connection_string = f"postgresql://{args.user}:{args.password}@{args.host}:{args.port}/{args.db}"
engine = create_engine(connection_string)

# Query with chunk processing
query = f"""
SELECT {args.timestamp_col}, {args.user_col}, {args.item_col}, {args.rating_col} 
FROM {args.oltp_shema}.{args.transtion_table_name}
WHERE {args.timestamp_col} >= EXTRACT(EPOCH FROM TIMESTAMP '2020-01-01') * 1000 
"""

# Process chunks with tqdm for progress tracking
df_list = []
chunk_size = 10000  # Adjust chunk size based on available memory

chunks = pd.read_sql(query, engine, chunksize=chunk_size)
for chunk in tqdm(chunks, desc="Loading data", unit="chunk"):
    df_list.append(chunk.drop_duplicates())  # Drop duplicates per chunk

# Concatenate all processed chunks into a final DataFrame
full_df = pd.concat(df_list, ignore_index=True)

print(f"Final DataFrame shape: {full_df.shape}")


Loading data: 0chunk [00:00, ?chunk/s]

Loading data: 1chunk [00:00,  5.18chunk/s]

Loading data: 2chunk [00:00,  6.36chunk/s]

Loading data: 8chunk [00:00, 17.39chunk/s]

Loading data: 13chunk [00:00, 25.09chunk/s]

Loading data: 16chunk [00:00, 21.27chunk/s]

Loading data: 19chunk [00:01, 19.41chunk/s]

Loading data: 24chunk [00:01, 25.62chunk/s]

Loading data: 27chunk [00:01, 22.05chunk/s]

Loading data: 30chunk [00:01, 19.88chunk/s]

Loading data: 35chunk [00:01, 25.23chunk/s]

Loading data: 38chunk [00:01, 21.57chunk/s]

Loading data: 42chunk [00:02, 20.99chunk/s]

Loading data: 47chunk [00:02, 21.83chunk/s]

Loading data: 52chunk [00:02, 26.75chunk/s]

Loading data: 56chunk [00:02, 24.75chunk/s]

Loading data: 59chunk [00:02, 22.63chunk/s]

Loading data: 64chunk [00:02, 22.91chunk/s]

Loading data: 69chunk [00:03, 27.89chunk/s]

Loading data: 73chunk [00:03, 25.48chunk/s]

Loading data: 76chunk [00:03, 23.11chunk/s]

Loading data: 81chunk [00:03, 23.34chunk/s]

Loading data: 86chunk [00:03, 27.71chunk/s]

Loading data: 90chunk [00:03, 24.46chunk/s]

Loading data: 93chunk [00:04, 21.59chunk/s]

Loading data: 98chunk [00:04, 21.51chunk/s]

Loading data: 103chunk [00:04, 21.42chunk/s]

Loading data: 108chunk [00:04, 25.86chunk/s]

Loading data: 111chunk [00:04, 22.65chunk/s]

Loading data: 116chunk [00:05, 22.01chunk/s]

Loading data: 121chunk [00:05, 21.81chunk/s]

Loading data: 126chunk [00:05, 25.89chunk/s]

Loading data: 129chunk [00:05, 22.71chunk/s]

Loading data: 132chunk [00:05, 21.12chunk/s]

Loading data: 137chunk [00:05, 26.50chunk/s]

Loading data: 141chunk [00:06, 24.44chunk/s]

Loading data: 145chunk [00:06, 23.23chunk/s]

Loading data: 150chunk [00:06, 23.23chunk/s]

Loading data: 155chunk [00:06, 28.06chunk/s]

Loading data: 159chunk [00:06, 25.44chunk/s]

Loading data: 162chunk [00:07, 22.82chunk/s]

Loading data: 167chunk [00:07, 22.87chunk/s]

Loading data: 172chunk [00:07, 27.78chunk/s]

Loading data: 176chunk [00:07, 25.19chunk/s]

Loading data: 179chunk [00:07, 22.68chunk/s]

Loading data: 184chunk [00:07, 27.99chunk/s]

Loading data: 188chunk [00:08, 25.30chunk/s]

Loading data: 191chunk [00:08, 22.84chunk/s]

Loading data: 196chunk [00:08, 28.30chunk/s]

Loading data: 200chunk [00:08, 25.69chunk/s]

Loading data: 203chunk [00:08, 23.19chunk/s]

Loading data: 208chunk [00:08, 23.29chunk/s]

Loading data: 213chunk [00:08, 28.43chunk/s]

Loading data: 217chunk [00:09, 25.89chunk/s]

Loading data: 220chunk [00:09, 23.24chunk/s]

Loading data: 225chunk [00:09, 28.58chunk/s]

Loading data: 229chunk [00:09, 25.88chunk/s]

Loading data: 233chunk [00:09, 24.20chunk/s]

Loading data: 238chunk [00:10, 22.65chunk/s]

Loading data: 243chunk [00:10, 27.18chunk/s]

Loading data: 247chunk [00:10, 23.75chunk/s]

Loading data: 250chunk [00:10, 21.72chunk/s]

Loading data: 255chunk [00:10, 26.90chunk/s]

Loading data: 259chunk [00:10, 24.37chunk/s]

Loading data: 262chunk [00:11, 21.30chunk/s]

Loading data: 267chunk [00:11, 26.19chunk/s]

Loading data: 271chunk [00:11, 23.14chunk/s]

Loading data: 275chunk [00:11, 21.51chunk/s]

Loading data: 280chunk [00:11, 21.28chunk/s]

Loading data: 285chunk [00:12, 25.51chunk/s]

Loading data: 288chunk [00:12, 22.96chunk/s]

Loading data: 292chunk [00:12, 22.20chunk/s]

Loading data: 297chunk [00:12, 27.39chunk/s]

Loading data: 301chunk [00:12, 25.03chunk/s]

Loading data: 305chunk [00:12, 23.60chunk/s]

Loading data: 310chunk [00:13, 22.49chunk/s]

Loading data: 315chunk [00:13, 27.23chunk/s]

Loading data: 319chunk [00:13, 25.02chunk/s]

Loading data: 322chunk [00:13, 22.26chunk/s]

Loading data: 327chunk [00:13, 27.47chunk/s]

Loading data: 331chunk [00:13, 24.93chunk/s]

Loading data: 334chunk [00:14, 22.25chunk/s]

Loading data: 339chunk [00:14, 27.54chunk/s]

Loading data: 343chunk [00:14, 25.00chunk/s]

Loading data: 347chunk [00:14, 23.43chunk/s]

Loading data: 352chunk [00:14, 23.28chunk/s]

Loading data: 357chunk [00:14, 28.19chunk/s]

Loading data: 361chunk [00:15, 25.47chunk/s]

Loading data: 364chunk [00:15, 22.89chunk/s]

Loading data: 369chunk [00:15, 28.19chunk/s]

Loading data: 373chunk [00:15, 25.45chunk/s]

Loading data: 377chunk [00:15, 23.76chunk/s]

Loading data: 382chunk [00:15, 23.34chunk/s]

Loading data: 387chunk [00:16, 27.76chunk/s]

Loading data: 391chunk [00:16, 24.73chunk/s]

Loading data: 394chunk [00:16, 21.98chunk/s]

Loading data: 399chunk [00:16, 27.22chunk/s]

Loading data: 403chunk [00:16, 24.55chunk/s]

Loading data: 406chunk [00:16, 21.84chunk/s]

Loading data: 411chunk [00:17, 27.08chunk/s]

Loading data: 415chunk [00:17, 24.46chunk/s]

Loading data: 419chunk [00:17, 22.70chunk/s]

Loading data: 424chunk [00:17, 22.40chunk/s]

Loading data: 429chunk [00:17, 26.80chunk/s]

Loading data: 433chunk [00:18, 23.72chunk/s]

Loading data: 436chunk [00:18, 21.17chunk/s]

Loading data: 441chunk [00:18, 25.99chunk/s]

Loading data: 444chunk [00:18, 22.27chunk/s]

Loading data: 449chunk [00:18, 21.83chunk/s]

Loading data: 454chunk [00:18, 26.34chunk/s]

Loading data: 458chunk [00:19, 23.38chunk/s]

Loading data: 461chunk [00:19, 20.68chunk/s]

Loading data: 466chunk [00:19, 20.74chunk/s]

Loading data: 471chunk [00:19, 25.09chunk/s]

Loading data: 474chunk [00:19, 21.74chunk/s]

Loading data: 478chunk [00:20, 20.41chunk/s]

Loading data: 483chunk [00:20, 25.43chunk/s]

Loading data: 486chunk [00:20, 22.32chunk/s]

Loading data: 491chunk [00:20, 22.15chunk/s]

Loading data: 496chunk [00:20, 26.97chunk/s]

Loading data: 500chunk [00:20, 24.31chunk/s]

Loading data: 503chunk [00:21, 21.70chunk/s]

Loading data: 508chunk [00:21, 21.77chunk/s]

Loading data: 513chunk [00:21, 26.61chunk/s]

Loading data: 517chunk [00:21, 23.88chunk/s]

Loading data: 521chunk [00:21, 22.19chunk/s]

Loading data: 526chunk [00:22, 26.82chunk/s]

Loading data: 530chunk [00:22, 23.79chunk/s]

Loading data: 533chunk [00:22, 21.28chunk/s]

Loading data: 538chunk [00:22, 21.37chunk/s]

Loading data: 543chunk [00:22, 26.21chunk/s]

Loading data: 547chunk [00:22, 23.69chunk/s]

Loading data: 550chunk [00:23, 21.33chunk/s]

Loading data: 555chunk [00:23, 26.36chunk/s]

Loading data: 559chunk [00:23, 23.67chunk/s]

Loading data: 563chunk [00:23, 22.08chunk/s]

Loading data: 568chunk [00:23, 27.06chunk/s]

Loading data: 572chunk [00:23, 24.23chunk/s]

Loading data: 575chunk [00:24, 21.64chunk/s]

Loading data: 580chunk [00:24, 21.73chunk/s]

Loading data: 585chunk [00:24, 26.69chunk/s]

Loading data: 589chunk [00:24, 24.06chunk/s]

Loading data: 593chunk [00:24, 22.31chunk/s]

Loading data: 598chunk [00:25, 27.19chunk/s]

Loading data: 602chunk [00:25, 24.26chunk/s]

Loading data: 605chunk [00:25, 21.66chunk/s]

Loading data: 610chunk [00:25, 26.83chunk/s]

Loading data: 614chunk [00:25, 23.92chunk/s]

Loading data: 617chunk [00:25, 20.99chunk/s]

Loading data: 621chunk [00:26, 24.36chunk/s]

Loading data: 624chunk [00:26, 19.75chunk/s]

Loading data: 628chunk [00:26, 22.50chunk/s]

Loading data: 631chunk [00:26, 17.68chunk/s]

Loading data: 635chunk [00:26, 16.39chunk/s]

Loading data: 639chunk [00:27, 20.08chunk/s]

Loading data: 642chunk [00:27, 17.60chunk/s]

Loading data: 646chunk [00:27, 21.23chunk/s]

Loading data: 649chunk [00:27, 18.11chunk/s]

Loading data: 653chunk [00:27, 17.03chunk/s]

Loading data: 657chunk [00:28, 20.23chunk/s]

Loading data: 660chunk [00:28, 17.82chunk/s]

Loading data: 665chunk [00:28, 17.76chunk/s]

Loading data: 669chunk [00:28, 20.83chunk/s]

Loading data: 672chunk [00:28, 18.34chunk/s]

Loading data: 676chunk [00:28, 21.88chunk/s]

Loading data: 679chunk [00:29, 18.41chunk/s]

Loading data: 683chunk [00:29, 17.45chunk/s]

Loading data: 687chunk [00:29, 21.12chunk/s]

Loading data: 690chunk [00:29, 18.35chunk/s]

Loading data: 694chunk [00:30, 17.23chunk/s]

Loading data: 699chunk [00:30, 21.88chunk/s]

Loading data: 702chunk [00:30, 18.56chunk/s]

Loading data: 706chunk [00:30, 22.00chunk/s]

Loading data: 709chunk [00:30, 18.57chunk/s]

Loading data: 713chunk [00:30, 22.45chunk/s]

Loading data: 716chunk [00:31, 18.84chunk/s]

Loading data: 720chunk [00:31, 17.64chunk/s]

Loading data: 724chunk [00:31, 21.14chunk/s]

Loading data: 727chunk [00:31, 18.29chunk/s]

Loading data: 732chunk [00:31, 18.08chunk/s]

Loading data: 736chunk [00:32, 21.58chunk/s]

Loading data: 739chunk [00:32, 18.45chunk/s]

Loading data: 743chunk [00:32, 22.21chunk/s]

Loading data: 746chunk [00:32, 18.90chunk/s]

Loading data: 750chunk [00:32, 17.63chunk/s]

Loading data: 754chunk [00:33, 21.33chunk/s]

Loading data: 757chunk [00:33, 18.36chunk/s]

Loading data: 762chunk [00:33, 18.16chunk/s]

Loading data: 767chunk [00:33, 22.56chunk/s]

Loading data: 770chunk [00:33, 19.32chunk/s]

Loading data: 774chunk [00:34, 18.02chunk/s]

Loading data: 778chunk [00:34, 21.58chunk/s]

Loading data: 781chunk [00:34, 18.63chunk/s]

Loading data: 786chunk [00:34, 18.45chunk/s]

Loading data: 791chunk [00:34, 22.77chunk/s]

Loading data: 794chunk [00:35, 19.52chunk/s]

Loading data: 798chunk [00:35, 18.25chunk/s]

Loading data: 803chunk [00:35, 22.74chunk/s]

Loading data: 806chunk [00:35, 19.55chunk/s]

Loading data: 810chunk [00:35, 18.12chunk/s]

Loading data: 815chunk [00:36, 22.51chunk/s]

Loading data: 818chunk [00:36, 19.28chunk/s]

Loading data: 823chunk [00:36, 18.84chunk/s]

Loading data: 827chunk [00:36, 22.19chunk/s]

Loading data: 830chunk [00:36, 18.96chunk/s]

Loading data: 834chunk [00:37, 22.49chunk/s]

Loading data: 837chunk [00:37, 19.00chunk/s]

Loading data: 841chunk [00:37, 17.63chunk/s]

Loading data: 845chunk [00:37, 21.30chunk/s]

Loading data: 848chunk [00:37, 18.25chunk/s]

Loading data: 852chunk [00:37, 22.11chunk/s]

Loading data: 855chunk [00:38, 18.57chunk/s]

Loading data: 859chunk [00:38, 17.42chunk/s]

Loading data: 863chunk [00:38, 21.20chunk/s]

Loading data: 866chunk [00:38, 18.22chunk/s]

Loading data: 871chunk [00:38, 22.98chunk/s]

Loading data: 874chunk [00:39, 19.13chunk/s]

Loading data: 878chunk [00:39, 22.92chunk/s]

Loading data: 881chunk [00:39, 19.09chunk/s]

Loading data: 885chunk [00:39, 17.74chunk/s]

Loading data: 889chunk [00:39, 21.55chunk/s]

Loading data: 892chunk [00:40, 18.37chunk/s]

Loading data: 896chunk [00:40, 22.26chunk/s]

Loading data: 899chunk [00:40, 18.76chunk/s]

Loading data: 904chunk [00:40, 23.69chunk/s]

Loading data: 907chunk [00:40, 19.72chunk/s]

Loading data: 911chunk [00:41, 18.26chunk/s]

Loading data: 915chunk [00:41, 21.94chunk/s]

Loading data: 918chunk [00:41, 18.62chunk/s]

Loading data: 923chunk [00:41, 23.35chunk/s]

Loading data: 926chunk [00:41, 19.50chunk/s]

Loading data: 931chunk [00:41, 18.63chunk/s]

Loading data: 935chunk [00:42, 22.04chunk/s]

Loading data: 938chunk [00:42, 18.62chunk/s]

Loading data: 942chunk [00:42, 22.27chunk/s]

Loading data: 945chunk [00:42, 18.74chunk/s]

Loading data: 949chunk [00:42, 22.50chunk/s]

Loading data: 952chunk [00:43, 18.56chunk/s]

Loading data: 956chunk [00:43, 22.39chunk/s]

Loading data: 959chunk [00:43, 18.66chunk/s]

Loading data: 964chunk [00:43, 18.39chunk/s]

Loading data: 968chunk [00:43, 22.00chunk/s]

Loading data: 971chunk [00:43, 18.63chunk/s]

Loading data: 976chunk [00:44, 23.42chunk/s]

Loading data: 979chunk [00:44, 19.66chunk/s]

Loading data: 983chunk [00:44, 18.16chunk/s]

Loading data: 987chunk [00:44, 21.84chunk/s]

Loading data: 990chunk [00:44, 18.42chunk/s]

Loading data: 994chunk [00:45, 22.09chunk/s]

Loading data: 997chunk [00:45, 18.41chunk/s]

Loading data: 1001chunk [00:45, 21.70chunk/s]

Loading data: 1004chunk [00:45, 18.05chunk/s]

Loading data: 1009chunk [00:45, 18.07chunk/s]

Loading data: 1014chunk [00:46, 22.49chunk/s]

Loading data: 1017chunk [00:46, 18.95chunk/s]

Loading data: 1021chunk [00:46, 22.23chunk/s]

Loading data: 1024chunk [00:46, 17.46chunk/s]

Loading data: 1028chunk [00:46, 21.18chunk/s]

Loading data: 1031chunk [00:47, 18.03chunk/s]

Loading data: 1036chunk [00:47, 17.99chunk/s]

Loading data: 1040chunk [00:47, 21.55chunk/s]

Loading data: 1043chunk [00:47, 18.45chunk/s]

Loading data: 1047chunk [00:47, 22.08chunk/s]

Loading data: 1050chunk [00:47, 18.35chunk/s]

Loading data: 1054chunk [00:48, 22.20chunk/s]

Loading data: 1057chunk [00:48, 18.68chunk/s]

Loading data: 1062chunk [00:48, 18.11chunk/s]

Loading data: 1067chunk [00:48, 22.51chunk/s]

Loading data: 1070chunk [00:48, 19.11chunk/s]

Loading data: 1074chunk [00:49, 22.57chunk/s]

Loading data: 1077chunk [00:49, 18.56chunk/s]

Loading data: 1081chunk [00:49, 17.28chunk/s]

Loading data: 1085chunk [00:49, 20.98chunk/s]

Loading data: 1088chunk [00:49, 17.92chunk/s]

Loading data: 1092chunk [00:50, 21.73chunk/s]

Loading data: 1095chunk [00:50, 18.11chunk/s]

Loading data: 1099chunk [00:50, 21.86chunk/s]

Loading data: 1102chunk [00:50, 18.23chunk/s]

Loading data: 1106chunk [00:50, 22.14chunk/s]

Loading data: 1109chunk [00:50, 18.33chunk/s]

Loading data: 1113chunk [00:51, 22.30chunk/s]

Loading data: 1116chunk [00:51, 18.44chunk/s]

Loading data: 1120chunk [00:51, 22.34chunk/s]

Loading data: 1123chunk [00:51, 18.42chunk/s]

Loading data: 1127chunk [00:51, 17.07chunk/s]

Loading data: 1131chunk [00:52, 20.81chunk/s]

Loading data: 1134chunk [00:52, 17.83chunk/s]

Loading data: 1138chunk [00:52, 21.47chunk/s]

Loading data: 1141chunk [00:52, 17.59chunk/s]

Loading data: 1145chunk [00:52, 21.48chunk/s]

Loading data: 1148chunk [00:52, 17.72chunk/s]

Loading data: 1152chunk [00:53, 21.54chunk/s]

Loading data: 1155chunk [00:53, 17.88chunk/s]

Loading data: 1159chunk [00:53, 21.70chunk/s]

Loading data: 1162chunk [00:53, 17.84chunk/s]

Loading data: 1166chunk [00:53, 16.49chunk/s]

Loading data: 1170chunk [00:54, 20.27chunk/s]

Loading data: 1173chunk [00:54, 17.25chunk/s]

Loading data: 1177chunk [00:54, 21.05chunk/s]

Loading data: 1180chunk [00:54, 17.71chunk/s]

Loading data: 1185chunk [00:54, 22.75chunk/s]

Loading data: 1188chunk [00:55, 18.74chunk/s]

Loading data: 1192chunk [00:55, 17.24chunk/s]

Loading data: 1196chunk [00:55, 20.95chunk/s]

Loading data: 1200chunk [00:55, 18.62chunk/s]

Loading data: 1204chunk [00:55, 22.25chunk/s]

Loading data: 1207chunk [00:56, 18.28chunk/s]

Loading data: 1211chunk [00:56, 21.91chunk/s]

Loading data: 1214chunk [00:56, 18.17chunk/s]

Loading data: 1218chunk [00:56, 21.85chunk/s]

Loading data: 1221chunk [00:56, 18.12chunk/s]

Loading data: 1225chunk [00:56, 21.95chunk/s]

Loading data: 1228chunk [00:57, 17.83chunk/s]

Loading data: 1232chunk [00:57, 21.69chunk/s]

Loading data: 1235chunk [00:57, 18.09chunk/s]

Loading data: 1239chunk [00:57, 22.01chunk/s]

Loading data: 1242chunk [00:57, 18.03chunk/s]

Loading data: 1247chunk [00:57, 22.77chunk/s]

Loading data: 1250chunk [00:58, 18.45chunk/s]

Loading data: 1254chunk [00:58, 22.17chunk/s]

Loading data: 1257chunk [00:58, 18.26chunk/s]

Loading data: 1261chunk [00:58, 22.01chunk/s]

Loading data: 1264chunk [00:58, 17.99chunk/s]

Loading data: 1268chunk [00:58, 21.75chunk/s]

Loading data: 1271chunk [00:59, 17.80chunk/s]

Loading data: 1275chunk [00:59, 21.81chunk/s]

Loading data: 1278chunk [00:59, 18.04chunk/s]

Loading data: 1282chunk [00:59, 22.00chunk/s]

Loading data: 1285chunk [00:59, 18.16chunk/s]

Loading data: 1289chunk [01:00, 21.98chunk/s]

Loading data: 1292chunk [01:00, 18.08chunk/s]

Loading data: 1296chunk [01:00, 22.05chunk/s]

Loading data: 1299chunk [01:00, 18.00chunk/s]

Loading data: 1303chunk [01:00, 21.92chunk/s]

Loading data: 1306chunk [01:01, 17.90chunk/s]

Loading data: 1310chunk [01:01, 21.51chunk/s]

Loading data: 1313chunk [01:01, 17.88chunk/s]

Loading data: 1317chunk [01:01, 21.84chunk/s]

Loading data: 1320chunk [01:01, 18.09chunk/s]

Loading data: 1324chunk [01:01, 22.06chunk/s]

Loading data: 1327chunk [01:02, 17.86chunk/s]

Loading data: 1331chunk [01:02, 21.16chunk/s]

Loading data: 1334chunk [01:02, 17.28chunk/s]

Loading data: 1338chunk [01:02, 20.87chunk/s]

Loading data: 1341chunk [01:02, 17.33chunk/s]

Loading data: 1345chunk [01:02, 20.91chunk/s]

Loading data: 1348chunk [01:03, 17.04chunk/s]

Loading data: 1352chunk [01:03, 20.89chunk/s]

Loading data: 1355chunk [01:03, 17.43chunk/s]

Loading data: 1359chunk [01:03, 21.39chunk/s]

Loading data: 1362chunk [01:03, 17.75chunk/s]

Loading data: 1366chunk [01:04, 21.62chunk/s]

Loading data: 1369chunk [01:04, 17.79chunk/s]

Loading data: 1373chunk [01:04, 21.71chunk/s]

Loading data: 1376chunk [01:04, 17.89chunk/s]

Loading data: 1380chunk [01:04, 21.60chunk/s]

Loading data: 1383chunk [01:04, 17.59chunk/s]

Loading data: 1387chunk [01:05, 21.50chunk/s]

Loading data: 1390chunk [01:05, 17.64chunk/s]

Loading data: 1394chunk [01:05, 21.47chunk/s]

Loading data: 1397chunk [01:05, 17.71chunk/s]

Loading data: 1401chunk [01:05, 21.53chunk/s]

Loading data: 1404chunk [01:06, 17.60chunk/s]

Loading data: 1408chunk [01:06, 21.50chunk/s]

Loading data: 1411chunk [01:06, 17.60chunk/s]

Loading data: 1415chunk [01:06, 21.33chunk/s]

Loading data: 1418chunk [01:06, 17.38chunk/s]

Loading data: 1422chunk [01:06, 21.33chunk/s]

Loading data: 1425chunk [01:07, 17.46chunk/s]

Loading data: 1429chunk [01:07, 21.11chunk/s]

Loading data: 1432chunk [01:07, 17.31chunk/s]

Loading data: 1436chunk [01:07, 21.15chunk/s]

Loading data: 1439chunk [01:07, 16.88chunk/s]

Loading data: 1443chunk [01:08, 20.62chunk/s]

Loading data: 1446chunk [01:08, 16.98chunk/s]

Loading data: 1450chunk [01:08, 20.70chunk/s]

Loading data: 1453chunk [01:08, 17.37chunk/s]

Loading data: 1457chunk [01:08, 21.34chunk/s]

Loading data: 1460chunk [01:09, 17.24chunk/s]

Loading data: 1464chunk [01:09, 21.16chunk/s]

Loading data: 1467chunk [01:09, 17.41chunk/s]

Loading data: 1471chunk [01:09, 21.31chunk/s]

Loading data: 1474chunk [01:09, 17.59chunk/s]

Loading data: 1478chunk [01:09, 21.31chunk/s]

Loading data: 1481chunk [01:10, 17.34chunk/s]

Loading data: 1485chunk [01:10, 21.20chunk/s]

Loading data: 1488chunk [01:10, 17.46chunk/s]

Loading data: 1492chunk [01:10, 21.41chunk/s]

Loading data: 1495chunk [01:10, 17.47chunk/s]

Loading data: 1499chunk [01:10, 21.22chunk/s]

Loading data: 1502chunk [01:11, 17.24chunk/s]

Loading data: 1506chunk [01:11, 21.06chunk/s]

Loading data: 1509chunk [01:11, 17.51chunk/s]

Loading data: 1513chunk [01:11, 21.06chunk/s]

Loading data: 1516chunk [01:11, 17.40chunk/s]

Loading data: 1520chunk [01:12, 21.34chunk/s]

Loading data: 1523chunk [01:12, 17.46chunk/s]

Loading data: 1527chunk [01:12, 21.24chunk/s]

Loading data: 1530chunk [01:12, 17.00chunk/s]

Loading data: 1534chunk [01:12, 20.81chunk/s]

Loading data: 1537chunk [01:13, 17.36chunk/s]

Loading data: 1541chunk [01:13, 21.36chunk/s]

Loading data: 1544chunk [01:13, 17.78chunk/s]

Loading data: 1548chunk [01:13, 21.79chunk/s]

Loading data: 1551chunk [01:13, 18.42chunk/s]

Loading data: 1556chunk [01:14, 17.81chunk/s]

Loading data: 1560chunk [01:14, 21.33chunk/s]

Loading data: 1564chunk [01:14, 18.72chunk/s]

Loading data: 1568chunk [01:14, 22.08chunk/s]

Loading data: 1571chunk [01:14, 18.07chunk/s]

Loading data: 1575chunk [01:14, 21.54chunk/s]

Loading data: 1578chunk [01:15, 17.47chunk/s]

Loading data: 1582chunk [01:15, 21.28chunk/s]

Loading data: 1585chunk [01:15, 17.76chunk/s]

Loading data: 1589chunk [01:15, 21.55chunk/s]

Loading data: 1592chunk [01:15, 17.78chunk/s]

Loading data: 1596chunk [01:15, 21.65chunk/s]

Loading data: 1599chunk [01:16, 17.74chunk/s]

Loading data: 1603chunk [01:16, 21.52chunk/s]

Loading data: 1606chunk [01:16, 17.46chunk/s]

Loading data: 1610chunk [01:16, 21.18chunk/s]

Loading data: 1613chunk [01:16, 17.83chunk/s]

Loading data: 1617chunk [01:17, 21.77chunk/s]

Loading data: 1620chunk [01:17, 18.76chunk/s]

Loading data: 1625chunk [01:17, 24.30chunk/s]

Loading data: 1628chunk [01:17, 20.49chunk/s]

Loading data: 1633chunk [01:17, 25.73chunk/s]

Loading data: 1637chunk [01:17, 22.29chunk/s]

Loading data: 1640chunk [01:18, 19.34chunk/s]

Loading data: 1645chunk [01:18, 24.33chunk/s]

Loading data: 1648chunk [01:18, 20.59chunk/s]

Loading data: 1653chunk [01:18, 25.50chunk/s]

Loading data: 1657chunk [01:18, 22.14chunk/s]

Loading data: 1662chunk [01:19, 20.92chunk/s]

Loading data: 1667chunk [01:19, 25.26chunk/s]

Loading data: 1671chunk [01:19, 22.07chunk/s]

Loading data: 1676chunk [01:19, 21.04chunk/s]

Loading data: 1681chunk [01:19, 25.43chunk/s]

Loading data: 1685chunk [01:20, 22.30chunk/s]

Loading data: 1690chunk [01:20, 21.24chunk/s]

Loading data: 1695chunk [01:20, 25.57chunk/s]

Loading data: 1699chunk [01:20, 22.42chunk/s]

Loading data: 1704chunk [01:20, 21.34chunk/s]

Loading data: 1709chunk [01:21, 25.69chunk/s]

Loading data: 1713chunk [01:21, 22.60chunk/s]

Loading data: 1718chunk [01:21, 21.41chunk/s]

Loading data: 1723chunk [01:21, 25.76chunk/s]

Loading data: 1727chunk [01:21, 22.50chunk/s]

Loading data: 1732chunk [01:22, 21.26chunk/s]

Loading data: 1737chunk [01:22, 25.54chunk/s]

Loading data: 1741chunk [01:22, 22.20chunk/s]

Loading data: 1746chunk [01:22, 19.99chunk/s]

Loading data: 1751chunk [01:22, 24.16chunk/s]

Loading data: 1754chunk [01:23, 20.65chunk/s]

Loading data: 1759chunk [01:23, 25.31chunk/s]

Loading data: 1763chunk [01:23, 22.19chunk/s]

Loading data: 1766chunk [01:23, 19.44chunk/s]

Loading data: 1771chunk [01:23, 24.41chunk/s]

Loading data: 1774chunk [01:24, 20.69chunk/s]

Loading data: 1779chunk [01:24, 25.57chunk/s]

Loading data: 1783chunk [01:24, 20.95chunk/s]

Loading data: 1788chunk [01:24, 20.16chunk/s]

Loading data: 1793chunk [01:24, 24.72chunk/s]

Loading data: 1797chunk [01:25, 21.74chunk/s]

Loading data: 1802chunk [01:25, 20.84chunk/s]

Loading data: 1807chunk [01:25, 25.40chunk/s]

Loading data: 1811chunk [01:25, 22.29chunk/s]

Loading data: 1816chunk [01:25, 21.14chunk/s]

Loading data: 1821chunk [01:25, 25.50chunk/s]

Loading data: 1825chunk [01:26, 22.21chunk/s]

Loading data: 1830chunk [01:26, 26.60chunk/s]

Loading data: 1834chunk [01:26, 22.93chunk/s]

Loading data: 1838chunk [01:26, 20.82chunk/s]

Loading data: 1843chunk [01:26, 25.25chunk/s]

Loading data: 1847chunk [01:27, 22.08chunk/s]

Loading data: 1852chunk [01:27, 20.93chunk/s]

Loading data: 1857chunk [01:27, 25.33chunk/s]

Loading data: 1861chunk [01:27, 22.10chunk/s]

Loading data: 1866chunk [01:28, 20.85chunk/s]

Loading data: 1871chunk [01:28, 25.12chunk/s]

Loading data: 1875chunk [01:28, 22.07chunk/s]

Loading data: 1880chunk [01:28, 20.95chunk/s]

Loading data: 1885chunk [01:28, 25.41chunk/s]

Loading data: 1889chunk [01:29, 22.18chunk/s]

Loading data: 1894chunk [01:29, 20.82chunk/s]

Loading data: 1899chunk [01:29, 25.01chunk/s]

Loading data: 1903chunk [01:29, 21.94chunk/s]

Loading data: 1908chunk [01:29, 20.86chunk/s]

Loading data: 1913chunk [01:30, 24.90chunk/s]

Loading data: 1916chunk [01:30, 20.97chunk/s]

Loading data: 1921chunk [01:30, 25.55chunk/s]

Loading data: 1925chunk [01:30, 21.95chunk/s]

Loading data: 1929chunk [01:30, 20.02chunk/s]

Loading data: 1934chunk [01:31, 24.05chunk/s]

Loading data: 1937chunk [01:31, 20.13chunk/s]

Loading data: 1942chunk [01:31, 24.90chunk/s]

Loading data: 1946chunk [01:31, 21.67chunk/s]

Loading data: 1950chunk [01:31, 19.74chunk/s]

Loading data: 1955chunk [01:31, 24.32chunk/s]

Loading data: 1958chunk [01:32, 20.36chunk/s]

Loading data: 1963chunk [01:32, 25.26chunk/s]

Loading data: 1967chunk [01:32, 21.82chunk/s]

Loading data: 1971chunk [01:32, 19.84chunk/s]

Loading data: 1976chunk [01:32, 24.53chunk/s]

Loading data: 1979chunk [01:33, 20.55chunk/s]

Loading data: 1984chunk [01:33, 25.52chunk/s]

Loading data: 1988chunk [01:33, 21.93chunk/s]

Loading data: 1992chunk [01:33, 19.81chunk/s]

Loading data: 1997chunk [01:33, 24.09chunk/s]

Loading data: 2000chunk [01:34, 20.29chunk/s]

Loading data: 2005chunk [01:34, 25.13chunk/s]

Loading data: 2009chunk [01:34, 21.85chunk/s]

Loading data: 2013chunk [01:34, 19.92chunk/s]

Loading data: 2018chunk [01:34, 24.73chunk/s]

Loading data: 2022chunk [01:35, 21.54chunk/s]

Loading data: 2027chunk [01:35, 26.20chunk/s]

Loading data: 2031chunk [01:35, 22.33chunk/s]

Loading data: 2035chunk [01:35, 20.34chunk/s]

Loading data: 2040chunk [01:35, 25.06chunk/s]

Loading data: 2044chunk [01:35, 21.92chunk/s]

Loading data: 2049chunk [01:36, 26.66chunk/s]

Loading data: 2053chunk [01:36, 22.85chunk/s]

Loading data: 2058chunk [01:36, 21.42chunk/s]

Loading data: 2063chunk [01:36, 25.89chunk/s]

Loading data: 2067chunk [01:36, 22.35chunk/s]

Loading data: 2072chunk [01:37, 26.68chunk/s]

Loading data: 2076chunk [01:37, 22.83chunk/s]

Loading data: 2080chunk [01:37, 20.44chunk/s]

Loading data: 2085chunk [01:37, 25.01chunk/s]

Loading data: 2089chunk [01:37, 21.58chunk/s]

Loading data: 2094chunk [01:38, 26.05chunk/s]

Loading data: 2098chunk [01:38, 21.55chunk/s]

Loading data: 2102chunk [01:38, 19.57chunk/s]

Loading data: 2107chunk [01:38, 24.00chunk/s]

Loading data: 2110chunk [01:38, 20.17chunk/s]

Loading data: 2115chunk [01:38, 24.83chunk/s]

Loading data: 2119chunk [01:39, 20.39chunk/s]

Loading data: 2124chunk [01:39, 24.82chunk/s]

Loading data: 2128chunk [01:39, 21.47chunk/s]

Loading data: 2132chunk [01:39, 19.59chunk/s]

Loading data: 2137chunk [01:40, 24.19chunk/s]

Loading data: 2140chunk [01:40, 20.34chunk/s]

Loading data: 2145chunk [01:40, 25.12chunk/s]

Loading data: 2149chunk [01:40, 21.73chunk/s]

Loading data: 2154chunk [01:40, 26.38chunk/s]

Loading data: 2158chunk [01:40, 22.45chunk/s]

Loading data: 2162chunk [01:41, 20.19chunk/s]

Loading data: 2167chunk [01:41, 24.83chunk/s]

Loading data: 2171chunk [01:41, 21.49chunk/s]

Loading data: 2176chunk [01:41, 26.10chunk/s]

Loading data: 2180chunk [01:41, 21.92chunk/s]

Loading data: 2184chunk [01:42, 19.78chunk/s]

Loading data: 2189chunk [01:42, 24.18chunk/s]

Loading data: 2192chunk [01:42, 18.59chunk/s]

Loading data: 2197chunk [01:42, 23.38chunk/s]

Loading data: 2201chunk [01:42, 20.43chunk/s]

Loading data: 2205chunk [01:43, 23.15chunk/s]

Loading data: 2208chunk [01:43, 19.39chunk/s]

Loading data: 2212chunk [01:43, 21.83chunk/s]

Loading data: 2215chunk [01:43, 16.24chunk/s]

Loading data: 2218chunk [01:43, 18.45chunk/s]

Loading data: 2221chunk [01:43, 20.52chunk/s]

Loading data: 2224chunk [01:44, 15.01chunk/s]

Loading data: 2228chunk [01:44, 18.32chunk/s]

Loading data: 2231chunk [01:44, 14.14chunk/s]

Loading data: 2235chunk [01:44, 17.45chunk/s]

Loading data: 2238chunk [01:45, 13.90chunk/s]

Loading data: 2241chunk [01:45, 16.30chunk/s]

Loading data: 2244chunk [01:45, 13.33chunk/s]

Loading data: 2248chunk [01:45, 16.61chunk/s]

Loading data: 2252chunk [01:46, 14.12chunk/s]

Loading data: 2256chunk [01:46, 17.34chunk/s]

Loading data: 2259chunk [01:46, 14.57chunk/s]

Loading data: 2263chunk [01:46, 17.93chunk/s]

Loading data: 2267chunk [01:47, 15.51chunk/s]

Loading data: 2271chunk [01:47, 18.69chunk/s]

Loading data: 2274chunk [01:47, 15.43chunk/s]

Loading data: 2278chunk [01:47, 18.87chunk/s]

Loading data: 2282chunk [01:47, 15.74chunk/s]

Loading data: 2286chunk [01:48, 18.78chunk/s]

Loading data: 2289chunk [01:48, 15.36chunk/s]

Loading data: 2293chunk [01:48, 18.78chunk/s]

Loading data: 2297chunk [01:48, 17.27chunk/s]

Loading data: 2302chunk [01:48, 21.97chunk/s]

Loading data: 2305chunk [01:49, 18.25chunk/s]

Loading data: 2310chunk [01:49, 22.92chunk/s]

Loading data: 2313chunk [01:49, 18.66chunk/s]

Loading data: 2318chunk [01:49, 23.47chunk/s]

Loading data: 2321chunk [01:49, 19.20chunk/s]

Loading data: 2326chunk [01:49, 23.72chunk/s]

Loading data: 2329chunk [01:50, 19.07chunk/s]

Loading data: 2334chunk [01:50, 23.56chunk/s]

Loading data: 2337chunk [01:50, 18.96chunk/s]

Loading data: 2341chunk [01:50, 22.61chunk/s]

Loading data: 2344chunk [01:50, 18.34chunk/s]

Loading data: 2349chunk [01:51, 23.03chunk/s]

Loading data: 2352chunk [01:51, 18.58chunk/s]

Loading data: 2357chunk [01:51, 23.37chunk/s]

Loading data: 2360chunk [01:51, 18.89chunk/s]

Loading data: 2365chunk [01:51, 23.45chunk/s]

Loading data: 2368chunk [01:52, 18.98chunk/s]

Loading data: 2373chunk [01:52, 23.38chunk/s]

Loading data: 2376chunk [01:52, 17.97chunk/s]

Loading data: 2380chunk [01:52, 21.08chunk/s]

Loading data: 2383chunk [01:52, 16.59chunk/s]

Loading data: 2387chunk [01:53, 20.39chunk/s]

Loading data: 2390chunk [01:53, 16.85chunk/s]

Loading data: 2394chunk [01:53, 20.62chunk/s]

Loading data: 2397chunk [01:53, 15.98chunk/s]

Loading data: 2401chunk [01:53, 19.28chunk/s]

Loading data: 2404chunk [01:53, 21.23chunk/s]

Loading data: 2407chunk [01:54, 16.85chunk/s]

Loading data: 2412chunk [01:54, 22.30chunk/s]

Loading data: 2415chunk [01:54, 17.48chunk/s]

Loading data: 2419chunk [01:54, 21.06chunk/s]

Loading data: 2422chunk [01:54, 16.99chunk/s]

Loading data: 2426chunk [01:55, 20.62chunk/s]

Loading data: 2430chunk [01:55, 17.32chunk/s]

Loading data: 2434chunk [01:55, 20.75chunk/s]

Loading data: 2437chunk [01:55, 16.73chunk/s]

Loading data: 2441chunk [01:55, 20.57chunk/s]

Loading data: 2445chunk [01:55, 24.25chunk/s]

Loading data: 2449chunk [01:56, 19.31chunk/s]

Loading data: 2453chunk [01:56, 17.76chunk/s]

Loading data: 2458chunk [01:56, 22.96chunk/s]

Loading data: 2462chunk [01:56, 20.19chunk/s]

Loading data: 2466chunk [01:57, 23.46chunk/s]

Loading data: 2470chunk [01:57, 19.29chunk/s]

Loading data: 2474chunk [01:57, 22.58chunk/s]

Loading data: 2477chunk [01:57, 17.97chunk/s]

Loading data: 2481chunk [01:57, 21.43chunk/s]

Loading data: 2485chunk [01:57, 24.68chunk/s]

Loading data: 2489chunk [01:58, 19.39chunk/s]

Loading data: 2493chunk [01:58, 17.19chunk/s]

Loading data: 2497chunk [01:58, 20.70chunk/s]

Loading data: 2501chunk [01:58, 17.81chunk/s]

Loading data: 2504chunk [01:59, 19.37chunk/s]

Loading data: 2508chunk [01:59, 22.59chunk/s]

Loading data: 2511chunk [01:59, 17.67chunk/s]

Loading data: 2515chunk [01:59, 21.48chunk/s]

Loading data: 2518chunk [01:59, 17.25chunk/s]

Loading data: 2522chunk [01:59, 21.15chunk/s]

Loading data: 2526chunk [02:00, 17.34chunk/s]

Loading data: 2530chunk [02:00, 20.84chunk/s]

Loading data: 2533chunk [02:00, 16.76chunk/s]

Loading data: 2537chunk [02:00, 20.11chunk/s]

Loading data: 2541chunk [02:00, 23.56chunk/s]

Loading data: 2544chunk [02:01, 18.26chunk/s]

Loading data: 2547chunk [02:01, 19.91chunk/s]

Loading data: 2550chunk [02:01, 15.84chunk/s]

Loading data: 2554chunk [02:01, 19.78chunk/s]

Loading data: 2557chunk [02:01, 16.31chunk/s]

Loading data: 2562chunk [02:01, 21.88chunk/s]

Loading data: 2566chunk [02:02, 19.63chunk/s]

Loading data: 2571chunk [02:02, 24.71chunk/s]

Loading data: 2575chunk [02:02, 21.15chunk/s]

Loading data: 2580chunk [02:02, 25.95chunk/s]

Loading data: 2580chunk [05:56, 25.95chunk/s]

IOStream.flush timed out


Loading data: 2580chunk [06:37, 25.95chunk/s]

Loading data: 2582chunk [08:29, 31.46s/chunk]

Loading data: 2583chunk [08:42, 29.54s/chunk]

Loading data: 2584chunk [08:49, 26.69s/chunk]

Loading data: 2587chunk [08:49, 17.21s/chunk]

Loading data: 2589chunk [08:58, 13.91s/chunk]

Loading data: 2591chunk [08:59, 10.37s/chunk]

Loading data: 2595chunk [08:59,  5.90s/chunk]

Loading data: 2595chunk [09:25,  5.90s/chunk]

Loading data: 2597chunk [09:52, 10.72s/chunk]

Loading data: 2598chunk [10:12, 12.08s/chunk]

Loading data: 2599chunk [10:28, 12.70s/chunk]

Loading data: 2600chunk [10:36, 11.84s/chunk]

Loading data: 2602chunk [10:43,  8.80s/chunk]

Loading data: 2603chunk [10:43,  7.18s/chunk]

Loading data: 2606chunk [11:32, 11.32s/chunk]

Loading data: 2607chunk [11:44, 11.59s/chunk]

Loading data: 2608chunk [11:54, 11.11s/chunk]

Loading data: 2609chunk [12:03, 10.62s/chunk]

Loading data: 2610chunk [12:15, 10.92s/chunk]

Loading data: 2611chunk [12:21,  9.83s/chunk]

Loading data: 2612chunk [12:24,  7.84s/chunk]

Loading data: 2613chunk [14:01, 32.48s/chunk]

Loading data: 2614chunk [14:18, 28.38s/chunk]

Loading data: 2615chunk [14:34, 24.73s/chunk]

Loading data: 2616chunk [14:49, 21.90s/chunk]

Loading data: 2617chunk [14:50, 15.67s/chunk]

Loading data: 2620chunk [14:50,  6.95s/chunk]

Loading data: 2622chunk [15:18,  9.39s/chunk]

Loading data: 2623chunk [15:18,  7.63s/chunk]

Loading data: 2624chunk [15:18,  5.99s/chunk]

Loading data: 2629chunk [15:19,  2.37s/chunk]

Loading data: 2632chunk [15:19,  1.57s/chunk]

Loading data: 2636chunk [15:19,  1.05chunk/s]

Loading data: 2639chunk [15:19,  1.43chunk/s]

Loading data: 2644chunk [15:19,  2.38chunk/s]

Loading data: 2647chunk [15:20,  2.98chunk/s]

Loading data: 2652chunk [15:20,  4.63chunk/s]

Loading data: 2655chunk [15:20,  5.45chunk/s]

Loading data: 2660chunk [15:20,  8.06chunk/s]

Loading data: 2663chunk [15:20,  8.69chunk/s]

Loading data: 2667chunk [15:21, 11.38chunk/s]

Loading data: 2670chunk [15:21, 11.53chunk/s]

Loading data: 2675chunk [15:21, 15.89chunk/s]

Loading data: 2678chunk [15:21, 14.63chunk/s]

Loading data: 2683chunk [15:21, 19.19chunk/s]

Loading data: 2686chunk [15:22, 16.49chunk/s]

Loading data: 2691chunk [15:22, 21.12chunk/s]

Loading data: 2694chunk [15:22, 17.76chunk/s]

Loading data: 2699chunk [15:22, 22.66chunk/s]

Loading data: 2702chunk [15:22, 18.78chunk/s]

Loading data: 2706chunk [15:22, 22.00chunk/s]

Loading data: 2709chunk [15:23, 17.23chunk/s]

Loading data: 2714chunk [15:23, 21.92chunk/s]

Loading data: 2718chunk [15:23, 18.68chunk/s]

Loading data: 2723chunk [15:23, 23.33chunk/s]

Loading data: 2726chunk [15:24, 19.25chunk/s]

Loading data: 2731chunk [15:24, 24.02chunk/s]

Loading data: 2735chunk [15:24, 20.41chunk/s]

Loading data: 2740chunk [15:24, 25.16chunk/s]

Loading data: 2744chunk [15:24, 21.28chunk/s]

Loading data: 2749chunk [15:25, 19.66chunk/s]

Loading data: 2754chunk [15:25, 23.86chunk/s]

Loading data: 2758chunk [15:25, 20.51chunk/s]

Loading data: 2763chunk [15:25, 24.40chunk/s]

Loading data: 2766chunk [15:25, 19.37chunk/s]

Loading data: 2771chunk [15:25, 23.72chunk/s]

Loading data: 2774chunk [15:26, 19.02chunk/s]

Loading data: 2779chunk [15:26, 23.75chunk/s]

Loading data: 2783chunk [15:26, 19.14chunk/s]

Loading data: 2787chunk [15:26, 21.91chunk/s]

Loading data: 2790chunk [15:27, 17.09chunk/s]

Loading data: 2794chunk [15:27, 20.78chunk/s]

Loading data: 2798chunk [15:27, 17.65chunk/s]

Loading data: 2802chunk [15:27, 21.21chunk/s]

Loading data: 2805chunk [15:27, 16.73chunk/s]

Loading data: 2809chunk [15:27, 20.12chunk/s]

Loading data: 2813chunk [15:28, 23.44chunk/s]

Loading data: 2816chunk [15:28, 17.92chunk/s]

Loading data: 2820chunk [15:28, 21.78chunk/s]

Loading data: 2823chunk [15:28, 17.08chunk/s]

Loading data: 2827chunk [15:28, 20.61chunk/s]

Loading data: 2830chunk [15:29, 16.56chunk/s]

Loading data: 2834chunk [15:29, 20.06chunk/s]

Loading data: 2838chunk [15:29, 16.80chunk/s]

Loading data: 2842chunk [15:29, 20.17chunk/s]

Loading data: 2845chunk [15:29, 16.62chunk/s]

Loading data: 2849chunk [15:30, 20.30chunk/s]

Loading data: 2853chunk [15:30, 23.66chunk/s]

Loading data: 2856chunk [15:30, 18.31chunk/s]

Loading data: 2861chunk [15:30, 23.16chunk/s]

Loading data: 2864chunk [15:30, 17.96chunk/s]

Loading data: 2867chunk [15:30, 20.02chunk/s]

Loading data: 2870chunk [15:31, 16.45chunk/s]

Loading data: 2875chunk [15:31, 21.56chunk/s]

Loading data: 2878chunk [15:31, 17.43chunk/s]

Loading data: 2882chunk [15:31, 21.23chunk/s]

Loading data: 2885chunk [15:32, 17.01chunk/s]

Loading data: 2890chunk [15:32, 22.21chunk/s]

Loading data: 2894chunk [15:32, 18.39chunk/s]

Loading data: 2899chunk [15:32, 22.11chunk/s]

Loading data: 2902chunk [15:32, 17.73chunk/s]

Loading data: 2906chunk [15:32, 20.82chunk/s]

Loading data: 2910chunk [15:33, 17.25chunk/s]

Loading data: 2914chunk [15:33, 20.75chunk/s]

Loading data: 2918chunk [15:33, 18.08chunk/s]

Loading data: 2923chunk [15:33, 22.72chunk/s]

Loading data: 2926chunk [15:34, 17.98chunk/s]

Loading data: 2930chunk [15:34, 21.24chunk/s]

Loading data: 2934chunk [15:34, 17.84chunk/s]

Loading data: 2939chunk [15:34, 22.43chunk/s]

Loading data: 2942chunk [15:34, 18.16chunk/s]

Loading data: 2946chunk [15:34, 21.49chunk/s]

Loading data: 2950chunk [15:35, 17.31chunk/s]

Loading data: 2954chunk [15:35, 20.74chunk/s]

Loading data: 2958chunk [15:35, 17.80chunk/s]

Loading data: 2962chunk [15:35, 20.43chunk/s]

Loading data: 2962chunk [16:47, 20.43chunk/s]

Loading data: 2965chunk [17:06,  7.38s/chunk]

Loading data: 2966chunk [18:20, 14.36s/chunk]

Loading data: 2967chunk [18:35, 14.35s/chunk]

Loading data: 2968chunk [18:47, 14.07s/chunk]

Loading data: 2969chunk [18:57, 13.21s/chunk]

Loading data: 2969chunk [19:13, 13.21s/chunk]

Loading data: 2971chunk [19:43, 16.72s/chunk]

Loading data: 2972chunk [20:15, 19.63s/chunk]

Loading data: 2973chunk [20:43, 21.52s/chunk]

Loading data: 2974chunk [21:57, 33.95s/chunk]

Loading data: 2975chunk [22:13, 29.51s/chunk]

Loading data: 2976chunk [22:27, 25.39s/chunk]

Loading data: 2977chunk [22:42, 22.43s/chunk]

Loading data: 2978chunk [23:01, 21.67s/chunk]

Loading data: 2979chunk [23:20, 20.73s/chunk]

Loading data: 2980chunk [23:44, 21.54s/chunk]

Loading data: 2981chunk [24:52, 35.18s/chunk]

Loading data: 2982chunk [25:43, 40.08s/chunk]

Loading data: 2983chunk [27:27, 58.92s/chunk]

Loading data: 2984chunk [28:26, 58.94s/chunk]

Loading data: 2985chunk [30:26, 77.18s/chunk]

Loading data: 2986chunk [58:50, 563.28s/chunk]

Loading data: 2987chunk [1:09:00, 577.71s/chunk]

Loading data: 2988chunk [1:09:21, 410.90s/chunk]

Loading data: 2989chunk [1:10:01, 299.44s/chunk]

Loading data: 2990chunk [1:12:03, 246.51s/chunk]

Loading data: 2991chunk [1:12:30, 180.71s/chunk]

Loading data: 2992chunk [1:12:54, 133.65s/chunk]

Loading data: 2993chunk [1:13:45, 108.87s/chunk]

Loading data: 2994chunk [1:15:19, 104.35s/chunk]

Loading data: 2995chunk [1:16:51, 100.79s/chunk]

Loading data: 2996chunk [1:18:47, 105.31s/chunk]

Loading data: 2997chunk [1:19:40, 89.54s/chunk] 

Loading data: 2998chunk [1:23:21, 128.82s/chunk]

Loading data: 2999chunk [1:26:24, 145.27s/chunk]

Loading data: 3000chunk [1:32:32, 211.99s/chunk]

Loading data: 3001chunk [1:40:50, 297.72s/chunk]

Loading data: 3002chunk [1:44:23, 272.29s/chunk]

Loading data: 3003chunk [1:46:39, 231.40s/chunk]

Loading data: 3004chunk [1:49:55, 220.91s/chunk]

Loading data: 3005chunk [1:52:21, 198.20s/chunk]

Loading data: 3006chunk [1:52:50, 147.67s/chunk]

Loading data: 3007chunk [1:55:36, 153.23s/chunk]

Loading data: 3008chunk [1:55:59, 114.10s/chunk]

Loading data: 3009chunk [1:56:42, 92.62s/chunk] 

Loading data: 3010chunk [1:57:26, 78.11s/chunk]

Loading data: 3011chunk [1:59:50, 97.99s/chunk]

Loading data: 3012chunk [2:00:55, 88.09s/chunk]

Loading data: 3013chunk [2:01:17, 68.36s/chunk]

Loading data: 3014chunk [2:03:03, 79.48s/chunk]

Loading data: 3015chunk [2:06:53, 124.54s/chunk]

Loading data: 3016chunk [2:07:18, 94.98s/chunk] 

Loading data: 3017chunk [2:07:32, 70.56s/chunk]

Loading data: 3018chunk [2:07:50, 54.61s/chunk]

Loading data: 3019chunk [2:08:03, 42.46s/chunk]

Loading data: 3020chunk [2:08:19, 34.34s/chunk]

Loading data: 3021chunk [2:08:42, 31.07s/chunk]

Loading data: 3022chunk [2:09:48, 41.46s/chunk]

Loading data: 3023chunk [2:10:00, 32.68s/chunk]

Loading data: 3024chunk [2:10:12, 26.34s/chunk]

Loading data: 3025chunk [2:10:25, 22.45s/chunk]

Loading data: 3026chunk [2:10:41, 20.56s/chunk]

Loading data: 3027chunk [2:11:08, 22.24s/chunk]

Loading data: 3028chunk [2:11:38, 24.82s/chunk]

Loading data: 3029chunk [2:12:03, 24.85s/chunk]

Loading data: 3030chunk [2:12:20, 22.48s/chunk]

Loading data: 3031chunk [2:14:18, 50.95s/chunk]

Loading data: 3032chunk [2:15:02, 48.92s/chunk]

Loading data: 3033chunk [2:15:49, 48.27s/chunk]

Loading data: 3034chunk [2:16:55, 53.68s/chunk]

Loading data: 3035chunk [2:17:25, 46.73s/chunk]

Loading data: 3036chunk [2:18:02, 43.50s/chunk]

Loading data: 3037chunk [2:19:51, 63.38s/chunk]

Loading data: 3038chunk [2:22:04, 84.40s/chunk]

Loading data: 3039chunk [2:26:07, 131.75s/chunk]

Loading data: 3040chunk [2:32:59, 215.71s/chunk]

Loading data: 3041chunk [2:42:03, 312.24s/chunk]

Loading data: 3042chunk [3:22:46, 953.58s/chunk]IOStream.flush timed out


Loading data: 3043chunk [3:38:07, 944.02s/chunk]

Loading data: 3044chunk [3:38:12, 662.23s/chunk]

Loading data: 3046chunk [3:38:13, 356.72s/chunk]

Loading data: 3048chunk [3:38:47, 221.87s/chunk]

Loading data: 3049chunk [3:38:49, 173.16s/chunk]

Loading data: 3053chunk [3:38:49, 76.52s/chunk] 

Loading data: 3053chunk [3:39:14, 76.52s/chunk]

Loading data: 3056chunk [3:39:19, 51.70s/chunk]

Loading data: 3057chunk [3:39:22, 44.28s/chunk]

Loading data: 3059chunk [3:39:23, 30.99s/chunk]

Loading data: 3060chunk [3:39:23, 25.62s/chunk]

Loading data: 3064chunk [3:41:52, 31.48s/chunk]

Loading data: 3065chunk [3:42:53, 35.78s/chunk]

Loading data: 3066chunk [3:45:33, 58.09s/chunk]

Loading data: 3067chunk [3:50:23, 105.15s/chunk]

Loading data: 3068chunk [3:56:53, 169.44s/chunk]

Loading data: 3069chunk [4:07:00, 275.31s/chunk]

IOStream.flush timed out


Loading data: 3070chunk [4:15:12, 331.91s/chunk]

Loading data: 3071chunk [4:16:08, 257.58s/chunk]

Loading data: 3072chunk [4:18:18, 222.12s/chunk]

Loading data: 3073chunk [4:19:13, 174.56s/chunk]

Loading data: 3074chunk [4:19:58, 137.12s/chunk]

Loading data: 3075chunk [4:21:43, 127.74s/chunk]

Loading data: 3076chunk [4:24:03, 131.29s/chunk]

Loading data: 3077chunk [4:26:15, 131.44s/chunk]

Loading data: 3078chunk [4:27:51, 120.94s/chunk]

Loading data: 3079chunk [4:28:44, 100.65s/chunk]

Loading data: 3080chunk [4:32:01, 129.24s/chunk]

Loading data: 3081chunk [4:36:05, 163.98s/chunk]

Loading data: 3082chunk [4:42:10, 224.05s/chunk]

Loading data: 3083chunk [4:55:56, 404.46s/chunk]

Loading data: 3084chunk [5:10:49, 550.90s/chunk]

Loading data: 3085chunk [5:18:17, 519.92s/chunk]

Loading data: 3086chunk [5:19:25, 384.67s/chunk]

Loading data: 3087chunk [5:21:35, 308.14s/chunk]

Loading data: 3088chunk [5:35:21, 463.58s/chunk]

Loading data: 3089chunk [5:35:51, 333.47s/chunk]

Loading data: 3090chunk [5:36:39, 247.72s/chunk]

Loading data: 3091chunk [5:39:13, 219.35s/chunk]

Loading data: 3092chunk [5:40:25, 175.39s/chunk]

Loading data: 3093chunk [5:42:00, 151.29s/chunk]

Loading data: 3094chunk [5:43:52, 139.35s/chunk]

Loading data: 3095chunk [5:46:55, 152.53s/chunk]

Loading data: 3096chunk [5:58:40, 318.26s/chunk]

IOStream.flush timed out


Loading data: 3097chunk [6:36:24, 896.80s/chunk]

IOStream.flush timed out


Loading data: 3098chunk [7:45:00, 1866.67s/chunk]IOStream.flush timed out


Loading data: 3099chunk [8:46:13, 2408.68s/chunk]

In [None]:
connection_string

In [None]:
full_df

In [None]:
# Split train, val, test
train_df, val_df, test_df = train_test_split_timebased(
    full_df)

In [None]:
assert train_df[args.timestamp_col].max() < val_df[args.timestamp_col].min(), "There are overlapping timestamps between train and validation datasets."
assert val_df[args.timestamp_col].max() < test_df[args.timestamp_col].min(), "There are overlapping timestamps between validation and test datasets."

In [None]:
logger.info(f"Train: {train_df.shape}, Val: {val_df.shape}, Test: {test_df.shape}")

## Sampling data

Just randomly get X users will not guarantee that the output dataset would qualify the condition of **richness**. Instead we take an iterative approach where we gradually drop random users from the dataset while keeping an eye on the conditions and our sampling target.

In [None]:
def remove_random_users(df, k=10):
    users = df[args.user_col].unique()
    np.random.seed(args.random_seed)
    to_remove_users = np.random.choice(users, size=k, replace=False)
    return df.loc[lambda df: ~df[args.user_col].isin(to_remove_users)]


def get_unqualified(df, col: str, threshold: int):
    unqualified = df.groupby(col).size().loc[lambda s: s < threshold].index
    return unqualified


get_unqualified_users = partial(
    get_unqualified, col=args.user_col, threshold=args.min_user_interactions
)
get_unqualified_items = partial(
    get_unqualified, col=args.item_col, threshold=args.min_item_interactions
)

In [None]:
buffer_perc = 0.2
perc_users_removed_each_round = 0.01
debug = True
keep_random_removing = True
r = 1

sample_df = train_df.copy()

while keep_random_removing:
    num_users_removed_each_round = int(
        perc_users_removed_each_round * sample_df[args.user_col].nunique()
    )
    print(
        f"\n\nRandomly removing {num_users_removed_each_round} users - Round {r} started"
    )
    sample_df = remove_random_users(sample_df, k=num_users_removed_each_round)

    keep_removing = True
    i = 1

    while keep_removing:
        if debug:
            logger.info(f"Sampling round {i} started")
        keep_removing = False
        uu = get_unqualified_users(sample_df)
        if debug:
            logger.info(f"{len(uu)=}")
        if len(uu):
            sample_df = sample_df.loc[lambda df: ~df[args.user_col].isin(uu)]
            if debug:
                logger.info(f"After removing uu: {len(sample_df)=}")
            assert len(get_unqualified_users(sample_df)) == 0
            keep_removing = True
        ui = get_unqualified_items(sample_df)
        if debug:
            logger.info(f"{len(ui)=}")
        if len(ui):
            sample_df = sample_df.loc[lambda df: ~df[args.item_col].isin(ui)]
            if debug:
                logger.info(f"After removing ui: {len(sample_df)=}")
            assert len(get_unqualified_items(sample_df)) == 0
            keep_removing = True
        i += 1

    sample_users = sample_df[args.user_col].unique()
    sample_items = sample_df[args.item_col].unique()
    num_users = len(sample_users)
    logger.info(f"After randomly removing users - round {r}: {num_users=}")
    if num_users > args.sample_users * (1 + buffer_perc):
        logger.info(
            f"Number of users {num_users} are still greater than expected, keep removing..."
        )
    else:
        logger.info(
            f"Number of users {num_users} are falling below expected threshold, stop and use `sample_df` as final output..."
        )
        keep_random_removing = False
    
    val_sample_df = val_df.loc[
                lambda df: df[args.user_col].isin(sample_users)
                & df[args.item_col].isin(sample_items)
            ]
    test_sample_df = test_df.loc[
                lambda df: df[args.user_col].isin(sample_users)
                & df[args.item_col].isin(sample_items)
            ]
    if (num_val_records := val_sample_df.shape[0]) < 3000:
        logger.info(
            f"Number of val_df records {num_val_records:,.0f} are falling below expected threshold, stop and use `sample_df` as final output..."
        )
        keep_random_removing = False
    if (num_test_records := test_sample_df.shape[0]) < 3000:
        logger.info(
            f"Number of test_df records {num_test_records:,.0f} are falling below expected threshold, stop and use `sample_df` as final output..."
        )
        keep_random_removing = False

    r += 1

sample_users = sample_df[args.user_col].unique()
sample_items = sample_df[args.item_col].unique()
logger.info(f"Final sample sizes: {len(sample_users)=:,.0f}, {len(sample_items)=:,.0f}")


In [None]:
assert sample_df[args.timestamp_col].max() < val_sample_df[args.timestamp_col].min(), "There are overlapping timestamps between train and validation datasets."
assert val_sample_df[args.timestamp_col].max() < test_sample_df[args.timestamp_col].min(), "There are overlapping timestamps between validation and test datasets."

In [None]:
assert val_sample_df.loc[lambda df: ~df[args.user_col].isin(sample_users)].shape[0] == 0, "Validation DataFrame contains unexpected users."
assert test_sample_df.loc[lambda df: ~df[args.user_col].isin(sample_users)].shape[0] == 0, "Test DataFrame contains unexpected users."
assert val_sample_df.loc[lambda df: ~df[args.item_col].isin(sample_items)].shape[0] == 0, "Validation DataFrame contains unexpected items."
assert test_sample_df.loc[lambda df: ~df[args.item_col].isin(sample_items)].shape[0] == 0, "Test DataFrame contains unexpected items."

In [None]:
px.histogram(sample_df.groupby(args.user_col).size())

In [None]:
px.histogram(sample_df.groupby(args.item_col).size())

In [None]:
sample_df

In [None]:
val_sample_df

In [None]:
test_sample_df

In [None]:
subsets = ["train", "val", "test"]
original_length = {"train": train_df.shape[0], "val": val_df.shape[0], "test": test_df.shape[0]}
sampled_length = {"train": sample_df.shape[0], "val": val_sample_df.shape[0], "test": test_sample_df.shape[0]}


In [None]:
original_length

In [None]:
sampled_length

In [None]:
fig = make_subplots(rows=1, cols=3)

# Add data for each subset
for i, subset in enumerate(subsets):
    row = i // 3 + 1
    col = i % 3 +1

    # Add trace for 'curr'
    fig.add_trace(
        go.Bar(
            name="original",
            x=[subset],
            y=[original_length[subset]],
            marker_color = "lightblue",
            showlegend=(i == 0),
            texttemplate="%{y:.2}",
        ),
        row=row,
        col=col,
    )

    # Add trace for 'new'
    fig.add_trace(
        go.Bar(
            name="sample",
            x=[subset],
            y=[sampled_length[subset]],
            marker_color="lightgreen",
            showlegend=(i == 0),
            texttemplate="%{y:.2}",
        ),
        row=row,
        col=col,
    )

    # Add diff annotation
    difference = (sampled_length[subset] - original_length[subset]) / original_length[
        subset
    ]
    fig.add_annotation(
        x=subset,
        y=sampled_length[subset] * 1.10,  # Position above the tallest bar
        text=f"Δ={difference:.2%}",
        showarrow=False,
        font=dict(color="black", size=14),
        row=row,
        col=col,
    )

fig.update_layout(showlegend=True)

fig.show()

In [None]:
# Perit the sampled data
sample_df.to_parquet(f"{args.sample_data_persit_path}/train_sample_interactions_8000u.parquet")
val_sample_df.to_parquet(f"{args.sample_data_persit_path}/val_sample_interactions_8000u.parquet")
test_sample_df.to_parquet(f"{args.sample_data_persit_path}/test_sample_interactions_8000u.parquet")

Remember to version your data with dvc

### Read data and split it into train-val-test sets

In [None]:
train_sample_df = pd.read_parquet(f"{args.sample_data_persit_path}/train_sample_interactions_8000u.parquet")

In [None]:
def plot_interactions_over_time(df):
    plot_df = df.groupby(args.timestamp_col).size()

    fig = px.line(
        x=plot_df.index,
        y=plot_df.values,
        labels={"x": "Date", "y": "Number of Interactions"},
        title="Interactions Over Time",
        height=500,
    )

    fig.update_layout(yaxis=dict(showticklabels=True, tickformat=","))

    fig.show()

In [None]:
plot_interactions_over_time(train_sample_df)

In [None]:
# Build up idm
# Sorted to make sure that even rerun we get same idm mapping
unique_user_ids = sorted(train_sample_df[args.user_col].unique())
unique_item_ids = sorted(train_sample_df[args.item_col].unique())
logger.info(f"Number of unique users: {len(unique_user_ids):,.0f}")
logger.info(f"Number of unique items: {len(unique_item_ids):,.0f}")
idm = IDMapper()
idm.fit(unique_user_ids, unique_item_ids)

In [None]:
idm.save(f"{args.notebook_persit_path}/idm_8000u.json")
idm_persist_fp = f"{args.notebook_persit_path}/idm_8000u.json"
idm = IDMapper().load(idm_persist_fp)

In [None]:
len(idm.item_to_index)

In [None]:
for k, _ in idm.item_to_index.items():
    assert type(k) is str, "Type of user id should be string"
for k,_ in idm.user_to_index.items():
    assert type(k) is str, "Type of item id should be string"