In [3]:
# # Uncomment and run this cell when working on a local machine
# import os
# import sys
# os.chdir("../..")
# sys.path.append('./src')

In [None]:
# Show the current working directory: the relative paths used in the cells below
# ("conf/...", "data/...") are resolved against it.
%pwd

In [6]:
# Reload imported modules automatically before each cell execution, so edits to
# module source files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [7]:
import logging
# Logger used for this notebook's own progress messages.
logger = logging.getLogger("notebooks.debug")

In [8]:
import utils.logging
# Set up logging through the project helper, pointing it at the default YAML config.
# NOTE(review): what setup() configures is defined in utils.logging, not visible here.
utils.logging.setup("conf/logging/default.yaml")

In [9]:
import utils.configs
# Load the application config; values are read later with utils.configs.get().
# The return value is bound to `_` only to keep it out of the cell output.
_ = utils.configs.setup("conf/app.yaml")

2023-11-26 00:45:14,668 - utils.configs - INFO - loading app config 'conf/app.yaml'...
2023-11-26 00:45:14,673 - utils.configs - INFO - loading app config 'conf/app.yaml': done


In [10]:
import dotenv

# Load the development environment variables.
# The call is deliberately NOT wrapped in `assert`: assert statements are removed when
# Python runs with -O, which would silently skip loading the file altogether.
# load_dotenv() returns False when no variable was set (e.g. missing or empty file).
if not dotenv.load_dotenv(dotenv_path="conf/envs/dev.env"):
    raise RuntimeError("no environment variables were loaded from 'conf/envs/dev.env'")

---

In [14]:
import utils.aws.s3

In [None]:
# S3 bucket and key prefix, read from the app config via utils.configs.get().
s3_bucket = utils.configs.get("s3.bucket")
s3_prefix = utils.configs.get("s3.prefix")

In [15]:
# `os` is otherwise imported only in the optional, commented-out first cell, so import it
# here: later cells rely on it and would fail with NameError on a fresh kernel.
import os
import posixpath

# Root of the compacted, joined data on S3.
# S3 URIs always use "/" separators, hence posixpath rather than the OS-specific os.path.
s3_dir = posixpath.join("s3://", s3_bucket, s3_prefix, "joined", "compact")

In [16]:
import os
import posixpath

# Partition to mirror locally; defined once so the S3 and local paths cannot drift apart.
partition_dir = "date=2023-01-01"

# S3 side: always "/"-separated. Local side: OS-specific separators.
src_dir = posixpath.join(s3_dir, partition_dir)
dst_dir = os.path.join("data", "joined", "compact", partition_dir)

In [17]:
import utils.filesystem

In [21]:
import os
import posixpath

# Mirror every file of the S3 partition into the local directory.
for s3_filename in utils.aws.s3.ls(src_dir, fullpath=False):
    # S3 URIs always use "/" separators; the local path uses the OS-specific one.
    src_path = posixpath.join(src_dir, s3_filename)
    dst_path = os.path.join(dst_dir, s3_filename)

    # Files already present locally are skipped, so re-running this cell is cheap.
    if utils.filesystem.path_exists(dst_path):
        continue

    # Make sure the target directory exists before writing into it.
    # NOTE(review): harmless if utils.aws.s3.download_file already creates parent directories.
    os.makedirs(os.path.dirname(dst_path), exist_ok=True)

    logger.info(f"downloading '{src_path}' -> '{dst_path}'...")
    utils.aws.s3.download_file(s3_path=src_path, local_path=dst_path)
    logger.info(f"downloading '{src_path}' -> '{dst_path}': done")

In [22]:
import pyarrow
import pyarrow.dataset

In [23]:
# Open everything under the local data directory as a single dataset. With "hive"
# partitioning, `key=value` directory names (here `date=...`) become columns.
src_dataset = pyarrow.dataset.dataset("data/joined/compact/", partitioning="hive")

In [24]:
# Preview only the first record batch, converted to a pandas DataFrame.
next(iter(src_dataset.to_batches())).to_pandas()

Unnamed: 0,label,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f31_idx,f32_idx,f33_idx,f34_idx,f35_idx,f36_idx,f37_idx,f38_idx,f39_idx,date
0,0,,100.0,1.0,41.0,10.0,0.0,0.0,12,1,...,4,2,27,39,31,0,509,2,4,2023-01-01
1,0,59.0,64.0,26.0,32.0,3.0,0.0,0.0,1,29,...,4,2,228,330,251,102,654,7,2,2023-01-01
2,0,,28.0,,0.0,3.0,3.0,0.0,-1,1,...,2,2,39,58,45,0,162,3,1,2023-01-01
3,0,2.0,11.0,,31.0,1.0,0.0,0.0,39,3,...,3,2,7184,10254,8060,21,73,3,4,2023-01-01
4,0,1.0,166.0,,4.0,20.0,1.0,0.0,70,0,...,3,3,2,2,2,2,170,4,3,2023-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131067,0,,,1.0,0.0,1.0,3.0,0.0,1500,0,...,2,2,1,1,1,1,1008,2,5,2023-01-01
131068,0,5.0,445.0,,,,,0.0,13,0,...,2,2,1,84,1,0,1496,2,2,2023-01-01
131069,0,30.0,1110.0,8.0,11.0,8.0,1.0,0.0,1161,1,...,3,3,2,2,2,2,689,3,3,2023-01-01
131070,0,43.0,569.0,2.0,,3.0,0.0,0.0,11,31,...,3,2,4,5,4,17,210,2,2,2023-01-01


### Test read speed

In [None]:
import time
import tqdm.auto as tqdm

In [26]:
def measure_speed(
    dataset,
    filter = None,
    batch_size = 10_000,
    limit = None
):
    """Measure how fast ``dataset`` can be read batch by batch and converted to pandas.

    Every batch is converted with ``to_pandas()`` so the conversion cost is part of the
    measurement. Progress is shown with a tqdm bar and the outcome (elapsed seconds and
    rows per second) is written to the notebook logger; nothing is returned.

    Args:
        dataset: Object providing ``count_rows(filter=...)`` and
            ``to_batches(filter=..., batch_size=...)`` whose batches have ``to_pandas()``
            (e.g. the ``pyarrow.dataset`` dataset created in this notebook).
        filter: Optional filter, passed unchanged to ``count_rows`` and ``to_batches``.
        batch_size: Batch size passed to ``to_batches``.
        limit: If given, stop once at least this many rows have been read. Whole
            batches are always consumed, so the final count may exceed ``limit``.
            When ``None`` the full dataset is read and its row count is fetched first
            to size the progress bar.
    """
    if limit is None:
        logger.info("getting dataset size...")
        total_records = dataset.count_rows(filter=filter)
        logger.info(f"getting dataset size: done ({total_records} records)")
    else:
        total_records = limit

    logger.info("reading dataset...")
    rows_processed = 0
    # perf_counter() is monotonic and high-resolution, so the measured interval cannot be
    # distorted by system clock adjustments the way time.time() can.
    time_start = time.perf_counter()
    # The context manager closes the progress bar even if reading a batch raises.
    with tqdm.tqdm(desc="reading data", total=total_records) as pbar:
        src_batches = dataset.to_batches(filter=filter, batch_size=batch_size)
        for batch_id, batch in enumerate(src_batches, start=1):
            n_rows = batch.to_pandas().shape[0]
            pbar.set_postfix({'batches': batch_id}, refresh=False)
            pbar.update(n_rows)
            rows_processed += n_rows
            if limit is not None and rows_processed >= limit:
                break
    elapsed_time = time.perf_counter() - time_start

    # Guard against a zero interval (e.g. an empty dataset read instantly).
    read_speed = rows_processed / elapsed_time if elapsed_time > 0 else 0.0
    logger.info(f"reading dataset: done ({int(elapsed_time)} seconds, {int(read_speed)} rows/sec)")

In [27]:
# Full scan with the function defaults: no filter, batch_size=10_000, no row limit.
measure_speed(src_dataset)

2023-11-26 00:46:46,395 - notebooks.debug - INFO - getting dataset size...
2023-11-26 00:46:46,405 - notebooks.debug - INFO - getting dataset size: done (195841983 records)
2023-11-26 00:46:46,405 - notebooks.debug - INFO - reading dataset...


reading data: 100%|██████████| 195841983/195841983 [00:34<00:00, 5718247.20it/s, batches=19660]

2023-11-26 00:47:20,668 - notebooks.debug - INFO - reading dataset: done (34 seconds, 5715902 rows/sec)



