In [2]:
# Print the name of the machine this kernel is running on.
!hostname

r002.ib.bridges2.psc.edu


In [3]:
import os
from pathlib import Path
from itertools import product

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [4]:
def generate_attributes(
    nrows: int, rng: "np.random.Generator | None" = None
) -> pd.DataFrame:
    """Build a one-column table of random scalar attributes.

    Parameters
    ----------
    nrows
        Number of rows to generate.
    rng
        Optional NumPy ``Generator`` to draw from, which makes the output
        reproducible without touching global state. When omitted, the
        legacy global ``np.random`` state is used.

    Returns
    -------
    pd.DataFrame
        Frame indexed ``0 .. nrows - 1`` with a single ``"attrib"`` column of
        uniform samples from ``[0, 1)``, passed through
        ``DataFrame.convert_dtypes()``.
    """
    if rng is None:
        # Legacy path: follows np.random.seed(), as callers may rely on.
        values = np.random.rand(nrows)
    else:
        values = rng.random(nrows)
    frame = pd.DataFrame({"attrib": values}, index=np.arange(nrows))
    return frame.convert_dtypes()

In [5]:
def generate_arrays(
    nrows: int,
    nelem: int,
    offset: int = 0,
    rng: "np.random.Generator | None" = None,
) -> pd.DataFrame:
    """Build a table whose single column holds one random array per row.

    Parameters
    ----------
    nrows
        Number of rows (arrays) to generate.
    nelem
        Length of each array.
    offset
        First index label; rows are labelled ``offset .. offset + nrows - 1``.
    rng
        Optional NumPy ``Generator`` to draw from, which makes the output
        reproducible without touching global state. When omitted, the
        legacy global ``np.random`` state is used.

    Returns
    -------
    pd.DataFrame
        Frame with a single ``"array"`` column; each cell is a 1-D array of
        ``nelem`` standard-normal samples. The frame is passed through
        ``DataFrame.convert_dtypes()`` before being returned.
    """
    if rng is None:
        # Legacy path: follows np.random.seed(), as callers may rely on.
        draw = np.random.randn
    else:
        draw = rng.standard_normal
    # One independent draw per row, in row order.
    rows = [draw(nelem) for _ in range(nrows)]
    labels = np.arange(offset, offset + nrows)
    frame = pd.DataFrame({"array": rows}, index=labels)
    return frame.convert_dtypes()

In [6]:
def generate_dataset(
    nrows: int,
    nelem: int,
    batch_size_bytes: int = 128 * 1024 * 1024,
    root: str = "data",
    dry_run: bool = False,
):
    """Write a synthetic dataset of attributes and array batches as Parquet.

    A one-line summary of the plan (sizes, batch count) is always printed.
    Unless ``dry_run`` is set, files are then written under
    ``<root>/nrows-<nrows>_nelem-<nelem>/``:

    * ``attrib/attrib.parquet`` -- the frame from ``generate_attributes(nrows)``.
    * ``arrays/arrays-<offset>.parquet`` -- one file per batch from
      ``generate_arrays``, named by the zero-padded index of its first row.

    Parameters
    ----------
    nrows
        Total number of rows. Any integer-like value (including NumPy
        integer scalars) is accepted.
    nelem
        Number of elements in each row's array; must be at least 1.
    batch_size_bytes
        Approximate byte budget for one batch; determines rows per file.
    root
        Directory under which the dataset directory is created.
    dry_run
        When true, only print the summary and return without writing.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``nrows`` is negative, or ``nelem`` or ``batch_size_bytes`` is
        smaller than 1.
    """
    # Work in Python ints: NumPy integer scalars are fixed-width, so the size
    # arithmetic below could overflow where the default NumPy int is 32-bit.
    nrows = int(nrows)
    nelem = int(nelem)
    batch_size_bytes = int(batch_size_bytes)
    if nrows < 0:
        raise ValueError(f"nrows must be non-negative, got {nrows}")
    if nelem < 1:
        raise ValueError(f"nelem must be at least 1, got {nelem}")
    if batch_size_bytes < 1:
        raise ValueError(
            f"batch_size_bytes must be at least 1, got {batch_size_bytes}"
        )

    # 8 bytes per float64 element times a factor of 4.
    # NOTE(review): the factor 4 is presumably headroom for in-memory or
    # serialisation overhead -- confirm. total_gb below inherits this estimate.
    row_bytes = 4 * 8 * nelem
    rows_fit = batch_size_bytes // row_bytes
    if rows_fit >= 10:
        # Round down to a multiple of ten rows.
        batch_size = 10 * (rows_fit // 10)
    else:
        # Fewer than ten rows fit: rounding would yield 0 and break both the
        # batch count and the range() step, so fall back to at least one row.
        batch_size = max(rows_fit, 1)
    # Integer ceiling division; equals the number of loop iterations below.
    nbatches = -(-nrows // batch_size)
    total_gb = (row_bytes * nrows) / 1e9
    print(
        f"nrows={nrows}\tnelem={nelem}\tnbatches={nbatches}\t"
        f"row_bytes={row_bytes}\tbatch_size={batch_size}\t"
        f"total_gb={total_gb:.2f}"
    )
    if dry_run:
        return

    datadir = Path(root) / f"nrows-{nrows}_nelem-{nelem}"
    attrib_dir = datadir / "attrib"
    arrays_dir = datadir / "arrays"
    attrib_dir.mkdir(parents=True, exist_ok=True)
    arrays_dir.mkdir(parents=True, exist_ok=True)

    attrib = generate_attributes(nrows)
    attrib.to_parquet(attrib_dir / "attrib.parquet")

    for offset in tqdm(range(0, nrows, batch_size)):
        # The final batch may be shorter than batch_size.
        batch_nrows = min(batch_size, nrows - offset)
        arrays = generate_arrays(batch_nrows, nelem, offset=offset)
        arrays.to_parquet(arrays_dir / f"arrays-{offset:08d}.parquet")

In [7]:
# Parameter sweep: every combination of row count (10^3 .. 10^5) and
# array length (10^1 .. 10^4).
nrows = 10 ** np.arange(3, 6)
nelem = 10 ** np.arange(1, 5)

# The loop variables use their own names so the `nelem` grid is not
# overwritten by the last loop value once the cell has run.
for nrow, nel in product(nrows, nelem):
    # Pass plain Python ints rather than fixed-width NumPy integer scalars.
    generate_dataset(int(nrow), int(nel), dry_run=False)

nrows=1000	nelem=10	nbatches=1	row_bytes=320	batch_size=419430	total_gb=0.00


  0%|          | 0/1 [00:00<?, ?it/s]

nrows=1000	nelem=100	nbatches=1	row_bytes=3200	batch_size=41940	total_gb=0.00


  0%|          | 0/1 [00:00<?, ?it/s]

nrows=1000	nelem=1000	nbatches=1	row_bytes=32000	batch_size=4190	total_gb=0.03


  0%|          | 0/1 [00:00<?, ?it/s]

nrows=1000	nelem=10000	nbatches=3	row_bytes=320000	batch_size=410	total_gb=0.32


  0%|          | 0/3 [00:00<?, ?it/s]

nrows=10000	nelem=10	nbatches=1	row_bytes=320	batch_size=419430	total_gb=0.00


  0%|          | 0/1 [00:00<?, ?it/s]

nrows=10000	nelem=100	nbatches=1	row_bytes=3200	batch_size=41940	total_gb=0.03


  0%|          | 0/1 [00:00<?, ?it/s]

nrows=10000	nelem=1000	nbatches=3	row_bytes=32000	batch_size=4190	total_gb=0.32


  0%|          | 0/3 [00:00<?, ?it/s]

nrows=10000	nelem=10000	nbatches=25	row_bytes=320000	batch_size=410	total_gb=3.20


  0%|          | 0/25 [00:00<?, ?it/s]

nrows=100000	nelem=10	nbatches=1	row_bytes=320	batch_size=419430	total_gb=0.03


  0%|          | 0/1 [00:00<?, ?it/s]

nrows=100000	nelem=100	nbatches=3	row_bytes=3200	batch_size=41940	total_gb=0.32


  0%|          | 0/3 [00:00<?, ?it/s]

nrows=100000	nelem=1000	nbatches=24	row_bytes=32000	batch_size=4190	total_gb=3.20


  0%|          | 0/24 [00:00<?, ?it/s]

nrows=100000	nelem=10000	nbatches=244	row_bytes=320000	batch_size=410	total_gb=32.00


  0%|          | 0/244 [00:00<?, ?it/s]