In [1]:
# Record which machine executed this notebook so the timings below can be
# attributed to specific hardware.
!hostname

br012.ib.bridges2.psc.edu


In [2]:
import os
from pathlib import Path
from itertools import product
import time
from typing import Tuple

import duckdb
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [3]:
def dummy_calculation(df: pd.DataFrame) -> float:
    """Return the mean over all elements stored in the ``array`` column.

    Every cell of ``df["array"]`` is treated as an array-like; the cells are
    concatenated into one flat array and averaged. This stands in for "some
    computation that touches every loaded value".

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing an ``array`` column of array-likes.

    Returns
    -------
    float
        Mean of the concatenated values, or ``nan`` when ``df`` has no rows.
    """
    per_row_arrays = df["array"].to_numpy()
    # np.concatenate raises ValueError when given zero arrays, which happens
    # whenever the upstream filter selects no rows. Report "no data" as NaN.
    if len(per_row_arrays) == 0:
        return float("nan")
    return float(np.mean(np.concatenate(per_row_arrays)))

In [4]:
def run_test(
    nrows: int,
    nelem: int,
    root: str = "data",
    expected_n: int = 200,
) -> Tuple[pd.DataFrame, float, float]:
    """Time a filtered join between the attribute table and the array table.

    Opens ``<root>/nrows-<nrows>_nelem-<nelem>/attrib.parquet`` and the
    matching ``arrays-*.parquet`` files with DuckDB, keeps the attribute rows
    whose ``attrib`` value is below ``expected_n / len(attrib)``, joins them to
    the arrays on ``index``, materializes the join as a pandas DataFrame and
    runs ``dummy_calculation`` on it.

    Only the join, the conversion to pandas and the calculation are timed;
    opening the relations and counting the attribute rows happen before the
    clock starts.

    Parameters
    ----------
    nrows, nelem : int
        Identify the dataset directory to read (``nrows-{nrows}_nelem-{nelem}``).
    root : str
        Directory that contains the dataset directories.
    expected_n : int
        Numerator of the filter threshold. Presumably ``attrib`` is uniform on
        [0, 1) so that roughly ``expected_n`` rows pass the filter -- TODO
        confirm against the data generator.

    Returns
    -------
    (df, rt, result)
        The joined DataFrame, the elapsed time in seconds, and the value
        returned by ``dummy_calculation``.

    Raises
    ------
    ZeroDivisionError
        If the attribute table has no rows.
    """
    datadir = Path(root) / f"nrows-{nrows}_nelem-{nelem}"
    attrib = duckdb.from_parquet(str(datadir / "attrib.parquet"))
    arrays = duckdb.from_parquet(str(datadir / "arrays-*.parquet"))

    # Coerce to a built-in float so the repr below is a plain numeric literal
    # even if the caller passes a NumPy integer for expected_n.
    threshold = float(expected_n) / len(attrib)
    # Embed the threshold at full precision: a fixed 6-decimal format rounds
    # small thresholds (to 0.000000 for very large tables), which would make
    # the filter silently select nothing.
    filtered = attrib.filter(f"attrib < {threshold!r}").set_alias("filtered")
    arrays = arrays.set_alias("arrays")

    # perf_counter is the highest-resolution clock for short durations;
    # several configurations finish in a few milliseconds.
    tic = time.perf_counter()
    df = filtered.join(arrays, "filtered.index = arrays.index").df()
    result = dummy_calculation(df)
    rt = time.perf_counter() - tic
    return df, rt, result

In [5]:
# Run one configuration on its own and keep the joined frame for inspection.
df, rt, result = run_test(nrows=1000, nelem=100)

In [7]:
# Preview the first rows of the joined result.
df.head(n=5)

Unnamed: 0,index,attrib,index_2,array
0,0,0.096985,0,"[0.11065926902076481, 2.144258450266957, 1.738..."
1,5,0.035192,5,"[-0.7613518665760686, 0.5911537640444635, 0.50..."
2,10,0.16946,10,"[0.06622648602981021, -0.27364897363442103, -1..."
3,18,0.013489,18,"[-0.451220626441112, -0.25187656987984225, -0...."
4,27,0.155097,27,"[-0.9431276860085344, -0.12053523843683936, 0...."


In [6]:
# Sweep every (nrows, nelem) combination and record the runtime and the
# computed value for each one.
records = []

nrows = 10 ** np.arange(3, 6)
nelem = 10 ** np.arange(1, 5)

# The per-iteration element count gets its own name so that the `nelem` grid
# defined above is not overwritten by the loop variable.
for nrow, n_elem in product(nrows, nelem):
    _, rt, value = run_test(nrow, n_elem)
    records.append([nrow, n_elem, rt, value])
    print(f"nrow={nrow}, nelem={n_elem}, rt={rt:.3f}s")

# Build the frame once from the collected rows; `results` is only ever a
# DataFrame, never the intermediate list.
results = pd.DataFrame(records, columns=["nrow", "nelem", "rt", "value"])

nrow=1000, nelem=10, rt=0.004s
nrow=1000, nelem=100, rt=0.010s
nrow=1000, nelem=1000, rt=0.141s
nrow=1000, nelem=10000, rt=0.763s
nrow=10000, nelem=10, rt=0.054s
nrow=10000, nelem=100, rt=0.110s
nrow=10000, nelem=1000, rt=0.269s
nrow=10000, nelem=10000, rt=1.407s
nrow=100000, nelem=10, rt=0.060s
nrow=100000, nelem=100, rt=0.254s
nrow=100000, nelem=1000, rt=1.004s
nrow=100000, nelem=10000, rt=2.798s


In [9]:
# Show the configuration and runtime columns only, leaving out `value`.
results[["nrow", "nelem", "rt"]]

Unnamed: 0,nrow,nelem,rt
0,1000,10,0.003757
1,1000,100,0.009874
2,1000,1000,0.140762
3,1000,10000,0.762913
4,10000,10,0.053829
5,10000,100,0.110057
6,10000,1000,0.269147
7,10000,10000,1.407382
8,100000,10,0.059533
9,100000,100,0.253589
