# Testing parallel writes to $N$ `hdf5` files, each containing $p$ numbers

In [1]:
import numpy as np
rng = np.random.default_rng(12345)
import pandas as pd

from p_tqdm import p_map

In [2]:
# p: how many numbers are stored in each ensemble file
p = 100_000
# N: how many ensembles (one file per ensemble) are generated
N = 100

In [3]:
def write_to_many(N, seed=12345):
    """Write one ensemble of random numbers to its own HDF5 file.

    Draws ``p`` standard-normal samples (``p`` is the notebook-level number of
    parameters), wraps them in a single-column DataFrame whose column is named
    after the ensemble index, and writes it to ``Ensemble_{N}.h5`` under the
    key ``"df"`` with ``mode="w"``.

    Parameters
    ----------
    N : int
        Non-negative ensemble index. Note that inside this function it
        shadows the notebook-level ``N`` (the number of ensembles).
    seed : int, default 12345
        Root seed; combined with the ensemble index to seed the generator.

    Returns
    -------
    bool
        Always ``True`` once the file has been written.
    """
    # Build an independent generator per ensemble instead of drawing from a
    # shared module-level one: when this function is mapped across worker
    # processes, a shared generator may be copied into each worker with the
    # same state (depending on the process start method), so different files
    # could end up with identical numbers. Seeding from [index, seed] gives
    # every ensemble its own reproducible stream regardless of scheduling.
    ensemble_rng = np.random.default_rng([int(N), seed])
    df = pd.DataFrame(ensemble_rng.normal(size=p), columns=[f"{N}"])
    df.to_hdf(f"Ensemble_{N}.h5", key="df", mode="w")
    return True

## Write to files

In [4]:
%%time
res = p_map(write_to_many, np.arange(0, N))
print(f"Writing to {N} hdf5 files where each contains {p} numbers takes:")

  0%|          | 0/100 [00:00<?, ?it/s]

Writing to 100 hdf5 files where each contains 100000 numbers takes:
CPU times: user 92.9 ms, sys: 54.9 ms, total: 148 ms
Wall time: 1.02 s


## Read from files

In [5]:
%%time
df = pd.DataFrame(index=np.arange(0, p))
for i in range(N):
    df = df.join(pd.read_hdf(f"Ensemble_{i}.h5"))
    
print(f"Combining {N} hdf5 files containing {p} numbers each into one data frame takes:")

Combining 100 hdf5 files containing 100000 numbers each into one data frame takes:
CPU times: user 804 ms, sys: 107 ms, total: 911 ms
Wall time: 1.52 s


In [6]:
# Display the (rows, columns) dimensions of the combined frame.
df.shape

(100000, 100)

In [7]:
# Sanity check: one row per parameter and one column per ensemble file.
assert (p, N) == df.shape

## Clean up

In [8]:
from pathlib import Path

# Delete every ensemble file matching the pattern in the working directory.
# The loop variable is deliberately not called `p`: that name holds the number
# of parameters, and rebinding it to a Path would break re-running the earlier
# cells (e.g. `rng.normal(size=p)` and the shape assertion) in the same kernel.
for ensemble_path in Path(".").glob("Ensemble_*.h5"):
    ensemble_path.unlink()