In [1]:
import h5py as h5
import dask.array as da
import numpy as np

def h5_writer(data, h5_path, chunks):
    """Write `data` to a fresh HDF5 file as a single dataset named 'default'.

    Any file already present at `h5_path` is deleted first. The dataset is
    stored gzip-compressed with the shuffle filter enabled.

    Parameters
    ----------
    data : array-like
        Values to store in the dataset.
    h5_path : str
        Path of the HDF5 file to (re)create.
    chunks : tuple of int
        On-disk chunk shape, forwarded to `create_dataset`.
    """
    import os
    import h5py

    # Start from a clean slate so no stale file is left at the target path.
    if os.path.exists(h5_path):
        os.remove(h5_path)

    dataset_options = dict(compression='gzip', chunks=chunks, shuffle=True)
    with h5py.File(h5_path, 'w') as out_file:
        out_file.create_dataset('default', data=data, **dataset_options)

# Shape of the synthetic array; chosen to match my raw data.
# generate_h5_files iterates over axis 0, so dims[0] is the number of files
# and dims[1:] is the shape of the dataset stored in each file.
dims = [2, 16, 1884, 1248]
# Chunk shape for the 3-D per-file dataset; used both as the HDF5 on-disk
# chunking (via h5_writer) and as the dask chunking (via da.from_array).
chunks = (1, 236, 156)
# File name template, formatted with the file index: t_000.h5, t_001.h5, ...
fname_base = 't_{0:03d}.h5'

def generate_h5_files(dims):
    """Write a sequence of HDF5 files filled with random int16 data.

    A random integer array of shape `dims` (values in [0, 10)) is generated,
    and each entry along its first axis is written to its own file, named by
    formatting the module-level `fname_base` with the entry's index. The
    module-level `chunks` is used as the on-disk chunk shape.
    """
    volumes = np.random.randint(0, 10, dims).astype('int16')
    for file_index in range(len(volumes)):
        target_path = fname_base.format(file_index)
        h5_writer(volumes[file_index], target_path, chunks=chunks)
        
# Write the benchmark files into the working directory; existing files with
# the same names are replaced by h5_writer.
generate_h5_files(dims)

# Open every file explicitly read-only and grab its 'default' dataset.
# Passing 'r' avoids relying on h5py's version-dependent default mode (older
# releases fall back to a deprecated append-like mode that may open the file
# writable). The File handles are deliberately left open: the dask array
# below reads from these datasets lazily, at .compute() time.
dsets = [h5.File(fname_base.format(ind), 'r')['default'] for ind in range(dims[0])]

# Represent the h5 files as one dask array: one lazily-read array per file,
# stacked along a new leading axis (the file index).
x = da.stack([da.from_array(d, chunks=chunks) for d in dsets])

Load data from a single file: h5py

In [2]:
%%timeit
# Baseline: read one plane directly through h5py.
# dsets[t] is the dataset of file t; indexing it with z reads the single
# slice at first-axis index z into memory.
t = 0
z = 0
dsets[t][z]

16.8 ms ± 28.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


Load data from a single file: dask

In [3]:
%%timeit
# Same (file, plane) selection as the direct h5py read, but taken from the
# stacked dask array and materialized with .compute().
t = 0
z = 0
x[t,z].compute()

165 ms ± 3.98 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


Load data from multiple files: looped h5py

In [4]:
%%timeit
# Read plane z from each listed file with a plain Python loop over the
# h5py datasets; the result is a list with one array per file.
t = [0,1]
z = 0
[dsets[t_][z] for t_ in t]

34 ms ± 150 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Load data from multiple files: dask

In [5]:
%%timeit
# Read plane z from every file through dask: the open-ended slice selects all
# entries along the stacked (file) axis before .compute() materializes them.
t = slice(0, None)
z = 0
x[t,z].compute()

305 ms ± 14.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
