In [1]:
import time
import numpy as np 
from dask import delayed
from dask import array as dask_array
from unyt.dask_array import unyt_from_dask, reduce_with_units

# Simulate reading data off disk into a unyt_dask array.
# N_CHUNKS is the number of fake on-disk chunks; tune it here rather than
# editing the range below. In a real application chunk_ids could instead be
# a list of files.
N_CHUNKS = 30
chunk_ids = range(N_CHUNKS)
def get_chunk_info(field, chunk_id):
    """Return the shape, units and dtype of one chunk without loading its data.

    Building a dask array requires knowing each chunk's size and data type
    up front. A real application would inspect the field stored on disk;
    this simulated version fabricates the values instead.

    Parameters
    ----------
    field
        Name of the field being described (not used by this simulation).
    chunk_id
        Integer id of the chunk; it is added to the nominal row count.

    Returns
    -------
    tuple
        ``(chunk_size, units, field_dtype)`` where ``chunk_size`` is a
        ``(n_rows, 3)`` shape tuple.
    """
    # nominal one million rows, shifted by the chunk id and by a random
    # jitter drawn from [-100, 100) so that chunks differ in length
    jitter = np.random.randint(-100, 100)
    n_rows = 1000000 + chunk_id + jitter
    # units and dtype would also be read from the file if it stored them;
    # here they are fixed
    return (n_rows, 3), "kg/m**3", np.float64

@delayed
def read_chunk(field, chunk_id, chunk_size, field_dtype):
    """Stand-in for opening one chunk and reading its data.

    Decorated with ``delayed``, so calling it only records the read; the
    body runs when the resulting graph is computed.

    Parameters
    ----------
    field, chunk_id
        Identify what would be read; unused here because the data is fake.
    chunk_size
        Shape of the random array to generate. A real reader would not need
        this, it is passed only because no file is actually opened.
    field_dtype
        dtype the generated array is cast to (same caveat as ``chunk_size``).

    Returns
    -------
    numpy.ndarray
        Uniform random values of shape ``chunk_size`` and dtype ``field_dtype``.
    """
    # artificial slowdown to mimic the cost of a real read
    time.sleep(2)
    fake_data = np.random.random(chunk_size)
    return fake_data.astype(field_dtype)

# Build one lazy dask array per chunk, then stitch them together into a
# single unit-aware array. No chunk is actually read at this point.
field = "Density"
chunks = []
field_units = None  # the one unit shared by every chunk of this field
for chunk_id in chunk_ids:
    # get the size, units and dtype of this chunk
    chunk_size, units, field_dtype = get_chunk_info(field, chunk_id)
    # the concatenated array carries a single unit, so refuse to silently
    # relabel a chunk whose units differ from those of the earlier chunks
    if field_units is None:
        field_units = units
    elif units != field_units:
        raise ValueError(
            "chunk {} of field {!r} has units {!r}, expected {!r}".format(
                chunk_id, field, units, field_units
            )
        )
    # get the delayed read of this chunk
    delayed_read = read_chunk(field, chunk_id, chunk_size, field_dtype)
    # wrap the delayed read as a plain dask array of known shape and dtype
    plain_dask = dask_array.from_delayed(delayed_read, chunk_size, dtype=field_dtype)
    chunks.append(plain_dask)

# attach the units to the concatenated plain dask array
density = unyt_from_dask(dask_array.concatenate(chunks), field_units)
density

Unnamed: 0,Array,Chunk
Bytes,686.66 MiB,22.89 MiB
Shape,"(30000630, 3)","(1000108, 3)"
Count,90 Tasks,30 Chunks
Type,float64,numpy.ndarray
Units,kg/m**3,kg/m**3
"Array Chunk Bytes 686.66 MiB 22.89 MiB Shape (30000630, 3) (1000108, 3) Count 90 Tasks 30 Chunks Type float64 numpy.ndarray Units kg/m**3 kg/m**3",3  30000630,

Unnamed: 0,Array,Chunk
Bytes,686.66 MiB,22.89 MiB
Shape,"(30000630, 3)","(1000108, 3)"
Count,90 Tasks,30 Chunks
Type,float64,numpy.ndarray
Units,kg/m**3,kg/m**3


In [2]:
# reduce over axis 0 (one mean per column), then convert the units;
# this only extends the task graph and displays the lazy result
column_means = density.mean(axis=0)
column_means.to('g/cm**3')

Unnamed: 0,Array,Chunk
Bytes,24 B,24 B
Shape,"(3,)","(3,)"
Count,133 Tasks,1 Chunks
Type,float64,numpy.ndarray
Units,g/cm**3,g/cm**3
"Array Chunk Bytes 24 B 24 B Shape (3,) (3,) Count 133 Tasks 1 Chunks Type float64 numpy.ndarray Units g/cm**3 g/cm**3",3  1,

Unnamed: 0,Array,Chunk
Bytes,24 B,24 B
Shape,"(3,)","(3,)"
Count,133 Tasks,1 Chunks
Type,float64,numpy.ndarray
Units,g/cm**3,g/cm**3


In [3]:
%%time
density.mean(axis=0).to('g/cm**3').compute()

CPU times: user 1.09 s, sys: 90.4 ms, total: 1.18 s
Wall time: 10.4 s


unyt_array([0.0005    , 0.00050002, 0.00050002], 'g/cm**3')