# Dask read / write HDF

In [2]:
import dask
import dask.dataframe as dd

import pandas as pd

In [2]:
df = pd.DataFrame({"col1": ["a", "b", "c", "d"], "col2": [1, 2, 3, 4]})

In [3]:
ddf = dd.from_pandas(df, npartitions=2)

### write to multiple files

In [6]:
ddf.to_hdf("../tmp/first-hdf/output-*.hdf", "/data1")

['../tmp/first-hdf/output-0.hdf', '../tmp/first-hdf/output-1.hdf']

## read multiple files

In [15]:
ddf = dd.read_hdf("../tmp/first-hdf/output-*.hdf", "/data1")

In [16]:
ddf.compute()

Unnamed: 0,col1,col2
0,a,1
1,b,2
2,c,3
3,d,4


### write to single file

In [11]:
ddf.to_hdf("../tmp/my-output.hdf", "/cool", format="table")

In [23]:
print(dd.read_hdf("../tmp/my-output.hdf", "/cool").compute())

  col1  col2
0    a     1
1    b     2
2    c     3
3    d     4


In [12]:
df = pd.DataFrame({"col1": ["x", "y"], "col2": [88, 99]})

In [13]:
ddf = dd.from_pandas(df, npartitions=2)

In [14]:
ddf.to_hdf("../tmp/my-output.hdf", "/nice", format="table", append=True)

['../tmp/my-output.hdf']

In [20]:
print(dd.read_hdf("../tmp/my-output.hdf", "/nice").compute())

  col1  col2
0    x    88
1    y    99


In [21]:
print(dd.read_hdf("../tmp/my-output.hdf", "/cool").compute())

  col1  col2
0    a     1
1    b     2
2    c     3
3    d     4


In [22]:
print(dd.read_hdf("../tmp/my-output.hdf", "/*").compute())

  col1  col2
0    a     1
1    b     2
2    c     3
3    d     4
0    x    88
1    y    99


## Write medium sized dataset

In [20]:
ddf = dask.datasets.timeseries(
    start="2015-01-01",
    end="2020-12-31",
    freq="1s",
    partition_freq="7d",
    seed=42,
)

In [5]:
len(ddf)

189302400

In [9]:
%%time
ddf.to_csv("csv/output-*", compression="gzip")

CPU times: user 26min 30s, sys: 47.2 s, total: 27min 18s
Wall time: 14min 4s


['/Users/powers/Documents/code/coiled/coiled-resources/local/csv/output-000',
 '/Users/powers/Documents/code/coiled/coiled-resources/local/csv/output-001',
 '/Users/powers/Documents/code/coiled/coiled-resources/local/csv/output-002',
 '/Users/powers/Documents/code/coiled/coiled-resources/local/csv/output-003',
 '/Users/powers/Documents/code/coiled/coiled-resources/local/csv/output-004',
 '/Users/powers/Documents/code/coiled/coiled-resources/local/csv/output-005',
 '/Users/powers/Documents/code/coiled/coiled-resources/local/csv/output-006',
 '/Users/powers/Documents/code/coiled/coiled-resources/local/csv/output-007',
 '/Users/powers/Documents/code/coiled/coiled-resources/local/csv/output-008',
 '/Users/powers/Documents/code/coiled/coiled-resources/local/csv/output-009',
 '/Users/powers/Documents/code/coiled/coiled-resources/local/csv/output-010',
 '/Users/powers/Documents/code/coiled/coiled-resources/local/csv/output-011',
 '/Users/powers/Documents/code/coiled/coiled-resources/local/csv

In [8]:
%%time
ddf.to_hdf("hdf/output-*.hdf", "/data1")

CPU times: user 3min 19s, sys: 12.2 s, total: 3min 31s
Wall time: 3min 23s


['hdf/output-000.hdf',
 'hdf/output-001.hdf',
 'hdf/output-002.hdf',
 'hdf/output-003.hdf',
 'hdf/output-004.hdf',
 'hdf/output-005.hdf',
 'hdf/output-006.hdf',
 'hdf/output-007.hdf',
 'hdf/output-008.hdf',
 'hdf/output-009.hdf',
 'hdf/output-010.hdf',
 'hdf/output-011.hdf',
 'hdf/output-012.hdf',
 'hdf/output-013.hdf',
 'hdf/output-014.hdf',
 'hdf/output-015.hdf',
 'hdf/output-016.hdf',
 'hdf/output-017.hdf',
 'hdf/output-018.hdf',
 'hdf/output-019.hdf',
 'hdf/output-020.hdf',
 'hdf/output-021.hdf',
 'hdf/output-022.hdf',
 'hdf/output-023.hdf',
 'hdf/output-024.hdf',
 'hdf/output-025.hdf',
 'hdf/output-026.hdf',
 'hdf/output-027.hdf',
 'hdf/output-028.hdf',
 'hdf/output-029.hdf',
 'hdf/output-030.hdf',
 'hdf/output-031.hdf',
 'hdf/output-032.hdf',
 'hdf/output-033.hdf',
 'hdf/output-034.hdf',
 'hdf/output-035.hdf',
 'hdf/output-036.hdf',
 'hdf/output-037.hdf',
 'hdf/output-038.hdf',
 'hdf/output-039.hdf',
 'hdf/output-040.hdf',
 'hdf/output-041.hdf',
 'hdf/output-042.hdf',
 'hdf/outpu

In [21]:
%%time
ddf.to_parquet("parquet", engine="pyarrow", compression="snappy")

CPU times: user 1min 28s, sys: 6.06 s, total: 1min 34s
Wall time: 38.2 s


(None,)

## Read medium sized dataset & run query

In [3]:
ddf = dd.read_hdf("hdf/output-*.hdf", "/data1")

In [4]:
ddf.head()

Unnamed: 0_level_0,id,name,x,y
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-01-01 00:00:00,1008,Dan,-0.259374,-0.118314
2015-01-01 00:00:01,987,Patricia,0.069601,0.755351
2015-01-01 00:00:02,980,Zelda,-0.281843,-0.510507
2015-01-01 00:00:03,1020,Ursula,-0.569904,0.523132
2015-01-01 00:00:04,967,Michael,-0.25146,0.81093


In [5]:
ddf.npartitions

313

In [11]:
%%time
ddf["name"].nunique().compute()

CPU times: user 1min 31s, sys: 4.09 s, total: 1min 35s
Wall time: 1min 37s


26

In [13]:
ddf = dd.read_csv("csv/output-*", compression="gzip")

Please ensure that each individual file can fit in memory and
use the keyword ``blocksize=None to remove this message``
Setting ``blocksize=None``
  warn(


In [14]:
%%time
ddf["name"].nunique().compute()

CPU times: user 2min 43s, sys: 48.8 s, total: 3min 31s
Wall time: 1min 12s


26

In [16]:
ddf = dd.read_parquet("parquet", engine="pyarrow", columns=["name"])

In [17]:
%%time
ddf["name"].nunique().compute()

CPU times: user 16.8 s, sys: 947 ms, total: 17.8 s
Wall time: 10.5 s


26

## Dask Array create HDF5

In [2]:
import dask.array as da

In [3]:
arr = da.random.random((10, 2))

In [4]:
arr.compute()

array([[0.23100387, 0.48254162],
       [0.99983175, 0.84552979],
       [0.28108483, 0.66065226],
       [0.93905801, 0.76594799],
       [0.62132557, 0.30761242],
       [0.98902505, 0.43684951],
       [0.33143127, 0.13451706],
       [0.72742902, 0.86381888],
       [0.92025059, 0.38922756],
       [0.82958047, 0.05398304]])

In [5]:
arr.to_hdf5("../tmp/myfile.hdf5", "/x")

## Read HDF5 into Dask Array

In [9]:
import h5py

In [16]:
file = h5py.File('../tmp/myfile.hdf5')

In [17]:
file

<HDF5 file "myfile.hdf5" (mode r)>

In [18]:
arr = da.from_array(f["x"])

In [19]:
arr.compute()

array([[0.23100387, 0.48254162],
       [0.99983175, 0.84552979],
       [0.28108483, 0.66065226],
       [0.93905801, 0.76594799],
       [0.62132557, 0.30761242],
       [0.98902505, 0.43684951],
       [0.33143127, 0.13451706],
       [0.72742902, 0.86381888],
       [0.92025059, 0.38922756],
       [0.82958047, 0.05398304]])

In [20]:
type(arr)

dask.array.core.Array