# Read time comparison: Zarr vs HDF5 
Compute the maximum water level during Hurricane Ike on a storm surge model with a 9-million-node triangular mesh (this reads 53GB of data). The data was stored both in Zarr format and as NetCDF4/HDF5, using 11MB chunks with no filters and zlib (level 5) compression.  

Instead of reading the NetCDF4/HDF5 file with an HDF5 or NetCDF4 library, we extract its metadata into an fsspec `ReferenceFileSystem` reference file, create a mapper from it, and then read the mapper using the Zarr library.

Using a cluster with 60 cores, we find that the performance between this read approach and reading native Zarr is not significantly different.  

In [None]:
import xarray as xr
import zarr
import fsspec
import fsspec.implementations.reference as refs
import intake
import intake_xarray

### Open Intake Catalog

In [None]:
# Open the Intake catalog file, then list the entry names it contains
# (the list is displayed as the cell output).
cat = intake.open_catalog("intake_catalog.yml")
list(cat)

### Zarr library reading HDF5 file with fsspec

In [None]:
# Open the 'ike-hdf5' catalog entry as an xarray dataset via to_dask().
ds_hdf5 = cat["ike-hdf5"].to_dask()

# Show the on-disk encoding of `zeta`, followed by a blank line,
# then display the variable itself as the cell output.
print(ds_hdf5["zeta"].encoding, "\n")
ds_hdf5["zeta"]

In [None]:
# Keep a handle on the dask graph behind the `zeta` array and display
# the first value stored in it.
dsk = ds_hdf5["zeta"].data.dask
list(dsk.values())[0]

In [None]:
# Display the graph entry that holds the source array behind `zeta`.
#
# The key of that entry has the form 'open_dataset-<token>zeta-<token>'. The
# tokens are hashes generated when the dataset is opened, so a key literal
# copied from one session raises KeyError as soon as the tokens differ.
# Look the key up in the graph instead of hard-coding it.
#
# NOTE(review): assumes the per-chunk task keys are tuples and that the
# source-array entry is the only plain-string key containing 'open_dataset-'
# -- confirm against the installed dask/xarray versions.
hdf5_graph = ds_hdf5.zeta.data.dask
source_key = next(
    key for key in hdf5_graph
    if isinstance(key, str) and 'open_dataset-' in key
)
hdf5_graph[source_key]

In [None]:
# Display the object found three `.array` levels below the graph entry that
# holds the source array behind `zeta` (presumably the innermost backend
# array -- TODO confirm against the installed xarray version).
#
# The key of that entry has the form 'open_dataset-<token>zeta-<token>'. The
# tokens are hashes generated when the dataset is opened, so a key literal
# copied from one session raises KeyError as soon as the tokens differ.
# Look the key up in the graph instead of hard-coding it.
#
# NOTE(review): assumes the per-chunk task keys are tuples and that the
# source-array entry is the only plain-string key containing 'open_dataset-'
# -- confirm against the installed dask/xarray versions.
hdf5_graph = ds_hdf5.zeta.data.dask
source_key = next(
    key for key in hdf5_graph
    if isinstance(key, str) and 'open_dataset-' in key
)
hdf5_graph[source_key].array.array.array

### Zarr library reading equivalent Zarr format dataset

In [None]:
# Open the 'ike-zarr' catalog entry as an xarray dataset via to_dask().
ds_zarr = cat["ike-zarr"].to_dask()

# Show the on-disk encoding of `zeta`, followed by a blank line,
# then display the variable itself as the cell output.
print(ds_zarr["zeta"].encoding, "\n")
ds_zarr["zeta"]

### Start a dask cluster to crunch the data

In [None]:
# Client connects this notebook session to a dask cluster; Gateway is the
# dask-gateway entry point used to request one.
from dask.distributed import Client
from dask_gateway import Gateway
# Gateway() is called with no arguments, so the connection settings are not
# given here (presumably taken from the deployment's configuration -- confirm).
gateway = Gateway()
# Ask the gateway for a new cluster; workers are requested in a later cell.
cluster = gateway.new_cluster()

In [None]:
# Scale the gateway cluster to 30 (workers, per the dask-gateway `scale` API).
# The trailing semicolon suppresses the cell's output display.
cluster.scale(30);

In [None]:
# Display the cluster object as the cell output.
cluster

In [None]:
# Connect a dask.distributed client to the gateway cluster created above.
client = Client(cluster)

In [None]:
# Display the client object as the cell output.
client

In [None]:
%%time
max_hdf5 = ds_hdf5['zeta'].max(dim='time').compute()

In [None]:
%%time
max_zarr = ds_zarr['zeta'].max(dim='time').compute()

### Compare a point from both datasets; the values should be the same

In [None]:
# Display the value at index 1000 of the maximum computed from the HDF5-backed dataset.
max_hdf5[1000].values

In [None]:
# Display the value at index 1000 of the maximum computed from the Zarr-backed dataset.
max_zarr[1000].values