# Icechunk Performance - Zarr V3

Using data from the [NCAR ERA5 AWS Public Dataset](https://nsf-ncar-era5.s3.amazonaws.com/index.html).

In [1]:
import xarray as xr
import zarr
import dask
import fsspec
from dask.diagnostics import ProgressBar

# Record the library versions this benchmark ran against (one "name: version" line each).
print(
    *(
        f'{label} {module.__version__}'
        for label, module in (('xarray:  ', xr), ('dask:    ', dask), ('zarr:    ', zarr))
    ),
    sep='\n',
)

xarray:   0.9.7.dev3734+g26081d4f
dask:     2024.9.1+8.g70f56e28
zarr:     3.0.0b1.dev8+g9bbfd88


In [2]:
# Tune zarr's global config for the S3 benchmarks below: 16 worker threads and
# an async concurrency of 128. Left as the cell's last expression, so the
# returned ConfigSet object is what gets displayed.
zarr.config.set({'threading.max_workers': 16, 'async.concurrency': 128})

<donfig.config_obj.ConfigSet at 0x7f5a7a52bf50>

In [3]:
# Source file: one ERA5 `pv` netCDF file (1941-06-01, hours 00-23) served over
# HTTPS from the NCAR public bucket.
url = "https://nsf-ncar-era5.s3.amazonaws.com/e5.oper.an.pl/194106/e5.oper.an.pl.128_060_pv.ll025sc.1941060100_1941060123.nc"
# Open through an fsspec file object with the h5netcdf engine; chunks={"time": 1}
# makes the variables dask arrays with one chunk per time step. Only this
# statement is timed.
# NOTE(review): the fsspec file object is never closed explicitly — presumably
# intentional, since the lazy arrays still need it when loaded later; confirm.
%time ds = xr.open_dataset(fsspec.open(url).open(), engine="h5netcdf", chunks={"time": 1})
# Drop the encoding inherited from the source file, so the Zarr writes below
# are governed only by the explicit `encoding` passed to to_zarr
# (presumed intent — confirm).
ds = ds.drop_encoding()

CPU times: user 277 ms, sys: 37.5 ms, total: 315 ms
Wall time: 2.33 s


  var_chunks = _get_chunk(var, chunks, chunkmanager)


In [4]:
# Plain-text summary of the lazily opened dataset: dimensions, coordinates,
# data variables (with their dask chunk sizes) and global attributes.
print(ds)

<xarray.Dataset> Size: 4GB
Dimensions:    (time: 24, level: 37, latitude: 721, longitude: 1440)
Coordinates:
  * latitude   (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0
  * level      (level) float64 296B 1.0 2.0 3.0 5.0 ... 925.0 950.0 975.0 1e+03
  * longitude  (longitude) float64 12kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8
  * time       (time) datetime64[ns] 192B 1941-06-01 ... 1941-06-01T23:00:00
Data variables:
    PV         (time, level, latitude, longitude) float32 4GB dask.array<chunksize=(1, 37, 721, 1440), meta=np.ndarray>
    utc_date   (time) int32 96B dask.array<chunksize=(1,), meta=np.ndarray>
Attributes:
    DATA_SOURCE:          ECMWF: https://cds.climate.copernicus.eu, Copernicu...
    NETCDF_CONVERSION:    CISL RDA: Conversion from ECMWF GRIB 1 data to netC...
    NETCDF_VERSION:       4.8.1
    CONVERSION_PLATFORM:  Linux r1i4n4 4.12.14-95.51-default #1 SMP Fri Apr 1...
    CONVERSION_DATE:      Wed May 10 06:33:49 MDT 2023
    Conventions:       

### Load Data from HDF5 File

This illustrates how loading directly from HDF5 files on S3 can be slow, even with Dask.

In [5]:
# Read the whole dataset from the remote HDF5 file into memory, with a dask
# progress bar reporting the elapsed time.
# NOTE(review): xarray documents Dataset.load() as loading in place and
# returning the same object, so `dsl` is likely an alias of `ds` rather than a
# copy — confirm before relying on `ds` staying lazy.
with ProgressBar():
    dsl = ds.load()

[########################################] | 100% Completed | 62.20 s


### Write Zarr Store - No Dask

In [6]:
# Zarr v3 encoding for the PV variable, built up key by key.
encoding = {"PV": {}}
# Codec pipeline: bytes codec followed by Zstd.
encoding["PV"]["codecs"] = [zarr.codecs.BytesCodec(), zarr.codecs.ZstdCodec()]
# PV is (time, level, latitude, longitude) = (24, 37, 721, 1440), so each chunk
# holds one full latitude/longitude plane for a single time step and level.
encoding["PV"]["chunks"] = (1, 1, 721, 1440)

In [7]:
# NOTE(review): this import lives outside the notebook's top import cell;
# consider moving it there so all dependencies are visible up front.
import s3fs
# S3 filesystem handle shared by every RemoteStore below. The directory-listing
# cache is switched off (presumably so freshly written objects are always
# visible to later reads — confirm).
s3 = s3fs.S3FileSystem(use_listings_cache=False)

In [10]:
# Destination prefix on S3 for this run.
# NOTE(review): hard-coded, user-specific path — change it before re-running
# against a different bucket or account.
target_path = "icechunk-test/ryan/zarr-v3/test-era5-v3-919"
# Store opened with mode="w" on top of the s3fs filesystem; it receives the
# no-dask write and is reused as-is when the data is read back.
store = zarr.storage.RemoteStore(s3, mode="w", path=target_path)

In [11]:
# Timed write of the already-loaded dataset as Zarr format 3, without
# consolidated metadata, using the explicit PV `encoding` (codecs and chunks).
# mode="w" is passed to to_zarr as well as to the store.
%time dsl.to_zarr(store, consolidated=False, zarr_format=3, encoding=encoding, mode="w")

CPU times: user 36.2 s, sys: 2.53 s, total: 38.7 s
Wall time: 15.8 s


<xarray.backends.zarr.ZarrStore at 0x7f5a3839efc0>

In [48]:
# Same write, but through dask: rechunk the in-memory dataset to one
# (time, level) slab per dask chunk, which lines up with the
# (1, 1, 721, 1440) Zarr chunks requested for PV in `encoding`.
dslc = dsl.chunk({"time": 1, "level": 1})
# Separate store next to the no-dask one, distinguished by a "-dask" suffix.
# The prefix must come from `target_path`, the path variable defined alongside
# `store`; no `target_url` exists in this notebook, so referencing it fails
# with NameError on a fresh kernel.
store_d = zarr.storage.RemoteStore(s3, mode="w", path=target_path + "-dask")
# The progress bar reports the wall time of the dask-driven write.
with ProgressBar():
    dslc.to_zarr(store_d, consolidated=False, zarr_format=3, encoding=encoding, mode="w")

[########################################] | 100% Completed | 12.60 s


### Read Data Back

In [12]:
# Reads go through the `store` object created for the write. To read through a
# fresh read-only handle instead, uncomment:
#store = zarr.storage.RemoteStore(s3, mode="r", path=target_path)
# Timed open of the Zarr v3 store (no consolidated metadata, no dask chunks
# requested).
%time dss = xr.open_dataset(store, consolidated=False, zarr_format=3, engine="zarr")

CPU times: user 35.6 ms, sys: 0 ns, total: 35.6 ms
Wall time: 343 ms


In [13]:
# Display the dataset as reopened from the Zarr store.
dss

In [14]:
# Timed read of a single PV element (first index along every dimension); the
# cost of fetching one value from the remote store.
%time dss.PV[0, 0, 0, 0].values

CPU times: user 15.7 ms, sys: 0 ns, total: 15.7 ms
Wall time: 101 ms


array(0.00710905, dtype=float32)

In [16]:
# Timed read of the entire dataset back from Zarr into memory. `dss` was opened
# without `chunks`, so this path does not go through dask. The result is bound
# to `_` only to discard it.
%time _ = dss.compute()

CPU times: user 8.41 s, sys: 1.19 s, total: 9.6 s
Wall time: 5.11 s


In [17]:
# Reopen the same store and wrap the variables in dask arrays:
# one time step and ten levels per chunk.
dssd = (
    xr.open_dataset(store, consolidated=False, engine="zarr")
    .chunk({"time": 1, "level": 10})
)

In [18]:
# Full read of the dask-chunked dataset, with a progress bar reporting elapsed
# time; the loaded result is discarded.
with ProgressBar():
    _ = dssd.compute()

[########################################] | 100% Completed | 6.26 s
