In [34]:
from pathlib import Path
import shutil
import xarray as xr
import numpy as np
import dask.array as da

xr.show_versions()


INSTALLED VERSIONS
------------------
commit: None
python: 3.10.12 (main, Aug 15 2023, 11:50:32) [GCC 9.4.0]
python-bits: 64
OS: Linux
OS-release: 5.15.0-92-generic
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8
LOCALE: ('en_US', 'UTF-8')
libhdf5: None
libnetcdf: None

xarray: 2023.10.1
pandas: 2.1.3
numpy: 1.25.2
scipy: None
netCDF4: None
pydap: None
h5netcdf: None
h5py: None
Nio: None
zarr: 2.16.1
cftime: None
nc_time_axis: None
PseudoNetCDF: None
iris: None
bottleneck: None
dask: 2023.11.0
distributed: None
matplotlib: None
cartopy: None
seaborn: None
numbagg: None
fsspec: 2023.10.0
cupy: None
pint: None
sparse: None
flox: None
numpy_groupies: None
setuptools: 67.8.0
pip: 23.1.2
conda: None
pytest: None
mypy: None
IPython: 8.17.2
sphinx: None


In [35]:
zarr_file_path = Path("../generated/file.zarr")

In [42]:
if zarr_file_path.exists():
    shutil.rmtree(zarr_file_path)
    
chunk_size = 5
shape = (50, 32, 1000)
ones_dataset = xr.Dataset({"data": xr.ones_like(xr.DataArray(np.empty(shape)))})
ones_dataset = ones_dataset.chunk({"dim_0": chunk_size})
chunk_indices = np.arange(len(ones_dataset.chunks["dim_0"]))
chunk_ids = np.repeat(np.arange(ones_dataset.sizes["dim_0"] // chunk_size), chunk_size)
chunk_ids_dask_array = da.from_array(chunk_ids, chunks=(chunk_size,))
# Append the chunk IDs Dask array as a new variable to the existing dataset
ones_dataset["chunk_id"] = (("dim_0",), chunk_ids_dask_array)


# Create a new dataset filled with zeros
zeros_dataset = xr.Dataset({"data": xr.zeros_like(xr.DataArray(np.empty(shape)))})
zeros_dataset.to_zarr(zarr_file_path, compute=False)


def process_chunk(chunk_dataset):
    chunk_id = int(chunk_dataset["chunk_id"][0])
    chunk_dataset_to_store = chunk_dataset.drop_vars("chunk_id")

    start_index = chunk_id * chunk_size
    end_index = chunk_id * chunk_size + chunk_size

    chunk_dataset_to_store.to_zarr(
        zarr_file_path, region={"dim_0": slice(start_index, end_index)}
    )
    return chunk_dataset


ones_dataset.map_blocks(process_chunk, template=ones_dataset).compute()

In [46]:
# Load data stored in zarr
zarr_data = xr.open_zarr(zarr_file_path, chunks={"dim_0": chunk_size})

# Find differences
for var_name in zarr_data.variables:
    try:
        xr.testing.assert_equal(zarr_data[var_name], ones_dataset[var_name])
    except AssertionError:
        print(f"Differences in {var_name}:")
        expected = ones_dataset[var_name].sum().compute().item()
        actual = zarr_data[var_name].sum().compute().item()
        print(f"{expected=}")
        print(f"{actual=}")



Differences in data:
expected=1600000.0
actual=664000.0
