# VirtualiZarr + Icechunk + Cubed

In [1]:
import shutil

import cubed
# import icechunk
import xarray as xr
import virtualizarr
# from icechunk import IcechunkStore, StorageConfig, StoreConfig, VirtualRefConfig

In [2]:
# Start from a clean slate: remove the output directories left by a previous run.
# shutil.rmtree is used instead of shelling out to `rm -rf` so the cell does not
# depend on a POSIX shell; ignore_errors=True keeps it silent when a directory
# does not exist, matching the `-f` behaviour of the original command.
for stale_dir in ('combined', 'rechunked'):
    shutil.rmtree(stale_dir, ignore_errors=True)

### Open tutorial data

In [3]:
# Load xarray's 'air_temperature' tutorial dataset; it is the source data that
# the following cells split into two NetCDF files.
ds = xr.tutorial.open_dataset('air_temperature')

### Split into two NetCDF files

In [4]:
# Cut the time axis at a fixed position: the first 1460 steps go to air1.nc and
# everything after them goes to air2.nc.
split_at = 1460
ds1 = ds.isel(time=slice(None, split_at))
ds2 = ds.isel(time=slice(split_at, None))

# Write each half to its own NetCDF file in the working directory.
for part, nc_path in ((ds1, 'air1.nc'), (ds2, 'air2.nc')):
    part.to_netcdf(nc_path)

  ds1.to_netcdf('air1.nc')
  ds2.to_netcdf('air2.nc')


### Combine two NetCDFs using VirtualiZarr

In [5]:
# Open each NetCDF file as a virtual dataset (with indexes={}), then join the
# pair along the time dimension into a single virtual dataset.
vds1, vds2 = (
    virtualizarr.open_virtual_dataset(nc_file, indexes={})
    for nc_file in ('air1.nc', 'air2.nc')
)
combined_vds = xr.concat(
    [vds1, vds2],
    dim='time',
    coords='minimal',
    compat='override',
)

### Write references to Icechunk

In [10]:
# The icechunk import in the first cell is commented out, so the two names this
# cell needs are imported here; without this a fresh kernel raises NameError.
# Keeping the import local means only this cell fails when icechunk is missing.
from icechunk import IcechunkStore, StorageConfig

# Create a new Icechunk store using filesystem storage at 'combined'
# (the same directory the cleanup cell removes at the start of the notebook).
storage = StorageConfig.filesystem('combined')
store = IcechunkStore.create(storage=storage, mode="w")

# Write the combined virtual dataset's references into the store.
combined_vds.virtualize.to_icechunk(store)

### Commit to Icechunk

In [11]:
# Commit what was written to the store with a message; the value returned by
# commit() is shown as the cell output.
store.commit("combined files")

'TKCY5X6PF0D8QMGAKRWG'

### Open in Xarray using Cubed arrays

In [14]:
# Cubed execution settings for the arrays created below: a 2GB memory allowance
# and the single-threaded executor.
cubed_spec = cubed.Spec(allowed_mem="2GB", executor_name="single-threaded")

# Open the committed Icechunk store as a Zarr v3 store without consolidated
# metadata, asking xarray to wrap every variable in a Cubed array.
ds = xr.open_zarr(
    store,
    chunks={},
    chunked_array_type='cubed',
    from_array_kwargs={'spec': cubed_spec},
    zarr_version=3,
    consolidated=False,
)
ds

  ds = xr.open_zarr(


Unnamed: 0,Array,Chunk
Bytes,31.0 MB,2.0 MB
Shape,"(2920, 25, 53)","(730, 13, 27)"
Count,3 arrays in Plan,16 Chunks
Type,float64,np.ndarray
"Array Chunk Bytes 31.0 MB 2.0 MB Shape (2920, 25, 53) (730, 13, 27) Count 3 arrays in Plan 16 Chunks Type float64 np.ndarray",53  25  2920,

Unnamed: 0,Array,Chunk
Bytes,31.0 MB,2.0 MB
Shape,"(2920, 25, 53)","(730, 13, 27)"
Count,3 arrays in Plan,16 Chunks
Type,float64,np.ndarray


### TODO: write the data to a new "rechunked" Zarr store (currently failing)

In [15]:
# Known failure (kept as a record of the open problem): this raises
# "ValueError: cannot reshape array of size 1934500 into shape (730,13,27)".
# NOTE(review): 1934500 == 1460 * 25 * 53, i.e. the size of one whole source
# file's array, so the bytes read for a chunk appear to be a full per-file array
# rather than the (730, 13, 27) chunk the opened dataset reports - TODO confirm.
cubed.to_zarr(ds["air"].data, "rechunked")

ValueError: cannot reshape array of size 1934500 into shape (730,13,27)

In [9]:
# Same write attempted through xarray instead of cubed.to_zarr directly.
# Known failure: it ends in the same
# "ValueError: cannot reshape array of size 1934500 into shape (730,13,27)".
ds.to_zarr("rechunked", safe_chunks=False)

  return to_zarr(  # type: ignore[call-overload,misc]


ValueError: cannot reshape array of size 1934500 into shape (730,13,27)

In [6]:
# Alternative to the Icechunk route: write the combined virtual dataset's
# references to a Kerchunk-format JSON file named 'combined.json'.
combined_vds.virtualize.to_kerchunk('combined.json', format='json')

In [7]:
# Read the Kerchunk JSON back in as a virtual dataset (again with indexes={})
# and display it to inspect what was written.
vds = virtualizarr.open_virtual_dataset(
    'combined.json',
    indexes={},
    filetype='kerchunk',
)
vds

In [15]:
# Cubed execution settings (2GB memory allowance, single-threaded executor).
# Kept in a variable because a later cell passes the same `spec` to to_zarr.
spec = cubed.Spec(allowed_mem="2GB", executor_name="single-threaded")

# Open the Kerchunk references through xarray's "kerchunk" engine, asking for
# Cubed-backed arrays built with the spec above.
ds = xr.open_dataset(
    'combined.json',
    engine="kerchunk",
    chunks={},
    chunked_array_type='cubed',
    from_array_kwargs=dict(spec=spec),
)
ds

Unnamed: 0,Array,Chunk
Bytes,31.0 MB,15.5 MB
Shape,"(2920, 25, 53)","(1460, 25, 53)"
Count,3 arrays in Plan,2 Chunks
Type,float64,np.ndarray
"Array Chunk Bytes 31.0 MB 15.5 MB Shape (2920, 25, 53) (1460, 25, 53) Count 3 arrays in Plan 2 Chunks Type float64 np.ndarray",53  25  2920,

Unnamed: 0,Array,Chunk
Bytes,31.0 MB,15.5 MB
Shape,"(2920, 25, 53)","(1460, 25, 53)"
Count,3 arrays in Plan,2 Chunks
Type,float64,np.ndarray


In [19]:
# Write the Cubed-backed "air" array from the Kerchunk-opened dataset to the
# "rechunked" store. No error output is recorded for this cell, unlike the same
# call made on the Icechunk-opened dataset.
cubed.to_zarr(ds["air"].data, "rechunked")

In [18]:
# Known failure: writing the whole dataset through xarray raises
# "ValueError: Arrays must have same spec in single computation", even though
# `spec` is forwarded via chunkmanager_store_kwargs.
# NOTE(review): the third Spec listed in the error has reserved_mem=100000000 and
# executor=None, unlike `spec`; presumably some array in the write path is built
# with a default Spec - TODO confirm where it comes from.
ds.to_zarr("rechunked", safe_chunks=False, chunkmanager_store_kwargs={'spec': spec})

ValueError: Arrays must have same spec in single computation. Specs: [cubed.Spec(work_dir=None, allowed_mem=2000000000, reserved_mem=0, executor=<cubed.runtime.executors.local.SingleThreadedExecutor object at 0x1285b8990>, storage_options=None, zarr_compressor=default), cubed.Spec(work_dir=None, allowed_mem=2000000000, reserved_mem=0, executor=<cubed.runtime.executors.local.SingleThreadedExecutor object at 0x1285b8990>, storage_options=None, zarr_compressor=default), cubed.Spec(work_dir=None, allowed_mem=2000000000, reserved_mem=100000000, executor=None, storage_options=None, zarr_compressor=default)]