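# VirtualiZarr + Kerchunk + Cubed

This notebook splits the xarray `air_temperature` tutorial dataset into two NetCDF files, combines them into a single virtual dataset with VirtualiZarr, writes the references out in Kerchunk JSON format, then opens the result in xarray backed by Cubed arrays, rechunks it, and saves the rechunked array to Zarr.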

In [1]:
import cubed
import xarray as xr
import virtualizarr

In [2]:
! rm -rf combined* rechunked

### Open tutorial data

In [17]:
# Load xarray's 'air_temperature' tutorial dataset and display it.
ds = xr.tutorial.open_dataset("air_temperature")
ds

### Split into two NetCDF files

In [4]:
# Split the dataset in two along the time axis and write each half to its own
# NetCDF file.
#
# The halves get descriptive names rather than ds1/ds2: `ds2` is assigned again
# in the "Rechunk" section, and sharing the name made the result of the
# "Save rechunked arrays" cell depend on cell execution order.
split_index = ds.sizes["time"] // 2  # midpoint of the time axis (1460 of 2920 steps)
ds_first_half = ds.isel(time=slice(None, split_index))
ds_second_half = ds.isel(time=slice(split_index, None))
ds_first_half.to_netcdf("air1.nc")
ds_second_half.to_netcdf("air2.nc")

  ds1.to_netcdf('air1.nc')
  ds2.to_netcdf('air2.nc')


### Combine two NetCDFs using VirtualiZarr

In [6]:
# Open each NetCDF file as a virtual dataset, passing an empty `indexes`
# mapping (presumably so that no in-memory indexes are built -- confirm
# against the VirtualiZarr docs).
vds1, vds2 = (
    virtualizarr.open_virtual_dataset(path, indexes={})
    for path in ("air1.nc", "air2.nc")
)

# Concatenate the two virtual datasets along the time dimension.
combined_vds = xr.concat(
    [vds1, vds2],
    dim="time",
    coords="minimal",
    compat="override",
)

### Write references to Kerchunk

In [7]:
# Serialize the combined virtual dataset's references to 'combined.json'
# in Kerchunk's JSON format.
combined_vds.virtualize.to_kerchunk("combined.json", format="json")

### Open in Xarray using Cubed arrays

In [8]:
# Cubed configuration: 2GB of allowed memory, single-threaded executor.
spec = cubed.Spec(allowed_mem="2GB", executor_name="single-threaded")

# Open the Kerchunk references through xarray, asking for Cubed-backed arrays
# built with the spec above.
# NOTE(review): `chunks={}` should select the backend's preferred chunking
# (one chunk per source file here, judging by the repr) -- confirm in the
# xarray docs.
ds = xr.open_dataset(
    "combined.json",
    engine="kerchunk",
    chunks={},
    chunked_array_type="cubed",
    from_array_kwargs={"spec": spec},
)
ds

Unnamed: 0,Array,Chunk
Bytes,31.0 MB,15.5 MB
Shape,"(2920, 25, 53)","(1460, 25, 53)"
Count,3 arrays in Plan,2 Chunks
Type,float64,np.ndarray
"Array Chunk Bytes 31.0 MB 15.5 MB Shape (2920, 25, 53) (1460, 25, 53) Count 3 arrays in Plan 2 Chunks Type float64 np.ndarray",53  25  2920,

Unnamed: 0,Array,Chunk
Bytes,31.0 MB,15.5 MB
Shape,"(2920, 25, 53)","(1460, 25, 53)"
Count,3 arrays in Plan,2 Chunks
Type,float64,np.ndarray


### Rechunk

In [15]:
# Rechunk so that each chunk spans the full time axis and a 5x5 lat/lon tile.
# TODO: why do we need to pass in chunked_array_type and from_array_kwargs again?
ds2 = ds.chunk(
    {"time": 2920, "lat": 5, "lon": 5},
    chunked_array_type="cubed",
    from_array_kwargs={"spec": spec},
)
ds2

Unnamed: 0,Array,Chunk
Bytes,31.0 MB,584.0 KB
Shape,"(2920, 25, 53)","(2920, 5, 5)"
Count,4 arrays in Plan,55 Chunks
Type,float64,np.ndarray
"Array Chunk Bytes 31.0 MB 584.0 KB Shape (2920, 25, 53) (2920, 5, 5) Count 4 arrays in Plan 55 Chunks Type float64 np.ndarray",53  25  2920,

Unnamed: 0,Array,Chunk
Bytes,31.0 MB,584.0 KB
Shape,"(2920, 25, 53)","(2920, 5, 5)"
Count,4 arrays in Plan,55 Chunks
Type,float64,np.ndarray


### Save rechunked arrays as Zarr

In [16]:
# Write the rechunked 'air' variable's underlying array to the 'rechunked'
# Zarr store using Cubed directly.
cubed.to_zarr(
    ds2["air"].data,
    "rechunked",
)

In [10]:
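# NOTE(review): disabled alternative that writes through xarray instead of
# calling cubed.to_zarr directly -- confirm whether it works before enabling:
# ds.to_zarr("rechunked", safe_chunks=False, chunkmanager_store_kwargs={'spec': spec})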