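# Write Virtual Icechunk with VirtualiZarr

This notebook opens one day (48 half-hourly granules) of GPM IMERG (`GPM_3IMERGHH.07`) HDF5 files on S3 as virtual datasets with VirtualiZarr, concatenates them along `time`, and writes the result to an Icechunk repository in Arraylake.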

In [1]:
from arraylake import Client
from s3fs import S3FileSystem
from tqdm.notebook import tqdm
from virtualizarr.readers.hdf import HDFVirtualBackend
from virtualizarr import open_virtual_dataset
import xarray as xr
import zarr

In [2]:
# Show the installed zarr version; the fill-value workaround further down in
# this notebook is specific to zarr_format=3.
zarr.__version__

'3.0.0b2'

In [3]:
# Filesystem handle used to list the source granules.
s3 = S3FileSystem()

# Collection root, plus the year and day-of-year selecting which day to list.
# Change these two values to point the notebook at a different day.
base_url = "s3://gesdisc-cumulus-prod-protected/GPM_L3/GPM_3IMERGHH.07"
year = 1998
day_of_year = 1

# The listing path is <base_url>/<year>/<day>, with the day zero-padded to
# three digits (e.g. ".../1998/001").
one_day = s3.ls(f"{base_url}/{year}/{day_of_year:03d}")
one_day

['gesdisc-cumulus-prod-protected/GPM_L3/GPM_3IMERGHH.07/1998/001/3B-HHR.MS.MRG.3IMERG.19980101-S000000-E002959.0000.V07B.HDF5',
 'gesdisc-cumulus-prod-protected/GPM_L3/GPM_3IMERGHH.07/1998/001/3B-HHR.MS.MRG.3IMERG.19980101-S003000-E005959.0030.V07B.HDF5',
 'gesdisc-cumulus-prod-protected/GPM_L3/GPM_3IMERGHH.07/1998/001/3B-HHR.MS.MRG.3IMERG.19980101-S010000-E012959.0060.V07B.HDF5',
 'gesdisc-cumulus-prod-protected/GPM_L3/GPM_3IMERGHH.07/1998/001/3B-HHR.MS.MRG.3IMERG.19980101-S013000-E015959.0090.V07B.HDF5',
 'gesdisc-cumulus-prod-protected/GPM_L3/GPM_3IMERGHH.07/1998/001/3B-HHR.MS.MRG.3IMERG.19980101-S020000-E022959.0120.V07B.HDF5',
 'gesdisc-cumulus-prod-protected/GPM_L3/GPM_3IMERGHH.07/1998/001/3B-HHR.MS.MRG.3IMERG.19980101-S023000-E025959.0150.V07B.HDF5',
 'gesdisc-cumulus-prod-protected/GPM_L3/GPM_3IMERGHH.07/1998/001/3B-HHR.MS.MRG.3IMERG.19980101-S030000-E032959.0180.V07B.HDF5',
 'gesdisc-cumulus-prod-protected/GPM_L3/GPM_3IMERGHH.07/1998/001/3B-HHR.MS.MRG.3IMERG.19980101-S033000-E

In [4]:

# Variables excluded from every virtual dataset.
drop_variables = ["Intermediate", "nv", "lonv", "latv"]
# Coordinates loaded (not virtualized) and set as coords for the first file ...
all_coords = ["time", "lon", "lat", "time_bnds", "lon_bnds", "lat_bnds"]
# ... and the reduced set used for every file after the first.
min_coords = ["time", "time_bnds"]

# Drop list for every file after the first: the base list plus the coordinates
# that are only kept from the first file. Built once, outside the loop, and in
# `all_coords` order so the list is identical on every run.
# NOTE(review): presumably lon/lat (and their bounds) are identical across
# granules, so one copy is enough for the concatenation -- confirm.
later_drop_variables = drop_variables + [
    name for name in all_coords if name not in min_coords
]

dsets = []
for file in tqdm(one_day):
    # The listing holds bucket-relative keys, so the scheme is added back here.
    url = f"s3://{file}"
    if not dsets:
        # First file: keep and load every coordinate.
        my_drop_variables, my_coords = drop_variables, all_coords
    else:
        # Later files: keep and load only the time coordinates.
        my_drop_variables, my_coords = later_drop_variables, min_coords
    ds = open_virtual_dataset(
        url,
        indexes={},
        group="Grid",
        backend=HDFVirtualBackend,
        drop_variables=my_drop_variables,
        # The loaded variables are exactly the ones promoted to coordinates.
        loadable_variables=my_coords,
    ).set_coords(my_coords)
    dsets.append(ds)


  0%|          | 0/48 [00:00<?, ?it/s]

In [6]:
# Stack the per-file virtual datasets along `time`.
# coords="minimal" concatenates only coordinates that already contain the
# `time` dimension; join="override" reuses the first dataset's indexes for the
# other dimensions instead of aligning them.
# (`xr` comes from the import cell at the top of the notebook.)
ds_concat = xr.concat(dsets, dim="time", coords="minimal", join="override")
ds_concat

### Workaround for VirtualiZarr's broken fill-value encoding with `zarr_format=3`

See https://github.com/zarr-developers/VirtualiZarr/issues/343 for details.

In [7]:
from xarray.backends.zarr import FillValueCoder

fill_coder = FillValueCoder()

# Zarr V3 needs the fill value promoted to a `_FillValue` attribute on each
# data variable.
for var_name in ds_concat.data_vars:
    data_var = ds_concat[var_name]
    var_dtype = data_var.dtype
    # The fill_value recorded on the virtual array's zarray metadata is wrong
    # (reader bug), so the missing-value code is taken from the
    # `CodeMissingValue` attribute and cast to the variable's dtype instead.
    missing_value = var_dtype.type(data_var.attrs['CodeMissingValue'])
    encoded_missing = fill_coder.encode(missing_value, var_dtype)
    data_var.attrs['_FillValue'] = encoded_missing
    # One line per variable: name, dtype, raw fill value, encoded fill value.
    print(var_name, var_dtype, missing_value, encoded_missing)

ds_concat

precipitation float32 -9999.9 AAAAQPOHw8A=
randomError float32 -9999.9 AAAAQPOHw8A=
probabilityLiquidPrecipitation int16 -9999 -9999
precipitationQualityIndex float32 -9999.9 AAAAQPOHw8A=


In [8]:
# Name of the repository that will hold the virtual dataset.
repo_name = "nasa-impact/GPM_3IMERGHH.07-virtual"

# Create the repository as an Icechunk-backed repo and display the returned
# store object.
client = Client()
ic_repo = client.create_repo(repo_name, kind="icechunk")
ic_repo

<icechunk.IcechunkStore at 0x7f6d33c42bd0>

In [9]:
# Write the concatenated virtual dataset into the Icechunk store held in `ic_repo`.
ds_concat.virtualize.to_icechunk(ic_repo)

  out += (get_codec_class(name_parsed).from_dict(c),)
  warn(
  out += (get_codec_class(name_parsed).from_dict(c),)
  warn(
  out += (get_codec_class(name_parsed).from_dict(c),)
  warn(
  out += (get_codec_class(name_parsed).from_dict(c),)
  warn(


In [10]:
# Commit the write with a message; the cell displays the string that commit()
# returns (presumably the new commit/snapshot id -- confirm).
ic_repo.commit("wrote virtual dataset")

'ANYB3BTF2ZF6ANR5FFZ0'