In [2]:
# Show the installed VirtualiZarr version (the recorded run reported '1.3.1').
import virtualizarr
virtualizarr.__version__

'1.3.1'

In [3]:
from datetime import datetime, timedelta
import pandas as pd
import coiled
from s3fs import S3FileSystem

In [4]:
# Start a Coiled cluster with a single worker and obtain a client for it.
# NOTE(review): the recorded output of this cell lists different python and
# tornado versions for the client and the scheduler — confirm the
# "icechunk-virtualizarr" software environment matches the local one.
cluster = coiled.Cluster(
    workspace="earthmover-devs",
    software="icechunk-virtualizarr",
    region="us-west-2",
    n_workers=1,
)
client = cluster.get_client()

Output()


+---------+----------------+----------------+---------+
| Package | Client         | Scheduler      | Workers |
+---------+----------------+----------------+---------+
| python  | 3.12.7.final.0 | 3.12.0.final.0 | None    |
| tornado | 6.4.1          | 6.4.2          | None    |
+---------+----------------+----------------+---------+


In [5]:
# Scale the cluster up from its initial single worker to 50.
cluster.scale(50)

In [6]:
base_url = "s3://gesdisc-cumulus-prod-protected/GPM_L3/GPM_3IMERGHH.07"

def make_url(date: datetime) -> str:
    """Build the S3 URL of the half-hourly granule that starts at ``date``.

    The result is ``<base_url>/<YYYY>/<day of year>/<file name>``. The file
    name embeds the calendar date, the start time (``-S``), the end time
    29 minutes 59 seconds after ``date`` (``-E``) and the number of whole
    minutes elapsed since midnight of the same day, zero-padded to 4 digits.
    """
    midnight = datetime(date.year, date.month, date.day)
    minutes_into_day = (date - midnight) // timedelta(minutes=1)
    granule_end = date + timedelta(minutes=29, seconds=59)

    file_name = (
        f"3B-HHR.MS.MRG.3IMERG.{date.strftime('%Y%m%d')}"
        f"-S{date.strftime('%H%M%S')}"
        f"-E{granule_end.strftime('%H%M%S')}"
        f".{minutes_into_day:04d}"
        ".V07B.HDF5"
    )
    year_dir = f"{date.year:04d}"
    day_of_year_dir = date.strftime('%j')
    return f"{base_url}/{year_dir}/{day_of_year_dir}/{file_name}"


In [7]:
from dask import compute

In [8]:
def get_info(url, fs=None):
    """Return the filesystem metadata for ``url``, or ``None`` if it is missing.

    Parameters
    ----------
    url : str
        Path/URL passed to ``fs.info``.
    fs : optional
        Any object exposing ``info(url)``. When omitted, the notebook-level
        ``s3`` object is used.
        NOTE(review): no visible cell defines ``s3`` — presumably it was an
        ``S3FileSystem`` instance created in a cell that has since been
        removed. Restore that cell or pass ``fs`` explicitly.

    Returns
    -------
    Whatever ``fs.info(url)`` returns, or ``None`` when it raises
    ``FileNotFoundError``. Any other exception propagates.
    """
    if fs is None:
        # Fall back to the notebook global so existing one-argument calls
        # keep their behavior.
        fs = s3
    try:
        return fs.info(url)
    except FileNotFoundError:
        return None

def get_info_for_time(time):
    """Look up the granule starting at ``time``.

    Returns the result of ``get_info`` for the URL built by ``make_url``.
    """
    return get_info(make_url(time))

def hours_for_day(day):
    """Return the 48 half-hourly timestamps of ``day``, beginning at ``day``.

    ``day`` must fall exactly on midnight (hour, minute and second all zero);
    otherwise an ``AssertionError`` is raised.
    """
    assert (day.hour, day.minute, day.second) == (0, 0, 0)
    return pd.date_range(day, periods=48, freq="30min")

def get_info_for_day(day):
    """Collect ``get_info`` results for every half-hourly granule of ``day``.

    The returned list has one entry per timestamp produced by
    ``hours_for_day(day)``, in the same order.
    """
    infos = []
    for timestamp in hours_for_day(day):
        infos.append(get_info(make_url(timestamp)))
    return infos

In [9]:
def open_virtual(url, keep_coords=True):
    """Open the ``Grid`` group of one HDF5 granule as a virtual dataset.

    Parameters
    ----------
    url : str
        Location of the HDF5 file, passed straight to ``open_virtual_dataset``.
    keep_coords : bool, default True
        When true, all six coordinate variables (time, lon, lat and their
        ``*_bnds``) are loaded and set as coordinates. When false, only
        ``time`` and ``time_bnds`` are kept and the other four are dropped.

    Returns
    -------
    The dataset returned by ``open_virtual_dataset`` after ``set_coords``.
    """
    from virtualizarr import open_virtual_dataset
    from virtualizarr.readers.hdf import HDFVirtualBackend

    # NOTE(review): an earlier comment here said time_bnds had to be removed
    # because concat failed with
    #   MergeError: conflicting values for variable 'time_bnds' on objects to
    #   be combined. You can skip this check by specifying compat='override'.
    # The code below nevertheless keeps time_bnds in both coordinate sets —
    # confirm which of the two is intended.
    always_dropped = ["Intermediate", "nv", "lonv", "latv"]
    all_coords = ["time", "lon", "lat", "lon_bnds", "lat_bnds", "time_bnds"]
    min_coords = ["time", "time_bnds"]

    if keep_coords:
        coords, dropped = all_coords, always_dropped
    else:
        coords = min_coords
        # Also drop every coordinate that is not in the minimal set.
        dropped = always_dropped + list(set(all_coords) - set(min_coords))

    virtual_ds = open_virtual_dataset(
        url,
        indexes={},
        group="Grid",
        backend=HDFVirtualBackend,
        drop_variables=dropped,
        loadable_variables=coords,
    )
    return virtual_ds.set_coords(coords)

In [10]:
def reduce_via_concat(dsets, concat_dim="time"):
    """Concatenate ``dsets`` along ``concat_dim`` with ``xarray.concat``.

    Uses ``coords="minimal"`` and ``join="override"``.
    """
    from xarray import concat

    concat_options = {"dim": concat_dim, "coords": "minimal", "join": "override"}
    return concat(dsets, **concat_options)

In [11]:
from xarray.backends.zarr import FillValueCoder

def fix_ds(ds):
    """Return a copy of ``ds`` with an encoded ``_FillValue`` attr on each data variable.

    For every data variable the ``CodeMissingValue`` attribute is cast to the
    variable's dtype, encoded with ``FillValueCoder`` and stored under
    ``_FillValue`` (promoting the fill value to an attribute for Zarr V3).
    A variable without ``CodeMissingValue`` raises ``KeyError``.
    """
    fixed = ds.copy()
    coder = FillValueCoder()
    for name in fixed.data_vars:
        attrs = fixed[name].attrs
        dtype = fixed[name].dtype
        # The fill value comes from the CodeMissingValue attribute rather than
        # from the array's zarray.fill_value, which the author noted is wrong
        # because of a bug in the reader.
        missing_value = dtype.type(attrs['CodeMissingValue'])
        attrs['_FillValue'] = coder.encode(missing_value, dtype)

    return fixed

In [12]:
import dask.bag as db
import itertools

In [13]:
def dset_for_year(year):
    """Build the combined virtual dataset for every half-hourly granule of ``year``.

    Each timestamp from Jan 1 through Dec 31 is turned into a URL, opened
    with ``open_virtual`` inside a dask bag, and the results are concatenated
    with ``reduce_via_concat`` (per partition, then across partitions).
    The concatenated dataset is passed through ``fix_ds`` before returning.
    """
    days = pd.date_range(start=f"{year}-01-01", end=f"{year}-12-31", freq="1D")
    timestamps = list(itertools.chain.from_iterable(hours_for_day(day) for day in days))

    # 48 timestamps per partition, i.e. one partition per day.
    virtual_dsets = (
        db.from_sequence(timestamps, partition_size=48)
        .map(make_url)
        .map(open_virtual)
    )
    combined = virtual_dsets.reduction(reduce_via_concat, reduce_via_concat).compute()
    return fix_ds(combined)

In [14]:
# Open the last half-hourly granule of 1998 on its own and display it.
year = 1998
all_days = pd.date_range(start=f"{year}-01-01", end=f"{year}-12-31", freq="1D")
all_times = list(itertools.chain.from_iterable(hours_for_day(day) for day in all_days))
url = make_url(all_times[-1])
ds = open_virtual(url)
ds

In [15]:
# Build the combined virtual dataset for all of 1998.
ds_1998 = dset_for_year(1998)


In [16]:
# Display the 1998 virtual dataset.
ds_1998

In [17]:
import boto3
import icechunk

# Prefix (under ``icechunk/``) of the 1998-only store.
store_name = "2025-02-27/GPM_3IMERGHH.07-virtual-1998-v2"

# Bound to ``boto_session`` rather than ``session``: later cells use the name
# ``session`` for the icechunk writable session, and sharing the name would
# let ``session.commit(...)`` hit the wrong object.
boto_session = boto3.Session()

# Get the credentials from the boto3 session
credentials = boto_session.get_credentials()

# Extract the actual key, secret, and token as a frozen snapshot
creds = credentials.get_frozen_credentials()
storage = icechunk.s3_storage(
    bucket='nasa-veda-scratch',
    prefix=f"icechunk/{store_name}",
    access_key_id=creds.access_key,
    secret_access_key=creds.secret_key,
    session_token=creds.token,
)

In [18]:
# Open (or create) the icechunk repository at ``storage``.
repo = icechunk.Repository.open_or_create(
    storage=storage,
)
# The cells that follow write to ``store`` and commit through ``session``,
# so create the writable session on ``main`` here.
session = repo.writable_session("main")
store = session.store

repo

<icechunk.repository.Repository at 0x7fc64e88eb40>

In [20]:
# Write the 1998 virtual dataset into the icechunk store.
ds_1998.virtualize.to_icechunk(store)

  super().__init__(**codec_config)
  super().__init__(**codec_config)
  super().__init__(**codec_config)
  super().__init__(**codec_config)


In [21]:
# Commit the write (the recorded run returned '8QP4W228P75VG7XJYV7G').
session.commit("wrote 1998")

'8QP4W228P75VG7XJYV7G'

In [22]:
# Point ``storage`` at the ``...-virtual-full`` prefix in the same bucket,
# reusing the frozen credentials in ``creds``.
store_name = "2025-02-27/GPM_3IMERGHH.07-virtual-full"

storage = icechunk.s3_storage(
    bucket='nasa-veda-scratch',
    prefix=f"icechunk/{store_name}",
    access_key_id=creds.access_key,
    secret_access_key=creds.secret_key,
    session_token=creds.token            
)

In [23]:
# Open (or create) the repository at the current ``storage`` and start a
# writable session on ``main``; ``store`` is that session's store.
repo = icechunk.Repository.open_or_create(
    storage=storage,
)
session = repo.writable_session("main")
store = session.store

In [24]:
# Write 1998 as the initial contents of the store, then commit
# (the recorded run returned '3E57T4C5A81QNCQ19D60').
ds_1998.virtualize.to_icechunk(store)
session.commit("wrote 1998")

  super().__init__(**codec_config)
  super().__init__(**codec_config)
  super().__init__(**codec_config)
  super().__init__(**codec_config)


'3E57T4C5A81QNCQ19D60'

In [25]:
# Build one combined virtual dataset per year for 1999 through 2008.
all_dsets = [dset_for_year(year) for year in range(1999, 2009)]

In [27]:
# Display the dataset currently bound to ``ds``.
ds

In [26]:
# Append each year's dataset to the store along ``time``, with a fresh
# writable session and one commit per year.
# NOTE(review): the recorded run of this cell stopped on the first year (1999)
# with ``KeyError: '///precipitation'``; that failure is not addressed here.
years = range(1999, 2009)
# Pair years with datasets directly (no index bookkeeping) and use a dedicated
# loop name so the notebook-level ``ds`` is not overwritten. ``strict=True``
# raises if ``all_dsets`` does not hold exactly one dataset per year.
for year, ds_year in zip(years, all_dsets, strict=True):
    print(year)
    session = repo.writable_session("main")
    ds_year.virtualize.to_icechunk(session.store, append_dim="time")
    session.commit(f"wrote {year}")

1999


  super().__init__(**codec_config)
  super().__init__(**codec_config)
  super().__init__(**codec_config)
  super().__init__(**codec_config)
  super().__init__(**codec_config)
  super().__init__(**codec_config)
  super().__init__(**codec_config)
  super().__init__(**codec_config)
  super().__init__(**codec_config)


KeyError: '///precipitation'

In [30]:
# Combine 1998 with the first four entries of ``all_dsets`` along ``time``.
# ``xr`` is only imported inside ``reduce_via_concat``, not at notebook scope,
# so go through that helper. It also applies the same ``coords="minimal"``,
# ``join="override"`` options used when each year was assembled.
# NOTE(review): the recorded run with a bare ``xr.concat`` (default options)
# was interrupted (KeyboardInterrupt) — confirm this variant completes.
ds_5yr = reduce_via_concat([ds_1998] + all_dsets[:4], concat_dim="time")
ds_5yr

KeyboardInterrupt: 

In [21]:
# Build the per-year virtual datasets for 1999 through 2008, printing each
# year as it starts. The commented-out lines are a disabled variant that
# looped over 2000-2023 and appended/committed each year through ``ic_repo``.
#for year in range(2000, 2024):
all_dsets = []
for year in range(1999, 2009):
    print(year)
    ds_year = dset_for_year(year)
    #ds_year.virtualize.to_icechunk(ic_repo, append_dim="time")
    #cid = ic_repo.commit(f"Appended {year}")
    #print(cid)
    all_dsets.append(ds_year)

1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
