# Proof of concept: Virtualizing CMIP6 netcdf files

In [4]:
#!pip install virtualizarr
#!pip install git+https://github.com/jbusecke/VirtualiZarr.git@esgf-cmip-test
!pip install ipywidgets



In [5]:
from tqdm.auto import tqdm
from virtualizarr import open_virtual_dataset
from virtualizarr.kerchunk import FileType

import xarray as xr

In [10]:
from dask.diagnostics import ProgressBar

In [None]:
# data is located on public s3 (more info: https://pangeo-data.github.io/pangeo-cmip6-cloud/overview.html#netcdf-data-overview)
paths = [
    'esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_185001-186012.nc',
    'esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_187101-188012.nc',
    'esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_188101-189012.nc',
    'esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_186101-187012.nc',
    'esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_189101-190012.nc',
    'esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_190101-191012.nc',
    'esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_191101-192012.nc',
    'esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_192101-193012.nc',
    'esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_193101-194012.nc',
    'esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_194101-195012.nc',
    'esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_195101-196012.nc',
    'esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_196101-197012.nc',
    'esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_197101-198012.nc',
    'esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_198101-199012.nc',
    'esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_199101-200012.nc',
    'esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_200101-201012.nc',
    'esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_201101-201412.nc'
]

In [None]:
# load virtual datasets in serial
vds_list = []
for f in tqdm(files):
    vds = open_virtual_dataset(f, filetype=FileType.netcdf4, indexes={})
    vds_list.append(vds)

In [None]:
combined_vds = xr.combine_nested(vds_list, concat_dim=['time'], coords='minimal', compat='override')

In [None]:
combined_vds.virtualize.to_kerchunk('combined_full.json', format='json')

## Read from local json
If you executed all steps above, you should be able to execute this cell.

In [None]:
dsv_local = xr.open_dataset(
    "reference://",
    engine="zarr",
    chunks={},
    backend_kwargs={
        "consolidated": False,
        "storage_options": {
            "fo": "combined_full.json",
            "remote_protocol": "s3",
            "remote_options": {"anon": True},
        },
    },
)
dsv_local

## Read from Json on public cloud storage
I moved the resulting json to a public bucket for testing

In [None]:
dsv_bucket = xr.open_dataset(
    "reference://",
    engine="zarr",
    chunks={},
    backend_kwargs={
        "consolidated": False,
        "storage_options": {
            "target_protocol": "gs",
            "fo": 'gs://cmip6/testing-virtualizarr/proof-of-concept.json',
            "remote_protocol": "s3",
            "remote_options":{'anon':True},
        },
    }
)
dsv_bucket

In [6]:
DSID="CMIP6.CMIP.NCAR.CESM2.historical.r1i1p1f1.Amon.pr.gn.v20190401"
esgf_url = f"https://esgf-data1.llnl.gov/thredds/fileServer/user_pub_work/vzarr/{DSID}.json"

In [7]:
ds = xr.open_dataset( 
    "reference://",
     engine="zarr",
     chunks={},
   backend_kwargs={
        "storage_options": {
            "target_protocol": "http",
           "fo": esgf_url,
              "remote_protocol": "http",
       },
      },
    decode_times=False
    
)
print(f"Dataset before mean: {ds}")


1. Consolidating metadata in this existing store with zarr.consolidate_metadata().
2. Explicitly setting consolidated=False, to avoid trying to read consolidate metadata, or
3. Explicitly setting consolidated=True, to raise an error in this case instead of falling back to try reading non-consolidated metadata.
  ds = xr.open_dataset(
  var = coder.decode(var, name=name)


Dataset before mean: <xarray.Dataset> Size: 453MB
Dimensions:    (lat: 192, time: 1980, nbnd: 2, lon: 288)
Coordinates:
  * lat        (lat) float64 2kB -90.0 -89.06 -88.12 -87.17 ... 88.12 89.06 90.0
  * lon        (lon) float64 2kB 0.0 1.25 2.5 3.75 ... 355.0 356.2 357.5 358.8
  * time       (time) float64 16kB 6.749e+05 6.749e+05 ... 7.351e+05 7.351e+05
Dimensions without coordinates: nbnd
Data variables:
    lat_bnds   (time, lat, nbnd) float64 6MB dask.array<chunksize=(1, 192, 2), meta=np.ndarray>
    lon_bnds   (time, lon, nbnd) float64 9MB dask.array<chunksize=(1, 288, 2), meta=np.ndarray>
    pr         (time, lat, lon) float32 438MB dask.array<chunksize=(1, 192, 288), meta=np.ndarray>
    time_bnds  (time, nbnd) float64 32kB dask.array<chunksize=(1, 2), meta=np.ndarray>
Attributes: (12/45)
    Conventions:            CF-1.7 CMIP-6.2
    activity_id:            CMIP
    branch_method:          standard
    branch_time_in_child:   674885.0
    branch_time_in_parent:  219000.0
  

In [12]:
with ProgressBar():
    ds_mean = ds.mean().load()
print(ds_mean)

[                                        ] | 0% Completed | 459.33 ms


ReferenceNotReachable: Reference "lon_bnds/953.0.0" failed to fetch target ['http://aims3.llnl.gov/thredds/fileServer/css03_data/CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/Amon/pr/gn/v20190401/pr_Amon_CESM2_historical_r1i1p1f1_gn_185001-201412.nc', 343237628, 523]