# Generate Kerchunk Reference from CMIP6 NetCDF files

This notebook demonstrates how to create a kerchunk reference from NetCDF files on S3.

In [1]:
from tempfile import TemporaryDirectory
import boto3
import fsspec
import json
import os
import ujson
import xarray as xr
from kerchunk.combine import MultiZarrToZarr
from kerchunk.hdf import SingleHdf5ToZarr
from typing import Dict
import sys; sys.path.append('..')
import helpers.eodc_hub_role as eodc_hub_role

In [2]:
# Fetch credentials through the project helper.
# NOTE(review): the helper's name suggests it also sets them for later AWS calls — confirm in helpers/eodc_hub_role.
credentials = eodc_hub_role.fetch_and_set_credentials()

In [None]:
# Quick check that the AWS CLI can authenticate: with no arguments, `aws s3 ls` lists buckets.
!aws s3 ls

In [23]:
# Notebook configuration: which CMIP6 model/variable to index and where the result goes.
bucket_name = "veda-data-store-staging"  # bucket the combined reference is uploaded to at the end
model = "GISS-E2-1-G"
variable = "tas"
anon = True  # passed as `anon` to the S3 filesystems that read the source files
# Glob pattern selecting the source NetCDF objects for this model/variable.
s3_path = (
    "s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/"
    f"{model}/historical/r1i1p1*/{variable}/*"
)

In [None]:
#!aws s3 ls s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/{model}/ssp585/r1i1p1f2/tas/

In [24]:
# Initiate fsspec filesystems for reading and writing
# fs_read: S3 filesystem using the `anon` setting from the configuration cell.
fs_read = fsspec.filesystem("s3", anon=anon, skip_instance_cache=False)
# fs_write: empty protocol string.
# NOTE(review): presumably resolves to fsspec's default local filesystem — confirm.
fs_write = fsspec.filesystem("")

In [25]:
# List every object matching the glob pattern and report how many were found
files_paths = fs_read.glob(s3_path)
print(f"{len(files_paths)} discovered from {s3_path}")

65 discovered from s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/GISS-E2-1-G/historical/r1i1p1*/tas/*


In [26]:
# Prefix each globbed path with "s3://" and keep the resulting URLs in sorted order.
all_files = sorted(f"s3://{path}" for path in files_paths)

In [None]:
# Keyword arguments forwarded to `fs_read.open(...)` when each NetCDF file is scanned.
so = {
    "mode": "rb",
    "anon": anon,
    "default_fill_cache": False,
    "default_cache_type": "first",
}

# Inspecting no-data values

In [40]:
import s3fs
# Open the first discovered file directly with s3fs (anonymous access) to look at its values.
fs = s3fs.S3FileSystem(anon=True)
aws_url = all_files[0]

# The handle is not closed in this cell.
# NOTE(review): presumably left open on purpose because the next cell still reads values through `ds` — confirm.
fileObj = fs.open(aws_url)
ds = xr.open_dataset(fileObj, engine='h5netcdf')

In [42]:
# Slice the DataArray before converting to NumPy so only the first 10 entries
# along the leading dimension are read, instead of loading the whole variable
# from S3 and slicing the in-memory array afterwards. The displayed array is the same.
ds.tas[0:10].values

array([[[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        ...,
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        ...,
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        ...,
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       ...,

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan

In [None]:
# Create a temporary directory to hold the per-file .json reference files.
# Alternatively, these could be written to cloud storage.
# Keep `td` referenced for the whole session: the directory is removed when
# the TemporaryDirectory object is cleaned up.
td = TemporaryDirectory()
temp_dir = td.name
print(f"Writing single file references to {temp_dir}")

In [None]:
# Use Kerchunk's `SingleHdf5ToZarr` method to create a `Kerchunk` index from a NetCDF file.
def generate_json_reference(u):
    """Build a Kerchunk reference for one NetCDF file.

    Args:
        u: URL of the NetCDF file. It is opened through the module-level
            `fs_read` filesystem using the module-level `so` open options.

    Returns:
        A ``(fname, reference_json)`` tuple. ``fname`` is the last path
        component of ``u`` with a trailing ``.nc`` extension removed, and
        ``reference_json`` is the encoded JSON of the reference returned by
        ``SingleHdf5ToZarr(...).translate()``.
    """
    # Remove only a literal ".nc" suffix. str.strip(".nc") treats its argument
    # as a set of characters and would also eat any leading or trailing
    # '.', 'n' and 'c' that belong to the file name itself.
    fname = u.split("/")[-1]
    if fname.endswith(".nc"):
        fname = fname[: -len(".nc")]
    with fs_read.open(u, **so) as infile:
        print(infile)
        h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)
        return fname, ujson.dumps(h5chunks.translate()).encode()
    
def write_json(fname, reference_json, temp_dir):
    """Write encoded reference bytes to ``<temp_dir>/<fname>.json``.

    Args:
        fname: Base name of the output file (without the ``.json`` extension).
        reference_json: Bytes to write.
        temp_dir: Directory that receives the file.

    Returns:
        The path of the file that was written.
    """
    target_path = os.path.join(temp_dir, f"{fname}.json")
    with open(target_path, mode="wb") as sink:
        sink.write(reference_json)
    return target_path

# Test we can create a kerchunk reference for one file

In [None]:
# Smoke test on the first file; the output path returned by write_json is shown as the cell result.
fname, ref_json = generate_json_reference(all_files[0])
write_json(fname, ref_json, temp_dir)

# Start the dask cluster

In [None]:
from dask_gateway import GatewayCluster, Gateway

gateway = Gateway()
clusters = gateway.list_clusters()

# Reuse an existing cluster when there is one - useful when the kernel shut down in the middle of an interactive session
if clusters:
    # Reconnect to the first cluster the gateway reports
    cluster = gateway.connect(clusters[0].name)
else:
    # No cluster yet: start a new one (created with shutdown_on_close=True)
    cluster = GatewayCluster(shutdown_on_close=True)

# Scale the cluster to 16 workers and get a client for it; the client is displayed as the cell result
cluster.scale(16)
client = cluster.get_client()
client

In [None]:
# Iterate through filelist to generate Kerchunked files. Good use for `Dask`
import dask.bag as db


In [None]:
#jobs = db.map(generate_json_reference, all_files[0:2])
# partition_size=1 puts each file URL in its own bag partition
bag = db.from_sequence(all_files, partition_size=1)
# Apply the reference generator to every URL; compute() runs it and collects
# the (fname, encoded JSON) tuples it returns
result = db.map(generate_json_reference, bag)
all_references = result.compute()

In [None]:
# Persist every (name, reference bytes) pair into temp_dir and keep the written paths.
output_files = [
    write_json(ref_name, ref_bytes, temp_dir)
    for ref_name, ref_bytes in all_references
]

In [None]:
# Combine the individual references into a single consolidated reference
mzz = MultiZarrToZarr(
    output_files,  # paths of the per-file reference JSONs
    remote_protocol='s3',
    remote_options={'anon': anon},
    concat_dims=['time'],  # concatenate the per-file references along "time"
    coo_map={"time": "cf:time"},  # NOTE(review): presumably decodes "time" using CF conventions — confirm in kerchunk docs
    inline_threshold=0  # NOTE(review): presumably disables inlining of small chunks — confirm
)

In [None]:
%%time
# Run the combine step on `mzz` and time it; the result is the combined reference.
multi_kerchunk = mzz.translate()

In [None]:
# File name for the combined kerchunk .json record (this cell only builds the name)
output_fname = f"combined_CMIP6_daily_{model}_{variable}_kerchunk.json"

In [None]:
# Serialize the combined reference into temp_dir under output_fname.
output_location = os.path.join(temp_dir, output_fname)
with open(output_location, "wb") as combined_file:
    print(f"Writing combined kerchunk reference file {output_location}")
    combined_file.write(ujson.dumps(multi_kerchunk).encode())

In [None]:
# open dataset as zarr object using fsspec reference file system and Xarray
fs = fsspec.filesystem(
    "reference", fo=multi_kerchunk, remote_protocol="s3", remote_options={"anon": anon}
)
m = fs.get_mapper("")

In [None]:
# Open the mapper with the zarr engine (no consolidated metadata) and print a summary.
ds = xr.open_dataset(m, engine="zarr", backend_kwargs={"consolidated": False})
print(ds)

In [None]:
# Show the attributes of `tas` after selecting the first time step
ds.isel(time=0).tas.attrs

In [None]:
# Upload the combined reference file to the destination bucket.
s3 = boto3.client('s3')
object_key = f'cmip6-{model}-{variable}-kerchunk/{output_fname}'
# upload_file returns None and raises on failure, so report the destination
# instead of printing a "response" that is always None.
s3.upload_file(output_location, bucket_name, object_key)
print(f"Uploaded {output_fname} to s3://{bucket_name}/{object_key}")

In [None]:
!aws s3 ls s3://veda-data-store-staging/cmip6-GISS-E2-1-G-tas-kerchunk/combined_CMIP6_daily_GISS-E2-1-G_tas_kerchunk.json