In [1]:
import os
from tempfile import TemporaryDirectory
from typing import Dict

import boto3
import fsspec
import ujson
import xarray as xr
from kerchunk.combine import MultiZarrToZarr
from kerchunk.hdf import SingleHdf5ToZarr

In [1]:
# Notebook configuration.
# model and variable are used to build the source S3 path and the output file name.
model = "ACCESS-CM2"
variable = "tasmax"
# Which CMIP6 collection to index: "daily" or "monthly".
temporal_resolution = "daily"
# "remote" makes the last cell upload the combined reference file to S3.
storage_location = "remote"

In [12]:
# Pick the source S3 location and the access mode for the requested collection.
if temporal_resolution == "daily":
    print("Running kerchunk generation for daily CMIP6 data...")
    # The daily branch reads anonymously (anon=True).
    anon = True
    s3_path = f"s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/{model}/historical/r1i1p1*/{variable}/*"
elif temporal_resolution == "monthly":
    print("Running kerchunk generation for monthly CMIP6 data...")
    # The monthly branch reads with credentials (anon=False).
    anon = False
    # NOTE(review): unlike the daily pattern, this prefix contains no wildcard;
    # confirm that the later glob over it returns the individual files.
    s3_path = f"s3://climatedashboard-data/cmip6/raw/monthly/CMIP6_ensemble_median/{variable}/"
else:
    # Fail here rather than letting later cells hit a NameError, or silently
    # reuse `anon` / `s3_path` left in the kernel by an earlier run.
    raise ValueError(
        f"Unsupported temporal_resolution {temporal_resolution!r}; "
        "expected 'daily' or 'monthly'."
    )

Running kerchunk generation for daily CMIP6 data...


In [13]:
# Filesystem handles: the default (empty-protocol) fsspec filesystem for
# writing, and S3 for reading the source files.
fs_write = fsspec.filesystem("")
fs_read = fsspec.filesystem(
    "s3",
    skip_instance_cache=False,
    anon=anon,
)

In [14]:
# List every object that matches the S3 glob pattern selected above
files_paths = fs_read.glob(s3_path)
# Report how many paths matched, as a quick sanity check
print(f"{len(files_paths)} discovered from {s3_path}")

65 discovered from s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/ACCESS-CM2/historical/r1i1p1*/tasmax/*


In [16]:
# Keep only the 1950 and 1951 files and turn each globbed path into a full
# URL by prepending the "s3://" scheme.
if temporal_resolution == "monthly":
    subset_files = sorted(
        "s3://" + path
        for path in files_paths
        if "month_ensemble-median" in path and ("1950" in path or "1951" in path)
    )
elif temporal_resolution == "daily":
    subset_files = sorted(
        "s3://" + path
        for path in files_paths
        if "1950.nc" in path or "1951.nc" in path
    )
else:
    # Without this branch an unexpected value would leave `subset_files`
    # undefined, or stale from an earlier run of this cell.
    raise ValueError(
        f"Unsupported temporal_resolution {temporal_resolution!r}; "
        "expected 'daily' or 'monthly'."
    )

In [17]:
# Report how many files were selected, then show the list as the cell output
print(f"{len(subset_files)} file paths were retrieved.")
subset_files

2 file paths were retrieved.


['s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/ACCESS-CM2/historical/r1i1p1f1/tasmax/tasmax_day_ACCESS-CM2_historical_r1i1p1f1_gn_1950.nc',
 's3://nex-gddp-cmip6/NEX-GDDP-CMIP6/ACCESS-CM2/historical/r1i1p1f1/tasmax/tasmax_day_ACCESS-CM2_historical_r1i1p1f1_gn_1951.nc']

In [18]:
# Keyword arguments passed to fs_read.open() for every source file.
so = {
    "mode": "rb",
    "anon": anon,
    "default_fill_cache": False,
    "default_cache_type": "first",
}

In [19]:
# Create a scratch directory on local disk; its path is handed to the
# reference-generation step below.
# Alternatively, the .json references could be written to cloud storage.
# Keep `td` referenced: the directory is removed once the TemporaryDirectory
# object is cleaned up.
td = TemporaryDirectory()
temp_dir = td.name
print(f"Writing single file references to {temp_dir}")

Writing single file references to /var/folders/jh/_03qbqf130l8hjh8rpc6f4_c0000gn/T/tmp6vxrc_y4


In [20]:
# Use Kerchunk's `SingleHdf5ToZarr` class to create a Kerchunk index from a NetCDF file.
def generate_json_reference(u, temp_dir: str):
    """Write a Kerchunk reference JSON for a single NetCDF/HDF5 file.

    Parameters
    ----------
    u : str
        URL of the source file. It is opened through the notebook-level
        ``fs_read`` filesystem using the ``so`` open options.
    temp_dir : str
        Directory that receives the ``<file name>.json`` reference; it is
        created if it does not exist yet.

    Returns
    -------
    str
        Path of the JSON reference file that was written.
    """
    # Remove only a trailing ".nc". str.strip(".nc") treats its argument as a
    # set of characters and would also eat leading/trailing '.', 'n' and 'c'.
    fname = u.split("/")[-1]
    if fname.endswith(".nc"):
        fname = fname[: -len(".nc")]

    os.makedirs(temp_dir, exist_ok=True)
    outf = os.path.join(temp_dir, f"{fname}.json")

    # Build the references while the remote file is open, and only then create
    # the output file, so a failed translation does not leave an empty .json.
    with fs_read.open(u, **so) as infile:
        h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)
        references = h5chunks.translate()

    with open(outf, "wb") as f:
        f.write(ujson.dumps(references).encode())
    return outf

In [None]:
# Generate one Kerchunk reference per selected file and collect the returned
# JSON paths. The files are independent of each other, so this is a good
# candidate for parallelising with `Dask`.
output_files = [generate_json_reference(url, temp_dir) for url in subset_files]

In [None]:
# Set up the merge of the per-file references into one consolidated
# reference, concatenated along the "time" dimension.
mzz = MultiZarrToZarr(
    output_files,
    concat_dims=["time"],
    coo_map={"time": "cf:time"},
    inline_threshold=0,
    remote_protocol="s3",
    remote_options={"anon": anon},
)

In [None]:
# Run the combine step; the result is dumped to JSON and opened through a
# "reference" filesystem in the following cells.
multi_kerchunk = mzz.translate()

In [None]:
# Write the combined kerchunk reference to a local .json file
output_fname = f"cmip6-reference/combined_CMIP6_{temporal_resolution}_{model}_{variable}_kerchunk.json"
# open() does not create parent directories, so make sure the folder exists.
os.makedirs(os.path.dirname(output_fname), exist_ok=True)
with open(output_fname, "wb") as f:
    print(f"Writing combined kerchunk reference file {output_fname}")
    f.write(ujson.dumps(multi_kerchunk).encode())

In [None]:
# Expose the combined references through fsspec's "reference" filesystem
# (remote protocol: S3) and wrap it in a mapper that Xarray can open as Zarr.
fs = fsspec.filesystem(
    "reference",
    remote_protocol="s3",
    remote_options={"anon": anon},
    fo=multi_kerchunk,
)
m = fs.get_mapper("")

In [None]:
# Sanity check: open the reference mapper with the Zarr engine
# (consolidated=False) and print the dataset summary.
ds = xr.open_dataset(
    m,
    engine="zarr",
    backend_kwargs={"consolidated": False},
)
print(ds)

In [None]:
# Optionally publish the combined reference file to S3.
bucket_name = 'nasa-eodc-data-store'
if storage_location == 'remote':
    s3 = boto3.client('s3')
    # upload_file returns None and raises on failure, so there is no response
    # to report; reaching the print means the upload succeeded.
    # The local relative path is reused as the object key.
    s3.upload_file(output_fname, bucket_name, output_fname)
    print(f"Uploaded {output_fname} to s3://{bucket_name}/{output_fname}.")