# Create Zarr from CMIP6 NetCDF with `xarray.to_zarr`

This notebook creates a Zarr datastore from CMIP6 data using the native xarray `to_zarr` method in place of a pangeo-forge recipe.

In [1]:
import sys; sys.path.append('..')
import eodc_hub_role
import fsspec
import s3fs
import xarray as xr

In [2]:
# Name of the S3 bucket this notebook works with (the Zarr write cell at the
# end targets a bucket with this same name).
bucket = 'nasa-eodc-data-store'

# Obtain role credentials via the project helper. The returned mapping is
# indexed later by 'AccessKeyId', 'SecretAccessKey' and 'SessionToken'.
credentials = eodc_hub_role.fetch_and_set_credentials()

In [7]:
# Run configuration: everything a reader might want to tune lives here.
temporal_resolution = "daily"  # label of the product's time step
model = "GISS-E2-1-G"          # interpolated into the source glob pattern
variable = "tas"               # interpolated into the source glob pattern
anon = True                    # forwarded as `anon` to the listing filesystem

In [6]:
# Glob pattern for the source objects: everything stored under the chosen
# variable, for every r1i1p1* directory of the model's historical run.
s3_path = f"s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/{model}/historical/r1i1p1*/{variable}/*"

# Filesystem used to list the source bucket; `anon` comes from the
# configuration cell.
fs_read = fsspec.filesystem(
    "s3",
    anon=anon,
    skip_instance_cache=False,
)

# An empty protocol string is passed here; fsspec presumably resolves it to
# its default (local) filesystem -- TODO confirm. `fs_write` is not referenced
# by any other cell visible in this notebook.
fs_write = fsspec.filesystem("")

In [8]:
# Retrieve the list of available source files. Each file appears to hold one
# year of daily data (not one month): the two files combined further down
# yield 730 time steps.
files_paths = fs_read.glob(s3_path)
print(f"{len(files_paths)} discovered from {s3_path}")

# Stop here with a clear message when nothing matched, instead of hitting an
# IndexError when files_paths[0] is accessed in a later cell.
assert files_paths, f"No files matched {s3_path}"

65 discovered from s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/GISS-E2-1-G/historical/r1i1p1*/tas/*


In [9]:
# Authenticated S3 filesystem built from the role credentials fetched at the
# top of the notebook; it is used for opening the source files and for the
# Zarr write.
s3_fs = s3fs.S3FileSystem(
    key=credentials['AccessKeyId'],
    secret=credentials['SecretAccessKey'],
    token=credentials['SessionToken'],
    anon=False,
)

# Open only the first discovered file to inspect a single dataset. The
# 's3://' prefix is added explicitly (the glob results are presumably returned
# without a protocol -- verify).
filepath = f's3://{files_paths[0]}'
first_file = s3_fs.open(filepath)
ds = xr.open_dataset(first_file, engine='h5netcdf')
ds

In [9]:
# Open file handles for the first two discovered files only.
fileset = list(map(s3_fs.open, files_paths[:2]))

# Combine the files by their coordinates into one lazily loaded dataset,
# chunked as one time step per chunk spanning the full 600 x 1440 lat/lon grid.
target_chunks = {'time': 1, 'lat': 600, 'lon': 1440}
data = xr.open_mfdataset(fileset, combine='by_coords', chunks=target_chunks)

In [10]:
# Show the combined dataset's repr to check its shape and chunk layout.
data

Unnamed: 0,Array,Chunk
Bytes,2.35 GiB,3.30 MiB
Shape,"(730, 600, 1440)","(1, 600, 1440)"
Dask graph,730 chunks in 5 graph layers,730 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 2.35 GiB 3.30 MiB Shape (730, 600, 1440) (1, 600, 1440) Dask graph 730 chunks in 5 graph layers Data type float32 numpy.ndarray",1440  600  730,

Unnamed: 0,Array,Chunk
Bytes,2.35 GiB,3.30 MiB
Shape,"(730, 600, 1440)","(1, 600, 1440)"
Dask graph,730 chunks in 5 graph layers,730 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [11]:
# Reuse the bucket name from the setup cell rather than repeating the literal.
bucket_name = bucket
# Key of the Zarr store inside the bucket. The '600_1440_1' prefix appears to
# record the chunk shape used when opening the data (lat=600, lon=1440,
# time=1). The store name is derived from the configuration cell so that
# changing the model, variable or resolution cannot overwrite a store that is
# labelled as a different product.
# NOTE: this rebinds `s3_path`, which earlier held the source glob pattern;
# re-run the glob-pattern cell before re-running the file discovery cell.
s3_path = f'600_1440_1_no-coord-chunks/CMIP6_{temporal_resolution}_{model}_{variable}.zarr'
# Write the xarray dataset to Zarr format on S3 (mode='w' overwrites an existing store)
store = s3fs.S3Map(root=f"{bucket_name}/{s3_path}", s3=s3_fs, check=False)
data.to_zarr(store, mode='w')

<xarray.backends.zarr.ZarrStore at 0x7f3326317d80>