# Create Zarr Stores with Different Chunk Shapes

In this notebook, we create Zarr stores for the CMIP6 TAS daily data available in NetCDF on S3. This method of creating Zarr stores uses [pangeo-forge](https://pangeo-forge.readthedocs.io/) and its [recipes](https://pangeo-forge.readthedocs.io/en/latest/pangeo_forge_recipes/recipe_user_guide/index.html) pattern.

## 1.1 Install and import libraries

In [3]:
%%capture
!pip install loguru

In [4]:
import fsspec
import s3fs
import xarray as xr
import sys; sys.path.append('..')
from helpers.profiler import Timer
import helpers.eodc_hub_role as eodc_hub_role

In [5]:
# Destination for the generated Zarr stores: bucket name and key prefix.
bucket = "nasa-eodc-data-store"
zarr_directory = "test-data/cmip6-zarr"

# Credentials for writing to the destination bucket. The result is indexed
# later with 'aws_access_key_id', 'aws_secret_access_key' and 'aws_session_token'.
# NOTE(review): the helper's name suggests it also sets the credentials
# somewhere (e.g. the environment) — confirm in helpers/eodc_hub_role.
credentials = eodc_hub_role.fetch_and_set_credentials()

Note: This is adapted from https://github.com/carbonplan/benchmark-maps/blob/datasets/stores/01b_cmip6_netcdf_to_zarr.ipynb.

## 1.2 Set parameters

In [6]:
# parameters
# `model` and `variable` are interpolated into both the source S3 glob pattern
# and the names of the generated Zarr stores.
model = "GISS-E2-1-G"
variable = "tas"
# Passed to the fsspec S3 filesystem used to list the source bucket.
anon = True

In [7]:
# Glob pattern for the source NetCDF files: every object under the chosen
# model / variable for the historical experiment, for any `r1i1p1*` member.
s3_path = (
    "s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/"
    f"{model}/historical/r1i1p1*/{variable}/*"
)

# Filesystem used to list the source bucket.
fs_read = fsspec.filesystem("s3", anon=anon, skip_instance_cache=False)

# NOTE(review): `fs_write` is created with an empty protocol string and is not
# referenced anywhere else in the visible notebook — confirm whether it is
# still needed.
fs_write = fsspec.filesystem("")

In [8]:
# Retrieve the list of source files matching the glob pattern.
# NOTE(review): the sample path displayed below ends in a year (`..._1950.nc`),
# so these look like one file per year rather than per month — confirm.
files_paths = fs_read.glob(s3_path)
print(f"{len(files_paths)} discovered from {s3_path}")

65 discovered from s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/GISS-E2-1-G/historical/r1i1p1*/tas/*


In [7]:
# Inspect the first discovered path; note it carries no `s3://` scheme prefix.
files_paths[0]

'nex-gddp-cmip6/NEX-GDDP-CMIP6/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950.nc'

## 1.3 Test we can open the files

In [9]:
# Smoke test: open the first discovered file anonymously and show its contents.
# `ds` is reused further down to read the lat/lon sizes for the chunk shapes.
fs_s3 = s3fs.S3FileSystem(anon=True)

# The globbed paths have no scheme, so add it back before opening.
filepath = "s3://" + files_paths[0]

# The handle is deliberately left open: `ds` is built on top of it.
f = fs_s3.open(filepath, mode="rb")
ds = xr.open_dataset(f)
ds

# 2: Set up the destination

In [10]:
# Authenticated S3 filesystem built from the fetched credentials; it backs the
# mappers used to write and re-open the Zarr stores in the destination bucket.
s3_fs = s3fs.S3FileSystem(
    anon=False,
    key=credentials["aws_access_key_id"],
    secret=credentials["aws_secret_access_key"],
    token=credentials["aws_session_token"],
)

# 3: Set different target chunks

For each set of target chunk shapes defined below, generate a separate Zarr store.

In [11]:
# Each entry maps dimension name -> chunk length. Key order matters: the store
# names built later join the dict *values* in insertion order.

# Chunks spanning the full lat/lon extent of the sample dataset opened above.
_full_grid = {'lat': ds.lat.shape[0], 'lon': ds.lon.shape[0]}

# Optimized for analysis: full grid, 29 time steps per chunk.
temporal_target_chunks = {**_full_grid, 'time': 29}

# Optimized for visualization at a single time step: full grid, one step per chunk.
global_target_chunks = {**_full_grid, 'time': 1}

# Optimized for time series: long in time, small spatial tiles.
# These values are hard-coded; an earlier version derived them with
# calc_auspicious_chunks_dict(ds[variable], chunk_dims=('lat', 'lon')).
spatial_target_chunks = {'time': 365, 'lat': 262, 'lon': 262}

chunk_sets = [
    temporal_target_chunks,
    global_target_chunks,
    spatial_target_chunks,
]

In [14]:
# Show the resolved chunk shapes (lat/lon sizes are read from the sample dataset).
chunk_sets

[{'lat': 600, 'lon': 1440, 'time': 29},
 {'lat': 600, 'lon': 1440, 'time': 1},
 {'time': 365, 'lat': 262, 'lon': 262}]

In [15]:
# Write one Zarr store per chunk set and record how long each write takes.
timings = {}

# Only the first two discovered source files are used, opened through the
# credentialed filesystem.
fileset = [s3_fs.open(file) for file in files_paths[0:2]]
try:
    for chunk_set in chunk_sets:
        # Values are joined in each dict's insertion order, so the prefix is
        # lat_lon_time for the first two sets and time_lat_lon for the third.
        chunk_prefix = "_".join(map(str, chunk_set.values()))
        store_name = f"{zarr_directory}/{chunk_prefix}_CMIP6_daily_{model}_{variable}.zarr"
        # The timed region covers opening the sources, rechunking and the write.
        with Timer() as t:
            data = xr.open_mfdataset(fileset, combine='by_coords')
            data_chunked = data.chunk(chunk_set)
            store = s3fs.S3Map(root=f"{bucket}/{store_name}", s3=s3_fs, check=False)
            # mode='w' overwrites any existing store. Consolidated metadata is
            # requested explicitly because the verification cell opens the
            # stores with consolidated=True.
            data_chunked.to_zarr(store, mode='w', consolidated=True)
        # NOTE(review): presumably t.elapsed is in seconds, making this
        # milliseconds — confirm against helpers.profiler.Timer.
        timings[chunk_prefix] = round(t.elapsed * 1000, 2)
finally:
    # Release the S3 file handles even if a write fails. `data` and
    # `data_chunked` are backed by these handles and should not be read
    # after this cell has finished.
    for source_file in fileset:
        source_file.close()

In [16]:
# Recorded duration per store, keyed by the chunk prefix used in the store name.
timings

{'600_1440_29': 68548.07, '600_1440_1': 46144.38, '365_262_262': 33111.03}

# 4: Check it worked

In [12]:
# Re-open every store written above and display it so the chunk shapes can be
# checked by eye.
for chunk_set in chunk_sets:
    # Rebuild the store name exactly as the write cell does.
    chunk_prefix = "_".join(map(str, chunk_set.values()))
    store_name = f"{zarr_directory}/{chunk_prefix}_CMIP6_daily_{model}_{variable}.zarr"
    store = s3fs.S3Map(root=f"{bucket}/{store_name}", s3=s3_fs, check=True)
    # Bind to a separate name so the source dataset `ds` is not overwritten:
    # the chunk-set cell reads its lat/lon sizes from `ds`, and rebinding it
    # here would silently change that cell's input if cells are re-run out of order.
    ds_zarr = xr.open_zarr(store, consolidated=True)
    display(ds_zarr)

Unnamed: 0,Array,Chunk
Bytes,2.35 GiB,95.58 MiB
Shape,"(730, 600, 1440)","(29, 600, 1440)"
Dask graph,26 chunks in 2 graph layers,26 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 2.35 GiB 95.58 MiB Shape (730, 600, 1440) (29, 600, 1440) Dask graph 26 chunks in 2 graph layers Data type float32 numpy.ndarray",1440  600  730,

Unnamed: 0,Array,Chunk
Bytes,2.35 GiB,95.58 MiB
Shape,"(730, 600, 1440)","(29, 600, 1440)"
Dask graph,26 chunks in 2 graph layers,26 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


Unnamed: 0,Array,Chunk
Bytes,2.35 GiB,3.30 MiB
Shape,"(730, 600, 1440)","(1, 600, 1440)"
Dask graph,730 chunks in 2 graph layers,730 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 2.35 GiB 3.30 MiB Shape (730, 600, 1440) (1, 600, 1440) Dask graph 730 chunks in 2 graph layers Data type float32 numpy.ndarray",1440  600  730,

Unnamed: 0,Array,Chunk
Bytes,2.35 GiB,3.30 MiB
Shape,"(730, 600, 1440)","(1, 600, 1440)"
Dask graph,730 chunks in 2 graph layers,730 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


Unnamed: 0,Array,Chunk
Bytes,2.35 GiB,95.58 MiB
Shape,"(730, 600, 1440)","(365, 262, 262)"
Dask graph,36 chunks in 2 graph layers,36 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 2.35 GiB 95.58 MiB Shape (730, 600, 1440) (365, 262, 262) Dask graph 36 chunks in 2 graph layers Data type float32 numpy.ndarray",1440  600  730,

Unnamed: 0,Array,Chunk
Bytes,2.35 GiB,95.58 MiB
Shape,"(730, 600, 1440)","(365, 262, 262)"
Dask graph,36 chunks in 2 graph layers,36 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [13]:
# Write a JSON index of the generated stores: one entry per chunk set, keyed by
# the store's file name, holding its S3 URL and the variable it contains.
import json

datasets = {}
for chunk_set in chunk_sets:
    # Same naming scheme as the write and verification cells.
    chunk_prefix = "_".join(map(str, chunk_set.values()))
    dataset_id = f"{chunk_prefix}_CMIP6_daily_{model}_{variable}.zarr"
    dataset_url = f"s3://{bucket}/{zarr_directory}/{dataset_id}"
    datasets[dataset_id] = {
        "dataset_url": dataset_url,
        "variable": variable,
    }

# The `with` block closes the file, so no explicit close() is needed. A
# dedicated handle name avoids rebinding `f`, which an earlier cell uses for
# the open S3 source file.
with open("cmip6-zarr-datasets.json", "w") as json_file:
    json.dump(datasets, json_file)
