# Create Zarr Stores with Different Chunk Shapes

In this notebook, we create Zarr stores for the CMIP6 TAS daily data available in NetCDF on S3. This method of creating Zarr stores uses [pangeo-forge](https://pangeo-forge.readthedocs.io/) and it's [recipes](https://pangeo-forge.readthedocs.io/en/latest/pangeo_forge_recipes/recipe_user_guide/index.html) pattern.

## 1.1 Install and import libraries

In [2]:
%%capture
!pip uninstall apache-beam -y
!pip install 'apache-beam[interactive, dataframe]==2.48.0'
!pip install git+https://github.com/carbonplan/cmip6-downscaling.git
!pip install git+https://github.com/norlandrhagen/pangeo-forge-recipes.git@coord_chunking
!pip install loguru

In [25]:
import apache_beam as beam
import fsspec
import os
from pangeo_forge_recipes.patterns import FilePattern, ConcatDim, MergeDim
from pangeo_forge_recipes.transforms import OpenURLWithFSSpec, OpenWithXarray, StoreToZarr
from pangeo_forge_recipes.storage import FSSpecTarget
import re
import rioxarray
import s3fs
import xarray as xr

import sys; sys.path.append('..')
from profiler.main import Timer
import helpers.eodc_hub_role as eodc_hub_role

In [26]:
credentials = eodc_hub_role.fetch_and_set_credentials()
bucket = 'nasa-eodc-data-store'

Note: This is adapted from https://github.com/carbonplan/benchmark-maps/blob/datasets/stores/01b_cmip6_netcdf_to_zarr.ipynb.

## 1.2 Set parameters

In [5]:
#parameters
model = "GISS-E2-1-G"
variable = "tas"
anon=True

In [6]:
# Initiate fsspec filesystems for reading and writing
s3_path = f"s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/{model}/historical/r1i1p1*/{variable}/*"
fs_read = fsspec.filesystem("s3", anon=anon, skip_instance_cache=False)
fs_write = fsspec.filesystem("")

In [7]:
# Retrieve list of available months
files_paths = fs_read.glob(s3_path)
print(f"{len(files_paths)} discovered from {s3_path}")

65 discovered from s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/GISS-E2-1-G/historical/r1i1p1*/tas/*


In [7]:
files_paths[0]

'nex-gddp-cmip6/NEX-GDDP-CMIP6/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950.nc'

## 1.3 Test we can open the files

In [8]:
fs_s3 = s3fs.S3FileSystem(anon=True)
filepath = f's3://{files_paths[0]}'
f = fs_s3.open(filepath, mode='rb')
ds = xr.open_dataset(f)
ds

# 2: Setup the destination

In [9]:
def format_function(time):
    pattern = r"\b\d{4}\b"
    return re.sub(pattern, str(time), filepath)

years = list(range(1950, 1952))
time_dim = ConcatDim("time", keys=years)

pattern = FilePattern(format_function, time_dim, file_type="netcdf4")
pattern = FilePattern.prune(pattern, nkeep=2)

In [27]:
fs = s3fs.S3FileSystem(
    key=credentials['AccessKeyId'],
    secret=credentials['SecretAccessKey'],
    token=credentials['SessionToken'], 
    anon=False
)
target_root = FSSpecTarget(fs=fs, root_path=bucket)
print(f"Using {pattern.items()}")

Using <generator object FilePattern.items at 0x7f1a475722d0>


# 3: Set different target chunks

For different sets of chunks, generate a zarr store.

In [11]:
chunk_sets = []
# Optimized for analysis
temporal_target_chunks = { 'lat': ds.lat.shape[0], 'lon': ds.lon.shape[0], 'time': 29 }
chunk_sets.append(temporal_target_chunks)

In [12]:
# Optimized for visualization at a single time step
global_target_chunks = { 'lat': ds.lat.shape[0], 'lon': ds.lon.shape[0], 'time': 1 }
global_target_chunks
chunk_sets.append(global_target_chunks)

In [13]:
# Optimized for time series
#spatial_target_chunks = calc_auspicious_chunks_dict(ds[variable], chunk_dims=('lat','lon',))
spatial_target_chunks = {'time': 365, 'lat': 262, 'lon': 262}
chunk_sets.append(spatial_target_chunks)

In [14]:
chunk_sets

[{'lat': 600, 'lon': 1440, 'time': 29},
 {'lat': 600, 'lon': 1440, 'time': 1},
 {'time': 365, 'lat': 262, 'lon': 262}]

In [15]:
timings = {}
for chunk_set in chunk_sets:
    dir_path = str(("_").join(map(str, chunk_set.values())))
    store_name = f"test-data/cmip6-zarr/{dir_path}/CMIP6_daily_{model}_{variable}.zarr"
    with Timer() as t:
        print(f"Writing to {target_root}/{store_name}")
        transforms = (
            beam.Create(pattern.items())
            | OpenURLWithFSSpec(open_kwargs={'anon': True})
            | OpenWithXarray(file_type=pattern.file_type)
            | StoreToZarr(
                store_name=store_name,
                target_root=target_root,
                combine_dims=pattern.combine_dim_keys,
                target_chunks=chunk_set,
            )
        )
        # Commented out so we don't re-run if we don't intend to
        with beam.Pipeline() as p:
            p | transforms
    timings[dir_path] = round(t.elapsed * 1000, 2) 

Writing to FSSpecTarget(fs=<s3fs.core.S3FileSystem object at 0x7f1a51eb2680>, root_path='nasa-eodc-data-store')/test-data/cmip6-zarr/600_1440_29/CMIP6_daily_GISS-E2-1-G_tas.zarr


Writing to FSSpecTarget(fs=<s3fs.core.S3FileSystem object at 0x7f1a51eb2680>, root_path='nasa-eodc-data-store')/test-data/cmip6-zarr/600_1440_1/CMIP6_daily_GISS-E2-1-G_tas.zarr
Writing to FSSpecTarget(fs=<s3fs.core.S3FileSystem object at 0x7f1a51eb2680>, root_path='nasa-eodc-data-store')/test-data/cmip6-zarr/365_262_262/CMIP6_daily_GISS-E2-1-G_tas.zarr


In [16]:
timings

{'600_1440_29': 83717.75, '600_1440_1': 313428.93, '365_262_262': 335842.43}

Guess as to timing differences - writing more chunks and having to consolidate chunks takes more time.

600, 1440, 1 - have to write a lot of chunks ~722 total chunks
365, 262, 262 - have to conosolidate to time chunk. Also have to break apart existing chunks

In [37]:
dir_path = str(("_").join(map(str, chunk_sets[2].values())))
store_name = f"{dir_path}/CMIP6_daily_{model}_{variable}.zarr"
store_loc = f"{bucket}/test-data/cmip6-zarr/{store_name}"
store_loc

'nasa-eodc-data-store/test-data/cmip6-zarr/365_262_262/CMIP6_daily_GISS-E2-1-G_tas.zarr'

In [38]:
store = s3fs.S3Map(root=store_loc, s3=fs, check=True)

In [42]:
xr.open_dataset(store, engine='zarr', decode_times=False)#, consolidated=True)

type: destination buffer too small; expected at least 4800, got 2096

In [None]:
store_name

# 4: Check it worked

In [None]:
for chunk_set in chunk_sets:
    dir_path = str(("_").join(map(str, chunk_set.values())))
    store_name = f"{dir_path}/CMIP6_{temporal_resolution}_{model}_{variable}.zarr"
    key = f"{store_name}/.zmetadata"
    response = s3.head_object(Bucket=bucket, Key=key)
    object_size = response['ContentLength']
    object_size_MB = object_size / (1024)    
    print(f"Size of metadata {object_size_MB} KB")
    
    store = s3fs.S3Map(root=f"{bucket}/{store_name}", s3=fs, check=True)
    ds = xr.open_zarr(store, consolidated=True)
    print(ds)