# Create Zarr Stores with Different Chunk Shapes

In this notebook, we create Zarr stores for the CMIP6 TAS daily data available in NetCDF on S3. This method of creating Zarr stores uses [pangeo-forge](https://pangeo-forge.readthedocs.io/) and its [recipes](https://pangeo-forge.readthedocs.io/en/latest/pangeo_forge_recipes/recipe_user_guide/index.html) pattern.

## 1.1 Install and import libraries

In [None]:
%%capture
!pip uninstall apache-beam -y
!pip install 'apache-beam[interactive, dataframe]==2.48.0' git+https://github.com/carbonplan/cmip6-downscaling.git git+https://github.com/pangeo-forge/pangeo-forge-recipes.git@beam-refactor

In [6]:
import apache_beam as beam
import boto3
from botocore.exceptions import ClientError
import fsspec
import os
from pangeo_forge_recipes.patterns import FilePattern, ConcatDim, MergeDim
from pangeo_forge_recipes.transforms import OpenURLWithFSSpec, OpenWithXarray, StoreToZarr
from pangeo_forge_recipes.storage import FSSpecTarget
import re
import rioxarray
import s3fs
import xarray as xr

import sys; sys.path.append('..')
import eodc_hub_role

In [None]:
# Name of the S3 bucket that receives the generated Zarr stores.
bucket = 'nasa-eodc-data-store'
# Credentials mapping; later cells read its 'AccessKeyId', 'SecretAccessKey'
# and 'SessionToken' entries to build the authenticated S3 clients.
credentials = eodc_hub_role.fetch_and_set_credentials()

Note: This is adapted from https://github.com/carbonplan/benchmark-maps/blob/datasets/stores/01b_cmip6_netcdf_to_zarr.ipynb.

## 1.2 Set parameters

In [None]:
# Notebook parameters: everything a reader might want to tune.
variable = "tas"                  # variable directory used in the source path
model = "GISS-E2-1-G"             # model directory used in the source path
temporal_resolution = "daily"     # only used when naming the output stores
anon = True                       # passed as `anon` to the read filesystem

In [None]:
# fsspec filesystems: `fs_read` lists the source files on S3 (access mode taken
# from the `anon` parameter); `fs_write` is created with an empty protocol.
fs_write = fsspec.filesystem("")
fs_read = fsspec.filesystem("s3", skip_instance_cache=False, anon=anon)

# Glob for every source file of the historical run of this model and variable.
s3_path = (
    "s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/"
    f"{model}/historical/r1i1p1*/{variable}/*"
)

In [None]:
# Retrieve the list of source files matching the glob.
files_paths = fs_read.glob(s3_path)
# Fail fast with a clear message: the following cells index files_paths[0],
# which would otherwise surface as an unexplained IndexError.
if not files_paths:
    raise FileNotFoundError(f"No files matched {s3_path}")
print(f"{len(files_paths)} discovered from {s3_path}")

In [None]:
# Display the first discovered path; the next section opens it as a sample file.
files_paths[0]

## 1.3 Test we can open the files

In [None]:
# Open the first discovered file as a sample dataset. Its lat/lon sizes are
# reused further down when the target chunk shapes are defined.
# Honor the `anon` parameter instead of hard-coding anonymous access, so the
# setting from the parameters cell applies to every read of the source bucket.
fs_s3 = s3fs.S3FileSystem(anon=anon)
# The glob results are prefixed with the s3:// scheme here to form a full URL.
filepath = f's3://{files_paths[0]}'
# NOTE: `f` is deliberately not closed here; `ds` is built on top of this file
# object and is still used by later cells.
f = fs_s3.open(filepath, mode='rb')
ds = xr.open_dataset(f)
ds

# 2: Set up the destination

In [None]:
def format_function(time):
    """Return the source file URL for the given year.

    The URL is derived from the module-level ``filepath`` (the sample file
    opened earlier) by replacing every standalone four-digit number with
    ``time``.

    Args:
        time: Year key supplied by the file pattern; converted with ``str``.

    Returns:
        The ``filepath`` string with its four-digit year replaced.
    """
    # r"\b\d{4}\b" is not sufficient: "_" counts as a word character, so there
    # is no word boundary between "_" and a digit and a name such as
    # "..._1950.nc" would never match, leaving every key mapped to the same
    # URL. Digit lookarounds match any run of exactly four digits instead.
    year_pattern = r"(?<!\d)\d{4}(?!\d)"
    return re.sub(year_pattern, str(time), filepath)

# Years to ingest; the range end is exclusive, so this yields 1950 and 1951.
years = [*range(1950, 1952)]
# Files are concatenated along "time", one key per year.
time_dim = ConcatDim("time", keys=years)

# Build the file pattern, then prune it with nkeep=2
# (presumably keeps only the first two keys — confirm against pangeo-forge docs).
pattern = FilePattern(format_function, time_dim, file_type="netcdf4").prune(nkeep=2)

In [None]:
# Display the combine dimensions the pattern was built with.
pattern.combine_dims

In [None]:
# Authenticated S3 filesystem (temporary credentials) used for writing the stores.
fs = s3fs.S3FileSystem(
    key=credentials['AccessKeyId'],
    secret=credentials['SecretAccessKey'],
    token=credentials['SessionToken'],
    anon=False
)
# Every store is written relative to the bucket root.
target_root = FSSpecTarget(fs=fs, root_path=bucket)
# Materialize the items so the actual (key, URL) pairs are printed; formatting
# pattern.items() directly would show only an iterator repr if it is lazy.
print(f"Using {list(pattern.items())}")

# 3: Set different target chunks

For each set of target chunks, generate a Zarr store.

In [None]:
# Optimized for analysis: each chunk spans the full lat/lon extent of the
# sample dataset and 29 time steps.
temporal_target_chunks = {
    'lat': len(ds.lat),
    'lon': len(ds.lon),
    'time': 29,
}
# (Re)start the list of chunk sets with this first entry.
chunk_sets = [temporal_target_chunks]

In [None]:
# Optimized for visualization at a single time step: each chunk spans the full
# lat/lon extent of the sample dataset and exactly one time step.
global_target_chunks = { 'lat': ds.lat.shape[0], 'lon': ds.lon.shape[0], 'time': 1 }
# Guarded append so re-running this cell does not add a duplicate entry.
# (The former bare `global_target_chunks` expression was dropped: it was not
# the last statement of the cell, so it never displayed anything.)
if global_target_chunks not in chunk_sets:
    chunk_sets.append(global_target_chunks)

In [None]:
# Optimized for time series: long time chunks over small spatial tiles.
# The chunk sizes are hard-coded. The author left this alternative disabled:
#   calc_auspicious_chunks_dict(ds[variable], chunk_dims=('lat','lon',))
# Key order matters: the store directory name is built from the dict values in
# insertion order, so this set yields "365_262_262".
spatial_target_chunks = {'time': 365, 'lat': 262, 'lon': 262}
# Guarded append so re-running this cell does not add a duplicate entry.
if spatial_target_chunks not in chunk_sets:
    chunk_sets.append(spatial_target_chunks)

In [None]:
# Display all chunk sets; one Zarr store is generated per entry below.
chunk_sets

In [None]:
# boto3 client (temporary credentials) used to test whether a store already exists.
s3 = boto3.client(
    's3',
    aws_access_key_id=credentials['AccessKeyId'],
    aws_secret_access_key=credentials['SecretAccessKey'],
    aws_session_token=credentials['SessionToken']
)

for chunk_set in chunk_sets:
    # Directory name: the chunk sizes joined by "_" (dict insertion order) plus
    # a "_2" suffix, e.g. "365_262_262_2".
    dir_path = "_".join(map(str, chunk_set.values())) + "_2"
    store_name = f"{dir_path}/CMIP6_{temporal_resolution}_{model}_{variable}.zarr"
    # The consolidated-metadata object serves as the marker for "store exists".
    key = f"{store_name}/.zmetadata"
    try:
        s3.head_object(Bucket=bucket, Key=key)
    except ClientError as e:
        # Only "not found" means the store must be written; anything else
        # (permissions, throttling, ...) is re-raised with its traceback intact.
        if e.response['Error']['Code'] != '404':
            print(f"Error occurred: {e}")
            raise
        print(f"File '{key}' does not exist in bucket '{bucket}'.")
    else:
        # Store already present: skip it so existing data is not rewritten.
        print(f"File '{store_name}' exists in bucket '{bucket}'.")
        continue

    # Report the destination as a plain S3 URL rather than the target object's repr.
    print(f"Writing to s3://{bucket}/{store_name}")
    transforms = (
        beam.Create(pattern.items())
        # Read the source files using the same `anon` setting as the listing step.
        | OpenURLWithFSSpec(open_kwargs={'anon': anon})
        | OpenWithXarray(file_type=pattern.file_type)
        | StoreToZarr(
            store_name=store_name,
            target_root=target_root,
            combine_dims=pattern.combine_dim_keys,
            target_chunks=chunk_set,
        )
    )
    # The pipeline runs for every store the check above found missing; the
    # existence check is the only guard against re-running this work.
    with beam.Pipeline() as p:
        p | transforms

In [None]:
# List the objects under the `time` array of one written store. The bucket and
# the "600_1440_29_2" directory are hard-coded here (presumably the first chunk
# set plus the "_2" suffix — confirm against the printed chunk sets).
!aws s3 ls s3://nasa-eodc-data-store/600_1440_29_2/CMIP6_daily_GISS-E2-1-G_tas.zarr/time/

In [None]:
# Build the store path for the second chunk set. The "_2" suffix must match
# the directory naming used by the writing loop; without it this would point
# at a store that loop never wrote.
dir_path = "_".join(map(str, chunk_sets[1].values())) + "_2"
store_name = f"{dir_path}/CMIP6_{temporal_resolution}_{model}_{variable}.zarr"

In [None]:
# Key-value view of the selected store on S3, built on the authenticated
# filesystem; check=True is passed through to S3Map.
store = s3fs.S3Map(
    s3=fs,
    root=f"{bucket}/{store_name}",
    check=True,
)

In [None]:
# Open the selected store using its consolidated metadata and display it.
xr.open_zarr(store, consolidated=True)

In [None]:
# Display the bucket-relative path of the store opened above.
store_name

# 4: Check it worked

In [None]:
# For every chunk set, report the size of the store's consolidated metadata
# and open the store to confirm it is readable.
for chunk_set in chunk_sets:
    # The "_2" suffix must match the directory naming used by the writing loop,
    # otherwise this checks stores that loop never wrote.
    dir_path = "_".join(map(str, chunk_set.values())) + "_2"
    store_name = f"{dir_path}/CMIP6_{temporal_resolution}_{model}_{variable}.zarr"
    key = f"{store_name}/.zmetadata"
    # head_object raises ClientError if the metadata object is missing.
    response = s3.head_object(Bucket=bucket, Key=key)
    object_size = response['ContentLength']
    # ContentLength divided by 1024 is kibibytes; the variable name now says
    # so, matching the unit in the printed message.
    object_size_KB = object_size / 1024
    print(f"Size of metadata {object_size_KB} KB")

    store = s3fs.S3Map(root=f"{bucket}/{store_name}", s3=fs, check=True)
    # NOTE: this rebinds `ds`, replacing the sample dataset opened earlier.
    ds = xr.open_zarr(store, consolidated=True)
    print(ds)