# Check out simple pangeo-forge recipe

This notebook demonstrates how the default behavior of pangeo-forge's `StoreToZarr` results in coordinates being chunked.

In [None]:
import apache_beam as beam
import fsspec
import h5py
import os
from pangeo_forge_recipes.patterns import FilePattern, ConcatDim
from pangeo_forge_recipes.transforms import OpenURLWithFSSpec, OpenWithXarray, StoreToZarr
import re
import s3fs
from tempfile import TemporaryDirectory
import xarray as xr

In [2]:
#!pip install pangeo_forge_recipes==0.10.0
#!pip install 'apache-beam[interactive, dataframe]==2.48.0'
!pip show pangeo_forge_recipes

Name: pangeo-forge-recipes
Version: 0.10.0
Summary: Pipeline tools for building and publishing analysis ready datasets.
Home-page: https://github.com/pangeo-forge/pangeo-forge-recipes
Author: 
Author-email: 
License: Apache
Location: /srv/conda/envs/notebook/lib/python3.10/site-packages
Requires: cftime, dask, distributed, fsspec, h5netcdf, h5py, intake, intake-xarray, kerchunk, mypy-extensions, netcdf4, numcodecs, setuptools, xarray, zarr
Required-by: 


## 1. Set parameters

In [3]:
# Notebook parameters: dataset selection plus the access mode used for S3.
model, variable = "GISS-E2-1-G", "tas"
temporal_resolution = "daily"
anon = True  # forwarded as `anon` when the S3 filesystem is created

In [4]:
# Create the fsspec filesystems: S3 for reading, and the filesystem fsspec
# returns for an empty protocol for writing (presumably local - confirm).
fs_read = fsspec.filesystem("s3", anon=anon, skip_instance_cache=False)
fs_write = fsspec.filesystem("")

# Glob matching every file of the selected model/variable in the historical run.
s3_path = (
    "s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/"
    f"{model}/historical/r1i1p1*/{variable}/*"
)

In [5]:
# Retrieve the list of objects matching the glob.
# NOTE(review): file names end in a four-digit year (e.g. `..._1950.nc`), so
# each match looks like one yearly file rather than a month - confirm.
files_paths = fs_read.glob(s3_path)
print(f"{len(files_paths)} discovered from {s3_path}")

65 discovered from s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/GISS-E2-1-G/historical/r1i1p1*/tas/*


In [6]:
# Peek at the first match; the listed path carries no "s3://" scheme prefix.
files_paths[0]

'nex-gddp-cmip6/NEX-GDDP-CMIP6/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950.nc'

## 2. Test that we can open the files

In [7]:
# Open the first discovered file directly from S3 to confirm it is readable.
# `anon` comes from the parameters cell, so this check uses the same access
# mode as `fs_read` instead of a separately hard-coded value.
fs_s3 = s3fs.S3FileSystem(anon=anon)
filepath = f's3://{files_paths[0]}'

# The handle is deliberately not closed here: `ds` is reused by later cells,
# and xarray presumably reads from the open file lazily - confirm before closing.
f = fs_s3.open(filepath, mode='rb')
ds = xr.open_dataset(f)
ds

## Note: tas is chunked along time (one time step per chunk), but the time coordinate itself is stored as a single chunk.

In [8]:
# Print the on-disk HDF5 chunk shape of the data variable, then of the time coordinate.
with fs_read.open(filepath, 'rb') as s3_file, h5py.File(s3_file, 'r') as hdf5_file:
    for dataset_name in ('tas', 'time'):
        print(hdf5_file[dataset_name].chunks)

(1, 600, 1440)
(365,)


## 3. Define the file pattern and set the destination

In [9]:
def format_function(time):
    """Return the URL of the yearly file for ``time``, derived from ``filepath``.

    Args:
        time: Year key supplied for the ``time`` dimension; rendered with ``str()``.

    Returns:
        The module-level ``filepath`` with the four-digit year immediately
        before the ``.nc`` extension replaced by ``time``.

    Raises:
        ValueError: If ``filepath`` has no four-digit year right before ``.nc``.
    """
    # A word-boundary anchor cannot be used here: in "..._gn_1950.nc" the year
    # follows "_", which is a word character, so there is no boundary in front
    # of it. With the old pattern nothing was substituted and every key mapped
    # to the same file. Anchor on non-digit context and the extension instead.
    year_before_extension = r"(?<!\d)\d{4}(?=\.nc$)"
    # A callable replacement keeps `str(time)` literal (no escape processing).
    url, n_replaced = re.subn(
        year_before_extension, lambda _match: str(time), filepath, count=1
    )
    if n_replaced != 1:
        raise ValueError(
            f"Could not find a four-digit year before '.nc' in {filepath!r}"
        )
    return url

# Two year keys (1950 and 1951), concatenated along the "time" dimension.
years = [*range(1950, 1952)]
time_dim = ConcatDim("time", keys=years)

# Build the full pattern first, then prune it with nkeep=2 and keep the
# pruned pattern under the name the rest of the notebook uses.
full_pattern = FilePattern(format_function, time_dim, file_type="netcdf4")
pattern = FilePattern.prune(full_pattern, nkeep=2)

In [10]:
# Inspect the combine dimensions registered on the pattern.
pattern.combine_dims

(ConcatDim(name='time', nitems_per_file=None),)

In [11]:
# The Zarr store is written into a throwaway directory. Keep `td` referenced:
# the directory is removed once the TemporaryDirectory object is cleaned up.
store_name = "test-cmip6.zarr"
td = TemporaryDirectory()
target_root = td.name
target_path = os.path.join(td.name, store_name)
target_path

'/tmp/tmp63u7fbmt/test-cmip6.zarr'

## 4. Set different target chunks

In [12]:
# One chunk spans the full lat/lon extent of the sample dataset and a single time step.
global_target_chunks = {dim: ds[dim].shape[0] for dim in ("lat", "lon")}
global_target_chunks["time"] = 1
global_target_chunks

{'lat': 600, 'lon': 1440, 'time': 1}

## 5. Run the pipeline

In [13]:
# Pipeline stages, in order: emit the pattern's items, open each URL with
# anonymous access, open the result with xarray, write everything to Zarr.
source_items = beam.Create(pattern.items())
open_urls = OpenURLWithFSSpec(open_kwargs={"anon": True})
open_datasets = OpenWithXarray(file_type=pattern.file_type)
write_zarr = StoreToZarr(
    target_root=target_root,
    store_name=store_name,
    target_chunks=global_target_chunks,
    combine_dims=pattern.combine_dim_keys,
)
transforms = source_items | open_urls | open_datasets | write_zarr

# The pipeline runs when the context manager exits.
with beam.Pipeline() as p:
    p | transforms

## Coordinates are chunked - why?

A similar result occurs if we chunk the variable along the lat and lon dimensions: those coordinates will then be chunked as well.

In [15]:
# List the entries stored under the `time` array of the Zarr store.
# NOTE(review): each numbered entry is presumably one chunk of the time coordinate - confirm.
!ls {target_path}/time/

0    14   181  222  264  305  347  389	43   471  512  554  596  637  679  72
1    140  182  223  265  306  348  39	430  472  513  555  597  638  68   720
10   141  183  224  266  307  349  390	431  473  514  556  598  639  680  721
100  142  184  225  267  308  35   391	432  474  515  557  599  64   681  722
101  143  185  226  268  309  350  392	433  475  516  558  6	 640  682  723
102  144  186  227  269  31   351  393	434  476  517  559  60	 641  683  724
103  145  187  228  27	 310  352  394	435  477  518  56   600  642  684  725
104  146  188  229  270  311  353  395	436  478  519  560  601  643  685  726
105  147  189  23   271  312  354  396	437  479  52   561  602  644  686  727
106  148  19   230  272  313  355  397	438  48   520  562  603  645  687  728
107  149  190  231  273  314  356  398	439  480  521  563  604  646  688  729
108  15   191  232  274  315  357  399	44   481  522  564  605  647  689  73
109  150  192  233  275  316  358  4	440  482  523  565  606  648  69  