In [None]:
# Uncomment to install s3fs (imported by the final code cell) into the kernel's environment:
# %pip install -U s3fs

In [1]:
from rich import print

## Pangeo Forge: Transforming archival data into analysis-ready, cloud-optimized (ARCO) data stores

[NSF Award 2026932](https://www.nsf.gov/awardsearch/showAward?AWD_ID=2026932)

_Frontiers In Climate_ paper: https://doi.org/10.3389/fclim.2021.782909

Slides: https://github.com/cisaacstern/pangeo-forge-slides/blob/esip-feb-2022/slides.ipynb

## NOAA Optimum Interpolation Sea Surface Temperature (OISST) `http` index
https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/
```
 │
 ├──198109/
 │   ├──oisst-avhrr-v02r01.19810901.nc
 │  ...
 │   └──oisst-avhrr-v02r01.19810930.nc
...
 └──202110/
```

In [2]:
!wget 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810901.nc'

--2022-02-28 09:46:55--  https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810901.nc
Resolving www.ncei.noaa.gov... 2610:20:8040:2::177, 2610:20:8040:2::172, 2610:20:8040:2::168, ...
Connecting to www.ncei.noaa.gov|2610:20:8040:2::177|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1714749 (1.6M) [application/x-netcdf]
Saving to: 'oisst-avhrr-v02r01.19810901.nc.17'


2022-02-28 09:46:57 (2.74 MB/s) - 'oisst-avhrr-v02r01.19810901.nc.17' saved [1714749/1714749]



In [3]:
import xarray as xr

# Open the single daily file fetched by the wget cell above.
ds = xr.open_dataset("oisst-avhrr-v02r01.19810901.nc")

# Keep the size in `nbytes`: a later cell multiplies it by the number of files.
nbytes = ds.nbytes
size_mb = nbytes / 1e6
print(f"{size_mb:.2f} MB")
print(ds)

In [4]:
import pandas as pd

# One file per calendar day; the target range works out to 14372 files.
dates = pd.date_range(start="1981-09-01", end="2021-01-05", freq="D")

# Extrapolate the archive size from the single-file `nbytes` measured earlier.
n_files = len(dates)
total_gb = (n_files * nbytes) / 1e9
print(f"{n_files} files -> {total_gb:.2f} GB")

## The old way: manual Zarr conversion

Doing this by hand makes it hard to achieve:

- Reproducibility (i.e. provenance chain)
- Scalability
- Maintainability


# A better way: Pangeo Forge

1. `pangeo-forge-recipes`: Encodes the complete ARCO data transformation pipeline as code that can be version controlled.
2. **Pangeo Forge Cloud**: Automation of recipe execution in the cloud. Integrated with GitHub.

In [7]:
# https://github.com/pangeo-forge/staged-recipes/blob/master/recipes/noaa-oisst/recipe.py

from noaa_oisst_recipe import recipe

# Preview the first two and last two (index, url) pairs of the file pattern.
# The tail is selected by position from the end instead of a hard-coded index,
# so the preview still shows the final entries if the pattern's length changes.
items = list(recipe.file_pattern.items())
head, rest = items[:2], items[2:]  # slicing `rest` avoids repeats on short patterns
for index, url in head + rest[-2:]:
    print(repr(index), url)

<img src='architecture.png'>

In [6]:
import s3fs
import xarray as xr

# Anonymous access to the S3-compatible service at `endpoint_url`.
endpoint_url = "https://ncsa.osn.xsede.org"
fs_osn = s3fs.S3FileSystem(
    anon=True,
    client_kwargs=dict(endpoint_url=endpoint_url),
)

# Open the Zarr store through a filesystem mapper, using consolidated metadata.
path = "s3://Pangeo/pangeo-forge/noaa_oisst/v2.1-avhrr.zarr"
store = fs_osn.get_mapper(path)
ds = xr.open_zarr(store, consolidated=True)
print(f"{ds.nbytes/1e9:.2f} GB")
print(ds)

<center>
<img src="pangeo-forge-logo-blue-2.png">

<h1> https://pangeo-forge.readthedocs.io/ </h1>
</center>