# Create reference files for the COAWST forecast archive on AWS Open Data
We use [kerchunk](https://fsspec.github.io/kerchunk/) to create an individual reference file for each weekly NetCDF file,
then create the combined JSON that allows the entire collection to be accessed as a single dataset in Xarray.

In [None]:
import fsspec
import xarray as xr

from kerchunk.hdf import SingleHdf5ToZarr
from kerchunk.combine import MultiZarrToZarr, auto_dask, JustLoad

from dask.distributed import Client
import dask.bag as db
import ujson
from pathlib import Path
import numpy as np

We can read from AWS Open Data using `anon=True`:

In [None]:
# Read-only S3 filesystem: anon=True means no AWS credentials are needed.
fs_read = fsspec.filesystem('s3', anon=True, skip_instance_cache=True)

We can't *write* to AWS Open Data without credentials. We will supply them through environment variables rather than an AWS profile, so we don't pass `profile=` when creating `fs_write`; we only set `anon=False`:

In [None]:
# Credentialed S3 filesystem used for writing; no `profile=` is given because
# the credentials are expected to come from environment variables.
fs_write = fsspec.filesystem('s3', anon=False, skip_instance_cache=True)

In [None]:
# Every NetCDF file directly under the archive prefix, listed anonymously.
flist = fs_read.glob('s3://usgs-coawst/useast-archive/*.nc')
# S3 prefix that receives one reference JSON per NetCDF file (see gen_json).
json_dir = 's3://usgs-coawst/useast-archive/json2'

In [None]:
# Put the 's3://' scheme in front of every entry of the listing.
flist = ['s3://' + key for key in flist]

In [None]:
# Sanity check: number of files found, then the first and last URL in the list.
print(len(flist))
print(flist[0])
print(flist[-1])

#### Create references for each NetCDF file in parallel 

In [None]:
# Keyword arguments unpacked into fs_read.open() when a NetCDF file is read.
so = {'mode': 'rb', 'anon': True, 'skip_instance_cache': True}

In [None]:
def gen_json(u):
    """Write the kerchunk reference JSON for one NetCDF file.

    Parameters
    ----------
    u : str
        URL of the NetCDF/HDF5 file. It is opened through ``fs_read`` with the
        keyword arguments in ``so`` and is also handed to ``SingleHdf5ToZarr``
        as the URL of the file being referenced.

    Returns
    -------
    str
        URL of the JSON written through ``fs_write``:
        ``{json_dir}/<stem of u>.json``.
    """
    outf = f'{json_dir}/{Path(u).stem}.json'
    with fs_read.open(u, **so) as infile:
        h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)
        # Build and serialise the references *before* the output object is
        # opened: if translate() raises, no empty or truncated JSON is left
        # under json_dir for the later glob to pick up.
        payload = ujson.dumps(h5chunks.translate()).encode()
    with fs_write.open(outf, 'wb') as f:
        f.write(payload)
    return outf

In [None]:
import sys, os
# Make the shared user library directory under $HOME importable; nebari_tools
# is imported right after, so it presumably lives there (TODO confirm).
sys.path.append(os.path.join(os.environ['HOME'],'shared','users','lib'))
import nebari_tools as nbt

# AWS profile and region; both are reused when the Dask cluster is started.
aws_profile = 'coawst_open_data'
aws_region = 'us-west-2'
endpoint_url = f's3.{aws_region}.amazonaws.com'
# NOTE(review): presumably exports the profile's AWS credentials as environment
# variables, which fs_write relies on — confirm against nebari_tools.
nbt.set_credentials(profile=aws_profile, region=aws_region, endpoint_url=endpoint_url)

In [None]:
# Number of workers requested; also reused as the number of bag partitions.
worker_max = 30
# Start a Dask cluster through the nebari_tools helper.
# NOTE(review): use_existing_cluster=True presumably reuses a running cluster,
# and propagate_env=True presumably forwards this session's environment
# (including the AWS credentials) to the workers — confirm in nebari_tools.
client,cluster = nbt.start_dask_cluster(profile=aws_profile, worker_max=worker_max, 
                                      region=aws_region, use_existing_cluster=True,
                                      adaptive_scaling=False, wait_for_cluster=False, 
                                      worker_profile='Small Worker', 
                                      propagate_env=True) 

In [None]:
%%time
# One bag partition per requested worker; gen_json is mapped over every URL.
bag = db.from_sequence(flist, npartitions=worker_max).map(gen_json)
# Execute the bag; retries=20 allows failed tasks to be re-attempted.
bag.compute(retries=20)

In [None]:
# List the per-file reference JSONs found under json_dir.
json_list = fs_read.glob(f'{json_dir}/*.json')
# Put the 's3://' scheme in front of each listed path.
json_list = [f's3://{j}' for j in json_list]
# Sanity check: count, then the first and last entry.
print(len(json_list))
print(json_list[0])
print(json_list[-1])

In [None]:
import base64

def consolidate(out_):
    """Convert every ``bytes`` value of *out_* to ``str``, in place.

    ASCII-only bytes are decoded as ASCII. Anything else is base64-encoded and
    stored as the text ``"base64:" + <encoded data>``. Values that are not
    ``bytes`` are stored back unchanged. The same mapping is returned.
    """
    for key, value in out_.items():
        converted = value
        if isinstance(value, bytes):
            try:
                # Decoding doubles as the "is this pure ASCII?" test.
                converted = value.decode('ascii')
            except UnicodeDecodeError:
                converted = (b"base64:" + base64.b64encode(value)).decode()
        out_[key] = converted
    return out_

import zarr

def modify_attrs(out):
    """Set ``standard_name = 'time'`` on ``ocean_time`` in the store *out*.

    *out* is opened with ``zarr.open`` and the attribute is written through
    that handle; the same *out* object that was passed in is returned.
    """
    root = zarr.open(out)
    time_attrs = root.ocean_time.attrs
    time_attrs['standard_name'] = 'time'
    return out

def preprocess(out):
    """Run ``modify_attrs`` and then ``consolidate`` on *out*; return the result.

    Passed as the ``preprocess`` argument of ``MultiZarrToZarr``.
    """
    return consolidate(modify_attrs(out))

#### Create combined reference JSON file

In [None]:
# Combine the per-file reference JSONs into a single reference set.
mzz = MultiZarrToZarr(json_list,   
    # Concatenate the inputs along ocean_time.
    concat_dims = ['ocean_time'],
    # NOTE(review): per the kerchunk docs a "cf:" selector decodes the
    # coordinate values using CF conventions — confirm.
    coo_map={"ocean_time": "cf:ocean_time"},
    # Horizontal grid coordinates, presumably the same in every file.
    identical_dims=['lat_psi','lat_rho','lat_u','lat_v',
                    'lon_psi','lon_rho','lon_u','lon_v'],
                     # Sets the ocean_time standard_name and converts bytes
                     # values to str (see preprocess).
                     preprocess=preprocess)

In [None]:
%%time
# Build the combined references; d is later handed to the reference filesystem
# as fo= and dumped to JSON.
d = mzz.translate()

In [None]:
%%time
# Reference filesystem built directly from the in-memory references d.
fs5 = fsspec.filesystem("reference", fo=d, skip_instance_cache=True)

In [None]:
# Mapper over the root of the reference filesystem, opened with xarray's zarr
# engine to check the combined references; the 'dstart' variable is left out.
m = fs5.get_mapper("")
ds = xr.open_dataset(m, engine="zarr", 
                     backend_kwargs={'consolidated':False}, chunks={}, 
                     drop_variables=['dstart'])
# Last expression of the cell: shows the dataset summary.
ds

In [None]:
# Local file name of the combined references; reused as the S3 object name.
combined_json = 'combined2.json'

In [None]:
# Local filesystem, used to write combined_json to disk.
fs = fsspec.filesystem('file')

In [None]:
%%time
# Serialise the combined references d to the local file combined_json.
with fs.open(combined_json, 'wb') as f:
    f.write(ujson.dumps(d).encode())

In [None]:
# Destination URL of the combined JSON, under the same archive prefix as the
# NetCDF files.
combined_json_aws = f's3://usgs-coawst/useast-archive/{combined_json}'
# Last expression of the cell: shows the URL.
combined_json_aws

In [None]:
# Copy the local combined JSON to S3 through the credentialed filesystem.
fs_write.upload(combined_json, combined_json_aws)