# National Water Model - Short Range Forecast

## Dataset description and why it’s a good kerchunk candidate
The National Water Model dataset is produced by the National Oceanic and Atmospheric Administration's (NOAA's) Office of Water Prediction. It is a forecast model of water resources, providing multiple variables across the continental United States (CONUS).
This dataset is available via the Registry of Open Data on AWS as a collection of netCDF files that do not require any login authentication. Using `kerchunk`, we will demonstrate how to build a kerchunk index so that this dataset can be read as if it were an ARCO (Analysis-Ready, Cloud-Optimized) dataset.



In [None]:
# Module Imports
from kerchunk.hdf import SingleHdf5ToZarr
from kerchunk.combine import MultiZarrToZarr
import fsspec_reference_maker
import fsspec
import xarray as xr
import os
import ujson
from tqdm import tqdm

## Create input file list

In [None]:
# Anonymous (credential-free) fsspec filesystem for AWS S3.
fs = fsspec.filesystem('s3', anon=True, skip_instance_cache=True)

# Glob the bucket for the `f001` channel_rt file of every short_range
# cycle; these are the inputs to the kerchunk index generation.
flist = fs.glob('noaa-nwm-pds/nwm.*/short_range/nwm.*.short_range.channel_rt.f001.conus.nc')

# Build a "best time series": the f001 file of each past cycle, followed by
# every file of the most recent cycle.
# NOTE(review): an earlier comment here said the first day of data is removed
# because this is a rolling collection, but nothing below trims `flist` --
# confirm whether that trimming is still wanted.
newest = flist[-1]
last_dir = os.path.dirname(newest)
last_file = os.path.basename(newest).split('.')
cycle_prefix = '.'.join(last_file[:3])
last_files = fs.glob(f'{last_dir}/{cycle_prefix}.channel_rt.*.conus.nc')

# The first entry of last_files is already the final entry of flist,
# so skip it to avoid a duplicate.
flist += last_files[1:]

# The globbed paths carry no scheme; prepend "s3://" so that each entry is a
# full URL that fsspec recognises as living on S3.
urls = [f's3://{path}' for path in flist]



## Iterate through filelist and create kerchunk indices as .json files

In [None]:
# Keyword arguments handed to the file-open call inside gen_json.
so = {
    'mode': 'rb',
    'anon': True,
    'default_fill_cache': False,
    'default_cache_type': 'first',
}
# Local directory that receives the per-file kerchunk JSON references.
output_dir = './NWM_dir'

# Use kerchunk's SingleHdf5ToZarr to transform netcdf to kerchunk index.
def gen_json(u: str, output_dir: str) -> str:
    """Write the kerchunk reference JSON for a single remote netCDF file.

    Args:
        u: URL of the netCDF file. The output name is built from elements 3
            and 5 of the '/'-split URL, i.e. the layout
            ``s3://<bucket>/<date dir>/<subdir>/<file name>`` is expected.
        output_dir: Local directory for the JSON file; created if missing.

    Returns:
        Path of the JSON reference file that was written.

    Raises:
        IndexError: If ``u`` has fewer than six '/'-separated elements.
    """
    p = u.split('/')
    date = p[3]
    fname = p[5]
    outf = f'{output_dir}/{date}.{fname}.json'

    # The output directory is not created anywhere else; without this the
    # open() below fails with FileNotFoundError on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)

    # Keep the remote file open only while kerchunk scans it.
    with fs.open(u, **so) as infile:
        h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)
        refs = h5chunks.translate()

    with open(outf, 'wb') as f:
        f.write(ujson.dumps(refs).encode())
    return outf

# Generate one kerchunk reference file per input URL, collecting the paths of
# the JSON files written. Runs serially; a good candidate for dask.
output_files = [gen_json(url, output_dir) for url in tqdm(urls)]
    


## Combine .json kerchunk reference files and write a combined kerchunk index

In [None]:
# File name of the combined kerchunk reference record.
output_fname = 'NWM.json'

# Merge the single-file reference sets into one multi-file kerchunk dataset,
# concatenated along the "time" dimension.
mzz = MultiZarrToZarr(output_files, concat_dims=['time'])
d = mzz.translate()

# Serialise the combined references to a local JSON file.
with open(output_fname, 'wb') as out:
    out.write(ujson.dumps(d).encode())

## Load kerchunked dataset

In [None]:
# Create an fsspec reference filesystem from the kerchunk output.
# The references point at objects in a public S3 bucket, so the remote
# filesystem is configured explicitly for anonymous S3 access; otherwise
# chunk reads depend on AWS credentials being available on this machine.
# NOTE: this rebinds `fs`, which gen_json also uses for S3 access -- re-run
# the file-list cell before regenerating any reference files.
fs = fsspec.filesystem(
    "reference",
    fo=output_fname,
    remote_protocol="s3",
    remote_options={"anon": True},
)
m = fs.get_mapper("")
# Open as unconsolidated zarr so xarray skips the consolidated-metadata
# lookup (and its fallback warning) for the reference store.
ds = xr.open_zarr(m, consolidated=False)

In [None]:
# Bare expression as the last line of the cell: the notebook displays the
# dataset's summary.
ds