# Create Kerchunk references from a file pattern or a list of files on Azure Blob Storage

In [None]:
#!pip install autopep8

import fsspec
import ujson
import xarray as xr
from kerchunk.combine import MultiZarrToZarr
from kerchunk.hdf import SingleHdf5ToZarr
from tqdm import tqdm

In [None]:
import adlfs

# Print the installed adlfs version (handy when troubleshooting).
print(adlfs.__version__)

In [None]:
# Uncomment the next line to list the file-system protocols available in fsspec.
#fsspec.available_protocols()

## Credentials (needs improvement: should be obtained from a credential source instead of being hard-coded)

In [None]:
# Storage-account credentials handed to fsspec for the "abfs" protocol.
# The values are read from the environment when available, so real keys do not
# have to be pasted into (and saved with) the notebook. The placeholder strings
# are only a fallback and must then be replaced by hand.
import os

account_dict = {
    "account_name": os.environ.get(
        "AZURE_STORAGE_ACCOUNT_NAME",
        "<get this from the azure portal: storage account name>",
    ),
    "account_key": os.environ.get(
        "AZURE_STORAGE_ACCOUNT_KEY",
        "<get this from the azure portal for this account name: key>",
    ),
}

In [None]:
# Filesystem used for reading: the "abfs" protocol, configured with the
# entries of account_dict.
fs_read = fsspec.filesystem(protocol="abfs", **account_dict)

# Filesystem used for writing: the local "file" protocol.
fs_write = fsspec.filesystem(protocol="file")

# Left disabled; enable if an interactive Azure CLI login is required:
#   !az login --use-device-code

In [None]:
# URLs of the source files to index; the 'abfs://' prefix points to Azure Blobs.
# Spelling out the expected names is faster than listing the container.
# To discover the files instead (can take a long time):
#   files_paths = fs_read.glob("abfs://bay-delta-schism2-v58/eli/simulations/hindcast_clinic2/outputs/schout_0000_1*.nc")
#   file_pattern = sorted(["abfs://" + f for f in files_paths])
file_pattern = list(
    map(
        "abfs://bay-delta-schism2-v58/eli/simulations/hindcast_clinic2/outputs/schout_0000_{}.nc".format,
        range(1, 1000),
    )
)

In [None]:
# Preview the first three source URLs.
file_pattern[:3]

In [None]:
# NOTE: opening a dataset here for inspection seems to make later operations hang, so it is left disabled.
#ds = xr.open_dataset(fs_read.open(file_pattern[0]))
#ds

## Generate the zarr jsons for each file

In [None]:
# Keyword arguments forwarded to fs_read.open() for every source file.
so_dict = {
    "mode": "rb",
    "default_fill_cache": False,
    "default_cache_type": "first",
}

# Directory that receives one reference JSON per source file.
output_dir = "./hindcast2"

In [None]:
# Use Kerchunk's `SingleHdf5ToZarr` method to create a `Kerchunk` index from a NetCDF file.
def generate_json_reference(u, output_dir: str):
    """Write a Kerchunk reference JSON for a single NetCDF/HDF5 file.

    The file at ``u`` is opened through the module-level ``fs_read``
    filesystem (with ``so_dict`` as open options) and scanned with
    ``SingleHdf5ToZarr``. The translated references are serialized to
    ``<output_dir>/<file name without a trailing .nc>.json``.

    Args:
        u: URL/path of the source file, e.g. ``abfs://container/dir/name.nc``.
        output_dir: Directory for the JSON file; created if it does not exist.

    Returns:
        The path of the JSON file that was written.
    """
    import os  # local import keeps this cell self-contained

    fname = u.split("/")[-1]
    # str.strip(".nc") strips any leading/trailing '.', 'n' or 'c' characters
    # (e.g. "ocean.nc" -> "ocea"), so remove only a literal ".nc" suffix.
    if fname.endswith(".nc"):
        fname = fname[: -len(".nc")]
    outf = f"{output_dir}/{fname}.json"

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Translate while the source file is still open; only open the output file
    # once the references exist, so a failure does not leave an empty JSON.
    with fs_read.open(u, **so_dict) as infile:
        h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)
        refs = h5chunks.translate()

    with open(outf, "wb") as f:
        f.write(ujson.dumps(refs).encode())
    return outf

In [None]:
# Create one reference JSON per source file; tqdm displays the progress.
for file in tqdm(file_pattern):
    generate_json_reference(u=file, output_dir=output_dir)

## Combine the zarr jsons from above into a single combined one

In [None]:
from kerchunk.combine import MultiZarrToZarr

In [None]:
# Paths of the per-file reference JSONs:
# <output_dir>/<file name without a trailing ".nc">.json
# str.strip('.nc') would strip any leading/trailing '.', 'n' or 'c' characters
# rather than the suffix, so the suffix is removed explicitly.
json_files = [
    f"{output_dir}/{name[:-3] if name.endswith('.nc') else name}.json"
    for name in (url.split("/")[-1] for url in file_pattern)
]

In [None]:
# Combine the per-file references: concatenated along "time", with the
# node/layer dimensions declared identical across files.
zz = MultiZarrToZarr(
    json_files,
    remote_protocol="abfs",
    remote_options=account_dict,
    concat_dims=["time"],
    identical_dims=["nSCHISM_hgrid_node", "nSCHISM_vgrid_layers"],
)

In [None]:
# Serialize the combined references to a single JSON file.
with open("hindcast2_combined_1_1000.json", "wb") as ofh:
    ofh.write(ujson.dumps(zz.translate()).encode())

## Now use the combined json file to read the data

**Note:** The previous steps only need to be run once; their result is cached in the combined JSON file. From then on, the cells below should be able to use it without problems.

In [None]:
import xarray as xr

In [None]:
# Options for the zarr engine: read through the combined reference file,
# fetching the referenced data over "abfs" with the account credentials.
backend_args = {
    "consolidated": False,
    "storage_options": {
        "fo": "hindcast2_combined_1_1000.json",
        "remote_protocol": "abfs",
        "remote_options": account_dict,
    },
}
ds = xr.open_dataset(
    "reference://",
    engine="zarr",
    backend_kwargs=backend_args,
)

In [None]:
# Display the dataset.
ds

In [None]:
# Display the "salt" variable (presumably salinity — confirm against the model output).
ds.salt

In [None]:
# Select "salt" by integer position: index 1277 along nSCHISM_hgrid_node and
# index 1 along nSCHISM_vgrid_layers.
ds.salt.isel(nSCHISM_hgrid_node=1277, nSCHISM_vgrid_layers=1)

In [None]:
# Display the "elev" variable (presumably water-surface elevation — confirm).
ds.elev