# Create Kerchunk catalog for CMEMS


## Set up credentials on GFTS buckets

Credentials are stored in the `gfts` profile in your `~/.aws/credentials`. This file is generated automatically on GFTS Jupyterhub.

You can view them with `~/.aws/credentials`.

- access keys are in profile named `gfts`
- endpoint_url is `https://s3.gra.perf.cloud.ovh.net`
- region_name is `gra`

You should have read and write permissions to the bucket, but not delete

In [None]:
# !cat ~/.aws/credentials

In [1]:
import os
import ujson
import fsspec
import xarray as xr

from pathlib import Path
from kerchunk.grib2 import scan_grib
from kerchunk.combine import MultiZarrToZarr

In [2]:
s3 = fsspec.filesystem('s3',
    anon=False,
    profile="gfts",
    client_kwargs={
        "endpoint_url": "https://s3.gra.perf.cloud.ovh.net",
        "region_name": "gra",
    },
)

## Create catalog for CimateDT od3 data

In [3]:
bucket_name = 'gfts-reference-data/ClimateDT/raw'
s3.ls(bucket_name)

['gfts-reference-data/ClimateDT/raw/avg_hc300m_ifs-nemo_20210601-20210615.grib',
 'gfts-reference-data/ClimateDT/raw/avg_hc300m_ifs-nemo_20210616-20210630.grib',
 'gfts-reference-data/ClimateDT/raw/avg_hc300m_ifs-nemo_20220601-20220615.grib',
 'gfts-reference-data/ClimateDT/raw/avg_hc300m_ifs-nemo_20220616-20220630.grib',
 'gfts-reference-data/ClimateDT/raw/avg_hc300m_ifs-nemo_20230601-20230615.grib',
 'gfts-reference-data/ClimateDT/raw/avg_hc300m_ifs-nemo_20230616-20230630.grib',
 'gfts-reference-data/ClimateDT/raw/avg_hc300m_ifs-nemo_20240601-20240615.grib',
 'gfts-reference-data/ClimateDT/raw/avg_hc300m_ifs-nemo_20240616-20240630.grib',
 'gfts-reference-data/ClimateDT/raw/avg_hc700m_ifs-nemo_20210601-20210615.grib',
 'gfts-reference-data/ClimateDT/raw/avg_hc700m_ifs-nemo_20210616-20210630.grib',
 'gfts-reference-data/ClimateDT/raw/avg_hc700m_ifs-nemo_20220601-20220615.grib',
 'gfts-reference-data/ClimateDT/raw/avg_hc700m_ifs-nemo_20220616-20220630.grib',
 'gfts-reference-data/Climat

In [4]:
s3path = 's3://gfts-reference-data/ClimateDT/raw/*ifs-nemo_*.grib'

In [5]:
remote_files = s3.glob(s3path)
remote_files

['gfts-reference-data/ClimateDT/raw/avg_hc300m_ifs-nemo_20210601-20210615.grib',
 'gfts-reference-data/ClimateDT/raw/avg_hc300m_ifs-nemo_20210616-20210630.grib',
 'gfts-reference-data/ClimateDT/raw/avg_hc300m_ifs-nemo_20220601-20220615.grib',
 'gfts-reference-data/ClimateDT/raw/avg_hc300m_ifs-nemo_20220616-20220630.grib',
 'gfts-reference-data/ClimateDT/raw/avg_hc300m_ifs-nemo_20230601-20230615.grib',
 'gfts-reference-data/ClimateDT/raw/avg_hc300m_ifs-nemo_20230616-20230630.grib',
 'gfts-reference-data/ClimateDT/raw/avg_hc300m_ifs-nemo_20240601-20240615.grib',
 'gfts-reference-data/ClimateDT/raw/avg_hc300m_ifs-nemo_20240616-20240630.grib',
 'gfts-reference-data/ClimateDT/raw/avg_hc700m_ifs-nemo_20210601-20210615.grib',
 'gfts-reference-data/ClimateDT/raw/avg_hc700m_ifs-nemo_20210616-20210630.grib',
 'gfts-reference-data/ClimateDT/raw/avg_hc700m_ifs-nemo_20220601-20220615.grib',
 'gfts-reference-data/ClimateDT/raw/avg_hc700m_ifs-nemo_20220616-20220630.grib',
 'gfts-reference-data/Climat

In [6]:
ocean3d_param = ["avg_thetao", "avg_so", "avg_von", "avg_uoe", "avg_wo"]

In [7]:
remote_files3D = []
remote_files2D = []
for rf in remote_files:
    for param in ocean3d_param:
        if(rf.find(param)!=-1):
            remote_files3D.append(rf)
        else:
            remote_files2D.append(rf)
print("3D : ", remote_files3D)
print("2D : ", remote_files2D)

3D :  ['gfts-reference-data/ClimateDT/raw/avg_so_ifs-nemo_20210601-20210615.grib', 'gfts-reference-data/ClimateDT/raw/avg_so_ifs-nemo_20210616-20210630.grib', 'gfts-reference-data/ClimateDT/raw/avg_so_ifs-nemo_20220601-20220615.grib', 'gfts-reference-data/ClimateDT/raw/avg_so_ifs-nemo_20220616-20220630.grib', 'gfts-reference-data/ClimateDT/raw/avg_so_ifs-nemo_20230601-20230615.grib', 'gfts-reference-data/ClimateDT/raw/avg_so_ifs-nemo_20230616-20230630.grib', 'gfts-reference-data/ClimateDT/raw/avg_so_ifs-nemo_20240601-20240615.grib', 'gfts-reference-data/ClimateDT/raw/avg_so_ifs-nemo_20240616-20240630.grib', 'gfts-reference-data/ClimateDT/raw/avg_sos_ifs-nemo_20210601-20210615.grib', 'gfts-reference-data/ClimateDT/raw/avg_sos_ifs-nemo_20210616-20210630.grib', 'gfts-reference-data/ClimateDT/raw/avg_sos_ifs-nemo_20220601-20220615.grib', 'gfts-reference-data/ClimateDT/raw/avg_sos_ifs-nemo_20220616-20220630.grib', 'gfts-reference-data/ClimateDT/raw/avg_sos_ifs-nemo_20230601-20230615.grib', 

In [9]:
fs3d = fsspec.filesystem('')  #local file system to save final jsons to
#so = dict(mode='rb', anon=True, default_fill_cache=False, default_cache_type='first') # args to fs.open()

so = dict(anon=False,
    profile="gfts",
    client_kwargs={
        "endpoint_url": "https://s3.gra.perf.cloud.ovh.net",
        "region_name": "gra",
    },
    default_fill_cache=False
)

# default_fill_cache=False avoids caching data in between file chunks to lowers memory usage.

In [12]:
def gen_json(file_url, so):
    gribchunks = scan_grib(file_url, storage_options=so)
        # inline threshold adjusts the Size below which binary blocks are included directly in the output
        # a higher inline threshold can result in a larger json file but faster loading time
    name = file_url.split('/')[-1].split('.')[0]
    outf = f'{name}.json' #file name to save json to
    print(outf)
#    with fs2.open(outf, 'wb') as f:
#        f.write(ujson.dumps(h5chunks.translate()).encode());

In [13]:
%%time
for file in remote_files3D:
    print(file)
    gen_json("s3://" + file, so)

gfts-reference-data/ClimateDT/raw/avg_so_ifs-nemo_20210601-20210615.grib


KeyError: 'Ny'