In [1]:
import logging
import math
import shutil

import numpy as np
import xarray as xr
import yaml
import zarr
from numcodecs import Blosc

In [2]:
def get_encodings(outds, order, timechunk):
    """Build the per-variable encoding mapping passed to ``Dataset.to_zarr``.

    Parameters
    ----------
    outds
        Dataset whose variables are iterated; each one gets its own entry.
    order
        Forwarded to ``get_chunksizes`` to size the spatial chunk.
    timechunk
        Forwarded to ``get_chunksizes`` as the requested chunk length along
        the first axis.

    Returns
    -------
    dict
        Maps each variable name to a dict with the keys ``dtype``, ``chunks``
        and ``compressor``.
    """
    encodings = {}
    for name in outds:
        encodings[name] = {
            "dtype": get_dtype(outds[name]),
            "chunks": get_chunksizes(
                outds=outds, var=name, order=order, timechunk=timechunk
            ),
            # A fresh compressor object per variable: zstd, level 5, bit-shuffle.
            "compressor": Blosc(cname="zstd", clevel=5, shuffle=Blosc.BITSHUFFLE),
        }
    return encodings


def get_dtype(da):
    """Pick the on-disk dtype for a variable.

    Any floating-point variable is stored as ``"float32"``; every other
    variable keeps the dtype it already has.

    Parameters
    ----------
    da
        Object exposing a ``dtype`` attribute.

    Returns
    -------
    str or numpy.dtype
        The string ``"float32"`` for floating-point input, else ``da.dtype``.
    """
    is_floating = np.issubdtype(da.dtype, np.floating)
    return "float32" if is_floating else da.dtype


def get_chunksizes(outds, var, order, timechunk):
    """Compute the chunk sizes for one variable of ``outds``.

    The variable's first axis is chunked with ``timechunk`` (capped at the
    axis length) and its last axis with the value of
    ``compute_chunksize(order)``.
    NOTE(review): this presumes the axis order is (time, [level,] space) --
    confirm against the datasets being converted.

    Parameters
    ----------
    outds
        Dataset containing ``var``.
    var
        Name of the variable to chunk.
    order
        Passed on to ``compute_chunksize``.
    timechunk
        Requested chunk length along the first axis.

    Returns
    -------
    int or tuple of int
        A bare int for 1-D variables (axis length capped at ``1024**2``), a
        2-tuple for 2-D variables and a 3-tuple for 3-D variables.

    Raises
    ------
    Exception
        If the variable has more than three dimensions.
    """
    shape = outds[var].shape
    ndim = len(shape)
    timechunk = min(timechunk, shape[0])
    spacechunk = compute_chunksize(order=order)

    if ndim == 1:
        return min(shape[0], 1024**2)
    if ndim == 2:
        return (timechunk, spacechunk)
    if ndim != 3:
        raise Exception(
            "can't compute chunking for variables that have more than 3 dimensions!"
        )

    levchunk = choose_levchunk(shape[1])
    # Shrink the spatial chunk to a power of four when a level/space slab
    # would otherwise hold more than 4**9 elements.
    if levchunk * spacechunk > 4**9:
        spacechunk = 4 ** round(0.5 * math.log2(spacechunk / levchunk))
    return (timechunk, levchunk, spacechunk)


def compute_chunksize(order):
    """Return the spatial chunk length for a grid of the given ``order``.

    NOTE(review): the ``12 * 4**order`` term suggests ``order`` is a HEALPix
    order / zoom level -- confirm against the input data.

    Parameters
    ----------
    order : int
        Refinement level of the grid.

    Returns
    -------
    int
        ``12 * 4**order`` (the full axis) below order 8, and ``4**9`` from
        order 8 upwards.
    """
    start_split = 8
    if order < start_split:
        # Small grids: keep the whole spatial axis in a single chunk.
        return 12 * 4**order
    elif order == start_split:
        return 4 * 4**order
    else:
        # Parenthesised on purpose: ``4**start_split + 1`` evaluates to 65537,
        # which does not divide ``12 * 4**order`` and is discontinuous with
        # the ``order == start_split`` branch (4**9).
        return 4 ** (start_split + 1)


def choose_levchunk(levels, default=12, maxlevchunk=20):
    """Choose a chunk length for an axis of ``levels`` entries.

    Parameters
    ----------
    levels
        Length of the axis to chunk.
    default : int
        Largest divisor candidate tried, and the fallback chunk length.
    maxlevchunk : int
        A prime ``levels`` below this value is used as the chunk length
        itself.

    Returns
    -------
    int
        ``levels`` when it is a prime below ``maxlevchunk``; ``default`` when
        it is a larger prime; otherwise the largest value in
        ``default..3`` that divides ``levels`` evenly, or ``default`` if none
        does.
    """
    if isPrime(levels):
        return int(levels) if levels < maxlevchunk else default

    # Try candidates from ``default`` down to 3 and take the first exact divisor.
    for candidate in range(default, 2, -1):
        res = levels % candidate
        if not res:
            logging.debug(f"chosing {candidate} as level chunk")
            return candidate
        logging.debug(f"residual for level chunk {candidate} is {res}")
    return default


def isPrime(number):
    """Return ``True`` if ``number`` is a prime, ``False`` otherwise.

    Everything below 2 (zero, one and negative values) is reported as not
    prime with an explicit ``False``; the event is logged at debug level
    instead of being printed.

    Parameters
    ----------
    number
        Value to test.

    Returns
    -------
    bool
    """
    if number < 2:
        logging.debug(f"The Given Number {number} is Not Prime")
        return False
    # Trial division only needs to go up to sqrt(number): any factor above
    # the square root is paired with one below it.
    divisor = 2
    while divisor * divisor <= number:
        if number % divisor == 0:
            return False
        divisor += 1
    return True


def to_zarr_store(path, outds, timechunk, order):
    """Write ``outds`` to a zarr directory store at ``path``.

    Parameters
    ----------
    path
        Location of the directory store.
    outds
        Dataset to write.
    timechunk, order
        Forwarded to ``get_encodings`` to derive the per-variable encodings.
    """
    store = zarr.storage.DirectoryStore(
        path, normalize_keys=False, dimension_separator="/"
    )
    try:
        outds.to_zarr(
            store,
            encoding=get_encodings(outds=outds, timechunk=timechunk, order=order),
        )
    finally:
        # Close the store even when the write fails part-way.
        store.close()

In [3]:
def convert_files(zoom, subset):
    """Convert the NetCDF files of one subset and zoom level into a zarr store.

    Reads the notebook-level globals ``config`` (settings per subset, with the
    keys ``timechunk``, ``renames`` and ``isel``) and ``output_dir``.

    Parameters
    ----------
    zoom : int
        Zoom level; selects the ``zNN`` input directory and is passed on as
        ``order`` when writing.
    subset : str
        Key into ``config`` and name of the input sub-directory.

    Returns
    -------
    str
        Path of the zarr store that was written.
    """
    curr_conf = config[subset]
    timechunk = curr_conf["timechunk"]
    files = (
        f"/large/sftpgo/data/NICAM/hackathon/tksk_sample/*/{subset}/z{zoom:02d}/*.nc"
    )
    ds = xr.open_mfdataset(files, chunks=dict(time=timechunk))
    out_ds = ds.rename(curr_conf["renames"]).isel(curr_conf["isel"])
    outfile = f"{output_dir}/NICAM_{subset}_z{zoom}.zarr"
    # Remove any store left over from a previous run. Done in Python rather
    # than through a shell escape so that paths containing spaces or glob
    # characters cannot be mis-split, and so the function is plain Python.
    shutil.rmtree(outfile, ignore_errors=True)
    to_zarr_store(path=outfile, outds=out_ds, timechunk=timechunk, order=zoom)
    return outfile

In [4]:
output_dir = "/large/work/florian"
# Load the per-subset conversion settings; the context manager closes the
# file handle as soon as the YAML has been parsed.
with open("nicam_to_zarr.yaml") as config_file:
    config = yaml.safe_load(config_file)

In [5]:
# Convert the "2d1h" subset for zoom levels 0 through 9, one store per level.
for zoom in range(10):
    outfile = convert_files(zoom=zoom, subset="2d1h")
    # Re-open the store that was just written. `ds_new` and `outfile` are
    # rebound on every pass, so after the loop they refer to the last
    # iteration that completed.
    ds_new = xr.open_dataset(outfile)

KeyboardInterrupt: 

In [13]:
# Display the dataset most recently bound to `ds_new`.
# NOTE(review): execution counts jump from 5 to 13, so `ds_new` may have been
# reassigned by cells that are no longer in the notebook -- confirm with a
# Restart & Run All.
ds_new

In [14]:
# Print the .zmetadata file of the store at `outfile`; it lists the
# .zattrs/.zgroup/.zarray entries of the store in a single JSON document.
!cat {outfile}/.zmetadata

{
    "metadata": {
        ".zattrs": {
            "comment": "Be careful that definition of time coordinate depends on the dataset and time mode (snapshot/average).",
            "history": "Generated by mod_netcdf.f90.",
            "title": "NICAM data output"
        },
        ".zgroup": {
            "zarr_format": 2
        },
        "__values__/.zarray": {
            "chunks": [
                24,
                12
            ],
            "compressor": {
                "blocksize": 0,
                "clevel": 5,
                "cname": "zstd",
                "id": "blosc",
                "shuffle": 2
            },
            "dimension_separator": "/",
            "dtype": "<f4",
            "fill_value": "NaN",
            "filters": null,
            "order": "C",
            "shape": [
                264,
                12
            ],
            "zarr_format": 2
        },
        "__values__/.zattrs": {
            "_ARRAY_DIMENSIONS": [
              