# PUMP catalog

TODO:
- [ ] Add MITgcm
- [ ] Display LES too
- [ ] Obs catalog

In [None]:
%load_ext watermark

from pathlib import Path

from ecgtools import Builder

%watermark -iv

json: 2.0.9
sys : 3.10.10 | packaged by conda-forge | (main, Mar 24 2023, 20:08:06) [GCC 11.3.0]



In [None]:
# Directory that holds one sub-directory per simulation; the cells below look
# for kerchunk reference files under ROOT/<casename>/run/jsons/.
ROOT = Path("/glade/campaign/cgd/oce/projects/pump/cesm/")

In [None]:
# MOM6 run catalog: short key -> (description, simulation/case name).
# Written as (key, description, casename) rows so each run reads as one record.
catalog = {
    key: (description, casename)
    for key, description, casename in (
        (
            "baseline",
            "baseline",
            "gmom.e23.GJRAv3.TL319_t061_zstar_N65.baseline.001.mixpods",
        ),
        (
            "epbl",
            "ePBL",
            "gmom.e23.GJRAv3.TL319_t061_zstar_N65.baseline.epbl.001.mixpods",
        ),
        (
            "kpp.lmd.002",
            "KPP Ri0=0.5",
            "gmom.e23.GJRAv3.TL319_t061_zstar_N65.baseline.kpp.lmd.002.mixpods",
        ),
        (
            "kpp.lmd.003",
            "KPP Ri0=0.5, Ric=0.2,",
            "gmom.e23.GJRAv3.TL319_t061_zstar_N65.baseline.kpp.lmd.003.mixpods",
        ),
        (
            "kpp.lmd.004",
            "KPP ν0=2.5, Ric=0.2, Ri0=0.5",
            "gmom.e23.GJRAv3.TL319_t061_zstar_N65.baseline.kpp.lmd.004.mixpods",
        ),
        (
            "baseline.N150",
            "baseline N=150",
            "gmom.e23.GJRAv3.TL319_t061_zstar_N150.baseline.mixpods",
        ),
        (
            "kpp.lmd.004.N150",
            "KPP ν0=2.5, Ric=0.2, Ri0=0.5, N=150",
            "gmom.e23.GJRAv3.TL319_t061_zstar_N150.kpp.lmd.004.mixpods",
        ),
        (
            "new_baseline.hb",
            "KD=0, KV=0",
            "gmom.e23.GJRAv3.TL319_t061_zstar_N65.new_baseline.hb",
        ),
        (
            "new_baseline.kpp.lmd.004",
            "KPP ν0=2.5, Ric=0.2, Ri0=0.5",
            "gmom.e23.GJRAv3.TL319_t061_zstar_N65.new_baseline.kpp.lmd.004",
        ),
        (
            "new_baseline.kpp.lmd.005",
            "KPP ν0=2.5, Ri0=0.5",
            "gmom.e23.GJRAv3.TL319_t061_zstar_N65.new_baseline.kpp.lmd.005",
        ),
    )
}

In [None]:
# Case names are the last element of every catalog entry.
simnames = [entry[-1] for entry in catalog.values()]
# Pick one reference JSON of the third-from-last run as a sample file.
# glob yields entries in filesystem order, so sort first to make the choice
# reproducible between runs and machines.
path = sorted((ROOT / simnames[-3] / "run" / "jsons").glob("*.json"))[-1]

In [None]:
# Names in these two sets are subtracted from each file's variable list by
# parse_cesm_mom6_kerchunk_json, so they never show up in the catalog's
# "variables" column.
METRIC_VARS = {
    "Coriolis", "deptho", "nv", "zi",
    "areacello", "areacello_bu", "areacello_cu", "areacello_cv",
    "cos_rot", "sin_rot",
    "geolat", "geolat_c", "geolat_u", "geolat_v",
    "geolon", "geolon_c", "geolon_u", "geolon_v",
    "wet", "wet_c", "wet_u", "wet_v",
    "xh", "xq", "yh", "yq",
}

IGNORE_VARS = {
    "average_DT", "average_T1", "average_T2",
    "time", "time_bnds",
    "z_i", "z_l",
}


def parse_cesm_mom6_kerchunk_json(file, storage_options=None):
    """Parse one kerchunk reference JSON of a CESM-MOM6 run into a catalog row.

    The case name is read from the fourth-from-last path component and the
    stream name from the file stem, i.e. the path is expected to look like
    ``.../<casename>/run/jsons/<stream>.json``.

    Parameters
    ----------
    file : str
        Path of the reference JSON. It is handed to fsspec's ``"reference"``
        filesystem as ``fo`` and stored unchanged in the ``path`` column.
    storage_options : dict, optional
        Extra keyword arguments forwarded to ``fsspec.filesystem``.

    Returns
    -------
    dict or None
        A dict with the keys ``casename``, ``stream``, ``path``, ``baseline``,
        ``levels``, ``frequency`` and ``variables``. ``None`` is returned
        (after a ``RuntimeWarning``) when the opened dataset or tree is empty.
    """
    # Local imports, in the same style as the rest of this parser's dependencies.
    import itertools
    import warnings
    from pathlib import Path

    import fsspec

    import xarray as xr

    if storage_options is None:
        storage_options = {}

    path = Path(file)
    casename = path.parts[-4]

    info = {
        "casename": casename,
        "stream": path.stem,
        "path": file,
        "baseline": "new" if "new_baseline" in casename else "old",
        # Fourth dot-separated field ends in e.g. "_N65"; strip the leading
        # letter to get the number.
        "levels": int(casename.split(".")[3].split("_")[-1][1:]),
    }

    fs = fsspec.filesystem("reference", fo=file, **storage_options)
    mapper = fs.get_mapper("")

    if path.stem == "combined":
        # Only the "combined" reference is opened as a tree, so datatree is
        # imported here and the other streams do not need it.
        import datatree

        dt = datatree.open_datatree(
            mapper, engine="zarr", use_cftime=True, consolidated=False
        )
        # Validate before touching the nodes.
        if not dt:
            warnings.warn(f"bad file: {file}", RuntimeWarning)
            dt.close()
            return None
        # NOTE(review): the saved catalog shows this column as missing for
        # "combined" rows when read back (pandas treats "N/A" as NA by
        # default) - confirm that is intended.
        info["frequency"] = "N/A"
        variables = itertools.chain.from_iterable(
            node.variables.keys() for node in dt.subtree
        )
    else:
        ds = xr.open_zarr(
            mapper,
            use_cftime=True,
            consolidated=False,
            chunks={},
        )
        if not ds:
            warnings.warn(f"bad file: {file}", RuntimeWarning)
            ds.close()
            return None
        # TODO: Use keith's util
        # Anything that is not inferred as daily ("D") is labelled monthly.
        info["frequency"] = "daily" if xr.infer_freq(ds.time) == "D" else "monthly"
        variables = ds.variables.keys()

    # Drop the excluded names so only the remaining variables are listed.
    info["variables"] = sorted(set(variables).difference(METRIC_VARS, IGNORE_VARS))

    return info


# Try the parser on a single "combined" reference file; the returned dict is
# shown as the cell output.
path = (
    "/glade/campaign/cgd/oce/projects/pump/cesm/"
    "gmom.e23.GJRAv3.TL319_t061_zstar_N65.new_baseline.hb/run/jsons/combined.json"
)
parse_cesm_mom6_kerchunk_json(path)

## Make catalog

In [None]:
# Hand every run/jsons directory found below ROOT to the ecgtools Builder.
builder = Builder(depth=0, paths=tuple(map(str, ROOT.glob("**/run/jsons/"))))

In [None]:
# Run the kerchunk-JSON parser defined above over the files the builder found.
builder.build(parsing_func=parse_cesm_mom6_kerchunk_json)

In [None]:
builder.save(
    name="../pump-catalog",
    # Column name including filepath
    path_column_name="path",
    # Column name including variables
    variable_column_name="variables",
    # Data file format - could be netcdf or zarr or reference (in this case, reference)
    data_format="reference",
    # Which attributes to groupby when reading in variables using intake-esm
    groupby_attrs=["casename", "stream"],
    # Aggregations which are fed into xarray when reading in data using intake
    # (none are defined here)
    aggregations=[],
)

## Read Catalog

In [None]:
import intake

# Open the catalog description at ../pump-catalog.json as an intake-esm
# datastore; the bare name on the last line displays its summary.
data_catalog = intake.open_esm_datastore("../pump-catalog.json")
data_catalog

Unnamed: 0,unique
casename,10
stream,4
path,32
baseline,2
levels,2
frequency,2
variables,10
derived_variables,0


In [None]:
data_catalog.df

Unnamed: 0,casename,stream,path,baseline,levels,frequency,variables
0,gmom.e23.GJRAv3.TL319_t061_zstar_N150.baseline...,combined,/glade/campaign/cgd/oce/projects/pump/cesm/gmo...,old,150,,"['SSH', 'SSU', 'SSV', 'mlotst', 'oml', 'sos', ..."
1,gmom.e23.GJRAv3.TL319_t061_zstar_N150.baseline...,sfc,/glade/campaign/cgd/oce/projects/pump/cesm/gmo...,old,150,daily,"['SSH', 'SSU', 'SSV', 'mlotst', 'oml', 'sos', ..."
2,gmom.e23.GJRAv3.TL319_t061_zstar_N150.kpp.lmd....,combined,/glade/campaign/cgd/oce/projects/pump/cesm/gmo...,old,150,,"['SSH', 'SSU', 'SSV', 'mlotst', 'oml', 'sos', ..."
3,gmom.e23.GJRAv3.TL319_t061_zstar_N150.kpp.lmd....,sfc,/glade/campaign/cgd/oce/projects/pump/cesm/gmo...,old,150,daily,"['SSH', 'SSU', 'SSV', 'mlotst', 'oml', 'sos', ..."
4,gmom.e23.GJRAv3.TL319_t061_zstar_N65.baseline....,combined,/glade/campaign/cgd/oce/projects/pump/cesm/gmo...,old,65,,"['N2_int', 'Rd_dx', 'SSH', 'SSU', 'SSV', 'ages..."
5,gmom.e23.GJRAv3.TL319_t061_zstar_N65.baseline....,h,/glade/campaign/cgd/oce/projects/pump/cesm/gmo...,old,65,monthly,"['N2_int', 'agessc', 'h', 'rhopot0', 'so', 'th..."
6,gmom.e23.GJRAv3.TL319_t061_zstar_N65.baseline....,sfc,/glade/campaign/cgd/oce/projects/pump/cesm/gmo...,old,65,monthly,"['Rd_dx', 'SSH', 'SSU', 'SSV', 'mass_wt', 'mlo..."
7,gmom.e23.GJRAv3.TL319_t061_zstar_N65.baseline....,combined,/glade/campaign/cgd/oce/projects/pump/cesm/gmo...,old,65,,"['SSH', 'SSU', 'SSV', 'T_advection_xy', 'T_lbd..."
8,gmom.e23.GJRAv3.TL319_t061_zstar_N65.baseline....,h,/glade/campaign/cgd/oce/projects/pump/cesm/gmo...,old,65,monthly,"['h', 'so', 'thetao', 'uhGM', 'uhml', 'umo', '..."
9,gmom.e23.GJRAv3.TL319_t061_zstar_N65.baseline....,sfc,/glade/campaign/cgd/oce/projects/pump/cesm/gmo...,old,65,monthly,"['SSH', 'SSU', 'SSV', 'mlotst', 'sos', 'speed'..."


## Display interactive catalog

In [None]:
def catalog_to_grid(data_catalog):
    """Build an interactive ipyaggrid table from a catalog's dataframe.

    A ``shortname`` column is added to a copy of ``data_catalog.df``; it is
    the case name without its first four dot-separated components and
    without a trailing ``"mixpods"`` component.

    Parameters
    ----------
    data_catalog
        Object exposing the catalog as a pandas DataFrame in ``.df``, with the
        columns ``casename``, ``stream``, ``baseline``, ``frequency``,
        ``levels``, ``variables`` and ``path``.

    Returns
    -------
    ipyaggrid.Grid
        The configured grid widget.
    """
    import ipyaggrid

    def make_short_name(casename_split):
        # Keep only the components after the fourth one.
        trimmed = casename_split[4:]
        # Guard against case names with four or fewer components, where
        # ``trimmed`` is empty and ``trimmed[-1]`` would raise IndexError.
        if trimmed and trimmed[-1] == "mixpods":
            trimmed = trimmed[:-1]
        return ".".join(trimmed)

    df = data_catalog.df
    # ``assign`` returns a new frame, so the catalog's own dataframe is untouched.
    df = df.assign(shortname=df.casename.str.split(".").map(make_short_name))

    # One entry per displayed column; the short name comes first and is pinned.
    column_defs = [
        {
            "headerName": "shortname",
            "field": "shortname",
            "rowGroup": False,
            "pinned": True,
        },
        {"headerName": "stream", "field": "stream", "rowGroup": False},
        {"headerName": "baseline", "field": "baseline"},
        {"headerName": "frequency", "field": "frequency", "rowGroup": False},
        {"headerName": "levels", "field": "levels", "rowGroup": False},
        {"headerName": "variables", "field": "variables", "autoHeight": True},
        {"headerName": "casename", "field": "casename", "rowGroup": False},
        {"headerName": "path", "field": "path", "rowGroup": False},
    ]

    grid_options = {
        "columnDefs": column_defs,
        # Defaults applied to every column: read-only, filterable, sortable.
        "defaultColDef": {
            "resizable": True,
            "editable": False,
            "filter": True,
            "sortable": True,
        },
        "colResizeDefault": True,
        "enableRangeSelection": True,
        "statusBar": {  # new syntax since 19.0
            "statusPanels": [
                {"statusPanel": "agTotalRowCountComponent", "align": "left"},
                {"statusPanel": "agFilteredRowCountComponent"},
                {"statusPanel": "agSelectedRowCountComponent"},
                {"statusPanel": "agAggregationComponent"},
            ]
        },
        # "enableRangeHandle": True,
    }

    g = ipyaggrid.Grid(
        grid_data=df,
        grid_options=grid_options,
        quick_filter=True,
        export_csv=False,
        export_excel=False,
        export_mode="buttons",
        export_to_df=True,
        theme="ag-theme-balham",
        # show_toggle_edit=False,
        # show_toggle_delete=False,
        columns_fit="auto",
        # index=False,
        # keep_multiindex=False,
    )
    return g

In [None]:
# Build the grid widget for the catalog opened above; the bare name on the
# last line displays it as the cell output.
grid = catalog_to_grid(data_catalog)
grid

Grid(columns_fit='auto', compress_data=True, export_mode='buttons', height='350px', menu={'buttons': [{'name':…

### Export HTML to view

In [None]:
from IPython.display import HTML

# Export the grid to an HTML string and render that string inline.
# NOTE(review): the exact effect of build=True is defined by ipyaggrid - confirm
# against its documentation.
html = grid.export_html(build=True)
HTML(html)