In [1]:
import pandas as pd

# Show full cell contents (no truncation) so the long variable lists and file
# paths in the catalog tables render completely. The fully qualified option
# name is used because pandas resolves abbreviated names by pattern matching,
# which can become ambiguous if similarly named options are added later.
pd.set_option("display.max_colwidth", None)

import intake


def get_from_cat(catalog, columns):
    """Return the unique, sorted combinations of selected catalog columns.

    Parameters
    ----------
    catalog
        Any object exposing a pandas DataFrame as ``catalog.df`` (for example
        an opened catalog or the result of ``catalog.search(...)``).
    columns : str or list of str
        Name(s) of the column(s) of interest. A single column may be given
        as a plain string.

    Returns
    -------
    pandas.DataFrame
        One row per distinct combination of ``columns``, sorted by those
        columns in the order given. The original row index is kept.
    """
    # isinstance (instead of an exact type comparison) also wraps str
    # subclasses, so a single name always selects a DataFrame, never a Series.
    if isinstance(columns, str):
        columns = [columns]
    return catalog.df[columns].drop_duplicates().sort_values(by=columns)

In [2]:
# Choose the catalog to inspect. To use the main catalog instead, uncomment
# the first assignment and comment out the second.
# catalog_file = "/work/ka1081/Catalogs/dyamond-nextgems.json" # main catalog for DYAMOND and nextGEMS
catalog_file = "/work/ka1081/Catalogs/nextGEMS-cycle2.json"  # just nextGEMS cycle2

# Open the catalog file as a datastore; the bare `cat` on the last line makes
# the notebook display its summary (number of unique values per column).
cat = intake.open_esm_datastore(catalog_file)
cat

Unnamed: 0,unique
variable_id,238
project,1
institution_id,2
source_id,2
experiment_id,1
simulation_id,3
realm,2
frequency,10
time_reduction,4
grid_label,1


In [3]:
# Peek at the first two catalog rows to see which columns are available.
cat.df.head(2)

Unnamed: 0,variable_id,project,institution_id,source_id,experiment_id,simulation_id,realm,frequency,time_reduction,grid_label,level_type,time_min,time_max,grid_id,format,uri
0,"(zghalf, zg, dzghalf)",nextGEMS,MPI-M,ICON-ESM,nextgems_cycle2,ngc2009,atm,fx,const,gn,ml,2020-01-20T00:00:00.000,2020-01-20T18:00:00.000,not implemented,netcdf,/work/bm1235/k203123/experiments/ngc2009/run_20200120T000000-20200203T235920/ngc2009_atm_vgrid_ml.nc
1,"(tas_gmean, rsdt_gmean, rsut_gmean, rlut_gmean, radtop_gmean, prec_gmean, evap_gmean, fwfoce_gmean)",nextGEMS,MPI-M,ICON-ESM,nextgems_cycle2,ngc2009,atm,6hour,mean,gn,ml,2020-01-20T00:00:00.000,2020-01-20T18:00:00.000,not implemented,netcdf,/work/bm1235/k203123/experiments/ngc2009/run_20200120T000000-20200203T235920/ngc2009_atm_mon_20200120T000000Z.nc


In [4]:
# List every project / experiment / model / simulation combination in the catalog.
get_from_cat(
    catalog=cat,
    columns=["project", "experiment_id", "source_id", "simulation_id"],
)

Unnamed: 0,project,experiment_id,source_id,simulation_id
0,nextGEMS,nextgems_cycle2,ICON-ESM,ngc2009
16014,nextGEMS,nextgems_cycle2,ICON-ESM,ngc2012
16013,nextGEMS,nextgems_cycle2,IFS-FESOM,HR0N


In [5]:
# Variable groups available for simulation ngc2009, by realm and output frequency.
get_from_cat(
    catalog=cat.search(simulation_id="ngc2009"),
    columns=["realm", "frequency", "variable_id"],
)

Unnamed: 0,realm,frequency,variable_id
4945,atm,1day,"(clt, evspsbl, tas, ts, rldscs, rlutcs, rsdscs, rsuscs, rsutcs)"
7789,atm,1day,"(psl, clt, evspsbl, tas, ts, rldscs, rlutcs, rsdscs, rsuscs, rsutcs)"
49,atm,2hour,"(phalf,)"
145,atm,30minute,"(hydro_canopy_cond_limited_box, hydro_w_snow_box, hydro_snow_soil_dens_box)"
129,atm,30minute,"(hydro_discharge_ocean_box, hydro_drainage_box, hydro_runoff_box, hydro_transpiration_box, sse_grnd_hflx_old_box)"
33,atm,30minute,"(psl, ps, sit, sic, tas, ts, uas, vas, cfh_lnd)"
17,atm,30minute,"(sfcwind, clivi, cllvi, cptgzvi, hfls, hfss, prlr, pr, prw, qgvi, qrvi, qsvi, rlds, rlus, rlut, rsds, rsdt, rsus, rsut, tauu, tauv, rpds_dir, rpds_dif, rvds_dif, rnds_dif)"
97,atm,6hour,"(clw, cli, pfull)"
161,atm,6hour,"(hydro_w_soil_sl_box, hydro_w_ice_sl_box, sse_t_soil_sl_box)"
81,atm,6hour,"(ta, hus, rho)"


In [6]:
# Narrow the search to entries of ngc2009 that contain tas, and also show the
# level type of each match.
get_from_cat(
    catalog=cat.search(simulation_id="ngc2009", variable_id="tas"),
    columns=["realm", "frequency", "level_type", "variable_id"],
)

Unnamed: 0,realm,frequency,level_type,variable_id
272,atm,1day,ml,"(clt, evspsbl, tas, ts, rldscs, rlutcs, rsdscs, rsuscs, rsutcs)"
551,atm,1day,ml,"(psl, clt, evspsbl, tas, ts, rldscs, rlutcs, rsdscs, rsuscs, rsutcs)"
0,atm,30minute,ml,"(psl, ps, sit, sic, tas, ts, uas, vas, cfh_lnd)"


In [7]:
# Keep only the 30-minute output: per the original author, the 1day files are
# inconsistent across the run and opening them would crash the Jupyter kernel.
hits = cat.search(
    simulation_id="ngc2009",
    variable_id="tas",
    frequency="30minute",
)
hits

Unnamed: 0,unique
variable_id,9
project,1
institution_id,1
source_id,1
experiment_id,1
simulation_id,1
realm,1
frequency,1
time_reduction,1
grid_label,1


In [8]:
# Open everything matched by `hits` and collect the result in a dictionary of
# datasets. The chunks setting requests one time step per chunk.
# NOTE(review): newer intake-esm releases appear to have replaced `cdf_kwargs`
# with `xarray_open_kwargs` — confirm against the installed version.
dataset_dict = hits.to_dataset_dict(cdf_kwargs={"chunks": {"time": 1}})


--> The keys in the returned dictionary of datasets are constructed as follows:
	'project.institution_id.source_id.experiment_id.simulation_id.realm.frequency.time_reduction.grid_label.level_type'


In [9]:
# Iterating a dict yields its keys, so this collects the dataset identifiers.
keys = list(dataset_dict)
keys

['nextGEMS.MPI-M.ICON-ESM.nextgems_cycle2.ngc2009.atm.30minute.inst.gn.ml']

In [10]:
# The search produced a single entry, so take the first (only) key and
# display the dataset it maps to.
dataset = dataset_dict[keys[0]]
dataset

Unnamed: 0,Array,Chunk
Bytes,2.82 TiB,80.00 MiB
Shape,"(37009, 1, 20971520)","(1, 1, 20971520)"
Count,74841 Tasks,37009 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 2.82 TiB 80.00 MiB Shape (37009, 1, 20971520) (1, 1, 20971520) Count 74841 Tasks 37009 Chunks Type float32 numpy.ndarray",20971520  1  37009,

Unnamed: 0,Array,Chunk
Bytes,2.82 TiB,80.00 MiB
Shape,"(37009, 1, 20971520)","(1, 1, 20971520)"
Count,74841 Tasks,37009 Chunks
Type,float32,numpy.ndarray


In [13]:
# Minimum of tas over the whole field at the second time step (positional index 1).
dataset["tas"].isel(time=1).min().values

array(225.27545, dtype=float32)

In [14]:
# Maximum of tas over the whole field at the second time step (positional index 1).
dataset["tas"].isel(time=1).max().values

array(312.81677, dtype=float32)

In [15]:
# Maximum over the ncells dimension for every time step. This is evaluated
# lazily - no real work is done yet.
dataset["tas"].max(dim="ncells")

Unnamed: 0,Array,Chunk
Bytes,144.57 kiB,4 B
Shape,"(37009, 1)","(1, 1)"
Count,148859 Tasks,37009 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 144.57 kiB 4 B Shape (37009, 1) (1, 1) Count 148859 Tasks 37009 Chunks Type float32 numpy.ndarray",1  37009,

Unnamed: 0,Array,Chunk
Bytes,144.57 kiB,4 B
Shape,"(37009, 1)","(1, 1)"
Count,148859 Tasks,37009 Chunks
Type,float32,numpy.ndarray
