In [1]:
import json
import os
import re
import warnings

import fsspec
import intake
import numpy as np
import pandas as pd
import xarray as xr
import xesmf as xe
from dmelon.utils import check_folder
from matplotlib import pyplot as plt
from natsort import natsorted
from xgcm import Grid
from xmip.preprocessing import combined_preprocessing, replace_x_y_nominal_lat_lon

# Silence every warning for the rest of the session. Note that this also hides
# deprecation/runtime warnings emitted by the libraries imported above.
warnings.filterwarnings("ignore")
# Set xarray's global `keep_attrs` option so attributes are kept by default
# in the operations performed later in this notebook.
xr.set_options(keep_attrs=True)

<xarray.core.options.set_options at 0x146eaa51bca0>

In [2]:
# ROOT_DIR = "/glade/work/griverat/DL-ECindex/"
# TARGET_RES = "24x72"
# VARIABLE = "tos"
# OUT_NAME_VAR = "sst"

# OUT_PATH = os.path.join(ROOT_DIR, f"DATA-Model/EC_model/CMIP6_gcp/", VARIABLE)
# TARGET_GRID_PATH = os.path.join(
#     ROOT_DIR, f"DATA-Model/EC_model/model.target.grid.{TARGET_RES}.nc"
# )

# check_folder(OUT_PATH)

In [3]:
# parameters
# Default index of the model chunk to process: `n` is used further down as
# `models_seq[n]`. The next cell assigns `n` again, so this default is
# overridden whenever both cells run in order.
n = 0

In [4]:
# Parameters
# Overrides the default `n` from the previous cell; selects which chunk of
# models (`models_seq[n]`) this run processes.
# NOTE(review): presumably injected by a parameterised run (e.g. papermill) —
# confirm before editing by hand.
n = 6


In [5]:
# Load the CMIP6 zarr-store catalog as a plain pandas DataFrame.
# Alternative catalog sources that were used with `intake.open_esm_datastore`:
#   https://storage.googleapis.com/cmip6/pangeo-cmip6-noQC.json
#   /glade/collections/cmip/catalog/intake-esm-datastore/catalogs/glade-cmip6.json
_catalog_csv = (
    "https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv"
)
col = pd.read_csv(_catalog_csv)

# Preview the first rows of the catalog.
col.head()

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,dcpp_init_year,version
0,HighResMIP,CMCC,CMCC-CM2-HR4,highresSST-present,r1i1p1f1,Amon,ps,gn,gs://cmip6/CMIP6/HighResMIP/CMCC/CMCC-CM2-HR4/...,,20170706
1,HighResMIP,CMCC,CMCC-CM2-HR4,highresSST-present,r1i1p1f1,Amon,rsds,gn,gs://cmip6/CMIP6/HighResMIP/CMCC/CMCC-CM2-HR4/...,,20170706
2,HighResMIP,CMCC,CMCC-CM2-HR4,highresSST-present,r1i1p1f1,Amon,rlus,gn,gs://cmip6/CMIP6/HighResMIP/CMCC/CMCC-CM2-HR4/...,,20170706
3,HighResMIP,CMCC,CMCC-CM2-HR4,highresSST-present,r1i1p1f1,Amon,rlds,gn,gs://cmip6/CMIP6/HighResMIP/CMCC/CMCC-CM2-HR4/...,,20170706
4,HighResMIP,CMCC,CMCC-CM2-HR4,highresSST-present,r1i1p1f1,Amon,psl,gn,gs://cmip6/CMIP6/HighResMIP/CMCC/CMCC-CM2-HR4/...,,20170706


We need to regrid the incoming CMIP6 data to a rectilinear grid first. This way we can later regrid again using a conservative method that works better between rectilinear grids.

In [6]:
# Intermediate global grid with 1-degree spacing in both directions;
# `lon1=360` is passed through to xesmf as the grid's longitude bound.
target_rect_grid = xe.util.grid_global(d_lon=1, d_lat=1, lon1=360)
target_rect_grid

Now we load the target grid that our output data will have. (This step is currently disabled: the loading code in the next cell is commented out.)

In [7]:
# target_grid = xr.open_dataset(TARGET_GRID_PATH)
# target_grid

Here we build the query against the CMIP6 catalog (loaded above as a pandas DataFrame). We use sets to easily find which models provide all of the requested variables.

In [8]:
# Variables to request, keyed by the CMIP6 table they come from.
# Other combinations tried previously:
#   Amon: pr, tauu, tauv, rlds, rlus, rsds, rsus, hfls, hfss
#   Omon: tos, thetao, vo, uo, wo
#   fx:   sftlf
var_list = dict(
    Amon=["pr", "uas", "vas"],
    Omon=["tos", "zos"],
)

# Every requested variable name in a single flat list (table order preserved).
flat_var_list = [_name for _names in var_list.values() for _name in _names]

# Models excluded from the selection even if they provide every variable.
ignore_models = {"AWI-CM-1-1-MR"}

# Catalog filter shared by every query below (DataFrame.query syntax).
common_query = "(activity_id == 'CMIP') & (experiment_id == 'historical')"

# One set of model names (source_id) per requested (table, variable) pair.
# `_k` and `_var` must keep these names: the query strings reference them
# through pandas' `@name` lookup.
var_models = []
for _k, _table_vars in var_list.items():
    for _var in _table_vars:
        # fmt: off
        _matches = col.query(
            f"{common_query} & "
            "(variable_id == @_var) & "
            "(table_id == @_k)"
        )
        # fmt: on
        var_models.append(set(_matches["source_id"].unique()))

# Keep only the models that appear in every per-variable set.
_shared = set(var_models[0])
for _other in var_models[1:]:
    _shared &= _other

models_list = sorted(_shared - ignore_models)
print(
    f"Found {len(models_list)} models with {var_list} monthly variables available\n\n{models_list}"
)

Found 40 models with {'Amon': ['pr', 'uas', 'vas'], 'Omon': ['tos', 'zos']} monthly variables available

['ACCESS-CM2', 'ACCESS-ESM1-5', 'AWI-ESM-1-1-LR', 'BCC-CSM2-MR', 'BCC-ESM1', 'CAMS-CSM1-0', 'CAS-ESM2-0', 'CMCC-CM2-HR4', 'CMCC-CM2-SR5', 'CMCC-ESM2', 'CNRM-CM6-1', 'CNRM-CM6-1-HR', 'CNRM-ESM2-1', 'CanESM5', 'CanESM5-CanOE', 'EC-Earth3', 'EC-Earth3-AerChem', 'EC-Earth3-CC', 'EC-Earth3-Veg', 'EC-Earth3-Veg-LR', 'FGOALS-f3-L', 'GFDL-CM4', 'GFDL-ESM4', 'GISS-E2-1-G', 'GISS-E2-1-G-CC', 'GISS-E2-1-H', 'HadGEM3-GC31-LL', 'HadGEM3-GC31-MM', 'INM-CM4-8', 'INM-CM5-0', 'IPSL-CM6A-LR', 'MIROC-ES2L', 'MIROC6', 'MPI-ESM-1-2-HAM', 'MPI-ESM1-2-HR', 'MPI-ESM1-2-LR', 'MRI-ESM2-0', 'NESM3', 'NorCPM1', 'UKESM1-0-LL']


In [9]:
# Split the sorted model list into chunks of `nelem` models and keep only the
# chunk selected by the run parameter `n`.
nelem = 5

# Derive the chunks from the actual list length instead of a hard-coded count,
# so no model is silently dropped (or empty chunk created) when the catalog
# yields a number of models other than 8 * nelem.
models_seq = [
    models_list[_start : _start + nelem]
    for _start in range(0, len(models_list), nelem)
]
print(len(models_seq))

# NOTE: this rebinds `models_list` to the selected chunk, so re-running this
# cell without first re-running the cell that builds the full list will chunk
# the already-reduced list.
models_list = models_seq[n]
models_list

8


['IPSL-CM6A-LR', 'MIROC-ES2L', 'MIROC6', 'MPI-ESM-1-2-HAM', 'MPI-ESM1-2-HR']

In [10]:
# For every selected model: pick up to three ensemble members that provide all
# requested variables, regrid each variable onto `target_rect_grid`, normalise
# the time axis, merge the variables and write one NetCDF file per member.

# Directory receiving one "<model>.<member>.nc" file per processed member.
OUT_DIR = "/glade/derecho/scratch/griverat/ics_CMIP6"
# Create it up front so a missing directory does not fail the first write
# after the expensive regridding has already been done.
os.makedirs(OUT_DIR, exist_ok=True)

# Depth levels onto which variables with a vertical coordinate are
# interpolated.
target_depth_levels = np.arange(0, 350, 5)
# Position of each 3-D variable on the vertical axis, as passed to xgcm.
grid_loc = dict(tos="center", thetao="center", vo="left", wo="left", uo="left")

# model name -> list of member ids that were written to disk.
model_container = {}
# Candidate names of the vertical coordinate; the first one found is used.
zlabels = ["lev", "level", "olevel", "deptht", "depthv", "depthu", "depthw"]
for model_name in models_list:
    print(f"Doing model: {model_name}")
    model_query = col.query(
        f"{common_query} & "
        "(variable_id in @flat_var_list) & "
        "(source_id == @model_name) & "
        "(table_id in @var_list.keys())"
    )

    # One catalog row per (member, variable): the first match is kept when a
    # variable is published more than once for the same member.
    member_groups = (
        model_query.groupby(["member_id", "variable_id"]).first().reset_index()
    )[model_query.columns]

    # Members that provide every requested variable.
    member_mask = member_groups.groupby("member_id").variable_id.count() == len(
        flat_var_list
    )
    member_mask = member_mask[member_mask].index.values

    if member_mask.size == 0:
        continue

    # Order members by the sum of their r/i/p/f indices and keep the first 3.
    member_mask = natsorted(
        member_mask,
        key=lambda x: sum(map(int, re.split("[ripf]", x)[1:])),
    )[:3]
    member_groups = member_groups.query("member_id in @member_mask")

    model_container[model_name] = []
    for _member in member_mask:
        print(f"\t- Doing {_member}")
        _sel_member = member_groups.query("member_id == @_member").sort_values(
            "variable_id", ascending=False
        )
        _var_datasets = []
        # Last vertical coordinate seen without missing values; reused for
        # later variables of this member whose own coordinate contains NaNs.
        _prev_lev = None
        # The variable name comes from the catalog row rather than from the
        # position of a path component inside the store URL.
        for _var_name, _zstore in zip(
            _sel_member.variable_id.values, _sel_member.zstore.values
        ):
            _var_data = xr.open_zarr(fsspec.get_mapper(_zstore), consolidated=True)
            try:
                _var_data = _var_data.rename(latitude="lat", longitude="lon")
            except ValueError:
                # Best effort: xarray raises ValueError when the source names
                # are absent or the new names already exist; keep the dataset
                # unchanged in that case.
                pass

            # Find the vertical coordinate, if this variable has one.
            zcoord = None
            for _zlabel in zlabels:
                if _var_data.get(_zlabel, None) is not None:
                    zcoord = _zlabel
                    break

            # Variables without a vertical coordinate skip the interpolation.
            if zcoord is not None:
                try:
                    units = _var_data[zcoord].attrs.get("units", None)
                    if units in ["centimeters", "cm"]:
                        # Centimetres -> metres (100 cm per metre).
                        _var_data[zcoord] = _var_data[zcoord] / 1e2
                    if _var_data[zcoord].isnull().sum() == 0:
                        _prev_lev = _var_data[zcoord]
                    if _prev_lev is not None and _var_data[zcoord].isnull().sum() > 0:
                        _var_data[zcoord] = _prev_lev.data
                    # Drop levels beyond 500 before interpolating.
                    _var_data = _var_data.sel(**{zcoord: slice(None, 500)})
                    grid = Grid(
                        _var_data,
                        coords={
                            "Z": {grid_loc[_var_name]: zcoord},
                        },
                        periodic=False,
                    )
                    _var_data = grid.transform(
                        _var_data[_var_name],
                        "Z",
                        target_depth_levels,
                        target_data=None,
                        method="linear",
                    ).rename({zcoord: "lev"})
                    _var_data.name = _var_name
                    _var_data = _var_data.to_dataset()
                except Exception as err:
                    # Still best effort (processing continues with the data as
                    # it stands), but the failure is reported instead of being
                    # silently discarded.
                    print(
                        f"\t\t! Vertical interpolation skipped for {_var_name}: {err!r}"
                    )

            # Horizontal regridding onto the intermediate rectilinear grid.
            regridder = xe.Regridder(
                _var_data.isel(time=0, drop=True),
                target_rect_grid,
                "bilinear",
                periodic=True,
                ignore_degenerate=True,
            )
            _var_data = regridder(_var_data)

            # Put every variable on the same calendar and stamp each time
            # step at day 15, 00:00:00 so the variables share one time axis.
            _var_data = _var_data.convert_calendar("standard", align_on="date")
            _var_data["time"] = (
                _var_data.indexes["time"]
                .to_series()
                .apply(
                    lambda x: x.replace(
                        day=15, hour=0, minute=0, second=0, microsecond=0
                    )
                )
            )
            _var_datasets.append(_var_data)

        _member_ds = xr.combine_by_coords(_var_datasets, compat="override")[
            flat_var_list
        ]
        # Replace the 2-D lat/lon fields by 1-D coordinates taken from their
        # first column/row, then name the dimensions lat/lon.
        _member_ds["y"] = _member_ds.lat[:, 0].data
        _member_ds["x"] = _member_ds.lon[0, :].data
        _member_ds = _member_ds.drop_vars(["lat", "lon"]).rename(y="lat", x="lon")
        _member_ds.to_netcdf(os.path.join(OUT_DIR, f"{model_name}.{_member}.nc"))
        model_container[model_name].append(_member)

Doing model: IPSL-CM6A-LR


	- Doing r1i1p1f1


	- Doing r2i1p1f1


	- Doing r3i1p1f1


Doing model: MIROC-ES2L


	- Doing r1i1p1f2


	- Doing r2i1p1f2


	- Doing r3i1p1f2


Doing model: MIROC6


	- Doing r1i1p1f1


	- Doing r2i1p1f1


	- Doing r3i1p1f1


Doing model: MPI-ESM-1-2-HAM


	- Doing r1i1p1f1


	- Doing r2i1p1f1


	- Doing r3i1p1f1


Doing model: MPI-ESM1-2-HR


	- Doing r1i1p1f1


	- Doing r2i1p1f1


	- Doing r3i1p1f1


In [11]:
# Show, for each processed model, the member ids whose files were written by
# the loop above.
model_container

{'IPSL-CM6A-LR': ['r1i1p1f1', 'r2i1p1f1', 'r3i1p1f1'],
 'MIROC-ES2L': ['r1i1p1f2', 'r2i1p1f2', 'r3i1p1f2'],
 'MIROC6': ['r1i1p1f1', 'r2i1p1f1', 'r3i1p1f1'],
 'MPI-ESM-1-2-HAM': ['r1i1p1f1', 'r2i1p1f1', 'r3i1p1f1'],
 'MPI-ESM1-2-HR': ['r1i1p1f1', 'r2i1p1f1', 'r3i1p1f1']}