In [None]:
from pathlib import Path
import json
from functools import partial 

import xarray as xr
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter


# `snakemake` is not defined anywhere in this notebook; it is presumably
# injected by Snakemake's notebook integration when the rule runs -- TODO confirm.
SNAKEMAKE = snakemake
# Short aliases for the rule's inputs, outputs and configuration, used by the
# cells below.
inputs = SNAKEMAKE.input
outputs = SNAKEMAKE.output
config = SNAKEMAKE.config
plt_cfg = config["plotting"]
# Wildcards selecting which data partition and which experiment are evaluated.
partition = SNAKEMAKE.wildcards.partition
experiment = SNAKEMAKE.wildcards.experiment

# Apply the matplotlib rcParams overrides stored in the plotting configuration.
plt.rcParams.update(plt_cfg["rcparams"])


# Names of the five variables handled by this notebook; they match the keys
# read from `pred` when the physical penalty is computed.
# NOTE(review): TASKS itself is not referenced in the visible cells -- confirm
# whether it is still needed.
TASKS = [
    "air_temperature",
    "dew_point_temperature",
    "surface_air_pressure",
    "relative_humidity",
    "water_vapor_mixing_ratio",
]


In [None]:
# The rule's first output is treated as a directory; the figures produced
# further down are saved into it.
out_dir = Path(outputs[0])
out_dir.mkdir(parents=True, exist_ok=True)
# Echo the run context (partition and experiment settings) into the notebook
# output so an executed copy documents what it was run with.
print(f"Partition: {partition} \n")
print("Experiment configuration: \n")
exp_config = config["experiments"][experiment]
print(json.dumps(exp_config, indent=4))



In [None]:
def preprocess_predictions(ds, reftimes=None, params_as_dims="all"):
    """Tag one prediction dataset with the run parameters encoded in its path.

    The file path is read from ``ds.encoding["source"]`` and interpreted as
    follows (counting directories upwards from the file):

    * the file's own directory name is a ``-`` separated list of ``key~value``
      tokens (hyper-parameters, values are kept as strings);
    * ``parents[2]`` is the approach name;
    * ``parents[3]`` and ``parents[4]`` look like ``<label>~<int>`` and provide
      the integer ``split`` and ``seed``.

    Every retained parameter becomes a scalar coordinate that is then expanded
    into a length-1 dimension, so files from different runs can be combined
    along those dimensions.

    Parameters
    ----------
    ds : dataset-like
        Dataset as handed over by ``xr.open_mfdataset(preprocess=...)``.
    reftimes : optional
        When given, ``ds`` is reindexed on ``forecast_reference_time`` to these
        labels and loaded into memory.
    params_as_dims : "all" or collection of str
        ``"all"`` keeps every ``key~value`` token of the directory name. A
        list, tuple, set or frozenset keeps only the listed keys. Any other
        value keeps none of them (``approach``, ``split`` and ``seed`` are
        always added).

    Returns
    -------
    The dataset with the extra coordinates/dimensions attached.

    Raises
    ------
    ValueError
        If a token of the directory name does not contain exactly one ``~``.
    """
    path = Path(ds.encoding["source"])

    keep_all = params_as_dims == "all"
    # Previously only a ``list`` was honoured, so a tuple or set of names was
    # silently ignored and no hyper-parameter dimension was created at all.
    if not keep_all and isinstance(params_as_dims, (list, tuple, set, frozenset)):
        wanted = params_as_dims
    else:
        wanted = ()

    params = {}
    for kv in path.parent.name.split("-"):
        k, v = kv.split("~")
        if keep_all or k in wanted:
            params[k] = v

    params["approach"] = path.parents[2].name
    params["split"] = int(path.parents[3].name.split("~")[1])
    params["seed"] = int(path.parents[4].name.split("~")[1])

    dims = list(params.keys())
    ds = ds.assign_coords(params).expand_dims(dims)
    if reftimes is not None:
        ds = ds.reindex(forecast_reference_time=reftimes).load()
    return ds

def ds_to_df(ds, name):
    """Flatten a dataset into a long-format DataFrame.

    Non-index coordinates are dropped, the data variables are stacked along a
    new ``"variable"`` dimension, the values are written to a column called
    ``name`` and the resulting index levels are turned into ordinary columns.
    """
    stacked = ds.reset_coords(drop=True).to_array("variable")
    frame = stacked.to_dataframe(name)
    return frame.reset_index()

def unstack(ds):
    """Turn a flat sample axis into separate dimensions.

    The values of ``forecast_reference_time``, ``t`` and ``station`` are
    combined into a ``pandas.MultiIndex`` that is attached as coordinate ``s``
    and then unstacked, so each of the three becomes a dimension of its own.
    All other non-index coordinates are dropped.
    """
    sample_dims = ["forecast_reference_time", "t", "station"]

    # Read the key arrays first: they must be taken from the original dataset,
    # before its coordinates are dropped below.
    level_values = []
    for dim in sample_dims:
        level_values.append(ds[dim].values)
    sample_index = pd.MultiIndex.from_arrays(level_values, names=sample_dims)

    bare = ds.reset_coords(drop=True)
    indexed = bare.assign_coords(s=sample_index)
    return indexed.unstack("s")

def remove_source_prefix(ds):
    """Strip the ``<source>:`` prefix from every data variable name.

    A name such as ``"src:air_temperature"`` becomes ``"air_temperature"``.
    Only the part up to the first ``":"`` is removed, and names without a
    ``":"`` are left untouched (previously both cases failed with an opaque
    tuple-unpacking error). All renames are applied in a single ``rename``
    call instead of creating one intermediate dataset per variable.

    Parameters
    ----------
    ds : dataset-like
        Object exposing ``data_vars`` (iterable of str) and ``rename(mapping)``.

    Returns
    -------
    The renamed dataset, or ``ds`` itself when there is nothing to rename.

    Raises
    ------
    ValueError
        If two variables would end up with the same name.
    """
    renames = {}
    taken = set()
    for var in ds.data_vars:
        bare = var.split(":", 1)[1] if ":" in var else var
        if bare in taken:
            # Fail early with a readable message rather than letting the
            # rename overwrite or reject the clashing variable.
            raise ValueError(
                f"Removing the source prefix maps more than one variable to {bare!r}."
            )
        taken.add(bare)
        if bare != var:
            renames[var] = bare
    return ds.rename(renames) if renames else ds

In [None]:

# Observations are prepared the same way for every partition: the flat sample
# axis is unstacked and the "<source>:" prefixes are removed from the names.
obs = remove_source_prefix(unstack(xr.load_dataset(inputs["y"])))

if partition in ["train", "val"]:
    # Align every prediction file on the observed reference times while it is
    # being opened, then drop the reference times whose values are all NaN.
    reftimes = obs.forecast_reference_time
    pp_fn = partial(preprocess_predictions, reftimes=reftimes)
    pred = xr.open_mfdataset(inputs["predictions"], preprocess=pp_fn, parallel=True)
    # `how` is passed by keyword: recent xarray releases declare it
    # keyword-only, and the keyword form is accepted by older ones as well.
    pred = pred.dropna("forecast_reference_time", how="all")
elif partition == "test":
    # Only the training-data fraction is kept as a hyper-parameter dimension.
    pp_fn = partial(preprocess_predictions, params_as_dims=["data.reduction"])
    pred = xr.open_mfdataset(inputs["predictions"], preprocess=pp_fn, parallel=True)
else:
    # Fail here instead of with a NameError on `pred` further down.
    raise ValueError(
        f"Unsupported partition {partition!r}; expected 'train', 'val' or 'test'."
    )

# Put observations on the prediction grid, remove length-1 dimensions and
# broadcast both to a common shape so the error is a plain difference.
obs = obs.reindex_like(pred).load().chunk({"forecast_reference_time": 200})
pred = pred.squeeze().load().chunk({"forecast_reference_time": 200})
obs = obs.squeeze()
obs, pred = xr.broadcast(obs, pred)
err = pred - obs

In [None]:
# Mean absolute error of each variable, averaged over the three sample
# dimensions.
abs_err = abs(err)
mae = abs_err.mean(["forecast_reference_time", "t", "station"])

# Normalise each variable's MAE by the standard deviation of its observations,
# stack the variables along "var" and average them into one aggregated score.
obs_std = obs.std()
reduce_dims = ["var"]
nmae = (
    (mae / obs_std)
    .to_array("var")
    .mean(reduce_dims)
    .compute()
)

In [None]:
fig, ax = plt.subplots(1, figsize=(6,5))

# Long-format table for seaborn: one row per remaining coordinate combination,
# with display names for the approaches and readable column headers.
nmae_ = nmae
column_labels = {"approach": "Approach", "data.reduction": "Fraction of training data"}
df = nmae_.to_dataframe(name="Aggregated NMAE").reset_index()
df = df.replace(plt_cfg["approach_names"])
df = df.rename(columns=column_labels)

# One box per (training fraction, approach); hue order follows the order of
# the configured display names so it lines up with the configured colors.
approach_order = list(plt_cfg["approach_names"].values())
sns.boxplot(
    data=df,
    x="Fraction of training data",
    y="Aggregated NMAE",
    hue="Approach",
    hue_order=approach_order,
    palette=plt_cfg["approach_colors"],
    showfliers=False,
    ax=ax,
)

# Legend stretched over a box given in figure coordinates, near the top of
# the figure.
lgd = ax.legend(
    loc="lower left",
    bbox_to_anchor=(0.19, 0.88, 0.76, 0.1),
    bbox_transform=fig.transFigure,
    ncol=2,
    mode="expand",
    borderaxespad=0.,
    frameon=False,
    fontsize=11,
)

fig.tight_layout()
fig.savefig(out_dir / "MAE_vs_reduction.png")

In [None]:
def rh_from_t_td(t, t_d):
    """Relative humidity (in percent) from temperature and dew point.

    Evaluates ``100 * exp(a*t_d/(b+t_d) - a*t/(b+t))`` with one coefficient
    pair ``(a, b)`` where ``t >= 0`` and another where ``t < 0``. The branch is
    chosen from ``t`` only, never from ``t_d``.

    NOTE(review): the coefficients look like a Magnus-type approximation with
    temperatures in degrees Celsius (presumably over water / over ice) --
    confirm against the reference the constants were taken from.
    """

    def _ratio(a, b):
        # Same expression for both branches, only the coefficients differ.
        return 100 * np.exp((a * t_d) / (b + t_d) - (a * t) / (b + t))

    non_negative = _ratio(17.368, 238.83)
    negative = _ratio(17.856, 245.52)
    return xr.where(t >= 0, non_negative, negative)

def e_from_t_rh(t, rh):
    """Vapor pressure from temperature and relative humidity (in percent).

    Computes ``rh / 100 * e0 * exp(a*t / (b+t))`` using one coefficient set
    ``(e0, a, b)`` where ``t >= 0`` and another where ``t < 0``.

    NOTE(review): units are presumably degrees Celsius for ``t`` and hPa for
    the result (Magnus-type coefficients) -- confirm.
    """

    def _vapor_pressure(e0, a, b):
        return rh / 100 * e0 * np.exp((a * t) / (b + t))

    non_negative = _vapor_pressure(6.107, 17.368, 238.83)
    negative = _vapor_pressure(6.108, 17.856, 245.52)
    return xr.where(t >= 0, non_negative, negative)


def td_from_e_t(e, t):
    """Dew point from vapor pressure, with the coefficient set chosen by ``t``.

    Evaluates ``-b * ln(e / e0) / (ln(e / e0) - a)`` with one coefficient set
    ``(e0, a, b)`` where ``t >= 0`` and another where ``t < 0``.

    NOTE(review): presumably ``e`` in hPa and temperatures in degrees Celsius
    (Magnus-type coefficients) -- confirm.
    """

    def _dew_point(e0, a, b):
        log_ratio = np.log(e / e0)
        return -b * log_ratio / (log_ratio - a)

    non_negative = _dew_point(6.107, 17.368, 238.83)
    negative = _dew_point(6.108, 17.856, 245.52)
    return xr.where(t >= 0.0, non_negative, negative)

def r_from_e_p(e, p):
    """Mixing ratio from vapor pressure ``e`` and total pressure ``p``.

    Returns ``622.0 * e / (p - e)``; ``e`` and ``p`` must share the same unit.

    NOTE(review): the factor 622.0 presumably yields g/kg (1000 x 0.622) --
    confirm.
    """
    remaining_pressure = p - e
    return 622.0 * (e / remaining_pressure)

In [None]:
# Predicted fields that enter the consistency checks.
t = pred["air_temperature"]
t_d = pred["dew_point_temperature"]
p = pred["surface_air_pressure"]
rh = pred["relative_humidity"]
r = pred["water_vapor_mixing_ratio"]

# Relative humidity implied by predicted temperature and dew point, compared
# with the directly predicted relative humidity (squared difference).
rh_derived = rh_from_t_td(t, t_d)
rh_residual = (rh_derived - rh) ** 2

# Mixing ratio implied by predicted temperature, humidity and pressure,
# compared with the directly predicted mixing ratio (squared difference).
e = e_from_t_rh(t, rh)
r_derived = r_from_e_p(e, p)
r_residual = (r_derived - r) ** 2

# Each squared residual is scaled by the variance of the matching observed
# variable before the two terms are added up.
rh_term = abs(rh_residual) / obs.relative_humidity.var()
r_term = abs(r_residual) / obs.water_vapor_mixing_ratio.var()
physical_penalty = rh_term + r_term

In [None]:

# Mean physical penalty of the two compared approaches, averaged over the
# three sample dimensions.
# NOTE(review): the name `nmae_` is reused from the NMAE plot although it
# holds the penalty here.
compared_approaches = ["unconstrained", "loss_constrained"]
sample_dims = ["forecast_reference_time", "t", "station"]
nmae_ = physical_penalty.sel(approach=compared_approaches).mean(sample_dims)

# Long-format table with display names and readable column headers.
penalty_label = r"Physical penalty $\mathcal{P}$"
df = nmae_.to_dataframe(name=penalty_label).reset_index()
df = df.replace(plt_cfg["approach_names"])
df = df.rename(
    columns={
        "approach": "Approach",
        "data.reduction": "Fraction of training data",
    }
)



In [None]:
fig, axs = plt.subplots(1,2, figsize=(6,5), layout="constrained")

# Position of each approach's color in plt_cfg["approach_colors"].
# NOTE(review): the indices 0 and 2 are hard-coded -- presumably they match
# the order of the configured approaches; verify against the plotting config.
color_index = {"Unconstrained": 0, "Loss constrained": 2}

# One panel per approach, each with its own y-axis.
for ax, (approach, c_idx) in zip(axs, color_index.items()):
    c = plt_cfg["approach_colors"][c_idx]
    sns.boxplot(
        data=df[df["Approach"] == approach],
        x="Fraction of training data",
        y=r"Physical penalty $\mathcal{P}$",
        ax=ax,
        color=c,
        showfliers=False,
        width=0.5,
    )
    ax.set_title(approach, fontsize=12)

# The y-label is shown on the left panel only.
axs[1].set(ylabel="")

fig.savefig(out_dir / "P_vs_reduction.png")