# UQ metrics

In [28]:
%matplotlib inline
import rasterio, yaml, os
import numpy as np
from pathlib import Path
import scipy
import matplotlib.pyplot as plt
from typing import *

PREDICTIONS_DIR = Path("results/dev/2023-03-14_15-45-23")
PKL_DIR = Path('data/pkl/2021-05-18_10-57-45')
GT_DIR = Path('data/preprocessed')
NUM_VARS = 5
VARIABLES = ['P95', 'MeanH', 'Dens', 'Gini', 'Cover']

EAST = ['346', '9', '341', '354', '415', '418', '416', '429', '439', '560', '472', '521', '498',
        '522', '564', '764', '781', '825', '796', '805', '827', '891', '835', '920', '959', '1023', '998',
        '527', '477', '542', '471']
WEST = ['528', '537', '792', '988', '769']
NORTH = ['819', '909', '896']
ALL = EAST + WEST + NORTH

with (PKL_DIR / 'stats.yaml').open() as fh:
    # load training set statistics for data normalization
    stats = yaml.safe_load(fh)
    labels_mean = np.array(stats['labels_mean'])

projects = [f.stem.split("_")[0] for f in PREDICTIONS_DIR.glob('*_mean.tif') if f.stem.split("_")[0] in ALL]

## Quantitative metrics

Let $\mathcal{D}=\left\{(\mathbf{x}_i, \mathbf{y}_i) \in \mathcal{X}\times\mathcal{Y}\right\}_{i=1,\ldots,N}$ be the test set and $\mathcal{P}=\left\{(\hat\mu_ i, \hat\sigma_ i^2)  \in \mathcal{X}\times\mathcal{Y}\right\}_{i=1,\ldots,N}$ be the corresponding pixel-wise predicted mean and variance.

In [55]:
def UCE(variance, mean, gt, n_bins):
    """
    Compute UCE as defined by Laves et al., Recalibration of Aleatoric and Epistemic Regression Uncertainty in Medical Imaging,
    arXiv:2104.12376v1, 2021

    Args:
    - variance (np.ndarray[d, n]): predicted variance for each variable (d) and each pixel in the dataset (n)
    - mean (np.ndarray[d, n]): predicted mean for each variable (d) and each pixel in the dataset (n)
    - gt (np.ndarray[d, n]): ground truth for each variable (d) and each pixel in the dataset (n)
    - n_bins (int): number of bins

    Returns:
    - uce (np.ndarray[d]): UCE of each variables
    - mean_mses (np.ndarray[n_bind]): mean MSE in each bins
    - mean_vars (np.ndarray[n_bind]): mean variance in each bins
    - prop_in_bins (np.ndarray[n_bind]): proportion of the dataset in each bins
    """
    d = gt.shape[0]
    # Compute UCE for each variables
    uce = np.empty((d,))
    prop_in_bins = np.empty((d, n_bins))
    mean_mses = np.empty((d, n_bins))
    mean_vars = np.empty((d, n_bins))
    for i, (var, mu, tgt) in enumerate(zip(variance, mean, gt)):
        # Linear binning
        bins = np.linspace(var.min(), var.max(), n_bins)
        # Get variance bin indexes
        bins_ids = np.digitize(var, bins=bins)
        # Loop on bins to compute statistics
        _uce = 0
        for bin_id in np.unique(bins_ids)-1:
            # Select bin
            pos = bins_ids==bin_id+1
            prop_in_bin = pos.astype("float").mean() # bin_size / N
            bin_var = var[pos]
            bin_mean = mu[pos]
            bin_tgt = tgt[pos]
            # Compute stats
            mean_var = bin_var.mean()
            mean_mse = ((bin_mean-bin_tgt)**2).mean()
            _uce += prop_in_bin * np.abs(mean_var- mean_mse)
            # keep result
            prop_in_bins[i,bin_id] = prop_in_bin
            mean_mses[i,bin_id] = mean_mse 
            mean_vars[i,bin_id] = mean_var
        uce[i] = _uce
    return uce, mean_mses, mean_vars, prop_in_bins

def ENCE(variance, mean, gt, n_bins):
    """
    Compute ENCE as defined by Levi et al., Evaluating and Calibrating Uncertainty Prediction in Regression Tasks,
    arXiv:1905.11659v3, 2023

    Args:
    - variance (np.ndarray[d, n]): predicted variance for each variable (d) and each pixel in the dataset (n)
    - mean (np.ndarray[d, n]): predicted mean for each variable (d) and each pixel in the dataset (n)
    - gt (np.ndarray[d, n]): ground truth for each variable (d) and each pixel in the dataset (n)
    - n_bins (int): number of bins

    Returns:
    - ence (np.ndarray[d]): UCE of each variables
    - bins_mean_rmse (np.ndarray[n_bind]): averager mse in each bins
    - bins_mean_std (np.ndarray[n_bind]): average std in each bins
    - bins_proportions (np.ndarray[n_bind]): proportion of the dataset in each bins
    """
    _, mean_mses, mean_vars, bins_proportions = UCE(variance, mean, gt, n_bins)
    mean_rmses, mean_stds = np.sqrt(mean_mses), np.sqrt(mean_vars)
    ence = (np.abs(mean_stds-mean_rmses) / mean_stds).mean(1)
    return ence, mean_rmses, mean_stds, bins_proportions

def UCE_p(variance, mean, gt, n_bins):
    uce, mean_mses, mean_vars, prop_in_bins = UCE(variance, mean, gt, n_bins)
    deltas = np.abs(mean_mses-mean_vars).max(1)
    uce_p = uce / deltas
    return uce_p, mean_mses, mean_vars, prop_in_bins

def ENCE_p(variance, mean, gt, n_bins):
    M = n_bins
    N = variance.shape[1]
    ence, mean_rmses, mean_stds, bins_proportions = ENCE(variance, mean, gt, n_bins)
    phis = (np.abs(mean_rmses-mean_stds)/mean_stds).max(1)
    uce_p = N/(M*phis)*ence
    return uce_p, mean_rmses, mean_stds, bins_proportions

### 1. Uncertainty Calibration Error (UCE) [Laves 2020, Laves 2021, Levi 2019, Becker 2023] and Expected Normalized Calibration Error (ENCE) [Levi 2019, Zhou 2021a]

Those metrics measure how close are the mean variance (or std) and the empirical MSE (or RMSE). As these two quantities are equal, they should be minimized $\text{UCE}(\downarrow)$ and $\text{ENCE}(\downarrow)$. 

Let $B_k$, $k=1,\ldots,M$ indicate the set of data index falling the the $k$-th bin of variance, i.e 
$$
B_k=\left\{
    i\in\{1,\ldots,N\}: \sigma_i^2 \in \left[
        \frac{\sigma_{max}^2-\sigma_{min}^2}{M}k, \frac{\sigma_{max}^2-\sigma_{min}^2}{M}(k+1)
    \right[
\right\}.
$$
We define the empirical variance in bin $k$, $\bar\delta_k^2$, as the empirical MSE in the bin and the mean variance in bin $k$, $\bar\sigma_k^2$, as the average predicted variance:
$$
\begin{align}
    \bar\delta_k^2 &= \frac{1}{|B_k|}\sum_{i\in B_k}(\hat\mu_i-\mathbf{y}_i)^2 \\
    \bar\sigma_k^2 &= \frac{1}{|B_k|}\sum_{i\in B_k}\sigma_i^2
\end{align}
$$
Then, [Laves 2021] and [Levi 2019] define UCE and ENCE, respectively, as:
$$
\begin{align}
    \text{UCE} &= \sum_{k=1}^{M}\frac{|B_k|}{N}|\bar\sigma_k^2-\bar\delta_k^2| \\
    \text{ENCE} &=  \frac{1}{N}\sum_{k=1}^{M}\frac{|\bar\sigma_k-\bar\delta_k|}{\bar\sigma_k} \\
\end{align}
$$

### Normalized UCE ($\text{UCE\%}$)
Let $\Delta_k=|\bar\sigma_k^2-\bar\delta_k^2|$ and $\Delta_{min}$, $\Delta_{max}$ be the minimum and maximum values taken by that variable across bins. Then we have
$$
\begin{align}
         0 &\leq \Delta_{min} &\leq \text{UCE} &\leq \Delta_{max} \\
   \iff  0  &\leq \frac{\Delta_{min}}{\Delta_{max}} &\leq \frac{\text{UCE}}{\Delta_{max}} &\leq 1
\end{align}
$$
Considering that, we propose to report $\text{UCE\%}=\frac{1}{\Delta_{max}}\text{UCE}$, a normalized version of $\text{UCE}$ that is bounded in $[0,1]$ for easier comparison across variables

### Normalized ENCE ($\text{ENCE\%}$)
Let $\Phi_k = \frac{|\bar\sigma_k-\bar\delta_k|}{\bar\sigma_k}$ and $\Phi_{min}$, $\Phi_{max}$ be the minimum and maximum values taken by that variable across bins. Then we have,
$$
\begin{align}
            0 &\leq \frac{M}{N}\Phi_{min} &\leq \text{ENCE} &\leq \frac{M}{N}\Phi_{max} \\
      \iff  0 &\leq \frac{\Phi_{min}}{\Phi_{max}} &\leq \frac{N}{M}\frac{\text{ENCE}}{\Phi_{max}} &\leq 1
\end{align}
$$

Considering that, we propose to report $\text{ENCE\%}=\frac{N}{M\Phi_{max}}\text{ENCE}$, a normalized version of $\text{ENCE}$ that is bounded in $[0,1]$ for easier comparison across variables

In [56]:
# Usage example
means, variances, gts = [], [], []
for project in WEST:
    try:
        mean_file = os.path.join(PREDICTIONS_DIR, f"{project}_mean.tif")
        with rasterio.open(mean_file) as fh:
            mean = fh.read(fh.indexes)
        with rasterio.open(PREDICTIONS_DIR / (project + '_variance.tif')) as fh:
            variance = fh.read(fh.indexes)
        with rasterio.open(GT_DIR / (project + '.tif')) as fh:
            gt = fh.read(fh.indexes)
            gt_mask = fh.read_masks(1).astype(bool)
    except:
        continue
    mask = ~np.isnan(mean).all(0)
    means.append(mean[:,mask]) 
    variances.append(variance[:,mask]) 
    gts.append(gt[:,mask]) 
means = np.concatenate(means, axis=1)
variances = np.concatenate(variances, axis=1)
gts = np.concatenate(gts, axis=1)
var_uce = UCE(variances, means, gts, 10)[0]
var_ence = ENCE(variances, means, gts, 10)[0]
var_uce_p = UCE_p(variances, means, gts, 10)[0]
var_ence_p = ENCE_p(variances, means, gts, 10)[0]
for i, var_name in enumerate(VARIABLES):
    print(f"{var_name}: UCE={var_uce[i]:.5f}, UCE_p={var_uce_p[i]:.5f}, ENCE={var_ence[i]:.5f}, ENCE_p={var_ence_p[i]:.5f}")

P95: UCE=3.15441, UCE_p=0.01153, ENCE=0.30003, ENCE_p=45352.27242
MeanH: UCE=1.52845, UCE_p=0.01255, ENCE=0.40573, ENCE_p=76227.82657
Dens: UCE=3020.13470, UCE_p=0.36394, ENCE=306.36776, ENCE_p=58437.30030
Gini: UCE=0.00113, UCE_p=0.03220, ENCE=0.22589, ENCE_p=43797.83451
Cover: UCE=6519.76348, UCE_p=0.65878, ENCE=315.67318, ENCE_p=44061.70232


## 2. STDs Coefficient of variation ($C_v$) [Levi 2019]

STDs measures the dispersion of the predicted uncertainties. As ENCE and UCE can be one in some trivial cases (i.e. the variance is always the same and matches the error, then both are zero but the uncertainties are not useful), $C_v$ approximates the usefulness of the estimates, it should be maximized, $C_v(\uparrow)$.

As proposed by [Levi 2019], let $\mu_\sigma=\frac{1}{N}\sum_{i=1}^{N}\hat\sigma_i$ be the average standard deviation. Thens $C_v$ is defined as:
$$
\begin{equation}
    C_v = \frac{1}{\mu_\sigma}\sqrt{\frac{1}{N-1}\sum_{i=1}^{N}(\hat\sigma_i-\mu_\sigma)^2}
\end{equation}

In [1]:
x = np.random.randn((10, 25))
x[0].sum(1).shape

NameError: name 'np' is not defined

## 3. Sharpness [Kuleshov 2018, Upadhyay 2023]

Inverse idea than $C_v$. 

In [11]:
x = list(np.linspace(0, 10, 11))
I = [(a,b) for a,b in zip(x[:-1],x[1:])]
I

[(0.0, 1.0),
 (1.0, 2.0),
 (2.0, 3.0),
 (3.0, 4.0),
 (4.0, 5.0),
 (5.0, 6.0),
 (6.0, 7.0),
 (7.0, 8.0),
 (8.0, 9.0),
 (9.0, 10.0)]

In [12]:
mn = 0
mx = 10
K = 10
[((mx-mn)*i/K, (mx-mn)*(i+1)/K) for i in range(K)]

[(0.0, 1.0),
 (1.0, 2.0),
 (2.0, 3.0),
 (3.0, 4.0),
 (4.0, 5.0),
 (5.0, 6.0),
 (6.0, 7.0),
 (7.0, 8.0),
 (8.0, 9.0),
 (9.0, 10.0)]