In [None]:
# Stretch the notebook container to the full browser width so wide figures
# and tables are not clipped.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import json
import os
import re
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.table as tbl
from matplotlib.font_manager import FontProperties
import itertools as itert
import numpy as np
import seaborn as sbn
import gzip

# Identifiers recognised by the loaders and plotting helpers in this notebook.
available_datasets = {
    'toy', 'toy_noise', 'toy_hf', 'toy_modulated', 'toy_uniform', 'toy_noise_strong',
    'yacht', 'diabetes', 'boston', 'energy', 'concrete', 'wine_red', 'abalone',
    'naval', 'power', 'california', 'superconduct', 'protein', 'year',
}
available_methods = {'de', 'pu', 'mc_mod_sml', 'mc_ll', 'mc'}
available_splits = {
    'random_folds', 'single_random_split',
    'single_label_split', 'label_folds',
    'single_pca_split', 'pca_folds',
}

# Size per dataset; plot_metrics prints it (in thousands) below each dataset tick label.
dataset_to_size = dict([
    ('boston', 506), ('wine_red', 1599), ('concrete', 1030), ('toy_noise', 10000),
    ('abalone', 4176), ('energy', 768), ('year', 515345), ('protein', 45730),
    ('california', 20640), ('superconduct', 21263), ('diabetes', 442), ('naval', 11934),
    ('power', 9568), ('yacht', 308), ('toy', 1000), ('toy_hf', 1000),
    ('toy_noise_strong', 20000), ('toy_uniform', 20000), ('toy_modulated', 20000),
])


# File Reader

In [None]:
def get_dir_files(exp_dir, dataset_id):
    """Index the result files of one dataset by kind, split mode and fold index.

    Every entry of ``<exp_dir>/<dataset_id>`` is matched (from its start)
    against ``[<kind>_]dataset=<id>_splitmode=<split>_foldidx=<n>``. Entries
    without a ``<kind>_`` prefix are filed under ``'plots'``. Entries that do
    not match the pattern at all are ignored silently.

    Parameters
    ----------
    exp_dir : str
        Directory that contains one sub-directory per dataset.
    dataset_id : str
        Name of the dataset sub-directory to scan.

    Returns
    -------
    dict
        ``result[kind][split][fold_idx]`` is the path of the matching file.
        ``kind`` is one of 'plots', 'method_dict', 'global_stats', 'model';
        every entry of ``available_splits`` is present as ``split`` (possibly
        mapping to an empty dict).
    """
    dataset_dir = '%s/%s' % (exp_dir, dataset_id)
    dir_files = {kind: {split: {} for split in available_splits}
                 for kind in ('plots', 'method_dict', 'global_stats', 'model')}

    file_matcher = re.compile(r'((\w+)_|)dataset=(\w+)_splitmode=(\w+)_foldidx=(\d+)')
    for dir_file in os.listdir(dataset_dir):
        matches = file_matcher.match(dir_file)
        if matches is None:
            continue

        _, prefix, file_dataset, split, fold_idx = matches.groups()
        kind = prefix if prefix is not None else 'plots'

        # An unknown kind prefix used to raise KeyError below; it is now
        # reported like any other unexpected file name.
        if file_dataset != dataset_id or split not in available_splits or kind not in dir_files:
            print("Warning. File %s has unexpected form" % dir_file)
            continue

        dir_files[kind][split][int(fold_idx)] = '%s/%s' % (dataset_dir, dir_file)

    return dir_files
    

def _load_json_file(file):
    """Parse one result file: ``.json`` is read as text, ``.json.zip`` through gzip."""
    if file.endswith('.json'):
        with open(file) as f:
            return json.load(f)
    if file.endswith('.json.zip'):
        # Despite the extension these files are read with gzip, as before.
        with gzip.open(file) as f:
            return json.load(f)
    raise Exception("File has to be .json or .json.zip, but is %s" % file)


def _read_frame(payload):
    """Turn one serialized frame into a DataFrame via ``pd.read_json``.

    Literal JSON text is wrapped in a ``StringIO`` because passing a raw JSON
    string to ``pd.read_json`` is deprecated in recent pandas versions. Any
    other payload is handed to pandas unchanged.
    """
    from io import StringIO

    # presumably each payload is the output of DataFrame.to_json - confirm against the writer
    if isinstance(payload, str) and payload.lstrip()[:1] in ('{', '['):
        return pd.read_json(StringIO(payload))
    return pd.read_json(payload)


def load_global_stats(dir_files, splitmode):
    """Load the global-stats file of every fold of ``splitmode``.

    Parameters
    ----------
    dir_files : dict
        Index as returned by ``get_dir_files``.
    splitmode : str
        Key into ``dir_files['global_stats']``.

    Returns
    -------
    list
        One parsed JSON object per fold, ordered by ascending fold index.

    Raises
    ------
    Exception
        If a file is neither ``.json`` nor ``.json.zip``.
    """
    files_by_fold = dir_files['global_stats'][splitmode]
    return [_load_json_file(files_by_fold[fold_idx]) for fold_idx in sorted(files_by_fold)]


def load_method_dict(dir_files, splitmode):
    """Load the per-method prediction frames of every fold of ``splitmode``.

    Each file holds a JSON object whose values are two-element sequences of
    serialized frames; index 0 is loaded as the train frame and index 1 as the
    test frame.

    Parameters
    ----------
    dir_files : dict
        Index as returned by ``get_dir_files``.
    splitmode : str
        Key into ``dir_files['method_dict']``.

    Returns
    -------
    list of dict
        Per fold (ascending fold index) a dict ``key -> [df_train, df_test]``.

    Raises
    ------
    Exception
        If a file is neither ``.json`` nor ``.json.zip``.
    """
    files_by_fold = dir_files['method_dict'][splitmode]
    res = []
    for fold_idx in sorted(files_by_fold):
        method_dict_json = _load_json_file(files_by_fold[fold_idx])
        res.append({key: [_read_frame(frames[0]), _read_frame(frames[1])]
                    for key, frames in method_dict_json.items()})
    return res
            

# Tables, Plots, ..

In [None]:
# Global matplotlib style: large fonts and thick lines/ticks.
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 6, 20, 25

plt.rc('font', size=BIGGER_SIZE)                                            # default text size
plt.rc('axes', titlesize=BIGGER_SIZE, labelsize=BIGGER_SIZE, linewidth=5)   # axes title, x/y labels, frame
plt.rc(('xtick', 'ytick'), labelsize=BIGGER_SIZE)                           # tick label size
plt.rc(('xtick.major', 'xtick.minor', 'ytick.major', 'ytick.minor'), width=5, size=10)
plt.rc('legend', fontsize=MEDIUM_SIZE)                                      # legend font size
plt.rc('figure', titlesize=BIGGER_SIZE)                                     # figure title size
plt.rc('lines', linewidth=5)

# Residual vs. std (1/3-sigma plot)

In [None]:
import scipy.stats as spst
from scipy.interpolate import interp1d

def plot_densitymap(x, y, ax):
    """Draw a Gaussian-KDE heat map of the sample ``(x, y)`` with the raw points on top.

    Parameters
    ----------
    x, y : array-like with ``.min()`` / ``.max()``
        Coordinates of the sample points (same length).
    ax : matplotlib axes
        Axes to draw into.
    """
    xmin, xmax = x.min(), x.max()
    ymin, ymax = y.min(), y.max()

    # Evaluate the density on a 100 x 100 grid spanning the data range.
    x_range, y_range = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
    positions = np.vstack([x_range.ravel(), y_range.ravel()])
    kernel = spst.gaussian_kde(np.vstack([x, y]))
    density = np.reshape(kernel(positions).T, x_range.shape)

    # The grid covers [ymin, ymax], so the image extent has to start at ymin
    # (it used to start at 0, which shifted the heat map against the points).
    ax.imshow(np.rot90(density), cmap=plt.cm.gist_heat_r, extent=[xmin, xmax, ymin, ymax], aspect='auto')
    ax.plot(x, y, 'k.', markersize=1, alpha=0.1)


In [None]:
def show_sigmaplots(exp_dirs, datasets=available_datasets, methods=['mc', 'mc_ll', 'mc_mod_sml', 'pu', 'pu_de', 'de'], splitmode = 'single_random_split', sml_name='ours', use_heat=False, savefig=None):
    """Plot predicted std against prediction residual on a datasets x methods grid.

    For each dataset found in one of ``exp_dirs`` the first fold of
    ``splitmode`` is loaded and the test frame's ``pred_residual`` (x) and
    ``pred_std`` (y) columns are drawn. Every populated row/column gets the
    two reference lines labelled 1 sigma and 3 sigma and shared axis limits
    taken from the 3% / 97% quantiles of all plotted values.

    Parameters
    ----------
    exp_dirs : iterable of str
        Experiment directories, each holding one sub-directory per dataset.
    datasets : iterable of str
        Datasets, one grid row each.
    methods : iterable of str
        Methods, one grid column each; 'mc_mod_sml' is titled ``sml_name``.
    splitmode : str
        Split mode whose first fold is plotted.
    sml_name : str
        Display name for 'mc_mod_sml'.
    use_heat : bool
        Draw a KDE heat map (``plot_densitymap``) instead of a scatter plot.
    savefig : str or None
        If given, the figure is saved to this path.
    """
    datasets, methods = list(datasets), list(methods)

    plt.clf()
    fig, ax = plt.subplots(len(datasets), len(methods), figsize=(len(methods)*10, len(datasets)*10), squeeze=False)

    residual_chunks, std_chunks = [], []
    used_rows, used_cols = set(), set()  # grid positions that actually received data
    for exp_dir in exp_dirs:
        dataset_dirs = os.listdir(exp_dir)

        for row_i, dataset_id in enumerate(datasets):
            if dataset_id not in dataset_dirs:
                continue

            dir_files = get_dir_files(exp_dir, dataset_id)
            method_dict = load_method_dict(dir_files, splitmode)
            if not method_dict:
                continue

            first_fold = method_dict[0]  # only the first fold is shown
            for col_i, method in enumerate(methods):
                if method not in first_fold:
                    continue

                test_df = first_fold[method][1]
                x, y = test_df['pred_residual'].values, test_df['pred_std'].values
                residual_chunks.append(x)
                std_chunks.append(y)
                if use_heat:
                    plot_densitymap(x, y, ax[row_i, col_i])
                else:
                    ax[row_i, col_i].scatter(x, y)

                used_rows.add(row_i)
                used_cols.add(col_i)

    if not residual_chunks:
        # Nothing was plotted; quantiles of an empty array are undefined.
        print("Warning. No results found for the requested datasets/methods")
        return

    all_x, all_y = np.concatenate(residual_chunks), np.concatenate(std_chunks)
    # Both x limits come from the residuals (xmax used to be taken from the stds).
    xmin, xmax = np.quantile(all_x, 0.03), np.quantile(all_x, 0.97)
    ymax = np.quantile(all_y, 0.97)
    xmin, xmax = min(xmin, -xmax),  max(-xmin, xmax) # symmetric x

    # Decorate by grid position so labels stay on the right panels even when
    # some datasets or methods are missing from the experiment directories.
    rows_sorted, cols_sorted = sorted(used_rows), sorted(used_cols)
    for i in rows_sorted:
        for j in cols_sorted:
            cell = ax[i, j]

            if i == rows_sorted[-1]:
                cell.set_xlabel('pred_residual')

            if i == rows_sorted[0]:
                cell.set_title(sml_name if methods[j] == 'mc_mod_sml' else '%s' % methods[j])

            if j == cols_sorted[0]:
                cell.set_ylabel('pred_std')

            cell.plot([xmin, 0, xmax], [abs(xmin), 0, xmax], color='orange', label=r'$1 \sigma$')
            cell.plot([xmin, 0, xmax], [(1./3)*abs(xmin), 0, (1./3)*xmax], color='b', label=r'$3 \sigma$')
            cell.set_xlim(xmin, xmax)
            cell.set_ylim(0, ymax)
            cell.legend()

    plt.tight_layout()
    if savefig is not None:
        plt.savefig(savefig)


In [None]:
exp_dirs = ['/INSERT/PATH/TO/EXPERIMENT/LOGS/HERE', '/YOU/CAN/ALSO/INSERT/MULTIPLE/PATHS']

# Each entry: (dataset/method selection, use_heat, output file). The order of
# the entries is the order in which the figures are produced and saved.
_sigma_overview = dict(datasets=['toy_modulated', 'naval', 'abalone', 'superconduct'])
_sigma_abalone = dict(datasets=['abalone'], methods=['mc', 'mc_mod_sml', 'pu', 'de'])
_sigma_jobs = [
    (_sigma_overview, False, './plots/sigma.pdf'),
    (_sigma_overview, False, './plots/sigma.jpg'),
    (_sigma_overview, True, './plots/sigma_heat.pdf'),
    (_sigma_overview, True, './plots/sigma_heat.jpg'),
    (_sigma_abalone, False, './plots/sigma_abalone.jpg'),
    (_sigma_abalone, False, './plots/sigma_abalone.pdf'),
    (_sigma_abalone, True, './plots/sigma_abalone_heat.pdf'),
    (_sigma_abalone, True, './plots/sigma_abalone_heat.jpg'),
]

for _sigma_selection, _sigma_heat, _sigma_target in _sigma_jobs:
    show_sigmaplots(exp_dirs, use_heat=_sigma_heat, savefig=_sigma_target, **_sigma_selection)

# x vs. ground truth, predicted mean, and predicted std (toy data)

In [None]:
def plot_x_vs_preds(method_dicts, methods=None, trte=[0, 1], rows=['gt', 'mean', 'res', 'gt_std', 'std'], fold_idx=0, s=5, interpol_gt=False, savefig=None):
    """Plot ground truth and predictions against the first input coordinate.

    One column per method; for every entry of ``trte`` a group of panels is
    stacked vertically. Panels are always drawn in the fixed order
    gt, mean, res, gt_std, std (restricted to the names present in ``rows``).
    Within a group, each panel shares its y-limits across methods.

    Parameters
    ----------
    method_dicts : list of dict
        As returned by ``load_method_dict``: per fold ``method -> [train_df, test_df]``.
        The frames must provide the columns 'x', 'gt', 'pred_mean', 'pred_std'
        (and 'pred_no_mc', 'total_std' for method 'mc_mod_sml').
    methods : list of str or None
        Methods to show; defaults to all methods of the fold, sorted.
    trte : sequence of int
        Frame indices to show (0 is drawn in orange, 1 in blue).
    rows : sequence of str
        Panels to show; also determines the number of grid rows per group.
    fold_idx : int
        Fold to plot.
    s : number
        Marker size for the scatter plots.
    interpol_gt : bool
        Draw the ground truth as a line through the x-sorted points.
    savefig : str or None
        If given, the figure is saved to this path.
    """
    if methods is None:
        methods = sorted(method_dicts[fold_idx])

    # Fixed drawing order; `rows` only selects which panels appear.
    panels = [name for name in ('gt', 'mean', 'res', 'gt_std', 'std') if name in rows]

    n_methods = len(methods)
    n_rows = len(rows)
    # squeeze=False keeps `ax` two-dimensional for a single method or a single panel.
    fig, ax = plt.subplots(len(trte)*n_rows, n_methods, figsize=(n_methods*8, n_rows*5), squeeze=False)

    trte_to_color = {0: 'orange', 1: 'blue'}
    # Per group and panel: (minima, maxima) of the plotted values, one entry per method.
    ylims = [[([], []) for _ in panels] for _ in trte]

    for i, trte_i in enumerate(trte): # train/test
        color = trte_to_color[trte_i]
        for j, method in enumerate(methods):
            frame = method_dicts[fold_idx][method][trte_i]
            gt = frame['gt'].values
            x = np.array([val[0] for val in frame['x'].values])
            x_unique = np.unique(x)
            gt_std = [np.std(gt[x == x_val]) for x_val in x_unique]

            pred_mean = frame['pred_mean']
            pred_std = frame['pred_std']
            residual = pred_mean - gt
            if method == 'mc_mod_sml':
                # This method shows its 'pred_no_mc' mean and its 'total_std'.
                shown_mean, shown_std = frame['pred_no_mc'], frame['total_std']
            else:
                shown_mean, shown_std = pred_mean, pred_std

            series = {'gt': (x, gt), 'mean': (x, shown_mean), 'res': (x, residual),
                      'gt_std': (x_unique, gt_std), 'std': (x, shown_std)}

            for k, name in enumerate(panels):
                cell = ax[i*n_rows + k, j]
                xs, ys = series[name]
                if name == 'gt' and interpol_gt:
                    x_argsort = np.argsort(x)
                    cell.plot(x[x_argsort], gt[x_argsort], color=color)
                else:
                    cell.scatter(xs, ys, s=s, color=color)

                if j == 0:
                    cell.set_ylabel(name)

                # Limits follow the values that were actually drawn.
                ylims[i][k][0].append(np.min(ys))
                ylims[i][k][1].append(np.max(ys))

    for j, method in enumerate(methods):
        ax[0, j].set_title(method if method != 'mc_mod_sml' else 'ours')
        ax[len(trte)*n_rows-1, j].set_xlabel('x')

    for i in range(len(trte)):
        for k, name in enumerate(panels):
            lows, highs = ylims[i][k]
            if not lows:
                continue

            ymin, ymax = np.min(lows), np.max(highs)
            if name == 'mean':
                # symmetric around zero with a fixed margin
                bound = max(abs(ymin), abs(ymax)) + 0.5
                limits = (-bound, bound)
            else:
                pad = (ymax - ymin)*0.2
                limits = (ymin - pad, ymax + pad)

            for j in range(n_methods):
                ax[i*n_rows + k, j].set_ylim(*limits)

    plt.subplots_adjust(wspace=0.2, hspace=0.2)

    if savefig is not None:
        plt.savefig(savefig)
    plt.show()

In [None]:
# Matplotlib style for the toy-data figures: larger axes titles/labels (40)
# and tick labels (28) than the notebook-wide defaults.
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 6, 20, 25

plt.rc('font', size=BIGGER_SIZE)                                  # default text size
plt.rc('axes', titlesize=40, labelsize=40, linewidth=5)           # axes title, x/y labels, frame
plt.rc(('xtick', 'ytick'), labelsize=28)                          # tick label size
plt.rc(('xtick.major', 'xtick.minor', 'ytick.major', 'ytick.minor'), width=5, size=10)
plt.rc('legend', fontsize=MEDIUM_SIZE)                            # legend font size
plt.rc('figure', titlesize=BIGGER_SIZE)                           # figure title size
plt.rc('lines', linewidth=5)

In [None]:
# Load the single-random-split results of `toy_modulated` and plot ground
# truth, predicted mean and predicted std of the test frame for every method.
exp_dir = '/INSERT/PATH/TO/TOY_MODULATED/HERE'
dataset_id = 'toy_modulated'
splitmode = 'single_random_split'

dir_files = get_dir_files(exp_dir, dataset_id)
global_stats = load_global_stats(dir_files, splitmode)
method_dicts = load_method_dict(dir_files, splitmode)

plot_x_vs_preds(
    method_dicts,
    methods=['mc', 'mc_ll', 'mc_mod_sml', 'pu', 'pu_de', 'de'],
    rows=['gt', 'mean', 'std'],
    trte=[1],
    s=40,
    interpol_gt=False,
    savefig='./plots/toy_modulated.pdf',
)

In [None]:
# Load the single-random-split results of `toy_hf` and plot ground truth
# (drawn as a line), predicted mean and predicted std of the test frame.
exp_dir = '/INSERT/PATH/TO/TOY_HF/HERE'
dataset_id = 'toy_hf'
splitmode = 'single_random_split'

dir_files = get_dir_files(exp_dir, dataset_id)
global_stats = load_global_stats(dir_files, splitmode)
method_dicts = load_method_dict(dir_files, splitmode)

plot_x_vs_preds(
    method_dicts,
    methods=['mc', 'mc_ll', 'mc_mod_sml', 'pu', 'pu_de', 'de'],
    rows=['gt', 'mean', 'std'],
    trte=[1],
    s=40,
    interpol_gt=True,
    savefig='./plots/toy_hf.pdf',
)

# UCI cross-validated runs

In [None]:
# Matplotlib style for the UCI overview figures: bigger fonts throughout,
# with the x tick labels one step smaller than the y tick labels.
MEDIUM_SIZE, BIGGER_SIZE = 27, 35

plt.rc('font', size=BIGGER_SIZE)                                            # default text size
plt.rc('axes', titlesize=BIGGER_SIZE, labelsize=BIGGER_SIZE, linewidth=5)   # axes title, x/y labels, frame
plt.rc('xtick', labelsize=MEDIUM_SIZE)                                      # x tick label size
plt.rc('ytick', labelsize=BIGGER_SIZE)                                      # y tick label size
plt.rc(('xtick.major', 'xtick.minor', 'ytick.major', 'ytick.minor'), width=5, size=10)
plt.rc('legend', fontsize=MEDIUM_SIZE)                                      # legend font size
plt.rc('figure', titlesize=BIGGER_SIZE)                                     # figure title size
plt.rc('lines', linewidth=5)

In [None]:
def get_ranks(x, higher_is_better=False):
    """Rank the entries of ``x``; rank 0 is the best entry.

    Parameters
    ----------
    x : array-like of numbers
        Scores to rank.
    higher_is_better : bool
        If False (default) the smallest value gets rank 0, otherwise the
        largest value does.

    Returns
    -------
    numpy.ndarray
        Float array of the same length as ``x`` holding each entry's rank.
    """
    values = np.array(x)
    # Negating the scores reverses the order for every real value. The former
    # key 1/(x+1) flipped the order for values <= -1 and divided by zero at -1.
    order = np.argsort(-values if higher_is_better else values, kind='stable')
    ranks = np.zeros(len(order))
    ranks[order] = np.arange(len(order))
    return ranks

In [None]:
def aggregate_over_folds(exp_dirs):
    """Collect the per-fold metrics of all known datasets into one table.

    Rows are ``(dataset, ident)`` and columns ``(method, metric)``; every cell
    holds a numpy array with one value per fold. Idents are:

    * 'train' / 'test' - all folds of the 'random_folds' split
      (index 0 / 1 of each method's stats);
    * '<label|pca>_test_extrapolate' - first and last fold of the
      'label_folds' / 'pca_folds' split (index 1 of the stats);
    * '<label|pca>_test_interpolate' - all folds in between.

    Only sub-directories of ``exp_dirs`` named in ``available_datasets`` are
    read; each dataset name is printed as it is processed.

    Parameters
    ----------
    exp_dirs : iterable of str
        Experiment directories, each holding one sub-directory per dataset.

    Returns
    -------
    pandas.DataFrame
        Table with a two-level row index and two-level columns as described.
    """

    def _pair_to_string(a, b):
        return str(a) + " " + str(b)

    aggregated = pd.DataFrame(dtype=object)

    def _store(row_key, col_key, n_slots, slot, value):
        """Write ``value`` into position ``slot`` of the array held in cell (row_key, col_key)."""
        if row_key not in aggregated.index \
        or col_key not in aggregated.columns \
        or not isinstance(aggregated.loc[row_key, col_key], np.ndarray):
            # Create the cell with a scalar, make the column object-typed,
            # then place an array with one slot per fold into it.
            aggregated.loc[row_key, col_key] = 0.
            aggregated[col_key] = aggregated[col_key].astype('object')
            aggregated.at[row_key, col_key] = np.zeros(n_slots)

        # The cell holds the array object itself, so this writes in place.
        aggregated.loc[row_key, col_key][slot] = value

    for exp_dir in exp_dirs:

        for dataset_id in os.listdir(exp_dir):

            if dataset_id not in available_datasets:
                continue

            print(dataset_id)

            dir_files = get_dir_files(exp_dir, dataset_id)

            global_stats_folds = load_global_stats(dir_files, 'random_folds')
            n_folds = len(global_stats_folds)

            for fold_idx, fold in enumerate(global_stats_folds):
                for method in sorted(fold):
                    for i, trte in enumerate(['train', 'test']):
                        dataset_trte = _pair_to_string(dataset_id, trte)

                        for metric in fold[method][i]:
                            _store(dataset_trte, _pair_to_string(method, metric),
                                   n_folds, fold_idx, fold[method][i][metric])

            splitmode_to_ident = {'label_folds': 'label_test', 'pca_folds': 'pca_test'}
            for splitmode in sorted(splitmode_to_ident):
                global_stats_folds = load_global_stats(dir_files, splitmode)
                n_folds = len(global_stats_folds)
                if n_folds == 0:
                    # No results for this split; the extrapolation indices
                    # [0, -1] would otherwise index into an empty list.
                    continue

                fold_mode_to_fold_idxs  = {'extrapolate': [0, n_folds - 1], 'interpolate': np.arange(1, n_folds - 1)}
                for fold_mode in ['extrapolate', 'interpolate']:
                    dataset_ident = _pair_to_string(dataset_id, '%s_%s' % (splitmode_to_ident[splitmode], fold_mode))

                    fold_idxs = fold_mode_to_fold_idxs[fold_mode]
                    for i, fold_idx in enumerate(fold_idxs):

                        fold = global_stats_folds[fold_idx]
                        for method in sorted(fold):
                            # only the second stats entry (index 1) is used here
                            for metric in fold[method][1]:
                                _store(dataset_ident, _pair_to_string(method, metric),
                                       len(fold_idxs), i, fold[method][1][metric])

    # "a b" labels -> two-level (a, b) indices on both axes
    aggregated.columns = aggregated.columns.str.split(expand=True)
    aggregated = aggregated.set_index(aggregated.index.str.split(expand=True))
    return aggregated

def _75q(x):
    return np.quantile(x, .75)

def _25q(x):
    return np.quantile(x, .25)

# Horizontal offset per ident, added to the integer x position of a dataset
# (or summary column) in plot_metrics so the idents sit side by side.
default_ident_offsets = {
    'train': -0.2,
    'test': -0.12,
    'label_test_interpolate': -0.04,
    'label_test_extrapolate': 0.04,
    'pca_test_interpolate': 0.12,
    'pca_test_extrapolate': 0.2,
}

def plot_metrics(aggregated_mean, metric, idents=('train', 'test', 'label_test_interpolate', 'label_test_extrapolate', 'pca_test_interpolate', 'pca_test_extrapolate'), 
                 datasets=None, methods=None, ylim=None, yscale=None, figsize=(10, 6), ax=None, 
                 summary_stat_over=None, summary_stat_funcs=None, summary_rank_funcs=None,
                ident_offsets=default_ident_offsets, sml_name='ours', ticklabel_tilt=0, s=10,
                savefig=None, xticklabels=True):
    """Scatter one metric per dataset, method and ident, followed by summary columns.

    The x axis has one integer position per dataset, then one per entry of
    ``summary_stat_funcs`` (statistic of the metric over datasets) and one per
    entry of ``summary_rank_funcs`` (statistic of the per-row method ranks,
    rescaled linearly into the y range). Marker shape encodes the method,
    colour the ident.

    Relies on the notebook-level names ``method_to_marker``, ``ident_to_color``
    and ``fontP``, which are defined outside this function.

    Parameters
    ----------
    aggregated_mean : pandas.DataFrame
        Table indexed by (dataset, ident) rows and (method, metric) columns.
    metric : str
        Metric to plot; for 'r2' higher values are ranked better.
    idents : sequence of str
        Idents to show.
    datasets, methods : sequence of str or None
        Subsets to show; default to everything present in ``aggregated_mean``.
    ylim : tuple or None
        y-limits; defaults to the limits matplotlib chose for the scatter.
    yscale : str or None
        y-axis scale; defaults to 'log'.
    figsize : tuple
        Size of the figure created when ``ax`` is None.
    ax : matplotlib axes or None
        Axes to draw into; if None a new figure is created and shown.
    summary_stat_over : sequence of str or None
        Datasets the summaries are computed over; defaults to ``datasets``.
    summary_stat_funcs : list or None
        Summary statistics; non-callable entries leave their column empty.
        Defaults to mean, median, min, 25% quantile, 75% quantile, max.
    summary_rank_funcs : list or None
        Statistics of the ranks; defaults to ``[np.mean]``.
    ident_offsets : dict
        Horizontal offset per ident.
    sml_name : str
        Display name used for the 'mc_mod_sml*' methods in the legend.
    ticklabel_tilt : number
        Rotation of the x tick labels.
    s : number
        Marker size.
    savefig : str or None
        If given, the current figure is saved to this path.
    xticklabels : bool
        Whether to draw x tick labels.
    """
    show = False
    if ax is None:
        plt.figure(figsize=figsize)
        ax = plt.gca()
        show = True
    
    if datasets is None:
        datasets = aggregated_mean.index.get_level_values(0).unique().values
    else:
        datasets = np.array(datasets)
    
    if methods is None:
        methods = aggregated_mean.columns.get_level_values(0).unique().values
        
    datasets_idx = np.arange(datasets.size)
    max_dataset_idx = datasets_idx.max()
    
    if summary_stat_funcs is None:
        # The named quantile helpers are used (not lambdas) so that the tick
        # labels read '25q' / '75q' rather than '<lambda>'.
        summary_stat_funcs = [np.mean, np.median, np.min, _25q, _75q, np.max]
    if summary_rank_funcs is None:
        summary_rank_funcs = [np.mean]
    n_summary_cols = len(summary_stat_funcs) + len(summary_rank_funcs)
    
    aggregated_ranks = None
    if len(summary_rank_funcs) > 0:
        # rank the methods within every (dataset, ident) row
        aggregated_ranks = aggregated_mean.loc[(datasets, idents), (methods, metric)].apply(
            get_ranks if metric != 'r2' else lambda x: get_ranks(x, higher_is_better=True), axis=1, result_type='broadcast')
    
    method_str = {'mc_mod_sml': r'%s' % sml_name, 'mc_mod_sml1': r'%s, $\beta=0.1$' % sml_name, 
                  'mc_mod_sml25': r'%s, $\beta=0.25$' % sml_name, 'mc_mod_sml75': r'%s, $\beta=0.75$' % sml_name, 
                  'mc_mod_sml9': r'%s, $\beta=0.9$' % sml_name, 'mc_mod_sml10': r'%s, $\beta=10$' % sml_name,
                 'mc_mod_sml0': r'%s, $\beta=0$' % sml_name, 'sml_de': '%s_de' % sml_name}
    ident_str = {'label_test_interpolate': 'label_test_interp', 'label_test_extrapolate': 'label_test_extrap', 'pca_test_interpolate': 'pca_test_interp', 'pca_test_extrapolate': 'pca_test_extrap'}
    
    for ident in idents:
        for method in sorted(methods):
            
            values_over_datasets = aggregated_mean.loc[(datasets, ident), (method, metric)].values
            ax.scatter(datasets_idx + ident_offsets[ident], 
                        values_over_datasets,
                       s=s,
                       label='%s, %s' % (method_str[method] if method in method_str else method, ident_str[ident] if ident in ident_str else ident),
                       marker=method_to_marker[method],
                       color=ident_to_color[ident],
                       alpha=0.5)
            
            if summary_stat_over is None:
                summary_values = values_over_datasets
            else:
                summary_values = aggregated_mean.loc[(summary_stat_over, ident), (method, metric)].values
            
            for summary_stat_count, summary_stat_func in enumerate(summary_stat_funcs):
                if callable(summary_stat_func):
                    ax.scatter(max_dataset_idx + summary_stat_count + 1 + ident_offsets[ident], 
                               summary_stat_func(summary_values),
                               s=s,
                               marker=method_to_marker[method],
                               color=ident_to_color[ident],
                               alpha=0.5)
    
    
    if ylim is None:
        ylim = ax.get_ylim()
        
    if len(summary_rank_funcs) > 0:
        for ident in idents:
            for method in sorted(methods):
                if summary_stat_over is None:
                    rank_summary_values = aggregated_ranks.loc[(datasets, ident), (method, metric)].values
                else:
                    rank_summary_values = aggregated_ranks.loc[(summary_stat_over, ident), (method, metric)].values
                
                # map ranks linearly into the visible y range
                rank_summary_values = ((ylim[1]-ylim[0])/len(methods))*rank_summary_values + ylim[0]
                
                for summary_stat_count, summary_rank_func in enumerate(summary_rank_funcs):
                    # test the rank function itself (this used to test the
                    # loop variable left over from the statistics loop above)
                    if callable(summary_rank_func):
                        ax.scatter(max_dataset_idx + summary_stat_count + len(summary_stat_funcs) + 1 + ident_offsets[ident], 
                                   summary_rank_func(rank_summary_values),
                                   s=s,
                                   marker=method_to_marker[method],
                                   color=ident_to_color[ident],
                                   alpha=0.5)
            
                   
    # vertical separators between columns; the one closing the dataset part is darker
    ax.plot([-0.5, -0.5], ylim, '--', color='grey', alpha=0.2)
    for ds_idx in datasets_idx:
        a = 0.5 if ds_idx == max_dataset_idx else 0.2
        ax.plot([ds_idx+0.5, ds_idx+0.5], ylim, '--', color='grey', alpha=a)
    
    for ds_idx in range(max_dataset_idx + 1, max_dataset_idx + n_summary_cols + 1):
        ax.plot([ds_idx+0.5, ds_idx+0.5], ylim, '--', color='grey', alpha=0.2)
    ax.axvspan(max_dataset_idx + 1 - 0.5, max_dataset_idx + n_summary_cols + 1.5, color='grey', alpha=0.05)
            
    ax.legend(prop=fontP, bbox_to_anchor=(1, 1), loc='upper left')
    
    if yscale is None:
        ax.set_yscale('log')
    else:
        ax.set_yscale(yscale)

    reduce_func_to_str = {np.mean: 'mean', np.median: 'median', np.min: 'min', np.max: 'max', _75q: '75q', _25q: '25q'}

    def _func_label(func):
        """Tick label of a summary function; falls back to its name or its string form."""
        if callable(func) and func in reduce_func_to_str:
            return reduce_func_to_str[func]
        return getattr(func, '__name__', str(func))
        
    ax.set_ylabel(metric if metric != 'ws_dist' else 'Wasserstein distance')
    
    
    ax.set_xticks(np.concatenate((datasets_idx, np.arange(max_dataset_idx + 1, max_dataset_idx + n_summary_cols + 1))))
    if xticklabels:
        ax.set_xticklabels(np.concatenate(([dataset + "\n" + ("(%.0fk)"% (float(dataset_to_size[dataset])/1000.) if (float(dataset_to_size[dataset]) >= 1000) else "(%.1fk)"%(float(dataset_to_size[dataset])/1000.) ) for dataset in datasets], 
                                       [_func_label(func) for func in summary_stat_funcs],
                                      # every rank column carries the " rank" suffix
                                      [_func_label(func) + " rank" for func in summary_rank_funcs])),
                          rotation=ticklabel_tilt)
    else:
        ax.set_xticklabels([])
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_xlim(-1.25, max_dataset_idx + n_summary_cols + 1.25)
    

    if savefig is not None:
        plt.savefig(savefig)
    
    if show:
        plt.show()
    

def plot_uncertainty_vs_performance(aggregated_mean, perf, unc, item, second_dim_vals=None, second_dim='datasets',
                                    idents=('train', 'test', 'label_test_interpolate', 'label_test_extrapolate', 'pca_test_interpolate', 'pca_test_extrapolate'), 
                                    xlim=None, ylim=None, xscale=None, figsize=(10, 6)):
    """Scatter an uncertainty metric (y) against a performance metric (x).

    With ``second_dim='datasets'`` the method ``item`` is fixed and one point
    is drawn per dataset and ident; with ``second_dim='methods'`` the dataset
    ``item`` is fixed and one point is drawn per method and ident. Colour
    encodes the ident, marker shape the varying dimension. Uses the
    notebook-level names ``ident_to_color``, ``dataset_to_marker``,
    ``method_to_marker`` and ``fontP``, defined outside this function.

    Parameters
    ----------
    aggregated_mean : pandas.DataFrame
        Table indexed by (dataset, ident) rows and (method, metric) columns.
    perf, unc : str
        Metric names for the x and y axis.
    item : str
        The fixed method (or dataset, see above).
    second_dim_vals : sequence of str or None
        Values of the varying dimension; defaults to all present in the table.
    second_dim : str
        'datasets' or 'methods'.
    idents : sequence of str
        Idents to show.
    xlim, ylim : tuple or None
        Optional axis limits.
    xscale : str or None
        x-axis scale; 'symlog' when None.
    figsize : tuple
        Figure size.
    """
    plt.figure(figsize=figsize)

    vary_datasets = second_dim == 'datasets'
    vary_methods = second_dim == 'methods'

    if vary_datasets:
        if second_dim_vals is None:
            second_dim_vals = aggregated_mean.index.get_level_values(0).unique().values

        for dataset_id in second_dim_vals:
            for ident in idents:
                row_key = (dataset_id, ident)
                perf_value = aggregated_mean.loc[row_key, (item, perf)]
                unc_value = aggregated_mean.loc[row_key, (item, unc)]
                plt.scatter(perf_value,
                            unc_value,
                            color=ident_to_color[ident],
                            label='%s_%s' % (dataset_id, ident),
                            marker=dataset_to_marker[dataset_id],
                            alpha=0.5)

        plt.title('Method=%s' % item)

    elif vary_methods:
        if second_dim_vals is None:
            second_dim_vals = aggregated_mean.columns.get_level_values(0).unique().values

        for method in second_dim_vals:
            for ident in idents:
                row_key = (item, ident)
                perf_value = aggregated_mean.loc[row_key, (method, perf)]
                unc_value = aggregated_mean.loc[row_key, (method, unc)]
                plt.scatter(perf_value,
                            unc_value,
                            color=ident_to_color[ident],
                            marker=method_to_marker[method],
                            label='%s_%s' % (method, ident),
                            alpha=0.5)

        plt.title('Dataset=%s' % item)

    plt.xlabel(perf)
    plt.ylabel(unc)
    plt.legend(prop=fontP, bbox_to_anchor=(1, 1), loc='upper left')

    # default scales; an explicit xscale is applied further down
    plt.xscale('symlog')
    plt.yscale('linear')

    if xlim is not None:
        plt.xlim(*xlim)
    if ylim is not None:
        plt.ylim(*ylim)
    if xscale is not None:
        plt.xscale(xscale)

    plt.show()

def plot_uncertainty_vs_uncertainty(aggregated_mean, metrics, item, second_dim='datasets', second_dim_vals=None,
                                    idents=('train', 'test', 'label_test_interpolate', 'label_test_extrapolate', 'pca_test_interpolate', 'pca_test_extrapolate'),
                                    ylim=None, xlim=None, xscale=None, figsize=(10, 6)):
    """Scatter pairs of metrics against each other, one figure per pair.

    Every 2-combination of ``metrics`` is drawn in its own figure, the first
    metric of the pair on the x axis and the second on the y axis. Points are
    coloured per ident via ``ident_to_color``; the marker encodes the varying
    dimension via ``dataset_to_marker`` or ``method_to_marker``.

    Parameters
    ----------
    aggregated_mean : pandas.DataFrame
        Read as ``aggregated_mean.loc[(dataset, ident), (method, metric)]``.
    metrics : iterable of str
        Metric names (second column level); all pairs of them are plotted.
    item
        The fixed dimension: the method when ``second_dim == 'datasets'``,
        the dataset selector when ``second_dim == 'methods'``.
    second_dim : {'datasets', 'methods'}
        The dimension that varies across the plotted series.
    second_dim_vals : sequence, optional
        Datasets/methods to plot. Defaults to all values found in the first
        level of the row index (datasets) or column index (methods).
    idents : sequence of str
        Row idents to plot; each must be a key of ``ident_to_color``.
    ylim, xlim : sequence of two numbers, optional
        Axis limits, applied only when given.
    xscale : str, optional
        x-axis scale; ``'log'`` is used when omitted.
    figsize : tuple
        Size of every figure that is created.

    Raises
    ------
    ValueError
        If ``second_dim`` is neither ``'datasets'`` nor ``'methods'``.
    """
    vary_datasets = second_dim == 'datasets'
    if vary_datasets:
        series_ids = second_dim_vals
        if series_ids is None:
            series_ids = aggregated_mean.index.get_level_values(0).unique().values
        series_to_marker = dataset_to_marker
        # With datasets varying, `item` selects the method column.
        title = 'method=%s' % item
    elif second_dim == 'methods':
        series_ids = second_dim_vals
        if series_ids is None:
            series_ids = aggregated_mean.columns.get_level_values(0).unique().values
        series_to_marker = method_to_marker
        title = 'dataset=%s' % item
    else:
        raise ValueError("second_dim must be 'datasets' or 'methods', got %r" % (second_dim,))

    for unc1, unc2 in itert.combinations(metrics, 2):
        # plt.show() below finishes the current figure, so each pair needs its
        # own figure; otherwise only the first pair would honour `figsize`.
        plt.figure(figsize=figsize)

        for ident in idents:
            for series_id in series_ids:
                if vary_datasets:
                    row, col_x, col_y = (series_id, ident), (item, unc1), (item, unc2)
                else:
                    row, col_x, col_y = (item, ident), (series_id, unc1), (series_id, unc2)

                plt.scatter(aggregated_mean.loc[row, col_x],
                            aggregated_mean.loc[row, col_y],
                            # Name the series the marker stands for; the metric
                            # pair is already shown on the axis labels.
                            label='%s_%s' % (series_id, ident),
                            color=ident_to_color[ident],
                            marker=series_to_marker[series_id],
                            alpha=0.5)

        plt.title(title)
        plt.legend(prop=fontP, bbox_to_anchor=(1, 1), loc='upper left')

        plt.xlabel(unc1)
        plt.ylabel(unc2)

        plt.xscale(xscale if xscale is not None else 'log')

        if ylim is not None:
            plt.ylim(*ylim)

        if xlim is not None:
            plt.xlim(*xlim)

        plt.show()
    

In [None]:
# Experiment directories to aggregate; replace the placeholders with real paths.
exp_dirs = ['/INCLUDE/EXPERIMENT/DIR/HERE', '/MULTIPLE/PATHS/CAN/BE/GIVEN']

aggregated = aggregate_over_folds(exp_dirs)
# Reduce every cell of the frame to its mean (presumably over folds -- confirm against aggregate_over_folds).
aggregated_mean = aggregated.applymap(np.mean)

# Keep the datasets that are present, in a fixed display order.
sorted_datasets = ['toy', 'toy_hf', 'toy_uniform', 'toy_modulated', 'toy_noise', 'toy_noise_strong', 'yacht', 'diabetes',  'boston', 'energy', 'concrete',  'wine_red', 'abalone', 'power','naval', 'california','superconduct','protein','year']
aggregated_mean = aggregated_mean.loc[[ds for ds in sorted_datasets if ds in aggregated_mean.index.get_level_values(0).values]]

# Lookup tables read as globals by the plotting functions.
method_to_marker = {'mc': 'D', 'mc_ll': 'd', 'mc_mod_sml': 's',
                    'pu': '.', 'de': 'x',
                    'mc_mod_sml1': 'o', 'mc_mod_sml25': '^', 'mc_mod_sml75': 'p', 'mc_mod_sml9': '+',
                    'pu_de': '*', 'sml_de': '+', 'mc_mod_sml0': '.', 'mc_mod_sml10': '*'}
ident_to_color = {'train': 'g', 'test': 'b', 'label_test_interpolate': 'r', 'label_test_extrapolate': 'lightcoral',
                  'pca_test_interpolate': 'y', 'pca_test_extrapolate': 'orange'}
# Every name in `sorted_datasets` needs a marker, otherwise plots that vary
# over datasets fail with a KeyError as soon as a toy variant is present.
dataset_to_marker = {'toy': ',', 'toy_hf': '2', 'toy_uniform': '4', 'toy_modulated': '<',
                     'toy_noise': '.', 'toy_noise_strong': '>',
                     'yacht': '+', 'diabetes': 'x', 'boston': '|',
                     'energy': '_', 'concrete': '1', 'wine_red': '3',
                     'abalone': 'o', 'naval': 'v', 'power': '^', 'california': 's', 'superconduct': 'P',
                     'protein': 'D', 'year': '*'}
fontP = FontProperties()
fontP.set_size('xx-small')

In [None]:
# 0.9 quantile of each method's `ws_dist` column over all rows.
# Sorted so the output order does not depend on set iteration order.
for method in sorted(available_methods):
    print(method, aggregated_mean.loc[(slice(None), slice(None)), (method, 'ws_dist')].quantile(0.9))

In [None]:
# Maximum of each method's `ece` column over all rows.
# Sorted so the output order does not depend on set iteration order.
for method in sorted(available_methods):
    print(method, aggregated_mean.loc[(slice(None), slice(None)), (method, 'ece')].max())

In [None]:
# Two stacked 'rmse' panels: train/test idents on top, the four label/PCA
# interpolate/extrapolate idents below.
os.makedirs('./plots', exist_ok=True)  # plt.savefig does not create missing directories
fig, ax = plt.subplots(2, figsize=(32, 20))

panels = [
    # (idents, ident_offsets, panel-specific keyword arguments)
    (('train', 'test'),
     {'train': -0.2, 'test': 0.2},
     {'xticklabels': False, 'ticklabel_tilt': 45}),
    (('label_test_interpolate', 'label_test_extrapolate', 'pca_test_interpolate', 'pca_test_extrapolate'),
     {'label_test_interpolate': -0.25, 'label_test_extrapolate': -0.1, 'pca_test_interpolate': 0.1, 'pca_test_extrapolate': 0.25},
     {'ticklabel_tilt': 45}),
]
for panel_ax, (panel_idents, panel_offsets, panel_kwargs) in zip(ax, panels):
    # List arguments are spelled out inside the loop so each call gets fresh objects.
    plot_metrics(aggregated_mean, 'rmse', idents=panel_idents,
                 datasets=['yacht', 'diabetes', 'boston', 'energy', 'concrete', 'wine_red', 'abalone', 'power', 'naval', 'california', 'superconduct', 'protein', 'year'],
                 methods=['de', 'mc', 'mc_ll', 'mc_mod_sml', 'pu', 'pu_de'],
                 ylim=[-0.05, 1.1], yscale='linear', figsize=(32, 10),
                 ident_offsets=panel_offsets,
                 summary_stat_funcs=[np.mean, np.median, _25q, _75q],
                 summary_rank_funcs=[],
                 sml_name='ours', s=160, savefig=None, ax=panel_ax, **panel_kwargs)

plt.tight_layout()
plt.savefig('./plots/rmse.pdf')

In [None]:
# Two stacked 'rmse' panels including the 'sml_de' method: train/test idents
# on top, the four label/PCA interpolate/extrapolate idents below.
os.makedirs('./plots', exist_ok=True)  # plt.savefig does not create missing directories
fig, ax = plt.subplots(2, figsize=(32, 20))

panels = [
    # (idents, ident_offsets, panel-specific keyword arguments)
    (('train', 'test'),
     {'train': -0.2, 'test': 0.2},
     {'xticklabels': False, 'ticklabel_tilt': 45}),
    (('label_test_interpolate', 'label_test_extrapolate', 'pca_test_interpolate', 'pca_test_extrapolate'),
     {'label_test_interpolate': -0.25, 'label_test_extrapolate': -0.1, 'pca_test_interpolate': 0.1, 'pca_test_extrapolate': 0.25},
     {'ticklabel_tilt': 45}),
]
for panel_ax, (panel_idents, panel_offsets, panel_kwargs) in zip(ax, panels):
    # List arguments are spelled out inside the loop so each call gets fresh objects.
    plot_metrics(aggregated_mean, 'rmse', idents=panel_idents,
                 datasets=['yacht', 'diabetes', 'boston', 'energy', 'concrete', 'wine_red', 'abalone', 'power', 'naval', 'california', 'superconduct', 'protein', 'year'],
                 methods=['de', 'mc', 'mc_ll', 'mc_mod_sml', 'pu', 'pu_de', 'sml_de'],
                 ylim=[-0.05, 1.1], yscale='linear', figsize=(32, 10),
                 ident_offsets=panel_offsets,
                 summary_stat_funcs=[np.mean, np.median, _25q, _75q],
                 summary_rank_funcs=[],
                 sml_name='ours', s=160, savefig=None, ax=panel_ax, **panel_kwargs)

plt.tight_layout()
plt.savefig('./plots/rmse_smlde.pdf')

In [None]:
# Single 'rmse' figure for the four label/PCA interpolate/extrapolate idents.
# NOTE(review): 'california' is not in this dataset list, unlike the two-panel rmse cells -- confirm intentional.
os.makedirs('./plots', exist_ok=True)  # make sure the directory of the `savefig` path exists
plot_metrics(
    aggregated_mean, 'rmse',
    idents=('label_test_interpolate', 'label_test_extrapolate', 'pca_test_interpolate', 'pca_test_extrapolate'),
    datasets=['yacht', 'diabetes', 'boston', 'energy', 'concrete', 'wine_red', 'abalone', 'power', 'naval', 'superconduct', 'protein', 'year'],
    methods=['de', 'mc', 'mc_ll', 'mc_mod_sml', 'pu', 'pu_de'],
    ylim=[-0.05, 1.1],
    yscale='linear',
    figsize=(32, 12),
    ident_offsets={'label_test_interpolate': -0.25, 'label_test_extrapolate': -0.1, 'pca_test_interpolate': 0.1, 'pca_test_extrapolate': 0.25},
    summary_stat_funcs=[np.mean, np.median, _25q, _75q],
    summary_rank_funcs=[],
    sml_name='ours',
    ticklabel_tilt=45,
    s=160,
    savefig='./plots/rmse_ood.png',
)

In [None]:
# Two stacked 'nll' panels: train/test idents on top, the four label/PCA
# interpolate/extrapolate idents below.
os.makedirs('./plots', exist_ok=True)  # plt.savefig does not create missing directories
fig, ax = plt.subplots(2, figsize=(32, 20))

panels = [
    # (idents, ident_offsets, panel-specific keyword arguments)
    (('train', 'test'),
     {'train': -0.2, 'test': 0.2},
     {'xticklabels': False}),
    (('label_test_interpolate', 'label_test_extrapolate', 'pca_test_interpolate', 'pca_test_extrapolate'),
     {'label_test_interpolate': -0.25, 'label_test_extrapolate': -0.1, 'pca_test_interpolate': 0.1, 'pca_test_extrapolate': 0.25},
     {'ticklabel_tilt': 45}),
]
for panel_ax, (panel_idents, panel_offsets, panel_kwargs) in zip(ax, panels):
    # List arguments are spelled out inside the loop so each call gets fresh objects.
    plot_metrics(aggregated_mean, 'nll', idents=panel_idents,
                 datasets=['yacht', 'diabetes', 'boston', 'energy', 'concrete', 'wine_red', 'abalone', 'power', 'naval', 'california', 'superconduct', 'protein', 'year'],
                 methods=['de', 'mc', 'mc_ll', 'mc_mod_sml', 'pu', 'pu_de'],
                 ylim=[-10, 70], yscale='linear', figsize=(32, 10),
                 ident_offsets=panel_offsets,
                 summary_stat_funcs=[np.mean, np.median, _25q, _75q],
                 summary_rank_funcs=[],
                 sml_name='ours', s=160, savefig=None, ax=panel_ax, **panel_kwargs)

plt.tight_layout()
plt.savefig('./plots/nll.pdf')

In [None]:
# Two stacked 'nll' panels including the 'sml_de' method: train/test idents
# on top, the four label/PCA interpolate/extrapolate idents below.
os.makedirs('./plots', exist_ok=True)  # plt.savefig does not create missing directories
fig, ax = plt.subplots(2, figsize=(32, 20))

panels = [
    # (idents, ident_offsets, panel-specific keyword arguments)
    (('train', 'test'),
     {'train': -0.2, 'test': 0.2},
     {'xticklabels': False}),
    (('label_test_interpolate', 'label_test_extrapolate', 'pca_test_interpolate', 'pca_test_extrapolate'),
     {'label_test_interpolate': -0.25, 'label_test_extrapolate': -0.1, 'pca_test_interpolate': 0.1, 'pca_test_extrapolate': 0.25},
     {'ticklabel_tilt': 45}),
]
for panel_ax, (panel_idents, panel_offsets, panel_kwargs) in zip(ax, panels):
    # List arguments are spelled out inside the loop so each call gets fresh objects.
    plot_metrics(aggregated_mean, 'nll', idents=panel_idents,
                 datasets=['yacht', 'diabetes', 'boston', 'energy', 'concrete', 'wine_red', 'abalone', 'power', 'naval', 'california', 'superconduct', 'protein', 'year'],
                 methods=['de', 'mc', 'mc_ll', 'mc_mod_sml', 'pu', 'pu_de', 'sml_de'],
                 ylim=[-10, 70], yscale='linear', figsize=(32, 10),
                 ident_offsets=panel_offsets,
                 summary_stat_funcs=[np.mean, np.median, _25q, _75q],
                 summary_rank_funcs=[],
                 sml_name='ours', s=160, savefig=None, ax=panel_ax, **panel_kwargs)

plt.tight_layout()
plt.savefig('./plots/nll_smlde.pdf')

In [None]:
# Single 'nll' figure for the four label/PCA interpolate/extrapolate idents.
# NOTE(review): unlike the rmse/ece single-figure cells, this call passes no `_25q`
# summary statistic and no `summary_rank_funcs=[]` -- confirm this is intentional.
os.makedirs('./plots', exist_ok=True)  # make sure the directory of the `savefig` path exists
plot_metrics(
    aggregated_mean, 'nll',
    idents=('label_test_interpolate', 'label_test_extrapolate', 'pca_test_interpolate', 'pca_test_extrapolate'),
    datasets=['yacht', 'diabetes', 'boston', 'energy', 'concrete', 'wine_red', 'abalone', 'power', 'naval', 'superconduct', 'protein', 'year'],
    methods=['de', 'mc', 'mc_ll', 'mc_mod_sml', 'pu', 'pu_de'],
    ylim=[-10, 70],
    yscale='linear',
    figsize=(32, 12),
    ident_offsets={'label_test_interpolate': -0.25, 'label_test_extrapolate': -0.1, 'pca_test_interpolate': 0.1, 'pca_test_extrapolate': 0.25},
    summary_stat_funcs=[np.mean, np.median, _75q],
    sml_name='ours',
    ticklabel_tilt=45,
    s=160,
    savefig='./plots/nll_ood.png',
)

In [None]:

# Two stacked 'ece' panels: train/test idents on top, the four label/PCA
# interpolate/extrapolate idents below.
os.makedirs('./plots', exist_ok=True)  # plt.savefig does not create missing directories
fig, ax = plt.subplots(2, figsize=(32, 20))

panels = [
    # (idents, ident_offsets, panel-specific keyword arguments)
    (('train', 'test'),
     {'train': -0.2, 'test': 0.2},
     {'xticklabels': False, 'ticklabel_tilt': 45}),
    (('label_test_interpolate', 'label_test_extrapolate', 'pca_test_interpolate', 'pca_test_extrapolate'),
     {'label_test_interpolate': -0.25, 'label_test_extrapolate': -0.1, 'pca_test_interpolate': 0.1, 'pca_test_extrapolate': 0.25},
     {'ticklabel_tilt': 45}),
]
for panel_ax, (panel_idents, panel_offsets, panel_kwargs) in zip(ax, panels):
    # List arguments are spelled out inside the loop so each call gets fresh objects.
    plot_metrics(aggregated_mean, 'ece', idents=panel_idents,
                 datasets=['yacht', 'diabetes', 'boston', 'energy', 'concrete', 'wine_red', 'abalone', 'power', 'naval', 'california', 'superconduct', 'protein', 'year'],
                 methods=['de', 'mc', 'mc_ll', 'mc_mod_sml', 'pu', 'pu_de'],
                 ylim=[0, 2], yscale='linear', figsize=(32, 10),
                 ident_offsets=panel_offsets,
                 summary_stat_funcs=[np.mean, np.median, _25q, _75q],
                 summary_rank_funcs=[],
                 sml_name='ours', s=160, savefig=None, ax=panel_ax, **panel_kwargs)

plt.tight_layout()
plt.savefig('./plots/ece.pdf')

In [None]:
# Two stacked 'ece' panels including the 'sml_de' method: train/test idents
# on top, the four label/PCA interpolate/extrapolate idents below.
os.makedirs('./plots', exist_ok=True)  # plt.savefig does not create missing directories
fig, ax = plt.subplots(2, figsize=(32, 20))

panels = [
    # (idents, ident_offsets, panel-specific keyword arguments)
    (('train', 'test'),
     {'train': -0.2, 'test': 0.2},
     {'xticklabels': False, 'ticklabel_tilt': 45}),
    (('label_test_interpolate', 'label_test_extrapolate', 'pca_test_interpolate', 'pca_test_extrapolate'),
     {'label_test_interpolate': -0.25, 'label_test_extrapolate': -0.1, 'pca_test_interpolate': 0.1, 'pca_test_extrapolate': 0.25},
     {'ticklabel_tilt': 45}),
]
for panel_ax, (panel_idents, panel_offsets, panel_kwargs) in zip(ax, panels):
    # List arguments are spelled out inside the loop so each call gets fresh objects.
    plot_metrics(aggregated_mean, 'ece', idents=panel_idents,
                 datasets=['yacht', 'diabetes', 'boston', 'energy', 'concrete', 'wine_red', 'abalone', 'power', 'naval', 'california', 'superconduct', 'protein', 'year'],
                 methods=['de', 'mc', 'mc_ll', 'mc_mod_sml', 'pu', 'pu_de', 'sml_de'],
                 ylim=[0, 2], yscale='linear', figsize=(32, 10),
                 ident_offsets=panel_offsets,
                 summary_stat_funcs=[np.mean, np.median, _25q, _75q],
                 summary_rank_funcs=[],
                 sml_name='ours', s=160, savefig=None, ax=panel_ax, **panel_kwargs)

plt.tight_layout()
plt.savefig('./plots/ece_smlde.pdf')

In [None]:
# Single 'ece' figure for the four label/PCA interpolate/extrapolate idents.
# NOTE(review): 'california' is not in this dataset list, unlike the two-panel ece cells -- confirm intentional.
os.makedirs('./plots', exist_ok=True)  # make sure the directory of the `savefig` path exists
plot_metrics(
    aggregated_mean, 'ece',
    idents=('label_test_interpolate', 'label_test_extrapolate', 'pca_test_interpolate', 'pca_test_extrapolate'),
    datasets=['yacht', 'diabetes', 'boston', 'energy', 'concrete', 'wine_red', 'abalone', 'power', 'naval', 'superconduct', 'protein', 'year'],
    methods=['de', 'mc', 'mc_ll', 'mc_mod_sml', 'pu', 'pu_de'],
    ylim=[0, 2],
    yscale='linear',
    figsize=(32, 12),
    ident_offsets={'label_test_interpolate': -0.25, 'label_test_extrapolate': -0.1, 'pca_test_interpolate': 0.1, 'pca_test_extrapolate': 0.25},
    summary_stat_funcs=[np.mean, np.median, _25q, _75q],
    summary_rank_funcs=[],
    sml_name='ours',
    ticklabel_tilt=45,
    s=160,
    savefig='./plots/ece_ood.png',
)

In [None]:
# Two stacked 'ws_dist' panels: train/test idents on top, the four label/PCA
# interpolate/extrapolate idents below.
os.makedirs('./plots', exist_ok=True)  # plt.savefig does not create missing directories
fig, ax = plt.subplots(2, figsize=(32, 20))

panels = [
    # (idents, ident_offsets, panel-specific keyword arguments)
    (('train', 'test'),
     {'train': -0.2, 'test': 0.2},
     {'xticklabels': False, 'ticklabel_tilt': 45}),
    (('label_test_interpolate', 'label_test_extrapolate', 'pca_test_interpolate', 'pca_test_extrapolate'),
     {'label_test_interpolate': -0.2, 'label_test_extrapolate': -0.1, 'pca_test_interpolate': 0.1, 'pca_test_extrapolate': 0.2},
     {'ticklabel_tilt': 45}),
]
for panel_ax, (panel_idents, panel_offsets, panel_kwargs) in zip(ax, panels):
    # List arguments are spelled out inside the loop so each call gets fresh objects.
    plot_metrics(aggregated_mean, 'ws_dist', idents=panel_idents,
                 datasets=['yacht', 'diabetes', 'boston', 'energy', 'concrete', 'wine_red', 'abalone', 'power', 'naval', 'california', 'superconduct', 'protein', 'year'],
                 methods=['de', 'mc', 'mc_ll', 'mc_mod_sml', 'pu', 'pu_de'],
                 ylim=[0, 4], yscale='linear', figsize=(32, 10),
                 ident_offsets=panel_offsets,
                 summary_stat_funcs=[np.mean, np.median, _25q, _75q],
                 summary_rank_funcs=[],
                 sml_name='ours', s=160, savefig=None, ax=panel_ax, **panel_kwargs)

plt.tight_layout()
plt.savefig('./plots/ws_dist.pdf')

In [None]:
# Two stacked 'ws_dist' panels including the 'sml_de' method: train/test idents
# on top, the four label/PCA interpolate/extrapolate idents below.
os.makedirs('./plots', exist_ok=True)  # plt.savefig does not create missing directories
fig, ax = plt.subplots(2, figsize=(32, 20))

panels = [
    # (idents, ident_offsets, panel-specific keyword arguments)
    (('train', 'test'),
     {'train': -0.2, 'test': 0.2},
     {'xticklabels': False, 'ticklabel_tilt': 45}),
    (('label_test_interpolate', 'label_test_extrapolate', 'pca_test_interpolate', 'pca_test_extrapolate'),
     {'label_test_interpolate': -0.2, 'label_test_extrapolate': -0.1, 'pca_test_interpolate': 0.1, 'pca_test_extrapolate': 0.2},
     {'ticklabel_tilt': 45}),
]
for panel_ax, (panel_idents, panel_offsets, panel_kwargs) in zip(ax, panels):
    # List arguments are spelled out inside the loop so each call gets fresh objects.
    plot_metrics(aggregated_mean, 'ws_dist', idents=panel_idents,
                 datasets=['yacht', 'diabetes', 'boston', 'energy', 'concrete', 'wine_red', 'abalone', 'power', 'naval', 'california', 'superconduct', 'protein', 'year'],
                 methods=['de', 'mc', 'mc_ll', 'mc_mod_sml', 'pu', 'pu_de', 'sml_de'],
                 ylim=[0, 4], yscale='linear', figsize=(32, 10),
                 ident_offsets=panel_offsets,
                 summary_stat_funcs=[np.mean, np.median, _25q, _75q],
                 summary_rank_funcs=[],
                 sml_name='ours', s=160, savefig=None, ax=panel_ax, **panel_kwargs)

plt.tight_layout()
plt.savefig('./plots/ws_dist_smlde.pdf')

# Hyperparameter study (beta parameter)

In [None]:
# Hyperparameter-study directories to aggregate; replace the placeholders with real paths.
exp_dirs = ['/INCLUDE/PATH/TO/HYPERPARAMETER_EXPERIMENT/DIR/HERE', '/MULTIPLE/PATHS/CAN/BE/GIVEN']

aggregated = aggregate_over_folds(exp_dirs)
# Reduce every cell of the frame to its mean (presumably over folds -- confirm against aggregate_over_folds).
aggregated_mean = aggregated.applymap(np.mean)

# Keep the datasets that are present, in a fixed display order.
sorted_datasets = ['toy', 'toy_hf', 'toy_uniform', 'toy_modulated', 'toy_noise', 'toy_noise_strong', 'yacht', 'diabetes',  'boston', 'energy', 'concrete',  'wine_red', 'abalone', 'power','naval', 'california','superconduct','protein','year']
aggregated_mean = aggregated_mean.loc[[ds for ds in sorted_datasets if ds in aggregated_mean.index.get_level_values(0).values]]

# Lookup tables read as globals by the plotting functions.
method_to_marker = {'mc': 'D', 'mc_ll': 'd', 'mc_mod_sml': 's',
                    'pu': '.', 'de': 'x',
                    'mc_mod_sml1': 'o', 'mc_mod_sml25': '^', 'mc_mod_sml75': 'p', 'mc_mod_sml9': '+',
                    'pu_de': '*', 'sml_de': '+', 'mc_mod_sml0': '.', 'mc_mod_sml10': '*'}
ident_to_color = {'train': 'g', 'test': 'b', 'label_test_interpolate': 'r', 'label_test_extrapolate': 'lightcoral',
                  'pca_test_interpolate': 'y', 'pca_test_extrapolate': 'orange'}
# Every name in `sorted_datasets` needs a marker, otherwise plots that vary
# over datasets fail with a KeyError as soon as a toy variant is present.
dataset_to_marker = {'toy': ',', 'toy_hf': '2', 'toy_uniform': '4', 'toy_modulated': '<',
                     'toy_noise': '.', 'toy_noise_strong': '>',
                     'yacht': '+', 'diabetes': 'x', 'boston': '|',
                     'energy': '_', 'concrete': '1', 'wine_red': '3',
                     'abalone': 'o', 'naval': 'v', 'power': '^', 'california': 's', 'superconduct': 'P',
                     'protein': 'D', 'year': '*'}
fontP = FontProperties()
fontP.set_size('xx-small')

In [None]:
# Two stacked 'rmse' panels comparing the 'mc_mod_sml*' variants: train/test
# idents on top, the four label/PCA interpolate/extrapolate idents below.
os.makedirs('./plots', exist_ok=True)  # plt.savefig does not create missing directories
fig, ax = plt.subplots(2, figsize=(32, 20))

panels = [
    # (idents, ident_offsets, panel-specific keyword arguments)
    (('train', 'test'),
     {'train': -0.2, 'test': 0.2},
     {'xticklabels': False, 'ticklabel_tilt': 45}),
    (('label_test_interpolate', 'label_test_extrapolate', 'pca_test_interpolate', 'pca_test_extrapolate'),
     {'label_test_interpolate': -0.25, 'label_test_extrapolate': -0.1, 'pca_test_interpolate': 0.1, 'pca_test_extrapolate': 0.25},
     {'ticklabel_tilt': 45}),
]
for panel_ax, (panel_idents, panel_offsets, panel_kwargs) in zip(ax, panels):
    # List arguments are spelled out inside the loop so each call gets fresh objects.
    plot_metrics(aggregated_mean, 'rmse', idents=panel_idents,
                 datasets=['yacht', 'diabetes', 'boston', 'energy', 'concrete', 'wine_red', 'abalone', 'power', 'naval', 'california', 'superconduct', 'protein', 'year'],
                 methods=['mc_mod_sml1', 'mc_mod_sml25', 'mc_mod_sml', 'mc_mod_sml75', 'mc_mod_sml9'],
                 ylim=[-0.05, 1.1], yscale='linear', figsize=(32, 10),
                 ident_offsets=panel_offsets,
                 summary_stat_funcs=[np.mean, np.median, _25q, _75q],
                 summary_rank_funcs=[],
                 sml_name='ours', s=160, savefig=None, ax=panel_ax, **panel_kwargs)

plt.tight_layout()
plt.savefig('./plots/rmse_smlbeta.pdf')

In [None]:
# Two stacked 'nll' panels comparing the 'mc_mod_sml*' variants: train/test
# idents on top, the four label/PCA interpolate/extrapolate idents below.
os.makedirs('./plots', exist_ok=True)  # plt.savefig does not create missing directories
fig, ax = plt.subplots(2, figsize=(32, 20))

panels = [
    # (idents, ident_offsets, panel-specific keyword arguments)
    (('train', 'test'),
     {'train': -0.2, 'test': 0.2},
     {'xticklabels': False}),
    (('label_test_interpolate', 'label_test_extrapolate', 'pca_test_interpolate', 'pca_test_extrapolate'),
     {'label_test_interpolate': -0.25, 'label_test_extrapolate': -0.1, 'pca_test_interpolate': 0.1, 'pca_test_extrapolate': 0.25},
     {'ticklabel_tilt': 45}),
]
for panel_ax, (panel_idents, panel_offsets, panel_kwargs) in zip(ax, panels):
    # List arguments are spelled out inside the loop so each call gets fresh objects.
    plot_metrics(aggregated_mean, 'nll', idents=panel_idents,
                 datasets=['yacht', 'diabetes', 'boston', 'energy', 'concrete', 'wine_red', 'abalone', 'power', 'naval', 'california', 'superconduct', 'protein', 'year'],
                 methods=['mc_mod_sml1', 'mc_mod_sml25', 'mc_mod_sml', 'mc_mod_sml75', 'mc_mod_sml9'],
                 ylim=[-10, 70], yscale='linear', figsize=(32, 10),
                 ident_offsets=panel_offsets,
                 summary_stat_funcs=[np.mean, np.median, _25q, _75q],
                 summary_rank_funcs=[],
                 sml_name='ours', s=160, savefig=None, ax=panel_ax, **panel_kwargs)

plt.tight_layout()
plt.savefig('./plots/nll_sml.pdf')

In [None]:
# Two stacked 'ece' panels comparing the 'mc_mod_sml*' variants: train/test
# idents on top, the four label/PCA interpolate/extrapolate idents below.
os.makedirs('./plots', exist_ok=True)  # plt.savefig does not create missing directories
fig, ax = plt.subplots(2, figsize=(32, 20))

panels = [
    # (idents, ident_offsets, panel-specific keyword arguments)
    (('train', 'test'),
     {'train': -0.2, 'test': 0.2},
     {'xticklabels': False, 'ticklabel_tilt': 45}),
    (('label_test_interpolate', 'label_test_extrapolate', 'pca_test_interpolate', 'pca_test_extrapolate'),
     {'label_test_interpolate': -0.25, 'label_test_extrapolate': -0.1, 'pca_test_interpolate': 0.1, 'pca_test_extrapolate': 0.25},
     {'ticklabel_tilt': 45}),
]
for panel_ax, (panel_idents, panel_offsets, panel_kwargs) in zip(ax, panels):
    # List arguments are spelled out inside the loop so each call gets fresh objects.
    plot_metrics(aggregated_mean, 'ece', idents=panel_idents,
                 datasets=['yacht', 'diabetes', 'boston', 'energy', 'concrete', 'wine_red', 'abalone', 'power', 'naval', 'california', 'superconduct', 'protein', 'year'],
                 methods=['mc_mod_sml1', 'mc_mod_sml25', 'mc_mod_sml', 'mc_mod_sml75', 'mc_mod_sml9'],
                 ylim=[0, 2], yscale='linear', figsize=(32, 10),
                 ident_offsets=panel_offsets,
                 summary_stat_funcs=[np.mean, np.median, _25q, _75q],
                 summary_rank_funcs=[],
                 sml_name='ours', s=160, savefig=None, ax=panel_ax, **panel_kwargs)

plt.tight_layout()
plt.savefig('./plots/ece_sml.pdf')

In [None]:
# Two stacked 'ws_dist' panels comparing the 'mc_mod_sml*' variants: train/test
# idents on top, the four label/PCA interpolate/extrapolate idents below.
os.makedirs('./plots', exist_ok=True)  # plt.savefig does not create missing directories
fig, ax = plt.subplots(2, figsize=(32, 20))

panels = [
    # (idents, ident_offsets, panel-specific keyword arguments)
    (('train', 'test'),
     {'train': -0.2, 'test': 0.2},
     {'xticklabels': False, 'ticklabel_tilt': 45}),
    (('label_test_interpolate', 'label_test_extrapolate', 'pca_test_interpolate', 'pca_test_extrapolate'),
     {'label_test_interpolate': -0.2, 'label_test_extrapolate': -0.1, 'pca_test_interpolate': 0.1, 'pca_test_extrapolate': 0.2},
     {'ticklabel_tilt': 45}),
]
for panel_ax, (panel_idents, panel_offsets, panel_kwargs) in zip(ax, panels):
    # List arguments are spelled out inside the loop so each call gets fresh objects.
    plot_metrics(aggregated_mean, 'ws_dist', idents=panel_idents,
                 datasets=['yacht', 'diabetes', 'boston', 'energy', 'concrete', 'wine_red', 'abalone', 'power', 'naval', 'california', 'superconduct', 'protein', 'year'],
                 methods=['mc_mod_sml1', 'mc_mod_sml25', 'mc_mod_sml', 'mc_mod_sml75', 'mc_mod_sml9'],
                 ylim=[0, 4], yscale='linear', figsize=(32, 10),
                 ident_offsets=panel_offsets,
                 summary_stat_funcs=[np.mean, np.median, _25q, _75q],
                 summary_rank_funcs=[],
                 sml_name='ours', s=160, savefig=None, ax=panel_ax, **panel_kwargs)

plt.tight_layout()
plt.savefig('./plots/ws_dist_sml.pdf')

# Uncertainty/Performance measure analysis

In [None]:
# Font sizes (in points) shared by the rc settings below.
SMALL_SIZE = 6
MEDIUM_SIZE = 20
BIGGER_SIZE = 25

# Global matplotlib style for the scatter analyses: large text, thick axes,
# ticks and lines, but a small legend and figure title.
plt.rcParams.update({
    'font.size': BIGGER_SIZE,          # default text size
    'axes.titlesize': BIGGER_SIZE,     # axes title
    'axes.labelsize': BIGGER_SIZE,     # x and y axis labels
    'axes.linewidth': 5,
    'xtick.labelsize': BIGGER_SIZE,    # x tick labels
    'xtick.major.width': 5,
    'xtick.major.size': 10,
    'xtick.minor.width': 5,
    'xtick.minor.size': 10,
    'ytick.labelsize': BIGGER_SIZE,    # y tick labels
    'ytick.major.width': 5,
    'ytick.major.size': 10,
    'ytick.minor.width': 5,
    'ytick.minor.size': 10,
    'legend.fontsize': SMALL_SIZE,
    'figure.titlesize': SMALL_SIZE,
    'lines.linewidth': 5,
})

In [None]:
# Experiment directories to aggregate; replace the placeholders with real paths.
exp_dirs = ['/INCLUDE/PATH/TO/EXPERIMENTS/DIR/HERE', '/MULTIPLE/PATHS/CAN/BE/GIVEN']

aggregated = aggregate_over_folds(exp_dirs)
# Reduce every cell of the frame to its mean (presumably over folds -- confirm against aggregate_over_folds).
aggregated_mean = aggregated.applymap(np.mean)

# Keep the datasets that are present, in a fixed display order.
sorted_datasets = ['toy', 'toy_hf', 'toy_uniform', 'toy_modulated', 'toy_noise', 'toy_noise_strong', 'yacht', 'diabetes',  'boston', 'energy', 'concrete',  'wine_red', 'abalone', 'power','naval', 'california','superconduct','protein','year']
aggregated_mean = aggregated_mean.loc[[ds for ds in sorted_datasets if ds in aggregated_mean.index.get_level_values(0).values]]

# Lookup tables read as globals by the plotting functions.
method_to_marker = {'mc': 'D', 'mc_ll': 'd', 'mc_mod_sml': 's',
                    'pu': '.', 'de': 'x',
                    'mc_mod_sml1': 'o', 'mc_mod_sml25': '^', 'mc_mod_sml75': 'p', 'mc_mod_sml9': '+',
                    'pu_de': '*', 'sml_de': '+', 'mc_mod_sml0': '.', 'mc_mod_sml10': '*'}
ident_to_color = {'train': 'g', 'test': 'b', 'label_test_interpolate': 'r', 'label_test_extrapolate': 'lightcoral',
                  'pca_test_interpolate': 'y', 'pca_test_extrapolate': 'orange'}
# Every name in `sorted_datasets` needs a marker, otherwise plots that vary
# over datasets fail with a KeyError as soon as a toy variant is present.
dataset_to_marker = {'toy': ',', 'toy_hf': '2', 'toy_uniform': '4', 'toy_modulated': '<',
                     'toy_noise': '.', 'toy_noise_strong': '>',
                     'yacht': '+', 'diabetes': 'x', 'boston': '|',
                     'energy': '_', 'concrete': '1', 'wine_red': '3',
                     'abalone': 'o', 'naval': 'v', 'power': '^', 'california': 's', 'superconduct': 'P',
                     'protein': 'D', 'year': '*'}
fontP = FontProperties()
fontP.set_size('xx-small')

In [None]:
# Pass the datasets actually present in the frame as a list: the `available_datasets`
# set is unordered and may name datasets that are absent from the index.
present_datasets = aggregated_mean.index.get_level_values(0).unique().tolist()
plot_uncertainty_vs_performance(aggregated_mean, 'rmse', 'nll', present_datasets, second_dim='methods',
                                xlim=[0, 1.3], ylim=[-6, 20], xscale='linear', figsize=(40, 25))

In [None]:
# Pass the datasets actually present in the frame as a list: the `available_datasets`
# set is unordered and may name datasets that are absent from the index.
present_datasets = aggregated_mean.index.get_level_values(0).unique().tolist()
plot_uncertainty_vs_performance(aggregated_mean, 'rmse', 'r2', present_datasets, second_dim='methods',
                                xlim=[0, 1.3], ylim=[-10, 2], xscale='linear', figsize=(40, 25))

In [None]:
# Pass the datasets actually present in the frame as a list: the `available_datasets`
# set is unordered and may name datasets that are absent from the index.
present_datasets = aggregated_mean.index.get_level_values(0).unique().tolist()
plot_uncertainty_vs_uncertainty(aggregated_mean, ['ws_dist', 'ece'], present_datasets, 'methods',
                                xlim=[1e-2, 5*10**1], figsize=(40, 25))

In [None]:
# Pass the datasets actually present in the frame as a list: the `available_datasets`
# set is unordered and may name datasets that are absent from the index.
present_datasets = aggregated_mean.index.get_level_values(0).unique().tolist()
plot_uncertainty_vs_uncertainty(aggregated_mean, ['ws_dist', 'ks_dist'], present_datasets, 'methods',
                                xlim=[1e-2, 5*10**1], figsize=(40, 25))

In [None]:
# Pass the datasets actually present in the frame as a list: the `available_datasets`
# set is unordered and may name datasets that are absent from the index.
present_datasets = aggregated_mean.index.get_level_values(0).unique().tolist()
plot_uncertainty_vs_uncertainty(aggregated_mean, ['ks_dist', 'ece'], present_datasets, 'methods',
                                xlim=[0, 1], figsize=(40, 25), xscale='linear')

In [None]:
# Pass the datasets actually present in the frame as a list: the `available_datasets`
# set is unordered and may name datasets that are absent from the index.
present_datasets = aggregated_mean.index.get_level_values(0).unique().tolist()
plot_uncertainty_vs_uncertainty(aggregated_mean, ['ece', 'ece_calib'], present_datasets, 'methods',
                                xlim=[0, 2], ylim=[0, 2], figsize=(40, 25), xscale='linear')

In [None]:
# Pass the datasets actually present in the frame as a list: the `available_datasets`
# set is unordered and may name datasets that are absent from the index.
present_datasets = aggregated_mean.index.get_level_values(0).unique().tolist()
plot_uncertainty_vs_performance(aggregated_mean, 'rmse', 'ece', present_datasets, second_dim='methods',
                                ylim=[0, 2], xscale='linear', figsize=(30, 15))

In [None]:
# Pass the datasets actually present in the frame as a list: the `available_datasets`
# set is unordered and may name datasets that are absent from the index.
present_datasets = aggregated_mean.index.get_level_values(0).unique().tolist()
plot_uncertainty_vs_performance(aggregated_mean, 'nll', 'ece', present_datasets, second_dim='methods',
                                ylim=[0, 2], xlim=[-6, 50], xscale='linear', figsize=(40, 25))