# Results of Dirichlet Calibration

Generate LaTeX tables and critical-difference (CD) diagrams for log-loss, Brier score, error rate, confidence-ECE, classwise-ECE and MCE.

1. Models need to be trained and tuned for calibrators
    1. Dir-l2, MS-odir, Dir-odir, TempS and VecS
    2. For that read ReadMe.txt in scripts folder.
2. Put the tunings in correct folders and run this notebook

In [1]:
# Imports to get "utility" package
import sys
from os import path
sys.path.append( path.dirname( path.dirname( path.abspath("calibration") ) ) )

In [2]:
import numpy as np
import pandas as pd
from os.path import join
from calibration.cal_methods import evaluate, cal_results, TemperatureScaling, MatrixScaling, LogisticCalibration, VectorScaling_NN
from dirichlet import FullDirichletCalibrator
import pickle

Using TensorFlow backend.


In [3]:
import sys
from os import path
import os
import glob

## Read in the parameter tuning results

Path to logits and tuning

In [4]:
# Folder with the pickled logits of the uncalibrated networks.
PATH = join("..", "..", "logits")

# Logit files of the models listed under the 10-class group.
files_10 = (
    "probs_resnet_wide32_c10_logits.p",
    "probs_densenet40_c10_logits.p",
    "probs_lenet5_c10_logits.p",
    "probs_resnet110_SD_c10_logits.p",
    "probs_resnet110_c10_logits.p",
    "probs_resnet152_SD_SVHN_logits.p",
    "logits_pretrained_c10_logits.p",
    "logits_pretrained_mnist_logits.p",
    "logits_pretrained_svhn_logits.p",
)

# Logit files of the models listed under the 100-class group.
files_100 = (
    "probs_resnet_wide32_c100_logits.p",
    "probs_densenet40_c100_logits.p",
    "probs_lenet5_c100_logits.p",
    "probs_resnet110_SD_c100_logits.p",
    "probs_resnet110_c100_logits.p",
    "logits_pretrained_c100_logits.p",
)

# One folder of pickled tuning results per calibrator.
# Replace with folder where the tunings are
PATH_DF_DIR_ODIR = join("..", "..", "tunings_all", "tunings_dir_odir")
PATH_DF_DIR = join("..", "..", "tunings_all", "tunings_dir_l2")
PATH_DF_MAT_ODIR = join("..", "..", "tunings_all", "tunings_mat_odir")
PATH_DF_TEMP = join("..", "..", "tunings_all", "tunings_temp_scale")
PATH_DF_VEC = join("..", "..", "tunings_all", "tunings_vec_scale")

In [5]:
def read_in_results_guo(path, ext = ".p"):
    """Load every pickle in a folder and stack the results into one DataFrame.

    Args:
        path: directory searched (non-recursively) for result files.
        ext: suffix a file name must end with to be loaded.

    Returns:
        The concatenation of all unpickled objects (passed to ``pd.concat``)
        with a fresh 0..n-1 index.
    """
    frames = []

    # NOTE: pickle.load can execute arbitrary code; only read trusted folders.
    for pickle_path in glob.glob(join(path, "*" + ext)):
        with open(pickle_path, "rb") as handle:
            frames.append(pickle.load(handle))

    stacked = pd.concat(frames, sort=False)
    return stacked.reset_index(drop=True)

In [6]:
def read_in_results2(path, ext = ".p"):
    """Load tuning pickles that hold a pair of frames and merge each pair side by side.

    Every pickle is indexed as ``obj[0]`` and ``obj[1]``. The test-metric
    columns of ``obj[1]`` are renamed with an ``_ens`` suffix and appended
    column-wise to ``obj[0]``.

    Args:
        path: directory searched (non-recursively) for result files.
        ext: suffix a file name must end with to be loaded.

    Returns:
        One DataFrame stacking the merged frames of all files, re-indexed 0..n-1.
    """
    metric_cols = ["Error_test", "ECE_test", "ECE2_test", "ECE_CW_test", "ECE_CW2_test",
                   "ECE_FULL_test", "ECE_FULL2_test", "MCE_test", "MCE2_test",
                   "Loss_test", "Brier_test"]
    ens_names = [metric + "_ens" for metric in metric_cols]

    merged_frames = []

    # NOTE: pickle.load can execute arbitrary code; only read trusted folders.
    for pickle_path in glob.glob(join(path, "*" + ext)):
        with open(pickle_path, "rb") as handle:
            payload = pickle.load(handle)

        # Keep only the known metric columns of the second frame and tag them as "_ens".
        ens_part = payload[1].loc[:, metric_cols]
        ens_part.columns = ens_names
        merged_frames.append(pd.concat([payload[0], ens_part], axis=1))

    stacked = pd.concat(merged_frames, sort=False)
    return stacked.reset_index(drop=True)

In [7]:
# Load the pickled tuning results, one folder per calibrator
# (Dir-ODIR, Dir-L2 and MS-ODIR respectively).
df_tunings_dir_odir = read_in_results2(PATH_DF_DIR_ODIR)
df_tunings_dir = read_in_results2(PATH_DF_DIR)
df_tunings_mat_odir = read_in_results2(PATH_DF_MAT_ODIR)

## Best results of Dir-l2, Dir-ODIR, MS-ODIR

In [8]:
def get_best_validation(df, method_name, param = "Loss", equal_mu = False):
    """Pick, for every model, the tuning row with the smallest value of ``param``.

    Args:
        df: tuning results, one row per (model, hyper-parameter setting). Must
            contain the columns "Name", "L2", "mu", "Brier_std", "Loss_std"
            and ``param``.
        method_name: label written into the "Method" column of the result.
        param: column that is minimised to choose the best row of each model.
        equal_mu: if True, only rows where ``mu == L2`` are candidates.

    Returns:
        DataFrame with one row per model, sorted by "Name". Columns are
        "Name", "Method", the original columns from position 3 onwards
        (without "Brier_std"/"Loss_std"), then "Opt_by", "L2" and "mu".
        An input without rows yields an empty frame with those columns.

    Raises:
        IndexError: if a model has no candidate row left after the
            ``equal_mu`` filter.
        ValueError: if "Brier_std" or "Loss_std" is not among the columns.
    """
    best_rows = []

    for name in df.Name.unique():
        df_sub = df[df.Name == name]
        if equal_mu:  # Get only equal L2 and mu values
            df_sub = df_sub[df_sub.mu == df_sub.L2]

        # First row after an ascending sort is the row with the smallest `param`.
        best_rows.append(df_sub.sort_values([param]).iloc[0])

    # Use the input's own columns so that an empty input does not depend on a loop variable.
    df_temp = pd.DataFrame(best_rows, columns=df.columns)

    # Positions 1 and 2 are dropped here and "L2"/"mu" re-appended at the end;
    # assumes those two positions hold the L2 and mu columns - TODO confirm.
    cols = df_temp.columns
    cols = list(cols[:1]) + ["Method"] + list(cols[3:]) + ["Opt_by", "L2", "mu"]
    cols.remove("Brier_std")
    cols.remove("Loss_std")

    df_temp = df_temp.assign(Opt_by = param).assign(Method = method_name)

    return df_temp[cols].sort_values(["Name"])

In [9]:
def get_opt_results(df_opt, measure = "Loss_test", opt_by = ["Loss", "Brier", "Error", "ECE"], equal_mu = False,
                    method_name = "dir_nn"):
    """Compare one score across several model-selection criteria.

    For every criterion in ``opt_by`` the best tuning row per model is chosen
    with ``get_best_validation`` and the ``measure`` column of those rows
    becomes one column of the result.

    Args:
        df_opt: tuning results as accepted by ``get_best_validation``.
        measure: column whose values are reported.
        opt_by: validation columns used, one at a time, to select the best
            row. The default list is only read, never modified.
        equal_mu: forwarded to ``get_best_validation``.
        method_name: forwarded to ``get_best_validation``, which requires it;
            it only labels the intermediate frames and does not affect the
            returned values.

    Returns:
        DataFrame with one row per model (ordered by "Name") and one column
        "Dirichlet NN -<criterion> Opt" per entry of ``opt_by``.
    """
    res = []
    columns = []

    for opt in opt_by:
        # method_name is a required argument of get_best_validation; omitting it raised TypeError.
        df = get_best_validation(df_opt, method_name, param=opt, equal_mu = equal_mu).sort_values(["Name"])
        res.append(df[measure].values)
        columns.append("Dirichlet NN -" + opt + " Opt")

    return pd.DataFrame(np.array(res).T, columns=columns)

In [10]:
# For each calibrator keep, per model, the tuning row with the lowest validation "Loss".
df_dir_l2_mu_off = get_best_validation(df_tunings_dir_odir, method_name="dir_l2_mu_off", param="Loss", equal_mu=False).round(7)
df_dir_l2 = get_best_validation(df_tunings_dir, method_name="dir_l2", param="Loss", equal_mu=True).round(7) # Equal_mu = True in order to get only L2 regularisation.
df_mat_scale_l2_mu_off = get_best_validation(df_tunings_mat_odir, method_name="mat_scale_l2_mu_off", param="Loss", 
                                             equal_mu=False).round(7)


In [11]:
df_dir_l2_mu_off.head()

Unnamed: 0,Name,Method,Error,ECE,ECE2,ECE_CW,ECE_CW2,ECE_FULL,ECE_FULL2,MCE,...,ECE_CW2_test_ens,ECE_FULL_test_ens,ECE_FULL2_test_ens,MCE_test_ens,MCE2_test_ens,Loss_test_ens,Brier_test_ens,Opt_by,L2,mu
747,densenet40_c10,dir_l2_mu_off,6.24,0.015713,0.002651,0.083977,0.018462,0.167597,0.070183,0.412288,...,0.002633,0.143215,0.054716,0.078576,0.076491,0.223705,0.01097,Loss,1000.0,0.01
4007,densenet40_c100,dir_l2_mu_off,29.62,0.0308,0.000488,0.461025,0.128261,0.791388,0.384814,0.159051,...,0.024202,0.763024,0.353489,0.040744,0.042889,1.059094,0.003998,Loss,5000.0,0.01
1044,lenet5_c10,dir_l2_mu_off,25.52,0.039408,0.006646,0.176001,0.023353,0.654828,0.295644,0.173591,...,0.002486,0.52917,0.211955,0.090094,0.0423,0.744405,0.03511,Loss,0.25,1e-05
3380,lenet5_c100,dir_l2_mu_off,62.26,0.039782,0.001151,0.55228,0.119505,1.489948,0.742912,0.130237,...,0.023383,1.466895,0.689572,0.067742,0.067742,2.489512,0.007605,Loss,0.25,1000000.0
2328,pretrained_c10,dir_l2_mu_off,6.28,0.016186,0.00286,0.089894,0.019674,0.17309,0.070913,0.37011,...,0.006201,0.139026,0.054544,0.248298,0.081494,0.195364,0.009548,Loss,5000.0,10000.0


In [12]:
df_dir_l2.head()

Unnamed: 0,Name,Method,Error,ECE,ECE2,ECE_CW,ECE_CW2,ECE_FULL,ECE_FULL2,MCE,...,ECE_CW2_test_ens,ECE_FULL_test_ens,ECE_FULL2_test_ens,MCE_test_ens,MCE2_test_ens,Loss_test_ens,Brier_test_ens,Opt_by,L2,mu
1782,densenet40_c10,dir_l2,6.5,0.012357,0.002293,0.086156,0.018295,0.176139,0.072837,0.433928,...,0.002692,0.145689,0.055899,0.116788,0.047763,0.220482,0.011024,Loss,0.01,0.01
289,densenet40_c100,dir_l2,35.98,0.093823,0.004412,0.572161,0.140565,1.069323,0.47965,0.217374,...,0.040958,1.051383,0.441055,0.197476,0.197476,1.25293,0.004683,Loss,0.1,0.1
2177,lenet5_c10,dir_l2,25.7,0.04128,0.006159,0.174665,0.024615,0.65564,0.293936,0.165902,...,0.002199,0.527884,0.213082,0.051116,0.03876,0.744182,0.035097,Loss,0.001,0.001
504,lenet5_c100,dir_l2,66.18,0.036629,0.002595,0.542686,0.10855,1.560279,0.758551,0.130136,...,0.022582,1.517733,0.682036,0.056782,0.056782,2.594822,0.007858,Loss,0.01,0.01
3859,pretrained_c10,dir_l2,6.7,0.01677,0.003013,0.093739,0.021596,0.176871,0.070774,0.274225,...,0.00534,0.139637,0.055089,0.124316,0.124316,0.19692,0.009694,Loss,0.01,0.01


In [13]:
df_mat_scale_l2_mu_off.head()

Unnamed: 0,Name,Method,Error,ECE,ECE2,ECE_CW,ECE_CW2,ECE_FULL,ECE_FULL2,MCE,...,ECE_CW2_test_ens,ECE_FULL_test_ens,ECE_FULL2_test_ens,MCE_test_ens,MCE2_test_ens,Loss_test_ens,Brier_test_ens,Opt_by,L2,mu
1936,densenet40_c10,mat_scale_l2_mu_off,6.06,0.015647,0.002933,0.084829,0.019821,0.168832,0.070448,0.469684,...,0.00263,0.140095,0.054258,0.11297,0.083437,0.2224,0.010965,Loss,250.0,0.0001
546,densenet40_c100,mat_scale_l2_mu_off,29.76,0.034567,0.00071,0.456929,0.13111,0.762973,0.378863,0.14373,...,0.026611,0.738238,0.346329,0.050041,0.048906,1.050843,0.003995,Loss,2.5,10000.0
2247,lenet5_c10,mat_scale_l2_mu_off,25.66,0.039376,0.006324,0.179203,0.025234,0.656221,0.297733,0.188044,...,0.002484,0.52443,0.211681,0.100608,0.052627,0.742618,0.035022,Loss,0.1,1e-05
54,lenet5_c100,mat_scale_l2_mu_off,62.34,0.035335,0.000772,0.539752,0.118223,1.499788,0.736994,0.112677,...,0.022916,1.47192,0.68003,0.089387,0.089387,2.486696,0.007598,Loss,0.25,0.01
3966,pretrained_c10,mat_scale_l2_mu_off,6.3,0.018967,0.003141,0.089739,0.020291,0.172345,0.071405,0.308563,...,0.00545,0.140909,0.053865,0.076215,0.081047,0.19634,0.009532,Loss,1000.0,100000.0


## Temperature Scaling and Vector Scaling

In [14]:
# Raw temperature-scaling and vector-scaling results; reshaped later by guo_results_to_df.
df_temp_scale_old = read_in_results_guo(PATH_DF_TEMP)
df_vec_scale_old = read_in_results_guo(PATH_DF_VEC)

In [15]:
df_temp_scale_old.head()

Unnamed: 0,Name,Error,ECE,ECE2,ECE_CW,ECE_CW2,ECE_FULL,ECE_FULL2,MCE,MCE2,Loss,Brier
0,resnet_wide32_c10,6.07,0.045054,0.008892,0.095299,0.011009,0.113805,0.03057,0.372155,0.32434,0.381704,0.010466
1,resnet_wide32_c10_calib,6.07,0.007838,0.001432,0.047749,0.004808,0.134484,0.048069,0.070598,0.092702,0.191482,0.009237
2,resnet_wide32_c10_val_calib,5.58,0.006715,0.001086,0.054137,0.007085,0.138973,0.051454,0.237802,0.065407,0.175842,0.008473
3,densenet40_c10,7.58,0.055003,0.011118,0.114303,0.013347,0.138052,0.038779,0.333955,0.333955,0.428207,0.012738
4,densenet40_c10_calib,7.58,0.009464,0.001835,0.039768,0.002978,0.146121,0.056088,0.099293,0.05591,0.225086,0.010997


The calibration scores for Temperature Scaling and Vector Scaling come in a slightly different format, so we use `guo_results_to_df` to reshape them into the same layout as the other methods.

In [16]:
def guo_results_to_df(df_res, method_name):
    """Reshape a results frame laid out in row triples into one row per model.

    Rows are read with a stride of three: the rows at offset 1 are treated as
    test scores (last 6 characters of "Name" removed) and the rows at offset 2
    as validation scores (last 10 characters removed). Rows at offset 0 are
    not used here.

    Args:
        df_res: frame whose first column is "Name", followed by score columns.
        method_name: label written into the "Method" column.

    Returns:
        DataFrame sorted by model name with "Name", "Method", the validation
        scores (all but the last input column) and the test scores renamed,
        by position, to ``<metric>_test_ens``; values rounded to 7 decimals.
    """
    def _triple_rows(offset, suffix_len):
        # Every third row starting at `offset`, with the name suffix cut off, ordered by name.
        part = df_res[offset::3]
        part = part.assign(Name = [name[:-suffix_len] for name in part.Name])
        return part.sort_values(["Name"])

    # Test part: drop the leading "Name" column (it is kept from the validation part).
    test_part = _triple_rows(1, 6).iloc[:, 1:].reset_index(drop=True)
    # Validation part: drop the last column.
    val_part = _triple_rows(2, 10).iloc[:, :-1].reset_index(drop=True)

    ens_names = [base + "_ens" for base in
                 ("Error_test", "ECE_test", "ECE2_test", "ECE_CW_test", "ECE_CW2_test", "ECE_FULL_test",
                  "ECE_FULL2_test", "MCE_test", "MCE2_test", "Loss_test", "Brier_test", "Temperature")]

    # Names are applied by position, to as many columns as the test part has.
    test_part.columns = ens_names[:len(test_part.columns)]

    merged = pd.concat([val_part, test_part], axis=1, sort=False).assign(Method = method_name)

    # Move "Method" (appended last by assign) right behind "Name".
    ordered = list(merged.columns[:1]) + ["Method"] + list(merged.columns[1:-1])
    return merged[ordered].round(7)

In [17]:
# Bring the temperature/vector scaling results into the one-row-per-model layout.
df_temp_scale = guo_results_to_df(df_temp_scale_old, "temp_scale")
df_vec_scale = guo_results_to_df(df_vec_scale_old, "vec_scale")

In [18]:
df_temp_scale.head()

Unnamed: 0,Name,Method,Error,ECE,ECE2,ECE_CW,ECE_CW2,ECE_FULL,ECE_FULL2,MCE,...,ECE_test_ens,ECE2_test_ens,ECE_CW_test_ens,ECE_CW2_test_ens,ECE_FULL_test_ens,ECE_FULL2_test_ens,MCE_test_ens,MCE2_test_ens,Loss_test_ens,Brier_test_ens
0,densenet40_c10,temp_scale,6.18,0.005947,0.001024,0.048018,0.005209,0.148095,0.0569,0.107191,...,0.009464,0.001834,0.039768,0.002978,0.14612,0.056088,0.099293,0.05591,0.225086,0.010998
1,densenet40_c100,temp_scale,29.74,0.014442,0.000219,0.255396,0.045779,0.791452,0.372116,0.045564,...,0.009021,0.000177,0.186638,0.025082,0.776049,0.358816,0.022128,0.021038,1.057131,0.004009
2,lenet5_c10,temp_scale,27.28,0.01346,0.002358,0.167852,0.01774,0.628964,0.269223,0.116833,...,0.016653,0.002161,0.171409,0.017506,0.585236,0.239951,0.091579,0.038065,0.800311,0.037476
3,lenet5_c100,temp_scale,65.54,0.018939,0.000257,0.423656,0.052449,1.550672,0.697519,0.076594,...,0.014989,0.000231,0.384814,0.03541,1.529989,0.659623,0.058358,0.058358,2.649791,0.007925
4,pretrained_c10,temp_scale,6.26,0.005647,0.001073,0.037638,0.003763,0.147001,0.057338,0.258719,...,0.010651,0.001626,0.04423,0.005349,0.141572,0.05424,0.231497,0.08579,0.194969,0.009517


## Uncalibrated results

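The uncalibrated scores are taken from the temperature-scaling result files: every third row, starting with the first, holds the scores of the model before calibration.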

In [19]:
# Column names as stored in the raw result frames ...
cols = [
    "Name", "Method",
    "Error", "ECE", "ECE2", "ECE_CW", "ECE_CW2", "ECE_FULL", "ECE_FULL2",
    "MCE", "MCE2", "Loss", "Brier",
]
# ... and the same columns with every score tagged as a test score.
cols_test = cols[:2] + [score + "_test" for score in cols[2:]]

In [20]:
# Every third row (offset 0) of the raw temperature-scaling frame is used as the
# uncalibrated score of a model; label it and rename the scores to "*_test".
df_uncal = (
    df_temp_scale_old[0::3]
    .sort_values(["Name"])
    .reset_index(drop=True)
    .assign(Method = "uncal")
    .round(7)
    .loc[:, cols]
)
df_uncal.columns = cols_test

In [21]:
df_uncal.head()

Unnamed: 0,Name,Method,Error_test,ECE_test,ECE2_test,ECE_CW_test,ECE_CW2_test,ECE_FULL_test,ECE_FULL2_test,MCE_test,MCE2_test,Loss_test,Brier_test
0,densenet40_c10,uncal,7.58,0.055003,0.011118,0.114303,0.013347,0.138052,0.038779,0.333955,0.333955,0.428207,0.012738
1,densenet40_c100,uncal,30.0,0.211563,0.004338,0.470262,0.090263,0.594952,0.264311,0.454003,0.454003,2.017398,0.004907
2,lenet5_c10,uncal,27.26,0.051801,0.010573,0.19849,0.020145,0.535078,0.216411,0.112807,0.112807,0.823261,0.03788
3,lenet5_c100,uncal,66.41,0.121247,0.003249,0.472635,0.057034,1.463512,0.745282,0.200969,0.200969,2.783648,0.008128
4,pretrained_c10,uncal,6.18,0.047603,0.009451,0.103751,0.013446,0.120333,0.031971,0.591731,0.36777,0.390977,0.010899


In [22]:
df_temp_scale.columns

Index(['Name', 'Method', 'Error', 'ECE', 'ECE2', 'ECE_CW', 'ECE_CW2',
       'ECE_FULL', 'ECE_FULL2', 'MCE', 'MCE2', 'Loss', 'Error_test_ens',
       'ECE_test_ens', 'ECE2_test_ens', 'ECE_CW_test_ens', 'ECE_CW2_test_ens',
       'ECE_FULL_test_ens', 'ECE_FULL2_test_ens', 'MCE_test_ens',
       'MCE2_test_ens', 'Loss_test_ens', 'Brier_test_ens'],
      dtype='object')

## Save dataframes with scores.

In [23]:
# Order matters: the first entry (uncalibrated) is later read via dfs[:1] with the
# "*_test" columns, all remaining entries via dfs[1:] with the "*_test_ens" columns.
dfs = [df_uncal, df_temp_scale, df_vec_scale, df_dir_l2, df_dir_l2_mu_off, df_mat_scale_l2_mu_off]

In [24]:
# Persist the list of score frames so the LaTeX/CD sections can start from this file.
with open("all_scores_val_test_ens_10_27.p", "wb") as f:
    pickle.dump(dfs, f)

## Results to $\LaTeX$

### Load in saved results

In [25]:
# Reload the score frames written by the save cell (same file name).
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open("all_scores_val_test_ens_10_27.p", "rb") as f:
    dfs = pickle.load(f)

In [26]:
# Test-score column names, and their "_ens" counterparts (identifier columns untouched).
cols_test = ["Name", "Method"] + [
    metric + "_test"
    for metric in ("Error", "ECE", "ECE2", "ECE_CW", "ECE_CW2", "ECE_FULL",
                   "ECE_FULL2", "MCE", "MCE2", "Loss", "Brier")
]
cols_ens = cols_test[:2] + [name + "_ens" for name in cols_test[2:]]

### Concat the results in suitable format

In [27]:
# The first saved frame (uncalibrated) already carries plain "*_test" columns.
dfs_test = pd.concat([frame[cols_test] for frame in dfs[:1]])

In [28]:
# All calibrated methods: take their "*_test_ens" scores and rename them to the
# plain "*_test" names so they can be stacked with the uncalibrated frame.
dfs_ens = pd.concat([frame[cols_ens] for frame in dfs[1:]])
dfs_ens.columns = cols_test

In [29]:
# Long-format table: one row per (model, method) with the test scores.
dfs_all = pd.concat([dfs_test, dfs_ens])

In [30]:
# The concatenated parts each bring their own index; replace it with 0..n-1 (modifies dfs_all in place).
dfs_all.reset_index(drop=True, inplace=True)

In [31]:
dfs_all

Unnamed: 0,Name,Method,Error_test,ECE_test,ECE2_test,ECE_CW_test,ECE_CW2_test,ECE_FULL_test,ECE_FULL2_test,MCE_test,MCE2_test,Loss_test,Brier_test
0,densenet40_c10,uncal,7.580000,0.055003,0.011118,0.114303,0.013347,0.138052,0.038779,0.333955,0.333955,0.428207,0.012738
1,densenet40_c100,uncal,30.000000,0.211563,0.004338,0.470262,0.090263,0.594952,0.264311,0.454003,0.454003,2.017398,0.004907
2,lenet5_c10,uncal,27.260000,0.051801,0.010573,0.198490,0.020145,0.535078,0.216411,0.112807,0.112807,0.823261,0.037880
3,lenet5_c100,uncal,66.410000,0.121247,0.003249,0.472635,0.057034,1.463512,0.745282,0.200969,0.200969,2.783648,0.008128
4,pretrained_c10,uncal,6.180000,0.047603,0.009451,0.103751,0.013446,0.120333,0.031971,0.591731,0.367770,0.390977,0.010899
5,pretrained_c100,uncal,26.120000,0.176137,0.003702,0.424139,0.088914,0.532955,0.244951,0.363908,0.363908,1.641199,0.004247
6,pretrained_mnist,uncal,1.820000,0.007475,0.001557,0.024483,0.005068,0.039012,0.014949,0.262550,0.243274,0.057663,0.002761
7,pretrained_svhn,uncal,3.827501,0.077549,0.015177,0.159350,0.018181,0.230387,0.060843,0.300115,0.300115,0.204602,0.007761
8,resnet110_SD_c10,uncal,5.960000,0.041126,0.008144,0.086471,0.009426,0.108112,0.030542,0.324843,0.300097,0.303252,0.009814
9,resnet110_SD_c100,uncal,27.170000,0.158609,0.003258,0.375184,0.067808,0.552465,0.261161,0.482906,0.352578,1.352503,0.004184


In [32]:
def get_latex(dfs, methods_sorted, value = "Loss", round_to = 2, start = 4, start_w = 1, index=True, multiplier = 1):
    """Render one score as a LaTeX table with per-row ranks and an average-rank line.

    The long-format frame is pivoted to models x methods, the rows are renamed
    and reordered using the notebook-level lists ``index_new`` and
    ``models_sorted``, and the output of ``DataFrame.to_latex`` is
    post-processed line by line.

    Args:
        dfs: long-format scores with the columns "Name", "Method" and
            ``value + "_test"``.
        methods_sorted: method labels in the column order of the table. The
            header and the column specification are hard-coded for six methods.
        value: score name without the "_test" suffix; also used in the
            caption and the label.
        round_to: number of decimals shown.
        start: index of the first body row within the lines of ``to_latex``.
        start_w: index of the first numeric cell after splitting a row on "&"
            (1 when the index column is printed, 0 otherwise).
        index: whether the model names are printed as the first column.
        multiplier: factor applied to the scores before rounding.

    Returns:
        The complete table environment as one string.
    """
    df_temp = dfs.pivot_table(index="Name", columns="Method", values=value + "_test")[methods_sorted]

    # The display names are assigned by position, so index_new must list the models
    # in exactly the row order of the pivot table.
    df_temp.index = index_new
    df_temp = df_temp.reindex(models_sorted)

    # Ranks are taken from the unrounded scores (smallest score gets rank 1).
    df_ranks = df_temp.rank(axis=1)
    df_temp = (df_temp*multiplier).round(round_to)

    # Get latex lines
    str_latex = df_temp.to_latex(index=index)
    latex_lines = str_latex.splitlines()

    ## Get average ranks line with "midrule"
    avg_ranks = " & ".join(map(str, df_ranks.mean().values.round(2)))
    column_name_avg_rank = "" if not index else " avg rank & "
    avg_ranks_str = "\\midrule \n" + column_name_avg_rank + avg_ranks + "\\\\"

    # With six value columns the 8th character from the end of "\begin{tabular}{...}"
    # is "l" exactly when an index column is present.
    if latex_lines[0][-8] == "l":
        end_piece = "|c|ccc|cc}"
    else:
        end_piece = "c|ccc|cc}"

    # Replace the six default column letters and the closing brace by the grouped layout.
    first_line = latex_lines[0][:-7] + end_piece  # TODO based on column numbers

    header_str_extra = "" if not index else " & "
    header_str = ["\\begin{table}",
                  "\\centering",
                  "\\captionof{table}{%s}" % value,
                  "\\tiny",
                  first_line,
                  "\\toprule",
                  "%s       & \\multicolumn{3}{c}{general-purpose calibrators} & \\multicolumn{2}{c}{neural-specific calibrators}\\\\ " % header_str_extra,
                  "%s Uncal &  TempS &  Dir-L2 &  Dir-ODIR &  VecS & MS-ODIR \\\\" % header_str_extra,
                  "\\midrule"]

    # Backslashes are written explicitly: "\h" and "\e" are not valid escape sequences.
    tail_str = ["\\normalsize",
                "\\label{table:res:dnn:%s}" % value.lower(),
                "\\hfill",
                "\\end{table}"]

    for i, line in enumerate(latex_lines[start:-2]):  # Starting line and ending line, may need some changes

        words = line.split("&")

        for j, nr in enumerate(words[start_w:]):
            nr_str = nr.strip(" \\")
            rank_i = df_ranks.iloc[i, j]
            # "%i" truncates, so a tied rank such as 1.5 is printed as 1.
            new_nr = "%s_{%i}" % (nr_str, rank_i)

            # Only an exact rank of 1 is set in bold.
            if rank_i == 1:
                new_nr = "$\\mathbf{%s}$" % new_nr
            else:
                new_nr = "$%s$" % new_nr

            words[j + start_w] = new_nr

        new_line = " & ".join(words) + " \\\\"

        latex_lines[i + start] = new_line

    # Separator lines after the 6th and the 12th body row.
    latex_lines.insert(start + 6, "\\hline")
    latex_lines.insert(start + 6 + 6 + 1, "\\hline")

    # `i` still holds the index of the last body row; the "+ 2" accounts for the two
    # \hline lines inserted above, so the average ranks follow the last body row.
    latex_lines.insert(i + start + 1 + 2, avg_ranks_str)
    latex_lines = header_str + latex_lines[start:] + tail_str
    latex_str_new = "\n".join(latex_lines)

    return latex_str_new

### Change the index and sort columns

In [33]:
# Display names for the pivot-table rows. They are assigned by position, so the
# order has to match the row order of the pivot table built from dfs_all.
index_new = [
    "c10_densenet40", "c100_densenet40",
    "c10_lenet5", "c100_lenet5",
    "c10_convnet", "c100_convnet",
    "mnist_mlp", "SVHN_convnet",
    "c10_resnet110_SD", "c100_resnet110_SD",
    "c10_resnet110", "c100_resnet110",
    "SVHN_resnet152_SD",
    "c10_resnet_wide32", "c100_resnet_wide32",
]

# Row order of the final tables. "mnist_mlp" is not listed, so reindexing drops it.
models_sorted = [
    "c10_convnet", "c10_densenet40", "c10_lenet5",
    "c10_resnet110", "c10_resnet110_SD", "c10_resnet_wide32",
    "c100_convnet", "c100_densenet40", "c100_lenet5",
    "c100_resnet110", "c100_resnet110_SD", "c100_resnet_wide32",
    "SVHN_convnet", "SVHN_resnet152_SD",
]

# Column order of the final tables (values of the "Method" column).
methods_sorted = [
    "uncal",
    "temp_scale",
    "dir_l2",
    "dir_l2_mu_off",
    "vec_scale",
    "mat_scale_l2_mu_off",
]



In [34]:
# Alternative row labels: move the last "_"-separated token of each name to the front.
# NOTE(review): index_new_v2 is not referenced by any other cell visible in this notebook.
index_new_v2 = []

for n in models_sorted:
    splits = n.split("_")
    new = "_".join([splits[-1]] + list(splits[:-1]))
    
    index_new_v2.append(new)

### Generate Tables 3 & 4

In [35]:
# Log-loss table without the model-name column (index=False, so cells start at position 0).
latex_str = get_latex(dfs_all, methods_sorted, value = "Loss", round_to=3, index=False, start_w = 0, start=4)
print(latex_str)

\begin{table}
\centering
\captionof{table}{Loss}
\tiny
\begin{tabular}{c|ccc|cc}
\toprule
       & \multicolumn{3}{c}{general-purpose calibrators} & \multicolumn{2}{c}{neural-specific calibrators}\\ 
 Uncal &  TempS &  Dir-L2 &  Dir-ODIR &  VecS & MS-ODIR \\
\midrule
$0.391_{6}$ & $\mathbf{0.195_{1}}$ & $0.197_{4}$ & $0.195_{2}$ & $0.198_{5}$ & $0.196_{3}$ \\
$0.428_{6}$ & $0.225_{5}$ & $\mathbf{0.220_{1}}$ & $0.224_{4}$ & $0.222_{2}$ & $0.222_{3}$ \\
$0.823_{6}$ & $0.800_{5}$ & $0.744_{2}$ & $0.744_{3}$ & $0.747_{4}$ & $\mathbf{0.743_{1}}$ \\
$0.358_{6}$ & $0.209_{5}$ & $\mathbf{0.203_{1}}$ & $0.205_{3}$ & $0.206_{4}$ & $0.204_{2}$ \\
$0.303_{6}$ & $0.178_{5}$ & $0.177_{4}$ & $0.176_{3}$ & $0.175_{2}$ & $\mathbf{0.175_{1}}$ \\
$0.382_{6}$ & $0.191_{5}$ & $0.185_{4}$ & $0.182_{2}$ & $0.183_{3}$ & $\mathbf{0.182_{1}}$ \\
\hline
$1.641_{6}$ & $\mathbf{0.942_{1}}$ & $1.189_{5}$ & $0.961_{2}$ & $0.963_{4}$ & $0.961_{3}$ \\
$2.017_{6}$ & $1.057_{2}$ & $1.253_{5}$ & $1.059_{4}$ & $1.057_{3}$

In [36]:
# Classwise-ECE table including the model-name column.
latex_str = get_latex(dfs_all, methods_sorted, value = "ECE_CW", round_to=3, index=True, start_w = 1, start=4, multiplier=1)
print(latex_str)

\begin{table}
\centering
\captionof{table}{ECE_CW}
\tiny
\begin{tabular}{l|c|ccc|cc}
\toprule
 &        & \multicolumn{3}{c}{general-purpose calibrators} & \multicolumn{2}{c}{neural-specific calibrators}\\ 
 &  Uncal &  TempS &  Dir-L2 &  Dir-ODIR &  VecS & MS-ODIR \\
\midrule
c10\_convnet         & $0.104_{6}$ & $0.044_{4}$ & $\mathbf{0.043_{1}}$ & $0.045_{5}$ & $0.044_{3}$ & $0.044_{2}$ \\
c10\_densenet40      & $0.114_{6}$ & $0.040_{5}$ & $\mathbf{0.034_{1}}$ & $0.037_{4}$ & $0.035_{2}$ & $0.037_{3}$ \\
c10\_lenet5          & $0.198_{6}$ & $0.171_{5}$ & $\mathbf{0.052_{1}}$ & $0.059_{4}$ & $0.056_{2}$ & $0.059_{3}$ \\
c10\_resnet110       & $0.098_{6}$ & $0.043_{5}$ & $\mathbf{0.032_{1}}$ & $0.039_{4}$ & $0.036_{2}$ & $0.036_{3}$ \\
c10\_resnet110\_SD    & $0.086_{6}$ & $0.031_{4}$ & $0.031_{5}$ & $0.029_{3}$ & $0.027_{2}$ & $\mathbf{0.027_{1}}$ \\
c10\_resnet\_wide32   & $0.095_{6}$ & $0.048_{5}$ & $0.032_{4}$ & $0.029_{2}$ & $0.031_{3}$ & $\mathbf{0.029_{1}}$ \\
\hline
c100\_convn

### Generate the tables for Supplemental (Table 13-18)

In [37]:
def save_latex(value = "Loss"):
    """Write the LaTeX table of one score to ``results_dnn_<value in lower case>.tex``.

    The table is built from the notebook-level ``dfs_all`` and
    ``methods_sorted`` with five decimals and the model names as first column.
    The file is created in the current working directory and overwritten if
    it already exists.

    Args:
        value: score name without the "_test" suffix.
    """
    table_tex = get_latex(dfs_all, methods_sorted, value = value, round_to=5, index=True, start_w = 1, start=4)
    target_file = "results_dnn_%s.tex" % value.lower()

    with open(target_file, "w") as tex_file:
        tex_file.write(table_tex)

In [38]:
dfs_all.pivot_table(index="Name", columns="Method", values="Loss" + "_test")[methods_sorted]

Method,uncal,temp_scale,dir_l2,dir_l2_mu_off,vec_scale,mat_scale_l2_mu_off
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
densenet40_c10,0.428207,0.225086,0.220482,0.223705,0.222396,0.2224
densenet40_c100,2.017398,1.057131,1.25293,1.059094,1.057139,1.050843
lenet5_c10,0.823261,0.800311,0.744182,0.744405,0.74688,0.742618
lenet5_c100,2.783648,2.649791,2.594822,2.489512,2.516952,2.486696
pretrained_c10,0.390977,0.194969,0.19692,0.195364,0.197745,0.19634
pretrained_c100,1.641199,0.94162,1.189449,0.961209,0.963109,0.961407
pretrained_mnist,0.057663,0.053675,0.059406,0.056416,0.05615,0.055721
pretrained_svhn,0.204602,0.15142,0.142456,0.137914,0.143917,0.137603
resnet110_SD_c10,0.303252,0.177605,0.176939,0.176085,0.175447,0.175373
resnet110_SD_c100,1.352503,0.942142,1.19837,0.94477,0.922748,0.927306


In [39]:
# Scores for which a supplemental LaTeX table is written (the name is reassigned in the CD-diagram section).
measures = ["Error", "ECE", "ECE_CW", "Loss", "Brier"]

In [40]:
# Write one .tex file per score; the print shows which score is being processed.
for measure in measures:
    print(measure)
    save_latex(measure)

Error
ECE
ECE_CW
Loss
Brier


## CD diagrams for Supplemental Figure 11

In [41]:
from scipy.stats import rankdata
from scipy.stats import friedmanchisquare
import Orange

def compute_friedmanchisquare(table):
    '''
    Run the Friedman chi-square test on a table of scores.

    Analogy:
        - n wine judges each rate k different wines. Are any of the k wines
        ranked consistently higher or lower than the others?
    Calibration case:
        - n datasets each "rate" k calibration methods. Are any of the k
        methods ranked consistently higher or lower than the others?

    The table is transposed before it is unpacked, so every column of
    `table` is handed to SciPy as one sample (k = number of columns) and
    every sample has one entry per row (n = number of rows).

    Returns the SciPy result holding the test statistic and the p-value.
    '''
    samples_per_column = table.T.values.tolist()
    return friedmanchisquare(*samples_per_column)

In [42]:
def export_critical_difference(avranks, num_datasets, names, filename,
                               title=None, test='bonferroni-dunn'):
    '''
    Draw a critical-difference diagram with Orange and save it to `filename`.

        avranks: average rank of every method
        num_datasets: number of datasets the ranks were averaged over
        names: method labels, in the order of `avranks`
        filename: path handed to `plt.savefig`
        title: figure title (left aligned)
        test: string in ['nemenyi', 'bonferroni-dunn']
         - nemenyi two-tailed test (up to 20 methods)
         - bonferroni-dunn one-tailed test (only up to 10 methods)
         With more than 10 methods 'nemenyi' is used regardless of `test`.

    Relies on `plt` (matplotlib.pyplot) being imported in the notebook before
    the first call. The current figure is closed after saving.
    '''
    too_many_for_bonferroni = len(avranks) > 10
    if too_many_for_bonferroni:
        print('Forcing Nemenyi Critical difference')
        test = 'nemenyi'

    critical_distance = Orange.evaluation.compute_CD(
        avranks, num_datasets, alpha='0.05', test=test)
    Orange.evaluation.graph_ranks(
        avranks, names, cd=critical_distance, width=6, textspace=1.5)

    # graph_ranks draws on the current figure; fetch it to add the title.
    plt.gcf().suptitle(title, horizontalalignment='left')
    plt.savefig(filename)
    plt.close()

In [43]:
# Display labels for the CD diagrams, in the same order as methods_sorted.
# NOTE(review): " Uncal" starts with a space - confirm whether that is intentional.
methods_sorted_new = [" Uncal",  "TempS", "Dir-L2", "Dir-ODIR", "VecS", "MS-ODIR"]

In [44]:
import os

def get_cd_diagram(dfs_all, measure = "Loss", max_is_better = False, summary_path = "cd_diagrams"):
    """Run the Friedman test for one score and save its critical-difference diagram.

    The scores are pivoted to models x methods and relabelled with the
    notebook-level lists ``index_new``, ``models_sorted``, ``methods_sorted``
    and ``methods_sorted_new``. The test result is printed and the diagram is
    written to ``<summary_path>/crit_diff_<measure>_v2.pdf``.

    Args:
        dfs_all: long-format scores with "Name", "Method" and ``measure + "_test"``.
        measure: score name without the "_test" suffix.
        max_is_better: if True the scores are negated before ranking.
        summary_path: output folder; created if it does not exist.
    """
    scores = dfs_all.pivot_table(index="Name", columns="Method", values=measure + "_test")[methods_sorted]
    scores.index = index_new  # positional relabelling of the rows
    scores = scores.reindex(models_sorted)
    scores.columns = methods_sorted_new

    if max_is_better:
        scores = scores * -1

    # Rank the methods within every row, then average over the rows.
    mean_ranks = scores.apply(rankdata, axis=1).mean()

    friedman = compute_friedmanchisquare(scores)
    print('Friedman test on the full table of shape {}'.format(
                np.shape(scores)))
    print(friedman)

    if not os.path.exists(summary_path):
        print(":/")
        os.makedirs(summary_path)

    n_rows = len(scores)
    diagram_file = os.path.join(summary_path, 'crit_diff_' + measure + '_v2.pdf')
    diagram_title = '(p-value = {:.2e}, #D = {})'.format(friedman.pvalue, n_rows)

    export_critical_difference(avranks=mean_ranks,
                               num_datasets=n_rows,
                               names=scores.columns,
                               filename=diagram_file,
                               title=diagram_title)

In [45]:
# Scores for which a CD diagram is drawn; this replaces the list used for the LaTeX tables.
measures = ["Error", "ECE", "ECE_CW", "MCE", "Loss", "Brier"]

In [46]:
from matplotlib import pyplot as plt 

# One Friedman test and one CD diagram per score (lower is treated as better by default).
for m in measures:
    get_cd_diagram(dfs_all, measure = m)

Friedman test on the full table of shape (14, 6)
FriedmanchisquareResult(statistic=21.41350210970459, pvalue=0.0006765385101213313)
:/
Friedman test on the full table of shape (14, 6)
FriedmanchisquareResult(statistic=30.897959183673436, pvalue=9.811966828236182e-06)
Friedman test on the full table of shape (14, 6)
FriedmanchisquareResult(statistic=30.530612244897952, pvalue=1.1593715754985733e-05)
Friedman test on the full table of shape (14, 6)
FriedmanchisquareResult(statistic=25.551020408163254, pvalue=0.00010902612591859429)
Friedman test on the full table of shape (14, 6)
FriedmanchisquareResult(statistic=40.12244897959181, pvalue=1.4108204351661952e-07)
Friedman test on the full table of shape (14, 6)
FriedmanchisquareResult(statistic=42.44897959183669, pvalue=4.778127100459763e-08)
