# pECE results

Generate LaTeX tables and CD diagrams for p-confidence-ECE and p-classwise-ECE.

1. The models need to be trained and the calibrators tuned.
2. The "Dirichlet - Final Results" notebook must be run first to produce `all_results`, which is used by `generate_pECE.py`.
3. The pECE results for confidence-ECE and classwise-ECE must be generated using the scripts in the `pECE_generation` folder.
4. Put the tunings in the correct folders and run this notebook.

In [1]:
# Imports to get "utility" package
import sys
from os import path
sys.path.append( path.dirname( path.dirname( path.abspath("calibration") ) ) )

import numpy as np
import pandas as pd
from os.path import join
from calibration.cal_methods import evaluate, cal_results, TemperatureScaling, MatrixScaling, LogisticCalibration, VectorScaling_NN, softmax
from dirichlet import FullDirichletCalibrator
import pickle
from utility.unpickle_probs import unpickle_probs
from utility.evaluation import pECE, classwise_ECE, full_ECE, score_sampling
from scipy.stats import percentileofscore
from sklearn.preprocessing import label_binarize

Using TensorFlow backend.


In [2]:
import sys
from os import path
import os
import glob

In [3]:
# Directory that holds the pickled logits files.
PATH = join('..', '..', 'logits')

# Logit files whose names refer to c10, SVHN and MNIST.
files_10 = (
    'probs_resnet_wide32_c10_logits.p',
    'probs_densenet40_c10_logits.p',
    'probs_lenet5_c10_logits.p',
    'probs_resnet110_SD_c10_logits.p',
    'probs_resnet110_c10_logits.p',
    'probs_resnet152_SD_SVHN_logits.p',
    'logits_pretrained_c10_logits.p',
    'logits_pretrained_mnist_logits.p',
    'logits_pretrained_svhn_logits.p',
)

# Logit files whose names refer to c100.
files_100 = (
    'probs_resnet_wide32_c100_logits.p',
    'probs_densenet40_c100_logits.p',
    'probs_lenet5_c100_logits.p',
    'probs_resnet110_SD_c100_logits.p',
    'probs_resnet110_c100_logits.p',
    'logits_pretrained_c100_logits.p',
)

In [4]:
# Both pECE result folders live under the same "generated_pECE" root.
_pece_root = join("..", "..", "tunings_all", "generated_pECE")
path_pECE_cw = join(_pece_root, "classwise_ECE")   # classwise-ECE p-values
path_pECE_guo = join(_pece_root, "guo_ECE")        # results read into df_guo

In [5]:
def read_in_results2(path, ext = ".p"):
    """Load every pickled DataFrame in a folder and stack them into one frame.

    Parameters
    ----------
    path : str
        Directory to search (note: this parameter shadows the ``os.path``
        module name inside the function).
    ext : str
        File extension to match; files are selected with ``"*" + ext``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all loaded frames with a fresh 0..n-1 index.
        Columns missing from some files are filled with NaN by ``pd.concat``.

    Raises
    ------
    ValueError
        If no file in ``path`` matches the extension.
    """
    # glob returns matches in arbitrary, OS-dependent order; sort so the row
    # order of the result is reproducible across machines and runs.
    files = sorted(glob.glob(join(path, "*" + ext)))

    if not files:
        raise ValueError("No '*%s' files found in %r" % (ext, path))

    dfs_list = []

    for fname in files:
        # NOTE: pickle.load can execute arbitrary code - only point this
        # function at result files you generated yourself.
        with open(fname, "rb") as f:
            dfs_list.append(pickle.load(f))

    df_tuning = pd.concat(dfs_list, sort=False)

    return df_tuning.reset_index(drop=True)

In [6]:
def gen_result_df(path):
    """Load all result pickles from ``path`` and merge legacy column names.

    Some result files store the method name under ``method`` and the p-value
    under ``pECE_ens`` instead of ``Method`` / ``pECE``. Missing values in the
    canonical column are filled from the legacy one, which is then dropped.
    A legacy column that is absent is simply skipped.

    Parameters
    ----------
    path : str
        Directory passed on to ``read_in_results2``.

    Returns
    -------
    pandas.DataFrame
        The concatenated results without the ``method`` / ``pECE_ens`` columns.
    """
    df_gen = read_in_results2(path)

    for target, legacy in (("Method", "method"), ("pECE", "pECE_ens")):
        if legacy not in df_gen.columns:
            continue

        if target in df_gen.columns:
            # Explicit assignment instead of `df[col].fillna(..., inplace=True)`:
            # the chained in-place form acts on a temporary under pandas
            # Copy-on-Write and would leave df_gen unchanged.
            df_gen[target] = df_gen[target].fillna(df_gen[legacy])
        else:
            df_gen[target] = df_gen[legacy]

        df_gen = df_gen.drop(columns=[legacy])

    return df_gen


In [7]:
# Results from the classwise_ECE folder; gen_result_df also folds the legacy
# 'method' / 'pECE_ens' columns into 'Method' / 'pECE'.
df_cw = gen_result_df(path_pECE_cw)
df_cw.head()

Unnamed: 0,Name,pECE,L2,mu,Method,index
0,densenet40_c10,0.0,250.0,0.0001,mat_scale_l2_mu_off,
1,densenet40_c100,0.0,2.5,10000.0,mat_scale_l2_mu_off,
2,lenet5_c10,0.0501,0.1,1e-05,mat_scale_l2_mu_off,
3,lenet5_c100,0.0,0.25,0.01,mat_scale_l2_mu_off,
4,pretrained_c10,0.0114,1000.0,100000.0,mat_scale_l2_mu_off,


In [8]:
# Results from the guo_ECE folder, loaded without the column merge that
# gen_result_df performs for df_cw.
# NOTE(review): presumably these pickles have no legacy 'method' / 'pECE_ens'
# columns - confirm, otherwise this should go through gen_result_df as well.
df_guo = read_in_results2(path_pECE_guo)
df_guo.head()

Unnamed: 0,Name,pECE,Method,L2,mu,index
0,resnet_wide32_c10,0.059,dir_diag,,,
1,densenet40_c10,0.0,dir_diag,,,
2,lenet5_c10,0.029,dir_diag,,,
3,resnet110_SD_c10,0.061,dir_diag,,,
4,resnet110_c10,0.0,dir_diag,,,


In [9]:
# Sanity check: list the method names present in df_guo.
df_guo.Method.unique()

array(['dir_diag', 'dir_l2', 'dir_l2_mu', 'dir_l2_mu_off', 'dir_l2_off',
       'dir_vec_scale', 'mat_scale_l2_mu_off', 'mat_scale_l2_mu',
       'mat_scale_l2_off', 'mat_scale_l2', 'temp_scale', 'vec_scale',
       'uncal'], dtype=object)

In [10]:
# Sanity check: list the method names present in df_cw.
df_cw.Method.unique()

array(['mat_scale_l2_mu_off', 'mat_scale_l2_mu', 'mat_scale_l2_off',
       'mat_scale_l2', 'dir_diag', 'dir_l2', 'dir_l2_mu', 'dir_l2_off',
       'dir_l2_mu_off', 'dir_vec_scale', 'temp_scale', 'vec_scale',
       'uncal'], dtype=object)

In [11]:
# Column order for the full (11-method) clipboard tables below; 'uncal' is not
# included. NOTE: the name `methods_sorted` is rebound to a shorter list further
# down, so the clipboard cells must run before that cell.
methods_sorted = [
    'temp_scale',
    'vec_scale',
    'dir_diag',
    'dir_l2',
    'mat_scale_l2',
    'dir_l2_mu',
    'mat_scale_l2_mu',
    'dir_l2_off',
    'mat_scale_l2_off',
    'dir_l2_mu_off',
    'mat_scale_l2_mu_off',
]

In [12]:
# Copy the Name x Method table of pECE values from df_guo to the system clipboard.
# Side effect only; needs a working clipboard backend.
df_guo.pivot_table(index="Name", columns="Method", values="pECE")[methods_sorted].to_clipboard()

In [13]:
# Copy the Name x Method table of pECE values from df_cw to the system clipboard.
# Side effect only; needs a working clipboard backend.
df_cw.pivot_table(index="Name", columns="Method", values="pECE")[methods_sorted].to_clipboard()

In [14]:
# New row labels, assigned POSITIONALLY to the pivoted tables (get_latex and
# get_cd_diagram do `table.index = index_new`), so the order here has to match
# the row order produced by pivot_table(index="Name").
index_new = [
    'c10_densenet40', 'c100_densenet40',
    'c10_lenet5', 'c100_lenet5',
    'c10_convnet', 'c100_convnet',
    'mnist_mlp', 'SVHN_convnet',
    'c10_resnet110_SD', 'c100_resnet110_SD',
    'c10_resnet110', 'c100_resnet110',
    'SVHN_resnet152_SD',
    'c10_resnet_wide32', 'c100_resnet_wide32',
]

# Row order used in the final tables / diagrams (applied via reindex).
models_sorted = [
    'c10_convnet', 'c10_densenet40', 'c10_lenet5',
    'c10_resnet110', 'c10_resnet110_SD', 'c10_resnet_wide32',
    'c100_convnet', 'c100_densenet40', 'c100_lenet5',
    'c100_resnet110', 'c100_resnet110_SD', 'c100_resnet_wide32',
    'mnist_mlp', 'SVHN_convnet', 'SVHN_resnet152_SD',
]

# Rebinds methods_sorted to the six methods shown in the tables and CD diagrams.
methods_sorted = [
    'uncal',
    'temp_scale',
    'dir_l2',
    'dir_l2_mu_off',
    'vec_scale',
    'mat_scale_l2_mu_off',
]


In [15]:
# Show the row order that the tables and CD diagrams will use.
print(models_sorted)

['c10_convnet', 'c10_densenet40', 'c10_lenet5', 'c10_resnet110', 'c10_resnet110_SD', 'c10_resnet_wide32', 'c100_convnet', 'c100_densenet40', 'c100_lenet5', 'c100_resnet110', 'c100_resnet110_SD', 'c100_resnet_wide32', 'mnist_mlp', 'SVHN_convnet', 'SVHN_resnet152_SD']


In [16]:
def get_latex(dfs, methods_sorted, value = "Loss", round_to = 2, start = 4, start_w = 1, max_is_better = False, 
              index=True, multiplier = 1):
    """Build a LaTeX table of `value` per model and method, annotated with ranks.

    The long-format frame is pivoted to Name x Method, relabelled with the
    module-level `index_new`, reordered with the module-level `models_sorted`,
    and rendered with `DataFrame.to_latex`. Every numeric cell then gets its
    per-row rank as a subscript (rank 1 in bold), and an average-rank row is
    appended.

    Parameters
    ----------
    dfs : pandas.DataFrame
        Long-format results with "Name", "Method" and `value` columns.
    methods_sorted : list of str
        Methods (pivot columns) to keep, in display order. The column spec and
        the header rows are hard-coded for six columns.
    value : str
        Column to tabulate; also used in the caption and the label.
    round_to : int
        Number of decimals shown in the table.
    start : int
        Index of the first data row in the `to_latex` output lines.
    start_w : int
        Index of the first numeric cell after splitting a row on "&".
    max_is_better : bool
        Passed unchanged as `ascending` to `DataFrame.rank`, i.e. False gives
        rank 1 to the LARGEST value in a row.
        NOTE(review): the name suggests the opposite mapping - confirm before
        changing it, because existing callers rely on the current behaviour.
    index : bool
        Passed to `to_latex`; also controls the leading label column.
    multiplier : number
        Scale applied to the values before rounding (ranks use unscaled values).

    Returns
    -------
    str
        The complete LaTeX source of the table environment.
    """
    df_temp = dfs.pivot_table(index="Name", columns="Method", values=value)[methods_sorted] #.to_clipboard()
    
    # Positional relabelling: index_new must be in the same order as the pivot rows.
    df_temp.index = index_new
    df_temp = df_temp.reindex(models_sorted)
    
    # Ranks are computed on the unrounded, unscaled values.
    df_ranks = df_temp.rank(axis=1, ascending=max_is_better)
    df_temp = (df_temp*multiplier).round(round_to)
    
    # Get latex lines
    str_latex = df_temp.to_latex(index=index)
    latex_lines = str_latex.splitlines()

    ## Get average ranks line with "midrule"
    avg_ranks = " & ".join(map(str, df_ranks.mean().values.round(2)))
    column_name_avg_rank = "" if not index else " avg rank & "
    avg_ranks_str = "\\midrule \n" + column_name_avg_rank + avg_ranks + "\\\\"
    
    
    # Replace the last six column specifiers of "\begin{tabular}{...}" with a
    # grouped layout; the branch keeps the "l" of the label column when present.
    if latex_lines[0][-8] == "l":
        end_piece = "|c|ccc|cc}"
    else:
        end_piece = "c|ccc|cc}"

    first_line = latex_lines[0][:-7] + end_piece  # TODO based on column numbers
    #latex_lines[0]

    header_str_extra = "" if not index else " & "
    header_str = ["\\begin{table}",
                  "\\centering",
                  "\\captionof{table}{%s}" % value,
                  "\\tiny",
                  first_line,
                  "\\toprule",
                  "%s       & \\multicolumn{3}{c}{general-purpose calibrators} & \\multicolumn{2}{c}{neural-specific calibrators}\\\\ " % header_str_extra,
                  "%s Uncal &  TempS &  Dir-L2 &  Dir-ODIR &  VecS & MS-ODIR \\\\" % header_str_extra,
                  "\\midrule"]
    
    # Backslashes are escaped explicitly: "\h", "\e" and "\m" are invalid
    # escape sequences that newer Python versions warn about.
    tail_str = ["\\normalsize",
                "\\label{table:res:dnn:%s}" % value.lower(),
                "\\hfill",
                "\\end{table}"]

    for i, line in enumerate(latex_lines[start:-2]):  # Starting line and ending line, may need some changes
        #print(i, line)  # Debug line printing

        words = line.split("&")

        for j, nr in enumerate(words[start_w:]):
            nr_str = nr.strip(" \\")
            rank_i = df_ranks.iloc[i, j]
            # "%i" truncates, so a tied rank such as 1.5 is printed as 1.
            new_nr = "%s_{%i}" % (nr_str, rank_i)

            if rank_i == 1:
                new_nr = "$\\mathbf{%s}$" % new_nr
            else:
                new_nr = "$%s$" % new_nr

            words[j + start_w] = new_nr

        new_line = " & ".join(words) + " \\\\"

        latex_lines[i + start] = new_line
        
    # Separators after the 6th and 12th data rows (row order is models_sorted).
    latex_lines.insert(start + 6, "\\hline")
    latex_lines.insert(start + 6 + 6 + 1, "\\hline")

    # Average-rank row goes right after the last data row; the "+ 2" accounts
    # for the two \hline lines inserted above.
    latex_lines.insert(i + start + 1 + 2, avg_ranks_str)
    # Swap the to_latex preamble (lines before `start`) for the custom header.
    latex_lines = header_str + latex_lines[start:] + tail_str
    latex_str_new = "\n".join(latex_lines)
    
    return latex_str_new

In [17]:
def save_latex(df, extra = "_cw"):
    """Render the pECE table for `df` and write it to a .tex file.

    The output file is "results_dnn_pece<extra>.tex" in the current working
    directory. The method selection comes from the module-level
    `methods_sorted`.
    """
    value = "pECE"
    out_name = "results_dnn_%s%s.tex" % (value.lower(), extra)

    # max_is_better=False is forwarded by get_latex as `ascending` to
    # DataFrame.rank, so the largest pECE in a row receives rank 1.
    table_tex = get_latex(
        df,
        methods_sorted,
        value=value,
        round_to=5,
        index=True,
        start_w=1,
        start=4,
        max_is_better=False,
    )

    with open(out_name, "w") as tex_file:
        tex_file.write(table_tex)

In [18]:
# Writes results_dnn_pece.tex (empty suffix) from df_guo.
save_latex(df_guo, extra = "")

In [19]:
# Writes results_dnn_pece_cw.tex (default "_cw" suffix) from df_cw.
save_latex(df_cw)

### CD diagrams

In [20]:
from scipy.stats import rankdata
from scipy.stats import friedmanchisquare
import Orange

def compute_friedmanchisquare(table):
    '''
    Run the Friedman chi-square test with one sample per column of `table`.

    Classic example:
        - n wine judges each rate k different wines. Are any of the k wines
        ranked consistently higher or lower than the others?
    Calibration case:
        - n datasets each "rate" k calibration methods. Are any of the k
        methods ranked consistently higher or lower than the others?

    SciPy conventions:
        - k is the number of positional arguments passed to the function
        - n is the length of each of those arrays
    Here every column of `table` (a method) becomes one argument and every
    row (a dataset) one measurement, so k = number of columns.

    Returns the SciPy result, i.e. the test statistic and the p-value.
    '''
    # Transpose so that iterating yields one list of values per column.
    per_column = table.values.T
    samples = per_column.tolist()
    return friedmanchisquare(*samples)

In [21]:
def export_critical_difference(avranks, num_datasets, names, filename,
                               title=None, test='bonferroni-dunn'):
    '''
    Draw a critical-difference diagram with Orange and save it to `filename`.

        avranks: average rank of each method
        num_datasets: number of datasets the ranks were averaged over
        names: method labels, in the same order as `avranks`
        title: figure title (left-aligned suptitle)
        test: string in ['nemenyi', 'bonferroni-dunn']
         - nemenyi two-tailed test (up to 20 methods)
         - bonferroni-dunn one-tailed test (only up to 10 methods)

    With more than 10 methods the test is switched to 'nemenyi'.
    '''
    exceeds_bonferroni_limit = len(avranks) > 10
    if exceeds_bonferroni_limit:
        print('Forcing Nemenyi Critical difference')
        test = 'nemenyi'

    # Orange receives alpha as the string '0.05' here.
    critical_distance = Orange.evaluation.compute_CD(
        avranks, num_datasets, alpha='0.05', test=test)
    Orange.evaluation.graph_ranks(
        avranks, names, cd=critical_distance, width=6, textspace=1.5)

    # graph_ranks draws on the current figure; title it, save it, release it.
    figure = plt.gcf()
    figure.suptitle(title, horizontalalignment='left')
    plt.savefig(filename)
    plt.close()

In [22]:
# Display labels for the CD diagrams; get_cd_diagram assigns them positionally
# to the columns selected with `methods_sorted`.
methods_sorted_new = [
    "Uncal",
    "TempS",
    "Dir-L2",
    "Dir-ODIR",
    "VecS",
    "MS-ODIR",
]

In [23]:
import os
from matplotlib import pyplot as plt

def get_cd_diagram(dfs_all, measure = "pECE", extra = "_cw", max_is_better = False, summary_path = "cd_diagrams"):
    """Run a Friedman test on `measure` and export a critical-difference diagram.

    The long-format frame is pivoted to Name x Method, restricted to the
    module-level `methods_sorted`, relabelled positionally with `index_new` and
    `methods_sorted_new`, and reordered with `models_sorted`. The diagram is
    written to "<summary_path>/crit_diff_<measure><extra>_v2.pdf".

    Parameters
    ----------
    dfs_all : pandas.DataFrame
        Long-format results with "Name", "Method" and `measure` columns.
    measure : str
        Column to rank; also part of the output file name.
    extra : str
        Suffix inserted into the output file name.
    max_is_better : bool
        If True the table is negated before ranking, so the largest value in
        a row receives rank 1.
    summary_path : str
        Output directory; created if it does not exist.
    """
    table = dfs_all.pivot_table(index="Name", columns="Method", values=measure)[methods_sorted]
    # Positional relabelling: index_new / methods_sorted_new must match the
    # row and column order of the pivoted table.
    table.index = index_new
    table = table.reindex(models_sorted)
    table.columns = methods_sorted_new

    if max_is_better:
        table *= -1

    # rankdata gives rank 1 to the smallest value in each row.
    ranking_table_all = table.apply(rankdata, axis=1).mean()

    ftest = compute_friedmanchisquare(table)
    print('Friedman test on the full table of shape {}'.format(
                np.shape(table)))
    print(ftest)

    # exist_ok avoids the check-then-create race; the former debug print is gone.
    os.makedirs(summary_path, exist_ok=True)

    export_critical_difference(avranks=ranking_table_all,
                           num_datasets=len(table),
                           names=table.columns,
                           filename=os.path.join(summary_path,
                                                 'crit_diff_' +
                                                 measure + extra + '_v2.pdf'),
                           title='(p-value = {:.2e}, #D = {})'.format(ftest.pvalue, len(table)))

In [24]:
# max_is_better=True negates the table before ranking, so the largest pECE in
# a row gets rank 1. Output: cd_diagrams/crit_diff_pECE_cw_v2.pdf
get_cd_diagram(df_cw, max_is_better=True)

Friedman test on the full table of shape (15, 6)
FriedmanchisquareResult(statistic=16.05504587155968, pvalue=0.006688660141740905)
:/


In [25]:
# Same as above for df_guo, with an empty file-name suffix.
# Output: cd_diagrams/crit_diff_pECE_v2.pdf
get_cd_diagram(df_guo, extra="", max_is_better=True)

Friedman test on the full table of shape (15, 6)
FriedmanchisquareResult(statistic=19.76454293628816, pvalue=0.0013834504944968717)
