# Calibration Plots and Results

In [1]:
import numpy as np
from scipy.stats import ks_2samp, wilcoxon
import pickle

from calibration.calibrators import *
from validate import calibration_metrics, validation_metrics
from calibration.metrics import *
from utils.plots import *

import pandas as pd

from scipy.stats import friedmanchisquare


In [2]:
# Settings shared by the metric computations in this notebook.
n_bins: int = 10                 # bin count handed to calibration_metrics(n_bins=...)
mode_metrics: str = 'quantile'   # binning mode handed to calibration_metrics(mode=...)
mode_plot: str = 'uniform'       # plotting mode; not referenced by any cell visible here

# Model vs Model Test

In [3]:
# Label -> pickled result file, one entry per architecture being compared.
# Paths are written with '/' so they resolve on Windows, Linux and macOS alike
# (the previous '\\' separators only worked on Windows).
# NOTE(review): the key 'vgggace' looks like a typo for 'vggface'; it is kept unchanged
# because the key is a runtime label.
model_files = {
    'vgggace': 'experiments/VGGFace/results_MCDP_50_0.5.pkl',
    'ncnn': 'experiments/NCNN/results_MCDP_50_0.3.pkl',
    'vitb32': 'experiments/ViT_B_32_ENSEMBLE/ensemble_10_results.pkl'
}

In [7]:
# Architecture name that later cells interpolate into experiment paths via {model_name}.
model_name = 'NCNN'

In [11]:
# Label -> pickled result file for the three MCDP result files of ViT_B_32 (0.1 / 0.3 / 0.5).
# Paths are written with '/' so they resolve on Windows, Linux and macOS alike
# (the previous '\\' separators only worked on Windows).
model_files = {
    'MCDP_0.1': 'experiments/ViT_B_32/results_MCDP_50_0.1.pkl',
    'MCDP_0.3': 'experiments/ViT_B_32/results_MCDP_50_0.3.pkl',
    'MCDP_0.5': 'experiments/ViT_B_32/results_MCDP_50_0.5.pkl'
}

In [8]:
# Label -> pickled result file for every ensemble size and MCDP setting of `model_name`
# (`model_name` must already be defined by an earlier cell).
# Paths are written with '/' so they resolve on Windows, Linux and macOS alike
# (the previous '\\' separators only worked on Windows).
model_files = {
    'ENSEMBLE 3': f'experiments/{model_name}_ENSEMBLE/ensemble_3_results.pkl',
    'ENSEMBLE 5': f'experiments/{model_name}_ENSEMBLE/ensemble_5_results.pkl',
    'ENSEMBLE 10': f'experiments/{model_name}_ENSEMBLE/ensemble_10_results.pkl',
    'MCDP_0.1': f'experiments/{model_name}/results_MCDP_50_0.1.pkl',
    'MCDP_0.3': f'experiments/{model_name}/results_MCDP_50_0.3.pkl',
    'MCDP_0.5': f'experiments/{model_name}/results_MCDP_50_0.5.pkl'
}

In [4]:
# Label -> pickled result file, one entry per architecture being compared.
# Paths are written with '/' so they resolve on Windows, Linux and macOS alike
# (the previous '\\' separators only worked on Windows).
# NOTE(review): the key 'vgggace' looks like a typo for 'vggface'; it is kept unchanged
# because the key is a runtime label.
model_files = {
    'vgggace': 'experiments/VGGFace/results_MCDP_50_0.5.pkl',
    'ncnn': 'experiments/NCNN/results_MCDP_50_0.3.pkl',
    'vitb32': 'experiments/ViT_B_32_ENSEMBLE/ensemble_10_results.pkl'
}

In [8]:
# Result-file paths for the uncalibrated model and each calibration variant.
folder = ''                 # optional sub-folder under experiments/; '' means none
model_name = 'VGGFace' # change model here

filename = 'results_MCDP_50_0.5.pkl'

# Build the root with '/' (accepted by open() on Windows, Linux and macOS) and skip an empty
# `folder`, so the paths neither depend on Windows-only '\\' separators nor contain a doubled
# separator when `folder` is ''.
experiments_dir = '/'.join(part for part in ('experiments', folder) if part)

original = f'{experiments_dir}/{model_name}/{filename}'
LS_01 = f'{experiments_dir}/{model_name}_LS_01/{filename}'
LS_03 = f'{experiments_dir}/{model_name}_LS_03/{filename}'
LS_05 = f'{experiments_dir}/{model_name}_LS_05/{filename}'
LINEAR = f'{experiments_dir}/{model_name}_LINEAR/{filename}'
SIGMOID = f'{experiments_dir}/{model_name}_SIGMOID/{filename}'
STEP = f'{experiments_dir}/{model_name}_STEP/{filename}'
HIST = f'{experiments_dir}/{model_name}_HIST/{filename}'
ISOTONIC = f'{experiments_dir}/{model_name}_ISOTONIC/{filename}'
PLATT = f'{experiments_dir}/{model_name}_PLATT/{filename}'
TEMPERATURE = f'{experiments_dir}/{model_name}_TEMP/{filename}'



In [9]:
# Label -> result-file path for the uncalibrated model and every calibration variant.
# The path variables must already be defined by an earlier cell.
model_files = dict(
    ORIGINAL=original,
    HIST=HIST,
    ISOTONIC=ISOTONIC,
    LINEAR=LINEAR,
    LS_01=LS_01,
    LS_03=LS_03,
    LS_05=LS_05,
    PLATT=PLATT,
    SIGMOID=SIGMOID,
    STEP=STEP,
    TEMPERATURE=TEMPERATURE,
)

In [5]:
# Names of the metrics gathered for every fold.
metric_keys = [
    'Accuracy', 'F1 Score', 'Precision', 'Sensitivity', 'Specificity', 'AUC',
    'ECE', 'MCE', 'NLL', 'Brier',
]

# metrics_dict[model][metric] holds the per-fold values of that metric; every list starts empty.
metrics_dict = dict(
    (model, {key: [] for key in metric_keys})
    for model in model_files
)

In [6]:
# Load each model's pickled results and compute its per-fold classification and
# calibration metrics.
for model, filepath in model_files.items():
    # Start from empty lists so that re-running this cell does not append a second copy of
    # every fold, which would duplicate the samples handed to the Friedman test below.
    metrics_dict[model] = {key: [] for key in metric_keys}

    # NOTE: pickle.load can execute code embedded in the file; only load result files
    # produced by your own experiments.
    with open(filepath, 'rb') as f:
        results_df = pd.DataFrame(pickle.load(f))

    for fold in results_df['fold'].unique():
        fold_df = results_df[results_df['fold'] == fold]
        metrics_cls = validation_metrics(fold_df['preds'], fold_df['probs'], fold_df['labels'])
        metrics_calib = calibration_metrics(fold_df['probs'], fold_df['labels'], n_bins=n_bins, mode=mode_metrics)

        for key in metrics_cls:
            metrics_dict[model][key].append(metrics_cls[key])
        for key in metrics_calib:
            metrics_dict[model][key].append(metrics_calib[key])

# Run Friedman test for each metric: every model contributes one list of per-fold values.
# NOTE(review): this treats fold i of every model as the same block, i.e. it assumes all
# result files contain the same folds in the same order - confirm.
friedman_results = {}
for key in metric_keys:
    data = [metrics_dict[model][key] for model in model_files]
    stat, p = friedmanchisquare(*data)
    friedman_results[key] = {'Friedman χ²': stat, 'p-value': p}

# One row per metric, columns 'Friedman χ²' and 'p-value'.
friedman_df = pd.DataFrame(friedman_results).T

In [7]:
# Display the Friedman statistic and p-value computed for each metric (one row per metric).
friedman_df

Unnamed: 0,Friedman χ²,p-value
Accuracy,1.076923,0.583645
F1 Score,2.205128,0.332019
Precision,0.666667,0.716531
Sensitivity,2.114286,0.347447
Specificity,0.058824,0.971017
AUC,3.8,0.149569
ECE,5.0,0.082085
MCE,4.2,0.122456
NLL,2.4,0.301194
Brier,2.4,0.301194


# Calibration Test


In [3]:
# Result-file paths for the uncalibrated model and each calibration variant.
folder = ''                 # optional sub-folder under experiments/; '' means none
model_name = 'ViT_B_32_ENSEMBLE' # change model here

filename = 'ensemble_10_results.pkl'

# Build the root with '/' (accepted by open() on Windows, Linux and macOS) and skip an empty
# `folder`, so the paths neither depend on Windows-only '\\' separators nor contain a doubled
# separator when `folder` is ''.
experiments_dir = '/'.join(part for part in ('experiments', folder) if part)

original = f'{experiments_dir}/{model_name}/{filename}'
LS_01 = f'{experiments_dir}/{model_name}_LS_01/{filename}'
LS_03 = f'{experiments_dir}/{model_name}_LS_03/{filename}'
LS_05 = f'{experiments_dir}/{model_name}_LS_05/{filename}'
LINEAR = f'{experiments_dir}/{model_name}_LINEAR/{filename}'
SIGMOID = f'{experiments_dir}/{model_name}_SIGMOID/{filename}'
STEP = f'{experiments_dir}/{model_name}_STEP/{filename}'
HIST = f'{experiments_dir}/{model_name}_HIST/{filename}'
ISOTONIC = f'{experiments_dir}/{model_name}_ISOTONIC/{filename}'
PLATT = f'{experiments_dir}/{model_name}_PLATT/{filename}'
TEMPERATURE = f'{experiments_dir}/{model_name}_TEMP/{filename}'

# Calibration variants compared against `original`, in the order they are reported.
models = [HIST, ISOTONIC, LINEAR, LS_01, LS_03, LS_05, PLATT, SIGMOID, STEP, TEMPERATURE]



In [4]:
# Per-fold metrics of the uncalibrated (original) model. `metrics_dict_original` and
# `results_originais` are the reference that the comparison code is run against.

# Only the KEYS of these two dicts are used: they name the entries copied out of the
# validation_metrics / calibration_metrics results.
metrics_cls_aux = {'Accuracy': [], 'F1 Score': [], 'Precision': [],
            'Sensitivity': [], 'Specificity': [], 'AUC': []}

metrics_calib_aux = {'ECE': [], 'MCE': [], 'NLL': [], 'Brier': []}

# One empty list per metric, classification metrics first, then calibration metrics.
metrics_dict_original = {name: [] for name in (*metrics_cls_aux, *metrics_calib_aux)}


# NOTE: pickle.load can execute code embedded in the file; only load result files
# produced by your own experiments.
with open(original, 'rb') as f:
    results_originais = pd.DataFrame(pickle.load(f))

# Collect metric values for each fold
for fold in results_originais['fold'].unique():
    # Select the fold's rows once instead of rebuilding the same boolean mask per column.
    fold_df = results_originais[results_originais['fold'] == fold]
    metrics_cls = validation_metrics(fold_df['preds'], fold_df['probs'], fold_df['labels'])
    metrics_calib = calibration_metrics(fold_df['probs'], fold_df['labels'], n_bins=n_bins, mode=mode_metrics)

    for metric in metrics_cls_aux.keys():
        metrics_dict_original[metric].append(metrics_cls[metric])

    for metric in metrics_calib_aux.keys():
        metrics_dict_original[metric].append(metrics_calib[metric])

In [5]:
# Compare every calibration variant against the original model. For each variant this prints:
#   1. its path,
#   2. one Wilcoxon p-value per metric (per-fold values, original vs. variant),
#   3. Wilcoxon and K-S p-values on the predicted probabilities.
# Requires `metrics_dict_original` and `results_originais` from the cell above.
for model in models:

    print(model)
    print()

    # Only the KEYS of these two dicts are used: they name the entries copied out of the
    # validation_metrics / calibration_metrics results.
    metrics_cls_aux = {'Accuracy': [], 'F1 Score': [], 'Precision': [],
            'Sensitivity': [], 'Specificity': [], 'AUC': []}

    metrics_calib_aux = {'ECE': [], 'MCE': [], 'NLL': [], 'Brier': []}

    # One empty list per metric, classification metrics first, then calibration metrics.
    metrics_dict = {name: [] for name in (*metrics_cls_aux, *metrics_calib_aux)}

    # NOTE: pickle.load can execute code embedded in the file; only load result files
    # produced by your own experiments.
    with open(model, 'rb') as f:
        results = pd.DataFrame(pickle.load(f))

    # Collect metric values for each fold
    for fold in results['fold'].unique():
        # Select the fold's rows once instead of rebuilding the same boolean mask per column.
        fold_df = results[results['fold'] == fold]
        metrics_cls = validation_metrics(fold_df['preds'], fold_df['probs'], fold_df['labels'])
        metrics_calib = calibration_metrics(fold_df['probs'], fold_df['labels'], n_bins=n_bins, mode=mode_metrics)

        for metric in metrics_cls_aux.keys():
            metrics_dict[metric].append(metrics_cls[metric])

        for metric in metrics_calib_aux.keys():
            metrics_dict[metric].append(metrics_calib[metric])

    # Paired test per metric; p-values are printed with a decimal comma.
    for metric in metrics_dict.keys():

        values_originais = np.asarray(metrics_dict_original[metric])
        values = np.asarray(metrics_dict[metric])

        try:
            stat, p_value = wilcoxon(values_originais, values)
            print(str(round(p_value,3)).replace('.',','))
        except ValueError:
            # Placeholder row so the printed column keeps one line per metric.
            print('####')

    print("Probability Distribution Comparison (Wilcoxon and K-S Tests)")

    # Probabilities of all folds together
    probs_model = results['probs'].values
    probs_originais = results_originais['probs'].values

    try:
        # Wilcoxon signed-rank test (paired comparison, only if lengths match)
        # NOTE(review): pairing is by row position, so this assumes both files list the
        # same samples in the same order - confirm.
        if len(probs_originais) == len(probs_model):
            stat_w, p_w = wilcoxon(probs_originais, probs_model)
            print(f"Wilcoxon p-value: {str(round(p_w, 50)).replace('.', ',')}")
        else:
            print("Wilcoxon skipped: lengths of probability arrays do not match.")

        # Kolmogorov–Smirnov test (doesn't require same length)
        stat_ks, p_ks = ks_2samp(probs_originais, probs_model)
        print(f"K-S test p-value: {str(round(p_ks, 50)).replace('.', ',')}")

    except Exception as e:
        # Best effort: report the failure and continue with the next variant.
        print(f"Error in probability tests: {e}")

    print()

            

experiments\\ViT_B_32_ENSEMBLE_HIST\ensemble_10_results.pkl

0,469
0,195
0,312
0,031
0,188
0,193
0,014
0,02
0,557
0,557
Probability Distribution Comparison (Wilcoxon and K-S Tests)
Wilcoxon p-value: 6,768284781429555e-05
K-S test p-value: 0,00016146159705177112

experiments\\ViT_B_32_ENSEMBLE_ISOTONIC\ensemble_10_results.pkl

1,0
1,0
0,812
0,5
0,875
0,426
0,037
0,492
0,695
0,695
Probability Distribution Comparison (Wilcoxon and K-S Tests)
Wilcoxon p-value: 0,002633864527092048
K-S test p-value: 7,055727434101373e-05

experiments\\ViT_B_32_ENSEMBLE_LINEAR\ensemble_10_results.pkl

0,004
0,004
0,91
0,004
0,688
0,006
0,004
1,0
0,006
0,002
Probability Distribution Comparison (Wilcoxon and K-S Tests)
Wilcoxon p-value: 0,0013697249281084726
K-S test p-value: 2,340748933183906e-07

experiments\\ViT_B_32_ENSEMBLE_LS_01\ensemble_10_results.pkl

0,5
0,75
0,5
1,0
0,5
0,312
0,049
0,432
0,375
0,432
Probability Distribution Comparison (Wilcoxon and K-S Tests)
Wilcoxon p-value: 0,002512967563672408
K-

  z = (r_plus - mn) / se


0,109
0,039
0,496
0,027
0,688
0,014
0,002
0,275
0,004
0,002
Probability Distribution Comparison (Wilcoxon and K-S Tests)
Wilcoxon p-value: 0,7674771710735947
K-S test p-value: 5,4089853241471996e-14

experiments\\ViT_B_32_ENSEMBLE_STEP\ensemble_10_results.pkl

0,049
0,049
0,492
0,031
0,328
0,037
0,004
1,0
0,006
0,006
Probability Distribution Comparison (Wilcoxon and K-S Tests)
Wilcoxon p-value: 0,893903061219879
K-S test p-value: 5,312547715088182e-05

experiments\\ViT_B_32_ENSEMBLE_TEMP\ensemble_10_results.pkl



  z = (r_plus - mn) / se


1,0
1,0
1,0
1,0
1,0
1,0
0,105
0,02
0,432
0,77
Probability Distribution Comparison (Wilcoxon and K-S Tests)
Wilcoxon p-value: 8,537708855959898e-07
K-S test p-value: 5,312547715088182e-05



# TEST ORIGINAL VS MCDP

In [1]:
import numpy as np
from scipy.stats import ks_2samp, wilcoxon
import pickle

from calibration.calibrators import *
from validate import calibration_metrics, validation_metrics
from calibration.metrics import *
from utils.plots import *

import pandas as pd


In [2]:
# Settings shared by the metric computations in this notebook.
n_bins: int = 10                 # bin count handed to calibration_metrics(n_bins=...)
mode_metrics: str = 'quantile'   # binning mode handed to calibration_metrics(mode=...)
mode_plot: str = 'uniform'       # plotting mode; not referenced by any cell visible here

## MCDP

In [None]:
# Result files for the plain model (`original`) and its MCDP run (`new`).
folder = ''                 # optional sub-folder under experiments/; '' means none
model_name = 'NCNN' # change model here

filename = 'results.pkl'
filename_MCDP = 'results_MCDP_50_0.5.pkl'

# Build the root with '/' (accepted by open() on Windows, Linux and macOS) and skip an empty
# `folder`, so the paths neither depend on Windows-only '\\' separators nor contain a doubled
# separator when `folder` is ''.
experiments_dir = '/'.join(part for part in ('experiments', folder) if part)

original = f'{experiments_dir}/{model_name}/{filename}'
new = f'{experiments_dir}/{model_name}/{filename_MCDP}'



## ENSEMBLES

In [81]:
# Result files for the single model (`original`) and its ensemble (`new`).
model_name = 'ViT_B_32' # change model here

filename = 'results.pkl'
filename_ensemble = 'ensemble_3_results.pkl'

# Paths are written with '/' so they resolve on Windows, Linux and macOS alike
# (the previous '\\' separators only worked on Windows).
original = f'experiments/{model_name}/{filename}'
new = f'experiments/{model_name}_ENSEMBLE/{filename_ensemble}'

## RUN

In [82]:
# Per-fold metrics of the baseline (`original`) result file. `metrics_dict_original` and
# `results_originais` are the reference that the comparison code is run against.

# Only the KEYS of these two dicts are used: they name the entries copied out of the
# validation_metrics / calibration_metrics results.
metrics_cls_aux = {'Accuracy': [], 'F1 Score': [], 'Precision': [],
            'Sensitivity': [], 'Specificity': [], 'AUC': []}

metrics_calib_aux = {'ECE': [], 'MCE': [], 'NLL': [], 'Brier': []}

# One empty list per metric, classification metrics first, then calibration metrics.
metrics_dict_original = {name: [] for name in (*metrics_cls_aux, *metrics_calib_aux)}


# NOTE: pickle.load can execute code embedded in the file; only load result files
# produced by your own experiments.
with open(original, 'rb') as f:
    results_originais = pd.DataFrame(pickle.load(f))

# Collect metric values for each fold
for fold in results_originais['fold'].unique():
    # Select the fold's rows once instead of rebuilding the same boolean mask per column.
    fold_df = results_originais[results_originais['fold'] == fold]
    metrics_cls = validation_metrics(fold_df['preds'], fold_df['probs'], fold_df['labels'])
    metrics_calib = calibration_metrics(fold_df['probs'], fold_df['labels'], n_bins=n_bins, mode=mode_metrics)

    for metric in metrics_cls_aux.keys():
        metrics_dict_original[metric].append(metrics_cls[metric])

    for metric in metrics_calib_aux.keys():
        metrics_dict_original[metric].append(metrics_calib[metric])

In [83]:
# Compare the `new` result file against the baseline. Prints one Wilcoxon p-value per metric
# (per-fold values, baseline vs. new), then Wilcoxon and K-S p-values on the predicted
# probabilities. Requires `metrics_dict_original` and `results_originais` from the cell above.

# Only the KEYS of these two dicts are used: they name the entries copied out of the
# validation_metrics / calibration_metrics results.
metrics_cls_aux = {'Accuracy': [], 'F1 Score': [], 'Precision': [],
        'Sensitivity': [], 'Specificity': [], 'AUC': []}

metrics_calib_aux = {'ECE': [], 'MCE': [], 'NLL': [], 'Brier': []}

# One empty list per metric, classification metrics first, then calibration metrics.
metrics_dict = {name: [] for name in (*metrics_cls_aux, *metrics_calib_aux)}


# NOTE: pickle.load can execute code embedded in the file; only load result files
# produced by your own experiments.
with open(new, 'rb') as f:
    results = pd.DataFrame(pickle.load(f))

# Collect metric values for each fold
for fold in results['fold'].unique():
    # Select the fold's rows once instead of rebuilding the same boolean mask per column.
    fold_df = results[results['fold'] == fold]
    metrics_cls = validation_metrics(fold_df['preds'], fold_df['probs'], fold_df['labels'])
    metrics_calib = calibration_metrics(fold_df['probs'], fold_df['labels'], n_bins=n_bins, mode=mode_metrics)

    for metric in metrics_cls_aux.keys():
        metrics_dict[metric].append(metrics_cls[metric])

    for metric in metrics_calib_aux.keys():
        metrics_dict[metric].append(metrics_calib[metric])


# Paired test per metric; p-values are printed with a decimal comma.
for metric in metrics_dict.keys():

    values_originais = np.asarray(metrics_dict_original[metric])
    values = np.asarray(metrics_dict[metric])

    try:
        stat, p_value = wilcoxon(values_originais, values)
        print(str(round(p_value,3)).replace('.',','))
    except ValueError:
        # Placeholder row so the printed column keeps one line per metric.
        print('####')

print("Probability Distribution Comparison (Wilcoxon and K-S Tests)")

# Probabilities of all folds together
probs_model = results['probs'].values
probs_originais = results_originais['probs'].values

try:
    # Wilcoxon signed-rank test (paired comparison, only if lengths match)
    # NOTE(review): pairing is by row position, so this assumes both files list the
    # same samples in the same order - confirm.
    if len(probs_originais) == len(probs_model):
        stat_w, p_w = wilcoxon(probs_originais, probs_model)
        print(f"Wilcoxon p-value: {str(round(p_w, 6)).replace('.', ',')}")
    else:
        print("Wilcoxon skipped: lengths of probability arrays do not match.")

    # Kolmogorov–Smirnov test (doesn't require same length)
    stat_ks, p_ks = ks_2samp(probs_originais, probs_model)
    print(f"K-S test p-value: {str(round(p_ks, 6)).replace('.', ',')}")

except Exception as e:
    # Best effort: report the failure instead of aborting the cell.
    print(f"Error in probability tests: {e}")


print()

        

0,625
0,625
0,875
1,0
1,0
0,82
1,0
0,695
0,375
1,0
Probability Distribution Comparison (Wilcoxon and K-S Tests)
Wilcoxon p-value: 0,134469
K-S test p-value: 0,791667

