In [2]:
##############################################
# Import Library
##############################################

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.isotonic import IsotonicRegression

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import scipy
from scipy.stats import bernoulli, norm, zscore

import calibration_metric
import importlib
importlib.reload(calibration_metric)
from calibration_metric import ece, ace, tce

In [3]:
##############################################
# Prepare Data
##############################################

def toy_data(p_train, p_test, num_train=14000, num_test=6000, seed=0):
    """Generate a one-feature binary classification toy dataset.

    Labels are Bernoulli draws whose success probability may differ between
    the training and test splits. The single feature is Gaussian with
    standard deviation 2 and mean ``0.5 * (2 * y - 1)``, i.e. ``+0.5`` for
    label 1 and ``-0.5`` for label 0.

    Parameters
    ----------
    p_train : float
        Probability of label 1 in the training split.
    p_test : float
        Probability of label 1 in the test split.
    num_train : int, optional
        Number of training samples.
    num_test : int, optional
        Number of test samples.
    seed : int, optional
        Seed of the private random generator. The default ``0`` reproduces
        the values obtained by calling ``np.random.seed(0)`` before sampling.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_X, train_y, test_X, test_y)``; the feature arrays are
        reshaped to ``(n, 1)`` and the label arrays have shape ``(n,)``.
    """
    # A private RandomState seeded with `seed` yields the same stream as
    # np.random.seed(seed) would, but leaves the global NumPy RNG untouched,
    # so calling this function no longer affects other random code.
    rng = np.random.RandomState(seed)

    # Draw order (train labels, test labels, train features, test features)
    # is kept fixed so results stay reproducible for a given seed.
    train_y = bernoulli.rvs(p_train, size=num_train, random_state=rng)
    test_y = bernoulli.rvs(p_test, size=num_test, random_state=rng)

    train_X = norm.rvs(loc=0.5 * (2 * train_y - 1), scale=2,
                       size=len(train_y), random_state=rng)
    test_X = norm.rvs(loc=0.5 * (2 * test_y - 1), scale=2,
                      size=len(test_y), random_state=rng)

    return train_X.reshape(-1, 1), train_y, test_X.reshape(-1, 1), test_y

# Each label reads "<train positive rate>-<test positive rate>" in percent,
# e.g. "50-40" pairs with toy_data(0.50, 0.40) and "01-02" with toy_data(0.01, 0.02).
names_df = ["50-50", "50-40", "50-60", "01-01", "01-00", "01-02"]

# One (train_X, train_y, test_X, test_y) tuple per label, in the same order.
dfs = [
    toy_data(p_train, p_test)
    for p_train, p_test in (
        (0.50, 0.50),
        (0.50, 0.40),
        (0.50, 0.60),
        (0.01, 0.01),
        (0.01, 0.00),
        (0.01, 0.02),
    )
]


In [4]:
##############################################
# Fit Algorithms and Measure Errors
##############################################

def do_experiment(dfs, names_df):
    """Fit logistic regression on each dataset and tabulate calibration scores.

    For every ``(train_X, train_y, test_X, test_y)`` tuple in ``dfs`` a
    ``LogisticRegression`` is fitted on the training split and the predicted
    probability of class 1 on the test split is scored with ``tce``, ``ece``
    and ``ace`` from ``calibration_metric``. The six scores per dataset are
    written to ``./Result/GDA/summary.csv`` and displayed as a table.

    Parameters
    ----------
    dfs : iterable of 4-tuples
        Datasets as ``(train_X, train_y, test_X, test_y)``.
    names_df : sequence of str
        One label per dataset, in the same order as ``dfs``. Used in the
        progress lines, in the ``savepath`` file names and as the row index
        of the summary table.

    Returns
    -------
    None

    Notes
    -----
    * ``./Plot/GDA`` and ``./Result/GDA`` are created when missing, so the
      function also works from a fresh working directory.
    * ``pd.options.display.float_format`` is set globally and stays in
      effect after the call.
    """
    import os  # local import keeps this block self-contained

    plot_prefix = "./Plot/GDA/"
    summary_path = "./Result/GDA/summary.csv"

    # DataFrame.to_csv does not create parent directories; make sure the
    # output locations exist before any work is done.
    os.makedirs(plot_prefix, exist_ok=True)
    os.makedirs(os.path.dirname(summary_path), exist_ok=True)

    tces = np.zeros(len(names_df))
    taes = np.zeros(len(names_df))
    eces = np.zeros(len(names_df))
    aces = np.zeros(len(names_df))
    mces = np.zeros(len(names_df))
    maes = np.zeros(len(names_df))

    # The bin count is the same for every dataset.
    n_bin = 10

    for ith, (df_train_X, df_train_y, df_test_X, df_test_y) in enumerate(dfs):
        print("- Data: " + names_df[ith])

        alg = LogisticRegression(max_iter=1000, random_state=0)
        alg.fit(df_train_X, df_train_y)
        # Column 1 of predict_proba is the probability of the positive class.
        preds_test = alg.predict_proba(df_test_X)[:, 1]

        filename = plot_prefix + names_df[ith] + "_LR"
        # Bounds handed to tce's "pavabc" strategy: 1/20 and 1/5 of the test
        # set size (presumably per-bin sample counts; see calibration_metric).
        n_min = len(df_test_y) // 20
        n_max = len(df_test_y) // 5

        tces[ith] = tce(preds_test, df_test_y, strategy="pavabc", n_min=n_min, n_max=n_max, savepath=filename+"_tce_pavabc.png")
        taes[ith] = tce(preds_test, df_test_y, strategy="quantile", n_bin=n_bin, savepath=filename+"_tce_quantile.png")
        eces[ith] = ece(preds_test, df_test_y, n_bin=n_bin, mode='l1', savepath=filename+"_ece.png")
        aces[ith] = ace(preds_test, df_test_y, n_bin=n_bin, mode='l1', savepath=filename+"_ace.png")
        # mode='inf' variants are stored under the MCE / MCE(Q) columns.
        mces[ith] = ece(preds_test, df_test_y, n_bin=n_bin, mode='inf')
        maes[ith] = ace(preds_test, df_test_y, n_bin=n_bin, mode='inf')

    # One row per dataset, one column per metric.
    result = pd.DataFrame(np.c_[ tces, taes, eces, aces, mces, maes ])
    result.index = names_df
    result.columns = ['TCE(P)', 'TCE(Q)', 'ECE', 'ACE', 'MCE', 'MCE(Q)']
    result.to_csv(summary_path, float_format="%.4f")
    pd.options.display.float_format = '{:.4f}'.format
    display(result)
    
# Run the experiment over every toy dataset and its label.
do_experiment(dfs=dfs, names_df=names_df)


- Data: 50-50
- Data: 50-40
- Data: 50-60
- Data: 01-01
- Data: 01-00
- Data: 01-02


Unnamed: 0,TCE(P),TCE(Q),ECE,ACE,MCE,MCE(Q)
50-50,7.2833,10.8833,0.0138,0.015,0.102,0.0528
50-40,96.1,96.4667,0.0963,0.0951,0.1466,0.1314
50-60,98.8333,98.9333,0.1097,0.1096,0.1423,0.1372
01-01,13.1167,11.55,0.0052,0.0059,0.0052,0.0131
01-00,99.9167,96.9,0.0181,0.0181,0.0181,0.0382
01-02,92.9833,98.45,0.0228,0.0228,0.0228,0.0446
