In [1]:
import os
import numpy as np
import pandas as pd
import itertools

In [2]:
# Suffixes that distinguish the result files; every file shares the same
# "data/non_aggregated_results_fairautoml_<suffix>.csv" naming scheme.
_results_file_suffixes = (
    'adult_flipped_20200903',   # Adult
    'AOF_20200903',             # AOF
    'compas_20200904',          # COMPAS
    'donors_choose_20200902',   # Donors Choose
    'oob_min_metric',           # OOB Tuners (TPE)
)

# Relative paths of the CSV files, in the order listed above.
top_models_path = [
    f"data/non_aggregated_results_fairautoml_{suffix}.csv"
    for suffix in _results_file_suffixes
]

In [3]:
# Read every results file, make sure the four generic metric columns exist,
# and stack all files into a single frame.
loaded_frames = []

for path in top_models_path:
    df = pd.read_csv(path)

    # Name of the metric each file optimised; the first value appearing in
    # the column is used (the code does not check that it is the only one).
    perf_metric = df['performance_metric'].unique()[0]
    fair_metric = df['fairness_metric'].unique()[0]

    # Generic column -> metric-specific column it is copied from when the
    # file does not already provide the generic one.
    generic_to_source = {
        'performance_val': f"validation_{perf_metric}",
        'fairness_val': f"validation_{fair_metric}",
        'performance_test': f"test_{perf_metric}",
        'fairness_test': f"test_{fair_metric}",
    }
    for generic_col, source_col in generic_to_source.items():
        if generic_col not in df.columns:
            df[generic_col] = df[source_col]

    loaded_frames.append(df)

# Concatenate once at the end: re-concatenating the growing frame inside the
# loop copies all previously loaded rows on every iteration.
# `results_df` stays None when there are no input paths, as before.
results_df = pd.concat(loaded_frames, axis=0) if loaded_frames else None

In [4]:
# Columns retained for the rest of the notebook, in display order.
columns_to_keep = [
    'dataset',
    'task_type',
    'alpha',
    'performance_val',
    'fairness_val',
    'performance_test',
    'fairness_test',
    'model_classpath',
    'hyperparameters',
    'class_ratio',
    'model_config_uuid',
    'model_uuid',
    'si',
    'i',
    'ni',
    'ri',
    'target_threshold',
]

# Restrict the frame to exactly those columns (raises KeyError if one is missing).
results_df = results_df[columns_to_keep]

In [5]:
# Preview the first rows of the concatenated results frame.
results_df.head()

Unnamed: 0,dataset,task_type,alpha,performance_val,fairness_val,performance_test,fairness_test,model_classpath,hyperparameters,class_ratio,model_config_uuid,model_uuid,si,i,ni,ri,target_threshold
0,Adult,Hyperband,dynamic,0.941,0.91024,0.93272,0.84107,fairautoml.nn.wrappers.FeedForwardClassifier,"{'batch_size': 2048, 'max_epochs': 75, 'optimi...",,e90d361bb0bfc5448ae077a02f47afe2,98f05504756f5a66e28f5515f4c00cdc,3.0,0.0,34.0,3.704,tpr = 50
1,Adult,Hyperband,0.5,0.91896,0.99948,0.91367,0.94126,fairautoml.nn.wrappers.FeedForwardClassifier,"{'batch_size': 2048, 'max_epochs': 25, 'optimi...",,ffa8b120ec9b1ae9201a6634fd9dffa4,cad876af4952ec67447a5e79422310c2,4.0,0.0,81.0,1.235,tpr = 50
2,Adult,Hyperband,dynamic,0.93142,0.93815,0.92774,0.88856,fairautoml.nn.wrappers.FeedForwardClassifier,"{'batch_size': 2048, 'max_epochs': 25, 'optimi...",,592dde8641d47d278a158df7d45f1ce0,772f5373cdb06dc996a79f8474acef54,3.0,0.0,34.0,3.704,tpr = 50
3,Adult,Hyperband,dynamic,0.94315,0.93658,0.93605,0.88501,fairautoml.nn.wrappers.FeedForwardClassifier,"{'batch_size': 2048, 'max_epochs': 25, 'optimi...",,7e1266b6f483aaa91345b090acfa8e87,8a3f2ed18191d2b036bf6d656fb63891,3.0,1.0,11.0,11.111,tpr = 50
4,Adult,Hyperband,dynamic,0.92067,0.99373,0.91778,0.93524,fairautoml.nn.wrappers.FeedForwardClassifier,"{'batch_size': 2048, 'max_epochs': 25, 'optimi...",,25b7ae7ed998f0b159e77417ec8aab68,cbbaad53880e7dace0db138a34868a0c,4.0,0.0,81.0,1.235,tpr = 50


In [6]:
# Number of rows per dataset in the concatenated results.
results_df.value_counts('dataset')

dataset
AOF              135
Adult            135
COMPAS           135
Donors Choose    135
dtype: int64

In [7]:
# Exclude every row whose task type is 'RandomSampler'; all other rows are kept.
is_random_sampler = results_df['task_type'] == 'RandomSampler'
results_df = results_df[~is_random_sampler]

In [8]:
# Naming convention used in the paper, keyed by "<task_type> <alpha>".
# The keys deliberately use both '1' and '1.0' for alpha, matching how each
# task type's alpha is formatted in the combined frame.
tuner_rename = {
    'RandomSearch 0.5': 'FairRS',
    'RandomSearch 1': 'RS',
    'Hyperband dynamic': 'FB-auto',
    'Hyperband 0.5': 'FB',
    'Hyperband 1': 'HB',
    'TPESampler 0.5': 'FairTPE',
    'TPESampler 1.0': 'TPE',
}

# Raw "<task_type> <alpha>" label for every row. Iterating the two columns
# directly avoids a row-wise DataFrame.apply, which is slow and returns a
# DataFrame instead of a Series when the frame is empty.
raw_tuner_labels = [
    f"{task_type} {alpha}"
    for task_type, alpha in zip(results_df['task_type'], results_df['alpha'])
]

# `assign` returns a new frame instead of writing a column into the filtered
# frame, which would raise SettingWithCopyWarning. Labels without an entry in
# `tuner_rename` are kept unchanged.
results_df = results_df.assign(
    tuner=[tuner_rename.get(label, label) for label in raw_tuner_labels]
)

# Sanity check: number of rows per tuner name.
results_df['tuner'].value_counts()

FB-auto    60
FB         60
RS         60
TPE        60
FairTPE    60
FairRS     60
HB         60
Name: tuner, dtype: int64

In [9]:
# Grouping keys and the metric columns that get summarised.
key_columns = ['dataset', 'tuner', 'alpha']
metric_columns = ['performance_val', 'fairness_val', 'performance_test', 'fairness_test']

# Keep only the keys and the metric values.
results_df_metrics = results_df[[*key_columns, *metric_columns]]

# Mean and standard deviation of each metric per (dataset, tuner, alpha) group.
agg_results = results_df_metrics.groupby(key_columns).agg(['mean', 'std'])

# Displayed as percentages: rounded to three decimals, then scaled by 100.
agg_results.round(3).mul(100)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,performance_val,performance_val,fairness_val,fairness_val,performance_test,performance_test,fairness_test,fairness_test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,mean,std,mean,std,mean,std
dataset,tuner,alpha,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
AOF,FB,0.5,50.7,15.8,76.0,15.0,52.6,14.6,81.3,12.3
AOF,FB-auto,dynamic,61.7,6.5,68.1,14.0,64.0,5.1,74.2,12.9
AOF,FairRS,0.5,60.4,9.2,64.1,15.6,62.6,10.0,68.6,15.6
AOF,FairTPE,0.5,55.7,7.1,76.9,17.0,59.2,9.0,80.0,14.6
AOF,HB,1,68.4,1.0,32.3,6.2,68.4,1.4,35.2,6.7
AOF,RS,1,67.8,0.6,42.2,10.7,68.1,1.6,45.0,11.5
AOF,TPE,1.0,68.7,1.0,30.5,4.8,68.5,1.5,33.7,5.2
Adult,FB,0.5,92.7,3.4,94.0,5.4,92.3,3.3,89.5,6.1
Adult,FB-auto,dynamic,92.0,4.4,94.7,2.8,91.6,4.2,90.9,5.2
Adult,FairRS,0.5,93.6,6.5,79.4,8.1,93.8,6.3,78.6,8.0


## Compute statistical differences between every candidate pair of results

In [10]:
from scipy import stats

# (dataset, tuner)-indexed view of the metrics. No longer used by the loop
# below, but kept in case other cells reference it.
results_indexed = results_df_metrics.set_index(['dataset', 'tuner'])

# Loop-invariant: dataset and tuner names in order of first appearance. This
# order determines the row/column order of the p-value tables built later.
dataset_names = results_df_metrics['dataset'].unique()
tuner_names = results_df_metrics['tuner'].unique()

# Nested result: metric -> dataset -> outer tuner -> inner tuner -> p-value.
all_statistical_tests = dict()

for metric_col in metric_columns:
    # One 1-D array of values per (dataset, tuner) group, extracted once per
    # metric. Grouping always yields an array, even for a single-row group,
    # whereas `.loc` on the duplicate-label index collapses such a group to a
    # scalar that `ks_2samp` cannot handle.
    samples = {
        group_key: group_values.to_numpy()
        for group_key, group_values
        in results_df_metrics.groupby(['dataset', 'tuner'], sort=False)[metric_col]
    }

    metric_tests = dict()

    for dataset in dataset_names:
        dataset_tests = dict()

        for tuner_type_outer in tuner_names:
            # A (dataset, tuner) pair with no rows raises KeyError here.
            outer_values = samples[(dataset, tuner_type_outer)]
            outer_tuner_tests = dict()

            for tuner_type_inner in tuner_names:
                inner_values = samples[(dataset, tuner_type_inner)]

                # Two-sample Kolmogorov-Smirnov test; only the p-value is kept.
                _, p_val = stats.ks_2samp(outer_values, inner_values)

                outer_tuner_tests[tuner_type_inner] = p_val

            dataset_tests[tuner_type_outer] = outer_tuner_tests

        metric_tests[dataset] = dataset_tests

    all_statistical_tests[metric_col] = metric_tests

## Aggregate results

In [11]:
# One p-value table per (dataset, metric) pair, keyed "<dataset>_<metric>".
# Each table is built from the nested {outer tuner: {inner tuner: p-value}}
# mapping, so outer tuners become columns and inner tuners become rows.
stats_tests_dfs = {
    f"{dataset}_{metric}": pd.DataFrame(tests_by_tuner)
    for metric, tests_by_dataset in all_statistical_tests.items()
    for dataset, tests_by_tuner in tests_by_dataset.items()
}


## Show p-values

In [12]:
from IPython.display import Markdown, display


def _p_value_css(p_value):
    """Return the cell background CSS for a single p-value.

    Thresholds are checked from most to least strict: below 0.01, below 0.05,
    below 0.1; anything else (including NaN, for which every comparison is
    False) gets a white background.
    """
    if p_value < 1e-2:
        return "background-color: #fc6203"
    if p_value < 5e-2:
        return "background-color: #fcba03"
    if p_value < 1e-1:
        return "background-color: #dffc03"
    return "background-color: white"


for key, df in stats_tests_dfs.items():
    styler = df.style
    # pandas 2.1 renamed `Styler.applymap` to `Styler.map` and deprecated the
    # old name; use whichever this pandas version provides.
    style_each_cell = getattr(styler, "map", None) or styler.applymap
    styled_df = style_each_cell(_p_value_css)

    # Heading with the "<dataset>_<metric>" key, then the coloured table.
    display(Markdown(f"### {key}"))
    display(styled_df)
    print()


### Adult_performance_val

Unnamed: 0,FB-auto,FB,RS,HB,FairRS,FairTPE,TPE
FB-auto,1.0,0.678138,0.0,0.0,0.001837,0.075464,0.0
FB,0.678138,1.0,0.0,0.0,0.001837,0.385547,0.0
RS,0.0,0.0,1.0,0.938331,0.0,0.0,0.184416
HB,0.0,0.0,0.938331,1.0,0.0,0.0,0.184416
FairRS,0.001837,0.001837,0.0,0.0,1.0,0.075464,0.0
FairTPE,0.075464,0.385547,0.0,0.0,0.075464,1.0,0.0
TPE,0.0,0.0,0.184416,0.184416,0.0,0.0,1.0





### AOF_performance_val

Unnamed: 0,FB-auto,FB,RS,HB,FairRS,FairTPE,TPE
FB-auto,1.0,0.007656,6e-06,0.0,0.678138,0.026248,6e-06
FB,0.007656,1.0,0.0,0.0,0.184416,0.678138,0.0
RS,6e-06,0.0,1.0,0.184416,5.2e-05,0.0,0.007656
HB,0.0,0.0,0.184416,1.0,0.000353,0.0,0.678138
FairRS,0.678138,0.184416,5.2e-05,0.000353,1.0,0.075464,0.000353
FairTPE,0.026248,0.678138,0.0,0.0,0.075464,1.0,0.0
TPE,6e-06,0.0,0.007656,0.678138,0.000353,0.0,1.0





### COMPAS_performance_val

Unnamed: 0,FB-auto,FB,RS,HB,FairRS,FairTPE,TPE
FB-auto,1.0,0.184416,0.0,0.0,0.001837,0.001837,0.0
FB,0.184416,1.0,0.0,0.0,0.184416,0.075464,0.0
RS,0.0,0.0,1.0,0.184416,0.0,0.0,0.075464
HB,0.0,0.0,0.184416,1.0,0.0,0.0,0.938331
FairRS,0.001837,0.184416,0.0,0.0,1.0,0.999789,0.0
FairTPE,0.001837,0.075464,0.0,0.0,0.999789,1.0,0.0
TPE,0.0,0.0,0.075464,0.938331,0.0,0.0,1.0





### Donors Choose_performance_val

Unnamed: 0,FB-auto,FB,RS,HB,FairRS,FairTPE,TPE
FB-auto,1.0,0.678138,0.0,0.0,0.000353,0.000353,0.0
FB,0.678138,1.0,0.0,0.0,0.000353,0.000353,0.0
RS,0.0,0.0,1.0,0.026248,0.0,0.0,0.007656
HB,0.0,0.0,0.026248,1.0,0.0,0.0,0.678138
FairRS,0.000353,0.000353,0.0,0.0,1.0,0.678138,0.0
FairTPE,0.000353,0.000353,0.0,0.0,0.678138,1.0,0.0
TPE,0.0,0.0,0.007656,0.678138,0.0,0.0,1.0





### Adult_fairness_val

Unnamed: 0,FB-auto,FB,RS,HB,FairRS,FairTPE,TPE
FB-auto,1.0,0.678138,0.0,0.0,0.0,0.0,0.0
FB,0.678138,1.0,0.0,0.0,5.2e-05,5.2e-05,0.0
RS,0.0,0.0,1.0,0.007656,0.0,0.0,0.938331
HB,0.0,0.0,0.007656,1.0,0.0,0.0,0.075464
FairRS,0.0,5.2e-05,0.0,0.0,1.0,0.678138,0.0
FairTPE,0.0,5.2e-05,0.0,0.0,0.678138,1.0,0.0
TPE,0.0,0.0,0.938331,0.075464,0.0,0.0,1.0





### AOF_fairness_val

Unnamed: 0,FB-auto,FB,RS,HB,FairRS,FairTPE,TPE
FB-auto,1.0,0.184416,0.0,0.0,0.184416,0.184416,0.0
FB,0.184416,1.0,0.0,0.0,0.184416,0.678138,0.0
RS,0.0,0.0,1.0,0.007656,0.000353,0.0,0.001837
HB,0.0,0.0,0.007656,1.0,0.0,0.0,0.938331
FairRS,0.184416,0.184416,0.000353,0.0,1.0,0.075464,0.0
FairTPE,0.184416,0.678138,0.0,0.0,0.075464,1.0,0.0
TPE,0.0,0.0,0.001837,0.938331,0.0,0.0,1.0





### COMPAS_fairness_val

Unnamed: 0,FB-auto,FB,RS,HB,FairRS,FairTPE,TPE
FB-auto,1.0,0.999789,0.0,0.0,0.000353,0.007656,0.0
FB,0.999789,1.0,0.0,0.0,0.000353,0.007656,0.0
RS,0.0,0.0,1.0,0.999789,0.0,0.0,0.678138
HB,0.0,0.0,0.999789,1.0,6e-06,0.0,0.938331
FairRS,0.000353,0.000353,0.0,6e-06,1.0,0.938331,0.0
FairTPE,0.007656,0.007656,0.0,0.0,0.938331,1.0,0.0
TPE,0.0,0.0,0.678138,0.938331,0.0,0.0,1.0





### Donors Choose_fairness_val

Unnamed: 0,FB-auto,FB,RS,HB,FairRS,FairTPE,TPE
FB-auto,1.0,0.385547,0.0,0.0,0.075464,0.026248,0.0
FB,0.385547,1.0,0.0,0.0,0.678138,0.385547,0.0
RS,0.0,0.0,1.0,0.001837,0.0,0.0,0.075464
HB,0.0,0.0,0.001837,1.0,0.0,0.0,0.678138
FairRS,0.075464,0.678138,0.0,0.0,1.0,0.938331,0.0
FairTPE,0.026248,0.385547,0.0,0.0,0.938331,1.0,0.0
TPE,0.0,0.0,0.075464,0.678138,0.0,0.0,1.0





### Adult_performance_test

Unnamed: 0,FB-auto,FB,RS,HB,FairRS,FairTPE,TPE
FB-auto,1.0,0.678138,0.0,0.0,0.007656,0.184416,0.0
FB,0.678138,1.0,0.0,0.0,0.007656,0.385547,0.0
RS,0.0,0.0,1.0,0.007656,0.0,0.0,0.385547
HB,0.0,0.0,0.007656,1.0,0.0,0.0,0.184416
FairRS,0.007656,0.007656,0.0,0.0,1.0,0.075464,0.0
FairTPE,0.184416,0.385547,0.0,0.0,0.075464,1.0,0.0
TPE,0.0,0.0,0.385547,0.184416,0.0,0.0,1.0





### AOF_performance_test

Unnamed: 0,FB-auto,FB,RS,HB,FairRS,FairTPE,TPE
FB-auto,1.0,0.026248,0.026248,0.001837,0.678138,0.184416,0.007656
FB,0.026248,1.0,0.0,0.0,0.001837,0.184416,0.0
RS,0.026248,0.0,1.0,0.678138,0.184416,0.007656,0.938331
HB,0.001837,0.0,0.678138,1.0,0.184416,0.007656,0.999789
FairRS,0.678138,0.001837,0.184416,0.184416,1.0,0.385547,0.075464
FairTPE,0.184416,0.184416,0.007656,0.007656,0.385547,1.0,0.007656
TPE,0.007656,0.0,0.938331,0.999789,0.075464,0.007656,1.0





### COMPAS_performance_test

Unnamed: 0,FB-auto,FB,RS,HB,FairRS,FairTPE,TPE
FB-auto,1.0,0.385547,0.001837,0.026248,0.007656,0.007656,0.007656
FB,0.385547,1.0,0.0,6e-06,0.075464,0.075464,6e-06
RS,0.001837,0.0,1.0,0.385547,0.0,0.0,0.184416
HB,0.026248,6e-06,0.385547,1.0,0.0,0.0,0.385547
FairRS,0.007656,0.075464,0.0,0.0,1.0,0.999789,6e-06
FairTPE,0.007656,0.075464,0.0,0.0,0.999789,1.0,0.0
TPE,0.007656,6e-06,0.184416,0.385547,6e-06,0.0,1.0





### Donors Choose_performance_test

Unnamed: 0,FB-auto,FB,RS,HB,FairRS,FairTPE,TPE
FB-auto,1.0,0.938331,0.0,0.0,0.184416,0.678138,0.0
FB,0.938331,1.0,0.0,0.0,0.385547,0.184416,0.0
RS,0.0,0.0,1.0,0.385547,0.0,0.0,0.999789
HB,0.0,0.0,0.385547,1.0,0.0,0.0,0.184416
FairRS,0.184416,0.385547,0.0,0.0,1.0,0.678138,0.0
FairTPE,0.678138,0.184416,0.0,0.0,0.678138,1.0,0.0
TPE,0.0,0.0,0.999789,0.184416,0.0,0.0,1.0





### Adult_fairness_test

Unnamed: 0,FB-auto,FB,RS,HB,FairRS,FairTPE,TPE
FB-auto,1.0,0.938331,0.0,0.0,5.2e-05,0.0,0.0
FB,0.938331,1.0,0.0,0.0,0.001837,0.000353,0.0
RS,0.0,0.0,1.0,0.026248,0.0,0.0,0.938331
HB,0.0,0.0,0.026248,1.0,0.0,0.0,0.075464
FairRS,5.2e-05,0.001837,0.0,0.0,1.0,0.184416,0.0
FairTPE,0.0,0.000353,0.0,0.0,0.184416,1.0,0.0
TPE,0.0,0.0,0.938331,0.075464,0.0,0.0,1.0





### AOF_fairness_test

Unnamed: 0,FB-auto,FB,RS,HB,FairRS,FairTPE,TPE
FB-auto,1.0,0.184416,0.0,0.0,0.385547,0.385547,0.0
FB,0.184416,1.0,0.0,0.0,0.075464,0.938331,0.0
RS,0.0,0.0,1.0,0.026248,0.000353,0.0,0.001837
HB,0.0,0.0,0.026248,1.0,0.0,0.0,0.938331
FairRS,0.385547,0.075464,0.000353,0.0,1.0,0.075464,0.0
FairTPE,0.385547,0.938331,0.0,0.0,0.075464,1.0,0.0
TPE,0.0,0.0,0.001837,0.938331,0.0,0.0,1.0





### COMPAS_fairness_test

Unnamed: 0,FB-auto,FB,RS,HB,FairRS,FairTPE,TPE
FB-auto,1.0,0.184416,0.0,0.0,6e-06,5.2e-05,0.0
FB,0.184416,1.0,0.0,5.2e-05,0.001837,0.007656,5.2e-05
RS,0.0,0.0,1.0,0.075464,6e-06,6e-06,0.385547
HB,0.0,5.2e-05,0.075464,1.0,0.007656,0.007656,0.678138
FairRS,6e-06,0.001837,6e-06,0.007656,1.0,0.938331,5.2e-05
FairTPE,5.2e-05,0.007656,6e-06,0.007656,0.938331,1.0,5.2e-05
TPE,0.0,5.2e-05,0.385547,0.678138,5.2e-05,5.2e-05,1.0





### Donors Choose_fairness_test

Unnamed: 0,FB-auto,FB,RS,HB,FairRS,FairTPE,TPE
FB-auto,1.0,0.938331,0.0,0.0,0.000353,0.000353,0.0
FB,0.938331,1.0,0.0,0.0,0.001837,0.000353,0.0
RS,0.0,0.0,1.0,0.385547,0.0,0.0,0.678138
HB,0.0,0.0,0.385547,1.0,0.0,0.0,0.938331
FairRS,0.000353,0.001837,0.0,0.0,1.0,0.385547,0.0
FairTPE,0.000353,0.000353,0.0,0.0,0.385547,1.0,0.0
TPE,0.0,0.0,0.678138,0.938331,0.0,0.0,1.0



