In [1]:
import os
import numpy as np
import pandas as pd 
import itertools
import logging

from pathlib import Path
from typing import Union

### Experiment Setup 

Each dataset comprises different properties, including performance and fairness metrics, as well as thresholds.

In [2]:
# Dataset to analyse: used as the key into DATASET_TO_CONFIGS and to build the
# input CSV name and the results directory.
DATASET_NAME = 'Adult'
# Directory the dataset CSV is read from.
DATA_DIR = Path('../data')

In [3]:
# List the files available in the data directory.
!ls {DATA_DIR}

Adult.csv  AOF.csv  COMPAS.csv	DonorsChoose.csv  EG_experiment.20200925.csv


In [4]:
# Per-dataset experiment settings.
#   performance / fairness : metric names; they are also the suffixes of the
#                            `validation_*` / `test_*` columns of the artifacts CSV
#   fairness_label         : human-readable name of the fairness metric
#   target_threshold       : threshold label of the experiment (e.g. 'tpr = 50')
#   ylim / metrics_lim     : (low, high) pair -- presumably axis limits for the
#                            metric plots; TODO confirm against the plotting code
DATASET_TO_CONFIGS = {
    'Adult': {
        'performance': 'precision',
        'fairness': 'gender_rel_tpr_parity',
        'fairness_label': 'eq. opportunity',
        'target_threshold': 'tpr = 50',
        'ylim': (0.45, 1.01),
    },
    'COMPAS': {
        'performance': 'precision',
        'fairness': 'race_rel_fpr_parity',
        'fairness_label': 'predictive eq.',
        'target_threshold': 'fpr = 2',
        'metrics_lim': (0.15, 1.01),
    },
    'DonorsChoose': {
        'performance': 'precision',
        'fairness': 'poverty_rel_tpr_parity',
        'fairness_label': 'eq. opportunity',
        'target_threshold': 'pp = 1000',
        'metrics_lim': (0.2, 1),
    },
    'AOF': {
        'performance': 'recall',
        'fairness': 'age_rel_fpr_parity',
        'fairness_label': 'predictive eq.',
        'target_threshold': 'fpr = 5',
        'metrics_lim': (0.15, 1.01),
    },
}

# Settings of the dataset selected through DATASET_NAME.
DATASET_CONFIGS = DATASET_TO_CONFIGS[DATASET_NAME]

# Metrics: the label falls back to the metric name when no explicit label is configured.
PERFORMANCE_METRIC = DATASET_CONFIGS['performance']
PERFORMANCE_LABEL = DATASET_CONFIGS.get('performance_label', PERFORMANCE_METRIC)
PERFORMANCE_LABEL = f"accuracy ({PERFORMANCE_LABEL})"

FAIRNESS_METRIC = DATASET_CONFIGS['fairness']
FAIRNESS_LABEL = DATASET_CONFIGS.get('fairness_label', FAIRNESS_METRIC)
FAIRNESS_LABEL = f"fairness ({FAIRNESS_LABEL})"

TARGET_THRESHOLD = DATASET_CONFIGS['target_threshold']

# The limits are stored under 'ylim' for one dataset and under 'metrics_lim' for
# the others. Accept both spellings so that no dataset silently falls back to the
# default range; 'ylim' wins when both are present.
_DEFAULT_YLIM = (0, 1.01)
YLIM = DATASET_CONFIGS.get('ylim', DATASET_CONFIGS.get('metrics_lim', _DEFAULT_YLIM))

In [5]:
# Output folder for this dataset's tables; created (with parents) if it does not exist yet.
RESULTS_DIR = Path(f'../results/{DATASET_NAME}')
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# A. Load Data

The data contains both the validation and test data for the same `eval_experiment_uuid`, `run_uuid`, and `index`. It also contains the predictive accuracy and fairness metrics for each evaluated model.

In [6]:
# Load the artifacts CSV of the selected dataset; the first CSV column becomes the row index.
all_artifacts = pd.read_csv(DATA_DIR / f"{DATASET_NAME}.csv", index_col=0)
# Preview the first rows.
all_artifacts.head()

Unnamed: 0,index,eval_experiment_uuid,run_uuid,si,i,ni,ri,metric_val,model_config_uuid,model_uuid,...,matrix_type,target_threshold,validation_precision,validation_gender_rel_tpr_parity,alpha,alpha_shift,eta_exp_budget,tuner,test_precision,test_gender_rel_tpr_parity
0,1,a444331d20d5e02d8f766873d7616474,06ecaec1696944bcef0a08895f351afc,4,0,81,1.235,0.7991,2f26ce3f54c85e97b07f842394837ce3,94747998b08e6de9b70aa3f118173a0d,...,validation,tpr = 50,0.9683,0.629917,0.5,0.0,0.0123,FB (α=0.5),0.9736,0.630274
1,2,a444331d20d5e02d8f766873d7616474,06ecaec1696944bcef0a08895f351afc,4,0,81,1.235,0.733,63570e643b3bc6a1a90d406e9b03d35a,4a5bb76a7915981374b59bb138140980,...,validation,tpr = 50,0.9837,0.482216,0.5,0.0,0.0123,FB (α=0.5),0.983,0.48024
2,3,a444331d20d5e02d8f766873d7616474,06ecaec1696944bcef0a08895f351afc,4,0,81,1.235,0.936,acdc802be17d996db5897788de56d974,dcd640e460dc559b19e473d28ac7a60a,...,validation,tpr = 50,0.895,0.976976,0.5,0.0,0.0123,FB (α=0.5),0.0,0.0
3,4,a444331d20d5e02d8f766873d7616474,06ecaec1696944bcef0a08895f351afc,4,0,81,1.235,0.7967,9576edde7e47de908c0a07a27b6cf797,c04adf640a6c72d33505ae9dd906dbe4,...,validation,tpr = 50,0.8323,0.761067,0.5,0.0,0.0123,FB (α=0.5),0.8378,0.766146
4,5,a444331d20d5e02d8f766873d7616474,06ecaec1696944bcef0a08895f351afc,4,0,81,1.235,0.7744,5922613d58f875366fefd06b9d8ef989,dcf70c441865698ee7eea75e85ddb9a2,...,validation,tpr = 50,0.7211,0.827607,0.5,0.0,0.0123,FB (α=0.5),0.7189,0.818355


In [7]:
# Inspect the available columns.
all_artifacts.columns

Index(['index', 'eval_experiment_uuid', 'run_uuid', 'si', 'i', 'ni', 'ri',
       'metric_val', 'model_config_uuid', 'model_uuid', 'model_classpath',
       'hyperparameters', 'class_ratio', 'iteration_type', 'iteration_params',
       'matrix_type', 'target_threshold', 'validation_precision',
       'validation_gender_rel_tpr_parity', 'alpha', 'alpha_shift',
       'eta_exp_budget', 'tuner', 'test_precision',
       'test_gender_rel_tpr_parity'],
      dtype='object')

## A.1. Transformations 

In [8]:
# Columns that are cast to float before any computation.
float_cols = (
    f'validation_{PERFORMANCE_METRIC}',
    f'validation_{FAIRNESS_METRIC}',
    'alpha_shift',
    'eta_exp_budget',
    f'test_{PERFORMANCE_METRIC}',
    f'test_{FAIRNESS_METRIC}',
)
all_artifacts[list(float_cols)] = all_artifacts[list(float_cols)].astype(float)

# These columns hold text that is evaluated as Python expressions to rebuild the
# original objects (presumably dicts -- TODO confirm).
# NOTE(review): eval() executes whatever code the CSV contains. Only run this on
# trusted files; ast.literal_eval would be the safer choice if every value is a
# plain literal.
for literal_col in ('iteration_params', 'hyperparameters'):
    all_artifacts[literal_col] = all_artifacts[literal_col].apply(eval)

## A.2. Results processing 

Now that the results are in their final form, we want to transform them in a way that allows us to represent the result of each hyperparameter optimization (HO) algorithm.

In [9]:
# Columns that together form the unique identifier of a hyperparameter tuner model
# (the unit of comparison).
ID_COLS = ['eval_experiment_uuid', 'run_uuid', 'index']

In [10]:
def get_top_model_static_alpha(run_df, alpha, matrix_type='validation'):
    """Return the row of ``run_df`` with the highest alpha-weighted score.

    Each row is scored as ``alpha * performance + (1 - alpha) * fairness``,
    where the two metrics are read from the
    ``{matrix_type}_{PERFORMANCE_METRIC}`` and
    ``{matrix_type}_{FAIRNESS_METRIC}`` columns.

    Parameters
    ----------
    run_df : pd.DataFrame
        Candidate models, one per row; must contain the two metric columns.
    alpha : float
        Weight given to the performance metric; fairness gets ``1 - alpha``.
    matrix_type : str, default 'validation'
        Prefix of the metric columns to score on.

    Returns
    -------
    pd.Series
        The best-scoring row (the first one when several rows tie).
    """
    performance = run_df[f'{matrix_type}_{PERFORMANCE_METRIC}']
    fairness = run_df[f'{matrix_type}_{FAIRNESS_METRIC}']
    # Column arithmetic yields the same scores as a row-by-row `apply`, without
    # building a Series object for every row.
    metric_vals = performance * alpha + fairness * (1 - alpha)
    # argmax is positional, hence `.iloc` (the frame's index labels are arbitrary).
    max_idx = np.argmax(metric_vals)
    return run_df.iloc[max_idx]

def get_top_model(run_df, alpha='dynamic', matrix_type='validation'):
    """Select the best model of a run for a given performance/fairness trade-off.

    Parameters
    ----------
    run_df : pd.DataFrame
        Candidate models of one run, one per row.
    alpha : float, str, default 'dynamic'
        Weight of the performance metric. Anything other than the string
        ``'dynamic'`` is converted with ``float()``. With ``'dynamic'`` the
        weight is derived from the run itself as
        ``0.5 + (mean fairness - mean performance) / 2``.
    matrix_type : str, default 'validation'
        Prefix of the metric columns used both for the dynamic alpha and for
        ranking the models.

    Returns
    -------
    pd.Series
        The row chosen by ``get_top_model_static_alpha``.
    """
    if alpha == 'dynamic':
        avg_f = run_df[f'{matrix_type}_{FAIRNESS_METRIC}'].mean()
        avg_p = run_df[f'{matrix_type}_{PERFORMANCE_METRIC}'].mean()
        alpha = 0.5 + ((avg_f - avg_p) / 2)
    else:
        alpha = float(alpha)

    assert isinstance(alpha, (float, int))
    # Forward matrix_type: it used to be dropped here, so the ranking always ran
    # on the validation columns regardless of what the caller asked for.
    return get_top_model_static_alpha(run_df, alpha, matrix_type=matrix_type)

In [11]:
def _best_of_run(run_artifacts):
    """Pick the top model of one run, using the first alpha value found in that run."""
    run_alpha = run_artifacts['alpha'].unique()[0]
    return get_top_model(run_artifacts, alpha=run_alpha)


# One selected model per run_uuid.
top_models = all_artifacts.groupby('run_uuid').apply(_best_of_run)

In [12]:
# Mean and standard deviation of every numeric column, per tuner.
# Only numeric columns are aggregated: since pandas 2.0, `mean`/`std` raise a
# TypeError on text columns (uuids, class paths, ...) instead of silently
# dropping them. `infer_objects()` first restores numeric dtypes for columns that
# may have come back as `object` from the per-run selection.
_numeric_top_models = top_models.infer_objects().select_dtypes(include='number')
full_agg_table = _numeric_top_models.groupby(top_models['tuner']).agg(['mean', 'std'])

In [13]:
import scipy.stats as st

# B. Validation & Test Results

In [14]:
# Column names of the validation/test performance and fairness metrics for the
# selected dataset.
v_metric_perf = f'validation_{PERFORMANCE_METRIC}'
v_metric_fair = f'validation_{FAIRNESS_METRIC}'
t_metric_perf = f'test_{PERFORMANCE_METRIC}'
t_metric_fair = f'test_{FAIRNESS_METRIC}'

# Keep only those four metrics from the aggregated table.
agg_table = full_agg_table[
    [
        v_metric_perf,
        v_metric_fair,
        t_metric_perf,
        t_metric_fair,
    ]
]

In [15]:
# Turn the `tuner` index into a regular column. Rebinding (rather than
# `inplace=True` on a column subset of another frame) avoids mutating a possible
# view, and the guard keeps the cell safe to re-run.
if 'tuner' not in agg_table.columns:
    agg_table = agg_table.reset_index(drop=False)

# Short algorithm names used in the exported table. A tuner without an entry
# keeps its raw name instead of aborting the cell with a KeyError.
TUNER_LABELS = {
    'FB': 'FB-auto',
    'FB (α=0.5)': 'FB-bal',
    'RS (α=0.5)': 'RS-bal',
    'HB': 'HB',
    'RS': 'RS',
}


def _as_percentage(fraction):
    """Format a metric given as a fraction as a LaTeX math cell with a one-decimal percentage."""
    # Fixed-point `.1f` instead of the general format `.3`: the latter keeps three
    # significant digits, so 100.0 would be rendered as `1e+02` and values below
    # 10 would get a different number of decimals than the rest.
    return f'$ {100 * fraction:.1f} $'


# One row per tuner; every metric cell shows the mean over that tuner's runs.
latex_table = pd.DataFrame({
    'Algorithm': agg_table['tuner'].map(lambda name: TUNER_LABELS.get(name, name)),
    'Val. Perf.': agg_table[(v_metric_perf, 'mean')].map(_as_percentage),
    'Val. Fair.': agg_table[(v_metric_fair, 'mean')].map(_as_percentage),
    'Test Perf.': agg_table[(t_metric_perf, 'mean')].map(_as_percentage),
    'Test Fair.': agg_table[(t_metric_fair, 'mean')].map(_as_percentage),
})

latex_table.to_latex(
    RESULTS_DIR / 'results_table.tex',
    index=False
)
latex_table

Unnamed: 0,Algorithm,Val. Perf.,Val. Fair.,Test Perf.,Test Fair.
0,FB-auto,$ 90.6 $,$ 95.6 $,$ 90.1 $,$ 93.9 $
1,FB-bal,$ 91.9 $,$ 94.2 $,$ 79.7 $,$ 79.0 $
2,HB,$ 99.4 $,$ 53.2 $,$ 99.4 $,$ 53.3 $
3,RS,$ 99.3 $,$ 55.1 $,$ 99.4 $,$ 55.1 $
4,RS-bal,$ 89.8 $,$ 83.7 $,$ 90.5 $,$ 83.4 $


# C. P-Values

In [16]:
# Validation metrics of the selected top models, grouped by tuner:
# {tuner name: [metric value of each of its top models]}.
val_performance = {}
val_fairness = {}
for _tuner_name in top_models['tuner'].unique():
    _tuner_rows = top_models[top_models['tuner'] == _tuner_name]
    val_performance[_tuner_name] = _tuner_rows[f'validation_{PERFORMANCE_METRIC}'].to_list()
    val_fairness[_tuner_name] = _tuner_rows[f'validation_{FAIRNESS_METRIC}'].to_list()

In [17]:
# Reference tuners: the p-value tables below keep only these as columns.
reference_groups = ['HB', 'RS']
# The remaining tuner names (not referenced in the cells shown here).
other_groups = ['FB', 'FB (α=0.5)', 'RS (α=0.5)']

In [18]:
# Tuner names in the key order of `val_performance`, plus an object-dtype array of the same names.
tuners = [*val_performance]
tuners_cat = np.array(tuners, dtype=object)


def compute_ks_test(results, tuners=None, tuners_cat=None):
    """Run a two-sample Kolmogorov-Smirnov test for every ordered pair of tuners.

    Parameters
    ----------
    results : dict
        Maps a tuner name to the sequence of metric values observed for it.
    tuners : sequence of str, optional
        Tuner names to compare. Defaults to the keys of ``results``.
    tuners_cat : sequence of str, optional
        Names used as row/column labels of the returned tables. Defaults to
        ``tuners``.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        KS statistics and p-values. Both are float tables whose rows
        (``row_0``) and columns (``col_0``) are the sorted unique labels.
    """
    # Resolve the defaults at call time rather than freezing module state into
    # the signature when the cell is executed.
    if tuners is None:
        tuners = list(results)
    if tuners_cat is None:
        tuners_cat = tuners

    # Float tables from the start: writing floats into the integer count table
    # returned by `pd.crosstab` relies on implicit upcasting, which recent pandas
    # versions deprecate. The axis names and sorted labels match what crosstab
    # produced, so exported CSVs keep their layout.
    labels = sorted(set(tuners_cat))
    ks_values = pd.DataFrame(
        np.nan,
        index=pd.Index(labels, name='row_0'),
        columns=pd.Index(labels, name='col_0'),
        dtype=float,
    )
    p_values = ks_values.copy()

    for row_tuner, col_tuner in itertools.product(tuners, tuners):
        ks_stats, p_value = st.ks_2samp(results[row_tuner], results[col_tuner])

        ks_values.loc[row_tuner, col_tuner] = ks_stats
        p_values.loc[row_tuner, col_tuner] = p_value

    return ks_values, p_values

## C.1. Validation

### Performance

In [19]:
# Show p-values with 15 decimals. The full option name is required: since
# pandas 1.4 the short pattern 'precision' matches more than one option and
# raises an OptionError.
pd.set_option('display.precision', 15)

In [20]:
# Pairwise KS tests on validation performance; keep only the reference-tuner columns of the p-values.
val_perf_ks, val_perf_p = compute_ks_test(val_performance)
val_perf_p = val_perf_p[reference_groups]

In [21]:
# Save the p-values next to the other results, then display them.
val_perf_p.to_csv(RESULTS_DIR / 'val_perf_p_values.csv')
val_perf_p

col_0,HB,RS
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,1.289345e-08,1.289345e-08
FB (α=0.5),1.289345e-08,1.289345e-08
HB,1.0,0.075464009481327
RS,0.075464009481327,1.0
RS (α=0.5),1.289345e-08,1.289345e-08


In [22]:
# True where the p-value is at or below 0.01.
val_perf_p <= 0.01

col_0,HB,RS
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,True,True
FB (α=0.5),True,True
HB,False,False
RS,False,False
RS (α=0.5),True,True


In [23]:
# True where the p-value is at or below 0.05.
val_perf_p <= 0.05

col_0,HB,RS
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,True,True
FB (α=0.5),True,True
HB,False,False
RS,False,False
RS (α=0.5),True,True


### Fairness

In [24]:
# Pairwise KS tests on validation fairness; keep only the reference-tuner columns of the p-values.
val_fair_ks, val_fair_p = compute_ks_test(val_fairness)
val_fair_p = val_fair_p[reference_groups]

In [25]:
# Save the p-values next to the other results, then display them.
val_fair_p.to_csv(RESULTS_DIR / 'val_fair_p_values.csv')
val_fair_p

col_0,HB,RS
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,1.289345e-08,1.289345e-08
FB (α=0.5),1.289345e-08,1.289345e-08
HB,1.0,0.184416176844498
RS,0.184416176844498,1.0
RS (α=0.5),1.289345e-08,1.289345e-08


In [26]:
# True where the p-value is at or below 0.01.
val_fair_p[reference_groups] <= 0.01

col_0,HB,RS
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,True,True
FB (α=0.5),True,True
HB,False,False
RS,False,False
RS (α=0.5),True,True


In [27]:
# True where the p-value is at or below 0.05.
val_fair_p[reference_groups] <= 0.05

col_0,HB,RS
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,True,True
FB (α=0.5),True,True
HB,False,False
RS,False,False
RS (α=0.5),True,True


## C.2. Test

In [28]:
# Test metrics of the selected top models, grouped by tuner:
# {tuner name: [metric value of each of its top models]}.
test_performance = {}
test_fairness = {}
for _tuner_label in top_models['tuner'].unique():
    _selected_rows = top_models[top_models['tuner'] == _tuner_label]
    test_performance[_tuner_label] = _selected_rows[f'test_{PERFORMANCE_METRIC}'].to_list()
    test_fairness[_tuner_label] = _selected_rows[f'test_{FAIRNESS_METRIC}'].to_list()

### Performance (test)

In [29]:
# Pairwise KS tests on test performance; keep only the reference-tuner columns of the p-values.
test_perf_ks, test_perf_p = compute_ks_test(test_performance)
test_perf_p = test_perf_p[reference_groups]

In [30]:
# Save the p-values next to the other results, then display them.
test_perf_p.to_csv(RESULTS_DIR / 'test_perf_p_values.csv')
test_perf_p

col_0,HB,RS
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,1.289345e-08,1.289345e-08
FB (α=0.5),1.289345e-08,1.289345e-08
HB,1.0,0.678138227068097
RS,0.678138227068097,1.0
RS (α=0.5),5.608650783e-06,5.608650783e-06


In [31]:
# True where the p-value is at or below 0.01.
test_perf_p[reference_groups] <= 0.01

col_0,HB,RS
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,True,True
FB (α=0.5),True,True
HB,False,False
RS,False,False
RS (α=0.5),True,True


In [32]:
# True where the p-value is at or below 0.05.
test_perf_p[reference_groups] <= 0.05

col_0,HB,RS
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,True,True
FB (α=0.5),True,True
HB,False,False
RS,False,False
RS (α=0.5),True,True


### Fairness (test)

In [33]:
# Pairwise KS tests on test fairness; keep only the reference-tuner columns of the p-values.
test_fair_ks, test_fair_p = compute_ks_test(test_fairness)
test_fair_p = test_fair_p[reference_groups]

In [34]:
# Save the p-values next to the other results, then display them.
test_fair_p.to_csv(RESULTS_DIR / 'test_fair_p_values.csv')
test_fair_p

col_0,HB,RS
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,1.289345e-08,1.289345e-08
FB (α=0.5),5.608650783e-06,5.608650783e-06
HB,1.0,0.184416176844498
RS,0.184416176844498,1.0
RS (α=0.5),1.289345e-08,1.289345e-08


In [35]:
# True where the p-value is at or below 0.01.
test_fair_p[reference_groups] <= 0.01 

col_0,HB,RS
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,True,True
FB (α=0.5),True,True
HB,False,False
RS,False,False
RS (α=0.5),True,True


In [36]:
# True where the p-value is at or below 0.05.
test_fair_p[reference_groups] <= 0.05

col_0,HB,RS
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,True,True
FB (α=0.5),True,True
HB,False,False
RS,False,False
RS (α=0.5),True,True
