In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy.stats import kruskal, mannwhitneyu
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, average_precision_score, f1_score,
                             precision_score, recall_score, matthews_corrcoef, roc_auc_score)
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

sns.set_theme(style = 'whitegrid', context='paper', palette='Greys', font_scale=1.5)

In [4]:
def evaluate_methods(train, test, label, mlp_test_ap, rf_test_ap, xgb_test_ap, mlp_test_prec, rf_test_prec, xgb_test_prec, mlp_test_rec, rf_test_rec, xgb_test_rec, mlp_test_f1, rf_test_f1, xgb_test_f1, mlp_test_acc, rf_test_acc, xgb_test_acc):
    mlp_sp = MLPClassifier(alpha=0.0001, hidden_layer_sizes=(50,), learning_rate='constant', learning_rate_init=0.01, max_iter=200)
    rf_sp = RandomForestClassifier(max_depth=10, max_features='log2', min_samples_leaf=1, min_samples_split=10, n_estimators=500)
    xgb_sp = XGBClassifier(colsample_bytree=0.8, gamma=0.2, learning_rate=0.05, max_depth=6, subsample=0.6)

    mlp_sp.fit(train.drop(columns=label), train[label])
    rf_sp.fit(train.drop(columns=label), train[label])
    xgb_sp.fit(train.drop(columns=label), train[label])

    mlp_test_sp = mlp_sp.predict_proba(test.drop(columns=label))
    rf_test_sp = rf_sp.predict_proba(test.drop(columns=label))
    xgb_test_sp = xgb_sp.predict_proba(test.drop(columns=label))

    mlp_test_pred_sp = mlp_sp.predict(test.drop(columns=label))
    rf_test_pred_sp = rf_sp.predict(test.drop(columns=label))
    xgb_test_pred_sp = xgb_sp.predict(test.drop(columns=label))

    mlp_test_ap.append(roc_auc_score(test[label], mlp_test_sp[:, 1]))
    rf_test_ap.append(roc_auc_score(test[label], rf_test_sp[:, 1]))
    xgb_test_ap.append(roc_auc_score(test[label], xgb_test_sp[:, 1]))

    mlp_test_prec.append(precision_score(test[label], mlp_test_pred_sp))
    rf_test_prec.append(precision_score(test[label], rf_test_pred_sp))
    xgb_test_prec.append(precision_score(test[label], xgb_test_pred_sp))

    mlp_test_rec.append(recall_score(test[label], mlp_test_pred_sp))
    rf_test_rec.append(recall_score(test[label], rf_test_pred_sp))
    xgb_test_rec.append(recall_score(test[label], xgb_test_pred_sp))

    mlp_test_f1.append(f1_score(test[label], mlp_test_pred_sp))
    rf_test_f1.append(f1_score(test[label], rf_test_pred_sp))
    xgb_test_f1.append(f1_score(test[label], xgb_test_pred_sp))

    mlp_test_acc.append(accuracy_score(test[label], mlp_test_pred_sp))
    rf_test_acc.append(accuracy_score(test[label], rf_test_pred_sp))
    xgb_test_acc.append(accuracy_score(test[label], xgb_test_pred_sp))

## MLP Base Classifier

In [5]:
data = pd.read_csv('../data/bias_symptoms_mlp_reduced.csv', index_col=[0,1])
data.index = data.index.droplevel(1)
labels = ['statistical_parity', 'equal_opportunity', 'average_odds']

In [6]:
# Average Precision

sp_mlp_train_ap = []
sp_rf_train_ap = []
sp_xgb_train_ap = []
sp_mlp_test_ap = []
sp_rf_test_ap = []
sp_xgb_test_ap = []

eo_mlp_train_ap = []
eo_rf_train_ap = []
eo_xgb_train_ap = []
eo_mlp_test_ap = []
eo_rf_test_ap = []
eo_xgb_test_ap = []

ao_mlp_train_ap = []
ao_rf_train_ap = []
ao_xgb_train_ap = []
ao_mlp_test_ap = []
ao_rf_test_ap = []
ao_xgb_test_ap = []

# Precision

sp_mlp_train_prec = []
sp_rf_train_prec = []
sp_xgb_train_prec = []
sp_mlp_test_prec = []
sp_rf_test_prec = []
sp_xgb_test_prec = []

eo_mlp_train_prec = []
eo_rf_train_prec = []
eo_xgb_train_prec = []
eo_mlp_test_prec = []
eo_rf_test_prec = []
eo_xgb_test_prec = []

ao_mlp_train_prec = []
ao_rf_train_prec = []
ao_xgb_train_prec = []
ao_mlp_test_prec = []
ao_rf_test_prec = []
ao_xgb_test_prec = []

# Recall

sp_mlp_train_rec = []
sp_rf_train_rec = []
sp_xgb_train_rec = []
sp_mlp_test_rec = []
sp_rf_test_rec = []
sp_xgb_test_rec = []

eo_mlp_train_rec = []
eo_rf_train_rec = []
eo_xgb_train_rec = []
eo_mlp_test_rec = []
eo_rf_test_rec = []
eo_xgb_test_rec = []

ao_mlp_train_rec = []
ao_rf_train_rec = []
ao_xgb_train_rec = []
ao_mlp_test_rec = []
ao_rf_test_rec = []
ao_xgb_test_rec = []

# F1 Score

sp_mlp_train_f1 = []
sp_rf_train_f1 = []
sp_xgb_train_f1 = []
sp_mlp_test_f1 = []
sp_rf_test_f1 = []
sp_xgb_test_f1 = []

eo_mlp_train_f1 = []
eo_rf_train_f1 = []
eo_xgb_train_f1 = []
eo_mlp_test_f1 = []
eo_rf_test_f1 = []
eo_xgb_test_f1 = []

ao_mlp_train_f1 = []
ao_rf_train_f1 = []
ao_xgb_train_f1 = []
ao_mlp_test_f1 = []
ao_rf_test_f1 = []
ao_xgb_test_f1 = []

# Accuracy

sp_mlp_train_acc = []
sp_rf_train_acc = []
sp_xgb_train_acc = []
sp_mlp_test_acc = []
sp_rf_test_acc = []
sp_xgb_test_acc = []

eo_mlp_train_acc = []
eo_rf_train_acc = []
eo_xgb_train_acc = []
eo_mlp_test_acc = []
eo_rf_test_acc = []
eo_xgb_test_acc = []

ao_mlp_train_acc = []
ao_rf_train_acc = []
ao_xgb_train_acc = []
ao_mlp_test_acc = []
ao_rf_test_acc = []
ao_xgb_test_acc = []

# MCC
sp_mlp_train_mcc = []
sp_rf_train_mcc = []
sp_xgb_train_mcc = []
sp_mlp_test_mcc = []
sp_rf_test_mcc = []
sp_xgb_test_mcc = []

eo_mlp_train_mcc = []
eo_rf_train_mcc = []
eo_xgb_train_mcc = []
eo_mlp_test_mcc = []
eo_rf_test_mcc = []
eo_xgb_test_mcc = []

ao_mlp_train_mcc = []
ao_rf_train_mcc = []
ao_xgb_train_mcc = []
ao_mlp_test_mcc = []
ao_rf_test_mcc = []
ao_xgb_test_mcc = []

kfold = KFold(n_splits=5, shuffle=True, random_state=42)
for itrain, itest in kfold.split(data.index.unique().values):
    train_index = data.index.unique()[itrain]
    test_index = data.index.unique()[itest]
    print(train_index)
    print(test_index)
    train = data.loc[train_index]
    test = data.loc[test_index]

    print("Train size: ", train.shape[0])
    print("Test size: ", test.shape[0])

    # Statistical Parity

    evaluate_methods(
        train, 
        test, 
        'statistical_parity', 
        sp_mlp_test_ap, 
        sp_rf_test_ap, 
        sp_xgb_test_ap, 
        sp_mlp_test_prec, 
        sp_rf_test_prec, 
        sp_xgb_test_prec, 
        sp_mlp_test_rec, 
        sp_rf_test_rec, 
        sp_xgb_test_rec, 
        sp_mlp_test_f1, 
        sp_rf_test_f1, 
        sp_xgb_test_f1, 
        sp_mlp_test_acc, 
        sp_rf_test_acc, 
        sp_xgb_test_acc
      )

    # Equal Opportunity

    evaluate_methods(
        train,
        test,
        "equal_opportunity",
        eo_mlp_test_ap,
        eo_rf_test_ap,
        eo_xgb_test_ap,
        eo_mlp_test_prec,
        eo_rf_test_prec,
        eo_xgb_test_prec,
        eo_mlp_test_rec,
        eo_rf_test_rec,
        eo_xgb_test_rec,
        eo_mlp_test_f1,
        eo_rf_test_f1,
        eo_xgb_test_f1,
        eo_mlp_test_acc,
        eo_rf_test_acc,
        eo_xgb_test_acc,
    )

    # Average Odds

    evaluate_methods(
        train,
        test,
        "average_odds",
        ao_mlp_test_ap,
        ao_rf_test_ap,
        ao_xgb_test_ap,
        ao_mlp_test_prec,
        ao_rf_test_prec,
        ao_xgb_test_prec,
        ao_mlp_test_rec,
        ao_rf_test_rec,
        ao_xgb_test_rec,
        ao_mlp_test_f1,
        ao_rf_test_f1,
        ao_xgb_test_f1,
        ao_mlp_test_acc,
        ao_rf_test_acc,
        ao_xgb_test_acc,
    )

Index(['german', 'student', 'adult', 'park', 'wine', 'placement', 'hearth',
       'resyduo', 'us', 'law', 'medical', 'vaccine', 'compas', 'ricci', 'bank',
       'health', 'diabetic', 'crime', 'ibm'],
      dtype='object', name='data')
Index(['arrhythmia', 'drug', 'cmc', 'obesity', 'credit'], dtype='object', name='data')
Train size:  5492
Test size:  438
Index(['arrhythmia', 'student', 'adult', 'park', 'placement', 'hearth', 'drug',
       'us', 'cmc', 'law', 'vaccine', 'compas', 'obesity', 'ricci', 'credit',
       'bank', 'health', 'crime', 'ibm'],
      dtype='object', name='data')
Index(['german', 'wine', 'resyduo', 'medical', 'diabetic'], dtype='object', name='data')
Train size:  4612
Test size:  1318
Index(['arrhythmia', 'german', 'wine', 'placement', 'hearth', 'drug',
       'resyduo', 'us', 'cmc', 'medical', 'vaccine', 'obesity', 'ricci',
       'credit', 'bank', 'health', 'diabetic', 'crime', 'ibm'],
      dtype='object', name='data')
Index(['student', 'adult', 'park', 'law',

In [7]:
sp_data_full_xgb = pd.DataFrame({
    'ap': sp_xgb_test_ap,
    'accuracy': sp_xgb_test_acc,
    'precision': sp_xgb_test_prec,
    'recall': sp_xgb_test_rec,
    'f1': sp_xgb_test_f1,
})
sp_data_full_xgb['model'] = 'XGBoost'
sp_data_full_rf = pd.DataFrame({
    'ap': sp_rf_test_ap,
    'accuracy': sp_rf_test_acc,
    'precision': sp_rf_test_prec,
    'recall': sp_rf_test_rec,
    'f1': sp_rf_test_f1,
})
sp_data_full_rf['model'] = 'Random Forest'
sp_data_full_mlp = pd.DataFrame({
    'ap': sp_mlp_test_ap,
    'accuracy': sp_mlp_test_acc,
    'precision': sp_mlp_test_prec,
    'recall': sp_mlp_test_rec,
    'f1': sp_mlp_test_f1,
}) 
sp_data_full_mlp['model'] = 'MLP'
sp_data_full = pd.concat([sp_data_full_xgb, sp_data_full_rf, sp_data_full_mlp])

eo_data_full_xgb = pd.DataFrame({
    'ap': eo_xgb_test_ap,
    'accuracy': eo_xgb_test_acc,
    'precision': eo_xgb_test_prec,
    'recall': eo_xgb_test_rec,
    'f1': eo_xgb_test_f1,
})
eo_data_full_xgb['model'] = 'XGBoost'
eo_data_full_rf = pd.DataFrame({
    'ap': eo_rf_test_ap,
    'accuracy': eo_rf_test_acc,
    'precision': eo_rf_test_prec,
    'recall': eo_rf_test_rec,
    'f1': eo_rf_test_f1,
})
eo_data_full_rf['model'] = 'Random Forest'
eo_data_full_mlp = pd.DataFrame({
    'ap': eo_mlp_test_ap,
    'accuracy': eo_mlp_test_acc,
    'precision': eo_mlp_test_prec,
    'recall': eo_mlp_test_rec,
    'f1': eo_mlp_test_f1,
})
eo_data_full_mlp['model'] = 'MLP'
eo_data_full = pd.concat([eo_data_full_xgb, eo_data_full_rf, eo_data_full_mlp])

ao_data_full_xgb = pd.DataFrame({
    'ap': ao_xgb_test_ap,
    'accuracy': ao_xgb_test_acc,
    'precision': ao_xgb_test_prec,
    'recall': ao_xgb_test_rec,
    'f1': ao_xgb_test_f1,
})
ao_data_full_xgb['model'] = 'XGBoost'
ao_data_full_rf = pd.DataFrame({
    'ap': ao_rf_test_ap,
    'accuracy': ao_rf_test_acc,
    'precision': ao_rf_test_prec,
    'recall': ao_rf_test_rec,
    'f1': ao_rf_test_f1,
})
ao_data_full_rf['model'] = 'Random Forest'
ao_data_full_mlp = pd.DataFrame({
    'ap': ao_mlp_test_ap,
    'accuracy': ao_mlp_test_acc,
    'precision': ao_mlp_test_prec,
    'recall': ao_mlp_test_rec,
    'f1': ao_mlp_test_f1,
})
ao_data_full_mlp['model'] = 'MLP'
ao_data_full = pd.concat([ao_data_full_xgb, ao_data_full_rf, ao_data_full_mlp])

In [8]:
def group_metrics(data):
    return data.groupby('model').agg(['mean', 'std']).round(3).reset_index()

In [9]:
sp = group_metrics(sp_data_full)
eo = group_metrics(eo_data_full)
ao = group_metrics(ao_data_full)
full_data = pd.concat([sp_data_full, eo_data_full, ao_data_full])
full = group_metrics(full_data)
sp.to_csv("bias_symptoms_sp.csv")
eo.to_csv("bias_symptoms_eo.csv")
ao.to_csv("bias_symptoms_ao.csv")
full.to_csv("bias_symptoms_full.csv")

In [10]:
sp

Unnamed: 0_level_0,model,ap,ap,accuracy,accuracy,precision,precision,recall,recall,f1,f1
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
0,MLP,0.82,0.091,0.737,0.152,0.674,0.164,0.589,0.162,0.613,0.134
1,Random Forest,0.898,0.042,0.817,0.118,0.806,0.119,0.646,0.191,0.706,0.142
2,XGBoost,0.893,0.054,0.833,0.103,0.78,0.132,0.714,0.176,0.737,0.14


In [11]:
eo

Unnamed: 0_level_0,model,ap,ap,accuracy,accuracy,precision,precision,recall,recall,f1,f1
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
0,MLP,0.69,0.13,0.627,0.134,0.612,0.265,0.58,0.09,0.575,0.167
1,Random Forest,0.779,0.127,0.75,0.119,0.689,0.209,0.73,0.138,0.703,0.178
2,XGBoost,0.791,0.138,0.737,0.114,0.682,0.205,0.71,0.139,0.689,0.172


In [12]:
ao

Unnamed: 0_level_0,model,ap,ap,accuracy,accuracy,precision,precision,recall,recall,f1,f1
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
0,MLP,0.829,0.079,0.767,0.093,0.72,0.163,0.611,0.191,0.655,0.172
1,Random Forest,0.836,0.067,0.814,0.072,0.792,0.117,0.687,0.131,0.735,0.124
2,XGBoost,0.838,0.094,0.804,0.093,0.767,0.115,0.699,0.148,0.729,0.129


### Comparison between groups

In [13]:
def compute_kruscal(sp_data_full, sp_data_full_mlp, sp_data_full_rf):
    ap = kruskal(sp_data_full["ap"], sp_data_full_mlp["ap"], sp_data_full_rf["ap"])
    accuracy = kruskal(sp_data_full["accuracy"], sp_data_full_mlp["accuracy"], sp_data_full_rf["accuracy"])
    precision = kruskal(sp_data_full["precision"], sp_data_full_mlp["precision"], sp_data_full_rf["precision"])
    recall = kruskal(sp_data_full["recall"], sp_data_full_mlp["recall"], sp_data_full_rf["recall"])
    f1 = kruskal(sp_data_full["f1"], sp_data_full_mlp["f1"], sp_data_full_rf["f1"])
    print(round(ap[1],2))
    print(round(accuracy[1],2))
    print(round(precision[1],2))
    print(round(recall[1],2))
    print(round(f1[1],2))

In [14]:
compute_kruscal(sp_data_full_xgb, sp_data_full_mlp, sp_data_full_rf)

0.28
0.33
0.51
0.4
0.14


In [15]:
compute_kruscal(eo_data_full_xgb, eo_data_full_mlp, eo_data_full_rf)

0.11
0.09
0.97
0.11
0.26


In [16]:
compute_kruscal(ao_data_full_xgb, ao_data_full_mlp, ao_data_full_rf)

0.89
0.62
0.73
0.45
0.47
