In [25]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy.stats import kruskal, mannwhitneyu
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, average_precision_score, f1_score,
                             precision_score, recall_score, matthews_corrcoef, roc_auc_score)
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from joblib import load

sns.set_theme(style = 'whitegrid', context='paper', palette='Greys', font_scale=1.5)

In [26]:
data = pd.read_csv('../data/bias_symptoms_logreg_reduced.csv', index_col=[0,1])
data.index = data.index.droplevel(1)
labels = ['statistical_parity', 'equal_opportunity', 'average_odds']

In [29]:
def evaluate_methods(train, test, label, mlp_test_ap, rf_test_ap, xgb_test_ap, mlp_test_prec, rf_test_prec, xgb_test_prec, mlp_test_rec, rf_test_rec, xgb_test_rec, mlp_test_f1, rf_test_f1, xgb_test_f1, mlp_test_acc, rf_test_acc, xgb_test_acc, mlp_sp, rf_sp, xgb_sp):
    mlp_sp.fit(train.drop(columns=label), train[label])
    rf_sp.fit(train.drop(columns=label), train[label])
    xgb_sp.fit(train.drop(columns=label), train[label])

    # mlp_test_sp = mlp_sp.predict_proba(test.drop(columns=label))
    # rf_test_sp = rf_sp.predict_proba(test.drop(columns=label))
    # xgb_test_sp = xgb_sp.predict_proba(test.drop(columns=label))

    mlp_test_pred_sp = mlp_sp.predict(test.drop(columns=label))
    rf_test_pred_sp = rf_sp.predict(test.drop(columns=label))
    xgb_test_pred_sp = xgb_sp.predict(test.drop(columns=label))

    # mlp_test_ap.append(roc_auc_score(test[label], mlp_test_sp[:, 1]))
    # rf_test_ap.append(roc_auc_score(test[label], rf_test_sp[:, 1]))
    # xgb_test_ap.append(roc_auc_score(test[label], xgb_test_sp[:, 1]))

    mlp_test_prec.append(precision_score(test[label], mlp_test_pred_sp))
    rf_test_prec.append(precision_score(test[label], rf_test_pred_sp))
    xgb_test_prec.append(precision_score(test[label], xgb_test_pred_sp))

    mlp_test_rec.append(recall_score(test[label], mlp_test_pred_sp))
    rf_test_rec.append(recall_score(test[label], rf_test_pred_sp))
    xgb_test_rec.append(recall_score(test[label], xgb_test_pred_sp))

    mlp_test_f1.append(f1_score(test[label], mlp_test_pred_sp))
    rf_test_f1.append(f1_score(test[label], rf_test_pred_sp))
    xgb_test_f1.append(f1_score(test[label], xgb_test_pred_sp))

    mlp_test_acc.append(accuracy_score(test[label], mlp_test_pred_sp))
    rf_test_acc.append(accuracy_score(test[label], rf_test_pred_sp))
    xgb_test_acc.append(accuracy_score(test[label], xgb_test_pred_sp))

In [30]:
# Average Precision

sp_mlp_train_ap = []
sp_rf_train_ap = []
sp_xgb_train_ap = []
sp_mlp_test_ap = []
sp_rf_test_ap = []
sp_xgb_test_ap = []

eo_mlp_train_ap = []
eo_rf_train_ap = []
eo_xgb_train_ap = []
eo_mlp_test_ap = []
eo_rf_test_ap = []
eo_xgb_test_ap = []

ao_mlp_train_ap = []
ao_rf_train_ap = []
ao_xgb_train_ap = []
ao_mlp_test_ap = []
ao_rf_test_ap = []
ao_xgb_test_ap = []

# Precision

sp_mlp_train_prec = []
sp_rf_train_prec = []
sp_xgb_train_prec = []
sp_mlp_test_prec = []
sp_rf_test_prec = []
sp_xgb_test_prec = []

eo_mlp_train_prec = []
eo_rf_train_prec = []
eo_xgb_train_prec = []
eo_mlp_test_prec = []
eo_rf_test_prec = []
eo_xgb_test_prec = []

ao_mlp_train_prec = []
ao_rf_train_prec = []
ao_xgb_train_prec = []
ao_mlp_test_prec = []
ao_rf_test_prec = []
ao_xgb_test_prec = []

# Recall

sp_mlp_train_rec = []
sp_rf_train_rec = []
sp_xgb_train_rec = []
sp_mlp_test_rec = []
sp_rf_test_rec = []
sp_xgb_test_rec = []

eo_mlp_train_rec = []
eo_rf_train_rec = []
eo_xgb_train_rec = []
eo_mlp_test_rec = []
eo_rf_test_rec = []
eo_xgb_test_rec = []

ao_mlp_train_rec = []
ao_rf_train_rec = []
ao_xgb_train_rec = []
ao_mlp_test_rec = []
ao_rf_test_rec = []
ao_xgb_test_rec = []

# F1 Score

sp_mlp_train_f1 = []
sp_rf_train_f1 = []
sp_xgb_train_f1 = []
sp_mlp_test_f1 = []
sp_rf_test_f1 = []
sp_xgb_test_f1 = []

eo_mlp_train_f1 = []
eo_rf_train_f1 = []
eo_xgb_train_f1 = []
eo_mlp_test_f1 = []
eo_rf_test_f1 = []
eo_xgb_test_f1 = []

ao_mlp_train_f1 = []
ao_rf_train_f1 = []
ao_xgb_train_f1 = []
ao_mlp_test_f1 = []
ao_rf_test_f1 = []
ao_xgb_test_f1 = []

# Accuracy

sp_mlp_train_acc = []
sp_rf_train_acc = []
sp_xgb_train_acc = []
sp_mlp_test_acc = []
sp_rf_test_acc = []
sp_xgb_test_acc = []

eo_mlp_train_acc = []
eo_rf_train_acc = []
eo_xgb_train_acc = []
eo_mlp_test_acc = []
eo_rf_test_acc = []
eo_xgb_test_acc = []

ao_mlp_train_acc = []
ao_rf_train_acc = []
ao_xgb_train_acc = []
ao_mlp_test_acc = []
ao_rf_test_acc = []
ao_xgb_test_acc = []

# MCC
sp_mlp_train_mcc = []
sp_rf_train_mcc = []
sp_xgb_train_mcc = []
sp_mlp_test_mcc = []
sp_rf_test_mcc = []
sp_xgb_test_mcc = []

eo_mlp_train_mcc = []
eo_rf_train_mcc = []
eo_xgb_train_mcc = []
eo_mlp_test_mcc = []
eo_rf_test_mcc = []
eo_xgb_test_mcc = []

ao_mlp_train_mcc = []
ao_rf_train_mcc = []
ao_xgb_train_mcc = []
ao_mlp_test_mcc = []
ao_rf_test_mcc = []
ao_xgb_test_mcc = []

kfold = KFold(n_splits=24, shuffle=True, random_state=42)
for itrain, itest in kfold.split(data.index.unique().values):
    train_index = data.index.unique()[itrain]
    test_index = data.index.unique()[itest]
    print(train_index)
    print(test_index)
    train = data.loc[train_index].drop(columns='pos_prob')
    test = data.loc[test_index].drop(columns='pos_prob')

    print("Train size: ", train.shape[0])
    print("Test size: ", test.shape[0])

    # Statistical Parity

    mlp_sp = load('../../ml_models/bias_symptoms_logreg_reduced_mlp_statistical_parity_nodsp.joblib')
    rf_sp = load('../../ml_models/bias_symptoms_logreg_reduced_rf_statistical_parity_nodsp.joblib')
    xgb_sp = load('../../ml_models/bias_symptoms_logreg_reduced_xgb_statistical_parity_nodsp.joblib')

    evaluate_methods(
        train, 
        test, 
        'statistical_parity', 
        sp_mlp_test_ap, 
        sp_rf_test_ap, 
        sp_xgb_test_ap, 
        sp_mlp_test_prec, 
        sp_rf_test_prec, 
        sp_xgb_test_prec, 
        sp_mlp_test_rec, 
        sp_rf_test_rec, 
        sp_xgb_test_rec, 
        sp_mlp_test_f1, 
        sp_rf_test_f1, 
        sp_xgb_test_f1, 
        sp_mlp_test_acc, 
        sp_rf_test_acc, 
        sp_xgb_test_acc,
        mlp_sp,
        rf_sp,
        xgb_sp
      )

    # Equal Opportunity

    mlp_eo = load('../../ml_models/bias_symptoms_logreg_reduced_mlp_equal_opportunity_nodsp.joblib')
    rf_eo = load('../../ml_models/bias_symptoms_logreg_reduced_rf_equal_opportunity_nodsp.joblib')
    xgb_eo = load('../../ml_models/bias_symptoms_logreg_reduced_xgb_equal_opportunity_nodsp.joblib')

    evaluate_methods(
        train,
        test,
        "equal_opportunity",
        eo_mlp_test_ap,
        eo_rf_test_ap,
        eo_xgb_test_ap,
        eo_mlp_test_prec,
        eo_rf_test_prec,
        eo_xgb_test_prec,
        eo_mlp_test_rec,
        eo_rf_test_rec,
        eo_xgb_test_rec,
        eo_mlp_test_f1,
        eo_rf_test_f1,
        eo_xgb_test_f1,
        eo_mlp_test_acc,
        eo_rf_test_acc,
        eo_xgb_test_acc,
        mlp_eo,
        rf_eo,
        xgb_eo
    )

    # Average Odds

    mlp_ao = load('../../ml_models/bias_symptoms_logreg_reduced_mlp_average_odds_nodps.joblib')
    rf_ao = load('../../ml_models/bias_symptoms_logreg_reduced_rf_average_odds_nodsp.joblib')
    xgb_ao = load('../../ml_models/bias_symptoms_logreg_reduced_xgb_average_odds_nodsp.joblib')

    evaluate_methods(
        train,
        test,
        "average_odds",
        ao_mlp_test_ap,
        ao_rf_test_ap,
        ao_xgb_test_ap,
        ao_mlp_test_prec,
        ao_rf_test_prec,
        ao_xgb_test_prec,
        ao_mlp_test_rec,
        ao_rf_test_rec,
        ao_xgb_test_rec,
        ao_mlp_test_f1,
        ao_rf_test_f1,
        ao_xgb_test_f1,
        ao_mlp_test_acc,
        ao_rf_test_acc,
        ao_xgb_test_acc,
        mlp_ao,
        rf_ao,
        xgb_ao
    )

  

Index(['arrhythmia', 'german', 'student', 'adult', 'park', 'wine', 'placement',
       'hearth', 'resyduo', 'us', 'cmc', 'law', 'medical', 'vaccine', 'compas',
       'obesity', 'ricci', 'credit', 'bank', 'health', 'diabetic', 'crime',
       'ibm'],
      dtype='object', name='data')
Index(['drug'], dtype='object', name='data')
Train size:  5910
Test size:  20


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Index(['arrhythmia', 'german', 'student', 'adult', 'park', 'wine', 'placement',
       'hearth', 'drug', 'resyduo', 'us', 'cmc', 'law', 'medical', 'vaccine',
       'compas', 'ricci', 'credit', 'bank', 'health', 'diabetic', 'crime',
       'ibm'],
      dtype='object', name='data')
Index(['obesity'], dtype='object', name='data')
Train size:  5870
Test size:  60
Index(['german', 'student', 'adult', 'park', 'wine', 'placement', 'hearth',
       'drug', 'resyduo', 'us', 'cmc', 'law', 'medical', 'vaccine', 'compas',
       'obesity', 'ricci', 'credit', 'bank', 'health', 'diabetic', 'crime',
       'ibm'],
      dtype='object', name='data')
Index(['arrhythmia'], dtype='object', name='data')
Train size:  5612
Test size:  318
Index(['arrhythmia', 'german', 'student', 'adult', 'park', 'wine', 'placement',
       'hearth', 'drug', 'resyduo', 'us', 'cmc', 'law', 'medical', 'vaccine',
       'compas', 'obesity', 'ricci', 'bank', 'health', 'diabetic', 'crime',
       'ibm'],
      dtype='object', 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Index(['arrhythmia', 'german', 'student', 'adult', 'park', 'wine', 'placement',
       'hearth', 'drug', 'resyduo', 'us', 'law', 'medical', 'vaccine',
       'compas', 'obesity', 'ricci', 'credit', 'bank', 'health', 'diabetic',
       'crime', 'ibm'],
      dtype='object', name='data')
Index(['cmc'], dtype='object', name='data')
Train size:  5900
Test size:  30
Index(['arrhythmia', 'german', 'student', 'adult', 'park', 'wine', 'placement',
       'hearth', 'drug', 'us', 'cmc', 'law', 'medical', 'vaccine', 'compas',
       'obesity', 'ricci', 'credit', 'bank', 'health', 'diabetic', 'crime',
       'ibm'],
      dtype='object', name='data')
Index(['resyduo'], dtype='object', name='data')
Train size:  5920
Test size:  10


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Index(['arrhythmia', 'german', 'student', 'adult', 'park', 'wine', 'placement',
       'hearth', 'drug', 'resyduo', 'us', 'cmc', 'law', 'vaccine', 'compas',
       'obesity', 'ricci', 'credit', 'bank', 'health', 'diabetic', 'crime',
       'ibm'],
      dtype='object', name='data')
Index(['medical'], dtype='object', name='data')
Train size:  5869
Test size:  61
Index(['arrhythmia', 'student', 'adult', 'park', 'wine', 'placement', 'hearth',
       'drug', 'resyduo', 'us', 'cmc', 'law', 'medical', 'vaccine', 'compas',
       'obesity', 'ricci', 'credit', 'bank', 'health', 'diabetic', 'crime',
       'ibm'],
      dtype='object', name='data')
Index(['german'], dtype='object', name='data')
Train size:  5412
Test size:  518
Index(['arrhythmia', 'german', 'student', 'adult', 'park', 'wine', 'placement',
       'hearth', 'drug', 'resyduo', 'us', 'cmc', 'law', 'medical', 'vaccine',
       'compas', 'obesity', 'ricci', 'credit', 'bank', 'health', 'crime',
       'ibm'],
      dtype='object', na

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Index(['arrhythmia', 'german', 'adult', 'park', 'wine', 'placement', 'hearth',
       'drug', 'resyduo', 'us', 'cmc', 'law', 'medical', 'vaccine', 'compas',
       'obesity', 'ricci', 'credit', 'bank', 'health', 'diabetic', 'crime',
       'ibm'],
      dtype='object', name='data')
Index(['student'], dtype='object', name='data')
Train size:  5621
Test size:  309
Index(['arrhythmia', 'german', 'student', 'adult', 'park', 'wine', 'placement',
       'hearth', 'drug', 'resyduo', 'us', 'cmc', 'medical', 'vaccine',
       'compas', 'obesity', 'ricci', 'credit', 'bank', 'health', 'diabetic',
       'crime', 'ibm'],
      dtype='object', name='data')
Index(['law'], dtype='object', name='data')
Train size:  5880
Test size:  50


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Index(['arrhythmia', 'german', 'student', 'adult', 'park', 'wine', 'placement',
       'hearth', 'drug', 'resyduo', 'us', 'cmc', 'law', 'medical', 'vaccine',
       'obesity', 'ricci', 'credit', 'bank', 'health', 'diabetic', 'crime',
       'ibm'],
      dtype='object', name='data')
Index(['compas'], dtype='object', name='data')
Train size:  4559
Test size:  1371
Index(['arrhythmia', 'german', 'student', 'park', 'wine', 'placement',
       'hearth', 'drug', 'resyduo', 'us', 'cmc', 'law', 'medical', 'vaccine',
       'compas', 'obesity', 'ricci', 'credit', 'bank', 'health', 'diabetic',
       'crime', 'ibm'],
      dtype='object', name='data')
Index(['adult'], dtype='object', name='data')
Train size:  5028
Test size:  902
Index(['arrhythmia', 'german', 'student', 'adult', 'wine', 'placement',
       'hearth', 'drug', 'resyduo', 'us', 'cmc', 'law', 'medical', 'vaccine',
       'compas', 'obesity', 'ricci', 'credit', 'bank', 'health', 'diabetic',
       'crime', 'ibm'],
      dtype='objec

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Index(['arrhythmia', 'german', 'student', 'adult', 'park', 'wine', 'placement',
       'hearth', 'drug', 'resyduo', 'us', 'cmc', 'law', 'medical', 'vaccine',
       'compas', 'obesity', 'ricci', 'credit', 'bank', 'health', 'diabetic',
       'ibm'],
      dtype='object', name='data')
Index(['crime'], dtype='object', name='data')
Train size:  5909
Test size:  21
Index(['arrhythmia', 'german', 'student', 'adult', 'park', 'wine', 'placement',
       'hearth', 'drug', 'resyduo', 'us', 'cmc', 'law', 'medical', 'vaccine',
       'compas', 'obesity', 'credit', 'bank', 'health', 'diabetic', 'crime',
       'ibm'],
      dtype='object', name='data')
Index(['ricci'], dtype='object', name='data')
Train size:  5883
Test size:  47
Index(['arrhythmia', 'german', 'student', 'adult', 'park', 'wine', 'placement',
       'hearth', 'drug', 'resyduo', 'us', 'cmc', 'law', 'medical', 'vaccine',
       'compas', 'obesity', 'ricci', 'credit', 'bank', 'diabetic', 'crime',
       'ibm'],
      dtype='object', n

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Index(['arrhythmia', 'german', 'student', 'adult', 'park', 'wine', 'placement',
       'hearth', 'drug', 'resyduo', 'us', 'cmc', 'law', 'medical', 'vaccine',
       'compas', 'obesity', 'ricci', 'credit', 'bank', 'health', 'diabetic',
       'crime'],
      dtype='object', name='data')
Index(['ibm'], dtype='object', name='data')
Train size:  5640
Test size:  290
Index(['arrhythmia', 'german', 'student', 'adult', 'park', 'wine', 'placement',
       'drug', 'resyduo', 'us', 'cmc', 'law', 'medical', 'vaccine', 'compas',
       'obesity', 'ricci', 'credit', 'bank', 'health', 'diabetic', 'crime',
       'ibm'],
      dtype='object', name='data')
Index(['hearth'], dtype='object', name='data')
Train size:  5892
Test size:  38
Index(['arrhythmia', 'german', 'student', 'adult', 'park', 'wine', 'placement',
       'hearth', 'drug', 'resyduo', 'cmc', 'law', 'medical', 'vaccine',
       'compas', 'obesity', 'ricci', 'credit', 'bank', 'health', 'diabetic',
       'crime', 'ibm'],
      dtype='objec

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Index(['arrhythmia', 'german', 'student', 'adult', 'park', 'wine', 'placement',
       'hearth', 'drug', 'resyduo', 'us', 'cmc', 'law', 'medical', 'compas',
       'obesity', 'ricci', 'credit', 'bank', 'health', 'diabetic', 'crime',
       'ibm'],
      dtype='object', name='data')
Index(['vaccine'], dtype='object', name='data')
Train size:  5720
Test size:  210
Index(['arrhythmia', 'german', 'student', 'adult', 'park', 'wine', 'placement',
       'hearth', 'drug', 'resyduo', 'us', 'cmc', 'law', 'medical', 'vaccine',
       'compas', 'obesity', 'ricci', 'credit', 'health', 'diabetic', 'crime',
       'ibm'],
      dtype='object', name='data')
Index(['bank'], dtype='object', name='data')
Train size:  5490
Test size:  440


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Index(['arrhythmia', 'german', 'student', 'adult', 'park', 'wine', 'hearth',
       'drug', 'resyduo', 'us', 'cmc', 'law', 'medical', 'vaccine', 'compas',
       'obesity', 'ricci', 'credit', 'bank', 'health', 'diabetic', 'crime',
       'ibm'],
      dtype='object', name='data')
Index(['placement'], dtype='object', name='data')
Train size:  5775
Test size:  155


In [32]:
sp_data_full_xgb = pd.DataFrame({
    'accuracy': sp_xgb_test_acc,
    'precision': sp_xgb_test_prec,
    'recall': sp_xgb_test_rec,
    'f1': sp_xgb_test_f1,
})
sp_data_full_xgb['model'] = 'XGBoost'
sp_data_full_rf = pd.DataFrame({
    'accuracy': sp_rf_test_acc,
    'precision': sp_rf_test_prec,
    'recall': sp_rf_test_rec,
    'f1': sp_rf_test_f1,
})
sp_data_full_rf['model'] = 'Random Forest'
sp_data_full_mlp = pd.DataFrame({
    'accuracy': sp_mlp_test_acc,
    'precision': sp_mlp_test_prec,
    'recall': sp_mlp_test_rec,
    'f1': sp_mlp_test_f1,
}) 
sp_data_full_mlp['model'] = 'MLP'
sp_data_full = pd.concat([sp_data_full_xgb, sp_data_full_rf, sp_data_full_mlp])

eo_data_full_xgb = pd.DataFrame({
    'accuracy': eo_xgb_test_acc,
    'precision': eo_xgb_test_prec,
    'recall': eo_xgb_test_rec,
    'f1': eo_xgb_test_f1,
})
eo_data_full_xgb['model'] = 'XGBoost'
eo_data_full_rf = pd.DataFrame({
    'accuracy': eo_rf_test_acc,
    'precision': eo_rf_test_prec,
    'recall': eo_rf_test_rec,
    'f1': eo_rf_test_f1,
})
eo_data_full_rf['model'] = 'Random Forest'
eo_data_full_mlp = pd.DataFrame({
    'accuracy': eo_mlp_test_acc,
    'precision': eo_mlp_test_prec,
    'recall': eo_mlp_test_rec,
    'f1': eo_mlp_test_f1,
})
eo_data_full_mlp['model'] = 'MLP'
eo_data_full = pd.concat([eo_data_full_xgb, eo_data_full_rf, eo_data_full_mlp])

ao_data_full_xgb = pd.DataFrame({
    'accuracy': ao_xgb_test_acc,
    'precision': ao_xgb_test_prec,
    'recall': ao_xgb_test_rec,
    'f1': ao_xgb_test_f1,
})
ao_data_full_xgb['model'] = 'XGBoost'
ao_data_full_rf = pd.DataFrame({
    'accuracy': ao_rf_test_acc,
    'precision': ao_rf_test_prec,
    'recall': ao_rf_test_rec,
    'f1': ao_rf_test_f1,
})
ao_data_full_rf['model'] = 'Random Forest'
ao_data_full_mlp = pd.DataFrame({
    'accuracy': ao_mlp_test_acc,
    'precision': ao_mlp_test_prec,
    'recall': ao_mlp_test_rec,
    'f1': ao_mlp_test_f1,
})
ao_data_full_mlp['model'] = 'MLP'
ao_data_full = pd.concat([ao_data_full_xgb, ao_data_full_rf, ao_data_full_mlp])

In [33]:
def group_metrics(data):
    return data.groupby('model').agg(['mean', 'std']).round(3).reset_index()

In [34]:
sp = group_metrics(sp_data_full)
eo = group_metrics(eo_data_full)
ao = group_metrics(ao_data_full)
full_data = pd.concat([sp_data_full, eo_data_full, ao_data_full])
full = group_metrics(full_data)
sp.to_csv("bias_symptoms_sp.csv")
eo.to_csv("bias_symptoms_eo.csv")
ao.to_csv("bias_symptoms_ao.csv")
full.to_csv("bias_symptoms_full.csv")

In [35]:
sp

Unnamed: 0_level_0,model,accuracy,accuracy,precision,precision,recall,recall,f1,f1
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std
0,MLP,0.791,0.187,0.565,0.36,0.726,0.375,0.567,0.348
1,Random Forest,0.784,0.172,0.592,0.367,0.594,0.383,0.55,0.335
2,XGBoost,0.778,0.184,0.571,0.363,0.62,0.368,0.543,0.332


## Accuracy of DSP and SP

In [21]:
import numpy as np

In [15]:
data = pd.read_csv('../data/bias_symptoms_logreg_reduced.csv', index_col=[0,1])

In [20]:
acc_list = []
prec_list = []
rec_list = []
f1_list = []
for dataset in data.index.get_level_values(0).unique():
    print(f"Dataset: {dataset}")
    dataset_data = data.xs(dataset, level=0)
    y_true = dataset_data['statistical_parity']
    y_pred = dataset_data['pos_prob'].apply(lambda x: 1 if x > 0.5 else 0)
    
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    
    acc_list.append(acc)
    prec_list.append(prec)
    rec_list.append(rec)
    f1_list.append(f1)

    print(f"Accuracy: {acc:.4f}, F1 Score: {f1:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}")

Dataset: arrhythmia
Accuracy: 0.5912, F1 Score: 0.7111, Precision: 0.9816, Recall: 0.5575
Dataset: german
Accuracy: 0.5946, F1 Score: 0.2045, Precision: 1.0000, Recall: 0.1139
Dataset: student
Accuracy: 0.8252, F1 Score: 0.0690, Precision: 1.0000, Recall: 0.0357
Dataset: adult
Accuracy: 0.6386, F1 Score: 0.1809, Precision: 0.8372, Recall: 0.1014
Dataset: park
Accuracy: 0.2500, F1 Score: 0.0000, Precision: 0.0000, Recall: 0.0000
Dataset: wine
Accuracy: 0.3000, F1 Score: 0.0000, Precision: 0.0000, Recall: 0.0000
Dataset: placement
Accuracy: 0.6516, F1 Score: 0.2703, Precision: 0.9091, Recall: 0.1587
Dataset: hearth
Accuracy: 0.5263, F1 Score: 0.4706, Precision: 1.0000, Recall: 0.3077
Dataset: drug
Accuracy: 0.2500, F1 Score: 0.0000, Precision: 0.0000, Recall: 0.0000
Dataset: resyduo
Accuracy: 1.0000, F1 Score: 0.0000, Precision: 0.0000, Recall: 0.0000
Dataset: us
Accuracy: 0.8667, F1 Score: 0.0000, Precision: 0.0000, Recall: 0.0000
Dataset: cmc
Accuracy: 0.5667, F1 Score: 0.0000, Precisi

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [22]:
np.mean(acc_list), np.mean(prec_list), np.mean(rec_list), np.mean(f1_list)
print(f"Mean Accuracy: {np.mean(acc_list):.4f}, Mean F1 Score: {np.mean(f1_list):.4f}, Mean Precision: {np.mean(prec_list):.4f}, Mean Recall: {np.mean(rec_list):.4f}")

Mean Accuracy: 0.6007, Mean F1 Score: 0.1629, Mean Precision: 0.4352, Mean Recall: 0.1264


In [23]:
np.std(acc_list), np.std(prec_list), np.std(rec_list), np.std(f1_list)
print(f"Std Accuracy: {np.std(acc_list):.4f}, Std F1 Score: {np.std(f1_list):.4f}, Std Precision: {np.std(prec_list):.4f}, Std Recall: {np.std(rec_list):.4f}")

Std Accuracy: 0.3100, Std F1 Score: 0.2381, Std Precision: 0.4555, Std Recall: 0.2291


In [24]:
len(acc_list)

24

In [None]:
from pingouin import wilcoxon

In [43]:
def get_wilcoxon(model_acc, model_prec, model_rec, model_f1, dsp_acc, dsp_prec, dsp_rec, dsp_f1):
    acc_wilcoxon = wilcoxon(model_acc, dsp_acc)[['p-val', 'CLES']]
    prec_wilcoxon = wilcoxon(model_prec, dsp_prec)[['p-val', 'CLES']]
    rec_wilcoxon = wilcoxon(model_rec, dsp_rec)[['p-val', 'CLES']]
    f1_wilcoxon = wilcoxon(model_f1, dsp_f1)[['p-val', 'CLES']]
    
    return pd.concat([acc_wilcoxon, prec_wilcoxon, rec_wilcoxon, f1_wilcoxon], axis=0)

In [44]:
get_wilcoxon(sp_xgb_test_acc, sp_xgb_test_prec, sp_xgb_test_rec, sp_xgb_test_f1, acc_list, prec_list, rec_list, f1_list)

Unnamed: 0,p-val,CLES
Wilcoxon,0.037213,0.661458
Wilcoxon,0.337877,0.574653
Wilcoxon,0.000111,0.827257
Wilcoxon,0.000166,0.802083


In [45]:
get_wilcoxon(sp_rf_test_acc, sp_rf_test_prec, sp_rf_test_rec, sp_rf_test_f1, acc_list, prec_list, rec_list, f1_list)

Unnamed: 0,p-val,CLES
Wilcoxon,0.014349,0.681424
Wilcoxon,0.258275,0.577257
Wilcoxon,0.000262,0.802951
Wilcoxon,0.000344,0.792535


In [46]:
get_wilcoxon(sp_mlp_test_acc, sp_mlp_test_prec, sp_mlp_test_rec, sp_mlp_test_f1, acc_list, prec_list, rec_list, f1_list)

Unnamed: 0,p-val,CLES
Wilcoxon,0.027448,0.680556
Wilcoxon,0.298819,0.586806
Wilcoxon,9.7e-05,0.858507
Wilcoxon,0.000146,0.820312
