In [None]:
import numpy as np
import pandas as pd
import aif360
from aif360.datasets import BinaryLabelDataset
from aif360.algorithms.preprocessing.reweighing import Reweighing
from sklearn.metrics import accuracy_score
from sklearn.utils import class_weight

from xgboost import XGBClassifier

######################## Project location ########################
## The project directory defaults to the original author's checkout. To run the
## notebook elsewhere, set the EFFI_PROJECT_DIR environment variable (or edit the
## default below); every other path in this cell is derived from `folder`.
import os
import sys

folder = os.environ.get("EFFI_PROJECT_DIR") or "C:\\Users\\evdoxiataka\\codes\\effi_user_study_1\\"
## other cells build file names as `folder + 'data/...'`, so `folder` must end with a separator
if not folder.endswith(("\\", "/")):
    folder += os.sep

## the code directory sits inside the project folder; reuse folder's own trailing
## separator so the default value is exactly the path that was hard-coded before
code_path = folder + "code" + folder[-1]
sys.path.insert(1, code_path)

## project-local helpers; importable only once code_path is on sys.path
from utils.fairness_metrics import demographic_parity_ratio_, equal_opportunity_difference_, average_odds_difference_
## NOTE(review): star import kept because the notebook uses names that are not defined
## in any cell (e.g. manipulate_categ_values, binning) - presumably they come from here; confirm
from utils.utils import *

## column names of the group-fairness metrics read from the baseline results file
group_fair = ['DemographicParityRatio',
              'EqualOpportunityDifference',
              'AverageOddsDifference']

## features for which the fairness metrics are reported
protected_attributes = ['CODE_GENDER',
                       'NAME_FAMILY_STATUS',
                       'AGE']

In [None]:
## processed test files; the identifier column is removed from the "final" one
test_df = pd.read_csv(folder+'data/processed_data/test_df.csv', sep=',')
test_df_final = pd.read_csv(folder+'data/processed_data/test_df_final.csv', sep=',')
is_id_column = test_df_final.columns.isin(['SK_ID_CURR'])
test_df_final = test_df_final.loc[:, ~is_id_column]

## the two splits of the training data used for fitting and evaluating the models
train_df_train = pd.read_csv(folder+'data/processed_data/train_df_train.csv', sep=',')
train_df_test = pd.read_csv(folder+'data/processed_data/train_df_test.csv', sep=',')

## features = every column except the label and the identifier; label = TARGET only
non_feature_columns = ['TARGET', 'SK_ID_CURR']
is_feature_train = ~train_df_train.columns.isin(non_feature_columns)
is_feature_test = ~train_df_test.columns.isin(non_feature_columns)

X_train_original = train_df_train.loc[:, is_feature_train]
y_train_original = train_df_train.loc[:, train_df_train.columns.isin(['TARGET'])]
X_test_original = train_df_test.loc[:, is_feature_test]
y_test_original = train_df_test.loc[:, train_df_test.columns.isin(['TARGET'])]

## load the original (unsplit) training table; the rows matching each split are
## re-selected from it below and later used for the fairness metrics
train_df = pd.read_csv(folder+'data/processed_data/train_df.csv', delimiter=',')


def _rows_for_ids(source_df, app_ids):
    """Return the rows of `source_df` whose SK_ID_CURR is in `app_ids`, in the order of `app_ids`.

    The row labels that `source_df` had are kept as the index of the result.
    A KeyError is raised by `.loc` if an id in `app_ids` is absent from `source_df`.
    """
    matching = source_df[source_df['SK_ID_CURR'].isin(app_ids)]
    return (
        matching
        .reset_index()              # park the current row labels in an 'index' column
        .set_index('SK_ID_CURR')
        .loc[app_ids]               # reorder the rows to follow app_ids
        .reset_index()
        .set_index('index')         # restore the parked row labels
        .rename_axis(None)
    )


### train_df_train
train_df_app_ids = train_df_train['SK_ID_CURR'].tolist()
train_df_train_or = _rows_for_ids(train_df, train_df_app_ids)
## return value is unused - presumably the helper edits the frame in place; confirm in utils
manipulate_categ_values(train_df_train_or)
train_df_train_bin = binning(train_df_train_or, train_df_train)

### train_df_test
test_df_app_ids = train_df_test['SK_ID_CURR'].tolist()
train_df_test_or = _rows_for_ids(train_df, test_df_app_ids)
manipulate_categ_values(train_df_test_or)
train_df_test_bin = binning(train_df_test_or, train_df_test)

## one weight per training row, computed from the TARGET labels with sklearn's 'balanced' scheme
classes_weights = class_weight.compute_sample_weight('balanced', y_train_original)

## Get baseline metrics: rows of the global results file with iteration == 0,
## restricted to the protected attributes and the group-fairness columns
baseline = pd.read_csv(folder+'data/results/global/Labels/group_fairness_global-Labels.csv', delimiter=',')
baseline = baseline[baseline['iteration']==0]
baseline = baseline[baseline['Feature'].isin(protected_attributes)]
baseline = baseline.loc[:,baseline.columns.isin(['Feature']+group_fair)]


def _baseline_metric(feature, metric):
    """Return the value of column `metric` in the first `baseline` row whose Feature is `feature`.

    Raises IndexError when no such row exists - the same exception type the bare
    `.tolist()[0]` lookup produced, but with a message naming what is missing.
    """
    matches = baseline[baseline['Feature']==feature][metric].tolist()
    if not matches:
        raise IndexError("no baseline row (iteration 0) for feature %r, metric %r" % (feature, metric))
    return matches[0]


dpr_age_base = _baseline_metric('AGE', 'DemographicParityRatio')
dpr_gender_base = _baseline_metric('CODE_GENDER', 'DemographicParityRatio')
dpr_maritstat_base = _baseline_metric('NAME_FAMILY_STATUS', 'DemographicParityRatio')

eod_age_base = _baseline_metric('AGE', 'EqualOpportunityDifference')
eod_gender_base = _baseline_metric('CODE_GENDER', 'EqualOpportunityDifference')
eod_maritstat_base = _baseline_metric('NAME_FAMILY_STATUS', 'EqualOpportunityDifference')

aod_age_base = _baseline_metric('AGE', 'AverageOddsDifference')
aod_gender_base = _baseline_metric('CODE_GENDER', 'AverageOddsDifference')
aod_maritstat_base = _baseline_metric('NAME_FAMILY_STATUS', 'AverageOddsDifference')

print('baseline', 'DPR GENDER', dpr_gender_base, 'DPR Marit Stat', dpr_maritstat_base, 'DPR AGE', dpr_age_base, 
      'EOD GENDER', eod_gender_base, 'EOD Marit Stat', eod_maritstat_base, 'EOD AGE', eod_age_base, 
      'AOD GENDER', aod_gender_base, 'AOD Marit Stat', aod_maritstat_base, 'AOD AGE', aod_age_base)

In [None]:
# Show the baseline fairness values as alternating "label value" pairs on one line.
baseline_labels = ('DPR GENDER', 'DPR Marit Stat', 'DPR AGE',
                   'EOD GENDER', 'EOD Marit Stat', 'EOD AGE',
                   'AOD GENDER', 'AOD Marit Stat', 'AOD AGE')
baseline_values = (dpr_gender_base, dpr_maritstat_base, dpr_age_base,
                   eod_gender_base, eod_maritstat_base, eod_age_base,
                   aod_gender_base, aod_maritstat_base, aod_age_base)
print('baseline', *[item for pair in zip(baseline_labels, baseline_values) for item in pair])

## GENDER

In [None]:
# Wrap the training split (identifier column excluded) in an aif360 dataset with
# TARGET as the label and CODE_GENDER_LE as the protected attribute.
train_without_id = train_df_train.loc[:, ~train_df_train.columns.isin(['SK_ID_CURR'])]
binaryLabelDataset = BinaryLabelDataset(
    df=train_without_id,
    label_names=['TARGET'],
    protected_attribute_names=['CODE_GENDER_LE'],
    favorable_label=1,
    unfavorable_label=0,
)

# Reweighing between CODE_GENDER_LE == 0 (unprivileged) and CODE_GENDER_LE == 1 (privileged).
unprivileged = [{'CODE_GENDER_LE': 0}]
privileged = [{'CODE_GENDER_LE': 1}]
RW = Reweighing(unprivileged_groups=unprivileged, privileged_groups=privileged)
RW.fit(binaryLabelDataset)
dataset_transf_train = RW.transform(binaryLabelDataset)

# Instance weights of the transformed dataset; multiplied into the model's sample weights later.
gender_transf_weights = dataset_transf_train.instance_weights

In [None]:
# XGBoost classifier; each training row is weighted by the product of its
# class-balancing weight and its gender reweighing weight.
model_g = XGBClassifier(random_state=15, eta=0.3)
model_g.fit(
    X_train_original,
    y_train_original,
    sample_weight=classes_weights * gender_transf_weights,
)

# Predict on the evaluation split and round every predicted value.
y_pred = model_g.predict(X_test_original)
predictions = list(map(round, y_pred))

# Accuracy against the evaluation labels.
accuracy_g = accuracy_score(y_test_original, predictions)
print('acc:', accuracy_g)

# Insert the predictions into a copy of the binned evaluation frame, so the
# shared frame itself is left untouched.
train_df_test_bin_ = train_df_test_bin.copy()
train_df_test_bin_.insert(loc=1, column="Predicted_Result", value=predictions)

# Fairness metrics, each evaluated for gender, marital status and age (in that order).
fairness_attrs = ('CODE_GENDER', 'NAME_FAMILY_STATUS', 'AGE')
dpr_gender_g, dpr_maritstat_g, dpr_age_g = [
    demographic_parity_ratio_(train_df_test_bin_, attr) for attr in fairness_attrs
]
eod_gender_g, eod_maritstat_g, eod_age_g = [
    equal_opportunity_difference_(train_df_test_bin_, attr) for attr in fairness_attrs
]
aod_gender_g, aod_maritstat_g, aod_age_g = [
    average_odds_difference_(train_df_test_bin_, attr) for attr in fairness_attrs
]

fairness_labels = ('DPR GENDER', 'DPR Marit Stat', 'DPR AGE',
                   'EOD GENDER', 'EOD Marit Stat', 'EOD AGE',
                   'AOD GENDER', 'AOD Marit Stat', 'AOD AGE')
fairness_values = (dpr_gender_g, dpr_maritstat_g, dpr_age_g,
                   eod_gender_g, eod_maritstat_g, eod_age_g,
                   aod_gender_g, aod_maritstat_g, aod_age_g)
fairness_bases = (dpr_gender_base, dpr_maritstat_base, dpr_age_base,
                  eod_gender_base, eod_maritstat_base, eod_age_base,
                  aod_gender_base, aod_maritstat_base, aod_age_base)

# Raw metric values as alternating "label value" pairs.
print('values', *[item for pair in zip(fairness_labels, fairness_values) for item in pair])

# Percentage change of every metric relative to its baseline value.
pct_changes = [((value - base) / base) * 100
               for value, base in zip(fairness_values, fairness_bases)]
print('perc changes', *[item for pair in zip(fairness_labels, pct_changes) for item in pair])

## MARITAL STATUS

In [None]:
# Wrap the training split (identifier column excluded) in an aif360 dataset with
# TARGET as the label and NAME_FAMILY_STATUS_LE as the protected attribute.
train_without_id = train_df_train.loc[:, ~train_df_train.columns.isin(['SK_ID_CURR'])]
binaryLabelDataset = BinaryLabelDataset(
    df=train_without_id,
    label_names=['TARGET'],
    protected_attribute_names=['NAME_FAMILY_STATUS_LE'],
    favorable_label=1,
    unfavorable_label=0,
)

# Reweighing between NAME_FAMILY_STATUS_LE == 1 (unprivileged) and == 4 (privileged).
unprivileged = [{'NAME_FAMILY_STATUS_LE': 1}]
privileged = [{'NAME_FAMILY_STATUS_LE': 4}]
RW = Reweighing(unprivileged_groups=unprivileged, privileged_groups=privileged)
RW.fit(binaryLabelDataset)
dataset_transf_train = RW.transform(binaryLabelDataset)

# Instance weights of the transformed dataset; multiplied into the model's sample weights later.
maritStat_transf_weights = dataset_transf_train.instance_weights

In [None]:
# XGBoost classifier; each training row is weighted by the product of its
# class-balancing weight and its marital-status reweighing weight.
model_m = XGBClassifier(random_state=15, eta=0.3)
model_m.fit(
    X_train_original,
    y_train_original,
    sample_weight=classes_weights * maritStat_transf_weights,
)

# Predict on the evaluation split and round every predicted value.
y_pred = model_m.predict(X_test_original)
predictions = list(map(round, y_pred))

# Accuracy against the evaluation labels.
accuracy_m = accuracy_score(y_test_original, predictions)
print('acc:', accuracy_m)

# Insert the predictions into a copy of the binned evaluation frame, so the
# shared frame itself is left untouched.
train_df_test_bin_ = train_df_test_bin.copy()
train_df_test_bin_.insert(loc=1, column="Predicted_Result", value=predictions)

# Fairness metrics, each evaluated for gender, marital status and age (in that order).
fairness_attrs = ('CODE_GENDER', 'NAME_FAMILY_STATUS', 'AGE')
dpr_gender_m, dpr_maritstat_m, dpr_age_m = [
    demographic_parity_ratio_(train_df_test_bin_, attr) for attr in fairness_attrs
]
eod_gender_m, eod_maritstat_m, eod_age_m = [
    equal_opportunity_difference_(train_df_test_bin_, attr) for attr in fairness_attrs
]
aod_gender_m, aod_maritstat_m, aod_age_m = [
    average_odds_difference_(train_df_test_bin_, attr) for attr in fairness_attrs
]

fairness_labels = ('DPR GENDER', 'DPR Marit Stat', 'DPR AGE',
                   'EOD GENDER', 'EOD Marit Stat', 'EOD AGE',
                   'AOD GENDER', 'AOD Marit Stat', 'AOD AGE')
fairness_values = (dpr_gender_m, dpr_maritstat_m, dpr_age_m,
                   eod_gender_m, eod_maritstat_m, eod_age_m,
                   aod_gender_m, aod_maritstat_m, aod_age_m)
fairness_bases = (dpr_gender_base, dpr_maritstat_base, dpr_age_base,
                  eod_gender_base, eod_maritstat_base, eod_age_base,
                  aod_gender_base, aod_maritstat_base, aod_age_base)

# Raw metric values as alternating "label value" pairs.
print('values', *[item for pair in zip(fairness_labels, fairness_values) for item in pair])

# Percentage change of every metric relative to its baseline value.
pct_changes = [((value - base) / base) * 100
               for value, base in zip(fairness_values, fairness_bases)]
print('perc changes', *[item for pair in zip(fairness_labels, pct_changes) for item in pair])

## AGE

In [None]:
# Binarise AGE for the reweighing step: 0 when AGE is below the threshold, 1 otherwise.
AGE_THRESHOLD = 40.

# .copy() makes train_age an independent frame: overwriting AGE below can never
# reach train_df_train, and pandas has no reason to emit SettingWithCopyWarning.
train_age = train_df_train.loc[:, ~train_df_train.columns.isin(['SK_ID_CURR'])].copy()

# Vectorised replacement for the row-by-row loop. As in the loop, every value that
# is not strictly below the threshold (NaN included) maps to 1. `.tolist()` keeps
# `ages` a plain list of Python ints, as before.
ages = np.where(train_age['AGE'] < AGE_THRESHOLD, 0, 1).tolist()
train_age['AGE'] = ages

In [None]:
# Wrap the age-binarised training frame in an aif360 dataset with TARGET as the
# label and AGE as the protected attribute.
binaryLabelDataset = BinaryLabelDataset(
    df=train_age,
    label_names=['TARGET'],
    protected_attribute_names=['AGE'],
    favorable_label=1,
    unfavorable_label=0,
)

# Reweighing between AGE == 0 (unprivileged) and AGE == 1 (privileged).
unprivileged = [{'AGE': 0}]
privileged = [{'AGE': 1}]
RW = Reweighing(unprivileged_groups=unprivileged, privileged_groups=privileged)
RW.fit(binaryLabelDataset)
dataset_transf_train = RW.transform(binaryLabelDataset)

# Instance weights of the transformed dataset; multiplied into the model's sample weights later.
age_transf_weights = dataset_transf_train.instance_weights

In [None]:
# XGBoost classifier; each training row is weighted by the product of its
# class-balancing weight and its age reweighing weight.
model_a = XGBClassifier(random_state=15, eta=0.3)
model_a.fit(
    X_train_original,
    y_train_original,
    sample_weight=classes_weights * age_transf_weights,
)

# Predict on the evaluation split and round every predicted value.
y_pred = model_a.predict(X_test_original)
predictions = list(map(round, y_pred))

# Accuracy against the evaluation labels.
accuracy_a = accuracy_score(y_test_original, predictions)
print('acc:', accuracy_a)

# Insert the predictions into a copy of the binned evaluation frame, so the
# shared frame itself is left untouched.
train_df_test_bin_ = train_df_test_bin.copy()
train_df_test_bin_.insert(loc=1, column="Predicted_Result", value=predictions)

# Fairness metrics, each evaluated for gender, marital status and age (in that order).
fairness_attrs = ('CODE_GENDER', 'NAME_FAMILY_STATUS', 'AGE')
dpr_gender_a, dpr_maritstat_a, dpr_age_a = [
    demographic_parity_ratio_(train_df_test_bin_, attr) for attr in fairness_attrs
]
eod_gender_a, eod_maritstat_a, eod_age_a = [
    equal_opportunity_difference_(train_df_test_bin_, attr) for attr in fairness_attrs
]
aod_gender_a, aod_maritstat_a, aod_age_a = [
    average_odds_difference_(train_df_test_bin_, attr) for attr in fairness_attrs
]

fairness_labels = ('DPR GENDER', 'DPR Marit Stat', 'DPR AGE',
                   'EOD GENDER', 'EOD Marit Stat', 'EOD AGE',
                   'AOD GENDER', 'AOD Marit Stat', 'AOD AGE')
fairness_values = (dpr_gender_a, dpr_maritstat_a, dpr_age_a,
                   eod_gender_a, eod_maritstat_a, eod_age_a,
                   aod_gender_a, aod_maritstat_a, aod_age_a)
fairness_bases = (dpr_gender_base, dpr_maritstat_base, dpr_age_base,
                  eod_gender_base, eod_maritstat_base, eod_age_base,
                  aod_gender_base, aod_maritstat_base, aod_age_base)

# Raw metric values as alternating "label value" pairs.
print('values', *[item for pair in zip(fairness_labels, fairness_values) for item in pair])

# Percentage change of every metric relative to its baseline value.
pct_changes = [((value - base) / base) * 100
               for value, base in zip(fairness_values, fairness_bases)]
print('perc changes', *[item for pair in zip(fairness_labels, pct_changes) for item in pair])