In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import aif360
from aif360.datasets import BinaryLabelDataset
from aif360.datasets import AdultDataset, GermanDataset, CompasDataset
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.metrics import ClassificationMetric
from aif360.algorithms.preprocessing.reweighing import Reweighing
from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions\
        import load_preproc_data_adult, load_preproc_data_german, load_preproc_data_compas
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.utils import class_weight

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from IPython.display import Markdown, display
# import matplotlib.pyplot as plt

from fairlearn.metrics import MetricFrame
from fairlearn.metrics import selection_rate

# --- Project-local imports ---------------------------------------------------
# Set `code_path` to the code directory inside the project folder so that the
# `utils` package imported below can be resolved.
code_path = ""
import sys  
# Insert at position 1 so that sys.path[0] keeps its place at the front.
sys.path.insert(1, code_path)

# Project helpers: the fairness metric called in the evaluation cells, plus
# everything exported by utils.utils (presumably the origin of
# `manipulate_categ_values` and `binning` used below — TODO confirm).
from utils.fairness_metrics import demographic_parity_ratio_
from utils.utils import *

# from common_utils import compute_metrics

In [None]:
def _split_xy(df):
    """Split ``df`` into (features, label frame).

    Features are every column except 'TARGET' and 'SK_ID_CURR'; the label
    frame is the single-column selection of 'TARGET'.
    """
    cols = df.columns
    features = df.loc[:, ~cols.isin(['TARGET', 'SK_ID_CURR'])]
    label = df.loc[:, cols == 'TARGET']
    return features, label


def _reorder_like(full_df, split_df):
    """Pick the rows of ``full_df`` whose SK_ID_CURR occurs in ``split_df``.

    The rows are returned in the order in which the IDs appear in
    ``split_df`` and keep the index labels they had in ``full_df``.
    Returns ``(reordered_frame, id_list)``.
    """
    ids = split_df['SK_ID_CURR'].tolist()
    matched = full_df[full_df['SK_ID_CURR'].isin(ids)]
    # Park the original index in an 'index' column while rows are looked up by ID.
    by_id = matched.reset_index().set_index('SK_ID_CURR')
    ordered = by_id.loc[ids]
    restored = ordered.reset_index().set_index('index').rename_axis(None)
    return restored, ids


# Test-side files; the ID column is removed from the final test frame.
test_df = pd.read_csv('test_df.csv', delimiter=',')
test_df_final = pd.read_csv('test_df_final.csv', delimiter=',')
non_id_columns = ~test_df_final.columns.isin(['SK_ID_CURR'])
test_df_final = test_df_final.loc[:, non_id_columns]

# Train / validation splits and their feature / label views.
train_df_train = pd.read_csv('train_df_train.csv', delimiter=',')
train_df_test = pd.read_csv('train_df_test.csv', delimiter=',')

X_train_original, y_train_original = _split_xy(train_df_train)
X_test_original, y_test_original = _split_xy(train_df_test)

# Original (un-split) training data, used when computing the fairness metrics.
train_df = pd.read_csv('train_df.csv', delimiter=',')

# Original rows matching the training split, in the split's row order.
train_df_train_or, lst = _reorder_like(train_df, train_df_train)
# Project helper; its return value is unused, so it presumably edits the frame
# in place — TODO confirm.
manipulate_categ_values(train_df_train_or)
train_df_train_bin = binning(train_df_train_or, train_df_train)

# Same treatment for the validation split.
train_df_test_or, lst = _reorder_like(train_df, train_df_test)
manipulate_categ_values(train_df_test_or)
train_df_test_bin = binning(train_df_test_or, train_df_test)

# Per-sample weights from scikit-learn's 'balanced' class weighting.
classes_weights = class_weight.compute_sample_weight('balanced', y_train_original)
# classes_weights

In [None]:
# Spot check: CODE_GENDER_LE value of the row with SK_ID_CURR 226040.
is_probe_row = train_df_train['SK_ID_CURR'] == 226040
train_df_train.loc[is_probe_row, 'CODE_GENDER_LE']

In [None]:
# Same spot check against the CODE_GENDER column of train_df.csv.
tr = pd.read_csv('train_df.csv', delimiter=',')
tr.loc[tr['SK_ID_CURR'] == 226040, 'CODE_GENDER']

In [None]:
# Put the rows of `tr` in the row order of the training split (matched on
# SK_ID_CURR, original index labels kept), then list the distinct AGE values.
lst = train_df_train['SK_ID_CURR'].tolist()
tr_by_id = tr.reset_index().set_index('SK_ID_CURR')
tr_in_split_order = tr_by_id.loc[lst]
tr_or = tr_in_split_order.reset_index().set_index('index').rename_axis(None)
tr_or['AGE'].unique()

In [None]:
# Inspect the training split as loaded from train_df_train.csv.
train_df_train

## GENDER

In [None]:
# AIF360 Reweighing with CODE_GENDER_LE as the protected attribute.
# Code 0 is declared unprivileged and code 1 privileged; which gender each
# code stands for is not visible in this notebook.
gender_attr = 'CODE_GENDER_LE'
gender_unprivileged = [{gender_attr: 0}]
gender_privileged = [{gender_attr: 1}]

# The dataset is built from every column except the applicant ID.
binaryLabelDataset = BinaryLabelDataset(
    df=train_df_train.loc[:, ~train_df_train.columns.isin(['SK_ID_CURR'])],
    label_names=['TARGET'],
    protected_attribute_names=[gender_attr],
    favorable_label=1,
    unfavorable_label=0,
)

RW = Reweighing(
    unprivileged_groups=gender_unprivileged,
    privileged_groups=gender_privileged,
)
RW.fit(binaryLabelDataset)
dataset_transf_train = RW.transform(binaryLabelDataset)

# Only the per-row instance weights of the transformed dataset are kept.
gender_transf_weights = dataset_transf_train.instance_weights

In [None]:
# Final sample weight = class-balancing weight x gender reweighing weight.
gender_sample_weights = classes_weights * gender_transf_weights

model_g = XGBClassifier(eta=0.3, random_state=15)
model_g.fit(X_train_original, y_train_original, sample_weight=gender_sample_weights)

# Predict on the validation split and round each prediction.
y_pred = model_g.predict(X_test_original)
predictions = list(map(round, y_pred))

# Accuracy on the validation split.
accuracy_g = accuracy_score(y_test_original, predictions)
print('acc:',accuracy_g)

# Attach the predictions to a copy of the binned validation frame, so that
# train_df_test_bin itself stays untouched for the other models.
train_df_test_bin_ = train_df_test_bin.copy()
train_df_test_bin_.insert(loc=1, column="Predicted_Result", value=predictions)

# Fairness metric for the gender attribute.
demographic_parity_ratio_(train_df_test_bin_, 'CODE_GENDER')

In [None]:
## Fairness metric for the NAME_FAMILY_STATUS attribute.
## train_df_test_bin_ holds the Predicted_Result column of whichever model cell
## ran last (the gender-reweighed model when the notebook is run top to bottom).
demographic_parity_ratio_(train_df_test_bin_, 'NAME_FAMILY_STATUS')

In [None]:
## Fairness metric for the AGE attribute.
## train_df_test_bin_ holds the Predicted_Result column of whichever model cell
## ran last (the gender-reweighed model when the notebook is run top to bottom).
demographic_parity_ratio_(train_df_test_bin_, 'AGE')

## MARITAL STATUS

In [None]:
# First rows of the NAME_FAMILY_STATUS_LE column; its codes 1 and 4 define the
# reweighing groups further down in this section.
train_df_train['NAME_FAMILY_STATUS_LE'].head()

In [None]:
# Show the encoded column and the original labels in the same cell.
# A bare expression that is not the last line of a cell is evaluated but never
# rendered, so the first series is passed to display() explicitly.
# NOTE(review): train_df_train and train_df are separate frames; their rows are
# not necessarily in the same order, so do not compare the two heads row by row.
display(train_df_train['NAME_FAMILY_STATUS_LE'].head())
train_df['NAME_FAMILY_STATUS'].head()

In [None]:
# AIF360 Reweighing with NAME_FAMILY_STATUS_LE as the protected attribute.
# Code 1 is declared unprivileged and code 4 privileged.
# NOTE(review): only these two codes are named as groups; confirm how rows with
# any other code are treated by Reweighing.
marital_attr = 'NAME_FAMILY_STATUS_LE'
marital_unprivileged = [{marital_attr: 1}]
marital_privileged = [{marital_attr: 4}]

# The dataset is built from every column except the applicant ID.
binaryLabelDataset = BinaryLabelDataset(
    df=train_df_train.loc[:, ~train_df_train.columns.isin(['SK_ID_CURR'])],
    label_names=['TARGET'],
    protected_attribute_names=[marital_attr],
    favorable_label=1,
    unfavorable_label=0,
)

RW = Reweighing(
    unprivileged_groups=marital_unprivileged,
    privileged_groups=marital_privileged,
)
RW.fit(binaryLabelDataset)
dataset_transf_train = RW.transform(binaryLabelDataset)

# Only the per-row instance weights of the transformed dataset are kept.
maritStat_transf_weights = dataset_transf_train.instance_weights

In [None]:
# Final sample weight = class-balancing weight x marital-status reweighing weight.
marital_sample_weights = classes_weights * maritStat_transf_weights

model_m = XGBClassifier(eta=0.3, random_state=15)
model_m.fit(X_train_original, y_train_original, sample_weight=marital_sample_weights)

# Predict on the validation split and round each prediction.
y_pred = model_m.predict(X_test_original)
predictions = list(map(round, y_pred))

# Accuracy on the validation split.
accuracy_m = accuracy_score(y_test_original, predictions)
print('acc:',accuracy_m)

# Attach the predictions to a copy of the binned validation frame, so that
# train_df_test_bin itself stays untouched for the other models.
train_df_test_bin_ = train_df_test_bin.copy()
train_df_test_bin_.insert(loc=1, column="Predicted_Result", value=predictions)

# Fairness metric for the gender attribute.
demographic_parity_ratio_(train_df_test_bin_, 'CODE_GENDER')

In [None]:
## Fairness metric for the NAME_FAMILY_STATUS attribute.
## train_df_test_bin_ holds the Predicted_Result column of whichever model cell
## ran last (the marital-status-reweighed model when the notebook is run top to bottom).
demographic_parity_ratio_(train_df_test_bin_, 'NAME_FAMILY_STATUS')

In [None]:
## Fairness metric for the AGE attribute.
## train_df_test_bin_ holds the Predicted_Result column of whichever model cell
## ran last (the marital-status-reweighed model when the notebook is run top to bottom).
demographic_parity_ratio_(train_df_test_bin_, 'AGE')

## AGE

In [None]:
# Age (same unit as the AGE column) that separates the two groups used for
# reweighing: below the threshold -> 0, otherwise -> 1.
AGE_THRESHOLD = 40.

# Work on an explicit copy so that overwriting AGE never touches
# train_df_train and does not raise pandas' SettingWithCopyWarning.
train_age = train_df_train.loc[:, ~train_df_train.columns.isin(['SK_ID_CURR'])].copy()

# Vectorised replacement for the former Python loop. np.where maps every row
# where `AGE < threshold` is False to 1, including missing ages, exactly as
# the loop's else-branch did.
ages = np.where(train_age['AGE'] < AGE_THRESHOLD, 0, 1)
train_age['AGE'] = ages

In [None]:
# AIF360 Reweighing with the binarised AGE column as the protected attribute
# (0 = below 40, unprivileged; 1 = 40 or above, privileged).
#
# The dataset must be built from `train_age`, the frame in which AGE was
# recoded to 0/1. Building it from `train_df_train` would hand the raw ages to
# Reweighing, so the groups {'AGE': 0} / {'AGE': 1} would not correspond to
# the intended age bands. `train_age` already excludes SK_ID_CURR and has the
# same row order as `train_df_train`, so the resulting weights line up with
# X_train_original.
binaryLabelDataset = aif360.datasets.BinaryLabelDataset(
    favorable_label=1,
    unfavorable_label=0,
    df=train_age,
    label_names=['TARGET'],
    protected_attribute_names=['AGE'])

RW = Reweighing(unprivileged_groups=[{'AGE':0}],
               privileged_groups=[{'AGE':1}])
RW.fit(binaryLabelDataset)
dataset_transf_train = RW.transform(binaryLabelDataset)

# Only the per-row instance weights of the transformed dataset are kept.
age_transf_weights = dataset_transf_train.instance_weights

In [None]:
# Train with sample weight = class-balancing weight x age reweighing weight.
model_a = XGBClassifier(random_state = 15, eta = 0.3)

model_a.fit(X_train_original, y_train_original, sample_weight=classes_weights*age_transf_weights)

# Predict on the validation split and round each prediction.
y_pred = model_a.predict(X_test_original)
predictions = [round(value) for value in y_pred]

# Accuracy on the validation split.
accuracy_a = accuracy_score(y_test_original, predictions)
print('acc:',accuracy_a)

# Attach the predictions to a copy of the binned validation frame, so that
# train_df_test_bin itself stays untouched for the other models.
train_df_test_bin_ = train_df_test_bin.copy()
train_df_test_bin_.insert(loc=1, column="Predicted_Result", value = predictions)

# Fairness metric for the gender attribute. This cell previously evaluated
# 'AGE', which the last cell of this section evaluates as well, so gender was
# never reported for the age-reweighed model. The other two model sections
# report gender, family status and age in that order.
demographic_parity_ratio_(train_df_test_bin_, 'CODE_GENDER')

In [None]:
## Fairness metric for the NAME_FAMILY_STATUS attribute.
## train_df_test_bin_ holds the Predicted_Result column of whichever model cell
## ran last (the age-reweighed model when the notebook is run top to bottom).
demographic_parity_ratio_(train_df_test_bin_, 'NAME_FAMILY_STATUS')

In [None]:
## Fairness metric for the AGE attribute.
## train_df_test_bin_ holds the Predicted_Result column of whichever model cell
## ran last (the age-reweighed model when the notebook is run top to bottom).
demographic_parity_ratio_(train_df_test_bin_, 'AGE')