In [None]:
import json
import math
import os

import numpy as np
import pandas as pd

######################## SET code_path to the code directory of the project ########################
## Path of the project's code directory. It is inserted into sys.path below so that the
## `utils` package imported further down in this cell can be resolved.
code_path = ""
import sys  
sys.path.insert(1, code_path)

######################## SET folder to the project directory path ########################
## Prefix of every data file path used in later cells (e.g. folder+'data\\processed_data\\train.csv').
## Where it is concatenated directly, it must be empty (paths relative to the working directory)
## or end with a path separator.
folder= ""

from utils.utils import manipulate_categ_values, binning, reweighing, attributes_names_mapping
from utils.fairness_metrics import DPR_AOD_fairness

import xgboost as xgb

from sklearn.model_selection import StratifiedShuffleSplit,train_test_split

import warnings
from pandas.errors import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

## Split Dataset

In [None]:
## Build the train / validation / test split from the two pre-processed CSV parts.
## Paths are assembled with os.path.join so the cell does not depend on Windows separators.
processed_dir = os.path.join(folder, 'data', 'processed_data')

## FIRST
## load training and test set, then pool them so the split below is drawn from all rows
train_df_train = pd.read_csv(os.path.join(processed_dir, 'train_df_train.csv'), delimiter=',')
train_df_test = pd.read_csv(os.path.join(processed_dir, 'train_df_test.csv'), delimiter=',')

train_df_all = pd.concat([train_df_train, train_df_test], ignore_index=True)
y_train_df_all = train_df_all['TARGET'].tolist()

## Stratified on TARGET: 10000 rows for training, 1100 rows held out.
## n_splits=1, so the only (train, test) index pair is taken directly with next().
sss = StratifiedShuffleSplit(n_splits=1, train_size=10000, test_size=1100, random_state=13)
train_index, test_index = next(sss.split(np.zeros(len(train_df_all)), y_train_df_all))

train_df = train_df_all.iloc[train_index.tolist()]

## Resplit test
## The 1100 held-out rows are split again (stratified on TARGET) into
## 1000 validation rows and 100 test rows.
test_df_all = train_df_all.iloc[test_index.tolist()]
y_test_df_all = test_df_all['TARGET'].tolist()
sss = StratifiedShuffleSplit(n_splits=1, train_size=1000, test_size=100, random_state=13)
val_index, test_2_index = next(sss.split(np.zeros(len(test_df_all)), y_test_df_all))

validation_df = test_df_all.iloc[val_index.tolist()]
test_df = test_df_all.iloc[test_2_index.tolist()]

## Persist the three splits.
## The row index is written for all three files; the former index='ignore' on train.csv
## was just a truthy non-boolean and had the same effect as the default.
train_df.to_csv(os.path.join(processed_dir, 'train.csv'))
validation_df.to_csv(os.path.join(processed_dir, 'validation.csv'))
test_df.to_csv(os.path.join(processed_dir, 'test.csv'))

In [None]:
## Sorted distinct values of AMT_REQ_CREDIT_BUREAU_HOUR across the pooled data.
hourly_requests_all = train_df_all['AMT_REQ_CREDIT_BUREAU_HOUR']
np.sort(hourly_requests_all.unique())


In [None]:
## Print the sorted distinct values of each credit-bureau request counter in the test split,
## one line per time window (hour, day, week, month, quarter, year).
bureau_request_columns = [
    'AMT_REQ_CREDIT_BUREAU_HOUR',
    'AMT_REQ_CREDIT_BUREAU_DAY',
    'AMT_REQ_CREDIT_BUREAU_WEEK',
    'AMT_REQ_CREDIT_BUREAU_MON',
    'AMT_REQ_CREDIT_BUREAU_QRT',
    'AMT_REQ_CREDIT_BUREAU_YEAR',
]
for column_name in bureau_request_columns:
    print(np.sort(test_df[column_name].unique()))

In [None]:
## Test rows whose AMT_REQ_CREDIT_BUREAU_WEEK equals 1.
week_is_one = test_df['AMT_REQ_CREDIT_BUREAU_WEEK'].eq(1.)
test_df.loc[week_is_one]

In [None]:
## Test rows whose AMT_REQ_CREDIT_BUREAU_HOUR equals 1.
hour_is_one = test_df['AMT_REQ_CREDIT_BUREAU_HOUR'].eq(1.)
test_df.loc[hour_is_one]

In [None]:
## Test rows whose AMT_REQ_CREDIT_BUREAU_DAY equals 1.
day_is_one = test_df['AMT_REQ_CREDIT_BUREAU_DAY'].eq(1.)
test_df.loc[day_is_one]

In [None]:
## Test rows whose AMT_REQ_CREDIT_BUREAU_MON equals 1.
month_is_one = test_df['AMT_REQ_CREDIT_BUREAU_MON'].eq(1.)
test_df.loc[month_is_one]

In [None]:
## Test rows whose OBS_30_CNT_SOCIAL_CIRCLE equals 1.
obs_30_is_one = test_df['OBS_30_CNT_SOCIAL_CIRCLE'].eq(1.)
test_df.loc[obs_30_is_one]

In [None]:
## Exploratory check: rebuild the original-valued rows of the test split, bin them,
## and look at the distinct AMT_REQ_CREDIT_BUREAU_HOUR values before binning.
train_df_or = pd.read_csv(folder+'data\\processed_data\\train_df.csv', delimiter=',')

## TEST SET
## Select the rows of train_df_or that belong to the test applications.
test_df_app_ids = test_df['SK_ID_CURR'].tolist()
is_test_application = train_df_or['SK_ID_CURR'].isin(test_df_app_ids)
test_df_or = train_df_or[is_test_application]
## Re-order those rows to follow test_df_app_ids while keeping the
## train_df_or row labels as the (unnamed) index.
test_df_or = (
    test_df_or
    .reset_index()
    .set_index('SK_ID_CURR')
    .loc[test_df_app_ids]
    .reset_index()
    .set_index('index')
    .rename_axis(None)
)
## Return value is not used; presumably rewrites categorical values in place — TODO confirm.
manipulate_categ_values(test_df_or)
test_df_bin = binning(test_df_or, test_df)

hourly_requests_original = test_df_or['AMT_REQ_CREDIT_BUREAU_HOUR']
np.sort(hourly_requests_original.unique())

In [None]:
## Sorted distinct values of AMT_REQ_CREDIT_BUREAU_HOUR in the test split.
hourly_requests_test = test_df['AMT_REQ_CREDIT_BUREAU_HOUR']
np.sort(hourly_requests_test.unique())

## Apply Binning

In [None]:
def _original_rows_and_bins(split_df, source_df):
    """Look up the original-valued rows of a split and bin them.

    The test and validation splits go through exactly the same steps, so the
    steps live here once instead of being copy-pasted per split.

    Parameters
    ----------
    split_df : pandas.DataFrame
        A processed split (test or validation) with an 'SK_ID_CURR' column.
    source_df : pandas.DataFrame
        Frame holding the original values, also keyed by 'SK_ID_CURR'.

    Returns
    -------
    tuple
        (app_ids, original_rows, binned) where `app_ids` is the list of
        'SK_ID_CURR' values of `split_df`, `original_rows` are the matching
        rows of `source_df` in that same order (keeping the `source_df` row
        labels as an unnamed index), and `binned` is the result of
        `binning(original_rows, split_df)`.

    Raises
    ------
    KeyError
        If an id of `split_df` is not present in `source_df` (raised by `.loc`).
    """
    app_ids = split_df['SK_ID_CURR'].tolist()
    original_rows = source_df[source_df['SK_ID_CURR'].isin(app_ids)]
    # Re-order to the split's id order; the round trip through reset_index /
    # set_index('index') preserves the source row labels as the index.
    original_rows = (
        original_rows
        .reset_index()
        .set_index('SK_ID_CURR')
        .loc[app_ids]
        .reset_index()
        .set_index('index')
        .rename_axis(None)
    )
    # Return value is not used; presumably rewrites categorical values in place — TODO confirm.
    manipulate_categ_values(original_rows)
    return app_ids, original_rows, binning(original_rows, split_df)


## Paths are assembled with os.path.join so the cell does not depend on Windows separators.
processed_dir = os.path.join(folder, 'data', 'processed_data')
train_df_or = pd.read_csv(os.path.join(processed_dir, 'train_df.csv'), delimiter=',')

## TEST SET
test_df_app_ids, test_df_or, test_df_bin = _original_rows_and_bins(test_df, train_df_or)

## VALIDATION SET
validation_df_app_ids, validation_df_or, validation_df_bin = _original_rows_and_bins(validation_df, train_df_or)

## Persist the binned splits (row index included).
validation_df_bin.to_csv(os.path.join(processed_dir, 'validation_bin.csv'))
test_df_bin.to_csv(os.path.join(processed_dir, 'test_bin.csv'))

## Training

In [None]:
## Train the model, derive normalised feature weights, predict on the test split
## and store the fairness metrics as JSON.

## Features are every column except the application id and the label;
## the labels are kept as one-column DataFrames.
x_train = train_df.loc[:, ~train_df.columns.isin(['SK_ID_CURR', 'TARGET'])]
y_train = train_df.loc[:, train_df.columns.isin(['TARGET'])]
x_test = test_df.loc[:, ~test_df.columns.isin(['SK_ID_CURR', 'TARGET'])]
y_test = test_df.loc[:, test_df.columns.isin(['TARGET'])]

## INSTANCE WEIGHTS
## Per-row training weights from the project helper (its arguments 0 and 0.5 are
## defined in utils.utils — see there for their meaning).
instances_weights = reweighing(y_train, 0, 0.5)

## TRAINING
## No objective or number of boosting rounds is given, so xgb.train uses its defaults.
## "device": "cuda" requests GPU training.
params = {'random_state': 15, 'eta':0.3, "device": "cuda"}
model = xgb.train(params, xgb.DMatrix(x_train, label=y_train, weight=instances_weights))

## FEATURE WEIGHTS
## Raw 'weight' importances, normalised so they sum to 1 over the model's features.
ft_weights_le = model.get_score(importance_type='weight')
total_weight = sum(ft_weights_le.values())
ft_weights = {}
for attr in attributes_names_mapping:
    ## A feature may appear under its own name or with an "_LE" suffix;
    ## features the model never used get weight 0.
    raw_weight = ft_weights_le.get(attr, ft_weights_le.get(attr + "_LE", 0))
    ft_weights[attr] = raw_weight/total_weight if raw_weight else 0

## PREDICTIONS
## Clamp the raw scores to [0, 1] on both sides. Clamping only at 1 let a negative score
## round to -1, which is truthy and would be treated as a positive prediction.
raw_scores = model.predict(xgb.DMatrix(x_test))
conf = [min(max(c, 0.), 1.) for c in raw_scores]
predictions = [round(value) for value in conf]

## CALCULATE FAIRNESS METRICS: DPR & AOD
## The predictions are added as the second column of a copy of the binned test split.
test_df_binned_ = test_df_bin.copy()
test_df_binned_.insert(loc=1,
                       column="Predicted_Result",
                       value = predictions)
fairness_results = DPR_AOD_fairness(attributes_names_mapping, test_df_binned_)

## The path literal previously opened with ' and closed with ", which is a syntax error.
with open(os.path.join(folder, 'data', 'processed_data', 'fairness.json'), "w") as final:
    json.dump(fairness_results, final)

## Applications 

In [None]:
## Build Applications.csv: one row per test application with the decision, its confidence,
## and for every mapped attribute its display name, original value and feature weight.
leng = len(test_df['SK_ID_CURR'])
app_ids = test_df['SK_ID_CURR'].tolist()

## Confidence (in percent, floored) of the predicted class: score for a positive
## prediction, 1 - score otherwise.
## NOTE(review): this value is stored under 'Prediction_Confidence_Accepted' even when the
## decision is 'Rejected' (and its complement under '..._Rejected'). Kept as in the
## original; confirm with the consumer of the CSV that this is the intended meaning.
predicted_class_confidence = [
    math.floor(score*100) if pred else math.floor((1-score)*100)
    for pred, score in zip(predictions, conf)
]

## Columns are collected in a dict and the frame is built once, in the same column order
## as before, instead of inserting columns one by one.
application_columns = {
    'Application_id': test_df['SK_ID_CURR'],
    'Predicted_decision': ['Accepted' if pred else 'Rejected' for pred in predictions],
    'Prediction_Confidence_Accepted': predicted_class_confidence,
    'Prediction_Confidence_Rejected': [100-value for value in predicted_class_confidence],
}

#Ft_values
## Index the original-valued rows by application id once (first row per id, as before)
## rather than scanning test_df_or again for every application and attribute.
## drop=False keeps 'SK_ID_CURR' selectable as a column as well.
original_by_id = test_df_or.drop_duplicates(subset='SK_ID_CURR').set_index('SK_ID_CURR', drop=False)
for index, (attr, display_name) in enumerate(attributes_names_mapping.items()):
    application_columns["ft_name_" + str(index)] = [display_name]*leng
    application_columns["ft_value_" + str(index)] = original_by_id.loc[app_ids, attr].tolist()
    ## Look the weight up by attribute name instead of by position in ft_weights.
    application_columns["ft_weight_" + str(index)] = [round(ft_weights[attr],3)]*leng

## Same row index as test_df, as when 'Application_id' was assigned to an empty frame.
applications_df = pd.DataFrame(application_columns, index=test_df.index)

applications_df.to_csv(os.path.join(folder, 'data', 'processed_data', 'Applications.csv'))

In [None]:
## Print each attribute's display name next to its normalised weight (3 decimals).
for attribute, weight in ft_weights.items():
    display_name = attributes_names_mapping[attribute]
    print(display_name, round(weight, 3))

In [None]:
## Number of test applications predicted as class 0.
count = sum(1 for predicted_label in predictions if predicted_label == 0)
count

In [None]:
## Share of test applications predicted as class 1.
## The divisor is the actual number of predictions rather than a hard-coded 100,
## so the ratio stays correct if the test split size changes.
count = sum(1 for predicted_label in predictions if predicted_label == 1)
count/len(predictions)

In [None]:

## Fraction of training rows whose TARGET is 1.
train_labels = y_train['TARGET'].tolist()
count = len([label for label in train_labels if label == 1])
count/len(y_train)

In [None]:
## Scratch arithmetic: complement of 75.18 out of 100.
## NOTE(review): 75.18 is a hand-typed value whose origin is not shown in this notebook —
## presumably a percentage read off an earlier output; confirm, or compute it from the data.
100-75.18

In [None]:
## Display the per-row training weights returned by reweighing(y_train, 0, 0.5).
instances_weights

In [None]:
## Display the training labels (the one-column 'TARGET' frame taken from train_df).
y_train

In [None]:
## Training rows whose TARGET is 0.
train_target_is_zero = train_df['TARGET'].eq(0)
train_df.loc[train_target_is_zero]

In [None]:
## Scratch arithmetic: 2482 as a fraction of 10000.
## NOTE(review): presumably the number of TARGET == 0 rows in train_df over the 10000
## training rows — TODO confirm; both numbers are hand-typed and will go stale if the split changes.
2482/10000

In [None]:
## Test rows whose TARGET is 0.
test_target_is_zero = test_df['TARGET'].eq(0)
test_df.loc[test_target_is_zero]

In [None]:
## Scratch arithmetic: 25 as a fraction of 100.
## NOTE(review): presumably the number of TARGET == 0 rows in test_df over the 100 test
## rows — TODO confirm; both numbers are hand-typed and will go stale if the split changes.
25/100