In [None]:
import numpy as np 
import pandas as pd

np.random.seed(0)

########################SET folder to project directory path#####################
folder=""

## Loading Home Credit Training and Test Sets
This data was downloaded from https://www.kaggle.com/competitions/home-credit-default-risk/data.

In [None]:
# Load the Home Credit training applications and show the frame's shape.
train_csv_path = folder + 'data/HomeCreditDataset/application_train.csv'
application_train = pd.read_csv(train_csv_path, delimiter=',')
display(application_train.shape)

In [None]:
# Load the Home Credit test applications and show the frame's shape.
test_csv_path = folder + 'data/HomeCreditDataset/application_test.csv'
application_test = pd.read_csv(test_csv_path, delimiter=',')
display(application_test.shape)

## Analysing Class Imbalance

In [None]:
# Split the training applications by label:
#   TARGET == 1 -> risk detected in the loan application, hence rejected
#   TARGET == 0 -> no risk, hence accepted
target_labels = application_train['TARGET']
rejected_applications_df = application_train[target_labels == 1]
accepted_applications_df = application_train[target_labels == 0]

print("Rejected Applications in Trainset: ", len(rejected_applications_df))
print("Accepted Applications in Trainset: ", len(accepted_applications_df))

In [None]:
# Dataframe without any na values
# Accepted applications that have a value in every column
# (a row is discarded as soon as any of its columns is missing).
accepted_without_na = accepted_applications_df.dropna(axis='index', how='any')
accepted_without_na.shape

In [None]:
# DF with Na values 
# Accepted applications with at least one missing value.
row_has_na = accepted_applications_df.isna().any(axis='columns')
accepted_with_na = accepted_applications_df[row_has_na]
accepted_with_na.shape

In [None]:
# All rejected applications and all NaN-free accepted applications are kept;
# the rest of the 100000-row budget is filled by sampling accepted
# applications that contain missing values.
rows_already_selected = len(rejected_applications_df) + len(accepted_without_na)
remaining = 100000 - rows_already_selected
print(remaining)

# NOTE(review): one row more than `remaining` is sampled; the reason is not
# visible in this notebook -- TODO confirm it is intentional.
accepted_sampled_na = accepted_with_na.sample(n=remaining + 1, random_state=0)
accepted_sampled_na.shape

In [None]:
# train_df[train_df["CODE_GENDER"] == "XNA"].index

In [None]:
# # drop_xna_index = train_df[train_df["CODE_GENDER"] == "XNA"].index
# drop_xna_index

In [None]:
# Assemble the training frame from the three subsets selected above.
selected_subsets = [rejected_applications_df, accepted_without_na, accepted_sampled_na]
train_df = pd.concat(selected_subsets, ignore_index=True)

# Shuffle the rows, then order them by application id with a fresh 0..n-1 index.
train_df = train_df.sample(frac=1).reset_index(drop=True)
train_df = train_df.sort_values(by=['SK_ID_CURR'], ignore_index=True)

# Remove the rows whose gender is recorded as "XNA".
xna_gender_rows = train_df.index[train_df["CODE_GENDER"] == "XNA"]
train_df = train_df.drop(index=xna_gender_rows)

display(train_df.shape)

In [None]:
# Keep only the first 1000 test applications.
# .copy() makes test_df an independent frame: the cells below drop, overwrite
# and rename its columns, and doing that on a bare slice of application_test
# triggers SettingWithCopyWarning and leaves it unclear which object is modified.
test_df = application_test.iloc[:1000, :].copy()
test_df.shape

## Feature Selection

### Missing Values

In [None]:
col = application_train.columns

# Per-column missing-value statistics of the original training set.
null_mask = application_train.isnull()
missing_pct_by_feature = 100 * null_mask.sum() / application_train.shape[0]

# List of all features containing null values (in column order)
features_with_na = col[null_mask.any().values].tolist()

# Features whose share of missing values exceeds 25%
inconsistent_features = [
    feature for feature in features_with_na
    if missing_pct_by_feature[feature] > 25
]

print("Num of features with at least 1 NaN value: ", len(features_with_na))
print("Num of features with more than 25% of missing values : ", len(inconsistent_features))

In [None]:
inconsistent_features

Of these features, only "OCCUPATION_TYPE", which describes the occupation of an individual, is relevant to our analysis, so we keep it and remove the rest.

In [None]:
# Keep OCCUPATION_TYPE despite its missing values; all other sparse features
# are dropped. Filtering (rather than list.remove) keeps this cell re-runnable:
# list.remove raises ValueError once the name has already been removed.
inconsistent_features = [
    feature for feature in inconsistent_features if feature != 'OCCUPATION_TYPE'
]

# Build a new frame instead of dropping in place; errors='ignore' lets the cell
# be executed again after the columns are already gone.
train_df = train_df.drop(columns=inconsistent_features, errors='ignore')
train_df.shape

In [None]:
# Drop the same sparse features from the test subset.
# test_df was created by slicing application_test, so an in-place drop can
# raise SettingWithCopyWarning; rebinding the name to the result of drop()
# avoids that. errors='ignore' keeps the cell re-runnable.
test_df = test_df.drop(columns=inconsistent_features, errors='ignore')
test_df.shape

### Dropping Unexplainable Features

In [None]:
train_df.columns

In [None]:
# Features removed in the next cell: the two external scores plus the
# document flags FLAG_DOCUMENT_2 ... FLAG_DOCUMENT_21.
inexplanaible_features = ['EXT_SOURCE_2', 'EXT_SOURCE_3'] + [
    'FLAG_DOCUMENT_' + str(document_number) for document_number in range(2, 22)
]

In [None]:
# Remove the unexplainable features from both frames.
# The result of drop() is assigned back instead of using inplace=True:
# test_df originates from a slice of application_test, and modifying such a
# slice in place can raise SettingWithCopyWarning.
train_df = train_df.drop(columns=inexplanaible_features)
display(train_df.shape)

test_df = test_df.drop(columns=inexplanaible_features)
display(test_df.shape)

### Converting Days Features

In [None]:
train_df.columns

In [None]:
# Convert the day-count columns of both frames to years: each value is divided
# by -365 and rounded. The columns keep their DAYS_* names here; they are
# renamed in the next cell.
# NOTE(review): DAYS_EMPLOYED is reported to contain a large positive
# placeholder value in this dataset, which would become a large negative
# number of years here -- TODO confirm and handle if needed.
day_columns = [
    "DAYS_BIRTH",
    "DAYS_EMPLOYED",
    "DAYS_REGISTRATION",
    "DAYS_ID_PUBLISH",
    "DAYS_LAST_PHONE_CHANGE",
]

for frame in (train_df, test_df):
    for day_column in day_columns:
        frame[day_column] = round(frame[day_column] / -365)

In [None]:
# Rename the converted day-count columns so that their names reflect the new
# unit (years). The same mapping is applied to both frames.
days_to_years_names = {
    'DAYS_BIRTH': 'AGE',
    'DAYS_EMPLOYED': 'YEARS_EMPLOYED',
    'DAYS_REGISTRATION': "YEARS_REGISTRATION",
    'DAYS_ID_PUBLISH': 'YEARS_ID_PUBLISH',
    'DAYS_LAST_PHONE_CHANGE': "YEARS_LAST_PHONE_CHANGE",
}

for frame in (train_df, test_df):
    frame.rename(columns=days_to_years_names, inplace=True)

In [None]:
test_df.columns 

In [None]:
# Save the test subset as it stands at this point (features dropped, day
# columns converted and renamed; no imputation or encoding yet), without the index.
test_df.to_csv(folder+'data/processed_data/test_df.csv',index=False) 

### Converting Detected Risk to application Accepted/Rejected
We used the mapping:

   0    / 1  ->     1   /   0

no risk/risk -> accepted/rejected

In [None]:
train_df

In [None]:
# Swap the TARGET labels in place (0 -> 1, 1 -> 0) so that 1 means "accepted"
# and 0 means "rejected", as described in the mapping above.
# WARNING: this cell is not idempotent -- executing it a second time swaps the
# labels back. Run it exactly once per kernel session.
train_df.replace({'TARGET' : {0:1, 1:0} },inplace=True)

In [None]:
train_df

In [None]:
# Save the training frame with the flipped TARGET labels (no imputation or
# encoding yet), without the index.
train_df.to_csv(folder+'data/processed_data/train_df.csv',index=False) 

### Imputation

Imputing categorical columns with "Unknown" label when the value is missing

In [None]:
# Non-numeric (categorical) training columns, with missing entries replaced by
# the explicit label "Unknown".
train_df_cat = train_df.select_dtypes(exclude=["number"]).fillna("Unknown")

In [None]:
# Non-numeric (categorical) test columns, with missing entries replaced by
# the explicit label "Unknown".
test_df_cat = test_df.select_dtypes(exclude=["number"]).fillna("Unknown")

In [None]:
train_df_cat.shape

In [None]:
test_df_cat.shape

In [None]:
# Print the name of every categorical column, one per line.
for column_name in train_df_cat.columns:
    print(column_name)

# Remember the categorical column names as they are before encoding; they are
# used later to drop the raw (string) columns.
initial_columns = train_df_cat.columns

### Label Encoding
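Label-encoding the categorical variables as integer codes for training and prediction. The encoding is learned on the training set and reused for the test set so that the same category always maps to the same code.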

In [None]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

# Add an integer-coded "<column>_LE" companion for every categorical column.
# Iterating over initial_columns (the names captured before encoding) rather
# than the live train_df_cat.columns keeps the cell re-runnable: the "_LE"
# columns added below are never encoded a second time.
for column in initial_columns:
    print(column)

    # Learn the category -> code mapping on the training data only.
    train_df_cat[str(column) + "_LE"] = le.fit_transform(train_df_cat[column])

    # Apply the SAME mapping to the test data. Fitting a fresh encoder on the
    # test column (as was done before) can assign a different integer to the
    # same category whenever the two sets do not contain identical categories.
    # Categories that never occur in the training data are coded as -1.
    code_by_category = {category: code for code, category in enumerate(le.classes_)}
    test_df_cat[str(column) + "_LE"] = (
        test_df_cat[column].map(code_by_category).fillna(-1).astype(int)
    )

In [None]:
train_df_cat

In [None]:
test_df_cat

### Imputing Numerical Columns with average values when a value is missing

In [None]:
# Numeric training columns; each missing value is replaced by the mean of its column.
train_df_num = train_df.select_dtypes(include=["number"])
train_column_means = train_df_num.mean()
train_df_num = train_df_num.fillna(train_column_means)
train_df_num

In [None]:
# Numeric test columns; missing values are replaced by the corresponding
# TRAINING column means. Imputing the test set with its own means (as before)
# makes the filled-in values depend on which test rows happen to be selected
# and differ from the values used for the training features.
test_df_num = test_df.select_dtypes(include=["number"])
train_feature_means = train_df.select_dtypes(include=["number"]).mean()
# fillna aligns on column names; means of columns absent from the test frame
# (e.g. TARGET) are simply not used.
test_df_num = test_df_num.fillna(train_feature_means)
test_df_num

In [None]:
train_df_num.columns

In [None]:
a = test_df_num['AGE']
a.max()

In [None]:
train_df_cat["ORGANIZATION_TYPE_LE"].max()

In [None]:
a = test_df_num['AMT_INCOME_TOTAL']/10000
a.max()

### Normalisation

In [None]:
# Monetary amount columns that are rescaled (divided by 10000 and rounded) in the next cell.
absurdly_high_value_features = ['AMT_INCOME_TOTAL','AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE']

In [None]:
# Express the amount columns in units of 10000, rounded to whole numbers,
# in both the training and the test numeric frames.
train_df_num[absurdly_high_value_features] = (
    train_df_num[absurdly_high_value_features] / 10000
).round()
test_df_num[absurdly_high_value_features] = (
    test_df_num[absurdly_high_value_features] / 10000
).round()

In [None]:
train_df.columns

In [None]:
# Put the numeric and categorical (raw + "_LE") training columns side by side;
# rows are aligned on the index.
train_df_new = pd.concat([train_df_num, train_df_cat], axis=1)

In [None]:
train_df_new

In [None]:
# Put the numeric and categorical (raw + "_LE") test columns side by side;
# rows are aligned on the index.
test_df_new = pd.concat([test_df_num, test_df_cat], axis=1)

In [None]:
test_df_new

In [None]:
# Drop the raw categorical columns (their names as captured before encoding in
# initial_columns); the "_LE" encoded versions remain.
train_df_final = train_df_new.drop(columns = initial_columns)
# train_df_final.drop(columns = ["SK_ID_CURR"],inplace = True)
# train_df_final.to_csv(folder+'data/processed_data/train_df_final.csv',index=False) 

In [None]:
# Drop the raw categorical columns from the test frame as well (the "_LE"
# encoded versions remain) and save the result without the index.
test_df_final = test_df_new.drop(columns = initial_columns)
# test_df_final.drop(columns = ["SK_ID_CURR"],inplace = True)
test_df_final.to_csv(folder+'data/processed_data/test_df_final.csv',index=False) 

Final Shapes

In [None]:
train_df_final.shape

In [None]:
test_df_final.shape

## Training-Test Sets Split
Note: This split creates a test set from the Home Credit Dataset's original training set. It serves as a validation set for measuring the accuracy and fairness of the AI model before and after integrating participants' feedback. This test set, which we call *train_df_test*, is different from *test_df*: the latter is a subset of the Home Credit Dataset's original test set that is shown to participants through the UI prototype so that they can provide feedback.

In [None]:
## Separate the independent variables (X) from the dependent variable (y).
## Both are selected with a column mask, so y stays a one-column DataFrame.
is_target_column = train_df_final.columns == 'TARGET'
X = train_df_final.loc[:, ~is_target_column]
y = train_df_final.loc[:, is_target_column]

display(X.shape)
display(y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Shuffled 70% / 30% split with a fixed random_state.
split_parts = train_test_split(X, y, test_size=0.3, random_state=15, shuffle=True)
X_train_original, X_test_original, y_train_original, y_test_original = split_parts

print(X.shape)

# Report the shape of each part of the split.
shape_report = [
    ("Trainning and Validation Set for X", X_train_original),
    ("Trainning and Validation Set for y", y_train_original),
    ("Test Set for X", X_test_original),
    ("Test Set for y", y_test_original),
]
for label, part in shape_report:
    print(label, part.shape)

In [None]:
# Re-attach the labels to the training part of the split and save it without the index.
train_split_path = folder + 'data/processed_data/train_df_train.csv'
train_df_train = pd.concat([X_train_original, y_train_original], axis=1)
train_df_train.to_csv(train_split_path, index=False)
train_df_train

In [None]:
# Re-attach the labels to the held-out part of the split and save it without the index.
test_split_path = folder + 'data/processed_data/train_df_test.csv'
train_df_test = pd.concat([X_test_original, y_test_original], axis=1)
train_df_test.to_csv(test_split_path, index=False)
train_df_test