# Predicting the outcome of loan applications
# 2. Data preparation

In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import sys
import matplotlib.pyplot as plt
from matplotlib import rc

# Put the parent directory on the import path (once) before the `shared.*`
# imports that follow.
# NOTE(review): presumably the `shared` package lives one level up — confirm.
# '../' is resolved against the current working directory, so this assumes the
# kernel is started from the notebook's own folder.
p = os.path.abspath('../')
if p not in sys.path:
    sys.path.append(p)
    
from shared.plotting import percentage_stacked_bar_plot
from shared.data_processing import encode_categorical

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

## Load raw data

In [2]:
# Read the raw loan applications.
#
# "None" is a genuine category in this data set (e.g. in `checking_status`,
# which is later mapped with 'None': -1). pandas >= 2.0 added "None" to the
# default missing-value tokens of `read_csv`, which would silently turn those
# entries into NaN. Pin the pre-2.0 token list so the result is the same on
# every pandas version.
NA_TOKENS = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN',
             '-nan', '1.#IND', '1.#QNAN', '<NA>', 'N/A', 'NA', 'NULL', 'NaN',
             'n/a', 'nan', 'null']
df = pd.read_csv('./data/loan_data.csv', keep_default_na=False, na_values=NA_TOKENS)

## Recode the label
I will re-encode the labels in a more intuitive way where `1` (positive) indicates something that requires human attention, i.e. a rejection, while `0` (negative) suggests that everything is fine, i.e. an acceptance.

In [3]:
# Target convention: 1 = needs attention (rejection), 0 = fine (acceptance).
# NOTE(review): this assumes `accepted == 2` marks an accepted application and
# `accepted == 1` a rejected one — confirm against the data dictionary.
label_by_outcome = {2: 0, 1: 1}
df['label'] = df['accepted'].map(label_by_outcome)

## Encoding the categorical variables
The categorical variables need to be converted to numbers, i.e. ***encoded***, in order to be interpretable by the models. Among the ones we pre-selected in the exploratory data analysis, we distinguish 3 types of categorical variables.

### Binary categorical variables
These are the most straightforward, as they can just be encoded with the two values `0` and `1`.

In [4]:
# List the levels of `foreign_worker` and how often each occurs, so the
# mapping in the next cell can cover every value.
df['foreign_worker'].value_counts()

yes    963
no      37
Name: foreign_worker, dtype: int64

In [5]:
# Two-level flag: 'no' becomes 0 and 'yes' becomes 1.
foreign_worker_codes = {'no': 0, 'yes': 1}
df['foreign_worker_binary'] = df['foreign_worker'].map(foreign_worker_codes)

### Ordinal categorical variables
These have more than two levels, but the levels have an intrinsic ordering. This allows us to encode them using integers which are ordered accordingly.

In [6]:
# Show the distinct `checking_status` levels and their frequencies before
# assigning ordered codes to them.
df['checking_status'].value_counts()

None          394
<0DM          274
0_to_200DM    269
>200DM         63
Name: checking_status, dtype: int64

In [7]:
# Levels listed from lowest to highest; codes run -1, 0, 1, 2 so that the
# 'None' level sits below the smallest balance bracket.
checking_levels = ['None', '<0DM', '0_to_200DM', '>200DM']
checking_codes = {level: code for code, level in enumerate(checking_levels, start=-1)}
df['checking_status_ordinal'] = df['checking_status'].map(checking_codes)

In [8]:
# Show the distinct `savings_status` levels and their frequencies before
# assigning ordered codes to them.
df['savings_status'].value_counts()

<100DM                        603
Unknown_or_no_savings_acct    183
100_to_500DM                  103
500_to_1000DM                  63
>1000DM                        48
Name: savings_status, dtype: int64

In [9]:
# Levels listed from lowest to highest; codes run -1 .. 3, with the
# unknown / no-account level placed below the smallest savings bracket.
savings_levels = [
    'Unknown_or_no_savings_acct',
    '<100DM',
    '100_to_500DM',
    '500_to_1000DM',
    '>1000DM',
]
savings_codes = {level: code for code, level in enumerate(savings_levels, start=-1)}
df['savings_status_ordinal'] = df['savings_status'].map(savings_codes)

In [10]:
# Show the distinct `employment` levels and their frequencies before
# assigning ordered codes to them.
df['employment'].value_counts()

1_to_4yrs     339
>7yrs         253
4_to_7yrs     174
<1yr          172
unemployed     62
Name: employment, dtype: int64

In [11]:
# Levels listed from shortest to longest tenure; codes run -1 .. 3, with
# 'unemployed' placed below the shortest employment bracket.
employment_levels = ['unemployed', '<1yr', '1_to_4yrs', '4_to_7yrs', '>7yrs']
employment_codes = {level: code for code, level in enumerate(employment_levels, start=-1)}
df['employment_ordinal'] = df['employment'].map(employment_codes)

In [12]:
# Check which values `installment_commitment` takes and how often.
df['installment_commitment'].value_counts()

4    476
2    231
3    157
1    136
Name: installment_commitment, dtype: int64

Installment commitment is already ordinally encoded.

In [13]:
# The raw column already holds ordered integer codes (1-4), so no mapping is
# needed: just store a float copy under the `_ordinal` naming used above.
df['installment_commitment_ordinal'] = df['installment_commitment'].astype('float64')

## Save the data set with the encoded variables

In [14]:
# Write the frame with the columns added so far (label plus the fixed
# encodings) to disk, leaving out the row index.
prepped_csv_path = './data/loan_data_prepped.csv'
df.to_csv(prepped_csv_path, index=False)

## Other categorical variables
For the others, there is no clear way to associate them to numbers, so we need to find another way. I am going to avoid one-hot encoding for this data set, because it's too small, and we could potentially end up with hundreds of features and only 1000 data points. I will use an encoding based on the effect of the feature on the target. **Since this encoding depends on the target, it needs to be computed using training data only, i.e. separately for each cross-validation fold.** Below is an example using all data.

In [15]:
# Categorical columns without a natural ordering; these get the
# target-based encoding instead of a fixed mapping.
OTHER_CATEGORICAL = [
    'loan_history',
    'purpose',
    'other_parties',
    'property_magnitude',
    'other_payment_plans',
    'housing',
    'personal_status',
    'job',
]

In [16]:
# Adds one `<column>_encoded` column per entry of OTHER_CATEGORICAL (visible
# in the output below).
# NOTE(review): `encode_categorical` is a project helper not shown here. Per
# the text above, the codes depend on 'label', so this all-data call is an
# illustration only — recompute on the training part of each fold when
# modelling.
df = encode_categorical(df, OTHER_CATEGORICAL, 'label')
df.head()

Unnamed: 0,checking_status,duration,loan_history,purpose,loan_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,employment_ordinal,installment_commitment_ordinal,loan_history_encoded,purpose_encoded,other_parties_encoded,property_magnitude_encoded,other_payment_plans_encoded,housing_encoded,personal_status_encoded,job_encoded
0,<0DM,6,Critical_acct_other_loans_existing,radio/television,1169,Unknown_or_no_savings_acct,>7yrs,4,male_single,,...,3,4.0,0,2,1,0,0,0,0,1
1,0_to_200DM,48,Existing_loans_paid_till_now,radio/television,5951,<100DM,1_to_4yrs,2,female_divorced/separated/married,,...,1,2.0,2,2,1,0,0,0,2,1
2,,12,Critical_acct_other_loans_existing,education,2096,<100DM,4_to_7yrs,2,male_single,,...,2,2.0,0,9,1,0,0,0,0,0
3,<0DM,42,Existing_loans_paid_till_now,furniture/equipment,7882,<100DM,4_to_7yrs,2,male_single,guarantor,...,2,2.0,2,3,0,1,0,2,0,1
4,<0DM,24,Delay_in_past,new_car,4870,<100DM,1_to_4yrs,3,male_single,,...,1,3.0,1,7,1,3,0,2,0,1


## Feature selection

Here are the numerical and encoded categorical features we settled on so far.

In [17]:
# Numerical inputs used as they are.
NUMERICAL_FEATURES = [
    'duration',
    'loan_amount',
    'age',
]

# Columns produced by the fixed binary / ordinal mappings.
FIXED_CATEGORICAL = [
    'foreign_worker_binary',
    'checking_status_ordinal',
    'savings_status_ordinal',
    'employment_ordinal',
    'installment_commitment_ordinal',
]

# Names of the target-encoded columns, one per entry of OTHER_CATEGORICAL.
VARIABLE_CATEGORICAL = [name + '_encoded' for name in OTHER_CATEGORICAL]

In [18]:
# Full feature list: numerical first, then fixed encodings, then
# target-encoded columns.
FEATURES = [*NUMERICAL_FEATURES, *FIXED_CATEGORICAL, *VARIABLE_CATEGORICAL]
FEATURES

['duration',
 'loan_amount',
 'age',
 'foreign_worker_binary',
 'checking_status_ordinal',
 'savings_status_ordinal',
 'employment_ordinal',
 'installment_commitment_ordinal',
 'loan_history_encoded',
 'purpose_encoded',
 'other_parties_encoded',
 'property_magnitude_encoded',
 'other_payment_plans_encoded',
 'housing_encoded',
 'personal_status_encoded',
 'job_encoded']

So far, we have selected the features manually. But are we sure that we are not using redundant information? We can check this with PCA on the standardised features. Keeping 15 of the 16 principal components (i.e. dropping just one) already lowers the explained variance to about 98.4%, below 99%. Keeping only 12 components (dropping four) reduces it to about 86.9%, below 90%. No direction in feature space is close to redundant, so overall it seems like using all features is best in this case.

In [19]:
# Cast every selected feature column to float in a single assignment.
df[FEATURES] = df[FEATURES].astype(float)

In [20]:
# Fit the scaler on the selected features, then apply it to the same data.
scaler = StandardScaler().fit(df[FEATURES])
X_scaled = scaler.transform(df[FEATURES])

In [21]:
# Keep 15 components, i.e. one fewer than the 16 selected features.
pca = PCA(svd_solver='full', n_components=15)

In [22]:
# Fit on the standardised features, then report the share of variance the
# retained components explain together, followed by their singular values.
pca.fit(X_scaled)

explained_total = pca.explained_variance_ratio_.sum()
print(explained_total)
print(pca.singular_values_)

0.9838594400066427
[48.44731464 40.52018056 35.24649143 34.6623883  34.16889778 31.67105786
 31.24285108 30.5724615  30.27273735 29.5594652  28.91870215 27.52984534
 27.04571435 25.184898   21.85023303]


In [23]:
# Keep 12 components, i.e. four fewer than the 16 selected features.
pca = PCA(svd_solver='full', n_components=12)

In [24]:
# Fit on the standardised features, then report the share of variance the
# retained components explain together, followed by their singular values.
pca.fit(X_scaled)

explained_total = pca.explained_variance_ratio_.sum()
print(explained_total)
print(pca.singular_values_)

0.8686605377981226
[48.44731464 40.52018056 35.24649143 34.6623883  34.16889778 31.67105786
 31.24285108 30.5724615  30.27273735 29.5594652  28.91870215 27.52984534]
