In [31]:
import importlib
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix
from xgboost import XGBClassifier

import func

# Reload the local helper module so edits to func.py are picked up without restarting the kernel.
importlib.reload(func)

<module 'func' from '/Users/chooenming/Desktop/Credit_Risk_Modelling/PD_Model/Model/func.py'>

In [3]:
# Resolve the data folder from the project layout instead of a user-specific
# absolute path: func.py sits in <project>/Model/ and the CSVs in <project>/Data/.
DATA_DIR = Path(func.__file__).resolve().parent.parent / "Data"

# Both files carry a leftover row-number column ("Unnamed: 0") that is dropped on load.
train = pd.read_csv(DATA_DIR / "cs-training.csv").drop(columns=["Unnamed: 0"])
test = pd.read_csv(DATA_DIR / "cs-test.csv").drop(columns=["Unnamed: 0"])

In [4]:
# Preview the first rows of the training frame to sanity-check the load.
train.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [10]:
# Report (rows, columns) for each split.
for label, frame in (
    ("Training data shape: ", train),
    ("Testing data shape: ", test),
):
    print(label, frame.shape)

Training data shape:  (150000, 11)
Testing data shape:  (101503, 11)


# Data Pre-Processing

In [17]:
# Check for and remove duplicated rows.
# func.removeDup is defined in func.py (not shown here); judging by its printed
# output it reports how many rows are duplicated and returns the frame without
# them — confirm against func.py.
train_redup = func.removeDup(train)

False    149391
True        609
Name: count, dtype: int64
After removing duplicated rows:  0  of duplicated rows


In [15]:
# Check for null values, column by column.
# func.findMiss is defined in func.py (not shown here); the printed figures
# look like the percentage of rows missing per column — confirm against func.py.
func.findMiss(train_redup)

SeriousDlqin2yrs                         0.00
RevolvingUtilizationOfUnsecuredLines     0.00
age                                      0.00
NumberOfTime30-59DaysPastDueNotWorse     0.00
DebtRatio                                0.00
MonthlyIncome                           19.56
NumberOfOpenCreditLinesAndLoans          0.00
NumberOfTimes90DaysLate                  0.00
NumberRealEstateLoansOrLines             0.00
NumberOfTime60-89DaysPastDueNotWorse     0.00
NumberOfDependents                       2.56
dtype: float64

In [19]:
# Profile only the rows whose NumberOfDependents is missing.
# (Swap the column name for "MonthlyIncome" to profile missing-income rows instead.)
train_redup.loc[train_redup["NumberOfDependents"].isna()].describe()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,3828.0,3828.0,3828.0,3828.0,3828.0,0.0,3828.0,3828.0,3828.0,3828.0,0.0
mean,0.046499,11.003369,59.741641,0.572623,1110.713689,,5.708986,0.497126,0.605799,0.474138,
std,0.210592,240.656436,18.345175,6.52732,4235.410634,,4.086337,6.523199,0.92099,6.520343,
min,0.0,0.0,21.0,0.0,0.0,,0.0,0.0,0.0,0.0,
25%,0.0,0.009228,48.0,0.0,25.0,,3.0,0.0,0.0,0.0,
50%,0.0,0.04767,61.0,0.0,398.0,,5.0,0.0,0.0,0.0,
75%,0.0,0.259028,74.0,0.0,1587.0,,8.0,0.0,1.0,0.0,
max,1.0,10821.0,109.0,98.0,220516.0,,30.0,98.0,15.0,98.0,


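It is plausible for both monthly income and number of dependents to be missing: every row with a missing `NumberOfDependents` is also missing `MonthlyIncome`.
<br>Applicants with no recorded monthly income and no recorded dependents are less likely to be granted a loan.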

## Missing Value Imputation

In [20]:
# Split the de-duplicated frame by whether NumberOfDependents is recorded.
fam_miss = train_redup.loc[train_redup["NumberOfDependents"].isna()]
fam_nmiss = train_redup.loc[train_redup["NumberOfDependents"].notna()]

In [22]:
# Income summary for rows where NumberOfDependents is recorded.
fam_nmiss.MonthlyIncome.aggregate(["mean", "median", "min", "max"])

mean      6.675098e+03
median    5.400000e+03
min       0.000000e+00
max       3.008750e+06
Name: MonthlyIncome, dtype: float64

In [23]:
# Build an imputed copy: missing dependents and missing income are both set to 0.
# train_redup itself is left untouched.
filled_train = train_redup.assign(
    NumberOfDependents=lambda frame: frame["NumberOfDependents"].fillna(0),
    MonthlyIncome=lambda frame: frame["MonthlyIncome"].fillna(0),
)

In [24]:
# Re-run the missing-value report to confirm the imputation left no nulls.
func.findMiss(filled_train)

SeriousDlqin2yrs                        0.0
RevolvingUtilizationOfUnsecuredLines    0.0
age                                     0.0
NumberOfTime30-59DaysPastDueNotWorse    0.0
DebtRatio                               0.0
MonthlyIncome                           0.0
NumberOfOpenCreditLinesAndLoans         0.0
NumberOfTimes90DaysLate                 0.0
NumberRealEstateLoansOrLines            0.0
NumberOfTime60-89DaysPastDueNotWorse    0.0
NumberOfDependents                      0.0
dtype: float64

# EDA

In [25]:
# Share of rows in each target class (row count per class divided by total rows).
filled_train.groupby("SeriousDlqin2yrs").size().div(len(filled_train))

SeriousDlqin2yrs
0    0.933001
1    0.066999
dtype: float64

The data is highly imbalanced: only about 6.7% of rows belong to the positive class (`SeriousDlqin2yrs = 1`).

In [26]:
# Summary statistics of the imputed training frame; use the min/max rows to spot implausible values.
filled_train.describe()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,149391.0,149391.0,149391.0,149391.0,149391.0,149391.0,149391.0,149391.0,149391.0,149391.0,149391.0
mean,0.066999,6.071087,52.306237,0.393886,354.43674,5369.444,8.480892,0.23812,1.022391,0.212503,0.740393
std,0.250021,250.263672,14.725962,3.852953,2041.843455,13174.57,5.136515,3.826165,1.130196,3.810523,1.108272
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.030132,41.0,0.0,0.177441,1600.0,5.0,0.0,0.0,0.0,0.0
50%,0.0,0.154235,52.0,0.0,0.368234,4400.0,8.0,0.0,1.0,0.0,0.0
75%,0.0,0.556494,63.0,0.0,0.875279,7400.0,11.0,0.0,2.0,0.0,1.0
max,1.0,50708.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0


# Model Building