We are going to build a model that predicts whether someone applying for a loan is likely to default (defaulter) or not (non-defaulter). The independent variables include checking account balance, credit history, loan purpose, loan amount, etc.

In [158]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import metrics

In [143]:
# Load the credit data from credit.csv (the path is relative to the notebook's working directory).
credit = pd.read_csv('credit.csv')

In [144]:
# Dimensions of the frame as (rows, columns).
credit.shape

(1000, 17)

In [145]:
# Preview the first rows to see what each column looks like.
credit.head()

Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,other_credit,housing,existing_loans_count,job,dependents,phone,default
0,< 0 DM,6,critical,furniture/appliances,1169,unknown,> 7 years,4,4,67,none,own,2,skilled,1,yes,no
1,1 - 200 DM,48,good,furniture/appliances,5951,< 100 DM,1 - 4 years,2,2,22,none,own,1,skilled,1,no,yes
2,unknown,12,critical,education,2096,< 100 DM,4 - 7 years,2,3,49,none,own,1,unskilled,2,no,no
3,< 0 DM,42,good,furniture/appliances,7882,< 100 DM,4 - 7 years,2,4,45,none,other,1,skilled,2,no,no
4,< 0 DM,24,poor,car,4870,< 100 DM,1 - 4 years,3,4,53,none,other,2,skilled,2,no,yes


In [146]:
# Summary statistics for every column; include='all' adds the non-numeric columns
# (count / unique / top / freq) alongside the numeric ones.
credit.describe(include='all')

Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,other_credit,housing,existing_loans_count,job,dependents,phone,default
count,1000,1000.0,1000,1000,1000.0,1000,1000,1000.0,1000.0,1000.0,1000,1000,1000.0,1000,1000.0,1000,1000
unique,4,,5,6,,5,5,,,,3,3,,4,,2,2
top,unknown,,good,furniture/appliances,,< 100 DM,1 - 4 years,,,,none,own,,skilled,,no,no
freq,394,,530,473,,603,339,,,,814,713,,630,,596,700
mean,,20.903,,,3271.258,,,2.973,2.845,35.546,,,1.407,,1.155,,
std,,12.058814,,,2822.736876,,,1.118715,1.103718,11.375469,,,0.577654,,0.362086,,
min,,4.0,,,250.0,,,1.0,1.0,19.0,,,1.0,,1.0,,
25%,,12.0,,,1365.5,,,2.0,2.0,27.0,,,1.0,,1.0,,
50%,,18.0,,,2319.5,,,3.0,3.0,33.0,,,1.0,,1.0,,
75%,,24.0,,,3972.25,,,4.0,4.0,42.0,,,2.0,,1.0,,


In [147]:
# Column dtypes and non-null counts.
credit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   checking_balance      1000 non-null   object
 1   months_loan_duration  1000 non-null   int64 
 2   credit_history        1000 non-null   object
 3   purpose               1000 non-null   object
 4   amount                1000 non-null   int64 
 5   savings_balance       1000 non-null   object
 6   employment_duration   1000 non-null   object
 7   percent_of_income     1000 non-null   int64 
 8   years_at_residence    1000 non-null   int64 
 9   age                   1000 non-null   int64 
 10  other_credit          1000 non-null   object
 11  housing               1000 non-null   object
 12  existing_loans_count  1000 non-null   int64 
 13  job                   1000 non-null   object
 14  dependents            1000 non-null   int64 
 15  phone                 1000 non-null   o

In [148]:
# Many columns are stored with the generic 'object' dtype; convert them to categorical columns.

# Collect the names of the object-dtype columns first, then convert each one.
object_columns = [name for name in credit.columns if credit[name].dtype == 'object']
for name in object_columns:
    credit[name] = credit[name].astype('category')

credit.info()  # the former object columns are now reported as category

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   checking_balance      1000 non-null   category
 1   months_loan_duration  1000 non-null   int64   
 2   credit_history        1000 non-null   category
 3   purpose               1000 non-null   category
 4   amount                1000 non-null   int64   
 5   savings_balance       1000 non-null   category
 6   employment_duration   1000 non-null   category
 7   percent_of_income     1000 non-null   int64   
 8   years_at_residence    1000 non-null   int64   
 9   age                   1000 non-null   int64   
 10  other_credit          1000 non-null   category
 11  housing               1000 non-null   category
 12  existing_loans_count  1000 non-null   int64   
 13  job                   1000 non-null   category
 14  dependents            1000 non-null   int64   
 15  phone

In [149]:
# Frequency of each level in the main categorical columns
# (e.g. checking_balance: 394 'unknown', 274 '< 0 DM', ...).
columns_to_inspect = [
    'checking_balance',
    'credit_history',
    'employment_duration',
    'savings_balance',
    'job',
    'default',
]
for column in columns_to_inspect:
    print(credit[column].value_counts())

unknown       394
< 0 DM        274
1 - 200 DM    269
> 200 DM       63
Name: checking_balance, dtype: int64
good         530
critical     293
poor          88
very good     49
perfect       40
Name: credit_history, dtype: int64
1 - 4 years    339
> 7 years      253
4 - 7 years    174
< 1 year       172
unemployed      62
Name: employment_duration, dtype: int64
< 100 DM         603
unknown          183
100 - 500 DM     103
500 - 1000 DM     63
> 1000 DM         48
Name: savings_balance, dtype: int64
skilled       630
unskilled     200
management    148
unemployed     22
Name: job, dtype: int64
no     700
yes    300
Name: default, dtype: int64


In [150]:
# Several categorical columns have a natural order, so their levels are represented by integers.
# 'unknown' has no position in that order and is coded as -1.

replaceStruct = {
    'checking_balance': {
        '< 0 DM': 1,
        '1 - 200 DM': 2,
        '> 200 DM': 3,
        'unknown': -1,
    },
    'credit_history': {
        'perfect': 5,
        'very good': 4,
        'good': 3,
        'poor': 2,
        'critical': 1,
    },
    'savings_balance': {
        '< 100 DM': 1,
        '100 - 500 DM': 2,
        '500 - 1000 DM': 3,
        '> 1000 DM': 4,
        'unknown': -1,
    },
    'employment_duration': {
        '< 1 year': 2,
        '1 - 4 years': 3,
        '4 - 7 years': 4,
        '> 7 years': 5,
        'unemployed': 1,
    },
    # Binary yes/no columns are coded as 0/1.
    'phone': {'no': 0, 'yes': 1},
    'default': {'no': 0, 'yes': 1},
}

# Columns without a natural order; these are one-hot encoded instead.
OneHotCols = ['purpose', 'housing', 'other_credit', 'job']

In [151]:
# Encode the ordinal / binary columns with the integer codes from replaceStruct.
# Each column is mapped explicitly instead of calling DataFrame.replace on category-dtype
# columns (that behaviour is deprecated in recent pandas). The cast to int64 makes a level
# that is missing from the mapping raise here rather than silently staying as text.
for column, mapping in replaceStruct.items():
    credit[column] = credit[column].astype(object).map(mapping).astype('int64')

# One-hot encode the unordered columns. dtype='uint8' keeps the 0/1 indicator columns
# on pandas versions whose get_dummies default is bool.
credit = pd.get_dummies(credit, columns=OneHotCols, dtype='uint8')
credit.head()

Unnamed: 0,checking_balance,months_loan_duration,credit_history,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,existing_loans_count,...,housing_other,housing_own,housing_rent,other_credit_bank,other_credit_none,other_credit_store,job_management,job_skilled,job_unemployed,job_unskilled
0,1,6,1,1169,-1,5,4,4,67,2,...,0,1,0,0,1,0,0,1,0,0
1,2,48,3,5951,1,3,2,2,22,1,...,0,1,0,0,1,0,0,1,0,0
2,-1,12,1,2096,1,4,2,3,49,1,...,0,1,0,0,1,0,0,0,0,1
3,1,42,3,7882,1,4,2,4,45,1,...,1,0,0,0,1,0,0,1,0,0
4,1,24,2,4870,1,3,3,4,53,2,...,1,0,0,0,1,0,0,1,0,0


In [153]:
# Re-check dtypes and non-null counts after the encoding step.
credit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 29 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   checking_balance              1000 non-null   int64
 1   months_loan_duration          1000 non-null   int64
 2   credit_history                1000 non-null   int64
 3   amount                        1000 non-null   int64
 4   savings_balance               1000 non-null   int64
 5   employment_duration           1000 non-null   int64
 6   percent_of_income             1000 non-null   int64
 7   years_at_residence            1000 non-null   int64
 8   age                           1000 non-null   int64
 9   existing_loans_count          1000 non-null   int64
 10  dependents                    1000 non-null   int64
 11  phone                         1000 non-null   int64
 12  default                       1000 non-null   int64
 13  purpose_business              1000

### Split Data

In [160]:
from sklearn.model_selection import train_test_split

# Separate the predictors (x) from the target column 'default' (y).
x = credit.drop(columns='default')
y = credit['default']

In [161]:
# 70% of the rows go to training and the rest to testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.7,
    random_state=1,
)

### Build Decision Tree model

We will build our model with scikit-learn's `DecisionTreeClassifier` class, using the default 'gini' criterion to choose splits. Another option is 'entropy'.

In [166]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# Fit a tree with no depth limit, splitting on Gini impurity.
# fit() returns the estimator itself, so dTree is the fitted model.
dTree = DecisionTreeClassifier(criterion='gini', random_state=1).fit(x_train, y_train)
dTree

DecisionTreeClassifier(random_state=1)

In [169]:
# Accuracy on the training split, then on the test split.
# The training score is 100% while the test score is only about 69%, so the tree is overfitting.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(dTree.score(features, labels))

1.0
0.6933333333333334


### Reducing overfitting

In [184]:
# Limiting the depth of the decision tree is one way to reduce overfitting.

dTree = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=1).fit(x_train, y_train)

# Accuracy on the training split, then on the test split.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(dTree.score(features, labels))

0.7528571428571429
0.7433333333333333


#                             Ensemble Learning - Bagging

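Many models are trained independently (they can be built in parallel), each on a bootstrap sample of the training data, and their predictions are combined rather than a single best model being selected.
If the problem is a regression problem, i.e. the target is numerical, the result is the average of all the models' predictions.
If the problem is a classification problem, i.e. the target is categorical, the result is decided by a vote among the models.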

In [189]:
from sklearn.ensemble import BaggingClassifier

# Bag 50 copies of the current dTree model.
# The base model is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was removed in 1.4;
# the positional form works with both. n_estimators is the number of models in the
# ensemble (not the number of branches).
bagging = BaggingClassifier(dTree, n_estimators=50, random_state=1)

bagging.fit(x_train, y_train)

# Accuracy on the training split, then on the test split.
print(bagging.score(x_train, y_train))
print(bagging.score(x_test, y_test))


0.7785714285714286
0.7333333333333333


#                             Ensemble Learning - AdaBoosting (adaptive boosting)

One model is created first, and each subsequent model is created by learning from the errors of the previous one. The process is sequential.

In [197]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with n_estimators=50 and the library's default base learner.
boosting = AdaBoostClassifier(n_estimators=50, random_state=1).fit(x_train, y_train)

# Accuracy on the training split, then on the test split.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(boosting.score(features, labels))

0.8028571428571428
0.7366666666666667


#                             Ensemble Learning - Gradient Boosting

In [199]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with n_estimators=50; all other settings are left at their defaults.
gradient = GradientBoostingClassifier(n_estimators=50, random_state=1).fit(x_train, y_train)

# Accuracy on the training split, then on the test split.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(gradient.score(features, labels))

0.8657142857142858
0.74


#                             Ensemble Learning -  Random Forest

In [203]:
from sklearn.ensemble import RandomForestClassifier

# max_features makes the forest consider only a selection of the columns rather than all of
# them, which tends to give a better result. A usual starting point is the square root of
# the number of columns.
randomforest = RandomForestClassifier(
    n_estimators=50,
    random_state=1,
    max_features=12,
).fit(x_train, y_train)

# Accuracy on the training split, then on the test split.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(randomforest.score(features, labels))

0.9985714285714286
0.7766666666666666
