In [1]:
%matplotlib inline

In [2]:
import numpy as np

In [3]:
import pandas as pd

In [4]:
from sklearn.tree import DecisionTreeClassifier

In [5]:
from sklearn.feature_extraction.text import CountVectorizer  #DT does not take strings as input for the model fit step....

In [6]:
credit_df = pd.read_csv("credit.csv")

In [7]:
credit_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
months_loan_duration,1000.0,20.903,12.058814,4.0,12.0,18.0,24.0,72.0
amount,1000.0,3271.258,2822.736876,250.0,1365.5,2319.5,3972.25,18424.0
percent_of_income,1000.0,2.973,1.118715,1.0,2.0,3.0,4.0,4.0
years_at_residence,1000.0,2.845,1.103718,1.0,2.0,3.0,4.0,4.0
age,1000.0,35.546,11.375469,19.0,27.0,33.0,42.0,75.0
existing_loans_count,1000.0,1.407,0.577654,1.0,1.0,1.0,2.0,4.0
dependents,1000.0,1.155,0.362086,1.0,1.0,1.0,1.0,2.0


In [8]:
credit_df.head(5).T  #let us look at the raw data. As is evident, there are lot of missing values 

Unnamed: 0,0,1,2,3,4
checking_balance,< 0 DM,1 - 200 DM,unknown,< 0 DM,< 0 DM
months_loan_duration,6,48,12,42,24
credit_history,critical,good,critical,good,poor
purpose,furniture/appliances,furniture/appliances,education,furniture/appliances,car
amount,1169,5951,2096,7882,4870
savings_balance,unknown,< 100 DM,< 100 DM,< 100 DM,< 100 DM
employment_duration,> 7 years,1 - 4 years,4 - 7 years,4 - 7 years,1 - 4 years
percent_of_income,4,2,2,2,3
years_at_residence,4,2,3,4,4
age,67,22,49,45,53


In [9]:
credit_df.shape

(1000, 17)

In [10]:
credit_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
months_loan_duration,1000.0,20.903,12.058814,4.0,12.0,18.0,24.0,72.0
amount,1000.0,3271.258,2822.736876,250.0,1365.5,2319.5,3972.25,18424.0
percent_of_income,1000.0,2.973,1.118715,1.0,2.0,3.0,4.0,4.0
years_at_residence,1000.0,2.845,1.103718,1.0,2.0,3.0,4.0,4.0
age,1000.0,35.546,11.375469,19.0,27.0,33.0,42.0,75.0
existing_loans_count,1000.0,1.407,0.577654,1.0,1.0,1.0,2.0,4.0
dependents,1000.0,1.155,0.362086,1.0,1.0,1.0,1.0,2.0


In [11]:
credit_df.info()  # many columns are of type object i.e. strings. These need to be converted to ordinal type

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
checking_balance        1000 non-null object
months_loan_duration    1000 non-null int64
credit_history          1000 non-null object
purpose                 1000 non-null object
amount                  1000 non-null int64
savings_balance         1000 non-null object
employment_duration     1000 non-null object
percent_of_income       1000 non-null int64
years_at_residence      1000 non-null int64
age                     1000 non-null int64
other_credit            1000 non-null object
housing                 1000 non-null object
existing_loans_count    1000 non-null int64
job                     1000 non-null object
dependents              1000 non-null int64
phone                   1000 non-null object
default                 1000 non-null object
dtypes: int64(7), object(10)
memory usage: 132.9+ KB


In [12]:
# Decision tree in Python can take only numerical / categorical colums. It cannot take string / obeject types. 
# The following code loops through each column and checks if the column type is object then converts those columns
# into categorical with each distinct value becoming a category or code.

for feature in credit_df.columns: # Loop through all columns in the dataframe
    if credit_df[feature].dtype == 'object': # Only apply for columns with categorical strings
        credit_df[feature] = pd.Categorical(credit_df[feature]).codes # Replace strings with an integer

In [13]:
credit_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
checking_balance        1000 non-null int8
months_loan_duration    1000 non-null int64
credit_history          1000 non-null int8
purpose                 1000 non-null int8
amount                  1000 non-null int64
savings_balance         1000 non-null int8
employment_duration     1000 non-null int8
percent_of_income       1000 non-null int64
years_at_residence      1000 non-null int64
age                     1000 non-null int64
other_credit            1000 non-null int8
housing                 1000 non-null int8
existing_loans_count    1000 non-null int64
job                     1000 non-null int8
dependents              1000 non-null int64
phone                   1000 non-null int8
default                 1000 non-null int8
dtypes: int64(7), int8(10)
memory usage: 64.5 KB


In [14]:
credit_df.head(5).T

Unnamed: 0,0,1,2,3,4
checking_balance,1,0,3,1,1
months_loan_duration,6,48,12,42,24
credit_history,0,1,0,1,3
purpose,4,4,3,4,1
amount,1169,5951,2096,7882,4870
savings_balance,4,2,2,2,2
employment_duration,3,0,1,1,0
percent_of_income,4,2,2,2,3
years_at_residence,4,2,3,4,4
age,67,22,49,45,53


In [15]:
col_labels = [ 'checking_balance', 'months_loan_duration' ,'credit_history' , 'purpose, amount', 'savings_balance' , 'employment_duration'
, 'percent_of_income', 'years_at_residence', 'age', 'other_credit', 'housing', 'existing_loans_count', 'job', 'dependents', 'phone', 'default']

In [16]:
credit_df.columns

Index(['checking_balance', 'months_loan_duration', 'credit_history', 'purpose',
       'amount', 'savings_balance', 'employment_duration', 'percent_of_income',
       'years_at_residence', 'age', 'other_credit', 'housing',
       'existing_loans_count', 'job', 'dependents', 'phone', 'default'],
      dtype='object')

In [17]:
# splitting data into training and test set for independent attributes
train_set = credit_df.head(700) # Up to the last initial training set row
test_set = credit_df.tail(300) # Past the last initial training set row

In [18]:
train_set.head(4).T

Unnamed: 0,0,1,2,3
checking_balance,1,0,3,1
months_loan_duration,6,48,12,42
credit_history,0,1,0,1
purpose,4,4,3,4
amount,1169,5951,2096,7882
savings_balance,4,2,2,2
employment_duration,3,0,1,1
percent_of_income,4,2,2,2
years_at_residence,4,2,3,4
age,67,22,49,45


In [19]:
#train_set.pop("default")

In [20]:
# capture the target column ("default") into separate vectors for training set and test set
train_labels = train_set.pop("default")
test_labels = test_set.pop("default")

In [21]:
# invoking the decision tree classifier function. Using 'entropy' method of finding the split columns. Other option 
# could be gini index.  Restricting the depth of the tree to 5 (no particular reason for selecting this)

#dt_model = DecisionTreeClassifier(criterion = 'entropy' , max_depth = 5, random_state = 100)
                                  
dt_model = DecisionTreeClassifier(criterion = 'entropy' )

In [22]:
dt_model.fit(train_set, train_labels)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [23]:
dt_model.score(train_set , train_labels)

1.0

In [24]:
test_pred = dt_model.predict(test_set)

In [25]:
dt_model.score(test_set , test_labels)

0.6866666666666666

### Regularising the Decision Tree

In [26]:
dt_model = DecisionTreeClassifier(criterion = 'entropy', max_depth = 5, min_samples_leaf=5 )
dt_model.fit(train_set, train_labels)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [27]:
test_pred = dt_model.predict(test_set)

In [28]:
dt_model.score(train_set , train_labels)

0.7828571428571428

In [29]:
dt_model.score(test_set , test_labels)

0.7233333333333334

# Ensemble Learning - Bagging

In [62]:
from sklearn.ensemble import BaggingClassifier

In [63]:
dt_model = DecisionTreeClassifier(criterion = 'entropy', max_depth = 5, min_samples_leaf=5 )

In [64]:
#bgcl = BaggingClassifier(base_estimator=dt_model, n_estimators=50)
#bgcl = BaggingClassifier(n_estimators=50)

In [65]:
from sklearn.linear_model import LogisticRegression

In [66]:
lrModel = LogisticRegression()

In [67]:
bgcl = BaggingClassifier(base_estimator=lrModel, n_estimators=50)

In [68]:
bgcl = bgcl.fit(train_set, train_labels)





In [69]:
bgcl.score(train_set , train_labels)

0.7271428571428571

In [70]:
bgcl.score(test_set , test_labels)

0.7233333333333334

In [71]:
NDP = test_set.head(1)

In [72]:
bgcl.predict(NDP)

array([0], dtype=int8)

# Ensemble Learning - AdaBoosting

In [73]:
from sklearn.ensemble import AdaBoostClassifier

In [74]:
dt_model = DecisionTreeClassifier(criterion = 'entropy', max_depth = 5, min_samples_leaf=5 )

In [75]:
#abcl = AdaBoostClassifier(base_estimator=dt_model, n_estimators=50)

In [76]:
abcl = AdaBoostClassifier(base_estimator=lrModel, n_estimators=50)

In [77]:
abcl = abcl.fit(train_set, train_labels)



In [78]:
abcl.score(train_set, train_labels)

0.72

In [79]:
abcl.score(test_set , test_labels)

0.72

# Ensemble Learning - GradientBoost

In [80]:
from sklearn.ensemble import GradientBoostingClassifier

In [81]:
gbcl = GradientBoostingClassifier(n_estimators = 50, learning_rate = 0.09, max_depth=5, min_samples_leaf=5 )

In [82]:
gbcl = GradientBoostingClassifier(n_estimators = 50)

In [83]:
gbcl = gbcl.fit(train_set, train_labels)

In [84]:
test_pred = gbcl.predict(test_set)
gbcl.score(test_set , test_labels)

0.7566666666666667

# Ensemble RandomForest Classifier

In [85]:
from sklearn.ensemble import RandomForestClassifier

In [86]:
rfcl = RandomForestClassifier(n_estimators = 6)

In [87]:
rfcl = rfcl.fit(train_set, train_labels)

In [88]:
test_pred = rfcl.predict(test_set)
rfcl.score(test_set , test_labels)

0.7266666666666667