In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the credit dataset from credit.csv (path is relative to the working directory).
credit_df = pd.read_csv("credit.csv")

In [None]:
# Preview the first 10 rows of the raw data.
credit_df.head(10)  

In [None]:
# Summary statistics of the dataframe.
credit_df.describe()

In [None]:
# Column dtypes and non-null counts.
credit_df.info()

In [None]:
# (number of rows, number of columns)
credit_df.shape

In [None]:
# Summary statistics again (same call as the earlier describe() cell).
credit_df.describe()

In [None]:
# Class balance of the response variable 'default'.
credit_df['default'].value_counts()

In [None]:
# Distinct values of 'checking_balance', sorted.
np.sort(credit_df['checking_balance'].unique())

In [None]:
# Many columns are of dtype object (i.e. strings); the next cell replaces them with integer category codes.
credit_df.info()

In [None]:
# The decision tree used below cannot be fitted on string / object columns, so every
# object-typed column is replaced by integer codes: each distinct value in a column
# becomes one category, and the column then holds that category's code.
# Note: this overwrites the columns of credit_df in place.

# Collect the names of all object-typed columns first ...
object_columns = [name for name in credit_df.columns if credit_df[name].dtype == 'object']

# ... then swap each one for its integer category codes.
for name in object_columns:
    credit_df[name] = pd.Categorical(credit_df[name]).codes

In [None]:
# Other encoding approaches exist, for example scikit-learn's LabelEncoder,
# or an explicit manual mapping such as the (commented-out) one below:
#credit_df['checking_balance'] = credit_df['checking_balance'].replace({'< 0 DM':0, '1 - 200 DM':1, '> 200 DM':2, 'unknown':3 })


In [None]:
# Re-check the dtypes after the object columns were replaced by integer codes.
credit_df.info()

In [None]:
# Preview the first 10 rows of the encoded data.
credit_df.head(10)

In [None]:
# Importing train-test-split 
from sklearn.model_selection import train_test_split

In [None]:
# Response variable: the 'default' column
y = credit_df['default']

# Feature matrix: every column except the response
X = credit_df.drop(columns='default')

In [None]:
# Splitting the data into train and test: 30% of the rows are held out for testing,
# and random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state = 0)
#X_train.head()

In [None]:
# Instantiate the decision tree classifier.
# The depth of the tree is restricted to 10 (no particular reason for selecting this value).

                                  
dt_model = DecisionTreeClassifier( max_depth = 10, random_state = 0 )

In [None]:
# Fit the tree on the training split.
dt_model.fit(X_train, y_train)

In [None]:
# Let's check the evaluation metrics of our first (depth-10) model

# Importing classification report, confusion matrix and accuracy score from sklearn metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Making predictions on the held-out test split
y_pred = dt_model.predict(X_test)



In [None]:
# Printing confusion matrix and accuracy.
# In scikit-learn's confusion_matrix, rows are the true classes and columns are the predicted classes.
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

In [None]:
# Printing the classification report for the test-set predictions
print(classification_report(y_test, y_pred))

In [None]:
# Report the model's score on the training split and on the test split;
# a large gap between the two suggests overfitting.
labelled_splits = (
    ('Training score', X_train, y_train),
    ('Test score', X_test, y_test),
)
for heading, split_X, split_y in labelled_splits:
    print(heading)
    print(dt_model.score(split_X, split_y))

In [None]:

# Feature importances of the fitted tree. The importance of a feature is computed as the
# (normalized) total reduction of the criterion brought by that feature; it is also known
# as the Gini importance.
importance_table = pd.DataFrame(
    {"Imp": dt_model.feature_importances_},
    index=X_train.columns,
)
print(importance_table)

# Graphic representation of Decision Tree

Open the Anaconda Prompt (Start menu → Anaconda Prompt).

Visit: https://anaconda.org/conda-forge/python-graphviz

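To install this package with conda, run one of the following commands (the first two are the usual choices; the `broken` and `cf201901` labels are special-purpose conda-forge channels and are rarely what you want):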

conda install python-graphviz 

conda install -c conda-forge python-graphviz 

conda install -c conda-forge/label/broken python-graphviz 

conda install -c conda-forge/label/cf201901 python-graphviz 



In [None]:
# Display names for the two classes, used when exporting the tree.
# NOTE(review): assumes the encoded 'default' classes, in ascending order, mean 'No' then 'Yes' — confirm.
class_label = ['No', 'Yes']

In [None]:
from IPython.display import Image  
#import pydotplus as pydot
from sklearn import tree
from os import system

# Write the fitted tree to a Graphviz .dot file.
# - The path is a raw string: '\c' is not a valid escape sequence, and recent Python
#   versions warn about it in a normal string literal. The resulting path is unchanged.
# - NOTE: the path is hard-coded to the E: drive; adjust it if that drive is not available.
# - The `with` block closes the file even if the export raises.
# - export_graphviz only returns the dot source when out_file is None, so its return
#   value (None here) is not stored.
with open(r'e:\credit_tree_1.dot', 'w') as Credit_Tree_File:
    tree.export_graphviz(
        dt_model,
        out_file=Credit_Tree_File,
        feature_names=list(X_train),      # column names of the training frame
        class_names=list(class_label),
    )




http://webgraphviz.com/

In [None]:
# You can also copy the script in the .dot file and paste it at http://webgraphviz.com/ to get a tree view,
# or create a .png as below

### system("dot -Tpng e:\credit_tree_1.dot -o e:\credit_tree_1.png")
### Image("e:\credit_tree_1.png")

# Hyperparameter tuning


In [None]:
# Hyperparameter tuning: cross-validated grid search over the tree depth
from sklearn.model_selection import GridSearchCV

# number of folds for k-fold cross-validation
n_folds = 3

# candidate values for the hyperparameter being tuned
params = {'max_depth': range(2, 20)}

# estimator whose hyperparameters are searched
model = DecisionTreeClassifier(random_state=100)

# one model is fitted per candidate and fold; training scores are kept as well
model1 = GridSearchCV(
    model,
    param_grid=params,
    cv=n_folds,
    return_train_score=True,
)
model1.fit(X_train, y_train)

In [None]:
# Per-candidate cross-validation results of the grid search (used for the plot below).
scores = model1.cv_results_


In [None]:
# Plot mean training and mean cross-validation accuracy against max_depth
depths = scores["param_max_depth"]

fig, ax = plt.subplots()
ax.plot(depths, scores["mean_train_score"], label="training accuracy")
ax.plot(depths, scores["mean_test_score"], label="test accuracy")
ax.set_xlabel("max_depth")
ax.set_ylabel("Accuracy")
ax.legend()
plt.show()

In [None]:
# Best parameter setting found by the grid search and its mean cross-validated score.
print("Best Hyper Parameters:",model1.best_params_)
print("Best Score:",model1.best_score_)

### Tune multiple parameters

In [None]:
# Tune several hyperparameters of the decision tree at once
from sklearn.model_selection import GridSearchCV

# search space: split criterion, tree depth (2-9) and minimum samples per leaf (1-7)
params = {
    'criterion': ('gini', 'entropy'),
    'max_depth': range(2, 10),
    'min_samples_leaf': range(1, 8),
}

# estimator whose hyperparameters are searched
model = DecisionTreeClassifier(random_state=100)

# no cv argument is passed, so GridSearchCV's default cross-validation is used
model1 = GridSearchCV(model, param_grid=params)
model1.fit(X_train, y_train)

In [None]:
# Best combination found by the multi-parameter grid search and its mean cross-validated score.
print("Best Hyper Parameters:",model1.best_params_)
print("Best Score:",model1.best_score_)

##                                      Regularising the Decision Tree

In [None]:
# Refit dt_model as a shallower tree (max_depth = 5). This rebinds the name dt_model,
# replacing the depth-10 model fitted earlier in the notebook.
dt_model = DecisionTreeClassifier( max_depth = 5, random_state=100)
dt_model.fit(X_train, y_train)

In [None]:

#print (pd.DataFrame(dt_model.feature_importances_, columns = ["Imp"], index = X_train.columns))


In [None]:
# Test-set predictions of the current dt_model (overwrites the earlier y_pred).
y_pred = dt_model.predict(X_test)

In [None]:
# Score of the current dt_model on the test split.
dt_model.score(X_test , y_test)

#                             Ensemble Learning - Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble of 10 estimators built from dt_model (as last defined above), with a
# fixed seed for reproducibility.
# The base learner is passed positionally on purpose: scikit-learn renamed the keyword
# `base_estimator` to `estimator` in 1.2 and removed the old name in 1.4, while the first
# positional parameter has been the base learner under both names.
bgcl = BaggingClassifier(dt_model, n_estimators=10, random_state=100)

bgcl = bgcl.fit(X_train, y_train)


In [None]:
# Predict with the bagging ensemble (overwrites y_pred) and show its score on the test split.
y_pred = bgcl.predict(X_test)
bgcl.score(X_test, y_test)

# Ensemble RandomForest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 5 trees and a fixed seed, fitted on the training split.
rfcl = RandomForestClassifier(n_estimators = 5, random_state=100)
rfcl = rfcl.fit(X_train, y_train)


In [None]:
# Predict with the random forest and show its score on the test split.
test_pred = rfcl.predict(X_test)
rfcl.score(X_test , y_test)

# Hyperparameter tuning of RandomForest

In [None]:
# Parameter grid to search: number of trees (50 to 145 in steps of 5) and maximum tree depth.
param_grid = {
    'n_estimators': range(50,150,5) ,
    'max_depth': [5,10,15,20] 
    
}
# Create a base model
rf = RandomForestClassifier(random_state=0)
# Instantiate the grid search model (5-fold cross-validation; n_jobs = -1 runs the fits in parallel).
# GridSearchCV and RandomForestClassifier are imported in earlier cells.
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv=5, 
                           n_jobs = -1)

In [None]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)

In [None]:
# Print the best mean cross-validated score and the hyperparameters that achieved it
print('We can get accuracy of',grid_search.best_score_,'using',grid_search.best_params_)