In [1]:
import os
import pandas as pd

In [3]:
# Optional: change into the data directory first if the CSV is not in the working directory.
#os.chdir("D:\ML\Decision tree\Data")
# Read the credit history data from the current working directory.
dat=pd.read_csv("credit_history.csv")
# Preview the first five rows.
dat.head()

Unnamed: 0,default,amount,grade,years,ownership,income,age
0,0,1000,B,2.0,RENT,19200.0,24
1,1,6500,A,2.0,MORTGAGE,66000.0,28
2,0,2400,A,2.0,RENT,60000.0,36
3,0,10000,C,3.0,RENT,62000.0,24
4,1,4000,C,2.0,RENT,20000.0,28


Target customers who are likely to default

In [4]:
# Count missing values per column.
dat.isnull().sum()

default        0
amount         0
grade          0
years        279
ownership      0
income         0
age            0
dtype: int64

In [5]:
# Summary statistics for the 'years' column (count excludes missing values).
dat['years'].describe()

count    7448.000000
mean        6.086332
std         6.700758
min         0.000000
25%         2.000000
50%         4.000000
75%         8.000000
max        62.000000
Name: years, dtype: float64

In [6]:
# Impute missing 'years' with the column median.
# The result is assigned back instead of calling fillna(..., inplace=True) on the
# selected column: the chained in-place form may act on a temporary object under
# pandas copy-on-write and leave `dat` unchanged. Computing the median from the
# data also replaces the hard-coded constant 4.
dat['years']=dat['years'].fillna(dat['years'].median())

#Independent variables
X=dat.drop("default",axis=1)

In [7]:
X.head()

Unnamed: 0,amount,grade,years,ownership,income,age
0,1000,B,2.0,RENT,19200.0,24
1,6500,A,2.0,MORTGAGE,66000.0,28
2,2400,A,2.0,RENT,60000.0,36
3,10000,C,3.0,RENT,62000.0,24
4,4000,C,2.0,RENT,20000.0,28


In [8]:
# Number of distinct values in the 'grade' column.
len(X['grade'].unique())

7

In [9]:
#Create Dummies
# One-hot encode the categorical columns of X; numeric columns are passed through.
X=pd.get_dummies(X)
# Target variable: the 'default' column of the original frame.
y=dat['default']

In [10]:
X.head()

Unnamed: 0,amount,years,income,age,grade_A,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,ownership_MORTGAGE,ownership_OTHER,ownership_OWN,ownership_RENT
0,1000,2.0,19200.0,24,0,1,0,0,0,0,0,0,0,0,1
1,6500,2.0,66000.0,28,1,0,0,0,0,0,0,1,0,0,0
2,2400,2.0,60000.0,36,1,0,0,0,0,0,0,0,0,0,1
3,10000,3.0,62000.0,24,0,0,1,0,0,0,0,0,0,0,1
4,4000,2.0,20000.0,28,0,0,1,0,0,0,0,0,0,0,1


In [11]:
y.head()

0    0
1    1
2    0
3    0
4    1
Name: default, dtype: int64

In [12]:
X.shape

(7727, 15)

In [13]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the split is repeatable.
import sklearn.model_selection as model_selection
X_train,X_test,y_train,y_test=model_selection.train_test_split(X,y,test_size=0.2,random_state=200)

Bagging: A procedure for reducing the variance of a model.

Each tree is built on a bootstrap dataset, independent of other trees

In [14]:
#Bagging Classifier: Bootstrap sampling with DT as base learner
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [15]:
# Bagging ensemble of 20 decision trees.
# oob_score=True requests an out-of-bag score after fitting; n_jobs=-1 uses all available cores.
clf=BaggingClassifier(oob_score=True,n_jobs=-1,n_estimators=20,random_state=400,
                      base_estimator=DecisionTreeClassifier())

In [16]:
clf.fit(X_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=20,
                  n_jobs=-1, oob_score=True, random_state=400)

In [17]:
# Out-of-bag score of the fitted ensemble (available because oob_score=True was set).
clf.oob_score_

0.6256269212101602

In [16]:
#Mean accuracy (fraction of correct predictions) on the held-out test split
clf.score(X_test,y_test)

0.6513583441138422

In [18]:
# Show the n_estimators values produced by range(100,200,20).
set(range(100,200,20))

{100, 120, 140, 160, 180}

In [19]:
#Parameter tuning
# Sweep the number of trees and report, for each setting, the out-of-bag
# score next to the accuracy on the held-out test split.
for w in range(100,200,20):
    clf=BaggingClassifier(oob_score=True,n_jobs=-1,n_estimators=w,random_state=400,
                          base_estimator=DecisionTreeClassifier())
    clf.fit(X_train,y_train)
    oob=clf.oob_score_
    scr=clf.score(X_test,y_test)
    print('For n_estimators = '+str(w))
    print('OOB score is '+str(oob))
    # scr is the test-set accuracy; it was previously printed under a second "OOB score" label.
    print('Test score is '+str(scr))
    print('************************')

For n_estimators = 100
OOB score is 0.6471444750040447
OOB score is 0.6746442432082794
************************
For n_estimators = 120
OOB score is 0.6477916194790487
OOB score is 0.6707632600258733
************************
For n_estimators = 140
OOB score is 0.6495712667853099
OOB score is 0.6727037516170763
************************
For n_estimators = 160
OOB score is 0.651674486329073
OOB score is 0.6714100905562742
************************
For n_estimators = 180
OOB score is 0.6508655557353179
OOB score is 0.6714100905562742
************************


In [20]:
#Finalizing on a bagging model with 100 trees (matches n_estimators=100 in the call)
clf=BaggingClassifier(oob_score=True,n_jobs=-1,n_estimators=100,random_state=400,
                      base_estimator=DecisionTreeClassifier())
clf.fit(X_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=100,
                  n_jobs=-1, oob_score=True, random_state=400)

In [21]:
#Mean accuracy (fraction of correct predictions) on the held-out test split
clf.score(X_test,y_test) 

0.6746442432082794

In [22]:
# Feature Importance
# Number of fitted base estimators held by the ensemble.
len(clf.estimators_)

100

In [23]:
# The bagging ensemble has no importance attribute of its own here, so gather
# the importance vector of every fitted tree; they are averaged in a later cell.
import numpy as np
imp=[tree.feature_importances_ for tree in clf.estimators_] #one importance vector per tree

In [24]:
imp[0]

array([0.19700134, 0.17624424, 0.27210945, 0.17167482, 0.05989222,
       0.03158916, 0.01158755, 0.01190346, 0.00811315, 0.00404819,
       0.00118149, 0.01788583, 0.00346169, 0.00929994, 0.02400746])

In [26]:
# Average the per-tree importance vectors element-wise (axis=0 runs across trees).
# NOTE: this rebinds `imp` from the list of per-tree vectors to the averaged vector,
# so re-running this cell on its own would average the already-averaged array.
imp=np.mean(imp,axis=0)
imp

array([0.22591154, 0.16335194, 0.27435677, 0.17076598, 0.05785835,
       0.02551805, 0.01037358, 0.00771893, 0.00557347, 0.00306825,
       0.00091687, 0.01878629, 0.0014107 , 0.01258523, 0.02180404])

In [27]:
# Label the averaged importances with the column names of X and sort from most to least important.
feature_importance=pd.Series(imp,index=X.columns.tolist()).sort_values(ascending=False)
feature_importance

income                0.274357
amount                0.225912
age                   0.170766
years                 0.163352
grade_A               0.057858
grade_B               0.025518
ownership_RENT        0.021804
ownership_MORTGAGE    0.018786
ownership_OWN         0.012585
grade_C               0.010374
grade_D               0.007719
grade_E               0.005573
grade_F               0.003068
ownership_OTHER       0.001411
grade_G               0.000917
dtype: float64

Random Forest Classifier: Bootstrap sampling + Feature sampling

In [None]:
X_train.head()

In [None]:
y_train.head()

In [38]:
from sklearn.ensemble import RandomForestClassifier

In [39]:
# Random forest with 80 trees; oob_score=True requests an out-of-bag score after fitting.
clf=RandomForestClassifier(n_estimators=80,oob_score=True,n_jobs=-1,random_state=400)
clf.fit(X_train,y_train)

RandomForestClassifier(n_estimators=80, n_jobs=-1, oob_score=True,
                       random_state=400)

In [40]:
clf.oob_score_

0.6574987866041093

In [41]:
# Sweep the forest size from 50 to 290 trees in steps of 10 and print the
# out-of-bag score obtained for each size.
for n_trees in range(50,300,10):
    clf=RandomForestClassifier(n_estimators=n_trees,oob_score=True,n_jobs=-1,random_state=400)
    clf.fit(X_train,y_train)
    oob=clf.oob_score_
    report=[
        'For n_estimators = '+str(n_trees),
        'OOB score is '+str(oob),
        '************************',
    ]
    print('\n'.join(report))

For n_estimators = 50
OOB score is 0.6502184112603139
************************
For n_estimators = 60
OOB score is 0.651998058566575
************************
For n_estimators = 70
OOB score is 0.6566898560103543
************************
For n_estimators = 80
OOB score is 0.6574987866041093
************************
For n_estimators = 90
OOB score is 0.6555573531790972
************************
For n_estimators = 100
OOB score is 0.6578223588416114
************************
For n_estimators = 110
OOB score is 0.6578223588416114
************************
For n_estimators = 120
OOB score is 0.6583077171978644
************************
For n_estimators = 130
OOB score is 0.6617052256916357
************************
For n_estimators = 140
OOB score is 0.6605727228603786
************************
For n_estimators = 150
OOB score is 0.6599255783853746
************************
For n_estimators = 160
OOB score is 0.6591166477916195
************************
For n_estimators = 170
OOB score is 0.65863128

In [42]:
#Finalize 290 trees
# NOTE(review): max_depth=3 and max_features=4 are introduced here; the n_estimators
# sweep in the preceding loop did not set them, so its OOB scores were measured
# under different settings than this model.
clf=RandomForestClassifier(n_estimators=290,oob_score=True,n_jobs=-1,random_state=400,max_depth=3,max_features=4)
clf.fit(X_train,y_train)

RandomForestClassifier(max_depth=3, max_features=4, n_estimators=290, n_jobs=-1,
                       oob_score=True, random_state=400)

In [43]:
clf.score(X_test,y_test)

0.628719275549806

In [None]:
# #Area Under the curve
# from sklearn import metrics
# metrics.roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])

In [44]:
clf.feature_importances_

array([0.02903717, 0.0142545 , 0.13676535, 0.01690939, 0.38974467,
       0.05329521, 0.12124344, 0.14778315, 0.03155036, 0.03922267,
       0.00130765, 0.0065073 , 0.00104804, 0.00199317, 0.00933794])

In [None]:
# Importances labelled with the training column names, sorted from most to least important.
pd.Series(clf.feature_importances_,index=X_train.columns).sort_values(ascending=False)

In [45]:
from sklearn.model_selection import GridSearchCV
# Grid-search max_depth, max_features and n_estimators using clf as the template estimator.
# Each candidate takes its values for these three parameters from param_grid, so the
# max_depth=3, max_features=4 and n_estimators=290 set on clf are not used by the search.
# No cv argument is passed, so GridSearchCV falls back to its default cross-validation.
rf=GridSearchCV(clf,param_grid={'max_depth':[3,5],'max_features':[3,5],'n_estimators':[10,20]})
rf.fit(X_train,y_train)

GridSearchCV(estimator=RandomForestClassifier(max_depth=3, max_features=4,
                                              n_estimators=290, n_jobs=-1,
                                              oob_score=True,
                                              random_state=400),
             param_grid={'max_depth': [3, 5], 'max_features': [3, 5],
                         'n_estimators': [10, 20]})

In [46]:
rf.best_estimator_

RandomForestClassifier(max_depth=5, max_features=3, n_estimators=10, n_jobs=-1,
                       oob_score=True, random_state=400)

In [None]:
rf.best_params_

In [47]:
# Mean cross-validated score of the best parameter combination.
rf.best_score_

0.6393806918816534

In [48]:
# Score of the refit best estimator on the held-out test split.
rf.score(X_test,y_test)

0.6403622250970246

Boosting classifier: Trees are grown sequentially: each tree is grown using information from previously grown trees. 

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with 50 boosting rounds and the library's default base estimator.
clf=AdaBoostClassifier(n_estimators=50,random_state=400)
clf.fit(X_train,y_train)

In [None]:
clf.score(X_test,y_test)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with 80 boosting stages; other hyperparameters are left at their defaults.
clf=GradientBoostingClassifier(n_estimators=80,random_state=400)
clf.fit(X_train,y_train)

In [None]:
clf.score(X_test,y_test)

In [None]:
from sklearn.model_selection import GridSearchCV
# 3-fold cross-validated search over the number of boosting stages, using clf as the template estimator.
mod=GridSearchCV(clf,param_grid={'n_estimators':[60,80,100,120,140,160]},cv=3) 
mod.fit(X_train,y_train)

In [None]:
mod.best_estimator_

In [None]:
# NOTE(review): n_estimators=160 is presumably the value selected by the grid search
# (the output of mod.best_estimator_ is not shown) - confirm before relying on it.
clf=GradientBoostingClassifier(n_estimators=160,random_state=400)
clf.fit(X_train,y_train)

In [None]:
clf.score(X_test,y_test)

In [None]:
pd.Series(clf.feature_importances_,index=X_train.columns).sort_values(ascending=False)

In [None]:
#Partial Dependence plot
# The sklearn.ensemble.partial_dependence module has been removed from scikit-learn,
# so the original import fails. The plotting entry point now lives in sklearn.inspection
# (PartialDependenceDisplay.from_estimator requires scikit-learn >= 1.0).
from sklearn.inspection import PartialDependenceDisplay
# Partial dependence of the model output on the feature at column index 2 of X_train.
pdp=PartialDependenceDisplay.from_estimator(clf,X_train,[2],feature_names=X_train.columns.tolist())
# Keep the names the original cell bound so a later `fig` cell still displays the figure.
fig,axs=pdp.figure_,pdp.axes_

In [None]:
fig

In [None]:
#Base learners are generated sequentially: each new base learner is fit using
#information from the learners built before it, so that it corrects their remaining errors.
#In other words, each step reduces the loss of the ensemble built so far.

XGBoost focuses on:
1. Computational speed
2. Model performance

In [31]:
import xgboost as xg
# Base XGBoost classifier with a logistic objective for the binary target.
xgb=xg.XGBClassifier(objective='binary:logistic',reg_lambda=0.2,max_depth=3,random_state=200) #reg_lambda: L2 regularization

In [33]:
#Grid Search CV: Parameter tuning
# Each candidate takes max_depth, n_estimators and reg_lambda from param_grid, so the
# max_depth=3 and reg_lambda=0.2 set on the base classifier are not used by the search.
# NOTE: this rebinds `xgb` from the bare classifier to the GridSearchCV wrapper, so
# re-running this cell without re-running the previous one nests one search inside another.
xgb=model_selection.GridSearchCV(xgb, param_grid={'max_depth':[2,3,5,9],'n_estimators':[50,100,150,500],'reg_lambda':[0.1,0.2]})
xgb.fit(X_train,y_train)

GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=3, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=200,
                                     reg_alpha=None, reg_lambda=0.2,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, validate_parameters=Non

In [34]:
xgb.best_params_

{'max_depth': 5, 'n_estimators': 100, 'reg_lambda': 0.1}

In [35]:
xgb.score(X_test,y_test)

0.6397153945666235

In [None]:
#Area Under the curve
from sklearn import metrics
# ROC AUC needs scores rather than hard labels: column 1 of predict_proba is
# the predicted probability of the second class.
metrics.roc_auc_score(y_test,xgb.predict_proba(X_test)[:,1])

In [None]:
#Saving the model
import pickle
# Write through a context manager so the file is flushed and closed even if
# pickling fails; the bare open(...) call left the handle open.
with open('xgb.sav','wb') as model_file:
    pickle.dump(xgb,model_file)

In [None]:
#Load the model
import pickle
# SECURITY: pickle.load can execute arbitrary code; only load files you created yourself.
# The context manager closes the file handle, which the bare open(...) call left open.
with open('xgb.sav', 'rb') as model_file:
    model=pickle.load(model_file)

In [None]:
# Score the reloaded model on the held-out test split.
model.score(X_test,y_test)