In [1]:
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn import ensemble # required for RandomForest
from sklearn import linear_model
from sklearn import model_selection
from sklearn import feature_selection # required for RFE
from sklearn import preprocessing
from sklearn import metrics
warnings.filterwarnings('ignore')

In [2]:
def printmetrics(actual,predicted):
    print('AUC : ',np.round(metrics.roc_auc_score(actual,predicted),4))
    print('Accuracy :',np.round(metrics.accuracy_score(actual,predicted),4))
    print('Precision : ',np.round(metrics.precision_score(actual,predicted),4))
    print('Recall : ',np.round(metrics.recall_score(actual,predicted),4))
    print('F1 : ',np.round(metrics.f1_score(actual,predicted),4))

In [3]:
df=pd.read_csv('loan_data_set.csv')
df=df.drop('Loan_ID',axis=1)
df=df.dropna()

In [4]:
newcols=['gender','married','dependents','edu','selfemp','appinc','coappinc','amnt','term','chistory','proparea','status']
df.columns=newcols
df['status']=df['status'].replace({'Y':1,'N':0})
X=df.drop('status',axis=1)
y=df['status']

In [5]:
X['gender']=X['gender'].replace({'Male':1,'Female':0})
X['married']=X['married'].replace({'Yes':1,'No':0})
X['dependents']=X['dependents'].replace({'3+':3})
X['dependents']=X['dependents'].astype(np.int)
X['edu']=X['edu'].replace({'Graduate':1,'Not Graduate':2})
X['selfemp']=X['selfemp'].replace({'Yes':1,'No':0})
X['proparea']=X['proparea'].replace({'Rural':1,'Urban':2,'Semiurban':3})

In [6]:
Xtrain,Xtest,ytrain,ytest=model_selection.train_test_split(X,y,test_size=.15,random_state=42)

In [7]:
model=ensemble.RandomForestClassifier(random_state=42,n_estimators=200)

In [8]:
model.fit(Xtrain,ytrain)

RandomForestClassifier(n_estimators=200, random_state=42)

In [9]:
predtrain=model.predict(Xtrain)
predtest=model.predict(Xtest)

In [10]:
printmetrics(ytrain,predtrain)

AUC :  1.0
Accuracy : 1.0
Precision :  1.0
Recall :  1.0
F1 :  1.0


In [11]:
printmetrics(ytest,predtest)

AUC :  0.7661
Accuracy : 0.8472
Precision :  0.8448
Recall :  0.9608
F1 :  0.8991


In [12]:
model=ensemble.RandomForestClassifier(random_state=42,n_estimators=2000,max_features=1)

In [13]:
model.fit(Xtrain,ytrain)

RandomForestClassifier(max_features=1, n_estimators=2000, random_state=42)

In [14]:
predtrain=model.predict(Xtrain)
predtest=model.predict(Xtest)

In [15]:
printmetrics(ytrain,predtrain)

AUC :  1.0
Accuracy : 1.0
Precision :  1.0
Recall :  1.0
F1 :  1.0


In [16]:
printmetrics(ytrain,predtrain)

AUC :  1.0
Accuracy : 1.0
Precision :  1.0
Recall :  1.0
F1 :  1.0


In [17]:
printmetrics(ytest,predtest)

AUC :  0.7423
Accuracy : 0.8333
Precision :  0.8305
Recall :  0.9608
F1 :  0.8909


In [18]:
pd.Series(model.feature_importances_ , index=Xtrain.columns).sort_values(ascending=False)

appinc        0.215631
amnt          0.209175
chistory      0.179781
coappinc      0.128539
proparea      0.061036
term          0.058130
dependents    0.058050
married       0.026846
edu           0.024502
selfemp       0.019207
gender        0.019103
dtype: float64

In [19]:
model=ensemble.RandomForestClassifier(random_state=42)
pdict={'n_estimators':[4,5,6,7,8,9]}
gridobj=model_selection.GridSearchCV(estimator=model,param_grid=pdict,scoring='precision',cv=5,return_train_score=True)
gridobj.fit(Xtrain,ytrain)
print("Best params :",gridobj.best_params_)
print("Best score :",gridobj.best_score_)

Best params : {'n_estimators': 6}
Best score : 0.807975865393939


In [20]:
cvresults=pd.DataFrame(gridobj.cv_results_)

In [21]:
cvresults[ ['params','mean_train_score','mean_test_score']]

Unnamed: 0,params,mean_train_score,mean_test_score
0,{'n_estimators': 4},0.977243,0.801302
1,{'n_estimators': 5},0.959201,0.794771
2,{'n_estimators': 6},0.97766,0.807976
3,{'n_estimators': 7},0.96867,0.794261
4,{'n_estimators': 8},0.990158,0.804651
5,{'n_estimators': 9},0.980773,0.796649


In [22]:
model=ensemble.RandomForestClassifier(random_state=42)
pdict={'n_estimators':[4,5,6,7,8,9],'max_features':[1,2,3,4]}
gridobj=model_selection.GridSearchCV(estimator=model,param_grid=pdict,scoring='precision',cv=5,return_train_score=True)
gridobj.fit(Xtrain,ytrain)
print("Best params :",gridobj.best_params_)
print("Best score :",gridobj.best_score_)
cvresults=pd.DataFrame(gridobj.cv_results_)
print(cvresults[ ['params','mean_train_score','mean_test_score']])

Best params : {'max_features': 4, 'n_estimators': 4}
Best score : 0.8192150635208713
                                    params  mean_train_score  mean_test_score
0   {'max_features': 1, 'n_estimators': 4}          0.976990         0.803836
1   {'max_features': 1, 'n_estimators': 5}          0.960890         0.783250
2   {'max_features': 1, 'n_estimators': 6}          0.980308         0.808038
3   {'max_features': 1, 'n_estimators': 7}          0.966291         0.783002
4   {'max_features': 1, 'n_estimators': 8}          0.990190         0.806956
5   {'max_features': 1, 'n_estimators': 9}          0.975620         0.783213
6   {'max_features': 2, 'n_estimators': 4}          0.972510         0.798151
7   {'max_features': 2, 'n_estimators': 5}          0.954691         0.790943
8   {'max_features': 2, 'n_estimators': 6}          0.976662         0.792703
9   {'max_features': 2, 'n_estimators': 7}          0.967701         0.774519
10  {'max_features': 2, 'n_estimators': 8}          0.984

In [23]:
model=ensemble.RandomForestClassifier(random_state=42,max_features=4,n_estimators=4)

rfeobj=feature_selection.RFE(estimator=model,n_features_to_select=5)
# feature selection using decision tree

rfeobj.fit(Xtrain,ytrain)

RFE(estimator=RandomForestClassifier(max_features=4, n_estimators=4,
                                     random_state=42),
    n_features_to_select=5)

In [24]:
rfeobj.fit(Xtrain,ytrain)

RFE(estimator=RandomForestClassifier(max_features=4, n_estimators=4,
                                     random_state=42),
    n_features_to_select=5)

In [25]:
Xtrain.columns[rfeobj.support_]

Index(['appinc', 'coappinc', 'amnt', 'term', 'chistory'], dtype='object')

In [26]:
model=ensemble.RandomForestClassifier(random_state=42,max_features=4,n_estimators=4)
rfemodel=feature_selection.RFE(estimator=model) # n_features_to_select : best value to find using GridSearchCV
pdict={'n_features_to_select':[1,2,3,4,5,6]} # a dictionary , key ->hyperparameter name , value-> list
gridobj=model_selection.GridSearchCV(estimator=rfemodel,param_grid=pdict,cv=5,scoring='precision',n_jobs=-1,return_train_score=True)
gridobj.fit(Xtrain,ytrain)
print("Best params :",gridobj.best_params_)
cvresults=pd.DataFrame(gridobj.cv_results_)
print(cvresults[ ['params','mean_train_score','mean_test_score']])




Best params : {'n_features_to_select': 6}
                        params  mean_train_score  mean_test_score
0  {'n_features_to_select': 1}               NaN              NaN
1  {'n_features_to_select': 2}               NaN              NaN
2  {'n_features_to_select': 3}               NaN              NaN
3  {'n_features_to_select': 4}          0.972370         0.797612
4  {'n_features_to_select': 5}          0.973501         0.802661
5  {'n_features_to_select': 6}          0.971976         0.817040


In [27]:
df[:10]

Unnamed: 0,gender,married,dependents,edu,selfemp,appinc,coappinc,amnt,term,chistory,proparea,status
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,1
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,1
5,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,1
6,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,1
7,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,0
8,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,1
9,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,0
10,Male,Yes,2,Graduate,No,3200,700.0,70.0,360.0,1.0,Urban,1
