In [9]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np

def accuracy(label, predicted):
    N= len(predicted)
    crt = 0
    for i in range(N):
        if label[i] == predicted[i]:
            crt=crt+1
    if N==0:
        raise ValueError("Input array size must be greater than 0")
    return crt/N

def precision(label, predicted):
    N = len(predicted)
    tp = 0
    fp = 0
    for i in range(N):
        if predicted[i]==1:
            if label[i]==1:
                tp=tp+1
            else :
                fp = fp +1
    return tp/(tp+fp)
def recall(label, predicted):
    tp = 0
    fn =0
    N = len(predicted)
    for i in range(N):
        if label[i]==1:
            if predicted[i]==1:
                tp=tp+1
            else :
                fn = fn+1
    return tp/(tp+fn)

In [3]:
def load_data(filename):
    df = pd.read_csv(filename)
    n=len(df)
    df=df.dropna(axis=1,thresh=int(0.4*n))
    df.drop(columns=['id','member_id','url','desc','emp_title','purpose','title','zip_code','addr_state','earliest_cr_line'],inplace=True)
    df.drop(columns=['inq_last_6mths','sub_grade','application_type','last_pymnt_d','issue_d','pymnt_plan'],inplace=True)
    df['term'] = df['term'].str.replace(' months','').astype(float)
    df['int_rate'] = df['int_rate'].str.replace('%','').astype(float)
    df['revol_util'] = df['revol_util'].str.replace('%','').astype(float)
    df['last_credit_pull_d']=df['last_credit_pull_d'].astype(str).str[:2]
    df['last_credit_pull_d'] = df['last_credit_pull_d'].str.replace('-','')
    df.grade=df.grade.astype('category').cat.codes
    df.home_ownership=df.home_ownership.astype('category').cat.codes
    df = df[df.loan_status != 'Current']
    df.loan_status=df.loan_status.astype('category').cat.codes
    df.verification_status=df.verification_status.astype('category').cat.codes
    df.initial_list_status=df.initial_list_status.astype('category').cat.codes
    df.last_credit_pull_d=df.last_credit_pull_d.astype('category').cat.codes
    df.emp_length=df.emp_length.astype('category').cat.codes
    df.dropna(inplace=True)
    return df

dfTrain = load_data('loan_train.csv')
dfTest = load_data('loan_test.csv')

In [4]:
Xtrain=np.array(dfTrain.drop(columns=['loan_status']))
Ytrain=np.array(dfTrain.loan_status)
print(Ytrain)
Xtest=np.array(dfTest.drop(columns=['loan_status']))
Ytest=np.array([dfTest.loan_status]).flatten()
max_depth= [3,7,10]
maxfeatures=['auto','sqrt','log2']
n_estimators=[50,100,200]
for depth in max_depth:
    for features in maxfeatures:
        for trees in n_estimators:
            clf = GradientBoostingClassifier(max_depth=depth,n_estimators=trees,max_features=features)
            clf.fit(Xtrain,Ytrain)
            predicted=clf.predict(Xtest)
            n=len(predicted)
            p=predicted.reshape([n,1]).flatten()
            print("The accuracy of the model is:",accuracy(Ytest,p))
            print("hyperparameters: n_estimators:",trees," max_features:",features," max_depth:",depth)



[0 1 1 ... 1 1 1]
The accuracy of the model is: 0.9903398926654741
hyperparameters: n_estimators: 50  max_features: auto  max_depth: 3
The accuracy of the model is: 0.995134168157424
hyperparameters: n_estimators: 100  max_features: auto  max_depth: 3
The accuracy of the model is: 0.9962075134168158
hyperparameters: n_estimators: 200  max_features: auto  max_depth: 3
The accuracy of the model is: 0.9847584973166369
hyperparameters: n_estimators: 50  max_features: sqrt  max_depth: 3
The accuracy of the model is: 0.9926296958855099
hyperparameters: n_estimators: 100  max_features: sqrt  max_depth: 3
The accuracy of the model is: 0.995134168157424
hyperparameters: n_estimators: 200  max_features: sqrt  max_depth: 3
The accuracy of the model is: 0.984830053667263
hyperparameters: n_estimators: 50  max_features: log2  max_depth: 3
The accuracy of the model is: 0.9907692307692307
hyperparameters: n_estimators: 100  max_features: log2  max_depth: 3
The accuracy of the model is: 0.994847942754

### The maximum accuracy is achieved for the following
hyperparameters: n_estimators: 100  max_features: auto  max_depth: 7 \
The accuracy of the model is: 0.9974239713774598

In [11]:
from sklearn.tree import DecisionTreeClassifier

clf2 = DecisionTreeClassifier()
clf2.fit(Xtrain,Ytrain)
predicted = clf2.predict(Xtest)
print("The accuracy of the inbuilt decision tree classifier is: ",accuracy(Ytest,predicted))
print("The precision of the inbuilt decision tree classifier is: ",precision(Ytest,predicted))
print("The recall of the inbuilt decision tree classifier is: ",recall(Ytest,predicted))

clf = GradientBoostingClassifier(max_depth=7, n_estimators=100, max_features = 'auto')
clf.fit(Xtrain,Ytrain)
predicted=clf.predict(Xtest)
n=len(predicted)
p=predicted.reshape([n,1]).flatten()
print("The accuracy of the gradient boosting model is:",accuracy(Ytest,p))
print("The precision of the gradient boosting model is:",precision(Ytest,p))
print("The recall of the gradient boosting model is:",recall(Ytest,p))



The accuracy of the inbuilt decision tree classifier is:  0.9916279069767442
The precision of the inbuilt decision tree classifier is:  0.9951133204145253
The recall of the inbuilt decision tree classifier is:  0.9950294860994103
The accuracy of the gradient boosting model is: 0.9967084078711985
The precision of the gradient boosting model is: 0.9964729593550554
The recall of the gradient boosting model is: 0.9996630160067397
