In [5]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
%matplotlib inline

In [31]:
# This is the model we'll be using.
from sklearn import tree

# A convenience for displaying visualizations.
from IPython.display import Image

# Packages for rendering our tree.
import pydotplus
import graphviz

from sklearn import ensemble
from sklearn.model_selection import cross_val_score

In [7]:
# Load the Lending Club loan data set.
# Source: https://www.kaggle.com/wendykan/lending-club-loan-data
df = pd.read_csv(
    'loan.csv',
    low_memory=False,        # parse in one pass instead of chunks (avoids mixed-type inference)
    dtype={'issue_d': str},  # read the issue date column as plain strings
)

## Data Cleaning & Exploration

In [8]:
# Preview the first ten rows of the raw frame.
df.head(n=10)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,2500,2500,2500.0,36 months,13.56,84.92,C,C1,...,,,Cash,N,,,,,,
1,,,30000,30000,30000.0,60 months,18.94,777.23,D,D2,...,,,Cash,N,,,,,,
2,,,5000,5000,5000.0,36 months,17.97,180.69,D,D1,...,,,Cash,N,,,,,,
3,,,4000,4000,4000.0,36 months,18.94,146.51,D,D2,...,,,Cash,N,,,,,,
4,,,30000,30000,30000.0,60 months,16.14,731.78,C,C4,...,,,Cash,N,,,,,,
5,,,5550,5550,5550.0,36 months,15.02,192.45,C,C3,...,,,Cash,N,,,,,,
6,,,2000,2000,2000.0,36 months,17.97,72.28,D,D1,...,,,Cash,N,,,,,,
7,,,6000,6000,6000.0,36 months,13.56,203.79,C,C1,...,,,DirectPay,N,,,,,,
8,,,5000,5000,5000.0,36 months,17.97,180.69,D,D1,...,,,Cash,N,,,,,,
9,,,6000,6000,6000.0,36 months,14.47,206.44,C,C2,...,,,Cash,N,,,,,,


In [11]:
# List every column name available in the raw frame.
df.columns.tolist()

['id',
 'member_id',
 'loan_amnt',
 'funded_amnt',
 'funded_amnt_inv',
 'term',
 'int_rate',
 'installment',
 'grade',
 'sub_grade',
 'emp_title',
 'emp_length',
 'home_ownership',
 'annual_inc',
 'verification_status',
 'issue_d',
 'loan_status',
 'pymnt_plan',
 'url',
 'desc',
 'purpose',
 'title',
 'zip_code',
 'addr_state',
 'dti',
 'delinq_2yrs',
 'earliest_cr_line',
 'inq_last_6mths',
 'mths_since_last_delinq',
 'mths_since_last_record',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'total_acc',
 'initial_list_status',
 'out_prncp',
 'out_prncp_inv',
 'total_pymnt',
 'total_pymnt_inv',
 'total_rec_prncp',
 'total_rec_int',
 'total_rec_late_fee',
 'recoveries',
 'collection_recovery_fee',
 'last_pymnt_d',
 'last_pymnt_amnt',
 'next_pymnt_d',
 'last_credit_pull_d',
 'collections_12_mths_ex_med',
 'mths_since_last_major_derog',
 'policy_code',
 'application_type',
 'annual_inc_joint',
 'dti_joint',
 'verification_status_joint',
 'acc_now_delinq',
 'tot_coll_amt',
 'tot_cur_

In [13]:
# Narrow the frame to the target (`grade`) plus the columns used as predictors.
df = df.loc[:, [
    'loan_amnt',
    'installment',
    'grade',
    'annual_inc',
    'mths_since_last_delinq',
    'dti',
    'all_util',
    'inq_last_6mths',
]]

In [14]:
# Preview the first ten rows after the column selection.
df.head(n=10)

Unnamed: 0,loan_amnt,installment,grade,annual_inc,mths_since_last_delinq,dti,all_util,inq_last_6mths
0,2500,84.92,C,55000.0,,18.24,28.0,1.0
1,30000,777.23,D,90000.0,71.0,26.52,57.0,0.0
2,5000,180.69,D,59280.0,,10.51,35.0,0.0
3,4000,146.51,D,92000.0,,16.74,70.0,0.0
4,30000,731.78,C,57250.0,,26.35,54.0,0.0
5,5550,192.45,C,152500.0,,37.94,58.0,3.0
6,2000,72.28,D,51000.0,,2.4,100.0,1.0
7,6000,203.79,C,65000.0,,30.1,74.0,0.0
8,5000,180.69,D,53580.0,32.0,21.16,73.0,1.0
9,6000,206.44,C,300000.0,17.0,17.43,48.0,1.0


In [15]:
# Count missing values per column and show only the columns that have any.
null_count = df.isna().sum()
null_count.loc[null_count > 0]

mths_since_last_delinq    538776
dti                         1197
all_util                   97767
inq_last_6mths                 1
dtype: int64

In [17]:
# Replace every missing value with 0 so the estimators receive no NaNs.
# NOTE(review): after this fill, a missing `mths_since_last_delinq` is
# indistinguishable from a genuine 0 months -- confirm that is intended.
df = df.fillna(value=0)

In [18]:
# Re-run the missing-value tally; an empty Series means no gaps remain.
null_count = df.isna().sum()
null_count[null_count.gt(0)]

Series([], dtype: int64)

In [25]:
# Separate the predictors from the target.  `columns=` is used rather than a
# positional axis argument: pandas deprecated positional `axis` in 1.3 and
# pandas 2.0 rejects it with a TypeError.
X = df.drop(columns='grade')
Y = df['grade']

# Distinct grade labels present in the target.
y_list = Y.unique()
y_list

array(['C', 'D', 'B', 'A', 'E', 'F', 'G'], dtype=object)

## Decision Tree

In [32]:
# Build a shallow entropy-based tree and fit it on the full data set.
# `fit` returns the estimator itself, so `decision_tree` is the fitted model.
decision_tree = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=1,      # consider a single feature at each split
    max_depth=4,
    random_state=1337,   # fixed seed so the feature draws are repeatable
).fit(X, Y)

# Five-fold cross-validated score for the same tree configuration.
cross_val_score(decision_tree, X, y=Y, cv=5)

array([0.31859774, 0.33162309, 0.34490783, 0.3380954 , 0.33101587])

## Random Forest

In [27]:
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

In [29]:
# Seed the forest so its bootstrap samples and feature draws -- and therefore
# the cross-validation scores -- are reproducible from run to run.  The seed
# matches the one used for the decision tree earlier in the notebook.
rfc = ensemble.RandomForestClassifier(random_state=1337)

In [30]:
# Five-fold cross-validated score for the random forest.
cross_val_score(rfc, X, y=Y, cv=5)



array([0.63965115, 0.68885689, 0.70479319, 0.64314086, 0.57997635])