In [226]:

%matplotlib inline

import pandas as pd
import sklearn
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
import random
mpl.pyplot.style.use('ggplot')
from graphviz import Digraph

In [None]:
# Read the loan CSV into a DataFrame. Line 0 of the file is skipped, so the
# column header is taken from the line that follows it.
loan_csv_path = '../data/LoanStats3d.csv'
loandata = pd.read_csv(loan_csv_path, skiprows=[0])

In [None]:
# Keep only the rows whose loan_status is one of the two listed outcomes;
# rows with any other status are left out.
is_completed = loandata['loan_status'].isin(['Charged Off', 'Fully Paid'])
completedloans = loandata[is_completed]

In [None]:
# Split into test and training sets. Allocating 80% for training data.
#
# train_test_split takes its randomness from `random_state` (NumPy-based),
# not from the stdlib `random` module, so the seed must be passed explicitly
# for the split to be reproducible. random.seed is kept for any other code
# that uses the stdlib generator.
random.seed(1234)
traindata, testdata = [
    part.copy()  # independent frames, so later column assignments do not act on a slice
    for part in train_test_split(completedloans, test_size=0.2, random_state=1234)
]

## Data Cleaning


In [None]:
# Remove columns that are not used as model inputs from the training frame.
# `axis` is passed by keyword (a positional axis is not accepted by newer
# pandas releases) and the result is re-bound rather than dropped in place.
traindata = traindata.drop(
    ['id', 'member_id', 'funded_amnt', 'emp_title', 'desc', 'url', 'pymnt_plan', 'title'],
    axis=1
)


In [None]:
from datetime import datetime
# Parse the four date-like columns of the training frame into datetimes.
for date_col in ('earliest_cr_line', 'last_pymnt_d', 'next_pymnt_d', 'last_credit_pull_d'):
    traindata[date_col] = pd.to_datetime(traindata[date_col])

In [None]:
# Replace each parsed date in the training frame with the integer returned by
# datetime.toordinal, so the columns become plain numbers.
for date_col in ('earliest_cr_line', 'last_pymnt_d', 'next_pymnt_d', 'last_credit_pull_d'):
    traindata[date_col] = list(map(datetime.toordinal, traindata[date_col]))

In [None]:
# Strip the '%' character from the percentage columns of the training frame
# and convert what remains to float.
for pct_col in ('int_rate', 'revol_util'):
    traindata[pct_col] = traindata[pct_col].str.replace('%', '').astype(float)

In [None]:
# Apply to the test frame the same cleaning steps used on the training frame.

# Date-like columns: parse to datetimes, then replace each value with the
# integer returned by datetime.toordinal.
for date_col in ('earliest_cr_line', 'last_pymnt_d', 'next_pymnt_d', 'last_credit_pull_d'):
    parsed_dates = pd.to_datetime(testdata[date_col])
    testdata[date_col] = list(map(datetime.toordinal, parsed_dates))

# Percentage columns: drop the '%' character and convert to float.
for pct_col in ('int_rate', 'revol_util'):
    testdata[pct_col] = testdata[pct_col].str.replace('%', '').astype(float)

In [None]:
# Import SciKit Learn functions
from sklearn.metrics import roc_curve, auc , roc_auc_score, confusion_matrix, mean_absolute_error
from sklearn.tree import DecisionTreeClassifier , export_graphviz
from sklearn.cross_validation import train_test_split
from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing

In [None]:
# Rearranging the columns so that the response variable is at index 0.
# loan_status is located by name rather than by a fixed position, so the cell
# gives the same result when it is re-run and does not depend on how many
# columns precede the target.
TARGET_COLUMN = 'loan_status'
cols = [TARGET_COLUMN] + [name for name in traindata.columns.tolist() if name != TARGET_COLUMN]
traindata = traindata[cols]

In [None]:
# Restrict the test frame to the columns listed in `cols` (built from
# traindata's columns), in that same order. Columns of testdata that are not
# in `cols` are left out of the result.
testdata = testdata[cols]

In [None]:
# Transform the string columns into numerical values, using ONE encoder per
# column for both frames.
#
# Each encoder is fitted on the combined train + test values of its column, so
# a given category maps to the same integer in both frames and transform()
# never meets a label it was not fitted on. Fitting a separate encoder on each
# frame would number the categories from that frame's own value set, so the
# same category could get different codes in train and test.
CATEGORICAL_COLUMNS = [
    'home_ownership',
    'term',
    'grade',
    'loan_status',
    'emp_length',
    'verification_status',
    'issue_d',
    'purpose',
    'zip_code',
    'addr_state',
    'initial_list_status',
    'application_type',
    'verification_status_joint',
    'sub_grade',
]

# Fitted encoders keyed by column name, kept so that integer codes can be
# mapped back to the original labels with inverse_transform.
label_encoders = {}
for column in CATEGORICAL_COLUMNS:
    le = preprocessing.LabelEncoder()
    le.fit(pd.concat([traindata[column], testdata[column]]))
    traindata[column] = le.transform(traindata[column])
    testdata[column] = le.transform(testdata[column])
    label_encoders[column] = le

# Split the target from the input variables. The target is expected in
# column 0 of both frames. The feature frames are copied so that later edits
# to them do not write back into traindata / testdata.
X_train = traindata.iloc[:, 1:].copy()
y_train = traindata.iloc[:, 0]
X_test = testdata.iloc[:, 1:].copy()
y_test = testdata.iloc[:, 0]

In [None]:
# Replace missing mths_since_last_delinq values in the training features with
# the sentinel 999999. The assignment goes through a single .loc call rather
# than chained indexing (frame.col[mask] = value), which can end up writing to
# a temporary copy instead of the frame.
X_train.loc[X_train['mths_since_last_delinq'].isnull(), 'mths_since_last_delinq'] = 999999

In [None]:
# Number of missing values in the column at integer position 24 of X_train.
# .iloc is used for the positional lookup because the .ix indexer is not
# available in current pandas.
X_train.iloc[:, 24].isnull().sum()

In [None]:
# Collect the positions at which the iterable `x` holds an element equal to
# `value` (the loop variable reuses the name `x`).
# NOTE(review): neither `x` nor `value` is assigned anywhere in the visible
# part of this notebook, so on a fresh kernel this cell presumably raises
# NameError — confirm, then define them or delete the cell.
matchings_indices = [ i for i, x in enumerate(x) if x == value ]

In [None]:
# Show the type of the second entry of nadroplist.
# NOTE(review): in the visible notebook nadroplist is first assigned in a
# later cell, so a top-to-bottom run on a fresh kernel would presumably fail
# here — confirm the intended cell order.
type(nadroplist[1])

In [None]:
# Flag every column of X_train that has at least one missing value: a column
# is flagged when its non-null count differs from the number of rows. The row
# count is read from the frame itself instead of being hard-coded, so the
# check stays correct if the size of the training set changes.
nullcolls = (X_train.count() != len(X_train))

# Integer positions of the flagged columns.
matches = [position for position, has_null in enumerate(nullcolls) if has_null]

# Names of the flagged columns, collected for the drop in the next cell.
names = nullcolls[nullcolls]
nadroplist = list(names.index)
tuple(nadroplist)

In [None]:
# Remove the columns listed in nadroplist from the training features.
# `axis` is passed by keyword (a positional axis is not accepted by newer
# pandas releases) and the result is re-bound rather than dropped in place.
X_train = X_train.drop(nadroplist, axis=1)

In [None]:


# Create a decision tree classifier instance (start out with a small tree for interpretability)
ctree = DecisionTreeClassifier(random_state=1, max_depth=2)

# Fit the decision tree classifier
ctree.fit(X_train, y_train)

# Export the fitted tree in Graphviz dot format. The context manager closes
# the file even if the export raises; `dotfile` stays bound (to the closed
# file object) after the block.
with open('tree1.dot', 'w') as dotfile:
    export_graphviz(ctree, out_file=dotfile, feature_names=X_train.columns)

In [None]:
# Echo the file object that was used for the Graphviz export; the export cell
# has already closed it by this point.
dotfile

In [None]:

# Display the rendered tree image from tree1.png.
# NOTE(review): nothing in the visible notebook creates tree1.png — the
# Graphviz command below is commented out — so the image presumably has to be
# rendered from tree1.dot by hand before this cell is run; confirm.
#! dot.exe -Tpng tree1.dot -o tree1.png
Image('tree1.png',unconfined=True)

In [None]:
# Evaluate the tree on the held-out data.
#
# The tree was fitted on X_train after its missing mths_since_last_delinq
# values were replaced by 999999 and its null-containing columns were dropped;
# X_test has had neither step applied. The test features are given the same
# treatment here: same sentinel, then the same columns in the same order.
# Missing values in any other retained column are not filled here.
X_test_model = X_test.copy()
X_test_model['mths_since_last_delinq'] = X_test_model['mths_since_last_delinq'].fillna(999999)
X_test_model = X_test_model[X_train.columns]

# With the target encoded as 0/1, the mean absolute error equals the fraction
# of misclassified test rows.
mean_absolute_error(y_test, ctree.predict(X_test_model))