In [3]:
# import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
import math
import sqlite3
import csv
import pandas as pd
import collections
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from dateutil.parser import parse
from datetime import datetime
from statsmodels.stats import weightstats as stests
from scipy.stats import ks_2samp

from matplotlib import rcParams
rcParams.update({'figure.autolayout': True})

dataURL = 'https://www.kaggle.com/wendykan/lending-club-loan-data/downloads/lending-club-loan-data.zip'
homeDir = "/fdata/LendingClub/lending-club-loan-data/"



In [4]:
dcheck_sql = pd.read_pickle(homeDir+'DCheckSQL_FF.pkl')
fData = pd.read_pickle(homeDir+'DCheckCSV_FF.pkl')

In [None]:
# Create data and classification sets
fnames = ['dti',\
          'revol_bal',\
          'acc_now_delinq',\
          'collections_12_mths_ex_med',\
          'delinq_2yrs',\
          'inq_last_6mths',\
          'annual_inc',\
          'revol_util',\
          'open_acc',\
          'IVCode',\
          'ELCode',\
          'ECDays']

dfX = fData[fnames]
y = np.array(fData['LSCode'])
# scale data
X = StandardScaler().fit_transform(dfX)
# create training and testing data sets
X_train, X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)
# setup grid search
Cs = [0.001, 0.01, 0.1 ,1, 10]
gammas = [0.001, 0.01,0.1, 1]
param_grid = {'C':Cs}
grid_search = GridSearchCV(svm.SVC(kernel='linear',max_iter=1),param_grid,cv=4)
# Train SVC classifier
grid_search.fit(X_train,y_train)
# use best params
p = list(grid_search.best_params_.values())
print(p)

In [None]:
# create training and testing data sets
X_train, X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)
clf = svm.SVC(kernel='linear',C=10,max_iter=200000)
clf.fit(X_train,y_train)
# get prediction for test and train data
predict1 = clf.predict(X_test)
predict2 = clf.predict(X_train)
# accuracy scores for test and train data
print("Accuracy Score on Train Data: ",accuracy_score(y_train,predict2))
print("Accuracy Score on Test Data: ",accuracy_score(y_test,predict1))
# confusion matricies for test and train data
print(confusion_matrix(y_test,predict1))
print(confusion_matrix(y_train,predict2))
# precision recall plot
y_score = clf.decision_function(X_test)
avgpre = average_precision_score(y_test,y_score)
print('Average Precision Score: ',avgpre)
precision, recall, thresholds = precision_recall_curve(y_test,y_score)
plt.step(precision,recall)
plt.show()

In [None]:
# only works with linear kernel
labels = ['Debt to Income Ratio','Revolving Credit Balance','Current Delinquent Accounts',\
          'Collections (12mths)','Delinquent (2yrs)','Inquiries (6mths)','Annual Income',\
         'Revolving Credit Ratio','Open Accounts','Income Verified (code)','Employment Length (code)',\
         'Earliest Credit (days)']
fig, ax1 = plt.subplots(1,1)
fig.set_size_inches(10, 7)
ax1.margins(0.05)
ax1.set_xticks(np.arange(0, 15, 1.0))
ax1.set_xticklabels(labels,rotation=90)
plt.title('Fit Coefficients for Loan Club Data\nSVC with linear kernel')
plt.plot(clf.coef_.tolist()[0],marker='o')
plt.show()

In [None]:
#Try KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier

fnames = ['dti',\
          'revol_bal',\
          'acc_now_delinq',\
          'collections_12_mths_ex_med',\
          'delinq_2yrs',\
          'inq_last_6mths',\
          'annual_inc',\
          'revol_util',\
          'open_acc',\
          'IVCode',\
          'ELCode',\
          'ECDays']

dfX = fData[fnames]
y = np.array(fData['LSCode'])
# scale data
X = StandardScaler().fit_transform(dfX)

# create training and testing data sets
X_train, X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)

ncla = KNeighborsClassifier(n_neighbors=18,leaf_size=60,algorithm='ball_tree',n_jobs=4)
ncla.fit(X_train,y_train)
predict3 = ncla.predict(X_test)
predict4 = ncla.predict(X_train)

# accuracy scores for test and train data
print("Accuracy Score on Train Data: ",accuracy_score(y_train,predict4))
print("Accuracy Score on Test Data: ",accuracy_score(y_test,predict3))

# confusion matricies for test and train data
print(confusion_matrix(y_test,predict3))
print(confusion_matrix(y_train,predict4))


In [21]:
import random
# set up 10 training data sets where length is the number of bad loans
fnames = ['dti',\
          'revol_bal',\
          'acc_now_delinq',\
          'collections_12_mths_ex_med',\
          'delinq_2yrs',\
          'inq_last_6mths',\
          'annual_inc',\
          'revol_util',\
          'open_acc',\
          'IVCode',\
          'ELCode',\
          'ECDays']

dfX = fData[fnames]
dfy = fData['LSCode']

# scale dfX and retrun it to the data frame
# have to do this here so all data is scaled the same way
scaledX = StandardScaler().fit_transform(dfX.values)
dfTemp = pd.DataFrame(scaledX,index=dfX.index,columns=dfX.columns)
dfXy = pd.concat([dfTemp,dfy],axis=1)
#print(dfXy.head())

# separate out into good and bad loans
td_good = dfXy[dfXy['LSCode']==0]
td_bad = dfXy[dfXy['LSCode']==1]

# select 25% testing data from each
# do good data
gilist = random.sample(range(0,len(td_good)),int(len(td_good)/4.0))
test_good = td_good.iloc[gilist]
td_good = td_good.drop(td_good.index[gilist])
# do bad data
bilist = random.sample(range(0,len(td_bad)),int(len(td_bad)/4.0))
test_bad = td_bad.iloc[bilist]
td_bad = td_bad.drop(td_bad.index[bilist])

# separate remaining data into 10 training sets
d_td_good = td_good
tlen = len(td_bad)
dlen = int(len(td_good)/11.0)
tdGList = []
for indx in range(0,10):
    # create a list of indices tlen long from what remains of d_td_good
    #print(len(d_td_good))
    ilist = random.sample(range(0,len(d_td_good)),tlen)
    # pull those random rows
    tdf = d_td_good.iloc[ilist]
    # remove the first dlen-n of those rows from d_td_good
    # n is chosen so we get 10 sets of good test data that are the same length as the bad test data \
    # with a minimum of unused good loan data
    d_td_good = d_td_good.drop(d_td_good.index[ilist[:dlen-651]])
    # append to training data list
    tdGList.append(tdf)
# combine good and bad data and scale it
for indx in range(0,10):
    # append bad loan data to every random good data set
    temp = pd.concat([tdGList[indx],td_bad],axis=0)
    # now randomize the order
    temp = temp.sample(frac=1)
    tdGList[indx] = temp

In [29]:
# train 10 classifiers
from sklearn.neighbors import KNeighborsClassifier

clasf_list = []
for indx in range(0,10):
    print('Training on data: ',indx)
    mX = tdGList[indx][fnames]
    my = tdGList[indx]['LSCode']
    #print(sum(my))
    #t = svm.SVC(kernel='linear',C=2.0,max_iter=20000)
    t = KNeighborsClassifier(n_neighbors=12,n_jobs=4)
    t.fit(mX,my)
    clasf_list.append(t)

Training on data:  0
Training on data:  1
Training on data:  2
Training on data:  3
Training on data:  4
Training on data:  5
Training on data:  6
Training on data:  7
Training on data:  8
Training on data:  9


In [30]:
# now create 10 predictions of the test data
test = pd.concat([test_good,test_bad],axis=0)
X_test = test[fnames]
y_test = test['LSCode']
t_votes = pd.DataFrame()
for indx in range(0,10):
    print('Predictor ',indx,' running')
    tr = pd.Series(clasf_list[indx].predict(X_test))
    t_votes = pd.concat([t_votes,tr],axis=1)
#print(t_votes.head())

Predictor  0  running
Predictor  1  running
Predictor  2  running
Predictor  3  running
Predictor  4  running
Predictor  5  running
Predictor  6  running
Predictor  7  running
Predictor  8  running
Predictor  9  running


In [31]:
vote_count = t_votes.sum(axis=1)
vote_count.reset_index(drop=True,inplace=True)
test = y_test.reset_index(drop=True,inplace=False)
comp = pd.concat([test,vote_count],axis=1)
comp.columns = ['Actual','Vote Talley']

In [32]:
print(len(comp))
print(comp.head())

61648
   Actual  Vote Talley
0     0.0          0.0
1     0.0          0.0
2     0.0          2.0
3     0.0          3.0
4     0.0          3.0


In [33]:
# test different threshold levels
# construct confusion matrix data for all thresholds
gg = [0]*10
bb = [0]*10
gb = [0]*10
bg = [0]*10

for thres in range(0,10):
    gg[thres] = 0
    bb[thres] = 0
    gb[thres] = 0
    bg[thres] = 0
    for indx, row in comp.iterrows():
        if row[1] >= thres+1:
            r = 1
        else:
            r = 0
        if (row[0]==0)&(r==0):
            gg[thres] += 1
        elif (row[0]==0)&(r==1):
            gb[thres] += 1
        elif (row[0]==1)&(r==0):
            bg[thres] += 1
        elif (row[0]==1)&(r==1):
            bb[thres] += 1
        else:
            print('WTF!!!!')

In [34]:
# print out confusion matricies
for indx in range(0,10):
    print(gg[indx],gb[indx],'\n',bg[indx],bb[indx],'\n')
    accuracy = (gg[indx]+bb[indx])/(gg[indx]+bb[indx]+bg[indx]+gb[indx])
    print(accuracy,'\n\n')
    

17645 32867 
 2372 8764 

0.4283837269660005 


22705 27807 
 3398 7738 

0.49381975084349855 


26449 24063 
 4199 6937 

0.5415585258240332 


29538 20974 
 4888 6248 

0.5804892291720737 


32282 18230 
 5530 5606 

0.6145860368543992 


34707 15805 
 6205 4931 

0.6429730080456787 


37259 13253 
 6827 4309 

0.6742797819880613 


39833 10679 
 7536 3600 

0.7045321827147677 


42377 8135 
 8310 2826 

0.7332435764339476 


45338 5174 
 9298 1838 

0.7652478588113159 




In [42]:
# test double threshold with an uncertain class
# construct confusion matrix data for all thresholds
for thres1 in range(0,10):
    for thres2 in range(thres1,10):
        gg = 0
        bb = 0
        gb = 0
        bg = 0
        ucg = 0
        ucb = 0
        for indx, row in comp.iterrows():
            if thres1 == thres2:
                if row[1] >= thres1+1:
                    r = 1
                else:
                    r = 0
            else:
                if row[1] <= thres1+1:
                    r = 0
                elif row[1] >= thres2+1:
                    r = 1
                else:
                    r = 2
            if (row[0]==0)&(r==0):
                gg += 1
            elif (row[0]==0)&(r==1):
                gb += 1
            elif (row[0]==1)&(r==0):
                bg += 1
            elif (row[0]==1)&(r==1):
                bb += 1
            elif (row[0]==0)&(r==2):
                ucg += 1
            elif (row[0]==1)&(r==2):
                ucb += 1
            else:
                print('WTF!!!!')
        # print confusion matrix and accuracy
        print(gg,gb,ucg,'\n',bg,bb,ucb,'\n')
        accuracy = (gg+bb)/(gg+gb+bg+bb)
        print(accuracy,'\n','T1: ',thres1+1,' T2: ',thres2+1,'\n\n')

17645 32867 0 
 2372 8764 0 

0.4283837269660005 
 T1:  1  T2:  1 


22705 27807 0 
 3398 7738 0 

0.49381975084349855 
 T1:  1  T2:  2 


22705 24063 3744 
 3398 6937 801 

0.519097070206469 
 T1:  1  T2:  3 


22705 20974 6833 
 3398 6248 1490 

0.5429535864978903 
 T1:  1  T2:  4 


22705 18230 9577 
 3398 5606 2132 

0.5669116321912734 
 T1:  1  T2:  5 


22705 15805 12002 
 3398 4931 2807 

0.5900211362326266 
 T1:  1  T2:  6 


22705 13253 14554 
 3398 4309 3429 

0.6186648345356693 
 T1:  1  T2:  7 


22705 10679 17128 
 3398 3600 4138 

0.6514040909316032 
 T1:  1  T2:  8 


22705 8135 19672 
 3398 2826 4912 

0.6888355277358084 
 T1:  1  T2:  9 


22705 5174 22633 
 3398 1838 5900 

0.7411444964517591 
 T1:  1  T2:  10 


22705 27807 0 
 3398 7738 0 

0.49381975084349855 
 T1:  2  T2:  2 


26449 24063 0 
 4199 6937 0 

0.5415585258240332 
 T1:  2  T2:  3 


26449 20974 3089 
 4199 6248 689 

0.5650077760497667 
 T1:  2  T2:  4 


26449 18230 5833 
 4199 5606 1331 

0.58833786