In [1]:
import pandas as pd
import pickle
import numpy as np
from sklearn.externals import joblib
from sklearn import *
from sklearn import cluster
from sklearn.metrics import mean_squared_error,mean_absolute_error
from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE



In [2]:
# Load the loan table; every later cell transforms or subsets this `loans` frame.
loans = pd.read_csv(filepath_or_buffer='../loanstats.csv')

In [4]:
# Restore the previously fitted K-means model and attach its per-row cluster
# label to the loan table.
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code -- only load kmeans.pkl when it comes from a trusted source.
# The context manager closes the file handle; the original left it open.
with open('kmeans.pkl', 'rb') as model_file:
    k_means = joblib.load(model_file)

# labels_ holds one label per row the model was fitted on.
# NOTE(review): assumes the model was fitted on exactly these rows, in this
# order -- confirm against the clustering notebook.
loans['kcluster'] = k_means.labels_

In [5]:
def manual_clusters(fico):
    """Assign a FICO score to one of six hand-picked bands.

    The cutoffs are tested from highest to lowest with a strict ``>``, so a
    score exactly on a cutoff falls into the next band down:

        > 720 -> 'Group1', > 700 -> 'Group2', > 690 -> 'Group3',
        > 680 -> 'Group4', > 670 -> 'Group5', anything else -> 'Group6'.

    A value that compares false against every cutoff (e.g. NaN) therefore
    ends up in 'Group6'.
    """
    cutoffs = (
        (720, 'Group1'),
        (700, 'Group2'),
        (690, 'Group3'),
        (680, 'Group4'),
        (670, 'Group5'),
    )
    for lower_bound, label in cutoffs:
        if fico > lower_bound:
            return label
    return 'Group6'

# Label every loan with its manual FICO band. Only one column is needed, so
# map over that column directly instead of DataFrame.apply(axis=1), which
# builds a full row Series for every loan just to read one value.
loans['manual_cat'] = loans['fico_range_low'].map(manual_clusters)

In [3]:
# `term` arrives as text; strip the characters of ' months' from both ends
# and keep the remaining number as an integer.
loans['term'] = loans['term'].str.strip(' months').astype('int64')

# Binary flag: 1 for joint applications, 0 for everything else.
loans['application_type'] = (loans['application_type'] == 'Joint App').astype('int64')

def elength(a):
    """Convert an ``emp_length`` label into a number of years.

    Mapping:
        missing value (None / NaN) or 'n/a' -> 0
        '10+ years'                         -> 10
        '1 year'                            -> 1
        '< 1 year'                          -> 0.5
        'N years'                           -> float(N)

    Parameters
    ----------
    a : str or missing value
        Raw employment-length label.

    Returns
    -------
    int or float
        Employment length in years.

    Raises
    ------
    ValueError
        If a string label does not reduce to a number.
    """
    # Series.map passes NaN through to this function, and read_csv turns
    # 'n/a' into NaN by default, so treat a missing value like 'n/a' instead
    # of failing on `.strip`.
    if pd.isnull(a):
        return 0
    if a == 'n/a':
        return 0
    elif a == '10+ years':
        return 10
    elif a == '1 year':
        return 1
    elif a == '< 1 year':
        return 0.5
    else:
        # str.strip removes any of the characters ' ', 'y', 'e', 'a', 'r', 's'
        # from both ends, which leaves just the number for 'N years'.
        return float(a.strip(' years'))
    
# Convert the remaining text columns to numbers.
loans['emp_length'] = loans['emp_length'].map(elength)
loans['revol_util'] = loans['revol_util'].map(lambda pct: float(pct.strip('%')))

# One-hot encode home ownership as home_* indicator columns.
homes = pd.get_dummies(loans['home_ownership'], prefix='home')
loans = loans.join(homes)

# NOTE(review): the state dummies are disabled here, yet the recorded output
# of the "Regression With All Data" section shows st_* columns. Those outputs
# were presumably produced with the two lines below enabled -- confirm which
# variant is intended before trusting the comparison with "without States".
#states = pd.get_dummies(loans['addr_state'], prefix='st')
#loans = loans.join(states)

# Columns removed before modelling: the already-encoded home_ownership and
# the other columns the original cell dropped one by one.
unused_columns = [
    'home_ownership',
    'addr_state',
    'grade',
    'sub_grade',
    'set',
    'emp_title',
    'timestamp',
    'issue_d',
    'last_credit_pull_d',
    'title',
    'purpose',
    'next_pymnt_d',
    'zip_code',
    'last_fico_range_high',
    'fico_range_high',
    'last_fico_range_low',
    'installment',
    'id',
]
loans = loans.drop(unused_columns, axis=1)

# Regression With All Data

In [9]:
# Modelling frame for the all-data regression: both cluster labels are
# removed so they cannot act as predictors.
loans_tot = loans.drop(['manual_cat', 'kcluster'], axis=1)

In [10]:
# Allow up to 101 columns in rich DataFrame output, then preview the first
# rows of the modelling frame.
pd.options.display.max_columns = 101
loans_tot.head()

Unnamed: 0,loan_amnt,term,int_rate,emp_length,annual_inc,dti,delinq_2yrs,fico_range_low,mths_since_last_delinq,open_acc,revol_bal,revol_util,total_acc,collections_12_mths_ex_med,policy_code,application_type,tot_cur_bal,bc_util,chargeoff_within_12_mths,mo_sin_old_il_acct,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_inq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_tl,num_il_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,total_bc_limit,home_ANY,home_MORTGAGE,home_NONE,home_OTHER,home_OWN,home_RENT,st_AK,st_AL,st_AR,st_AZ,st_CA,st_CO,st_CT,st_DC,st_DE,st_FL,st_GA,st_HI,st_IA,st_ID,st_IL,st_IN,st_KS,st_KY,st_LA,st_MA,st_MD,st_ME,st_MI,st_MN,st_MO,st_MS,st_MT,st_NC,st_ND,st_NE,st_NH,st_NJ,st_NM,st_NV,st_NY,st_OH,st_OK,st_OR,st_PA,st_RI,st_SC,st_SD,st_TN,st_TX,st_UT,st_VA,st_VT,st_WA,st_WI,st_WV,st_WY
0,27050.0,36,10.99,10.0,55000.0,22.87,0.0,730.0,195.0,14.0,36638.0,61.2,27.0,0.0,1.0,0,114834.0,53.9,0.0,117.0,6.0,4.0,16.0,8.0,0.0,2.0,4.0,8.0,8.0,15.0,4.0,14.0,0.0,0.0,1.0,25.0,0.0,0.0,35700.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,4800.0,36,10.99,2.0,39600.0,2.49,0.0,755.0,195.0,3.0,4136.0,16.1,8.0,0.0,1.0,0,4136.0,16.1,0.0,104.0,25.0,0.0,25.0,3.0,0.0,2.0,2.0,4.0,1.0,7.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,25700.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,12000.0,36,11.99,10.0,130000.0,13.03,0.0,715.0,195.0,9.0,10805.0,67.0,19.0,0.0,1.0,0,327264.0,93.0,0.0,173.0,4.0,3.0,85.0,4.0,0.0,3.0,5.0,4.0,8.0,8.0,5.0,9.0,0.0,0.0,3.0,1.0,0.0,0.0,10700.0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,20800.0,36,13.53,10.0,81500.0,16.73,0.0,685.0,64.0,29.0,23473.0,54.5,41.0,0.0,1.0,0,23473.0,54.6,0.0,115.0,0.0,0.0,0.0,0.0,1.0,8.0,24.0,17.0,1.0,40.0,24.0,29.0,0.0,0.0,3.0,50.0,0.0,0.0,15000.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,10000.0,36,9.67,7.0,102000.0,15.55,2.0,670.0,11.0,9.0,9912.0,44.4,22.0,0.0,1.0,0,39143.0,89.4,0.0,243.0,8.0,0.0,25.0,8.0,1.0,3.0,4.0,6.0,9.0,13.0,4.0,9.0,0.0,0.0,1.0,66.7,0.0,0.0,9200.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [11]:
# Dense array of the modelling frame; the next cell slices the features from it.
array = loans_tot.values

# Target: the interest rate, selected by column name instead of by position 2
# so a change in column order cannot silently swap the target.
Y = loans_tot['int_rate'].values
Y


array([ 10.99,  10.99,  11.99, ...,  30.89,  30.65,  30.94])

In [12]:
# Feature matrix: every column of the modelling frame except the target.
# Dropping int_rate by name replaces stitching together the positional slices
# on either side of column 2, which depended on the column order.
X = loans_tot.drop('int_rate', axis=1).values

# Show one feature row as a sanity check.
X[1]

array([  4.80000000e+03,   3.60000000e+01,   2.00000000e+00,
         3.96000000e+04,   2.49000000e+00,   0.00000000e+00,
         7.55000000e+02,   1.95000000e+02,   3.00000000e+00,
         4.13600000e+03,   1.61000000e+01,   8.00000000e+00,
         0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
         4.13600000e+03,   1.61000000e+01,   0.00000000e+00,
         1.04000000e+02,   2.50000000e+01,   0.00000000e+00,
         2.50000000e+01,   3.00000000e+00,   0.00000000e+00,
         2.00000000e+00,   2.00000000e+00,   4.00000000e+00,
         1.00000000e+00,   7.00000000e+00,   2.00000000e+00,
         3.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   2.57000000e+04,   0.00000000e+00,
         1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,

In [13]:
# 80/20 train/test split of the all-data features and target. The fixed
# random_state makes the split, and every metric derived from it, the same on
# each Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# LinearRegression is imported by name at the top of the notebook; using it
# directly avoids depending on the `from sklearn import *` wildcard.
lm = LinearRegression()
lm.fit(X_train, Y_train)

# NOTE(review): the recorded run shows an intercept near -2.4e6 for a target
# of order 10, which hints at (near-)collinear features such as a full set of
# one-hot dummies next to the intercept -- worth confirming.
lm.intercept_

-2396536.428857611

In [14]:
# Score the all-data linear model on the training half ...
ptrain = lm.predict(X_train)
rms_train = sqrt(mean_squared_error(Y_train, ptrain))
mae_train = mean_absolute_error(Y_train, ptrain)
mape_train = np.abs((Y_train - ptrain) / Y_train).mean() * 100

# ... and on the held-out half.
ptest = lm.predict(X_test)
rms_test = sqrt(mean_squared_error(Y_test, ptest))
mae_test = mean_absolute_error(Y_test, ptest)
mape_test = np.abs((Y_test - ptest) / Y_test).mean() * 100

# Pipe-delimited summary: root-mean-squared error, mean absolute error and
# mean absolute percentage error for train and test.
report_rows = [
    '|metric |train            |test             | ',
    '|rms    |' + str(rms_train) + '|' + str(rms_test) + '|',
    '|mae    |' + str(mae_train) + '    |' + str(mae_test) + '    |',
    '|mape   |' + str(mape_train) + '    |' + str(mape_test) + '    |',
]
print('\n'.join(report_rows))

|metric |train            |test             | 
|rms    |3.584897774607142|3.5825012770563323|
|mae    |2.74149700302    |2.74033041899    |
|mape   |22.5956759686    |22.5659130974    |


#### Attempt without States
The 51 additional indicator variables created by one-hot encoding the borrower's state could be affecting the goodness of fit, so we ran a second simple linear regression with the state columns removed.

In [7]:
# Linear regression of int_rate on all remaining columns, without state dummies.
# NOTE(review): the preprocessing cell currently has the state dummies
# commented out, so this frame has the same columns as `loans_tot`.
loans_tot2 = loans.drop(['manual_cat', 'kcluster'], axis=1)

# Select target and features by column name rather than by position, so a
# change in column order cannot silently swap the target.
Ys = loans_tot2['int_rate'].values
Xs = loans_tot2.drop('int_rate', axis=1).values

# Seed the 80/20 split so the metrics below are reproducible across re-runs.
X_trains, X_tests, Y_trains, Y_tests = train_test_split(Xs, Ys, test_size=0.2, random_state=42)

# LinearRegression is imported by name at the top of the notebook; using it
# directly avoids depending on the `from sklearn import *` wildcard.
lms = LinearRegression()
lms.fit(X_trains, Y_trains)

ptrains = lms.predict(X_trains)
ptests = lms.predict(X_tests)

# RMSE and MAE are in the units of int_rate; MAPE is a percentage of the
# actual rate.
rms_train = sqrt(mean_squared_error(Y_trains, ptrains))
rms_test = sqrt(mean_squared_error(Y_tests, ptests))
mae_train = mean_absolute_error(Y_trains, ptrains)
mae_test = mean_absolute_error(Y_tests, ptests)
mape_train = np.mean(np.abs((Y_trains - ptrains) / Y_trains)) * 100
mape_test = np.mean(np.abs((Y_tests - ptests) / Y_tests)) * 100

print('|metric |train            |test             | \n|rms    |'+str(rms_train)+'|'+str(rms_test)+'|\n|mae    |'
      +str(mae_train)+'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)+'    |')

|metric |train            |test             | 
|rms    |3.585454655160287|3.586699839384543|
|mae    |2.74255333688    |2.74161334565    |
|mape   |22.6066536721    |22.5692472448    |


#### All Data - Feature Selection

In [15]:
# Recursive feature elimination around a plain linear regression, keeping 15
# features of the current X_train / Y_train split.
model = LinearRegression()
rfe = RFE(estimator=model, n_features_to_select=15)
rfe.fit(X_train, Y_train)

# Score the reduced model on the training half ...
ptrain = rfe.predict(X_train)
rms_train = sqrt(mean_squared_error(Y_train, ptrain))
mae_train = mean_absolute_error(Y_train, ptrain)
mape_train = np.abs((Y_train - ptrain) / Y_train).mean() * 100

# ... and on the held-out half.
ptest = rfe.predict(X_test)
rms_test = sqrt(mean_squared_error(Y_test, ptest))
mae_test = mean_absolute_error(Y_test, ptest)
mape_test = np.abs((Y_test - ptest) / Y_test).mean() * 100

# Pipe-delimited summary: root-mean-squared error, mean absolute error and
# mean absolute percentage error for train and test.
report_rows = [
    '|metric |train            |test             | ',
    '|rms    |' + str(rms_train) + '|' + str(rms_test) + '|',
    '|mae    |' + str(mae_train) + '    |' + str(mae_test) + '    |',
    '|mape   |' + str(mape_train) + '    |' + str(mape_test) + '    |',
]
print('\n'.join(report_rows))

|metric |train            |test             | 
|rms    |4.595904205354061|4.60181176494732|
|mae    |3.59899302381    |3.60264273146    |
|mape   |31.7118288588    |31.7185915495    |


In [16]:
# Per-feature ranking from the fitted RFE, in the column order of X_train;
# the 15 selected features carry rank 1.
rfe.ranking_

array([76, 12, 48, 81, 29, 50, 25, 74, 15, 80, 61, 39,  8, 79, 65, 78, 69,
        1, 64, 54, 11, 57, 19, 70, 13, 27, 67, 55, 71, 14, 24,  1, 38,  1,
       53,  1, 75, 77,  1,  1,  1,  1,  1,  1,  3,  1,  1, 23, 32, 17, 26,
        1, 46, 73, 58,  1,  1, 10, 52, 33, 49, 72, 16,  2, 56, 20, 62, 30,
       60, 37, 44, 66, 28, 47,  5, 34, 36, 51, 22, 35,  6, 31, 45, 40,  4,
       63,  7, 42, 68, 43, 41, 59, 18, 21,  9])

#### All Data - Only Correlated Columns

In [8]:
# Linear regression of int_rate on a hand-picked set of predictors only.
# NOTE: this cell rebinds X, Y, X_train, X_test, Y_train, Y_test and lm, which
# the all-data cells also define; cells that need the all-data split must run
# before this one.
keep=['int_rate','fico_range_low','total_bc_limit','mths_since_recent_inq','mo_sin_rcnt_tl','loan_amnt','dti',
      'num_tl_op_past_12m','revol_util','percent_bc_gt_75','bc_util','term']
loans_selected = loans[keep]

# Select target and features by column name, so the result does not depend on
# int_rate being listed first in `keep`.
Y = loans_selected['int_rate'].values
X = loans_selected.drop('int_rate', axis=1).values

# Seed the 80/20 split so the metrics below are reproducible across re-runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# LinearRegression is imported by name at the top of the notebook; using it
# directly avoids depending on the `from sklearn import *` wildcard.
lm = LinearRegression()
lm.fit(X_train, Y_train)

ptrain = lm.predict(X_train)
ptest = lm.predict(X_test)

# RMSE and MAE are in the units of int_rate; MAPE is a percentage of the
# actual rate.
rms_train = sqrt(mean_squared_error(Y_train, ptrain))
rms_test = sqrt(mean_squared_error(Y_test, ptest))
mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100

print('|metric |train            |test             | \n|rms    |'+str(rms_train)+'|'+str(rms_test)+'|\n|mae    |'
      +str(mae_train)+'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)+'    |')

|metric |train            |test             | 
|rms    |3.6814228253706514|3.6848377526565015|
|mae    |2.81739968138    |2.8152951469    |
|mape   |23.3103033646    |23.319785822    |


# Manual Clusters

In [17]:
# Frame for the manual-cluster models: the K-means label is removed, the
# manual_cat label stays so the rows can be split by band.
loans_man = loans.drop(['kcluster'], axis=1)

In [18]:
# One sub-frame per manual FICO band, in the order Group1 .. Group6.
man1, man2, man3, man4, man5, man6 = (
    loans_man[loans_man['manual_cat'] == band]
    for band in ('Group1', 'Group2', 'Group3', 'Group4', 'Group5', 'Group6')
)

In [None]:
# Remove the band label so it cannot enter the per-band regressions.
# Each frame is a boolean-filtered slice of loans_man, so an in-place drop
# can trigger SettingWithCopyWarning; rebinding to the returned frame avoids
# that. errors='ignore' lets the cell be re-run without a KeyError once the
# column is already gone.
man1 = man1.drop('manual_cat', axis=1, errors='ignore')
man2 = man2.drop('manual_cat', axis=1, errors='ignore')
man3 = man3.drop('manual_cat', axis=1, errors='ignore')
man4 = man4.drop('manual_cat', axis=1, errors='ignore')
man5 = man5.drop('manual_cat', axis=1, errors='ignore')
man6 = man6.drop('manual_cat', axis=1, errors='ignore')

In [20]:
# Linear regression of int_rate on all remaining columns of manual band
# Group1 (fico_range_low above 720).

# Select target and features by column name rather than by position, so a
# change in column order cannot silently swap the target.
Y1 = man1['int_rate'].values
X1 = man1.drop('int_rate', axis=1).values

# Seed the 80/20 split so the metrics below are reproducible across re-runs.
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X1, Y1, test_size=0.2, random_state=42)

# LinearRegression is imported by name at the top of the notebook; using it
# directly avoids depending on the `from sklearn import *` wildcard.
lm1 = LinearRegression()
lm1.fit(X_train1, Y_train1)

ptrain1 = lm1.predict(X_train1)
ptest1 = lm1.predict(X_test1)

# RMSE and MAE are in the units of int_rate; MAPE is a percentage of the
# actual rate.
rms_train = sqrt(mean_squared_error(Y_train1, ptrain1))
rms_test = sqrt(mean_squared_error(Y_test1, ptest1))
mae_train = mean_absolute_error(Y_train1, ptrain1)
mae_test = mean_absolute_error(Y_test1, ptest1)
mape_train = np.mean(np.abs((Y_train1 - ptrain1) / Y_train1)) * 100
mape_test = np.mean(np.abs((Y_test1 - ptest1) / Y_test1)) * 100

print('|metric |train            |test             | \n|rms    |'+str(rms_train)+'|'+str(rms_test)+'|\n|mae    |'
      +str(mae_train)+'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)+'    |')

|metric |train            |test             | 
|rms    |3.0371079747478458|3.045673588849156|
|mae    |2.24825484536    |2.25054850412    |
|mape   |23.7815106366    |23.7352980594    |


In [21]:
# Linear regression of int_rate on all remaining columns of manual band
# Group2 (fico_range_low above 700, up to 720).

# Select target and features by column name rather than by position, so a
# change in column order cannot silently swap the target.
Y2 = man2['int_rate'].values
X2 = man2.drop('int_rate', axis=1).values

# Seed the 80/20 split so the metrics below are reproducible across re-runs.
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X2, Y2, test_size=0.2, random_state=42)

# LinearRegression is imported by name at the top of the notebook; using it
# directly avoids depending on the `from sklearn import *` wildcard.
lm2 = LinearRegression()
lm2.fit(X_train2, Y_train2)

ptrain2 = lm2.predict(X_train2)
ptest2 = lm2.predict(X_test2)

# RMSE and MAE are in the units of int_rate; MAPE is a percentage of the
# actual rate.
rms_train = sqrt(mean_squared_error(Y_train2, ptrain2))
rms_test = sqrt(mean_squared_error(Y_test2, ptest2))
mae_train = mean_absolute_error(Y_train2, ptrain2)
mae_test = mean_absolute_error(Y_test2, ptest2)
mape_train = np.mean(np.abs((Y_train2 - ptrain2) / Y_train2)) * 100
mape_test = np.mean(np.abs((Y_test2 - ptest2) / Y_test2)) * 100

print('|metric |train            |test             | \n|rms    |'+str(rms_train)+'|'+str(rms_test)+'|\n|mae    |'
      +str(mae_train)+'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)+'      |')

|metric |train            |test             | 
|rms    |3.465332722029478|3.452755500037841|
|mae    |2.63543588824    |2.62789353572    |
|mape   |23.7835054651    |23.6601098955      |


In [22]:
# Linear regression of int_rate on all remaining columns of manual band
# Group3 (fico_range_low above 690, up to 700).

# Select target and features by column name rather than by position, so a
# change in column order cannot silently swap the target.
Y3 = man3['int_rate'].values
X3 = man3.drop('int_rate', axis=1).values

# Seed the 80/20 split so the metrics below are reproducible across re-runs.
X_train3, X_test3, Y_train3, Y_test3 = train_test_split(X3, Y3, test_size=0.2, random_state=42)

# LinearRegression is imported by name at the top of the notebook; using it
# directly avoids depending on the `from sklearn import *` wildcard.
lm3 = LinearRegression()
lm3.fit(X_train3, Y_train3)

ptrain3 = lm3.predict(X_train3)
ptest3 = lm3.predict(X_test3)

# RMSE and MAE are in the units of int_rate; MAPE is a percentage of the
# actual rate.
rms_train = sqrt(mean_squared_error(Y_train3, ptrain3))
rms_test = sqrt(mean_squared_error(Y_test3, ptest3))
mae_train = mean_absolute_error(Y_train3, ptrain3)
mae_test = mean_absolute_error(Y_test3, ptest3)
mape_train = np.mean(np.abs((Y_train3 - ptrain3) / Y_train3)) * 100
mape_test = np.mean(np.abs((Y_test3 - ptest3) / Y_test3)) * 100

print('|metric |train            |test             | \n|rms    |'+str(rms_train)+'|'+str(rms_test)+'|\n|mae    |'
      +str(mae_train)+'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)+'      |')

|metric |train            |test             | 
|rms    |3.6249253987949315|3.6395634447347196|
|mae    |2.75964044618    |2.77110872235    |
|mape   |23.0054064159    |23.0274027424      |


In [23]:
# Linear regression of int_rate on all remaining columns of manual band
# Group4 (fico_range_low above 680, up to 690).

# Select target and features by column name rather than by position, so a
# change in column order cannot silently swap the target.
Y4 = man4['int_rate'].values
X4 = man4.drop('int_rate', axis=1).values

# Seed the 80/20 split so the metrics below are reproducible across re-runs.
X_train4, X_test4, Y_train4, Y_test4 = train_test_split(X4, Y4, test_size=0.2, random_state=42)

# LinearRegression is imported by name at the top of the notebook; using it
# directly avoids depending on the `from sklearn import *` wildcard.
lm4 = LinearRegression()
lm4.fit(X_train4, Y_train4)

ptrain4 = lm4.predict(X_train4)
ptest4 = lm4.predict(X_test4)

# RMSE and MAE are in the units of int_rate; MAPE is a percentage of the
# actual rate.
rms_train = sqrt(mean_squared_error(Y_train4, ptrain4))
rms_test = sqrt(mean_squared_error(Y_test4, ptest4))
mae_train = mean_absolute_error(Y_train4, ptrain4)
mae_test = mean_absolute_error(Y_test4, ptest4)
mape_train = np.mean(np.abs((Y_train4 - ptrain4) / Y_train4)) * 100
mape_test = np.mean(np.abs((Y_test4 - ptest4) / Y_test4)) * 100

print('|metric |train            |test             | \n|rms    |'+str(rms_train)+'|'+str(rms_test)+'|\n|mae    |'
      +str(mae_train)+'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)+'      |')

|metric |train            |test             | 
|rms    |3.7187560587797552|3.741684835351748|
|mae    |2.84589620621    |2.86032308243    |
|mape   |22.6253919231    |22.6981479687      |


In [24]:
# Linear regression of int_rate on all remaining columns of manual band
# Group5 (fico_range_low above 670, up to 680).

# Select target and features by column name rather than by position, so a
# change in column order cannot silently swap the target.
Y5 = man5['int_rate'].values
X5 = man5.drop('int_rate', axis=1).values

# Seed the 80/20 split so the metrics below are reproducible across re-runs.
X_train5, X_test5, Y_train5, Y_test5 = train_test_split(X5, Y5, test_size=0.2, random_state=42)

# LinearRegression is imported by name at the top of the notebook; using it
# directly avoids depending on the `from sklearn import *` wildcard.
lm5 = LinearRegression()
lm5.fit(X_train5, Y_train5)

ptrain5 = lm5.predict(X_train5)
ptest5 = lm5.predict(X_test5)

# RMSE and MAE are in the units of int_rate; MAPE is a percentage of the
# actual rate.
rms_train = sqrt(mean_squared_error(Y_train5, ptrain5))
rms_test = sqrt(mean_squared_error(Y_test5, ptest5))
mae_train = mean_absolute_error(Y_train5, ptrain5)
mae_test = mean_absolute_error(Y_test5, ptest5)
mape_train = np.mean(np.abs((Y_train5 - ptrain5) / Y_train5)) * 100
mape_test = np.mean(np.abs((Y_test5 - ptest5) / Y_test5)) * 100

print('|metric |train            |test             | \n|rms    |'+str(rms_train)+'|'+str(rms_test)+'|\n|mae    |'
      +str(mae_train)+'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)+'      |')

|metric |train            |test             | 
|rms    |3.689256932931926|3.6827127928289523|
|mae    |2.83508159995    |2.8260408302    |
|mape   |21.1978376688    |21.0890224745      |


In [25]:
# Linear regression of int_rate on all remaining columns of manual band
# Group6 (fico_range_low of 670 or below, plus any value that matched no
# higher band).

# Select target and features by column name rather than by position, so a
# change in column order cannot silently swap the target.
Y6 = man6['int_rate'].values
X6 = man6.drop('int_rate', axis=1).values

# Seed the 80/20 split so the metrics below are reproducible across re-runs.
X_train6, X_test6, Y_train6, Y_test6 = train_test_split(X6, Y6, test_size=0.2, random_state=42)

# LinearRegression is imported by name at the top of the notebook; using it
# directly avoids depending on the `from sklearn import *` wildcard.
lm6 = LinearRegression()
lm6.fit(X_train6, Y_train6)

ptrain6 = lm6.predict(X_train6)
ptest6 = lm6.predict(X_test6)

# RMSE and MAE are in the units of int_rate; MAPE is a percentage of the
# actual rate.
rms_train = sqrt(mean_squared_error(Y_train6, ptrain6))
rms_test = sqrt(mean_squared_error(Y_test6, ptest6))
mae_train = mean_absolute_error(Y_train6, ptrain6)
mae_test = mean_absolute_error(Y_test6, ptest6)
mape_train = np.mean(np.abs((Y_train6 - ptrain6) / Y_train6)) * 100
mape_test = np.mean(np.abs((Y_test6 - ptest6) / Y_test6)) * 100

print('|metric |train            |test             | \n|rms    |'+str(rms_train)+'|'+str(rms_test)+'|\n|mae    |'
      +str(mae_train)+'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)+'      |')

|metric |train            |test             | 
|rms    |3.6917699508040354|3.6944622825143405|
|mae    |2.86236097205    |2.8698897677    |
|mape   |20.0612733076    |20.1003759989      |


# KMeans Clusters

In [27]:
# Frame for the K-means models: the manual band label is removed, the
# kcluster label stays so the rows can be split by cluster.
kloans = loans.drop(['manual_cat'], axis=1)

# One sub-frame per K-means label 0 .. 6.
kc0, kc1, kc2, kc3, kc4, kc5, kc6 = (
    kloans[kloans['kcluster'] == cluster_id]
    for cluster_id in range(7)
)

kc0.head()

Unnamed: 0,loan_amnt,term,int_rate,emp_length,annual_inc,dti,delinq_2yrs,fico_range_low,mths_since_last_delinq,open_acc,revol_bal,revol_util,total_acc,collections_12_mths_ex_med,policy_code,application_type,tot_cur_bal,bc_util,chargeoff_within_12_mths,mo_sin_old_il_acct,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_inq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_tl,num_il_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,total_bc_limit,kcluster,home_ANY,home_MORTGAGE,home_NONE,home_OTHER,home_OWN,home_RENT,st_AK,st_AL,st_AR,st_AZ,st_CA,st_CO,st_CT,st_DC,st_DE,st_FL,st_GA,st_HI,st_IA,st_ID,st_IL,st_IN,st_KS,st_KY,st_LA,st_MA,st_MD,st_ME,st_MI,st_MN,st_MO,st_MS,st_MT,st_NC,st_ND,st_NE,st_NH,st_NJ,st_NM,st_NV,st_NY,st_OH,st_OK,st_OR,st_PA,st_RI,st_SC,st_SD,st_TN,st_TX,st_UT,st_VA,st_VT,st_WA,st_WI,st_WV,st_WY
378751,25000.0,36,7.49,10.0,109000.0,26.02,0.0,745.0,195.0,9.0,20862.0,54.3,19.0,0.0,1.0,0,305781.0,54.3,0.0,142.0,13.0,3.0,13.0,0.0,0.0,3.0,3.0,6.0,7.0,9.0,3.0,9.0,0.0,0.0,0.0,20.0,0.0,0.0,38400.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
378752,20000.0,60,13.99,10.0,70000.0,16.9,0.0,680.0,33.0,20.0,31200.0,42.0,35.0,0.0,1.0,0,34856.0,44.0,0.0,46.0,5.0,0.0,6.0,13.0,1.0,6.0,11.0,19.0,2.0,32.0,11.0,20.0,0.0,0.0,2.0,44.4,0.0,0.0,33100.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
378753,16000.0,60,12.88,1.0,70000.0,26.4,0.0,720.0,195.0,13.0,28705.0,56.3,29.0,0.0,1.0,0,265836.0,61.0,0.0,147.0,9.0,2.0,9.0,11.0,0.0,4.0,6.0,12.0,6.0,21.0,6.0,13.0,0.0,0.0,1.0,60.0,0.0,0.0,45900.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
378754,25000.0,36,5.32,1.0,150000.0,9.54,0.0,795.0,195.0,7.0,19339.0,42.5,18.0,0.0,1.0,0,430856.0,44.0,0.0,166.0,4.0,7.0,73.0,5.0,0.0,1.0,1.0,3.0,4.0,7.0,1.0,7.0,0.0,0.0,1.0,0.0,0.0,0.0,44000.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
378755,15700.0,60,16.59,6.0,48000.0,29.13,0.0,685.0,66.0,19.0,15458.0,64.9,28.0,0.0,1.0,0,270017.0,87.7,0.0,112.0,4.0,1.0,4.0,10.0,1.0,3.0,6.0,5.0,13.0,14.0,7.0,19.0,0.0,0.0,3.0,75.0,0.0,0.0,12300.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Remove the cluster label so it cannot enter the per-cluster regressions.
# Each frame is a boolean-filtered slice of kloans, so an in-place drop can
# trigger SettingWithCopyWarning; rebinding to the returned frame avoids
# that. errors='ignore' lets the cell be re-run without a KeyError once the
# column is already gone.
kc0 = kc0.drop('kcluster', axis=1, errors='ignore')
kc1 = kc1.drop('kcluster', axis=1, errors='ignore')
kc2 = kc2.drop('kcluster', axis=1, errors='ignore')
kc3 = kc3.drop('kcluster', axis=1, errors='ignore')
kc4 = kc4.drop('kcluster', axis=1, errors='ignore')
kc5 = kc5.drop('kcluster', axis=1, errors='ignore')
kc6 = kc6.drop('kcluster', axis=1, errors='ignore')

In [29]:
# Linear regression of int_rate on all remaining columns of K-means cluster 0.

# Select target and features by column name rather than by position, so a
# change in column order cannot silently swap the target.
Ykc0 = kc0['int_rate'].values
Xkc0 = kc0.drop('int_rate', axis=1).values

# Seed the 80/20 split so the metrics below are reproducible across re-runs.
X_trainkc0, X_testkc0, Y_trainkc0, Y_testkc0 = train_test_split(Xkc0, Ykc0, test_size=0.2, random_state=42)

# LinearRegression is imported by name at the top of the notebook; using it
# directly avoids depending on the `from sklearn import *` wildcard.
lmkc0 = LinearRegression()
lmkc0.fit(X_trainkc0, Y_trainkc0)

ptrainkc0 = lmkc0.predict(X_trainkc0)
ptestkc0 = lmkc0.predict(X_testkc0)

# RMSE and MAE are in the units of int_rate; MAPE is a percentage of the
# actual rate.
rms_train = sqrt(mean_squared_error(Y_trainkc0, ptrainkc0))
rms_test = sqrt(mean_squared_error(Y_testkc0, ptestkc0))
mae_train = mean_absolute_error(Y_trainkc0, ptrainkc0)
mae_test = mean_absolute_error(Y_testkc0, ptestkc0)
mape_train = np.mean(np.abs((Y_trainkc0 - ptrainkc0) / Y_trainkc0)) * 100
mape_test = np.mean(np.abs((Y_testkc0 - ptestkc0) / Y_testkc0)) * 100

print('|metric |train            |test             | \n|rms    |'+str(rms_train)+'|'+str(rms_test)+'|\n|mae    |'
      +str(mae_train)+'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)+'      |')

|metric |train            |test             | 
|rms    |3.5652619784096204|3.5327663089173487|
|mae    |2.76515175662    |2.75883372871    |
|mape   |24.2674969743    |24.3955240571      |


In [30]:
# Linear regression of int_rate on all remaining columns of K-means cluster 1.

# Select target and features by column name rather than by position, so a
# change in column order cannot silently swap the target.
Ykc1 = kc1['int_rate'].values
Xkc1 = kc1.drop('int_rate', axis=1).values

# Seed the 80/20 split so the metrics below are reproducible across re-runs.
X_trainkc1, X_testkc1, Y_trainkc1, Y_testkc1 = train_test_split(Xkc1, Ykc1, test_size=0.2, random_state=42)

# LinearRegression is imported by name at the top of the notebook; using it
# directly avoids depending on the `from sklearn import *` wildcard.
lmkc1 = LinearRegression()
lmkc1.fit(X_trainkc1, Y_trainkc1)

ptrainkc1 = lmkc1.predict(X_trainkc1)
ptestkc1 = lmkc1.predict(X_testkc1)

# RMSE and MAE are in the units of int_rate; MAPE is a percentage of the
# actual rate.
rms_train = sqrt(mean_squared_error(Y_trainkc1, ptrainkc1))
rms_test = sqrt(mean_squared_error(Y_testkc1, ptestkc1))
mae_train = mean_absolute_error(Y_trainkc1, ptrainkc1)
mae_test = mean_absolute_error(Y_testkc1, ptestkc1)
mape_train = np.mean(np.abs((Y_trainkc1 - ptrainkc1) / Y_trainkc1)) * 100
mape_test = np.mean(np.abs((Y_testkc1 - ptestkc1) / Y_testkc1)) * 100

print('|metric |train            |test             | \n|rms    |'+str(rms_train)+'|'+str(rms_test)+'|\n|mae    |'
      +str(mae_train)+'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)+'      |')

|metric |train            |test             | 
|rms    |3.13390000713961|3.1542774258330617|
|mae    |2.42434981498    |2.44616289836    |
|mape   |19.6338334686    |19.7490991115      |


In [31]:
# Linear regression of int_rate on all remaining columns of K-means cluster 2.

# Select target and features by column name rather than by position, so a
# change in column order cannot silently swap the target.
Ykc2 = kc2['int_rate'].values
Xkc2 = kc2.drop('int_rate', axis=1).values

# Seed the 80/20 split so the metrics below are reproducible across re-runs.
X_trainkc2, X_testkc2, Y_trainkc2, Y_testkc2 = train_test_split(Xkc2, Ykc2, test_size=0.2, random_state=42)

# LinearRegression is imported by name at the top of the notebook; using it
# directly avoids depending on the `from sklearn import *` wildcard.
lmkc2 = LinearRegression()
lmkc2.fit(X_trainkc2, Y_trainkc2)

ptrainkc2 = lmkc2.predict(X_trainkc2)
ptestkc2 = lmkc2.predict(X_testkc2)

# RMSE and MAE are in the units of int_rate; MAPE is a percentage of the
# actual rate.
rms_train = sqrt(mean_squared_error(Y_trainkc2, ptrainkc2))
rms_test = sqrt(mean_squared_error(Y_testkc2, ptestkc2))
mae_train = mean_absolute_error(Y_trainkc2, ptrainkc2)
mae_test = mean_absolute_error(Y_testkc2, ptestkc2)
mape_train = np.mean(np.abs((Y_trainkc2 - ptrainkc2) / Y_trainkc2)) * 100
mape_test = np.mean(np.abs((Y_testkc2 - ptestkc2) / Y_testkc2)) * 100

print('|metric |train            |test             | \n|rms    |'+str(rms_train)+'|'+str(rms_test)+'|\n|mae    |'
      +str(mae_train)+'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)+'      |')

|metric |train            |test             | 
|rms    |4.099505906857208|4.093987808656713|
|mae    |3.02576884856    |3.02368956049    |
|mape   |23.7348957249    |23.6741218839      |


In [32]:
# Linear regression of int_rate on all remaining columns of K-means cluster 3.

# Select target and features by column name rather than by position, so a
# change in column order cannot silently swap the target.
Ykc3 = kc3['int_rate'].values
Xkc3 = kc3.drop('int_rate', axis=1).values

# Seed the 80/20 split so the metrics below are reproducible across re-runs.
X_trainkc3, X_testkc3, Y_trainkc3, Y_testkc3 = train_test_split(Xkc3, Ykc3, test_size=0.2, random_state=42)

# LinearRegression is imported by name at the top of the notebook; using it
# directly avoids depending on the `from sklearn import *` wildcard.
lmkc3 = LinearRegression()
lmkc3.fit(X_trainkc3, Y_trainkc3)

ptrainkc3 = lmkc3.predict(X_trainkc3)
ptestkc3 = lmkc3.predict(X_testkc3)

# RMSE and MAE are in the units of int_rate; MAPE is a percentage of the
# actual rate.
rms_train = sqrt(mean_squared_error(Y_trainkc3, ptrainkc3))
rms_test = sqrt(mean_squared_error(Y_testkc3, ptestkc3))
mae_train = mean_absolute_error(Y_trainkc3, ptrainkc3)
mae_test = mean_absolute_error(Y_testkc3, ptestkc3)
mape_train = np.mean(np.abs((Y_trainkc3 - ptrainkc3) / Y_trainkc3)) * 100
mape_test = np.mean(np.abs((Y_testkc3 - ptestkc3) / Y_testkc3)) * 100

print('|metric |train            |test             | \n|rms    |'+str(rms_train)+'|'+str(rms_test)+'|\n|mae    |'
      +str(mae_train)+'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)+'      |')

|metric |train            |test             | 
|rms    |3.835351590877725|3.8295581100221314|
|mae    |2.9277095298    |2.91749993402    |
|mape   |22.8053440166    |22.7161511693      |


In [33]:
# Linear regression of int_rate on all remaining columns of K-means cluster 4.

# Select target and features by column name rather than by position, so a
# change in column order cannot silently swap the target.
Ykc4 = kc4['int_rate'].values
Xkc4 = kc4.drop('int_rate', axis=1).values

# Seed the 80/20 split so the metrics below are reproducible across re-runs.
X_trainkc4, X_testkc4, Y_trainkc4, Y_testkc4 = train_test_split(Xkc4, Ykc4, test_size=0.2, random_state=42)

# LinearRegression is imported by name at the top of the notebook; using it
# directly avoids depending on the `from sklearn import *` wildcard.
lmkc4 = LinearRegression()
lmkc4.fit(X_trainkc4, Y_trainkc4)

ptrainkc4 = lmkc4.predict(X_trainkc4)
ptestkc4 = lmkc4.predict(X_testkc4)

# RMSE and MAE are in the units of int_rate; MAPE is a percentage of the
# actual rate.
rms_train = sqrt(mean_squared_error(Y_trainkc4, ptrainkc4))
rms_test = sqrt(mean_squared_error(Y_testkc4, ptestkc4))
mae_train = mean_absolute_error(Y_trainkc4, ptrainkc4)
mae_test = mean_absolute_error(Y_testkc4, ptestkc4)
mape_train = np.mean(np.abs((Y_trainkc4 - ptrainkc4) / Y_trainkc4)) * 100
mape_test = np.mean(np.abs((Y_testkc4 - ptestkc4) / Y_testkc4)) * 100

print('|metric |train            |test             | \n|rms    |'+str(rms_train)+'|'+str(rms_test)+'|\n|mae    |'
      +str(mae_train)+'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)+'      |')

|metric |train            |test             | 
|rms    |3.141024069133379|3.132542385167082|
|mae    |2.47845204629    |2.47568635239    |
|mape   |21.2526526582    |21.3165419097      |


In [34]:
# Linear regression of int_rate on all remaining columns of K-means cluster 5.

# Select target and features by column name rather than by position, so a
# change in column order cannot silently swap the target.
Ykc5 = kc5['int_rate'].values
Xkc5 = kc5.drop('int_rate', axis=1).values

# Seed the 80/20 split so the metrics below are reproducible across re-runs.
X_trainkc5, X_testkc5, Y_trainkc5, Y_testkc5 = train_test_split(Xkc5, Ykc5, test_size=0.2, random_state=42)

# LinearRegression is imported by name at the top of the notebook; using it
# directly avoids depending on the `from sklearn import *` wildcard.
lmkc5 = LinearRegression()
lmkc5.fit(X_trainkc5, Y_trainkc5)

ptrainkc5 = lmkc5.predict(X_trainkc5)
ptestkc5 = lmkc5.predict(X_testkc5)

# RMSE and MAE are in the units of int_rate; MAPE is a percentage of the
# actual rate.
rms_train = sqrt(mean_squared_error(Y_trainkc5, ptrainkc5))
rms_test = sqrt(mean_squared_error(Y_testkc5, ptestkc5))
mae_train = mean_absolute_error(Y_trainkc5, ptrainkc5)
mae_test = mean_absolute_error(Y_testkc5, ptestkc5)
mape_train = np.mean(np.abs((Y_trainkc5 - ptrainkc5) / Y_trainkc5)) * 100
mape_test = np.mean(np.abs((Y_testkc5 - ptestkc5) / Y_testkc5)) * 100

print('|metric |train            |test             | \n|rms    |'+str(rms_train)+'|'+str(rms_test)+'|\n|mae    |'
      +str(mae_train)+'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)+'      |')

|metric |train            |test             | 
|rms    |2.8483284143991954|2.836506043423458|
|mae    |2.24428715478    |2.234771086    |
|mape   |17.1006961597    |17.0423735808      |


In [35]:
# Linear regression of int_rate on all remaining columns of K-means cluster 6.

# Select target and features by column name rather than by position, so a
# change in column order cannot silently swap the target.
Ykc6 = kc6['int_rate'].values
Xkc6 = kc6.drop('int_rate', axis=1).values

# Seed the 80/20 split so the metrics below are reproducible across re-runs.
X_trainkc6, X_testkc6, Y_trainkc6, Y_testkc6 = train_test_split(Xkc6, Ykc6, test_size=0.2, random_state=42)

# LinearRegression is imported by name at the top of the notebook; using it
# directly avoids depending on the `from sklearn import *` wildcard.
lmkc6 = LinearRegression()
lmkc6.fit(X_trainkc6, Y_trainkc6)

ptrainkc6 = lmkc6.predict(X_trainkc6)
ptestkc6 = lmkc6.predict(X_testkc6)

# RMSE and MAE are in the units of int_rate; MAPE is a percentage of the
# actual rate.
rms_train = sqrt(mean_squared_error(Y_trainkc6, ptrainkc6))
rms_test = sqrt(mean_squared_error(Y_testkc6, ptestkc6))
mae_train = mean_absolute_error(Y_trainkc6, ptrainkc6)
mae_test = mean_absolute_error(Y_testkc6, ptestkc6)
mape_train = np.mean(np.abs((Y_trainkc6 - ptrainkc6) / Y_trainkc6)) * 100
mape_test = np.mean(np.abs((Y_testkc6 - ptestkc6) / Y_testkc6)) * 100

print('|metric |train            |test             | \n|rms    |'+str(rms_train)+'|'+str(rms_test)+'|\n|mae    |'
      +str(mae_train)+'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)+'      |')

|metric |train            |test             | 
|rms    |3.0075462987911328|3.025473181914594|
|mae    |2.36904561269    |2.37852197671    |
|mape   |20.8282172927    |20.8518797479      |
