In [1]:
import pandas as pd
import pickle
import numpy as np
from sklearn.externals import joblib
from sklearn import *
from sklearn import cluster
from sklearn.metrics import mean_squared_error,mean_absolute_error
from math import sqrt
from sklearn.model_selection import train_test_split



In [2]:
loans = pd.read_csv('../loanstats.csv')

In [3]:
def inc(a):
    if a > 100000000:
        return loans['annual_inc'].mean()
    else:
        return a
loans['annual_inc'] = loans['annual_inc'].map(lambda a: inc(a))

loans['term'] = loans['term'].map(lambda a: int(a.strip(' months')))
loans['application_type'] = loans['application_type'].map(lambda a: 1 if a=='Joint App' else 0)

def elength(a):
    if (a=='n/a'):
        return 0
    elif (a=='10+ years'):
        return 10
    elif (a=='1 year'):
        return 1
    elif (a=='< 1 year'):
        return 0.5
    else:
        return float(a.strip(' years'))
    
loans['emp_length'] = loans['emp_length'].map(lambda a: elength(a))

loans['revol_util'] = loans['revol_util'].map(lambda a: float(a.strip('%')))

homes = pd.get_dummies(loans['home_ownership'], prefix='home')
loans = loans.join(homes)
loans.drop('home_ownership', axis=1, inplace=True)

states = pd.get_dummies(loans['addr_state'], prefix='st')
loans = loans.join(states)
loans.drop('addr_state', axis=1, inplace=True)

loans.drop('grade', axis=1, inplace=True)
loans.drop('sub_grade', axis=1, inplace=True)
loans.drop('set', axis=1, inplace=True)
loans.drop('timestamp', axis=1, inplace=True)

loans.drop('issue_d', axis=1, inplace=True)
loans.drop('last_credit_pull_d', axis=1, inplace=True)
loans.drop('title', axis=1, inplace=True)
loans.drop('purpose', axis=1, inplace=True)
loans.drop('next_pymnt_d', axis=1, inplace=True)
loans.drop('zip_code', axis=1, inplace=True)
loans.drop('last_fico_range_high', axis=1, inplace=True)
loans.drop('fico_range_high', axis=1, inplace=True)
loans.drop('last_fico_range_low', axis=1, inplace=True)
loans.drop('installment', axis=1, inplace=True)
loans.drop('emp_title', axis=1, inplace=True)

# No Clusters

In [4]:
from sklearn.neighbors import KNeighborsRegressor

In [5]:
array=loans.values
Y=array[:,3]
Y

array([ 10.99,  10.99,  11.99, ...,  30.89,  30.65,  30.94])

In [6]:
x1=array[:,1:3]
x2=array[:,4:]
X=np.hstack((x1,x2))
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2)

## Attempt 1 - 5 neighbors, default values

In [10]:
kn_tot = KNeighborsRegressor(n_neighbors=5)

In [11]:
kn_tot.fit(X_train, Y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [12]:
ptrain = kn_tot.predict(X_train)
ptest = kn_tot.predict(X_test)
rms_train = sqrt(mean_squared_error(Y_train, ptrain))
rms_test = sqrt(mean_squared_error(Y_test, ptest))
mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100
#results.append([name, rms_train, rms_test, mae_train, mae_test, mape_train, mape_test])

print('|metric |train            |test             | \n|rms    |'+str(rms_train)+'|'+str(rms_test)+'|\n|mae    |'
      +str(mae_train)+'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)+'    |')

|metric |train            |test             | 
|rms    |3.7499824881607187|4.5861409469993175|
|mae    |2.90587200592    |3.5545191538    |
|mape   |24.5315428931    |30.0320739693    |


## Attempt 2 - 10 Neighbors

In [13]:
kn_tot2 = KNeighborsRegressor(n_neighbors=10)
kn_tot2.fit(X_train, Y_train)
ptrain = kn_tot2.predict(X_train)
ptest = kn_tot2.predict(X_test)
rms_train = sqrt(mean_squared_error(Y_train, ptrain))
rms_test = sqrt(mean_squared_error(Y_test, ptest))
mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100
#results.append([name, rms_train, rms_test, mae_train, mae_test, mape_train, mape_test])

print('|metric |train            |test             | \n|rms    |'+str(rms_train)+'|'+str(rms_test)+'|\n|mae    |'
      +str(mae_train)+'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)+'    |')

|metric |train            |test             | 
|rms    |3.9793228555102287|4.3956449277936445|
|mae    |3.07883587622    |3.40093598985    |
|mape   |26.1453710031    |28.8700109764    |


## Attempt 3 - Increase Leaf Size

In [7]:
kn_tot2 = KNeighborsRegressor(n_neighbors=10, leaf_size=100)
kn_tot2.fit(X_train, Y_train)
ptrain = kn_tot2.predict(X_train)
ptest = kn_tot2.predict(X_test)
rms_train = sqrt(mean_squared_error(Y_train, ptrain))
rms_test = sqrt(mean_squared_error(Y_test, ptest))
mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100
#results.append([name, rms_train, rms_test, mae_train, mae_test, mape_train, mape_test])

print('|metric |train            |test             | \n|rms    |'+str(rms_train)+'|'+str(rms_test)+'|\n|mae    |'
      +str(mae_train)+'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)+'    |')

|metric |train            |test             | 
|rms    |3.9768872967538345|4.4046665949643415|
|mae    |3.07696660844    |3.40815071205    |
|mape   |26.1289139423    |28.9437853246    |


In [None]:
from sklearn.metrics import accuracy_score
mean(abs(accuracy(Ytest, ptest)))