In [3]:
import pandas as pd
import pickle
import numpy as np
from sklearn.externals import joblib
from sklearn import *
from sklearn import cluster
from sklearn.metrics import mean_squared_error,mean_absolute_error
from math import sqrt
from sklearn.model_selection import train_test_split

In [18]:
k_means = joblib.load(open('kmeans.pkl', 'rb'))
loans = pd.read_csv('loanstats.csv')
loans['kcluster'] = k_means.labels_

def manual_clusters(fico):
    if fico > 720:
        return 'Group1'
    elif fico > 700:
        return 'Group2'
    elif fico > 690: 
        return 'Group3'
    elif fico > 680: # amnt < 25000 
        return 'Group4'
    elif fico > 670:
        return 'Group5'
    else:
        return 'Group6'

loans['manual_cat'] = loans.apply(lambda x: manual_clusters(x['fico_range_low']), axis=1)

loans['term'] = loans['term'].map(lambda a: int(a.strip(' months')))
loans['application_type'] = loans['application_type'].map(lambda a: 1 if a=='Joint App' else 0)

def elength(a):
    if (a=='n/a'):
        return 0
    elif (a=='10+ years'):
        return 10
    elif (a=='1 year'):
        return 1
    elif (a=='< 1 year'):
        return 0.5
    else:
        return float(a.strip(' years'))
    
loans['emp_length'] = loans['emp_length'].map(lambda a: elength(a))

loans['revol_util'] = loans['revol_util'].map(lambda a: float(a.strip('%')))

homes = pd.get_dummies(loans['home_ownership'], prefix='home')
loans = loans.join(homes)
loans.drop('home_ownership', axis=1, inplace=True)

states = pd.get_dummies(loans['addr_state'], prefix='st')
loans = loans.join(states)
loans.drop('addr_state', axis=1, inplace=True)

loans.drop('grade', axis=1, inplace=True)
loans.drop('sub_grade', axis=1, inplace=True)
loans.drop('set', axis=1, inplace=True)
loans.drop('emp_title', axis=1, inplace=True)
loans.drop('timestamp', axis=1, inplace=True)
loans.drop('issue_d', axis=1, inplace=True)
loans.drop('last_credit_pull_d', axis=1, inplace=True)
loans.drop('title', axis=1, inplace=True)
loans.drop('purpose', axis=1, inplace=True)
loans.drop('next_pymnt_d', axis=1, inplace=True)
loans.drop('zip_code', axis=1, inplace=True)
loans.drop('last_fico_range_high', axis=1, inplace=True)
loans.drop('fico_range_high', axis=1, inplace=True)
loans.drop('last_fico_range_low', axis=1, inplace=True)
loans.drop('installment', axis=1, inplace=True)

In [19]:
loans_tot = loans.drop('manual_cat', axis=1)
loans_tot.drop('kcluster', axis=1, inplace=True)

In [20]:
array=loans_tot.values

Y=array[:,3]
Y

array([ 10.99,  10.99,  11.99, ...,  30.89,  30.65,  30.94])

In [21]:
x1=array[:,1:3]
x2=array[:,4:]
X=np.hstack((x1,x2))

X[1]

array([  4.80000000e+03,   3.60000000e+01,   2.00000000e+00,
         3.96000000e+04,   2.49000000e+00,   0.00000000e+00,
         7.55000000e+02,   1.95000000e+02,   3.00000000e+00,
         4.13600000e+03,   1.61000000e+01,   8.00000000e+00,
         0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
         4.13600000e+03,   1.61000000e+01,   0.00000000e+00,
         1.04000000e+02,   2.50000000e+01,   0.00000000e+00,
         2.50000000e+01,   3.00000000e+00,   0.00000000e+00,
         2.00000000e+00,   2.00000000e+00,   4.00000000e+00,
         1.00000000e+00,   7.00000000e+00,   2.00000000e+00,
         3.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   2.57000000e+04,   0.00000000e+00,
         1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,

In [22]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2)

In [23]:
from sklearn.ensemble import RandomForestRegressor
model=RandomForestRegressor(n_estimators=60)

In [24]:
model.fit(X_train,Y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=60, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [25]:
ptrain = model.predict(X_train)
ptest = model.predict(X_test)
rms_train = sqrt(mean_squared_error(Y_train, ptrain))
rms_test = sqrt(mean_squared_error(Y_test, ptest))
mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100

print('|metric |train            |test             | \n|rms    |'+str(rms_train)+'|'+str(rms_test)+'|\n|mae    |'
      +str(mae_train)+'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)+'    |')

|metric |train            |test             | 
|rms    |1.2676872868328954|3.347687908122074|
|mae    |0.951497831678    |2.54143403883    |
|mape   |7.70522311849    |20.6002297966    |
