In [23]:
import numpy as np 
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Load the training data from a CSV looked up relative to the notebook's working directory.
train = pd.read_csv('train_split.csv')

In [3]:
train.head()

Unnamed: 0,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,batch_enrolled,int_rate,grade,sub_grade,emp_title,...,collections_12_mths_ex_med,mths_since_last_major_derog,application_type,verification_status_joint,last_week_pay,acc_now_delinq,tot_coll_amt,tot_cur_bal,total_rev_hi_lim,loan_status
0,58189336,14350,14350,14350.0,36 months,,19.19,E,E3,clerk,...,0.0,74.0,INDIVIDUAL,,26th week,0,0.0,28699.0,30800.0,0
1,70011223,4800,4800,4800.0,36 months,BAT1586599,10.99,B,B4,Human Resources Specialist,...,0.0,,INDIVIDUAL,,9th week,0,0.0,9974.0,32900.0,0
2,70255675,10000,10000,10000.0,36 months,BAT1586599,7.26,A,A4,Driver,...,0.0,,INDIVIDUAL,,9th week,0,65.0,38295.0,34900.0,0
3,1893936,15000,15000,15000.0,36 months,BAT4808022,19.72,D,D5,Us office of Personnel Management,...,0.0,,INDIVIDUAL,,135th week,0,0.0,55564.0,24700.0,0
4,7652106,16000,16000,16000.0,36 months,BAT2833642,10.64,B,B2,LAUSD-HOLLYWOOD HIGH SCHOOL,...,0.0,,INDIVIDUAL,,96th week,0,0.0,47159.0,47033.0,0


In [4]:
train.columns

Index(['member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term',
       'batch_enrolled', 'int_rate', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
       'pymnt_plan', 'desc', 'purpose', 'title', 'zip_code', 'addr_state',
       'dti', 'delinq_2yrs', 'inq_last_6mths', 'mths_since_last_delinq',
       'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal',
       'revol_util', 'total_acc', 'initial_list_status', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'collections_12_mths_ex_med', 'mths_since_last_major_derog',
       'application_type', 'verification_status_joint', 'last_week_pay',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim',
       'loan_status'],
      dtype='object')

In [5]:
train.shape

(63999, 45)

In [6]:
train.isnull().sum()

member_id                          0
loan_amnt                          0
funded_amnt                        0
funded_amnt_inv                    0
term                               0
batch_enrolled                 10264
int_rate                           0
grade                              0
sub_grade                          0
emp_title                       3826
emp_length                      3324
home_ownership                     0
annual_inc                         0
verification_status                0
pymnt_plan                         0
desc                           54849
purpose                            0
title                             13
zip_code                           0
addr_state                         0
dti                                0
delinq_2yrs                        0
inq_last_6mths                     0
mths_since_last_delinq         32831
mths_since_last_record         54349
open_acc                           0
pub_rec                            0
r

In [13]:
def prep_data(draft_df):
    """Build the numeric model-input frame from a raw loan DataFrame.

    Four groups of columns are encoded independently and then concatenated
    column-wise, in this order:

    1. ``emp_length`` -> ``emp_length_encoded`` (missing -> 0,
       '< 1 year' -> 1, ..., '10+ years' -> 11).
    2. Count / month columns, each label-encoded with ``LabelEncoder``.
    3. Categorical columns, one-hot encoded with ``pd.get_dummies``.
    4. Continuous columns, cast to float64 with missing values replaced by
       the column mean.

    Every encoder, dummy layout and mean is derived from ``draft_df`` itself
    on each call, so two frames encoded separately are not guaranteed to
    share the same columns or codes.

    Parameters
    ----------
    draft_df : pandas.DataFrame
        Raw frame containing every column named in the lists below.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Encoded features, indexed like ``draft_df``.

    Raises
    ------
    KeyError
        If any of the columns selected below is absent from ``draft_df``.
    """
    # Only the columns listed explicitly below are used, so there is nothing
    # to drop from draft_df up front (batch_enrolled, desc and zip_code are
    # simply never selected).

    # emp_length buckets in increasing order; missing values become 0.
    emp_length_codes = {np.nan : 0, '< 1 year':1, '1 year':2, '2 years':3, '3 years':4, '4 years':5, '5 years':6, '6 years':7, '7 years':8, '8 years':9, '9 years':10, '10+ years':11}
    encoded = pd.DataFrame()
    encoded['emp_length_encoded'] = draft_df['emp_length'].replace(emp_length_codes)

    # Ordinal variables: each column is refit and replaced by integer codes.
    # NOTE(review): LabelEncoder orders classes by sorted value, so the string
    # column last_week_pay ('9th week', '135th week', ...) gets codes in
    # lexicographic, not chronological, order - confirm this is acceptable.
    # NOTE(review): total_acc and open_acc are also in the continuous group
    # below, so the returned frame carries each of those names twice.
    ordinal_cols = ['delinq_2yrs', 'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record', 'pub_rec', 'total_acc', 'open_acc', 'collections_12_mths_ex_med', 'acc_now_delinq', 'last_week_pay']
    ordinal = draft_df[ordinal_cols].apply(preprocessing.LabelEncoder().fit_transform)

    # Nominal variables: one indicator column per category value.
    # NOTE(review): get_dummies only expands object/category columns; member_id
    # appears to be numeric, so it passes through unchanged as a feature even
    # though it looks like a row identifier - confirm it belongs here.
    nominal_cols = ['term', 'grade', 'sub_grade', 'home_ownership', 'verification_status', 'pymnt_plan', 'purpose', 'addr_state', 'initial_list_status', 'application_type', 'verification_status_joint', 'member_id']
    nominal = pd.get_dummies(draft_df[nominal_cols])

    # Continuous variables: mean imputation done in pandas, which keeps the
    # column names and index without depending on the sklearn Imputer class
    # (removed from sklearn.preprocessing in scikit-learn 0.22).
    numeric_cols = ['annual_inc', 'dti', 'revol_bal', 'revol_util', 'total_acc', 'total_rec_int', 'total_rec_late_fee', 'recoveries',
                    'collection_recovery_fee', 'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim', 'open_acc', 'mths_since_last_major_derog']
    numeric = draft_df[numeric_cols].astype('float64')
    numeric = numeric.fillna(numeric.mean())

    return pd.concat([encoded, ordinal, nominal, numeric], axis = 1)

In [14]:
# Encode the training frame into the numeric feature matrix.
encoded = prep_data(train)
# NOTE(review): this encodes `train` a second time, so encoded_test is built from
# the same rows as `encoded`; presumably a separate test frame was intended - confirm.
encoded_test = prep_data(train)



In [17]:
# Features are the encoded frame; the target is the raw loan_status column.
X, y = encoded, train['loan_status']
# No test_size is given, so the library's default split proportion applies;
# the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [34]:
from sklearn.linear_model import LinearRegression
# Baseline: ordinary least squares fitted on the training split.
# NOTE(review): loan_status looks like a 0/1 label, so this model's predictions
# are unbounded scores rather than class labels - confirm that is intended.
regressor1 = LinearRegression()
regressor1.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [35]:
# Linear-regression predictions for the X_test rows.
y_pred = regressor1.predict(X_test)

In [36]:
y_pred

array([ 0.59517797,  0.09569185,  0.63447291, ...,  0.43274276,
       -0.01319706, -0.16047455])

In [37]:
from sklearn.ensemble import RandomForestRegressor
# Small forest (10 trees) with a fixed seed for repeatable results.
regressor2 = RandomForestRegressor(n_estimators = 10, random_state = 0)
# Fit on the training split only, like the other models in this notebook, so
# that predictions on X_test are made for rows the forest has not seen.
regressor2.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [44]:
# Random-forest regressor predictions for the X_test rows.
y_pred1 = regressor2.predict(X_test)

In [45]:
from sklearn.ensemble import RandomForestClassifier
# 1000 trees split on the entropy criterion, depth capped at 100, fixed seed;
# trained on the training split only.
classifier = RandomForestClassifier(n_estimators = 1000, criterion = 'entropy', random_state = 0, max_depth=100)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=100, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [46]:
# Class predictions of the random-forest classifier for the X_test rows.
y_pred2 = classifier.predict(X_test)

In [49]:
# Mean accuracy of the classifier on the test split.
classifier.score(X_test, y_test)

0.8771875

In [51]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for the classifier's test predictions; in scikit-learn's
# convention rows are the true labels and columns the predicted labels.
cm = confusion_matrix(y_test, y_pred2)

In [52]:
cm

array([[11985,   244],
       [ 1721,  2050]], dtype=int64)