In [3]:
%matplotlib inline
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import math
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, scale
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score

In [4]:
df = pd.read_csv('data/train.csv')

In [5]:
df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,...,total_acc,initial_list_status,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,tot_coll_amt,tot_cur_bal,total_rev_hi_lim,record_id
0,15000.0,36 months,11.99,498.15,B,B3,Quality Assurance Specialist,4 years,MORTGAGE,70000.0,...,32.0,f,0.0,1.0,INDIVIDUAL,0.0,0.0,295215.0,20500.0,453246940
1,3725.0,36 months,6.03,113.38,A,A1,,,MORTGAGE,52260.0,...,9.0,f,0.0,1.0,INDIVIDUAL,0.0,0.0,25130.0,14200.0,453313687
2,16000.0,36 months,11.14,524.89,B,B2,KIPP NYC,3 years,RENT,67500.0,...,22.0,f,0.0,1.0,INDIVIDUAL,0.0,193.0,41737.0,19448.0,453283543
3,4200.0,36 months,13.33,142.19,C,C3,Receptionist,< 1 year,MORTGAGE,21600.0,...,19.0,w,0.0,1.0,INDIVIDUAL,0.0,165.0,28187.0,14500.0,453447199
4,6500.0,36 months,12.69,218.05,B,B5,Medtox Laboratories,10+ years,RENT,41000.0,...,12.0,f,0.0,1.0,INDIVIDUAL,0.0,,,,453350283


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200189 entries, 0 to 200188
Data columns (total 36 columns):
loan_amnt                     200189 non-null float64
term                          200189 non-null object
int_rate                      200189 non-null float64
installment                   200189 non-null float64
grade                         200189 non-null object
sub_grade                     200189 non-null object
emp_title                     189062 non-null object
emp_length                    192135 non-null object
home_ownership                200189 non-null object
annual_inc                    200189 non-null float64
verification_status           200189 non-null object
issue_d                       200189 non-null object
loan_status                   200189 non-null int64
pymnt_plan                    200189 non-null object
purpose                       200189 non-null object
zip_code                      200189 non-null object
addr_state                    200189 n

In [7]:
df.policy_code.unique()

array([1.])

In [8]:
data = df.drop([ 'issue_d', 'earliest_cr_line',  'zip_code', 'addr_state', 'record_id'], axis=1)

In [9]:
def preproc(data, dict_grade=None,dict_sub_grade=None, dict_pymnt_plan=None, dict_purpose=None, list_home_ownership = None, list_verification_status = None):
    
    data = data.drop([ 'issue_d', 'earliest_cr_line',  'zip_code', 'addr_state', 'record_id','policy_code'], axis=1)
    
    data['term'] = data['term'].map({' 36 months': 0, ' 60 months': 1})    
    data['initial_list_status'] = data['initial_list_status'].map({'f': 0, 'w': 1})
    data['application_type'] = data['application_type'].map({'INDIVIDUAL': 0, 'JOINT': 1})
    data['emp_length'] = data['emp_length'].map({'< 1 year': 1, '1 year': 2, '2 years': 3,  '3 years': 4,  '4 years': 5,  '5 years': 6,  '6 years': 7,  '7 years': 8,  '8 years': 9,  '9 years': 10,  '10+ years': 11})

    data.emp_length.fillna(value=0,inplace=True)
    data.revol_util.fillna(value=0,inplace=True)  
    data['emp_title'] = data.emp_title.map(lambda x: 0 if x == 'n/a' else 1)

    data.collections_12_mths_ex_med.fillna(value=data.collections_12_mths_ex_med.mean(),inplace=True)
    data.mths_since_last_delinq.fillna(value=data.mths_since_last_delinq.mean(),inplace=True)
    data.tot_coll_amt.fillna(value=data.tot_coll_amt.mean(),inplace=True)
    data.tot_cur_bal.fillna(value=data.tot_coll_amt.mean(),inplace=True)
    data.total_rev_hi_lim.fillna(value=data.total_rev_hi_lim.mean(),inplace=True)

    #ohe = OneHotEncoder(sparse=False)    
    #data = pd.concat([data ,pd.DataFrame(ohe.fit_transform(data['home_ownership'].values.reshape(-1, 1)))], axis=1)
    
    #new_ohe_features = ohe.fit(data.home_ownership.values.reshape(-1, 1))
    #encode = ohe.transform(data.home_ownership.values.reshape(-1, 1))
    #print(encode)
    
    #new_ohe_features = ohe.fit(data.verification_status.values.reshape(-1, 1))
    #data['verification_status_ohe'] = ohe.transform(data.verification_status.values.reshape(-1, 1))    
    
    verification_status_list = data.verification_status.values.reshape(-1, 1)

    data = pd.concat((data,pd.get_dummies(data.verification_status)),1)
    
    if list_verification_status is None:
        list_verification_status = list(data['verification_status'].unique())
        
    for i in list_verification_status:
        try:
            data[i]
        except KeyError:
            data[i] = 0
            
    home_ownership_list = data.home_ownership.values.reshape(-1, 1)

    data = pd.concat((data,pd.get_dummies(data.home_ownership)),1)
    
    if list_home_ownership is None:
        list_home_ownership = list(data['home_ownership'].unique())
        
    for i in list_home_ownership:
        try:
            data[i]
        except KeyError:
            data[i] = 0
    
    #if not le_dict:
        #columnsToEncode = list(data['home_ownership'].unique())
        #train = True;
    #else:
        #columnsToEncode = le_dict.keys()   
        #train = False;
        
    
    #for feature in columnsToEncode:             
        #try:
            #data = pd.concat([data ,pd.get_dummies(data[feature])], axis=1)
            #data = data.drop(feature, axis=1)
        
        #except:
            #print('Error encoding '+feature)
            #df[feature]  = df[feature].convert_objects(convert_numeric='force')
           # df[feature]  = df[feature].apply(pd.to_numeric, errors='coerce')  
    
    le = LabelEncoder()

    if dict_grade is None:
        le.fit(data.grade.astype(str))
        data['grade_le'] = le.transform((data['grade'].values))
        dict_grade = dict(zip(le.classes_, le.transform(le.classes_)))
    else:
        data['grade_le'] = data['grade'].map(dict_grade).fillna(-1)  

    if dict_sub_grade is None:
        le.fit(data.sub_grade.astype(str))
        data['sub_grade_le'] = le.transform(data['sub_grade'].values)
        dict_sub_grade = dict(zip(le.classes_, le.transform(le.classes_)))
    else:
        data['sub_grade_le'] = data['sub_grade'].map(dict_sub_grade).fillna(-1)           

    if dict_pymnt_plan is None:
        le.fit(data.pymnt_plan.astype(str))
        data['pymnt_plan_le'] = le.transform(data['pymnt_plan'].values)
        dict_pymnt_plan = dict(zip(le.classes_, le.transform(le.classes_)))
    else:
        data['pymnt_plan_le'] = data['pymnt_plan'].map(dict_pymnt_plan).fillna(0)  
    
    if dict_purpose is None:
        le.fit(data.purpose.astype(str))
        data['purpose_le'] = le.transform(data['purpose'].values)
        dict_purpose = dict(zip(le.classes_, le.transform(le.classes_)))

    else:
        data['purpose_le'] = data['purpose'].map(dict_purpose).fillna(-1)  
    data = data.drop(['grade', 'sub_grade', 'purpose', 'pymnt_plan', 'home_ownership', 'verification_status'], axis=1)   
    return data, dict_grade, dict_sub_grade, dict_pymnt_plan, dict_purpose, list_home_ownership, list_verification_status

In [10]:
train, test = train_test_split(df, test_size=0.3, random_state=42)

In [11]:
train, dict_grade, dict_sub_grade, dict_pymnt_plan, dict_purpose, list_home_ownership, list_verification_status = preproc(train)
test = preproc(test, dict_grade, dict_sub_grade, dict_pymnt_plan, dict_purpose, list_home_ownership, list_verification_status)[0]


In [12]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60057 entries, 8644 to 191094
Data columns (total 37 columns):
loan_amnt                     60057 non-null float64
term                          60057 non-null int64
int_rate                      60057 non-null float64
installment                   60057 non-null float64
emp_title                     60057 non-null int64
emp_length                    60057 non-null float64
annual_inc                    60057 non-null float64
loan_status                   60057 non-null int64
dti                           60057 non-null float64
delinq_2yrs                   60057 non-null float64
inq_last_6mths                60057 non-null float64
mths_since_last_delinq        60057 non-null float64
open_acc                      60057 non-null float64
pub_rec                       60057 non-null float64
revol_bal                     60057 non-null float64
revol_util                    60057 non-null float64
total_acc                     60057 non-null 

In [13]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 140132 entries, 193136 to 121958
Data columns (total 37 columns):
loan_amnt                     140132 non-null float64
term                          140132 non-null int64
int_rate                      140132 non-null float64
installment                   140132 non-null float64
emp_title                     140132 non-null int64
emp_length                    140132 non-null float64
annual_inc                    140132 non-null float64
loan_status                   140132 non-null int64
dti                           140132 non-null float64
delinq_2yrs                   140132 non-null float64
inq_last_6mths                140132 non-null float64
mths_since_last_delinq        140132 non-null float64
open_acc                      140132 non-null float64
pub_rec                       140132 non-null float64
revol_bal                     140132 non-null float64
revol_util                    140132 non-null float64
total_acc                 

In [15]:
x_train = train.drop(['loan_status'], axis=1)
y_train = train['loan_status']

x_test = test.drop(['loan_status'], axis=1)
y_test = test['loan_status']

x_train = scale(x_train)
x_test = scale(x_test)

from imblearn.under_sampling import RandomUnderSampler
ros = RandomUnderSampler(random_state=0)
x_train, y_train = ros.fit_resample(x_train, y_train)

  import sys
  


In [16]:
from sklearn.svm import SVC  
svclassifier = SVC(kernel='linear')  
svclassifier.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [17]:
y_pred = svclassifier.predict(x_test) 

In [18]:
from sklearn.metrics import classification_report, confusion_matrix  
print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred)) 

[[ 8303  4968]
 [15400 31386]]
              precision    recall  f1-score   support

           0       0.35      0.63      0.45     13271
           1       0.86      0.67      0.76     46786

   micro avg       0.66      0.66      0.66     60057
   macro avg       0.61      0.65      0.60     60057
weighted avg       0.75      0.66      0.69     60057



In [27]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))

Accuracy: 0.660855520588774
Precision: 0.8633437861033174
Recall: 0.6708417047834823


In [20]:
svclassifier1 = SVC(kernel='poly', degree=8)  
svclassifier1.fit(x_train, y_train) 



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=8, gamma='auto_deprecated',
  kernel='poly', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [23]:
y_pred1 = svclassifier1.predict(x_test) 

In [24]:
from sklearn.metrics import classification_report, confusion_matrix  
print(confusion_matrix(y_test, y_pred1))  
print(classification_report(y_test, y_pred1))

[[ 1446 11825]
 [ 2476 44310]]
              precision    recall  f1-score   support

           0       0.37      0.11      0.17     13271
           1       0.79      0.95      0.86     46786

   micro avg       0.76      0.76      0.76     60057
   macro avg       0.58      0.53      0.51     60057
weighted avg       0.70      0.76      0.71     60057



In [29]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred1))
print("Precision:",metrics.precision_score(y_test, y_pred1))
print("Recall:",metrics.recall_score(y_test, y_pred1))

Accuracy: 0.7618762175932864
Precision: 0.7893471096463882
Recall: 0.9470781857820716


In [25]:
svclassifier2 = SVC(kernel='rbf')  
svclassifier2.fit(x_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [31]:
y_pred2 = svclassifier2.predict(x_test) 

In [32]:
print(confusion_matrix(y_test, y_pred2))  
print(classification_report(y_test, y_pred2)) 

[[ 8869  4402]
 [17155 29631]]
              precision    recall  f1-score   support

           0       0.34      0.67      0.45     13271
           1       0.87      0.63      0.73     46786

   micro avg       0.64      0.64      0.64     60057
   macro avg       0.61      0.65      0.59     60057
weighted avg       0.75      0.64      0.67     60057



In [34]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred2))
print("Precision:",metrics.precision_score(y_test, y_pred2))
print("Recall:",metrics.recall_score(y_test, y_pred))

Accuracy: 0.6410576618878732
Precision: 0.8706549525460583
Recall: 0.6708417047834823


In [35]:
svclassifier3 = SVC(kernel='sigmoid')  
svclassifier3.fit(x_train, y_train)  



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='sigmoid', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [36]:
y_pred3 = svclassifier3.predict(x_test)  

In [37]:
print(confusion_matrix(y_test, y_pred3))  
print(classification_report(y_test, y_pred3))

[[ 7643  5628]
 [20365 26421]]
              precision    recall  f1-score   support

           0       0.27      0.58      0.37     13271
           1       0.82      0.56      0.67     46786

   micro avg       0.57      0.57      0.57     60057
   macro avg       0.55      0.57      0.52     60057
weighted avg       0.70      0.57      0.60     60057



In [38]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred3))
print("Precision:",metrics.precision_score(y_test, y_pred3))
print("Recall:",metrics.recall_score(y_test, y_pred3))

Accuracy: 0.5671944985597016
Precision: 0.8243938968454554
Recall: 0.5647202154490659
