In [27]:
%matplotlib inline
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import math
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv('data/train.csv')

In [3]:
df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,...,total_acc,initial_list_status,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,tot_coll_amt,tot_cur_bal,total_rev_hi_lim,record_id
0,15000.0,36 months,11.99,498.15,B,B3,Quality Assurance Specialist,4 years,MORTGAGE,70000.0,...,32.0,f,0.0,1.0,INDIVIDUAL,0.0,0.0,295215.0,20500.0,453246940
1,3725.0,36 months,6.03,113.38,A,A1,,,MORTGAGE,52260.0,...,9.0,f,0.0,1.0,INDIVIDUAL,0.0,0.0,25130.0,14200.0,453313687
2,16000.0,36 months,11.14,524.89,B,B2,KIPP NYC,3 years,RENT,67500.0,...,22.0,f,0.0,1.0,INDIVIDUAL,0.0,193.0,41737.0,19448.0,453283543
3,4200.0,36 months,13.33,142.19,C,C3,Receptionist,< 1 year,MORTGAGE,21600.0,...,19.0,w,0.0,1.0,INDIVIDUAL,0.0,165.0,28187.0,14500.0,453447199
4,6500.0,36 months,12.69,218.05,B,B5,Medtox Laboratories,10+ years,RENT,41000.0,...,12.0,f,0.0,1.0,INDIVIDUAL,0.0,,,,453350283


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200189 entries, 0 to 200188
Data columns (total 43 columns):
loan_amnt                     200189 non-null float64
term                          200189 non-null int64
int_rate                      200189 non-null float64
installment                   200189 non-null float64
grade                         200189 non-null int32
sub_grade                     200189 non-null int32
emp_length                    200189 non-null int32
annual_inc                    200189 non-null float64
loan_status                   200189 non-null int64
pymnt_plan                    200189 non-null int32
purpose                       200189 non-null int32
zip_code                      200189 non-null int32
addr_state                    200189 non-null int32
dti                           200189 non-null float64
delinq_2yrs                   200189 non-null float64
inq_last_6mths                200189 non-null float64
mths_since_last_delinq        200189 non-nu

In [5]:
df.emp_length.fillna(value=0,inplace=True)
df.revol_util.fillna(value=0,inplace=True)
df.collections_12_mths_ex_med.fillna(value=df.collections_12_mths_ex_med.mean(),inplace=True)
df.mths_since_last_delinq.fillna(value=df.mths_since_last_delinq.mean(),inplace=True)
df.tot_coll_amt.fillna(value=df.tot_coll_amt.mean(),inplace=True)
df.tot_cur_bal.fillna(value=df.tot_coll_amt.mean(),inplace=True)
df.total_rev_hi_lim.fillna(value=df.total_rev_hi_lim.mean(),inplace=True)
df['emp_length'].replace(to_replace='[^0-9]+', value='', inplace=True, regex=True)
df['emp_length'] = df['emp_length'].astype(int)
df['term'] = pd.factorize(df['term'])[0]

In [6]:
df['is_title_known'] = df['emp_title'].map(lambda x: 0 if x == 'n/a' else 1)
df.drop('emp_title', axis=1, inplace=True)

In [7]:
df['zip_code'] = df['zip_code'].where(df['zip_code'].str.len() == 4, 
                                               df['zip_code'].str[:3])
df['zip_code'] = df['zip_code'].astype(int)

In [8]:
label_encoder = LabelEncoder()
categorical_columns = df[['grade','sub_grade','pymnt_plan','initial_list_status','purpose','application_type','addr_state']]
for column in categorical_columns:
    df[column] = label_encoder.fit_transform(df[column])

In [9]:
one_hot_encoded_home_ownership = pd.get_dummies(df.home_ownership)
df = df.drop('home_ownership',axis = 1)

df = df.join(one_hot_encoded_home_ownership)

In [10]:
one_hot_encoded_verification_status = pd.get_dummies(df.verification_status)
df = df.drop('verification_status',axis = 1)

df = df.join(one_hot_encoded_verification_status)

In [11]:
def month_to_decimal(month):
    month_dict = {'Jan':0, 'Feb':1/12., 'Mar':2/12., 'Apr':3/12., 'May':4/12., 'Jun':5/12., 
     'Jul':6/12., 'Aug':7/12., 'Sep':8/12., 'Oct':9/12., 'Nov':10/12., 'Dec':11/12.}
    return month_dict[month]

def convert_date(month_year):
    month_and_year = month_year.split('-')
    return float(month_and_year[1]) + month_to_decimal(month_and_year[0])

def encode_with_func(df, column_name, func_name):
    df[column_name+'_le'] = df[column_name].map(func_name)
    df.drop(column_name, axis=1, inplace=True)

encode_with_func(df, 'issue_d', convert_date)

In [12]:
encode_with_func(df, 'earliest_cr_line', convert_date)

In [13]:
Y = df['loan_status'].values

X = df.drop(['loan_status'], axis=1)

from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X, Y = ros.fit_resample(X, Y)

x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size=0.3, random_state=42)

In [14]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.fit_transform(x_test)

In [19]:
tree = DecisionTreeClassifier(max_depth=5, min_samples_leaf = 5)
tree.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [21]:
y_tree = tree.predict(x_test)

In [22]:
print("Accuracy:",metrics.accuracy_score(y_test, y_tree))
print("Precision:",metrics.precision_score(y_test, y_tree))
print("Recall:",metrics.recall_score(y_test, y_tree))

Accuracy: 0.640534492542587
Precision: 0.6557195136347539
Recall: 0.5938970147977076


In [24]:
y_pred_tree = tree.predict_proba(x_test)[::,1]
auc_tree = metrics.roc_auc_score(y_test, y_pred_tree)
print(auc_tree)

0.6930181389263298


In [25]:
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(x_train, y_train)

In [26]:
KNeighborsClassifier?

In [30]:
forest = RandomForestClassifier(n_estimators=100, min_samples_leaf= 5, max_depth=5)
forest.fit(x_train, y_train)
y_forest = forest.predict(x_test)

In [31]:
print("Accuracy:",metrics.accuracy_score(y_test, y_forest))
print("Precision:",metrics.precision_score(y_test, y_forest))
print("Recall:",metrics.recall_score(y_test, y_forest))
y_pred_forest = forest.predict_proba(x_test)[::,1]
auc_forest = metrics.roc_auc_score(y_test, y_pred_forest)
print(auc_forest)

Accuracy: 0.6455560671113634
Precision: 0.6603361660171839
Recall: 0.6015097083226414
0.7058620281199492
