In [1]:
# Importing necessary modules
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime as dt

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

%matplotlib inline

In [2]:
# Importing data
data = pd.read_csv('../data/accepted_2007_to_2018Q4.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# This is big data set. For the purpose of prototyping, I will downsample the number of rows and columns.
# Later on, I will scale up using the entire data set.

select_columns = ['loan_status','loan_amnt','int_rate','installment','sub_grade','emp_length', 'issue_d', \
                  'home_ownership','annual_inc','verification_status','purpose','addr_state', \
                  'dti','delinq_2yrs','earliest_cr_line','fico_range_low','fico_range_high','inq_last_6mths', \
                  'mths_since_last_delinq','mths_since_last_record','open_acc','pub_rec','revol_bal','revol_util', \
                  'total_acc','collections_12_mths_ex_med','mths_since_last_major_derog','annual_inc_joint', \
                  'dti_joint','acc_now_delinq','tot_coll_amt', 'tot_cur_bal','open_acc_6m','open_act_il', \
                  'open_il_12m','open_il_24m']

data_fully_paid = data[data.loan_status == 'Fully Paid']
data_charged_off = data[data.loan_status == 'Charged Off']

# The data are combined in balance: the number of fully paid = charged off
combined_data = pd.concat([data_fully_paid[select_columns].sample(n=10000, random_state=34), \
                           data_charged_off[select_columns].sample(n=10000, random_state=34)])
minidata = combined_data.sample(frac=1, random_state=34)
minidata.head()

Unnamed: 0,loan_status,loan_amnt,int_rate,installment,sub_grade,emp_length,issue_d,home_ownership,annual_inc,verification_status,...,mths_since_last_major_derog,annual_inc_joint,dti_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m
1729351,Fully Paid,12000.0,14.99,415.93,C4,4 years,Jan-2017,MORTGAGE,35000.0,Source Verified,...,,,,0.0,0.0,98400.0,0.0,0.0,0.0,0.0
1911200,Charged Off,28625.0,14.33,670.97,C1,3 years,Sep-2012,OWN,72000.0,Verified,...,,,,0.0,0.0,7438.0,,,,
1734562,Charged Off,6000.0,8.24,188.69,B1,,Jan-2017,MORTGAGE,46498.0,Not Verified,...,,,,0.0,89.0,71738.0,0.0,2.0,0.0,2.0
229044,Charged Off,8000.0,18.25,290.23,E1,3 years,Jul-2015,RENT,40000.0,Verified,...,,,,0.0,120.0,37354.0,,,,
658378,Fully Paid,18600.0,12.79,421.22,C1,9 years,Jun-2016,RENT,115000.0,Not Verified,...,,,,0.0,0.0,100621.0,2.0,3.0,3.0,5.0


In [4]:
# Feature engineering

# Change some columns to categorical
categorical_columns = ['sub_grade','home_ownership','verification_status','purpose','addr_state']
minidata[categorical_columns] = minidata[categorical_columns].astype('category')
minidata['sub_grade'] = pd.Categorical(minidata['sub_grade'], ordered=True)

In [5]:
# Change employment length to float
minidata['emp_length'] = minidata['emp_length'].astype('str')
minidata['emp_length'] = minidata['emp_length'].map(lambda x: x.rstrip(' years'))
minidata['emp_length'][minidata.emp_length == '< 1'] = '0'
minidata['emp_length'][minidata.emp_length == '10+'] = '10'
minidata['emp_length'] = minidata['emp_length'].astype('float')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [6]:
# Make datetime format
date_columns = ['earliest_cr_line','issue_d']
minidata[date_columns] = minidata[date_columns].apply(pd.to_datetime)

# Convert into time interval and then to float that count days
minidata['time_interval'] = (minidata['issue_d'] - minidata['earliest_cr_line']).dt.days
minidata = minidata.drop(date_columns, axis=1)

In [7]:
# One-hot-encoding for categorical variables

#def cat2ohe(data, col):
#    label_encoded = LabelEncoder().fit_transform(data[col])
#    label_encoded = label_encoded.reshape(len(label_encoded), 1)
#    return OneHotEncoder(sparse=False).fit_transform(label_encoded)

minidata['sub_grade'] = LabelEncoder().fit_transform(minidata['sub_grade'])
minidata['loan_status'] = LabelEncoder().fit_transform(minidata['loan_status'])

for cat_col in categorical_columns[1:]:
    ohe_col = pd.get_dummies(minidata[cat_col], prefix=cat_col, dummy_na=True)
    minidata = pd.concat([minidata, ohe_col], axis=1)
    minidata = minidata.drop(cat_col, axis=1)

In [8]:
na_dict = {}
for col in minidata.columns:
    if minidata[col].isna().sum() > 0:
        na_dict[col] = np.nanmedian(minidata[col])

minidata = minidata.fillna(value=na_dict)

In [9]:
# Modeling using Random Forest

y = minidata['loan_status']
X = minidata.drop('loan_status', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=34)

rfc = RandomForestClassifier(random_state=34)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=34, verbose=0,
                       warm_start=False)

In [10]:
y_pred = rfc.predict(X_test)
roc_auc_score(y_test, y_pred)

0.6441497684879482

In [11]:
confusion_matrix(y_test, y_pred)

array([[2047, 1019],
       [1113, 1821]])

In [12]:
importance = rfc.feature_importances_
ind = np.argsort(importance)[::-1]
for i in ind[:10]:
    print(X.columns[i], importance[i])

int_rate 0.07311868874429552
sub_grade 0.06698188103801757
dti 0.054879836157248806
tot_cur_bal 0.04851251745848959
installment 0.048383017574374
revol_util 0.04802534070789761
revol_bal 0.04773019279748678
time_interval 0.046560813207806126
annual_inc 0.04496929850185367
loan_amnt 0.042811041489926885


In [13]:
parameters = {'n_estimators': [10, 50, 100], \
              'criterion': ['gini', 'entropy'], \
              'min_samples_split': [0.01, 0.05, 0.10], \
              'max_features': ['sqrt', 'log2', None], \
              'ccp_alpha': [0, 1, 10]}

gs  = GridSearchCV(RandomForestClassifier(), parameters)
gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              ra

In [14]:
y_pred = rfc.predict(X_test)
roc_auc_score(y_test, y_pred)

0.6441497684879482

Some problems:

1) It took so long to do hyperparameter tuning --> Move to domino/colab? But data size is too large.

2) ROC-AUC score is low --> Maybe not choosing optimal features? Replace features that aren't important.

3) Replacing NaN value with median doesn't seem optimal --> Do some kind of prediction for missing value?