In [3]:
# Import libraries
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from scripts import get_binary_data

In [4]:
# Read the processed credit-risk CSV; the path is relative to the working directory
credit_csv = './../data/processed_credit_risk_data.csv'
df = pd.read_csv(credit_csv)
df

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.10,N,2
1,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
2,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
3,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
4,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2
...,...,...,...,...,...,...,...,...,...,...,...,...
30387,40,33000,MORTGAGE,2.0,HOMEIMPROVEMENT,B,1000,10.99,0,0.03,N,15
30388,37,90000,MORTGAGE,11.0,DEBTCONSOLIDATION,A,4000,6.62,0,0.04,N,15
30389,38,200000,MORTGAGE,0.0,DEBTCONSOLIDATION,A,3000,7.68,0,0.01,N,12
30390,38,110000,MORTGAGE,5.0,MEDICAL,B,16000,11.99,0,0.15,N,13


In [5]:
# Replace the categorical default-on-file flag with the numeric value returned
# by get_binary_data, so the column can be used for random forest training.
# Series.map converts the column element by element, which avoids the much
# slower row-wise DataFrame.apply(axis=1) while producing the same values.
# NOTE(review): the column is overwritten in place, so re-running this cell
# feeds already-converted values back into get_binary_data -- confirm the
# helper tolerates that, or restart the kernel before re-running.
df['cb_person_default_on_file'] = df['cb_person_default_on_file'].map(get_binary_data)
df

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.10,0,2
1,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,0,3
2,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,0,2
3,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,1,4
4,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...
30387,40,33000,MORTGAGE,2.0,HOMEIMPROVEMENT,B,1000,10.99,0,0.03,0,15
30388,37,90000,MORTGAGE,11.0,DEBTCONSOLIDATION,A,4000,6.62,0,0.04,0,15
30389,38,200000,MORTGAGE,0.0,DEBTCONSOLIDATION,A,3000,7.68,0,0.01,0,12
30390,38,110000,MORTGAGE,5.0,MEDICAL,B,16000,11.99,0,0.15,0,13


In [6]:
# Apply one-hot encoding to the categorical columns; drop_first=True omits the
# first level of each column from the generated dummies
categorical_cols = ['person_home_ownership', 'loan_intent', 'loan_grade']
df_feat = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [7]:
# Correlation of every column with the target loan_int_rate, highest first
rate_corr = df_feat.corr()['loan_int_rate']
rate_corr.sort_values(ascending=False)

loan_int_rate                  1.000000
cb_person_default_on_file      0.477958
loan_grade_D                   0.454932
loan_grade_C                   0.358753
loan_status                    0.317782
loan_grade_E                   0.310677
loan_grade_F                   0.188199
loan_amnt                      0.137058
person_home_ownership_RENT     0.131150
loan_grade_G                   0.125252
loan_percent_income            0.114378
loan_intent_HOMEIMPROVEMENT    0.020846
cb_person_cred_hist_length     0.020829
person_age                     0.018649
person_home_ownership_OTHER    0.014569
loan_intent_MEDICAL            0.005804
loan_intent_PERSONAL           0.001082
person_income                 -0.002109
loan_grade_B                  -0.004310
loan_intent_VENTURE           -0.008089
person_home_ownership_OWN     -0.011144
loan_intent_EDUCATION         -0.011726
person_emp_length             -0.054157
Name: loan_int_rate, dtype: float64

In [8]:
# Features considered in training, grouped by the column they come from.
# Columns of df_feat that are left out: loan_percent_income,
# person_home_ownership_OTHER and person_home_ownership_OWN.
# NOTE(review): loan_status is used as a predictor of the interest rate --
# confirm it is actually known at the time the model is asked to predict.
base_features = [
    'person_age',
    'person_income',
    'person_emp_length',
    'loan_amnt',
    'loan_status',
    'cb_person_default_on_file',
    'cb_person_cred_hist_length',
]
home_ownership_features = [
    'person_home_ownership_RENT',
]
loan_intent_features = [
    'loan_intent_EDUCATION',
    'loan_intent_HOMEIMPROVEMENT',
    'loan_intent_MEDICAL',
    'loan_intent_PERSONAL',
    'loan_intent_VENTURE',
]
loan_grade_features = [
    'loan_grade_B',
    'loan_grade_C',
    'loan_grade_D',
    'loan_grade_E',
    'loan_grade_F',
    'loan_grade_G',
]
features = (base_features + home_ownership_features
            + loan_intent_features + loan_grade_features)

In [9]:
# Number of columns selected for training
n_features = len(features)
n_features

19

In [10]:
# Feature matrix (selected columns) and regression target (loan_int_rate)
# for supervised training with random forest
X = df_feat.loc[:, features]
y = df_feat['loan_int_rate']

In [11]:
# Hold out 30% of the rows for testing and train on the remaining 70%;
# random_state fixes the shuffle so the split is reproducible
split_parts = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [12]:
# Random forest regressor with 100 trees and a fixed seed
rf_params = {'n_estimators': 100, 'random_state': 1}
reg = RandomForestRegressor(**rf_params)

In [13]:
# Fit the regressor on the training split
reg.fit(X=X_train, y=y_train)

In [14]:
# Predict interest rates for the held-out test rows
y_pred = reg.predict(X=X_test)
y_pred

array([14.8782, 13.363 , 11.1572, ...,  7.4347,  7.2659, 10.8343])

In [15]:
# Mean squared error between the true and predicted test targets
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mse

1.754761421820486

In [16]:
# Root mean squared error: the square root of the MSE computed above
rmse = np.sqrt(mse)
rmse

1.3246740813575564

In [17]:
# Coefficient of determination (R2) on the test split
r2 = r2_score(y_true=y_test, y_pred=y_pred)
r2

0.8183744264370156

In [18]:
# Absolute path of the parent of the current working directory
parent_dir = os.pardir
file_path = os.path.abspath(parent_dir)

In [19]:
# Serialize the trained regressor to <file_path>/deploy/loan_int_rate.pkl.
# The with-block guarantees the file handle is closed (and the pickle fully
# flushed to disk) as soon as the dump finishes, even if pickling raises.
model_path = os.path.join(file_path, 'deploy', 'loan_int_rate.pkl')
with open(model_path, 'wb') as model_file:
    pickle.dump(reg, model_file)