# Random Forest Model

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import math
import pandas
import numpy
import itertools
import matplotlib.pyplot as plt
import sklearn
from sklearn import tree
from sklearn import ensemble
from sklearn.metrics import log_loss
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import CategoricalNB

In [3]:
import loan_approval_lib
from loan_approval_lib import logspace, linspace

In [4]:
from loan_approval_lib.standard_data_processing import data_cleaning_algorithm
from loan_approval_lib.standard_data_processing import create_loan_grade_numerical_from_loan_grade
from loan_approval_lib.standard_data_processing import create_person_home_ownership_one_hot_encoder
from loan_approval_lib.standard_data_processing import create_person_home_ownership_one_hot
from loan_approval_lib.standard_data_processing import create_loan_intent_one_hot_encoder
from loan_approval_lib.standard_data_processing import create_loan_intent_one_hot
from loan_approval_lib.standard_data_processing import map_cb_person_default_on_file
from loan_approval_lib.standard_data_processing import create_decision_tree_columns
from loan_approval_lib.standard_data_processing import create_decision_tree_columns_with_id
from loan_approval_lib.standard_data_processing import create_dataframe_copy_and_drop_columns
from loan_approval_lib.standard_data_processing import create_decision_tree_columns
from loan_approval_lib.standard_data_processing import create_decision_tree_columns_with_id

In [5]:
# Load the "original" dataset through the project helper. Further down it is
# used both as a training source and as a validation set.
data = loan_approval_lib.load_original_data()

In [6]:
# Load the training split through the project helper. Like `data`, it is
# used further down for both fitting and validation.
data_train = loan_approval_lib.load_data_train()

In [7]:
# Load the test split through the project helper. It is only ever used as
# prediction input in this notebook (never for fitting or accuracy checks).
data_test = loan_approval_lib.load_data_test()

# Data Cleaning, Data Preprocessing

In [8]:
# Clean the original dataset. The helper prints a per-column count of removed
# rows and its return value replaces `data`.
# NOTE(review): the cell overwrites its own input, so re-running it cleans the
# already-cleaned frame and the printed counts will differ.
data = data_cleaning_algorithm(data)

column person_age, number of removed rows: 5
column person_income, number of removed rows: 0
column person_emp_length, number of removed rows: 897
column loan_amnt, number of removed rows: 0
column loan_int_rate, number of removed rows: 3047
column loan_percent_income, number of removed rows: 0
column cb_person_default_on_file, number of removed rows: 0


In [9]:
# Clean the training split with the same helper; the return value replaces
# `data_train` and the helper prints how many rows it removed per column.
data_train = data_cleaning_algorithm(data_train)

column person_age, number of removed rows: 1
column person_income, number of removed rows: 0
column person_emp_length, number of removed rows: 2
column loan_amnt, number of removed rows: 0
column loan_int_rate, number of removed rows: 0
column loan_percent_income, number of removed rows: 0
column cb_person_default_on_file, number of removed rows: 0


In [10]:
# Run the cleaning helper on the test split for its printed report only.
# NOTE(review): unlike `data` and `data_train`, the return value is NOT
# assigned back here - presumably so that no test rows can be dropped before
# predicting. Confirm the helper does not modify its argument in place.
data_cleaning_algorithm(data_test)
# Makes the cell's final expression `None`, so nothing is echoed as output.
None

column person_age, number of removed rows: 0
column person_income, number of removed rows: 0
column person_emp_length, number of removed rows: 0
column loan_amnt, number of removed rows: 0
column loan_int_rate, number of removed rows: 0
column loan_percent_income, number of removed rows: 0
column cb_person_default_on_file, number of removed rows: 0


In [10]:
# Loan Grade
# The helper is called for its side effects only: its return value is
# discarded and all three frames are passed in one call.
# NOTE(review): presumably adds a numerical loan-grade column to each frame in
# place - confirm in loan_approval_lib.standard_data_processing.

create_loan_grade_numerical_from_loan_grade(data, data_train, data_test)

In [11]:
# Person Home Ownership
# A single encoder is built from `data` and then applied to all three frames,
# so every frame is transformed by the same encoder object.
# NOTE(review): each frame is reassigned from its own previous value, so
# re-running this cell applies the helper a second time - presumably not
# idempotent; restart the kernel before re-running.

encoder = create_person_home_ownership_one_hot_encoder(data)

data = create_person_home_ownership_one_hot(encoder, data)
data_train = create_person_home_ownership_one_hot(encoder, data_train)
data_test = create_person_home_ownership_one_hot(encoder, data_test)

In [12]:
# Loan Intent
# Same pattern as the home-ownership cell: one encoder built from `data`,
# applied to all three frames.
# NOTE(review): this rebinds the name `encoder`, so the home-ownership encoder
# is no longer reachable after this cell runs.

encoder = create_loan_intent_one_hot_encoder(data)

data = create_loan_intent_one_hot(encoder, data)
data_train = create_loan_intent_one_hot(encoder, data_train)
data_test = create_loan_intent_one_hot(encoder, data_test)

In [13]:
# Default On File
# Apply the project's `cb_person_default_on_file` mapping to each frame; the
# returned frame replaces the input in every case.
# NOTE(review): presumably maps the categorical flag to a numeric value -
# confirm in loan_approval_lib.standard_data_processing.

data = map_cb_person_default_on_file(data)
data_train = map_cb_person_default_on_file(data_train)
data_test = map_cb_person_default_on_file(data_test)

# Choose Columns for Tree Model, Copy DataFrame and Drop Unused Columns

In [14]:
# Column lists supplied by the project library.
# `decision_tree_columns` selects the model inputs everywhere below;
# `decision_tree_columns_with_id` is used only when building the prediction
# frame, which must also carry the 'id' column that is written to the CSV.
decision_tree_columns = create_decision_tree_columns()
decision_tree_columns_with_id = create_decision_tree_columns_with_id()

In [15]:
# Derive the modelling frames used by every experiment below. Going by the
# helper's name, each is a copy of the preprocessed frame with unused columns
# dropped; new names are bound, so `data`, `data_train` and `data_test` are
# not rebound by this cell.
data_copy = create_dataframe_copy_and_drop_columns(data)
data_train_copy = create_dataframe_copy_and_drop_columns(data_train)
data_test_copy = create_dataframe_copy_and_drop_columns(data_test)

# Random Forest Model

In [40]:
def random_forest_model_train(
    data_train,
    decision_tree_columns,
    n_estimators,
    max_features,
    max_depth,
    min_samples_split,
    min_samples_leaf,
    bootstrap,
    n_jobs,
    random_state=None,
):
    """Fit a random forest on ``data_train`` and report its in-sample accuracy.

    Parameters
    ----------
    data_train : pandas.DataFrame
        Frame holding the feature columns and a ``'loan_status'`` target
        column. It is not modified.
    decision_tree_columns : list
        Names of the columns used as model inputs.
    n_estimators, max_features, max_depth, min_samples_split,
    min_samples_leaf, bootstrap, n_jobs
        Forwarded unchanged to ``sklearn.ensemble.RandomForestClassifier``.
    random_state : int or None, optional
        Seed forwarded to the classifier. The default ``None`` keeps the
        previous unseeded behaviour; pass an integer for repeatable fits.

    Returns
    -------
    tuple
        ``(clf, accuracy_in_sample)``: the fitted classifier and the fraction
        of ``data_train`` rows whose predicted label equals ``'loan_status'``.
    """
    features = data_train[decision_tree_columns]
    target = data_train['loan_status']

    clf = ensemble.RandomForestClassifier(
        n_estimators=n_estimators,
        criterion='entropy',
        max_features=max_features,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        bootstrap=bootstrap,
        n_jobs=n_jobs,
        # Out-of-bag scoring only exists when trees are grown on bootstrap
        # samples; scikit-learn raises ValueError for oob_score=True combined
        # with bootstrap=False, so the flag follows `bootstrap`.
        oob_score=bool(bootstrap),
        random_state=random_state,
    )
    clf.fit(features, target)

    # Compare positionally (plain arrays) so index labels cannot interfere.
    predictions = clf.predict(features)
    accuracy_in_sample = (target.to_numpy() == predictions).mean()

    return (clf, accuracy_in_sample)

In [17]:
def random_forest_model_test(
    clf,
    data_test,
    decision_tree_columns,
):
    """Return the accuracy of ``clf`` on a labelled frame.

    ``data_test`` must contain the ``decision_tree_columns`` inputs and a
    ``'loan_status'`` column holding the true labels. The result is the
    fraction of rows whose predicted label equals the true label.
    ``data_test`` itself is left unchanged.
    """
    inputs = data_test[decision_tree_columns]

    # Put prediction and truth side by side in a scratch frame.
    scored = inputs.assign(
        loan_status_predict=clf.predict(inputs),
        loan_status=data_test['loan_status'],
    )
    n_correct = (scored['loan_status'] == scored['loan_status_predict']).sum()

    return n_correct / len(scored)

In [18]:
def random_forest_model_predict(
    clf,
    data_predict,
    decision_tree_columns,
    decision_tree_columns_with_id,
):
    """Build a prediction frame holding class probabilities from ``clf``.

    The returned frame is a new DataFrame made of the
    ``decision_tree_columns_with_id`` columns of ``data_predict`` plus a
    ``'loan_status'`` column set to the second column of
    ``clf.predict_proba`` evaluated on ``decision_tree_columns``.
    ``data_predict`` is not modified.
    """
    selected = data_predict[decision_tree_columns_with_id]

    class_probabilities = clf.predict_proba(selected[decision_tree_columns])

    return selected.assign(loan_status=class_probabilities[:, 1])

In [19]:
def random_forest_model_predict_write_to_csv(
    data_predict,
    filename,
):
    """Write the ``'id'`` and ``'loan_status'`` columns of ``data_predict`` as CSV.

    Only those two columns are written, in that order, without the index.
    ``filename`` is handed straight to ``DataFrame.to_csv``.
    """
    submission_columns = ['id', 'loan_status']
    submission = data_predict[submission_columns]
    submission.to_csv(filename, index=False)

In [66]:
# Usage
# Fit on data_copy, measure accuracy on data_train_copy (not used for
# fitting), then score data_test_copy and save the predictions.
# Hyperparameters are passed by keyword so the call is self-describing.

clf, accuracy_in_sample = random_forest_model_train(
    data_train=data_copy,
    decision_tree_columns=decision_tree_columns,
    n_estimators=1000,
    max_features='sqrt',
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=2,
    bootstrap=True,
    n_jobs=8,
)

accuracy_out_of_sample = random_forest_model_test(
    clf=clf,
    data_test=data_train_copy,
    decision_tree_columns=decision_tree_columns,
)

data_predict = random_forest_model_predict(
    clf=clf,
    data_predict=data_test_copy,
    decision_tree_columns=decision_tree_columns,
    decision_tree_columns_with_id=decision_tree_columns_with_id,
)

random_forest_model_predict_write_to_csv(
    data_predict=data_predict,
    filename='random_forest_model_2.csv',
)

In [67]:
# Fraction of data_copy rows (the rows the forest was fitted on) that are
# classified correctly.
accuracy_in_sample

np.float64(0.9465632858340318)

In [68]:
# Accuracy on data_train_copy, which was not used to fit this forest.
accuracy_out_of_sample

np.float64(0.9520163696819848)

In [72]:
# Same experiment with max_features=None, i.e. every split may consider all
# input columns. Fit on data_copy, validate on data_train_copy, score
# data_test_copy and save the predictions.

clf, accuracy_in_sample = random_forest_model_train(
    data_train=data_copy,
    decision_tree_columns=decision_tree_columns,
    n_estimators=1000,
    max_features=None,
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=2,
    bootstrap=True,
    n_jobs=8,
)

accuracy_out_of_sample = random_forest_model_test(
    clf=clf,
    data_test=data_train_copy,
    decision_tree_columns=decision_tree_columns,
)

data_predict = random_forest_model_predict(
    clf=clf,
    data_predict=data_test_copy,
    decision_tree_columns=decision_tree_columns,
    decision_tree_columns_with_id=decision_tree_columns_with_id,
)

random_forest_model_predict_write_to_csv(
    data_predict=data_predict,
    filename='random_forest_model_4.csv',
)

In [73]:
# In-sample accuracy (on data_copy) of the forest fitted with
# max_features=None.
accuracy_in_sample

np.float64(0.9628387817826208)

In [74]:
# Accuracy of the same forest on data_train_copy, which it was not fitted on.
accuracy_out_of_sample

np.float64(0.9518799556654446)

# Optimization using choice of dataset

In [20]:
# Row counts of the three modelling frames, in the order
# (data_copy, data_train_copy, data_test_copy).
tuple(len(frame) for frame in (data_copy, data_train_copy, data_test_copy))

(28632, 58645, 39098)

In [21]:
# Which of the three modelling frames carry the 'loan_status' label column,
# in the order (data_copy, data_train_copy, data_test_copy).
tuple(
    'loan_status' in frame.columns
    for frame in (data_copy, data_train_copy, data_test_copy)
)

(True, True, False)

In [41]:
# base case: train model on data_copy, which has 28632 rows
# Validation uses data_train_copy, which takes no part in the fit.

clf, accuracy_in_sample = random_forest_model_train(
    data_train=data_copy,
    decision_tree_columns=decision_tree_columns,
    n_estimators=1000,
    max_features=None,
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=2,
    bootstrap=True,
    n_jobs=8,
)

accuracy_out_of_sample = random_forest_model_test(
    clf=clf,
    data_test=data_train_copy,
    decision_tree_columns=decision_tree_columns,
)

data_predict = random_forest_model_predict(
    clf=clf,
    data_predict=data_test_copy,
    decision_tree_columns=decision_tree_columns,
    decision_tree_columns_with_id=decision_tree_columns_with_id,
)

# No CSV is written here: this repeats the configuration whose predictions
# were already saved as 'random_forest_model_4.csv'.

In [42]:
# Base case: in-sample accuracy on data_copy.
accuracy_in_sample

np.float64(0.9631531153953619)

In [43]:
# Base case: accuracy on data_train_copy, which was not used for fitting.
accuracy_out_of_sample

np.float64(0.9519822661778498)

In [None]:
# public: 0.94414, private: 0.94603

In [33]:
# second case: train model on data_train_copy, which has 58645 rows
# The roles are swapped relative to the base case: data_copy takes no part in
# the fit, so it serves as the validation set here.

clf, accuracy_in_sample = random_forest_model_train(
    data_train=data_train_copy,
    decision_tree_columns=decision_tree_columns,
    n_estimators=1000,
    max_features=None,
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=2,
    bootstrap=True,
    n_jobs=8,
)

accuracy_out_of_sample = random_forest_model_test(
    clf=clf,
    data_test=data_copy,
    decision_tree_columns=decision_tree_columns,
)

data_predict = random_forest_model_predict(
    clf=clf,
    data_predict=data_test_copy,
    decision_tree_columns=decision_tree_columns,
    decision_tree_columns_with_id=decision_tree_columns_with_id,
)

random_forest_model_predict_write_to_csv(
    data_predict=data_predict,
    filename='random_forest_model_5.csv',
)

In [34]:
# Second case: in-sample accuracy on data_train_copy.
accuracy_in_sample

np.float64(0.9673799982948248)

In [35]:
# Second case: accuracy on data_copy, which was not used for fitting.
accuracy_out_of_sample

np.float64(0.9235820061469684)

In [None]:
# public: 0.94815, private: 0.94833

In [44]:
# final case: train model on both datasets (`data_copy` and `data_train_copy`), which has 28632 + 58645 = 87277 rows

# ignore_index=True gives the stacked frame a fresh 0..n-1 index directly,
# replacing the separate in-place reset_index call.
both_data_copy = pandas.concat(
    [data_copy, data_train_copy],
    axis=0,
    ignore_index=True,
)

clf, accuracy_in_sample = random_forest_model_train(
    data_train=both_data_copy,
    decision_tree_columns=decision_tree_columns,
    n_estimators=1000,
    max_features=None,
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=2,
    bootstrap=True,
    n_jobs=8,
)

# can't produce an out of sample test, because all labelled data has been
# used for training

data_predict = random_forest_model_predict(
    clf=clf,
    data_predict=data_test_copy,
    decision_tree_columns=decision_tree_columns,
    decision_tree_columns_with_id=decision_tree_columns_with_id,
)

random_forest_model_predict_write_to_csv(
    data_predict=data_predict,
    filename='random_forest_model_6.csv',
)

In [45]:
# Final case: in-sample accuracy on the combined frame both_data_copy.
accuracy_in_sample

np.float64(0.9673682642620621)

In [None]:
# public: 0.95235, private: 0.95495

# 0.95494 is 1744 / 3810

In [49]:
# oob score
# Note: `oob_score` (no trailing underscore) is the constructor flag passed in
# random_forest_model_train, which is why this displays `True`. The fitted
# out-of-bag estimate is `oob_score_`, shown in the next cell.
clf.oob_score

True

In [50]:
# Out-of-bag score of the most recently fitted `clf`: each training row is
# scored only by trees whose bootstrap sample did not contain it.
clf.oob_score_

0.9473629936867675