In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the two loan CSV files: the 2019 file feeds the training frame and the
# 2020 Q1 file feeds the testing frame.
train_csv_path = Path('Resources/2019loans.csv')
test_csv_path = Path('Resources/2020Q1loans.csv')

train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [4]:
# Remove the two bookkeeping columns from both frames and keep an untouched
# copy of each cleaned frame.
# NOTE(review): 'index' and 'Unnamed: 0' look like row-index artifacts of the
# CSV export rather than real features -- confirm against the data source.
#
# errors='ignore' keeps this cell idempotent: it reassigns train_df/test_df
# from themselves, so without it a second run of the cell would raise KeyError
# because the columns have already been dropped.
columns_to_drop = ['index', 'Unnamed: 0']

train_df = train_df.drop(columns=columns_to_drop, errors='ignore')
train_df_backup = train_df.copy()

test_df = test_df.drop(columns=columns_to_drop, errors='ignore')
test_df_backup = test_df.copy()

In [5]:
# Training data: separate the 'loan_status' target from the features and
# one-hot encode both.
# drop_first=True drops the first level of each encoded column; for a
# two-level target this leaves a single indicator column.
train_df_X = train_df.drop('loan_status', axis=1)

train_df_dummy_y = pd.get_dummies(train_df.loc[:, 'loan_status'], drop_first=True)
train_df_dummy_X = pd.get_dummies(train_df_X, drop_first=True)

In [6]:
# Testing data: separate the 'loan_status' target from the features and
# one-hot encode both, using the same drop_first=True setting as the
# training data.
test_df_X = test_df.drop('loan_status', axis=1)

test_df_dummy_y = pd.get_dummies(test_df.loc[:, 'loan_status'], drop_first=True)
test_df_dummy_X = pd.get_dummies(test_df_X, drop_first=True)

In [7]:
# Align the testing features with the training features.
#
# The two frames are one-hot encoded independently, so a category level that
# is absent from one file produces no dummy column in that frame. The testing
# frame is therefore reindexed to the training columns, which:
#   * adds every training column missing from the testing set, filled with 0,
#   * puts the columns in the same order as the training set,
#   * drops any testing-only column the models were never trained on.
# Merely appending the missing columns (the previous approach) left the column
# order and any testing-only columns unaligned with the training set.

# Snapshots of the column names *before* alignment.
test_columns = list(test_df_dummy_X.columns)
train_columns = list(train_df_dummy_X.columns)

# Training columns that the testing set lacks (reported below).
columns_seen_in_test = set(test_columns)
dummy_col_list = [col for col in train_columns if col not in columns_seen_in_test]

test_df_dummy_X = test_df_dummy_X.reindex(columns=train_columns, fill_value=0)

print(f'{dummy_col_list} added to testing set')

['debt_settlement_flag_Y'] added to testing set


Prediction #1

Since the data is unscaled, I expect the Logistic Regression model to perform worse than the Random Forest Classifier.

In [8]:
# Logistic Regression on the unscaled features: fit on the training set, then
# print the score for the training and testing sets.
# The one-column target frame is flattened to a 1-D array for fitting.
classifier_log_regression = LogisticRegression(max_iter=10000).fit(
    train_df_dummy_X,
    train_df_dummy_y.values.ravel(),
)

lr_train_score = classifier_log_regression.score(train_df_dummy_X, train_df_dummy_y)
print(f"Logistic Regression model Training Data Score: {lr_train_score}")

lr_test_score = classifier_log_regression.score(test_df_dummy_X, test_df_dummy_y)
print(f"Logistic Regression model Testing Data Score: {lr_test_score}")



Logistic Regression model Training Data Score: 0.7042692939244664
Logistic Regression model Testing Data Score: 0.5752871118672905


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
# Random Forest (1000 trees, fixed random_state for repeatable results) on the
# unscaled features: fit on the training set, then print the score for the
# training and testing sets.
classifier_rand_forest = RandomForestClassifier(n_estimators=1000, random_state=1).fit(
    train_df_dummy_X,
    train_df_dummy_y.values.ravel(),
)

rf_train_score = classifier_rand_forest.score(
    train_df_dummy_X,
    train_df_dummy_y.values.ravel(),
)
print(f"Random Forest Classifier model Training Data Score: {rf_train_score}")

rf_test_score = classifier_rand_forest.score(
    test_df_dummy_X,
    test_df_dummy_y.values.ravel(),
)
print(f"Random Forest Classifier model Testing Data Score: {rf_test_score}")

Random Forest Classifier model Training Data Score: 1.0
Random Forest Classifier model Testing Data Score: 0.6405784772437261


Prediction #2

In the last experiment, I did not expect the difference between the two models' scores to be so small. With the scaled data, I expect the Logistic Regression scores to increase.

In [10]:
# Standardize the features. The scaler is fitted on the training features
# only, and that same fitted scaler then transforms both feature sets.
scaler = StandardScaler()
scaler.fit(train_df_dummy_X)

train_df_dummy_X_scaled, test_df_dummy_X_scaled = (
    scaler.transform(feature_frame)
    for feature_frame in (train_df_dummy_X, test_df_dummy_X)
)


In [11]:
# Logistic Regression on the scaled features: fit on the scaled training set,
# then print the score for the scaled training and testing sets.
classifier_log_regression_scaled = LogisticRegression(max_iter=10000).fit(
    train_df_dummy_X_scaled,
    train_df_dummy_y.values.ravel(),
)

lr_train_score_scaled = classifier_log_regression_scaled.score(
    train_df_dummy_X_scaled,
    train_df_dummy_y,
)
print(f"Logistic Regression model Scaled Training Data Score: {lr_train_score_scaled}")

lr_test_score_scaled = classifier_log_regression_scaled.score(
    test_df_dummy_X_scaled,
    test_df_dummy_y,
)
print(f"Logistic Regression model Scaled Testing Data Score: {lr_test_score_scaled}")

Logistic Regression model Scaled Training Data Score: 0.7078817733990148
Logistic Regression model Scaled Testing Data Score: 0.7679710761378137


In [12]:
# Random Forest (1000 trees, fixed random_state for repeatable results) on the
# scaled features: fit on the scaled training set, then print the score for
# the scaled training and testing sets.
classifier_rand_forest_scaled = RandomForestClassifier(n_estimators=1000, random_state=1).fit(
    train_df_dummy_X_scaled,
    train_df_dummy_y.values.ravel(),
)

rf_train_score_scaled = classifier_rand_forest_scaled.score(
    train_df_dummy_X_scaled,
    train_df_dummy_y,
)
print(f"Random Forest Classifier model Training Data Score: {rf_train_score_scaled}")

rf_test_score_scaled = classifier_rand_forest_scaled.score(
    test_df_dummy_X_scaled,
    test_df_dummy_y,
)
print(f"Random Forest Classifier model Testing Data Score: {rf_test_score_scaled}")

Random Forest Classifier model Training Data Score: 1.0
Random Forest Classifier model Testing Data Score: 0.6407911527009783


Results & Thoughts

The outputs for my unscaled and scaled Random Forest Classifier models were virtually the same. I did not expect this. I expected the score to be high.

I also noticed that the Logistic Regression model improved greatly when I increased the number of iterations.

The test score for my scaled Logistic Regression model was higher than the training score. I did not expect this to occur. Perhaps this is due to more qualified borrowers in the training set.