In [1]:
from pathlib import Path
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the 2019 loans (training set) and the 2020 Q1 loans (testing set)
train_path = Path('Resources/2019loans.csv')
test_path = Path('Resources/2020Q1loans.csv')

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [3]:
# Number of (rows, columns) in the training data, label column included
train_df.shape

(12180, 86)

## Preprocessing without Scaling

In [4]:
# Build the feature frames (X) by removing the 'loan_status' label column
X_train = train_df.drop(columns='loan_status')
X_test = test_df.drop(columns='loan_status')

In [5]:
# Training target (y): the 'loan_status' label
y_train = train_df['loan_status']

# One-hot encode the categorical training features
X_train_dummies = pd.get_dummies(data=X_train)

In [6]:
# Testing target (y): the 'loan_status' label
y_test = test_df['loan_status']

# One-hot encode the categorical testing features
X_test_dummies = pd.get_dummies(data=X_test)

In [7]:
# Compare the encoded shapes; differing column counts mean a dummy column is missing
tuple(frame.shape for frame in (X_train_dummies, X_test_dummies))

((12180, 94), (4702, 93))

In [8]:
# Find the dummy columns that exist in only one of the two encoded sets
train_columns = set(X_train_dummies.columns)
test_columns = set(X_test_dummies.columns)

# Columns present in training but absent from testing
print(list(train_columns.difference(test_columns)))
# Columns present in testing but absent from training
print(list(test_columns.difference(train_columns)))

['debt_settlement_flag_Y']
[]


In [9]:
# Align the testing features with the training features.
# X_test_dummies is missing 'debt_settlement_flag_Y' because get_dummies only
# creates a column for categories that occur in the data, so the correct value
# of that dummy is 0 for every testing row.
# reindex (rather than appending a column) also guarantees that both frames
# have the same columns in the same order, which matters because the scaler and
# the models are fitted on the training column layout. The cell is idempotent:
# re-running it leaves an already aligned frame unchanged.
X_test_dummies = X_test_dummies.reindex(columns=X_train_dummies.columns, fill_value=0)

## Running the Models

As the data has more continuous variables than categorical variables, I predict that the Logistic Regression model will perform better than the Random Forest Classifier.

In [10]:
# Train the Logistic Regression model on the unscaled data.
# NOTE: on these unscaled features the solver reports reaching its iteration
# limit (see the warning in the cell output), so this fit has not converged.
# It is kept as-is to serve as the "no scaling" baseline.
classifier = LogisticRegression(solver = "lbfgs")
classifier.fit(X_train_dummies, y_train)

# Print the model score
print(f"Training Data Score: {classifier.score(X_train_dummies, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_dummies, y_test)}")

Training Data Score: 0.6485221674876848
Testing Data Score: 0.5253083794130158


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
# Train a Random Forest Classifier model on the unscaled data.
# random_state is fixed so the fitted forest is reproducible between runs.
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_dummies, y_train)

# Show every hyperparameter of the forest (explicitly set values and defaults)
print("Hyperparameter List:")
pprint(clf.get_params())

Hyperparameter List:
{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 500,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 1,
 'verbose': 0,
 'warm_start': False}


In [12]:
# Report the Random Forest score on the training and testing sets
for split_name, split_features, split_labels in (
    ('Training', X_train_dummies, y_train),
    ('Testing', X_test_dummies, y_test),
):
    print(f'{split_name} Score: {clf.score(split_features, split_labels)}')

Training Score: 1.0
Testing Score: 0.6180348787749894


## Preprocessing with Scaling

I predict that scaling will improve the accuracy of the models, as it should reduce their tendency to be biased towards features with larger numeric values.

In [13]:
# Scale the data (reference: 04-Ins_Preprocessing-Data)
# The scaler is fitted on the training features only
scaler = StandardScaler()
scaler.fit(X_train_dummies)

# Both sets are transformed with the statistics learned from the training set
X_train_scaled = scaler.transform(X_train_dummies)
X_test_scaled = scaler.transform(X_test_dummies)

## Re-running the Models

Scaling helps Logistic Regression more than Random Forests, so I predict that Logistic Regression will now outperform the Random Forest Classifier.

In [14]:
# Fit a fresh Logistic Regression model, this time on the scaled features
# NOTE(review): the cell output still shows the iteration-limit warning, so a
# larger max_iter may be needed for full convergence - confirm before relying
# on these scores.
classifier = LogisticRegression(solver = "lbfgs").fit(X_train_scaled, y_train)

# Report the model score on the training and testing sets
for split_name, split_features, split_labels in (
    ("Training", X_train_scaled, y_train),
    ("Testing", X_test_scaled, y_test),
):
    print(f"{split_name} Data Score: {classifier.score(split_features, split_labels)}")

Training Data Score: 0.713136288998358
Testing Data Score: 0.7201190982560612


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
# Fit a fresh Random Forest Classifier, this time on the scaled features
clf = RandomForestClassifier(n_estimators=500, random_state=1)
clf.fit(X_train_scaled, y_train)

# Report the model score on the training and testing sets
for split_name, split_features, split_labels in (
    ('Training', X_train_scaled, y_train),
    ('Testing', X_test_scaled, y_test),
):
    print(f'{split_name} Score: {clf.score(split_features, split_labels)}')

Training Score: 1.0
Testing Score: 0.6193109315185028


## Final Analysis

Regarding the testing score, the methods performed as follows:

    Logistic Regression, Scaling Testing Data Score: 0.7201190982560612

    Random Forest, Scaling Testing Score: 0.6193109315185028

    Random Forest, No Scaling Testing Score: 0.6180348787749894

    Logistic Regression, No Scaling Testing Data Score: 0.5253083794130158




Prediction 1: Logistic Regression will perform better than Random Forests before scaling - Incorrect

Prediction 2: Scaling will improve the model score - No change for Random Forests (difference of 0.001), Correct for Logistic Regression

Prediction 3: Logistic Regression will perform better than Random Forests after scaling - Correct

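Learning: Not all models are sensitive to the magnitude of feature values. Random Forests split on per-feature thresholds, so rescaling a feature barely changes them (the score moved by only 0.001), whereas Logistic Regression is sensitive to feature scale and should, therefore, be trained on scaled features.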

## Additional Information

### Confusion Matrix and Classification Reports:

In [16]:
# At this point `classifier` is the Logistic Regression model that was fitted
# on the scaled data, so the predictions are made on the scaled testing set.
y_true = y_test
y_pred = classifier.predict(X_test_scaled)

# Rows are the true classes and columns are the predicted classes:
# [True Negative, False Positive], [False Negative, True Positive]
confusion_matrix(y_true, y_pred)

array([[1242, 1109],
       [ 207, 2144]], dtype=int64)

In [17]:
# Rebuild sklearn's classification report by hand from the confusion matrix
cm = confusion_matrix(y_true, y_pred)

# True Negatives, False Positives, False Negatives, True Positives
# (unpacking four values requires a 2x2 matrix, i.e. a binary problem)
tn, fp, fn, tp = cm.ravel()

# The ratio of the correctly labeled subjects to the whole pool of subjects
accuracy = (tp + tn) / (tp + fp + fn + tn)

# The ratio of correctly 'positive' labeled subjects to all 'positive' labeled
precision = tp / (tp + fp)

# (Also 'sensitivity') The ratio of correctly 'positive' labeled subjects to all that should have been labelled 'positive'
recall = tp / (tp + fn)

# The correctly 'negative' labeled subjects to all that should have been labelled 'negative'
specificity = tn / (tn + fp)

# The ratio of correctly 'negative' labeled subjects to all 'negative' labeled
negativePredictiveValue = tn / (tn + fn)

# The harmonic mean(average) of the precision and recall
# Harmonic Mean: The appropriate mean if the data is comprised of rates (false positive *rate*, etc)
# harmonic mean = N / [(1/x1) + (1/x2) + ... + (1/xN)], simplified for 2 items below
# F1 Score is best/highest if there is balance between precision & recall

# In this case "0" is high-risk loans and "1" is low-risk.
# For class "0" the negative predictive value plays the role of precision and
# the specificity plays the role of recall.
f1_0 = 2*(negativePredictiveValue * specificity) / (negativePredictiveValue + specificity)
f1_1 = 2*(recall * precision) / (recall + precision)

# The 'support' is the number of occurrences of each class in y_true, which is
# the sum of that class's row in the confusion matrix (rows = true classes).
# Taking it from the matrix avoids indexing value_counts() by position: that
# Series is labelled by class name and ordered by frequency, so position 0 is
# the most frequent class rather than necessarily class "0".
support_0 = tn + fp
support_1 = fn + tp

# Macro-averaged F1 score is the arithmetic mean (aka unweighted mean) of all the per-class F1 scores
# Number of classes = number of rows in the confusion matrix
length = cm.shape[0]

macro_avg_npv_prec = (negativePredictiveValue + precision) / length
macro_avg_spec_rec = (specificity + recall) / length
macro_avg_f1 = (f1_0 + f1_1) / length

# Weighted-averaged F1 score is the mean of all per-class F1 scores while considering each class’s support (weight)
wt_0 = support_0 / (support_0 + support_1)
wt_1 = support_1 / (support_0 + support_1)

weighted_npv_prec = ((negativePredictiveValue * wt_0) + (precision * wt_1)) / (wt_0 + wt_1)
weighted_spec_rec = ((specificity * wt_0) + (recall * wt_1)) / (wt_0 + wt_1)
weighted_f1 = ((f1_0 * wt_0) + (f1_1 * wt_1)) / (wt_0 + wt_1)
# They are the same in the report below because there are the same number of high-risk and low-risk subjects

# Rows: class "0", class "1", accuracy, macro averages, weighted averages
print("Handmade Classification Report:")
print(negativePredictiveValue, specificity, f1_0, support_0)
print(precision, recall, f1_1, support_1)
print(accuracy)
print(macro_avg_npv_prec, macro_avg_spec_rec, macro_avg_f1)
print(weighted_npv_prec, weighted_spec_rec, weighted_f1)

Handmade Classification Report:
0.8571428571428571 0.528285835814547 0.6536842105263158 2351
0.6590839225330464 0.9119523606975755 0.765167737330478 2351
0.7201190982560612
0.7581133898379517 0.7201190982560612 0.7094259739283969
0.7581133898379517 0.7201190982560612 0.7094259739283969


In [18]:
# sklearn classification_report for comparison with the handmade report
# (classification_report is already imported in the first cell)
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

   high_risk       0.86      0.53      0.65      2351
    low_risk       0.66      0.91      0.77      2351

    accuracy                           0.72      4702
   macro avg       0.76      0.72      0.71      4702
weighted avg       0.76      0.72      0.71      4702

