In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

In [2]:
train_df = pd.read_csv(Path('Resources/Generator/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/Generator/2020Q1loans.csv'))

In [3]:
# Convert categorical data to numeric and separate target feature for training data
ry = pd.get_dummies(train_df['target'])['high_risk']
rtrimmed = train_df.drop('target', axis=1)
rX = pd.get_dummies(rtrimmed)

# Convert categorical data to numeric and separate target feature for testing data
sy = pd.get_dummies(test_df['target'])['high_risk']
strimmed = test_df.drop('target', axis=1)
sX = pd.get_dummies(strimmed)


In [4]:
# test for missing dummy variables in sets
missing = []
for label in rX.columns:
    if label not in sX.columns:
        missing.append(label)
print(f'Items in training but not testing: {missing}')
# confirm only rdum has missing values
print(f'Items in only one set: {set(rX.columns) ^ set(sX.columns)}')

# add missing dummy variables to testing set
for column in missing:
    sX[column] = 0

# confirm values are now symmetric
print(f'Items missing after adjustment: {set(rX.columns) ^ set(sX.columns)}')

Items in training but not testing: ['debt_settlement_flag_Y']
Items in only one set: {'debt_settlement_flag_Y'}
Items missing after adjustment: set()


In [5]:
# Train the Logistic Regression model on the unscaled data
classifier = LogisticRegression(max_iter=20000)
classifier.fit(rX, ry)

LogisticRegression(max_iter=20000)

In [6]:
def print_score(classifier, rX, ry, sX, sy):
    print(f"Training Score: {classifier.score(rX, ry)}")
    print(f"Testing Score: {classifier.score(sX, sy)}")
    print(classification_report(sy, classifier.predict(sX)))

In [7]:
# Print metrics for the Logistic Regression model
print_score(classifier, rX, ry, sX, sy)

Training Score: 0.7087848932676519
Testing Score: 0.5652913653764355
              precision    recall  f1-score   support

           0       0.55      0.78      0.64      2351
           1       0.61      0.36      0.45      2351

    accuracy                           0.57      4702
   macro avg       0.58      0.57      0.55      4702
weighted avg       0.58      0.57      0.55      4702




# Observation

---

From the extremely high `max_iter` required to get the model to converge in the above cell, it is likely the linear regression is going to be overfitted to the training data  

This is reflected in the testing score being much lower than the training score

In [8]:
# Train a Random Forest Classifier model 
classifier2 = RandomForestClassifier()
classifier2.fit(rX,ry)

RandomForestClassifier()

In [9]:
# Print the model score
print_score(classifier2, rX, ry, sX, sy)

Training Score: 1.0
Testing Score: 0.6412165036154828
              precision    recall  f1-score   support

           0       0.71      0.47      0.57      2351
           1       0.61      0.81      0.69      2351

    accuracy                           0.64      4702
   macro avg       0.66      0.64      0.63      4702
weighted avg       0.66      0.64      0.63      4702



In [10]:
# Scale the data
scaler = StandardScaler()
scaler.fit(sX)
rX_scaled = scaler.transform(rX)
sX_scaled = scaler.transform(sX)

In [20]:
# Train the Logistic Regression model on the scaled data and print the model score
classifier3 = LogisticRegression(max_iter=2000)
classifier3.fit(rX_scaled, ry)

LogisticRegression(max_iter=2000)

In [21]:
# Print metrics for the Logistic Regression model
print_score(classifier3, rX_scaled, ry, sX_scaled, sy)

Training Score: 0.7090311986863711
Testing Score: 0.7547851977881752
              precision    recall  f1-score   support

           0       0.75      0.77      0.76      2351
           1       0.76      0.74      0.75      2351

    accuracy                           0.75      4702
   macro avg       0.75      0.75      0.75      4702
weighted avg       0.75      0.75      0.75      4702



In [13]:
# Train a Random Forest Classifier model on the scaled data and print the model score
classifier4 = RandomForestClassifier()
classifier4.fit(rX_scaled,ry)

RandomForestClassifier()

In [14]:
# Print the model score
print_score(classifier4, rX_scaled, ry, sX_scaled, sy)

Training Score: 1.0
Testing Score: 0.6316461080391322
              precision    recall  f1-score   support

           0       0.70      0.47      0.56      2351
           1       0.60      0.79      0.68      2351

    accuracy                           0.63      4702
   macro avg       0.65      0.63      0.62      4702
weighted avg       0.65      0.63      0.62      4702



In [17]:
# compare Logistic Regression scores

print('Unscaled Logistic Regression score')
print_score(classifier, rX, ry, sX, sy)
print('-'*10)
print('Scaled Logistic Regression score')
print_score(classifier3, rX_scaled, ry, sX_scaled, sy)


Unscaled Logistic Regression score
Training Score: 0.7087848932676519
Testing Score: 0.5652913653764355
              precision    recall  f1-score   support

           0       0.55      0.78      0.64      2351
           1       0.61      0.36      0.45      2351

    accuracy                           0.57      4702
   macro avg       0.58      0.57      0.55      4702
weighted avg       0.58      0.57      0.55      4702

----------
Scaled Logistic Regression score
Training Score: 0.7090311986863711
Testing Score: 0.7547851977881752
              precision    recall  f1-score   support

           0       0.75      0.77      0.76      2351
           1       0.76      0.74      0.75      2351

    accuracy                           0.75      4702
   macro avg       0.75      0.75      0.75      4702
weighted avg       0.75      0.75      0.75      4702



In [18]:
# compare Random Forest scores

print('Unscaled Random Forest score')
print_score(classifier2, rX, ry, sX, sy)
print('-'*10)
print('Scaled Random Forest score')
print_score(classifier4, rX_scaled, ry, sX_scaled, sy)

Unscaled Random Forest score
Training Score: 1.0
Testing Score: 0.6412165036154828
              precision    recall  f1-score   support

           0       0.71      0.47      0.57      2351
           1       0.61      0.81      0.69      2351

    accuracy                           0.64      4702
   macro avg       0.66      0.64      0.63      4702
weighted avg       0.66      0.64      0.63      4702

----------
Scaled Random Forest score
Training Score: 1.0
Testing Score: 0.6316461080391322
              precision    recall  f1-score   support

           0       0.70      0.47      0.56      2351
           1       0.60      0.79      0.68      2351

    accuracy                           0.63      4702
   macro avg       0.65      0.63      0.62      4702
weighted avg       0.65      0.63      0.62      4702




# Observations

Taking an order of magnitude fewer iterations to converge, the scaled version of the Logistic Regression performed markedly better than the unscaled version (high-risk f1-score of 75 vs 45)

Surprisingly, the random forest models were not improved by scaling the data, and while the random forest did better than the logistic regression when unscaled, the scaled logistic regression did much better than either of the random forest models (75 vs 69 and 68). 