In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

In [2]:
# Read the lending dataset from the project's Resources folder and preview it.
lending_csv = 'Resources/lending_data.csv'
lending = pd.read_csv(lending_csv)
lending.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700,7.672,52800,0.431818,5,1,22800,0
1,8400,6.692,43600,0.311927,3,0,13600,0
2,9000,6.963,46100,0.349241,3,0,16100,0
3,10700,7.664,52700,0.43074,5,1,22700,0
4,10800,7.698,53000,0.433962,5,1,23000,0


In [3]:
# Separate the target column from the feature columns, then hold out a test
# split. No test_size is passed, so train_test_split uses its default
# proportions; random_state is pinned so the split is repeatable.
target_column = 'loan_status'
y = lending[target_column].values
X = lending.drop(columns=target_column)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_train.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
63243,8900,6.893,45500,0.340659,3,0,15500
36991,9000,6.932,45800,0.344978,3,0,15800
39985,8900,6.896,45500,0.340659,3,0,15500
63047,9000,6.961,46100,0.349241,3,0,16100
49913,8400,6.712,43800,0.315068,3,0,13800


In [4]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# scaler to both the training and the testing features.
X_scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

# Predictions

Logistic regression tends to perform better when the number of noise variables is less than or equal to the number of explanatory variables. Considering the small number of columns in this dataset and their particular relevance to credit score, I predict that logistic regression will perform better than the random forest classifier. In addition, the data contains no categorical features, which also suggests that logistic regression will perform better.

In [5]:
from sklearn.linear_model import LogisticRegression

# Logistic regression fitted on the raw (unscaled) features.
classifier = LogisticRegression().fit(X_train, y_train)

# Score both splits first, then report them together.
lr_unscaled_train_score = classifier.score(X_train, y_train)
lr_unscaled_test_score = classifier.score(X_test, y_test)
print(f"Logistic Regression Unscaled Training Data Score: {lr_unscaled_train_score}")
print(f"Logistic Regression Unscaled Testing Data Score: {lr_unscaled_test_score}")

# Spot-check: first ten true labels next to the model's predictions for them.
lr_unscaled_sample_actual = list(y_test[:10])
lr_unscaled_sample_predicted = list(classifier.predict(X_test[:10]))
print(f'Actual:\t\t{lr_unscaled_sample_actual}')
print(f'Predicted:\t{lr_unscaled_sample_predicted}')

Logistic Regression Unscaled Training Data Score: 0.9919177328380795
Logistic Regression Unscaled Testing Data Score: 0.9924680148576145
Actual:		[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
Predicted:	[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]


In [6]:
from sklearn.linear_model import LogisticRegression

# Logistic regression fitted on the standardized features. This rebinds
# `classifier`, so later cells that use it see this scaled-data model.
classifier = LogisticRegression().fit(X_train_scaled, y_train)

# Score both splits first, then report them together.
lr_scaled_train_score = classifier.score(X_train_scaled, y_train)
lr_scaled_test_score = classifier.score(X_test_scaled, y_test)
print(f"Logistic Regression Scaled Training Data Score: {lr_scaled_train_score}")
print(f"Logistic Regression Scaled Testing Data Score: {lr_scaled_test_score}")

# Spot-check: first ten true labels next to the model's predictions for them.
lr_scaled_sample_actual = list(y_test[:10])
lr_scaled_sample_predicted = list(classifier.predict(X_test_scaled[:10]))
print(f'Actual:\t\t{lr_scaled_sample_actual}')
print(f'Predicted:\t{lr_scaled_sample_predicted}')

Logistic Regression Scaled Training Data Score: 0.9941188609162196
Logistic Regression Scaled Testing Data Score: 0.9941704498555509
Actual:		[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
Predicted:	[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]


In [7]:
from sklearn.metrics import classification_report

# Per-class report for `classifier` on the scaled test split. When the
# notebook is run top to bottom, `classifier` is the logistic regression
# fitted on the scaled training data.
lr_test_predictions = classifier.predict(X_test_scaled)
lr_report = classification_report(y_test, lr_test_predictions)
print(lr_report)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18792
           1       0.85      0.98      0.91       592

    accuracy                           0.99     19384
   macro avg       0.93      0.99      0.95     19384
weighted avg       0.99      0.99      0.99     19384



In [8]:
from sklearn.ensemble import RandomForestClassifier

# Random forest (50 trees, fixed seed) fitted on the raw (unscaled) features.
clf = RandomForestClassifier(random_state=1, n_estimators=50)
clf.fit(X_train, y_train)

# Score both splits first, then report them together.
rf_unscaled_train_score = clf.score(X_train, y_train)
rf_unscaled_test_score = clf.score(X_test, y_test)
print(f'Random Forest Unscaled Training Score: {rf_unscaled_train_score}')
print(f'Random Forest Unscaled Testing Score: {rf_unscaled_test_score}')

# Spot-check: first ten true labels next to the model's predictions for them.
rf_unscaled_sample_actual = list(y_test[:10])
rf_unscaled_sample_predicted = list(clf.predict(X_test[:10]))
print(f'Actual:\t\t{rf_unscaled_sample_actual}')
print(f'Predicted:\t{rf_unscaled_sample_predicted}')

Random Forest Unscaled Training Score: 0.9971798046498831
Random Forest Unscaled Testing Score: 0.9920037144036319
Actual:		[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
Predicted:	[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]


In [9]:
from sklearn.ensemble import RandomForestClassifier

# Random forest (50 trees, fixed seed) fitted on the standardized features.
# This rebinds `clf`, so later cells that use it see this scaled-data model.
clf = RandomForestClassifier(random_state=1, n_estimators=50)
clf.fit(X_train_scaled, y_train)

# Score both splits first, then report them together.
rf_scaled_train_score = clf.score(X_train_scaled, y_train)
rf_scaled_test_score = clf.score(X_test_scaled, y_test)
print(f'Random Forest Scaled Training Score: {rf_scaled_train_score}')
print(f'Random Forest Scaled Testing Score: {rf_scaled_test_score}')

# Spot-check: first ten true labels next to the model's predictions for them.
rf_scaled_sample_actual = list(y_test[:10])
rf_scaled_sample_predicted = list(clf.predict(X_test_scaled[:10]))
print(f'Actual:\t\t{rf_scaled_sample_actual}')
print(f'Predicted:\t{rf_scaled_sample_predicted}')

Random Forest Scaled Training Score: 0.9971798046498831
Random Forest Scaled Testing Score: 0.9917457697069748
Actual:		[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
Predicted:	[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]


In [10]:
# Per-class report for `clf` on the scaled test split. When the notebook is
# run top to bottom, `clf` is the random forest fitted on the scaled
# training data.
rf_test_predictions = clf.predict(X_test_scaled)
rf_report = classification_report(y_test, rf_test_predictions)
print(rf_report)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18792
           1       0.85      0.89      0.87       592

    accuracy                           0.99     19384
   macro avg       0.92      0.94      0.93     19384
weighted avg       0.99      0.99      0.99     19384



# Conclusions

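On the scaled data, the logistic regression testing score (about 0.994) is slightly higher than the random forest testing score (about 0.992), so the two models are nearly identical in overall accuracy. The classification reports show a larger gap on the minority class (loan_status 1): logistic regression reaches a recall of 0.98 versus 0.89 for the random forest, which is consistent with the prediction that logistic regression would perform better.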