In [13]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

In [18]:
lending_df = file_path = Path('/Users/dev/Desktop/Resources/lending_data.csv')
lending_df = pd.read_csv(file_path)
print(lending_df.head())

   loan_size  interest_rate  borrower_income  debt_to_income  num_of_accounts  \
0    10700.0          7.672            52800        0.431818                5   
1     8400.0          6.692            43600        0.311927                3   
2     9000.0          6.963            46100        0.349241                3   
3    10700.0          7.664            52700        0.430740                5   
4    10800.0          7.698            53000        0.433962                5   

   derogatory_marks  total_debt  loan_status  
0                 1       22800            0  
1                 0       13600            0  
2                 0       16100            0  
3                 1       22700            0  
4                 1       23000            0  


In [19]:
y = lending_df['loan_status']
X = lending_df.drop(columns=['loan_status'])

In [20]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

In [21]:
X.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


In [22]:
y.value_counts()


0    75036
1     2500
Name: loan_status, dtype: int64

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [25]:
from sklearn.linear_model import LogisticRegression

logistic_regression_model = LogisticRegression(solver='lbfgs', random_state=1)

lr_model = logistic_regression_model.fit(X_train, y_train)

In [26]:
test_predictions = logistic_regression_model.predict(X_test)
pd.DataFrame({'Predictions': test_predictions, 'Actual': y_test})

Unnamed: 0,Predictions,Actual
60914,0,0
36843,0,0
1966,0,0
70137,0,0
27237,0,0
...,...,...
45639,0,0
11301,0,0
51614,0,0
4598,0,0


In [27]:
print(f"The balanced accuracy score of the model is: {balanced_accuracy_score(y_test, test_predictions)}")

The balanced accuracy score of the model is: 0.9520479254722232


In [28]:
cf_test_matrix = confusion_matrix(y_test, test_predictions)
cf_test_matrix   

array([[18663,   102],
       [   56,   563]])

In [29]:
testing_report = classification_report(y_test, test_predictions)
print(testing_report)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.85      0.91      0.88       619

    accuracy                           0.99     19384
   macro avg       0.92      0.95      0.94     19384
weighted avg       0.99      0.99      0.99     19384



In [31]:
#Question: How well does the logistic regression model predict both the 0 (healthy loan) and 1 (high-risk loan) labels?

#Answer: The logistic regression model was 95% accurate at predicting the healthy vs high-risk loan labels

In [39]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_ros_model, y_ros_model = ros.fit_resample(X,y)

In [33]:
from collections import Counter
print(Counter(X_ros_model))
print(Counter(y_ros_model))
print(f"The y_ros_model resampled data is equivalently split")

Counter({'loan_size': 1, 'interest_rate': 1, 'borrower_income': 1, 'debt_to_income': 1, 'num_of_accounts': 1, 'derogatory_marks': 1, 'total_debt': 1})
Counter({0: 75036, 1: 75036})
The y_ros_model resampled data is equivalently split


In [34]:
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier.fit(X_ros_model, y_ros_model)
predictions = classifier.predict(X_ros_model)
pd.DataFrame({'Predictions': predictions, 'Actual': y_ros_model})

Unnamed: 0,Predictions,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
150067,1,1
150068,1,1
150069,1,1
150070,1,1


In [35]:
print(f"The balanced accuracy score of the model is: {balanced_accuracy_score(y_ros_model, predictions)}")

The balanced accuracy score of the model is: 0.9945026387334079


In [36]:
cf_matrix = confusion_matrix(y_ros_model, predictions)
cf_matrix  

array([[74614,   422],
       [  403, 74633]])

In [37]:
report = classification_report(y_ros_model, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     75036
           1       0.99      0.99      0.99     75036

    accuracy                           0.99    150072
   macro avg       0.99      0.99      0.99    150072
weighted avg       0.99      0.99      0.99    150072



In [38]:
#Question: How well does the logistic regression model, fit with oversampled data, predict both the 0 (healthy loan) and 1 (high-risk loan) labels?

#Answer: The logistic regression model predicts the oversampled data with near-perfect accuracy (>99% accurate)