In [25]:
# Dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

In [4]:
# Location of the lending dataset, relative to the notebook's working directory
file_path = 'Resources/lending_data.csv'

# Load the CSV into a DataFrame and preview its first five rows
df_credit = pd.read_csv(filepath_or_buffer=file_path)
df_credit.head(n=5)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [12]:
# Target vector: the 'loan_status' label column
y = df_credit['loan_status']

# Feature matrix: every column except the label.
# drop() returns a new DataFrame, so df_credit itself is left unchanged.
X = df_credit.drop(columns=['loan_status'])

In [20]:
# Peek at the first five target labels
y.head(5)

0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

In [13]:
# Preview the first five rows of the feature matrix
X.head(n=5)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


In [15]:
# Count the rows carrying each label to see how imbalanced the classes are
label_counts = y.value_counts()
print(label_counts)

0    75036
1     2500
Name: loan_status, dtype: int64


In [21]:
# Hold out a test set. random_state pins the shuffle so the split is
# reproducible; test_size is not given, so the library default applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [22]:
# Instantiate the logistic regression classifier and train it on the original
# training split. fit() returns the estimator itself, so the chained call
# leaves the fitted model bound to log_model.
log_model = LogisticRegression(random_state=1, solver='lbfgs').fit(X_train, y_train)

# Show the fitted estimator as the cell output
log_model

LogisticRegression(random_state=1)

In [23]:
# Predict a label for every row of the held-out test features
predictions = log_model.predict(X=X_test)

# Show the predicted labels as the cell output
predictions

array([0, 0, 0, ..., 0, 0, 0])

In [26]:
# Balanced accuracy (the mean of the per-class recalls) of the test-set
# predictions; this is not the same number as plain accuracy.
balanced_accuracy_score(y_true=y_test, y_pred=predictions)

0.9481182566723452

In [27]:
# Confusion matrix for the original model:
# rows are the true labels, columns are the predicted labels
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[18682,   102],
       [   59,   541]])

In [28]:
# Per-class precision, recall and F1 for the original model's predictions
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18784
           1       0.84      0.90      0.87       600

    accuracy                           0.99     19384
   macro avg       0.92      0.95      0.93     19384
weighted avg       0.99      0.99      0.99     19384



How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

The model predicts both classes well, although it is better at predicting healthy loans than high-risk loans. Comparing precision, the report shows a score of 1.00 for healthy loans and 0.84 for high-risk loans: 102 of the 643 loans the model flagged as high-risk were actually healthy (false positives). Comparing recall, the model scores higher on healthy loans (0.99, with only 102 of 18,784 healthy loans misclassified) than on high-risk loans (0.90); the confusion matrix shows that it missed 59 of the 600 high-risk loans (false negatives). Overall, the model has an accuracy score of 99%, but because roughly 97% of the loans are healthy that figure is dominated by the majority class; the balanced accuracy of about 0.95 is a fairer summary.

## Predict a Logistic Regression Model with Resampled Training Data

### Step 1: Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [31]:
# RandomOverSampler is provided by the imbalanced-learn package
from imblearn.over_sampling import RandomOverSampler

# Create the oversampler; random_state=1 makes the resampling repeatable
ros = RandomOverSampler(random_state=1)

# Resample the training split only; the test split is not passed in here
X_train_ros, y_train_ros = ros.fit_resample(X=X_train, y=y_train)

In [34]:
# Count the rows per label in the resampled training labels to confirm
# that both classes now have the same number of data points
y_train_ros.value_counts()

0    56252
1    56252
Name: loan_status, dtype: int64

In [35]:
# Train a second logistic regression (same settings, random_state=1), this
# time on the oversampled training data. fit() returns the estimator itself,
# so the chained call binds the fitted model to log_model_resampled.
log_model_resampled = LogisticRegression(random_state=1, solver='lbfgs').fit(
    X_train_ros,
    y_train_ros,
)

# Predict labels for the same test features used with the original model
ros_predictions = log_model_resampled.predict(X=X_test)

In [36]:
# Balanced accuracy (the mean of the per-class recalls) of the model
# trained on the oversampled data
balanced_accuracy_score(y_true=y_test, y_pred=ros_predictions)

0.9945453577512777

In [37]:
# Confusion matrix for the resampled model:
# rows are the true labels, columns are the predicted labels
confusion_matrix(y_true=y_test, y_pred=ros_predictions)

array([[18673,   111],
       [    3,   597]])

In [38]:
# Per-class precision, recall and F1 for the resampled model's predictions
ros_report = classification_report(y_true=y_test, y_pred=ros_predictions)
print(ros_report)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18784
           1       0.84      0.99      0.91       600

    accuracy                           0.99     19384
   macro avg       0.92      0.99      0.95     19384
weighted avg       0.99      0.99      0.99     19384



How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

Again, the model fit with oversampled data does better at correctly predicting healthy loans, with a precision score of 1.00, and slightly worse at correctly predicting high-risk loans, with a precision score of 0.84. These scores are the same as those of the model that was fit with the original data. When comparing the recall scores for predicting both classes of loans, a significant change is noticed between the two models: the model fit with oversampled data has a recall score of 0.99 for both the healthy and the high-risk loans, which is a noticeable increase from the recall score of 0.90 for the high-risk loans from the original model (only 3 high-risk loans are missed, down from 59). Again, the accuracy score of this model is 99%; however, the recall for high-risk loans is better, which lifts the balanced accuracy from about 0.95 to 0.99.