In [18]:
# Import the modules
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [4]:
# Read the lending data CSV file into a Pandas DataFrame
# NOTE(review): the path points at the notebook's own directory, while the step
# heading mentions a `Resources` folder - confirm which location is intended.
file_path = Path("./lending_data.csv")
df_creditrisk = pd.read_csv(file_path)

# Review the first rows of the DataFrame
df_creditrisk.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


##### Review the DataFrame Structure 

In [5]:
# Review descriptive statistics (count, mean, std, min, quartiles, max) for each column
df_creditrisk.describe()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
count,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0
mean,9805.562577,7.292333,49221.949804,0.377318,3.82661,0.392308,19221.949804,0.032243
std,2093.223153,0.889495,8371.635077,0.081519,1.904426,0.582086,8371.635077,0.176646
min,5000.0,5.25,30000.0,0.0,0.0,0.0,0.0,0.0
25%,8700.0,6.825,44800.0,0.330357,3.0,0.0,14800.0,0.0
50%,9500.0,7.172,48100.0,0.376299,4.0,0.0,18100.0,0.0
75%,10400.0,7.528,51400.0,0.416342,4.0,1.0,21400.0,0.0
max,23800.0,13.235,105200.0,0.714829,16.0,3.0,75200.0,1.0


In [7]:
# Review the DataFrame shape, i.e. its (number of rows, number of columns)
df_creditrisk.shape

(77536, 8)

In [8]:
# Review the distribution of values in the outcome ("y") column.
# A value of 0 in the "loan_status" column means that the loan is healthy;
# a value of 1 means that the loan has a high risk of defaulting.
df_creditrisk["loan_status"].value_counts()

0    75036
1     2500
Name: loan_status, dtype: int64

### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [12]:
# Separate the data into labels and features
# Labels = "loan_status", Features = all other columns

# Define the y variable (the labels)
y = df_creditrisk["loan_status"]

# Verify all data points are present: one label per row of the source DataFrame
y.shape



(77536,)

In [13]:
# Define the X variable (the features): every column except the label column
X = df_creditrisk.drop(columns=["loan_status"])

In [21]:
# Review the shape of the y (labels) Series
y.shape

(77536,)

In [19]:
# Review the X variable DataFrame - it should hold the 7 feature columns,
# i.e. every column of the source DataFrame except "loan_status".
# The check is an assertion because a bare `X.shape` in the middle of a cell
# is never displayed (only the last expression of a cell is shown).
assert X.shape == (len(df_creditrisk), df_creditrisk.shape[1] - 1)

X.columns

Index(['loan_size', 'interest_rate', 'borrower_income', 'debt_to_income',
       'num_of_accounts', 'derogatory_marks', 'total_debt'],
      dtype='object')

### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [17]:
# Split the data into training and testing sets with train_test_split,
# assigning a random_state of 1 so that the split is reproducible.
# NOTE(review): loan_status is heavily imbalanced (75036 vs. 2500 rows); passing
# stratify=y would preserve that ratio in both splits, but it would also change
# the recorded results, so the split itself is left unchanged here.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [22]:
# Scale the X data so model accuracy can be compared on scaled vs. unscaled data
scaler = StandardScaler()

# Fit the standard scaler on the training data only
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both the training and the testing data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [26]:
# Instantiate the Logistic Regression model, assigning a random_state of 1
LRmodel1 = LogisticRegression(solver='lbfgs', random_state=1)

# Fit (AKA train) the model using the unscaled training data;
# as the last expression of the cell, the fitted estimator is displayed
LRmodel1.fit(X_train, y_train)

LogisticRegression(random_state=1)

In [28]:
# Re-do using scaled data
# Instantiate a second Logistic Regression model, assigning a random_state of 1
LRmodel2 = LogisticRegression(solver='lbfgs', random_state=1)

# Fit (AKA train) the model using the scaled training data;
# as the last expression of the cell, the fitted estimator is displayed
LRmodel2.fit(X_train_scaled, y_train)


LogisticRegression(random_state=1)

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [27]:
# Predict labels for the unscaled testing data, then score them with plain accuracy
LRpredictions1 = LRmodel1.predict(X_test)
LRscore1 = accuracy_score(y_true=y_test, y_pred=LRpredictions1)
LRscore1

0.9918489475856377

In [29]:
# Predict labels for the SCALED testing data, then score them with plain accuracy
LRpredictions2 = LRmodel2.predict(X_test_scaled)
LRscore2 = accuracy_score(y_true=y_test, y_pred=LRpredictions2)
LRscore2

0.9936545604622369

In [35]:
# Measure model 1 (unscaled data) using the balanced accuracy score
BA_score1 = balanced_accuracy_score(y_test, LRpredictions1)
BA_score1

0.9520479254722232

In [36]:
# Measure model 2 (scaled data) using the balanced accuracy score
BA_score2 = balanced_accuracy_score(y_test, LRpredictions2)
BA_score2

0.9889115309798473

### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [37]:
# Evaluate the model trained on the non-scaled data:
# build a labelled confusion matrix, then show it with the scores and report
cm = confusion_matrix(y_test, LRpredictions1)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {LRscore1}")
print(f"Balanced Accuracy Score: {BA_score1}")
print("Classification Report")
report_unscaled = classification_report(y_test, LRpredictions1)
print(report_unscaled)


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,18663,102
Actual 1,56,563


Accuracy Score : 0.9918489475856377
Balanced Accuracy Score: 0.9520479254722232
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.85      0.91      0.88       619

    accuracy                           0.99     19384
   macro avg       0.92      0.95      0.94     19384
weighted avg       0.99      0.99      0.99     19384



In [38]:
# Evaluate the model trained on the scaled data:
# build a labelled confusion matrix, then show it with the scores and report
row_labels = ["Actual 0", "Actual 1"]
column_labels = ["Predicted 0", "Predicted 1"]
cm = confusion_matrix(y_test, LRpredictions2)
cm_df = pd.DataFrame(cm, index=row_labels, columns=column_labels)

print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {LRscore2}")
print(f"Balanced Accuracy Score: {BA_score2}")
print("Classification Report")
print(classification_report(y_test, LRpredictions2))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,18652,113
Actual 1,10,609


Accuracy Score : 0.9936545604622369
Balanced Accuracy Score: 0.9889115309798473
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.84      0.98      0.91       619

    accuracy                           0.99     19384
   macro avg       0.92      0.99      0.95     19384
weighted avg       0.99      0.99      0.99     19384



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

Of the 18,719 loans that the first (unscaled) model predicted to be healthy, 18,663 actually were healthy; of the 18,662 loans that the second (scaled) model predicted to be healthy, 18,652 actually were. The precision for the healthy-loan (`0`) label is therefore 100% (after rounding) for the first model and also 100% for the second model. In predicting healthy loans, the models perform similarly.

In predicting "high-risk" (`1`) loans, neither model performed quite as well. The precision of the first model (the share of loans it flagged as high-risk that really were high-risk) was 85% (563 / 665). The precision of the second model was 84% (609 / 722). There were fewer instances of high-risk loans for the model to learn from, which is likely the reason for this lower score. The recall (the number of high-risk loans the model correctly identified, divided by the total number of actual high-risk loans) was 91% (563 / (56 + 563)) for the first model and 98% (609 / (609 + 10)) for the second model.

In this case, model 2, in which the data was scaled before fitting, had not only a higher balanced accuracy score (a metric that accounts for the limited number of data points representing high-risk loans) but also a higher recall score in predicting high-risk loans. Therefore model 2 may be superior overall, particularly if identifying the individuals who are most likely to default on their loan is the top priority.


---