In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [2]:
# Load the lending dataset (stored under the Resources folder) into a DataFrame
df = pd.read_csv(Path('Resources/lending_data.csv'))

# Print the leading five rows as a quick sanity check of the load
print(df.head(n=5))

   loan_size  interest_rate  borrower_income  debt_to_income  num_of_accounts  \
0    10700.0          7.672            52800        0.431818                5   
1     8400.0          6.692            43600        0.311927                3   
2     9000.0          6.963            46100        0.349241                3   
3    10700.0          7.664            52700        0.430740                5   
4    10800.0          7.698            53000        0.433962                5   

   derogatory_marks  total_debt  loan_status  
0                 1       22800            0  
1                 0       13600            0  
2                 0       16100            0  
3                 1       22700            0  
4                 1       23000            0  


### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [4]:
from sklearn.model_selection import train_test_split

# Separate the data into labels and features.
# Labels (y): the "loan_status" column.
y = df['loan_status']

# Features (X): every column except "loan_status".
X = df.drop(columns=['loan_status'])

# Preliminary 70% / 30% train/test split so the review cells that follow
# have partitions to display.
# random_state=1 deliberately matches the Step 3 split further down: with a
# different seed here, the previews printed next would show a partition that
# the models are never trained or evaluated on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [13]:
# Preview the label Series: first rows of the training labels,
# followed by the first rows of the testing labels
print(y_train.head(), y_test.head(), sep="\n")

19980    0
41312    0
57169    0
7291     0
45213    0
Name: loan_status, dtype: int64
60914    0
36843    0
1966     0
70137    0
27237    0
Name: loan_status, dtype: int64


In [14]:
# Preview the feature DataFrame: first rows of the training features,
# followed by the first rows of the testing features
print(X_train.head(), X_test.head(), sep="\n")

       loan_size  interest_rate  borrower_income  debt_to_income  \
19980     7600.0          6.346            40300        0.255583   
41312    10300.0          7.507            51200        0.414062   
57169     9600.0          7.217            48500        0.381443   
7291     10200.0          7.445            50700        0.408284   
45213     9100.0          6.983            46300        0.352052   

       num_of_accounts  derogatory_marks  total_debt  
19980                2                 0       10300  
41312                4                 1       21200  
57169                4                 0       18500  
7291                 4                 1       20700  
45213                3                 0       16300  
       loan_size  interest_rate  borrower_income  debt_to_income  \
60914    12600.0          8.469            60300        0.502488   
36843     9800.0          7.289            49200        0.390244   
1966     10900.0          7.770            53700        0

### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [7]:
# Import train_test_split from scikit-learn's model_selection module
from sklearn.model_selection import train_test_split

# Partition the features and labels, holding out 30% for testing.
# A fixed random_state of 1 keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

# Report the shape of every partition as a check on the split
print("Training set shape (X):", X_train.shape)
print("Testing set shape (X):", X_test.shape)
print("Training set shape (y):", y_train.shape)
print("Testing set shape (y):", y_test.shape)


Training set shape (X): (54275, 7)
Testing set shape (X): (23261, 7)
Training set shape (y): (54275,)
Testing set shape (y): (23261,)


---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [8]:
# Bring in scikit-learn's logistic regression estimator
from sklearn.linear_model import LogisticRegression

# Build the classifier with a fixed random_state of 1
logistic_regression_model = LogisticRegression(random_state=1)

# Train the classifier on the training features and training labels
logistic_regression_model.fit(X=X_train, y=y_train)

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [9]:
# Predict labels for the held-out testing features (X_test)
# with the fitted logistic regression model
y_pred = logistic_regression_model.predict(X=X_test)

# Show the array of predicted labels
print(y_pred)

[0 0 0 ... 0 0 0]


### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [10]:
# Build the confusion matrix for the logistic regression predictions.
# scikit-learn lays it out with actual labels as rows and predicted
# labels as columns.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

Confusion Matrix:
[[22399   116]
 [   70   676]]


In [11]:
# Build the per-class precision / recall / f1 report for the
# logistic regression predictions, then print it under a heading
class_report = classification_report(y_true=y_test, y_pred=y_pred)
print("\nClassification Report:", class_report, sep="\n")


Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     22515
           1       0.85      0.91      0.88       746

    accuracy                           0.99     23261
   macro avg       0.93      0.95      0.94     23261
weighted avg       0.99      0.99      0.99     23261



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

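**Answer:** The logistic regression model predicts both labels well, but it does a considerably better job of identifying healthy loans (`0`) than high-risk loans (`1`). In the confusion matrix, rows are the actual labels and columns are the predicted labels. Of the 22,515 loans that were actually healthy, the model correctly identified 22,399 and incorrectly flagged only 116 as high-risk (precision 1.00, recall 0.99). Of the 746 loans that were actually high-risk, the model correctly identified 676 and misclassified 70 as healthy (precision 0.85, recall 0.91). The difference in accuracy between the two classes is significant, and it coincides with a heavy class imbalance: high-risk loans make up only about 3% of the test set. I would be curious whether the model's rate of incorrect decisions would increase or decrease with additional high-risk loans in the data.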

In [15]:
from sklearn.ensemble import RandomForestClassifier

# Build a random forest classifier; the fixed random_state of 1
# is there for reproducibility
random_forest_model = RandomForestClassifier(random_state=1)

# Train the forest on the training features and training labels
random_forest_model.fit(X=X_train, y=y_train)


In [16]:
# Predict labels for the held-out testing features with the fitted random forest
y_pred_rf = random_forest_model.predict(X=X_test)

In [17]:
from sklearn.metrics import classification_report

# Build the per-class precision / recall / f1 report for the
# random forest predictions, then print it under a heading
class_report_rf = classification_report(y_true=y_test, y_pred=y_pred_rf)
print("\nClassification Report for Random Forest Classifier:", class_report_rf, sep="\n")


Classification Report for Random Forest Classifier:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     22515
           1       0.85      0.89      0.87       746

    accuracy                           0.99     23261
   macro avg       0.92      0.94      0.93     23261
weighted avg       0.99      0.99      0.99     23261



In [18]:
from sklearn.ensemble import GradientBoostingClassifier

# Build a gradient boosting classifier; the fixed random_state of 1
# is there for reproducibility
gradient_boosting_model = GradientBoostingClassifier(random_state=1)

# Train the booster on the training features and training labels
gradient_boosting_model.fit(X=X_train, y=y_train)

In [19]:
# Predict labels for the held-out testing features with the fitted gradient boosting model
y_pred_gb = gradient_boosting_model.predict(X=X_test)

In [20]:
# Build the per-class precision / recall / f1 report for the
# gradient boosting predictions, then print it under a heading
class_report_gb = classification_report(y_true=y_test, y_pred=y_pred_gb)
print("\nClassification Report for Gradient Boosting Classifier:", class_report_gb, sep="\n")


Classification Report for Gradient Boosting Classifier:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     22515
           1       0.85      0.99      0.92       746

    accuracy                           0.99     23261
   macro avg       0.92      0.99      0.96     23261
weighted avg       0.99      0.99      0.99     23261



---