In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [3]:
# Load the lending dataset from the Resources folder into a Pandas DataFrame
df = pd.read_csv(filepath_or_buffer='Resources/lending_data.csv')
# Preview the last five rows of the DataFrame
df.tail(n=5)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
77531,19100.0,11.261,86600,0.65358,12,2,56600,1
77532,17700.0,10.662,80900,0.629172,11,2,50900,1
77533,17600.0,10.595,80300,0.626401,11,2,50300,1
77534,16300.0,10.068,75300,0.601594,10,2,45300,1
77535,15600.0,9.742,72300,0.585062,9,2,42300,1


### Step 2: Create the labels set (`y`) from the `loan_status` column, and then create the features (`X`) DataFrame from the remaining columns.

In [11]:
# Split the DataFrame into the target labels and the feature matrix
# Labels (y): the loan_status column
y = df['loan_status']
# Features (X): a new DataFrame holding every column except loan_status
X = df.drop(columns='loan_status')

In [12]:
# Preview the first five labels of the y Series
y.head(5)

0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

In [14]:
# Preview the last five rows of the X features DataFrame
X.tail(n=5)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
77531,19100.0,11.261,86600,0.65358,12,2,56600
77532,17700.0,10.662,80900,0.629172,11,2,50900
77533,17600.0,10.595,80300,0.626401,11,2,50300
77534,16300.0,10.068,75300,0.601594,10,2,45300
77535,15600.0,9.742,72300,0.585062,9,2,42300


### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [15]:
# Import the train_test_split function
from sklearn.model_selection import train_test_split
# Partition the features and labels into training and testing subsets.
# random_state=1 makes the shuffle reproducible; stratify=y requests a split
# stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [16]:
# Import the LogisticRegression estimator from scikit-learn
from sklearn.linear_model import LogisticRegression
# Build the logistic regression model and fit it on the (unscaled) training data.
# fit() returns the estimator itself, so the call can be chained.
classifier = LogisticRegression(random_state=1, solver='lbfgs').fit(X_train, y_train)
# Display the fitted estimator
classifier

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [58]:
# Predict the labels of the testing features with the fitted model
predictions = classifier.predict(X=X_test)

### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [20]:
# Confusion matrix of the true test labels against the model's predictions
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[18673,    86],
       [   32,   593]], dtype=int64)

In [22]:
# Build the classification report for the model, then print it
original_report = classification_report(y_true=y_test, y_pred=predictions)
print(original_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18759
           1       0.87      0.95      0.91       625

    accuracy                           0.99     19384
   macro avg       0.94      0.97      0.95     19384
weighted avg       0.99      0.99      0.99     19384



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** The model predicts healthy loans (`0`) with close to 100% accuracy, but it struggles more with high-risk loans (`1`). It performs worst on the precision of high-risk loans (0.87), where it mistakenly classifies 86 loans as high-risk when they are healthy. It does a better job of identifying high-risk loans (recall of 0.95), with only 32 instances where it mistakenly classified high-risk loans as healthy. The overall accuracy is 99%, indicating that the model performs well overall, although that figure is dominated by the much larger healthy-loan class (18,759 healthy versus 625 high-risk loans in the test set).

---

In [24]:
# Retrieve the fitted coefficients of the logistic regression model and display them
coefficients = classifier.coef_
coefficients

array([[ 4.64130082e-03, -2.36221581e-03, -1.16620644e-03,
         2.88942198e-01, -2.12499607e-01,  1.67374945e+00,
         2.38221861e-04]])

In [32]:
# Pair each training feature name with its fitted coefficient
feature_importance = pd.DataFrame(
    {
        'Feature': list(X_train.columns),
        'Coefficient': list(coefficients[0]),
    }
)

In [33]:
# Rank the features by coefficient magnitude, largest first.
# NOTE: these coefficients come from the model fit on unscaled features, so
# their magnitudes also reflect each feature's units.
feature_importance = (
    feature_importance
    .assign(**{'Absolute Coefficient': lambda frame: frame['Coefficient'].abs()})
    .sort_values(by='Absolute Coefficient', ascending=False)
)
print(feature_importance)

            Feature  Coefficient  Absolute Coefficient
5  derogatory_marks     1.673749              1.673749
3    debt_to_income     0.288942              0.288942
4   num_of_accounts    -0.212500              0.212500
0         loan_size     0.004641              0.004641
1     interest_rate    -0.002362              0.002362
2   borrower_income    -0.001166              0.001166
6        total_debt     0.000238              0.000238


## Try Random Forest

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [35]:
# Create the StandardScaler instance (it is fitted to the training data in the next cell)
scaler = StandardScaler()

In [36]:
# Fit the StandardScaler on the training data only; the testing data is not
# used for fitting. The result of fit() is kept as X_scaler and is used for
# the transforms that follow.
X_scaler = scaler.fit(X_train)

In [37]:
# Scale both the training and the testing features with the scaler that was
# fitted on the training data
X_train_scaled = X_scaler.transform(X=X_train)
X_test_scaled = X_scaler.transform(X=X_test)

In [43]:
# Instantiate a random forest of 500 trees with a fixed seed for reproducibility
rf_model = RandomForestClassifier(random_state=1, n_estimators=500)

In [44]:
# Fit the random forest on the scaled training features.
# y_train is already a one-dimensional Series, so it is passed directly, as in
# the other fit() calls in this notebook. Series.ravel() is deprecated in
# pandas and raised a FutureWarning here.
rf_model = rf_model.fit(X_train_scaled, y_train)

  rf_model = rf_model.fit(X_train_scaled, y_train.ravel())


In [45]:
# Predict the labels of the scaled testing features with the random forest
rf_predictions = rf_model.predict(X=X_test_scaled)

In [46]:
# Confusion matrix of the true test labels against the random forest predictions
confusion_matrix(y_true=y_test, y_pred=rf_predictions)

array([[18680,    79],
       [   72,   553]], dtype=int64)

In [47]:
# Build the classification report for the random forest, then print it
rf_report = classification_report(y_true=y_test, y_pred=rf_predictions)
print(rf_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18759
           1       0.88      0.88      0.88       625

    accuracy                           0.99     19384
   macro avg       0.94      0.94      0.94     19384
weighted avg       0.99      0.99      0.99     19384



In [50]:
# Get the feature importance array from the fitted random forest
importances = rf_model.feature_importances_
# Pair each importance with its column name and rank the pairs, largest first
importances_sorted = sorted(zip(importances, X.columns), reverse=True)
# Show up to the ten highest-ranked features
importances_sorted[:10]

[(0.2856517805087359, 'interest_rate'),
 (0.18217568289246236, 'borrower_income'),
 (0.17161259891769354, 'debt_to_income'),
 (0.16256482001289166, 'total_debt'),
 (0.12244025118943648, 'loan_size'),
 (0.07543498689330673, 'num_of_accounts'),
 (0.00011987958547331872, 'derogatory_marks')]

## Try KNeighbors

In [48]:
from sklearn.neighbors import KNeighborsClassifier

In [49]:
# Instantiate a k-nearest-neighbors classifier that uses 3 neighbors
knn = KNeighborsClassifier(n_neighbors=3)

In [51]:
# Fit the k-nearest-neighbors model on the scaled training data
knn.fit(X=X_train_scaled, y=y_train)

In [52]:
# Predict the labels of the scaled testing features with the KNN model
knn_predictions = knn.predict(X=X_test_scaled)

In [53]:
# Confusion matrix of the true test labels against the KNN predictions
confusion_matrix(y_true=y_test, y_pred=knn_predictions)

array([[18677,    82],
       [   47,   578]], dtype=int64)

In [54]:
# Build the classification report for the KNN model, then print it
knn_report = classification_report(y_true=y_test, y_pred=knn_predictions)
print(knn_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18759
           1       0.88      0.92      0.90       625

    accuracy                           0.99     19384
   macro avg       0.94      0.96      0.95     19384
weighted avg       0.99      0.99      0.99     19384



## Try Logistic Regression with scaled data

In [55]:
# Build a second logistic regression model and fit it on the scaled training data.
# fit() returns the estimator itself, so the call can be chained.
classifier_scaled = LogisticRegression(random_state=1, solver='lbfgs').fit(X_train_scaled, y_train)
# Display the fitted estimator
classifier_scaled

In [61]:
# Predict the labels of the scaled testing features with the scaled-data model
predictions_scaled = classifier_scaled.predict(X=X_test_scaled)

In [62]:
# Confusion matrix of the true test labels against the scaled-data model's predictions
confusion_matrix(y_true=y_test, y_pred=predictions_scaled)

array([[18669,    90],
       [   14,   611]], dtype=int64)

In [63]:
# Build the classification report for the scaled-data model, then print it
scaled_report = classification_report(y_true=y_test, y_pred=predictions_scaled)
print(scaled_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18759
           1       0.87      0.98      0.92       625

    accuracy                           0.99     19384
   macro avg       0.94      0.99      0.96     19384
weighted avg       1.00      0.99      0.99     19384



In [65]:
# Retrieve the coefficients of the logistic regression model fit on scaled data
coefficients_scaled = classifier_scaled.coef_
# Pair each feature name with its coefficient, then rank by magnitude, largest first.
# This rebinds feature_importance, replacing the table built for the unscaled model.
feature_importance = (
    pd.DataFrame(
        {
            'Feature': list(X_train.columns),
            'Coefficient': list(coefficients_scaled[0]),
        }
    )
    .assign(**{'Absolute Coefficient': lambda frame: frame['Coefficient'].abs()})
    .sort_values(by='Absolute Coefficient', ascending=False)
)
print(feature_importance)

            Feature  Coefficient  Absolute Coefficient
3    debt_to_income     7.743048              7.743048
4   num_of_accounts    -0.859236              0.859236
2   borrower_income    -0.588518              0.588518
6        total_debt    -0.588518              0.588518
1     interest_rate    -0.587286              0.587286
0         loan_size    -0.426810              0.426810
5  derogatory_marks     0.289596              0.289596
