In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from pathlib import Path
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv("./Resources/lending_data.csv")
df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [3]:
# Count of rows with null values
df.isna().sum()

loan_size           0
interest_rate       0
borrower_income     0
debt_to_income      0
num_of_accounts     0
derogatory_marks    0
total_debt          0
loan_status         0
dtype: int64

In [4]:
y = df["loan_status"]
target_names = ["negative", "positive"]

In [5]:
X = df.drop("loan_status", axis=1)
feature_names = X.columns
X.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


In [6]:
# Assign X and y values
        
X = df.drop('loan_status', axis=1)
y = df['loan_status']

In [7]:
# Checking y 
y

0        0
1        0
2        0
3        0
4        0
        ..
77531    1
77532    1
77533    1
77534    1
77535    1
Name: loan_status, Length: 77536, dtype: int64

In [8]:
y.value_counts()

0    75036
1     2500
Name: loan_status, dtype: int64

In [9]:
# Split the data into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [10]:
X_train.shape, X_test.shape

((58152, 7), (19384, 7))

# Logistic Regression 

In [11]:
# Train a Logistic Regression model print the model score
# Step 1: Instantiate model
model = LogisticRegression()

In [12]:
# Normalize data using Standard scaler
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
# Re-fit the sclaed data
model.fit(X_train_scaled, y_train)

In [14]:
# Evaluate the model
print("Train R2 Score: ", model.score(X_train_scaled, y_train))
print("Test R2 Score: ", model.score(X_test_scaled, y_test))

Train R2 Score:  0.9941188609162196
Test R2 Score:  0.9941704498555509


In [15]:
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score
#Assess other metrics for the model
y_true = y_test
y_pred = model.predict(X_test_scaled)
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18792
           1       0.85      0.98      0.91       592

    accuracy                           0.99     19384
   macro avg       0.93      0.99      0.95     19384
weighted avg       0.99      0.99      0.99     19384



# Random Forest Classifier

In [16]:
# Fit a Random Forests classifier to the data
from sklearn.ensemble import RandomForestClassifier

# Step 1: Instantiate model
clf = RandomForestClassifier(random_state=1, n_estimators=500)

In [17]:
# Re-fit Random Forest classifier to the scaled data
clf.fit(X_train_scaled, y_train)

In [18]:
print(f"Training Score: {clf.score(X_train_scaled, y_train)}")
print(f"Testing Score: {clf.score(X_test_scaled, y_test)}")

Training Score: 0.9971970009629936
Testing Score: 0.991900536524969


In [19]:
#Assess other metrics for the model
y_true = y_test
y_pred = clf.predict(X_test_scaled)
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18792
           1       0.84      0.90      0.87       592

    accuracy                           0.99     19384
   macro avg       0.92      0.95      0.93     19384
weighted avg       0.99      0.99      0.99     19384



# Hyperparameter Tuning and Re-evaluation of models

## model = LogisticRegression()

In [20]:
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

# parameter grid
param_grid = {
    'tol' : [0.001, 0.01, 0.1], 
    'C': [0.1, 1, 10, 100],
    'solver'  : ['newton-cg', 'lbfgs', 'liblinear'],
}

In [21]:
grid_clf = GridSearchCV(model, param_grid, verbose=3)

In [22]:
grid_clf.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END C=0.1, solver=newton-cg, tol=0.001;, score=0.995 total time=   0.7s
[CV 2/5] END C=0.1, solver=newton-cg, tol=0.001;, score=0.993 total time=   0.3s
[CV 3/5] END C=0.1, solver=newton-cg, tol=0.001;, score=0.992 total time=   0.4s
[CV 4/5] END C=0.1, solver=newton-cg, tol=0.001;, score=0.993 total time=   0.3s
[CV 5/5] END C=0.1, solver=newton-cg, tol=0.001;, score=0.994 total time=   0.3s
[CV 1/5] END .C=0.1, solver=newton-cg, tol=0.01;, score=0.995 total time=   0.2s
[CV 2/5] END .C=0.1, solver=newton-cg, tol=0.01;, score=0.993 total time=   0.2s
[CV 3/5] END .C=0.1, solver=newton-cg, tol=0.01;, score=0.992 total time=   0.2s
[CV 4/5] END .C=0.1, solver=newton-cg, tol=0.01;, score=0.993 total time=   0.3s
[CV 5/5] END .C=0.1, solver=newton-cg, tol=0.01;, score=0.994 total time=   0.2s
[CV 1/5] END ..C=0.1, solver=newton-cg, tol=0.1;, score=0.995 total time=   0.5s
[CV 2/5] END ..C=0.1, solver=newton-cg, tol=0.1

In [23]:
print("Tuned Hyperparameters :", grid_clf.best_params_)
print("Accuracy :",grid_clf.best_score_)

Tuned Hyperparameters : {'C': 1, 'solver': 'newton-cg', 'tol': 0.001}
Accuracy : 0.9941188419804666


In [24]:
# Make predictions with the hypertuned model
predictions = grid_clf.predict(X_test_scaled)

In [25]:
#Assess other metrics for the model
y_true = y_test
y_pred = predictions
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18792
           1       0.85      0.98      0.91       592

    accuracy                           0.99     19384
   macro avg       0.93      0.99      0.95     19384
weighted avg       0.99      0.99      0.99     19384



## model = Random Forest Classifier()

In [40]:
param_grid1 = { 
    'n_estimators': [100, 200],
    'max_depth' : [4, 6, 8],
    'criterion' :['gini', 'entropy']
}

In [41]:
RFC_CV = GridSearchCV(estimator=clf, param_grid=param_grid1, verbose=3)


In [42]:
RFC_CV.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END criterion=gini, max_depth=4, n_estimators=100;, score=0.996 total time=   6.4s
[CV 2/5] END criterion=gini, max_depth=4, n_estimators=100;, score=0.994 total time=   3.7s
[CV 3/5] END criterion=gini, max_depth=4, n_estimators=100;, score=0.993 total time=   2.4s
[CV 4/5] END criterion=gini, max_depth=4, n_estimators=100;, score=0.994 total time=   2.7s
[CV 5/5] END criterion=gini, max_depth=4, n_estimators=100;, score=0.995 total time=   3.0s
[CV 1/5] END criterion=gini, max_depth=4, n_estimators=200;, score=0.996 total time=   4.9s
[CV 2/5] END criterion=gini, max_depth=4, n_estimators=200;, score=0.994 total time=   4.6s
[CV 3/5] END criterion=gini, max_depth=4, n_estimators=200;, score=0.993 total time=   4.2s
[CV 4/5] END criterion=gini, max_depth=4, n_estimators=200;, score=0.994 total time=   4.3s
[CV 5/5] END criterion=gini, max_depth=4, n_estimators=200;, score=0.995 total time=   4.4s
[CV 1/5] END criter

In [43]:
print("Tuned Hyperparameters :", RFC_CV.best_params_)
print("Accuracy :",RFC_CV.best_score_)

Tuned Hyperparameters : {'criterion': 'gini', 'max_depth': 4, 'n_estimators': 100}
Accuracy : 0.9943939865392195


In [44]:
# Make predictions with the hypertuned model
predictions1 = RFC_CV.predict(X_test_scaled)

In [45]:
#Assess other metrics for the model
y_true1 = y_test
y_pred1 = predictions1
print(classification_report(y_true1, y_pred1))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18792
           1       0.85      0.99      0.92       592

    accuracy                           0.99     19384
   macro avg       0.92      0.99      0.96     19384
weighted avg       1.00      0.99      0.99     19384

