In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

In [2]:
# Import the data
df=pd.read_csv('data/lending_data.csv')
df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [3]:
# Split the data into X_train, X_test, y_train, y_test
y = df['loan_status']
X = df.drop('loan_status', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [4]:
X_train.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
75979,20000.0,11.635,90100,0.667037,13,3,60100
36522,8100.0,6.553,42300,0.29078,2,0,12300
56519,11000.0,7.781,53800,0.442379,5,1,23800
63054,6300.0,5.794,35100,0.145299,1,0,5100
71996,9000.0,6.965,46100,0.349241,3,0,16100


## Logistic Regression (linear classifier)

In [5]:
# Train a Logistic Regression model print the model score
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(X_train, y_train)
print(f"Training Score: {clf.score(X_train, y_train)}")
print(f"Testing Score: {clf.score(X_test, y_test)}")

Training Score: 0.9916597881414225
Testing Score: 0.99303549319026


In [7]:
from sklearn.metrics import confusion_matrix

y_true = y_test
y_pred = clf.predict(X_test)
confusion_matrix(y_true, y_pred)

array([[18693,    80],
       [   55,   556]], dtype=int64)

In [8]:
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print(f"True positives (TP): {tp}")
print(f"True negatives (TN): {tn}")
print(f"False positives (FP): {fp}")
print(f"False negatives (FN): {fn}")

accuracy = (tp + tn) / (tp + fp + tn + fn) # (111 + 128) / (111 + 5 + 128 + 6)
print(f"Accuracy: {accuracy}")

precision = tp / (tp + fp)
print(f"Precision: {precision}")

sensitivity = tp / (tp + fn)
print(f"Recall/Sensitivity: {sensitivity}")

f1 = 2*precision*sensitivity / (precision + sensitivity)
print(f"f1 Score: {f1}")


True positives (TP): 556
True negatives (TN): 18693
False positives (FP): 80
False negatives (FN): 55
Accuracy: 0.99303549319026
Precision: 0.8742138364779874
Sensitivity: 0.9099836333878887
f1 Score: 0.8917401764234162


In [9]:
from sklearn.metrics import classification_report
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18773
           1       0.87      0.91      0.89       611

    accuracy                           0.99     19384
   macro avg       0.94      0.95      0.94     19384
weighted avg       0.99      0.99      0.99     19384



## Random Forest

In [12]:
# Train a Random Forest Classifier model and print the model score
from sklearn.ensemble import RandomForestClassifier
rfclf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
print(f"Training Score: {clf.score(X_train, y_train)}")
print(f"Testing Score: {clf.score(X_test, y_test)}")

Training Score: 0.9971454120236621
Testing Score: 0.992777548493603


In [13]:
from sklearn.metrics import confusion_matrix

y_true = y_test
y_pred = rfclf.predict(X_test)
confusion_matrix(y_true, y_pred)

array([[18696,    77],
       [   63,   548]], dtype=int64)

In [14]:
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print(f"True positives (TP): {tp}")
print(f"True negatives (TN): {tn}")
print(f"False positives (FP): {fp}")
print(f"False negatives (FN): {fn}")

accuracy = (tp + tn) / (tp + fp + tn + fn) # (111 + 128) / (111 + 5 + 128 + 6)
print(f"Accuracy: {accuracy}")

precision = tp / (tp + fp)
print(f"Precision: {precision}")

sensitivity = tp / (tp + fn)
print(f"Sensitivity: {sensitivity}")

f1 = 2*precision*sensitivity / (precision + sensitivity)
print(f"f1 Score: {f1}")

True positives (TP): 548
True negatives (TN): 18696
False positives (FP): 77
False negatives (FN): 63
Accuracy: 0.992777548493603
Precision: 0.8768
Sensitivity: 0.8968903436988543
f1 Score: 0.8867313915857605


In [15]:
from sklearn.metrics import classification_report
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18773
           1       0.88      0.90      0.89       611

    accuracy                           0.99     19384
   macro avg       0.94      0.95      0.94     19384
weighted avg       0.99      0.99      0.99     19384



### Intersting example
https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html