In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

Prediction: The Random Forest Classifier model will be more accurate than the Logistic Regression model, because importance-based feature selection will remove any noisy features before the final model is trained.

In [2]:
# Read the lending dataset into a DataFrame
file_path = Path('Resources/lending_data.csv')
df = pd.read_csv(filepath_or_buffer=file_path)

# Show the first five rows as a quick sanity check of the load
df.head(n=5)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [3]:
# Separate features from the target, hold out a test set, and standardize

from sklearn.preprocessing import StandardScaler

# The target is the loan_status column; every other column is used as a feature
X = df.drop(columns=['loan_status'])
y = df['loan_status']

# Seeded split so the partitions are reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Fit the scaler on the training rows only, then apply it to both partitions
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))

In [4]:
# Logistic regression trained on the standardized features
from sklearn.linear_model import LogisticRegression

# Build the model with default settings and fit it on the training partition
classifier = LogisticRegression()
classifier.fit(X_train_scaled, y_train)

# Score both partitions, then report them
train_accuracy = classifier.score(X_train_scaled, y_train)
test_accuracy = classifier.score(X_test_scaled, y_test)

print(f'Training Score: {train_accuracy}')
print(f'Testing Score: {test_accuracy}')

Training Score: 0.9942908240473243
Testing Score: 0.9936545604622369


In [8]:
# Creating a Random Forest Classifier Model on importance-selected features
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Rank features with a forest fit on the training partition only.
# SelectFromModel (prefit=False) clones and fits its own copy of the estimator,
# so no separate up-front fit of the forest is needed.
sel = SelectFromModel(RandomForestClassifier(random_state=1, n_estimators=100))
sel.fit(X_train_scaled, y_train)

# Reuse the existing train/test split instead of splitting again: the selector
# was fit on the scaled arrays, so it is applied to those same arrays. This keeps
# y_train, y_test and scaler from the earlier cell untouched and avoids passing
# the selector a DataFrame it was not fit on. Standardization is per-column, so
# selecting columns after scaling yields the same values as scaling after selecting.
X_selected_train_scaled = sel.transform(X_train_scaled)
X_selected_test_scaled = sel.transform(X_test_scaled)

# Unscaled copies of the selected columns, kept under their previous names
selected_mask = sel.get_support()
X_selected_train = X_train.to_numpy()[:, selected_mask]
X_selected_test = X_test.to_numpy()[:, selected_mask]

# Final model: a forest trained on the selected features only
clf = RandomForestClassifier(random_state=1, n_estimators=100)
clf.fit(X_selected_train_scaled, y_train)

print(f'Training Score: {clf.score(X_selected_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_selected_test_scaled, y_test)}')



Training Score: 0.9974893382858715
Testing Score: 0.9916941807676434


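Conclusion: The Logistic Regression model ended up being slightly more accurate on the test set than the Random Forest model (0.9937 vs. 0.9917). One possible explanation is that all of the features added value, so removing features before training the Random Forest made it less accurate; this was not verified here, since a Random Forest trained on all features was not evaluated. The Random Forest also scored higher on the training set (0.9975) than on the test set, which suggests slight overfitting. Both models were very accurate; however, the Logistic Regression model ended up slightly outperforming.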