In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the lending dataset from its CSV file into a DataFrame, then preview the first rows
file = 'Resources/lending_data.csv'
credit_df = pd.read_csv(filepath_or_buffer=file)
credit_df.head(n=5)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [3]:
# Split the frame into the feature matrix (X) and the label column (y)
target_column = 'loan_status'
X = credit_df.drop(columns=[target_column])
y = credit_df[target_column]
X

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.430740,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000
...,...,...,...,...,...,...,...
77531,19100.0,11.261,86600,0.653580,12,2,56600
77532,17700.0,10.662,80900,0.629172,11,2,50900
77533,17600.0,10.595,80300,0.626401,11,2,50300
77534,16300.0,10.068,75300,0.601594,10,2,45300


In [4]:
# Hold out a test set from the features/labels; random_state=1 seeds the shuffle
# so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [5]:
# Standardize the features: fit the scaler on the training split only,
# then apply that same fitted scaler to both the training and testing splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

# Show both scaled arrays, training first, one after the other
print(X_train_scaled, X_test_scaled, sep="\n")

[[-0.57708952 -0.56367666 -0.5652314  ... -0.43489843 -0.67289855
  -0.5652314 ]
 [-0.95927354 -0.98302549 -0.98332378 ... -0.96014741 -0.67289855
  -0.98332378]
 [ 0.09173251  0.10413354  0.10371642 ...  0.09035056  1.04334691
   0.10371642]
 ...
 [ 0.18727852  0.19070153  0.18733489 ...  0.09035056  1.04334691
   0.18733489]
 [ 0.61723554  0.61229888  0.61737277 ...  0.61559954  1.04334691
   0.61737277]
 [ 0.37837052  0.3807014   0.37846284 ...  0.61559954  1.04334691
   0.37846284]]
[[ 1.33383057  1.32170668  1.32215708 ...  1.14084852  1.04334691
   1.32215708]
 [-0.00381349 -0.00491964 -0.00379305 ...  0.09035056 -0.67289855
  -0.00379305]
 [ 0.52168953  0.53584922  0.5337543  ...  0.61559954  1.04334691
   0.5337543 ]
 ...
 [-0.86372754 -0.86947527 -0.86386881 ... -0.96014741 -0.67289855
  -0.86386881]
 [ 0.80832755  0.79555319  0.79655522 ...  0.61559954  1.04334691
   0.79655522]
 [-0.29045151 -0.30284843 -0.30243047 ... -0.43489843 -0.67289855
  -0.30243047]]


## Prediction:
I predict that the Random Forest model will perform better than the Logistic Regression model. A Random Forest trains many smaller, simpler decision trees on random samples of the data and then averages their predictions, whereas Logistic Regression fits a single linear equation (which can be too general and less representative of the data) to predict a discrete set of classes/categories. Additionally, Random Forest models are better suited to handling outliers and many dimensions, and averaging across trees helps hedge against overfitting.

### Train a Logistic Regression model and print the model score

In [6]:
# Instantiate the logistic regression estimator, allowing the solver
# up to 10,000 iterations, and display its configuration
classifier = LogisticRegression(max_iter=10_000)
classifier

LogisticRegression(max_iter=10000)

In [7]:
# Fit/train the logistic regression model by using the training data.
# NOTE(review): this fits on the raw X_train, not the X_train_scaled array built
# earlier, while the random forest below is fit on X_train_scaled — confirm this
# is intentional. If this is switched to the scaled features, the cell that
# scores this model must be switched to the scaled splits as well.
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=10000)

In [8]:
# Score the fitted logistic regression on the training split and on the
# held-out testing split, then report both
lr_train_score = classifier.score(X_train, y_train)
lr_test_score = classifier.score(X_test, y_test)
print(f"Training Data Score: {lr_train_score}")
print(f"Testing Data Score: {lr_test_score}")

Training Data Score: 0.9921240885954051
Testing Data Score: 0.9918489475856377


### Train a Random Forest Classifier model and print the model score

In [9]:
# Create a random forest classifier model with 500 trees and a fixed seed
rf_classifier = RandomForestClassifier(
    n_estimators=500,
    random_state=1,
)

In [10]:
# Fit/train the random forest model by using the scaled training data
rf_classifier.fit(X_train_scaled, y_train)

RandomForestClassifier(n_estimators=500, random_state=1)

In [11]:
# Score the fitted random forest on the scaled training split and on the
# scaled testing split, then report both
rf_train_score = rf_classifier.score(X_train_scaled, y_train)
rf_test_score = rf_classifier.score(X_test_scaled, y_test)
print(f"Training Score: {rf_train_score}")
print(f"Testing Score: {rf_test_score}")

Training Score: 0.9975409272252029
Testing Score: 0.9917457697069748


## Results

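* **Logistic Regression Score:** 0.9921240885954051 (training), 0.9918489475856377 (testing)
* **Random Forest Score:** 0.9975409272252029 (training), 0.9917457697069748 (testing)
    * The Random Forest model scored higher on the training data, as predicted. On the held-out testing data, however, the two models are essentially tied, with Logistic Regression marginally ahead (0.99185 vs. 0.99175), so the prediction is supported only by the training scores. The gap between the Random Forest's training and testing scores also suggests slight overfitting.
    * Moving forward, as I am working with large datasets with multiple dimensions and would like to make predictions, a Random Forest model is likely still a reasonable option, but I should compare models on their testing scores rather than presume it will perform the best.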