# Credit Risk Evaluator


In [24]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


## Retrieve the Data

The data is located in the Challenge Files Folder:

- `lending_data.csv`

Import the data using Pandas. Display the resulting dataframe to confirm the import was successful.


In [38]:
# Load the raw lending data from the challenge resources folder and preview
# the first rows to confirm the import worked.
lending_csv_path = Path("Starter_Code/Resources/lending_data.csv")
lending_data_raw_df = pd.read_csv(lending_csv_path, low_memory=False)
lending_data_raw_df.head()


Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [26]:
# Summarise the column dtypes and non-null counts to check for missing values
lending_data_raw_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77536 entries, 0 to 77535
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   loan_size         77536 non-null  float64
 1   interest_rate     77536 non-null  float64
 2   borrower_income   77536 non-null  int64  
 3   debt_to_income    77536 non-null  float64
 4   num_of_accounts   77536 non-null  int64  
 5   derogatory_marks  77536 non-null  int64  
 6   total_debt        77536 non-null  int64  
 7   loan_status       77536 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 4.7 MB


In [27]:
# Inspect the rows that are exact repeats of an earlier row (every column identical).
# They are kept under their own name rather than `lending_data_cleaned_df`, because
# these rows are the duplicates themselves, not the cleaned data.
lending_data_duplicates_df = lending_data_raw_df[lending_data_raw_df.duplicated(keep="first")]
lending_data_duplicates_df

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
64,8500.0,6.728,43900,0.316629,3,0,13900,0
72,8600.0,6.773,44300,0.322799,3,0,14300,0
87,8400.0,6.700,43600,0.311927,3,0,13600,0
88,10800.0,7.698,53000,0.433962,5,1,23000,0
113,8600.0,6.778,44400,0.324324,3,0,14400,0
...,...,...,...,...,...,...,...,...
77529,19300.0,11.347,87400,0.656751,12,2,57400,1
77530,19700.0,11.508,88900,0.662542,13,2,58900,1
77532,17700.0,10.662,80900,0.629172,11,2,50900,1
77533,17600.0,10.595,80300,0.626401,11,2,50300,1


In [28]:
# Build the cleaned frame by keeping only the first occurrence of each fully
# identical row. The original row labels are retained (the index is not reset).
is_repeat_row = lending_data_raw_df.duplicated(keep="first")
lending_data_cleaned_df = lending_data_raw_df.loc[~is_repeat_row]
lending_data_cleaned_df

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.430740,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0
...,...,...,...,...,...,...,...,...
77524,16900.0,10.302,77500,0.612903,10,2,47500,1
77526,18300.0,10.895,83100,0.638989,11,2,53100,1
77528,15100.0,9.557,70500,0.574468,9,2,40500,1
77531,19100.0,11.261,86600,0.653580,12,2,56600,1


## Predict Model Performance

You will be creating and comparing two models on this data: a Logistic Regression, and a Random Forests Classifier. Before you create, fit, and score the models, make a prediction as to which model you think will perform better. You do not need to be correct!

Write down your prediction in the designated cells in your Jupyter Notebook, and provide justification for your educated guess.


_Replace the text in this markdown cell with your predictions, and be sure to provide justification for your guess._

--------------------------------------------------------------------------------------------------

In V2, I ran the models with duplicate rows dropped to see how the results changed. However, I note that there is no customer ID column, so it is possible that many of the apparently duplicated entries are actually different customers with the same data.

In this case I still think random forest will be the best model.

My rationale for this is that the data entered for different applicants can be quite different and even incomplete. Random forest handles raw datasets of this nature better.  
Random forests also minimise the amount of time required for preparing and cleaning datasets and can reduce bias in the modelling.


## Split the Data into Training and Testing Sets


In [29]:
# Separate the target from the predictors.
# y is the loan_status label column; X is every remaining column of the cleaned data.
y = lending_data_cleaned_df["loan_status"]
X = lending_data_cleaned_df.drop(columns=["loan_status"])

In [30]:
# Split the features and labels into training and testing sets.
# test_size=0.25 spells out scikit-learn's default hold-out fraction; the fixed
# random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

## Create, Fit and Compare Models

Create a Logistic Regression model, fit it to the data, and print the model's score. Do the same for a Random Forest Classifier. You may choose any starting hyperparameters you like.

Which model performed better? How does that compare to your prediction? Write down your results and thoughts in the designated markdown cell.


In [31]:
# Create a logistic regression model with scikit-learn's default hyperparameters.
# Ending the cell on the bare name displays the estimator's repr.
classifier = LogisticRegression()
classifier

LogisticRegression()

In [32]:
# Fit the logistic regression model on the training split.
# X_train is used unscaled here; the StandardScaler is only created in a later cell.
classifier.fit(X_train, y_train)

LogisticRegression()

In [33]:
# Score the fitted logistic regression on both splits so the training and
# testing results can be compared side by side.
lr_train_score = classifier.score(X_train, y_train)
lr_test_score = classifier.score(X_test, y_test)
print(f"Training Data Score: {lr_train_score}")
print(f"Testing Data Score: {lr_test_score}")

Training Data Score: 0.8923743942871717
Testing Data Score: 0.918960244648318


In [34]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [35]:
# Fit the scaler on the training features only, then apply that same fitted
# transform to both the training and the testing features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [36]:
# Build a random forest (100 trees, fixed seed), fit it on the scaled training
# features, then print its score on the training and testing splits.
clf = RandomForestClassifier(n_estimators=100, random_state=1)
clf.fit(X_train_scaled, y_train)
rf_train_score = clf.score(X_train_scaled, y_train)
rf_test_score = clf.score(X_test_scaled, y_test)
print(f'Training Score: {rf_train_score}')
print(f'Testing Score: {rf_test_score}')

Training Score: 0.9658250446314716
Testing Score: 0.8685015290519877


_Which model performed better? How does that compare to your prediction? Replace the text in this markdown cell with your answers to these questions._


The logistic regression model performed better: it scored higher on the testing data (0.919 vs 0.869), and the gap between its training and testing scores was smaller, at < 3%. Although the random forest method gave a better training result, there was a 10% gap between its training and testing scores, which suggests it overfit the training data. This is the opposite of what I thought would happen in my original prediction above.

I feel that cleaning the data and removing duplicates made the results look more realistic; however, I am concerned that the removed rows may not be true duplicates, since without a customer ID column it is impossible to know.