# Credit Risk Classification

In [3]:
# load dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

## Split Data into Training and Testing Sets

#### 1 – Read and Review Data

In [5]:
# Load the lending dataset from the Resources folder into a DataFrame
lending_csv = Path('Resources/lending_data.csv')
lending_df = pd.read_csv(lending_csv)

# preview the first five rows (head() defaults to n=5)
lending_df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


#### 2 – Create Labels Set and Feature Matrix
Create the labels set, $y$, from the 'loan_status' column. Then create the features matrix, $X$, from the remaining DataFrame columns:

In [8]:
# the target labels are the 'loan_status' column ...
y = lending_df['loan_status']
# ... and the feature matrix is every remaining column
X = lending_df.drop(columns='loan_status')

In [19]:
# summary statistics for the label Series; the labels are 0/1, so the mean is the share of rows labelled 1
y.describe()

count    77536.000000
mean         0.032243
std          0.176646
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: loan_status, dtype: float64

In [20]:
# summary statistics (count, mean, std, min, quartiles, max) for each feature column
X.describe()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
count,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0
mean,9805.562577,7.292333,49221.949804,0.377318,3.82661,0.392308,19221.949804
std,2093.223153,0.889495,8371.635077,0.081519,1.904426,0.582086,8371.635077
min,5000.0,5.25,30000.0,0.0,0.0,0.0,0.0
25%,8700.0,6.825,44800.0,0.330357,3.0,0.0,14800.0
50%,9500.0,7.172,48100.0,0.376299,4.0,0.0,18100.0
75%,10400.0,7.528,51400.0,0.416342,4.0,1.0,21400.0
max,23800.0,13.235,105200.0,0.714829,16.0,3.0,75200.0


#### 3 – Check Balance of Label Variable
Check the balance of our target variable, $y$, using the `value_counts` method:

In [21]:
# count the rows under each loan_status label to see how balanced the target is
y.value_counts()

loan_status
0    75036
1     2500
Name: count, dtype: int64

#### 4 – Split Data
Split the data into training and testing sets using `train_test_split`:

In [22]:
# import the `train_test_split` helper
from sklearn.model_selection import train_test_split

# hold out 25% of the rows for testing (scikit-learn's default test size),
# seeding the shuffle with random_state=1 so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [26]:
# quick shape check: print the dimensions of each train/test partition
# to confirm the features and labels were split into matching row counts
print(f"""
X_train shape: {X_train.shape}
X_test shape: {X_test.shape}
y_train shape: {y_train.shape}
y_test shape: {y_test.shape}
""")


X_train shape: (58152, 7)
X_test shape: (19384, 7)
y_train shape: (58152,)
y_test shape: (19384,)



## Train and Test a Logistic Regression Model

### Train with Original Data

#### 1 – Fit a Logistic Regression to the Training Set
Fit a logistic regression model using the training data, `X_train` and `y_train`:

In [None]:
# import the LogisticRegression classifier
from sklearn.linear_model import LogisticRegression

# TODO: instantiate a logistic regression model
# assign a random_state of 1

# TODO: fit the model to the training data (`X_train`, `y_train`)


#### 2 – Predict Against the Testing Set
Use the fitted model to predict labels for the testing feature data, `X_test`, and save the predictions:

In [None]:
# make a prediction against the testing data


#### 3 – Evaluate Model Performance

* Calculate the balanced accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [None]:
# print the `balanced_accuracy` score of the model


In [None]:
# generate a confusion matrix for the model


In [None]:
# print the classification report for the model


#### 4 – Analysis
The fundamental question is: _How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?_

### Train with Resampled Data

#### 1 – Resample the Data
Use the `RandomOverSampler` class from the `imblearn` (imbalanced-learn) library to resample the training data. Confirm that the resampled labels have an equal number of data points:

In [None]:
# import the `RandomOverSampler` class
from imblearn.over_sampling import RandomOverSampler

# TODO: instantiate a RandomOverSampler
# assign a random_state of 1

# TODO: fit the random oversampler to the original training data and resample it


In [None]:
# count distinct values of the resampled labels data


#### 2 – Fit and Predict Using the Resampled Training Set

Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions:

In [None]:
# instantiate the logistic regression model
# assign a random_state of 1


# fit the model using the resampled training data


In [None]:
# make a prediction using the testing data


#### 3 – Evaluate Model Performance

* Calculate the balanced accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [None]:
# print the balanced_accuracy score of the model 


In [None]:
# generate a confusion matrix for the model


In [None]:
# print the classification report for the model


#### 4 – Analysis

The fundamental question is: _How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?_