# Challenge 20: Credit Risk Clasification
### Note: Report in README.md"

In [8]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report

---

# Split the Data into Training and Testing Sets

## Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [13]:
# Load the data into a Pandas DataFrame
lending_data_df = pd.read_csv("Resources/lending_data.csv")

### - Display sample data

In [16]:
lending_data_df.head(10)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0
5,10100.0,7.438,50600,0.407115,4,1,20600,0
6,10300.0,7.49,51100,0.412916,4,1,21100,0
7,8800.0,6.857,45100,0.334812,3,0,15100,0
8,9300.0,7.096,47400,0.367089,3,0,17400,0
9,9700.0,7.248,48800,0.385246,4,0,18800,0


### - Review the DataFrame:
- find any null values: with 'any_nulls = 'lending_data_df.isnull().values.any()' -> 'print(any_nulls)'
- dataframe statistics with 'lending_data_df.describe()'
- dataframe rows and column number with 'lending_data_df.shape'
- dataframe data types with 'lending_data_df.info()'
- dataframe column headers with 'lending_data_df.columns'

### - find any null values:
- 'any_nulls ='lending_data_df.isnull().values.any()' -> 'print(any_nulls)'

In [20]:
any_nulls = lending_data_df.isnull().values.any()
print(any_nulls)

False


### - get dataframe statistics 
- 'lending_data_df.describe()'

In [23]:
lending_data_df.describe()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
count,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0
mean,9805.562577,7.292333,49221.949804,0.377318,3.82661,0.392308,19221.949804,0.032243
std,2093.223153,0.889495,8371.635077,0.081519,1.904426,0.582086,8371.635077,0.176646
min,5000.0,5.25,30000.0,0.0,0.0,0.0,0.0,0.0
25%,8700.0,6.825,44800.0,0.330357,3.0,0.0,14800.0,0.0
50%,9500.0,7.172,48100.0,0.376299,4.0,0.0,18100.0,0.0
75%,10400.0,7.528,51400.0,0.416342,4.0,1.0,21400.0,0.0
max,23800.0,13.235,105200.0,0.714829,16.0,3.0,75200.0,1.0


### - get number of dataframe rows and columns 
- 'lending_data_df.shape'

In [26]:
lending_data_df.shape

(77536, 8)

### - get dataframe data types 
- 'lending_data_df.info()'

In [29]:
lending_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77536 entries, 0 to 77535
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   loan_size         77536 non-null  float64
 1   interest_rate     77536 non-null  float64
 2   borrower_income   77536 non-null  int64  
 3   debt_to_income    77536 non-null  float64
 4   num_of_accounts   77536 non-null  int64  
 5   derogatory_marks  77536 non-null  int64  
 6   total_debt        77536 non-null  int64  
 7   loan_status       77536 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 4.7 MB


### - get dataframe column headers
- 'lending_data_df.columns'

In [32]:
lending_data_df.columns

Index(['loan_size', 'interest_rate', 'borrower_income', 'debt_to_income',
       'num_of_accounts', 'derogatory_marks', 'total_debt', 'loan_status'],
      dtype='object')

##  Step 2: Separate the data into labels and features

### Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

### - Separate the y variable, the labels
> 'y = lending_data_df['loan_status']'

In [197]:
y = lending_data_df['loan_status']

### - Separate the X variable, the features
> 'feature_cols = ['loan_size', 'interest_rate', 'borrower_income', 'debt_to_income',
       'num_of_accounts', 'derogatory_marks', 'total_debt']'<br>
> 'X = lending_data_df.loc[:,feature_cols] '

In [200]:
feature_cols = ['loan_size', 'interest_rate', 'borrower_income', 'debt_to_income',
       'num_of_accounts', 'derogatory_marks', 'total_debt']
X = lending_data_df.loc[:,feature_cols] 

### - Review the y variable Series ('0' means OK; '1' means 'default')

In [202]:
y[:5]

0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

### - Review the X variable DataFrame


In [204]:
X.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


## Step 3: Split the data into training and testing datasets by using `train_test_split`.

### - Import the train_test_learn module

In [207]:
from sklearn.model_selection import train_test_split

### - Split the data using train_test_split; Assign a random_state of 1 to the function

In [213]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state = 42)

### - Check for correct number of records in the X_train set

In [216]:
train_set_size = len(X_train)
print(f"The number of records in X_train: {train_set_size}")
data_set_size = lending_data_df.shape[0]
print(f"The number of records i data set: {lending_data_df.shape[0]}")
fraction_of_data_in_training_set = train_set_size/ data_set_size
print(f"Fraction of records in training set: {fraction_of_data_in_training_set}")

The number of records in X_train: 58152
The number of records i data set: 77536
Fraction of records in training set: 0.75


---

# Create a Logistic Regression Model with the Original Data

##  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

### -Import the LogisticRegression module from SKLearn

In [222]:
from sklearn.linear_model import LogisticRegression

### - Instantiate the Logistic Regression model Assign a random_state parameter of 1 to the model

In [225]:
logreg_model = LogisticRegression()

### - Fit the model using training data

In [228]:
logreg_model.fit(X_train, y_train)

## Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

### - Make a prediction using the testing data

In [246]:
train_predictions = logreg_model.predict(X_train)
test_predictions = logreg_model.predict(X_test)
print(f"Training predictions: {train_predictions}")
print(f"Test predictions: {test_predictions}")

Training predictions: [0 0 0 ... 1 0 0]
Test predictions: [0 0 0 ... 0 0 0]


## Step 3: Evaluate the model’s performance by doing the following:
1. Generate a confusion matrix.
2. Print the classification report.

### - Generate a confusion matrix for the model with training data

In [249]:
training_confusion_matrix = confusion_matrix(y_train, train_predictions)
print(training_confusion_matrix)

[[55944   300]
 [  109  1799]]


### - Generate a confusion matrix for the model with test data

In [254]:
test_confusion_matrix = confusion_matrix(y_test, test_predictions)
print(test_confusion_matrix)

[[18694    98]
 [   33   559]]


### - Print the training classification report for the model

In [261]:
training_report = classification_report(y_train, train_predictions)
print(training_report)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     56244
           1       0.86      0.94      0.90      1908

    accuracy                           0.99     58152
   macro avg       0.93      0.97      0.95     58152
weighted avg       0.99      0.99      0.99     58152



### - Print the testing classification report for the model

In [266]:
testing_report = classification_report(y_test, test_predictions)
print(testing_report)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18792
           1       0.85      0.94      0.90       592

    accuracy                           0.99     19384
   macro avg       0.92      0.97      0.95     19384
weighted avg       0.99      0.99      0.99     19384



## Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** WRITE YOUR ANSWER HERE!

---