In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [4]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
Lendingdf = pd.read_csv('Resources/lending_data.csv')

# Review the DataFrame
Lendingdf.head(10)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0
5,10100.0,7.438,50600,0.407115,4,1,20600,0
6,10300.0,7.49,51100,0.412916,4,1,21100,0
7,8800.0,6.857,45100,0.334812,3,0,15100,0
8,9300.0,7.096,47400,0.367089,3,0,17400,0
9,9700.0,7.248,48800,0.385246,4,0,18800,0


In [9]:
colummns = [x for x in list(Lendingdf.columns)[:-1]]

['loan_size',
 'interest_rate',
 'borrower_income',
 'debt_to_income',
 'num_of_accounts',
 'derogatory_marks',
 'total_debt']

### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [20]:
# Separate the data into labels and features

# Separate the y variable, the labels
y =Lendingdf[['loan_status']]


# Separate the X variable, the features
X = Lendingdf.copy()
X.drop('loan_status',inplace=True,axis=1)


In [24]:
# Review the y variable Series
y.tail(10)

Unnamed: 0,loan_status
77526,1
77527,1
77528,1
77529,1
77530,1
77531,1
77532,1
77533,1
77534,1
77535,1


In [25]:
# Review the X variable DataFrame
X.tail(10)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
77526,18300.0,10.895,83100,0.638989,11,2,53100
77527,20900.0,11.988,93400,0.678801,14,3,63400
77528,15100.0,9.557,70500,0.574468,9,2,40500
77529,19300.0,11.347,87400,0.656751,12,2,57400
77530,19700.0,11.508,88900,0.662542,13,2,58900
77531,19100.0,11.261,86600,0.65358,12,2,56600
77532,17700.0,10.662,80900,0.629172,11,2,50900
77533,17600.0,10.595,80300,0.626401,11,2,50300
77534,16300.0,10.068,75300,0.601594,10,2,45300
77535,15600.0,9.742,72300,0.585062,9,2,42300


### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [48]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=1)

---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [47]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
model = LogisticRegression(random_state=1)


# Fit the model using training data
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [46]:
# Make a prediction using the testing data
predictions = model.predict(X_test)
list(predictions)

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [37]:
# Generate a confusion matrix for the model
cmatrix = confusion_matrix(y_test, predictions)
cmatrix_df = pd.DataFrame(cmatrix)  
cmatrix_df

Unnamed: 0,0,1
0,18663,102
1,56,563


In [42]:
# Print the classification report for the model
print(f'Model Score: {round(model.score(X_test, y_test),2)}')
print('-'*100)
print(classification_report(y_test, predictions))
print('-'*100)

Model Score: 0.99
----------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.85      0.91      0.88       619

    accuracy                           0.99     19384
   macro avg       0.92      0.95      0.94     19384
weighted avg       0.99      0.99      0.99     19384

----------------------------------------------------------------------------------------------------


In [57]:
#Reviewing data contrast between predictions and real accuarcy
X_test['real'] = y_test
X_test['prediction']=predictions
X_test.head(50)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,real,prediction
60914,12600.0,8.469,60300,0.502488,6,1,30300,0,0
36843,9800.0,7.289,49200,0.390244,4,0,19200,0,0
1966,10900.0,7.77,53700,0.441341,5,1,23700,0,0
70137,10700.0,7.666,52700,0.43074,5,1,22700,0,0
27237,9900.0,7.353,49800,0.39759,4,0,19800,0,0
40013,10900.0,7.773,53700,0.441341,5,1,23700,0,0
43107,11500.0,8.015,56000,0.464286,5,1,26000,0,0
61988,8100.0,6.582,42500,0.294118,2,0,12500,0,0
57437,8000.0,6.534,42100,0.287411,2,0,12100,0,0
46757,10100.0,7.426,50500,0.405941,4,1,20500,0,0


In [66]:
#Prediction seems pretty accurable but lets review all of those inputs were model failed 
X_test[X_test['prediction']!=X_test['real']]

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,real,prediction
35204,19200.0,11.273,86700,0.653979,12,2,56700,0,1
70054,18000.0,10.788,82100,0.634592,11,2,52100,0,1
77349,14900.0,9.473,69700,0.569584,8,1,39700,1,0
75055,16000.0,9.935,74100,0.595142,9,2,44100,1,0
38933,16800.0,10.256,77100,0.610895,10,2,47100,0,1
...,...,...,...,...,...,...,...,...,...
59974,19600.0,11.465,88500,0.661017,13,2,58500,0,1
75254,15700.0,9.793,72800,0.587912,9,2,42800,1,0
76707,15300.0,9.609,71000,0.577465,9,2,41000,1,0
75324,16500.0,10.148,76100,0.605782,10,2,46100,1,0


### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** After training the data, seems the model predicts pretty well data, the model score shows 99% accuracy, and after reviewing the report, seems the model lacks on prediction with actual loan status as active


---