In [5]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [8]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
def load_data(file_path):
    path = Path(file_path)
    if path.is_file():
        return pd.read_csv(path, encoding='ISO-8859-1')
    else:
        print(f"Error: {file_path} not found.")
        return pd.DataFrame()  # Return empty DataFrame to prevent further errors

lending_data = load_data("Resources/lending_data.csv")

# Review the DataFrame
lending_data.columns

Index(['loan_size', 'interest_rate', 'borrower_income', 'debt_to_income',
       'num_of_accounts', 'derogatory_marks', 'total_debt', 'loan_status'],
      dtype='object')

### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [9]:
# Separate the data into labels and features

# Separate the y variable, the labels
labels = ["loan_status"]

# Separate the X variable, the features
features = lending_data.columns.difference(labels)

In [12]:
# Review the y variable Series
y = labels
print(y)

['loan_status']


In [13]:
# Review the X variable DataFrame
X = features
print(X)

Index(['borrower_income', 'debt_to_income', 'derogatory_marks',
       'interest_rate', 'loan_size', 'num_of_accounts', 'total_debt'],
      dtype='object')


### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [14]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
X_train, X_test, y_train, y_test = train_test_split(lending_data[X], lending_data[y], random_state=1)

# Assign a random_state of 1 to the function
# YOUR CODE HERE!

---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [15]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
# YOUR CODE HERE!

linear_model = LogisticRegression(random_state=1)

# Fit the model using training data
linear_model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [16]:
# Make a prediction using the testing data
# YOUR CODE HERE!
y_pred = linear_model.predict(X_test)

### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [17]:
# Generate a confusion matrix for the model
# YOUR CODE HERE!
confusion_matrix(y_test, y_pred)

array([[18663,   102],
       [   56,   563]])

In [18]:
# Print the classification report for the model
# YOUR CODE HERE!
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.85      0.91      0.88       619

    accuracy                           0.99     19384
   macro avg       0.92      0.95      0.94     19384
weighted avg       0.99      0.99      0.99     19384



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** healthy loan is predicted at 100% precision and are correctly classified but high risk loans are less accurate.  This makes sense because people who pay back their loans have more in common with each other than a high risk loan taker who are closer to the edges of a data set

---