#### Loan Default Prediction with Machine Learning 
Model A – Logistic Regression (Binary Classification Model)

In [11]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 28 22:39:28 2024

@author: Felipe J.

Loan Default Prediction with ML pt.4, Classification w/ Logistic Regression.

"""

# Import Packages and Data

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Location of the feature-engineered loans CSV.
# Set the LOAN_DATA_PATH environment variable to point at your own copy of the
# file; when it is unset, the path used in the original run is the fallback, so
# the notebook keeps working unchanged on the author's machine.
DATA_PATH = os.environ.get(
    'LOAN_DATA_PATH',
    '/Users/pjaram84/Desktop/Independent Pursuits/Machine Learning with CFI/Loan Default Prediction ML/data/vehicle_loans_feat_eng.csv',
)

# UNIQUEID is used as the row index rather than being kept as a data column.
loan_df = pd.read_csv(DATA_PATH, index_col='UNIQUEID')

#### Split the Data into Training and Test Sets

In [12]:
#%% Train Test Split

# Show the schema (column names, non-null counts, dtypes) before changing anything.
loan_df.info()

# Columns that hold unordered labels rather than quantities. Several of them are
# stored as integers (e.g. MANUFACTURER_ID and STATE_ID show up as int64 in the
# info() output), but the numbers are identifiers, not magnitudes.
category_cols = [
    'MANUFACTURER_ID',
    'STATE_ID',
    'DISBURSAL_MONTH',
    'DISBURSED_CAT',
    'PERFORM_CNS_SCORE_DESCRIPTION',
    'EMPLOYMENT_TYPE',
]

# Cast every listed column to the pandas 'category' dtype and write the result
# back into loan_df. 'category' is also more memory efficient than 'object'.
loan_df[category_cols] = loan_df[category_cols].astype(
    dict.fromkeys(category_cols, 'category')
)

# Exercise: work with a small frame of six columns -- five predictors (a mix of
# numerical and categorical data, including DISBURSED_CAT) plus the target.
loan_df_small = loan_df[[
    'TOTAL_DISBURSED_AMOUNT',
    'DISBURSED_CAT',
    'PERFORM_CNS_SCORE',
    'OVERDUE_PCT',
    'LTV',
    'LOAN_DEFAULT',
]]

# Predictors: everything except the target column.
# (`columns=` already says "drop along the column axis", so no `axis` is needed.)
X = loan_df_small.drop(columns='LOAN_DEFAULT')
# Target: the default flag.
y = loan_df_small['LOAN_DEFAULT']

# 80 % of the rows go to training, the rest to testing; the fixed seed makes
# the split repeatable. The order of the four returned pieces matters:
#   x_train, x_test, y_train, y_test
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=84,
)
# Not finished yet: DISBURSED_CAT is still a categorical column in these frames
# and has to be one-hot encoded before a model can be fitted.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 233154 entries, 420825 to 630213
Data columns (total 34 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   DISBURSED_AMOUNT                     233154 non-null  float64
 1   ASSET_COST                           233154 non-null  float64
 2   LTV                                  233154 non-null  float64
 3   MANUFACTURER_ID                      233154 non-null  int64  
 4   EMPLOYMENT_TYPE                      233154 non-null  object 
 5   STATE_ID                             233154 non-null  int64  
 6   MOBILENO_AVL_FLAG                    233154 non-null  int64  
 7   AADHAR_FLAG                          233154 non-null  int64  
 8   PAN_FLAG                             233154 non-null  int64  
 9   VOTERID_FLAG                         233154 non-null  int64  
 10  DRIVING_FLAG                         233154 non-null  int64  
 11  PASSPORT

#### One Hot Encoding categorical variables

In [13]:
#%% Dummy Variables – Variable Encoding
# Turn unordered categorical features into numeric data that the Logistic
# Regression model can work with.
#
# Fitting the classifier on the frame as it stands would fail because of the
# categorical column: the model does not know how to treat labels in place of
# numbers. One-hot encoding takes every distinct value of such a column, makes
# a separate 0/1 column for it, and marks for each row whether it has that value.

# One Hot Encode
# With no `columns=` argument, get_dummies converts the object/string/category
# dtype columns and passes the numeric ones through untouched.
# `drop_first=True` leaves out the first level of each encoded column.
# In practice this step would usually be done with scikit-learn's OneHotEncoder.
loan_data_dumm = pd.get_dummies(
    loan_df_small,
    prefix_sep='_ _ _',
    drop_first=True,
)

#%% Train and Validate
# Train/test split of the frame that now includes the one-hot encoded columns.
X_enc = loan_data_dumm.drop(columns='LOAN_DEFAULT')
Y_enc = loan_data_dumm['LOAN_DEFAULT']
x_train_enc, x_test_enc, y_train_enc, y_test_enc = train_test_split(
    X_enc,
    Y_enc,
    train_size=0.8,
    random_state=84,
)



In [14]:
#%% Build the Binary Classifier Model

# Logistic regression with scikit-learn's default hyper-parameters.
# NOTE(review): the predictors are fed in unscaled and the iteration limit is
# left at its default -- check the cell's stderr for a ConvergenceWarning and
# consider scaling the features if one appears.
our_logistic_model = LogisticRegression()         # Initialize
our_logistic_model.fit(x_train_enc, y_train_enc)  # Fit on the training split only.

#%% Generate Predictions
# Hard class labels for the held-out rows; kept for later inspection.
preds = our_logistic_model.predict(x_test_enc)

#%% Baseline for context
# Accuracy only means something relative to a trivial model. Take the most
# frequent class in the training labels and measure how often always predicting
# it would be right on the test labels.
majority_class = y_train_enc.mode().iloc[0]
baseline_accuracy = (y_test_enc == majority_class).mean()
print(f"Majority-class baseline accuracy: {baseline_accuracy * 100:.2f}%")

# Get Accuracy
# Fraction of test rows whose predicted label matches the actual label (about
# 0.78 in the recorded run). Compare it with the baseline printed above before
# calling the model good: if the two are close, the model adds little over
# always predicting the majority class.
our_logistic_model.score(x_test_enc, y_test_enc)

0.7836846732860114

In [15]:
# Keep the test-set accuracy in a variable so it can be reused, then report it
# as a percentage with two decimals.
model_accuracy = our_logistic_model.score(X=x_test_enc, y=y_test_enc)
print(f"Model Accuracy: {model_accuracy * 100:.2f}%")

Model Accuracy: 78.37%


In [16]:
#%% Summary:
# NOTE: the block below is a bare string expression, not a comment. Because it
# is the last expression in the cell, its repr (with escaped "\n") is echoed as
# the cell output. A markdown cell would render this recap as a proper list.
"""
Summary:
- We OneHotEncoded necessary Categorical Features
- We split our data into X and Y dataframes
- Then we used train_test_split function
- We fed our data to our LogisticRegression Model and made Predictions.
"""

'\nSummary:\n- We OneHotEncoded necessary Categorical Features\n- We split our data into X and Y dataframes\n- Then we used train_test_split function\n- We fed our data to our LogisticRegression Model and made Predictions.\n'