## Logistic Regression

We try to predict defaulters here using `logistic regression`. A logistic regression model is well suited to this task because it is a binary classification problem: each record belongs to one of only two classes, defaulter or non-defaulter.

In [1]:
import os
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
def split_data(data):
    """Split a labelled frame into 80% training, 10% validation and 10% test sets.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``TARGET`` column with the class label. Every
        other column is used as a feature.

    Returns
    -------
    tuple
        ``(x_train, x_test, x_validation, y_train, y_test, y_validation)``.
    """
    # Select the features by name instead of by position: a positional slice
    # would leak the label into the features whenever TARGET is not the first
    # column of the frame.
    x = data.drop(columns='TARGET')
    y = data['TARGET']
    # First hold out 20% of the rows, then halve that hold-out into the
    # validation and test sets. Both splits are seeded for repeatability.
    x_train, x_rem, y_train, y_rem = train_test_split(x, y, random_state=0, test_size=0.2)
    x_validation, x_test, y_validation, y_test = train_test_split(x_rem, y_rem, random_state=0, test_size=0.5)
    return x_train, x_test, x_validation, y_train, y_test, y_validation

In [3]:
def load_model(file_name):
    """Load and return the object pickled in ``file_name``.

    Parameters
    ----------
    file_name : str
        Path of the pickle file to read.

    Returns
    -------
    object
        The unpickled object.

    Notes
    -----
    Unpickling can execute arbitrary code, so only load files that this
    notebook (or another trusted source) produced.
    """
    # The context manager closes the file handle even if unpickling fails.
    with open(file_name, 'rb') as model_file:
        return pickle.load(model_file)

In [4]:
def save_model(model, file_name):
    """Pickle ``model`` to ``file_name``, overwriting any existing file.

    Parameters
    ----------
    model : object
        Any picklable object (here, a fitted estimator).
    file_name : str
        Path of the file to write.
    """
    # The context manager flushes and closes the file before returning, so the
    # pickle is complete on disk when it is read back.
    with open(file_name, 'wb') as model_file:
        pickle.dump(model, model_file)

In [5]:
# Train logistic regression model with hyperparam tuning
def train_log_reg_model(x_train, x_validation, y_train, y_validation):
    """Search LogisticRegression settings and return the best fitted model.

    Each solver is combined with every penalty listed for it. Penalised
    candidates try C = 10**-3 ... 10**3, unpenalised candidates use C = 1.0
    only, and elastic-net candidates additionally try l1_ratio = 0.1 ... 0.9.
    Every candidate is fitted on the training data and scored with ``score``
    on the validation data; the first candidate reaching the highest score
    is returned.
    """
    # Penalties tried for each solver.
    penalties_by_solver = {
        'liblinear': ['l1', 'l2'],
        'sag': ['l2', None],
        'saga': ['l1', 'l2', 'elasticnet', None],
        'lbfgs': ['l2', None],
        'newton-cg': ['l2', None],
        'newton-cholesky' : ['l2', None]
    }

    def build_candidates():
        """Yield unfitted estimators in the order they are to be evaluated."""
        for solver_name, allowed_penalties in penalties_by_solver.items():
            # liblinear is the only solver that is not given n_jobs=-1
            workers = -1 if solver_name != 'liblinear' else None
            for penalty_name in allowed_penalties:
                if penalty_name is None:
                    c_values = [1.0]
                else:
                    c_values = [10 ** exponent for exponent in range(-3, 4)]
                if penalty_name == 'elasticnet':
                    mixing_ratios = [tenth / 10 for tenth in range(1, 10)]
                else:
                    mixing_ratios = [None]
                for c_value in c_values:
                    for mixing_ratio in mixing_ratios:
                        yield LogisticRegression(solver=solver_name, penalty=penalty_name,
                                                 C=c_value, random_state=0, max_iter=4000,
                                                 n_jobs=workers, l1_ratio=mixing_ratio)

    best_model, best_score = None, float('-inf')
    for candidate in build_candidates():
        candidate.fit(x_train, y_train)
        validation_score = candidate.score(x_validation, y_validation)
        # Strict comparison: on ties the earlier candidate is kept.
        if validation_score > best_score:
            best_model, best_score = candidate, validation_score
    return best_model
                    

In [6]:
def plot_cm(model, x_test, y_test):
    """Plot the confusion matrix of ``model`` on the test data.

    The plot title reports the accuracy returned by ``model.score`` together
    with the true/false positive and negative rates, all as percentages
    rounded to two decimals. Class 1 (defaulter) is treated as the positive
    class.

    Parameters
    ----------
    model : fitted classifier
        Estimator exposing ``predict`` and ``score``.
    x_test, y_test
        Test features and their true labels.
    """
    def _percent(count, total):
        # Share of ``count`` in ``total`` as a percentage; NaN when the class
        # has no samples, instead of failing with a division by zero.
        if total == 0:
            return float('nan')
        return round((count / total) * 100, 2)

    class_labels = ['Non Defaulter', 'Defaulter']
    disp = ConfusionMatrixDisplay.from_estimator(model, x_test, y_test,
                                                 cmap="Blues", display_labels=class_labels)
    score = round(model.score(x_test, y_test) * 100, 2)
    # Read the counts from the matrix itself rather than from the rendered
    # cell text: the text may be formatted in scientific notation for large
    # counts, which int() cannot parse. Rows are true labels, columns are
    # predicted labels.
    matrix = disp.confusion_matrix
    true_negatives = int(matrix[0][0])
    false_positives = int(matrix[0][1])
    false_negatives = int(matrix[1][0])
    true_positives = int(matrix[1][1])
    total_non_defaulters = true_negatives + false_positives
    total_defaulters = true_positives + false_negatives
    true_positive_rate = _percent(true_positives, total_defaulters)
    false_negative_rate = _percent(false_negatives, total_defaulters)
    true_negative_rate = _percent(true_negatives, total_non_defaulters)
    false_positive_rate = _percent(false_positives, total_non_defaulters)
    disp.ax_.set_title('Confusion matrix of logistic regression model\n'
                       + f'Accuracy score: {score}%\n'
                       + f'True positive rate: {true_positive_rate}%\n'
                       + f'False negative rate: {false_negative_rate}%\n'
                       + f'True negative rate: {true_negative_rate}%\n'
                       + f'False positive rate: {false_positive_rate}%')
    plt.show()

In [7]:
# Load the cleaned dataset from its CSV file
data = pd.read_csv(filepath_or_buffer="data/cleaned_final.csv")

In [8]:
# Sample 100% of data (full data), i.e. shuffle every row.
# The shuffle is seeded so the row order - and therefore the seeded
# train/validation/test split built from it - is the same on every run.
# Without a seed, a model cached on disk by an earlier run could be evaluated
# on rows it was trained on.
full_shuffled = data.sample(frac=1, random_state=0).reset_index(drop=True)
full_shuffled

Unnamed: 0,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,HOUR_APPR_PROCESS_START,...,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21
0,0,0.000000,0.002470,0.182941,0.392880,0.558117,0.044619,0.884444,0.372794,0.695652,...,0,0,0,0,0,0,0,0,0,0
1,0,0.000000,0.000931,0.051627,0.143358,0.197033,1.000000,0.591359,0.309296,0.782609,...,0,0,0,0,0,0,0,0,0,0
2,0,0.000000,0.000816,0.214366,0.097136,0.465906,0.043888,0.644536,0.923162,0.695652,...,0,0,0,0,0,0,0,0,0,0
3,0,0.052632,0.000794,0.068462,0.421848,0.584308,0.045433,0.997852,0.349590,0.608696,...,0,0,0,0,0,0,0,0,0,0
4,0,0.000000,0.000546,0.178451,0.429796,0.274408,1.000000,0.688797,0.452550,0.478261,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244275,0,0.157895,0.000739,0.034792,0.038481,0.845307,0.045206,0.925219,0.617341,0.695652,...,0,0,0,0,0,0,0,0,0,0
244276,0,0.000000,0.002085,0.158249,0.129331,0.529358,0.041341,0.592372,0.964291,0.521739,...,0,1,0,0,0,0,0,0,0,0
244277,0,0.000000,0.001123,0.079686,0.245631,0.320628,0.039431,0.814162,0.606503,0.434783,...,0,0,0,0,0,0,0,0,0,0
244278,0,0.000000,0.001700,0.102132,0.344429,0.863509,0.046222,0.997082,0.618452,0.434783,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Partition the shuffled data: 80% training, 10% test and 10% validation
(x_train_full, x_test_full, x_validation_full,
 y_train_full, y_test_full, y_validation_full) = split_data(data=full_shuffled)

# Report the shape of every partition on a single line
print(*[part.shape for part in (x_train_full, x_test_full, x_validation_full,
                                y_train_full, y_test_full, y_validation_full)])

(195424, 156) (24428, 156) (24428, 156) (195424,) (24428,) (24428,)


In [None]:
# Reuse the model cached on disk when its file exists; otherwise train a new
# logistic regression model on the full data and cache it for later runs
full_model_file_name = 'log_reg_predictor_full.sav'
path = os.path.join(os.getcwd(), full_model_file_name)
trained_model_full = None
if not os.path.exists(path):
    trained_model_full = train_log_reg_model(x_train=x_train_full,
                                             x_validation=x_validation_full,
                                             y_train=y_train_full,
                                             y_validation=y_validation_full)
    save_model(model=trained_model_full, file_name=full_model_file_name)
else:
    trained_model_full = load_model(file_name=full_model_file_name)

In [None]:
# Display the selected full-data model and its accuracy on the held-out test set
print("Trained Model Full:", trained_model_full)
print("Accuracy:", trained_model_full.score(X=x_test_full, y=y_test_full))

In [None]:
# Confusion matrix of the full-data model on the held-out test set
plot_cm(model=trained_model_full, x_test=x_test_full, y_test=y_test_full)

As seen above, the true positive rate is very low. That is, the model correctly identifies only a small fraction of the actual defaulters (low recall). This is likely due to an under-representation of defaulters in the dataset (there is not enough data on defaulters to accurately predict a defaulter). Hence, let's try using a dataset that has an equal number of defaulters and non-defaulters and see if this can improve the true positive rate.

In [None]:
# create a subset of data so that defaulters and non-defaulters are 1:1
defaulters = data.loc[data['TARGET'] == 1]
non_defaulters = data.loc[data['TARGET'] == 0]
# Seed the down-sampling so the same non-defaulters are picked on every run;
# otherwise a model cached on disk by an earlier run could be evaluated on
# rows it was trained on.
non_defaulters_subset = non_defaulters.sample(len(defaulters), random_state=0)
# Shuffle after concatenating so the two classes are actually interleaved,
# as the variable name promises.
subset_shuffled = (
    pd.concat([non_defaulters_subset, defaulters])
    .sample(frac=1, random_state=0)
    .reset_index(drop=True)
)
subset_shuffled

In [None]:
# Partition the balanced subset: 80% training, 10% test and 10% validation
(x_train_subset, x_test_subset, x_validation_subset,
 y_train_subset, y_test_subset, y_validation_subset) = split_data(data=subset_shuffled)

# Report the shape of every partition on a single line
print(*[part.shape for part in (x_train_subset, x_test_subset, x_validation_subset,
                                y_train_subset, y_test_subset, y_validation_subset)])

In [None]:
# Reuse the model cached on disk when its file exists; otherwise train a new
# logistic regression model on the balanced subset and cache it for later runs
subset_model_file_name = 'log_reg_predictor_subset.sav'
path = os.path.join(os.getcwd(), subset_model_file_name)
trained_model_subset = None
if not os.path.exists(path):
    trained_model_subset = train_log_reg_model(x_train=x_train_subset,
                                               x_validation=x_validation_subset,
                                               y_train=y_train_subset,
                                               y_validation=y_validation_subset)
    save_model(model=trained_model_subset, file_name=subset_model_file_name)
else:
    trained_model_subset = load_model(file_name=subset_model_file_name)

In [None]:
# Display the selected subset model and its accuracy on the held-out test set
print("Trained Model Subset:", trained_model_subset)
print("Accuracy:", trained_model_subset.score(X=x_test_subset, y=y_test_subset))

In [None]:
# Confusion matrix of the subset model on the held-out test set
plot_cm(model=trained_model_subset, x_test=x_test_subset, y_test=y_test_subset)