Because the problem is binary classification, i would like to use Logistic Regression with GridSearchCV(Cross-Validation) after doing data wrangling.

In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import boxcox
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import (roc_auc_score, f1_score, precision_score, recall_score, 
                             accuracy_score, confusion_matrix, precision_recall_fscore_support as score)

In [2]:
def load_dataset(file_path):
    """
    Load dataset from a CSV file.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File {file_path} does not exist.")
    return pd.read_csv(file_path)

def apply_transformations(df):
    transformed_df = df.copy()
    for feature in ['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term']:
        if feature in transformed_df.columns:
            transformed_df[feature] = transformed_df[feature] + 1e-5
            transformed_df[feature], _ = boxcox(transformed_df[feature])
            transformed_df[feature] = StandardScaler().fit_transform(transformed_df[[feature]])
    
    return transformed_df

In [3]:
file_path = 'loan_data_cleaned.csv'
try:
    df = load_dataset(file_path)
except FileNotFoundError as e:
    print(e)
    exit()

In [4]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1.0,1.0,1.0,1.0,0.0,4583.0,1508.0,128.0,360.0,1.0,0.0,0.0
1,1.0,1.0,0.0,1.0,1.0,3000.0,0.0,66.0,360.0,1.0,2.0,1.0
2,1.0,1.0,0.0,0.0,0.0,2583.0,2358.0,120.0,360.0,1.0,2.0,1.0
3,1.0,0.0,0.0,1.0,0.0,6000.0,0.0,141.0,360.0,1.0,2.0,1.0
4,1.0,1.0,0.0,0.0,0.0,2333.0,1516.0,95.0,360.0,1.0,2.0,1.0


In [5]:
X = df.drop(columns=['Loan_Status'])
y = df['Loan_Status']

# Split the dataset into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [6]:
X_train_transformed = apply_transformations(X_train)
X_test_transformed = apply_transformations(X_test)

In [7]:
# Convert X_train_transformed and X_test_transformed back to DataFrame

X_train_transformed = pd.DataFrame(X_train_transformed, columns=X_train_transformed.columns)
X_test_transformed = pd.DataFrame(X_test_transformed, columns=X_test_transformed.columns)

X_train_transformed.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
337,1.0,0.0,0.0,0.0,0.0,-0.761419,0.942775,0.734316,0.195494,1.0,0.0
331,1.0,1.0,3.0,1.0,1.0,1.473344,0.897555,-0.329234,0.195494,1.0,0.0
297,1.0,0.0,0.0,1.0,0.0,1.651967,-1.067121,1.396566,0.195494,1.0,0.0
246,1.0,1.0,0.0,1.0,0.0,-0.730192,0.947199,0.469572,0.195494,1.0,2.0
126,1.0,0.0,0.0,1.0,0.0,-0.829513,-1.067121,-1.554343,0.195494,1.0,2.0


In [8]:
param_grid = [{
    'C': np.logspace(-2, -1, 50),
    'penalty': ['l2','l1','elasticnet','none'],
    'solver': ['liblinear', 'lbfgs', 'saga'],
    'max_iter': [10, 20, 50],
    'class_weight': ['None', 'balanced']
}]

In [9]:
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=3, scoring='roc_auc', n_jobs=-1)

In [10]:
grid_search.fit(X_train_transformed, y_train)
best_model = grid_search.best_estimator_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [11]:
y_pred = best_model.predict(X_test_transformed)
y_prob = best_model.predict_proba(X_test_transformed)[:, 1]

In [12]:
precision, recall, fscore, _ = score(y_test, y_pred, average='binary')
accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)
cm = confusion_matrix(y_test, y_pred)

metrics = pd.Series({'precision': precision, 'recall': recall, 
                     'fscore': fscore, 'accuracy': accuracy,
                     'auc': auc})

print("Best Model Hyperparameters:")
print(grid_search.best_params_)

print("\nModel Metrics:")
print(metrics)


Best Model Hyperparameters:
{'C': 0.07196856730011521, 'class_weight': 'balanced', 'max_iter': 10, 'penalty': 'l1', 'solver': 'saga'}

Model Metrics:
precision    0.795181
recall       0.985075
fscore       0.880000
accuracy     0.812500
auc          0.717705
dtype: float64
