# üîÅ Loan Default Prediction (Balanced Logistic Regression with Threshold Tuning)

In [2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Load enhanced dataset
df = pd.read_csv("/Users/devanshudixit/Desktop/projects/BankIQ/data/processed/enhanced_customers.csv")
df.head()


Unnamed: 0,CustomerID,Name,Age,Gender,Income,Location,CreditScore,RiskProfile,RelationshipLength,MaritalStatus,...,ProductCount,ActiveProductCount,ProductEngagementScore,AvgTransactionAmount,TransactionFrequency,AvgLoanAmount,AvgEMItoIncomeRatio,HighRiskLoan,LoanBurdenScore,SupportFrequency
0,0148fa1c-5a7d-48a3-925c-2b06ffc907ce,Dominique Bradshaw,49,Other,108074,Leeland,573,High,17,Divorced,...,5,3.0,0.6,2105.35,11,0.0,0.0,0.0,0.0,1.0
1,9a82df3e-2406-4679-9dfc-e42f0888a627,Alicia Davis,26,Other,188870,Williamsonside,409,Low,13,Divorced,...,5,2.0,0.4,2864.696667,9,464040.0,0.004167,0.0,1933.5,5.0
2,de8116f2-4303-4e09-b3ea-16cef2dafcb8,Brooke Estes,56,Male,97001,New Stacyborough,463,High,12,Single,...,5,1.0,0.2,2592.950909,11,823343.0,0.003788,0.0,3118.72,4.0
3,ad3175a2-3c3c-4e6b-9008-9900673092a5,Ryan Harrell,58,Female,157205,Lake Emilyfort,462,Medium,15,Divorced,...,2,0.0,0.0,2252.98,7,0.0,0.0,0.0,0.0,2.0
4,c7c0c3ad-2259-4325-a779-60f89c0d7cc4,Christina Park,40,Male,194139,Port Kevin,538,High,18,Divorced,...,5,2.0,0.4,2704.130833,12,0.0,0.0,0.0,0.0,3.0


In [3]:

# View available columns
print(df.columns.tolist())


['CustomerID', 'Name', 'Age', 'Gender', 'Income', 'Location', 'CreditScore', 'RiskProfile', 'RelationshipLength', 'MaritalStatus', 'AgeGroup', 'IncomeBracket', 'RiskScore', 'ProductCount', 'ActiveProductCount', 'ProductEngagementScore', 'AvgTransactionAmount', 'TransactionFrequency', 'AvgLoanAmount', 'AvgEMItoIncomeRatio', 'HighRiskLoan', 'LoanBurdenScore', 'SupportFrequency']


In [5]:

# Filter rows with valid target
df = df.dropna(subset=["HighRiskLoan"])

# Select features
features = [
    "LoanBurdenScore", "AvgLoanAmount", "AvgEMItoIncomeRatio",
    "CreditScore", "Income", "Age"
]
target = "HighRiskLoan"

X = df[features]
y = df[target]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:

# Train Logistic Regression with class_weight='balanced'
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train, y_train)

# Predict probabilities
y_prob = model.predict_proba(X_test)[:, 1]


In [7]:

# Apply a lower threshold (0.3) for better recall
y_pred = (y_prob > 0.3).astype(int)

# Confusion matrix and metrics
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("ROC AUC Score:", roc_auc_score(y_test, y_prob))


Confusion Matrix:
[[113  68]
 [  1  18]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.99      0.62      0.77       181
         1.0       0.21      0.95      0.34        19

    accuracy                           0.66       200
   macro avg       0.60      0.79      0.55       200
weighted avg       0.92      0.66      0.73       200

ROC AUC Score: 0.8112823495202093
