In [1]:
#Objective: develop predictive models - logistic regression and tree-based machine learning classifiers (a decision tree and a random forest) - to estimate the probability of credit card default.

In [2]:
import os

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt


In [6]:
#Load the credit card default dataset
# The file location can be overridden with the CREDIT_DEFAULT_CSV environment
# variable so the notebook can run on other machines; when the variable is not
# set, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    "CREDIT_DEFAULT_CSV",
    r'C:\Users\Swetha\Downloads\default of credit card clients.csv',
)
df = pd.read_csv(DATA_PATH)

In [7]:
#Target variable distribution
# normalize=True reports each class as a share of all rows rather than a raw count.
df.loc[:, "default payment next month"].value_counts(normalize=True)

default payment next month
0    0.7788
1    0.2212
Name: proportion, dtype: float64

In [8]:
#Prepare predictors and target
# The target is the default flag; the predictors are every remaining column
# except the "ID" column.
target_col = "default payment next month"
y = df[target_col]
X = df.drop(columns=["ID", target_col])

In [9]:
#train-test split
# 30% of the rows are held out for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed random_state makes the
# split reproducible.
split_kwargs = {"test_size": 0.3, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)


In [10]:
#Standardize predictors (Logistic Regression only)

# Learn the scaling statistics from the training rows only, then apply that same
# fitted transformation to both partitions so no test-set information is used
# when fitting the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
#Sanity-check the standardized predictors (Logistic Regression only)
# The scaler is already fitted and applied in the preceding standardization
# cell; refitting it here on the same data only repeated identical work, so this
# cell validates the scaled arrays instead.
assert X_train_scaled.shape == X_train.shape
assert X_test_scaled.shape == X_test.shape
# The training features were centred with their own means, so every column mean
# of the scaled training data must be (numerically) zero.
assert np.allclose(np.asarray(X_train_scaled).mean(axis=0), 0.0, atol=1e-6)

In [16]:
#Logistic Regression fit and evaluation
# Fit on the standardized training features, allowing up to 1000 solver iterations.
logit = LogisticRegression(max_iter=1000)
logit.fit(X_train_scaled, y_train)

# Evaluation: column 1 of predict_proba is the predicted probability of class 1
# (default); the AUC is computed on the held-out test set.
y_prob_logit = logit.predict_proba(X_test_scaled)[:, 1]
auc_logit = roc_auc_score(y_test, y_prob_logit)

# A bare `auc_logit` expression used to sit here; it was never displayed because
# only the last expression of a cell is echoed, so the print below is the single
# source of output.
print(f"Logistic Regression AUC: {auc_logit:.3f}")



Logistic Regression AUC: 0.715


In [17]:
#Logistic Regression coefficient interpretation

# Pair every predictor name with its fitted coefficient (the model was trained on
# the standardized features), then order the table from the largest coefficient
# downwards.
coef_table = pd.DataFrame({"Variable": X.columns, "Coefficient": logit.coef_[0]})
coef_df = coef_table.sort_values("Coefficient", ascending=False)

# Display the ten largest (most positive) coefficients.
coef_df.iloc[:10]

Unnamed: 0,Variable,Coefficient
5,PAY_0,0.653654
13,BILL_AMT3,0.106742
6,PAY_2,0.104018
7,PAY_3,0.086926
4,AGE,0.058383
8,PAY_4,0.05653
16,BILL_AMT6,0.047317
14,BILL_AMT4,0.039001
10,PAY_6,0.023046
9,PAY_5,0.015472


In [18]:
#Decision Tree model
#Fits a nonlinear classification model.

# Depth and leaf-size limits keep the tree small; random_state fixes the seed so
# the fit is reproducible.
tree_params = {"max_depth": 5, "min_samples_leaf": 500, "random_state": 42}
tree = DecisionTreeClassifier(**tree_params)

# Trained on the unscaled predictors.
tree.fit(X_train, y_train)

In [24]:
#Decision Tree evaluation
#Measures predictive performance.
# Column 1 of predict_proba is the predicted probability of class 1 (default);
# the AUC is computed on the held-out test set.
y_prob_tree = tree.predict_proba(X_test)[:, 1]
auc_tree = roc_auc_score(y_test, y_prob_tree)

# A bare `auc_tree` expression used to precede this print; it produced no output
# because only the last expression of a cell is echoed, so it has been removed.
print(f"AUC: {auc_tree:.3f}")


AUC: 0.749


In [25]:
#Random Forest model
#Fits an ensemble of trees to improve prediction.

# 300 trees, each limited in depth and minimum leaf size; n_jobs=-1 is passed so
# that training may use all available cores, and random_state fixes the seed.
rf_params = {
    "n_estimators": 300,
    "max_depth": 10,
    "min_samples_leaf": 200,
    "random_state": 42,
    "n_jobs": -1,
}
rf = RandomForestClassifier(**rf_params)

# Trained on the unscaled predictors.
rf.fit(X_train, y_train)

In [27]:
#Random Forest evaluation
#Evaluates the random forest on the held-out test set.

# Column 1 of predict_proba is the predicted probability of class 1 (default).
y_prob_rf = rf.predict_proba(X_test)[:, 1]
auc_rf = roc_auc_score(y_test, y_prob_rf)

# A bare `auc_rf` expression used to precede this print; it produced no output
# because only the last expression of a cell is echoed, so it has been removed.
print(f"AUC: {auc_rf:.3f}")


AUC: 0.775


In [28]:
#Feature importance (Random Forest)
#Identifies the most influential predictors.

# Pair every predictor name with the forest's importance score for it, then
# order the table from most to least important.
importance_table = pd.DataFrame(
    {"Variable": X.columns, "Importance": rf.feature_importances_}
)
importances = importance_table.sort_values("Importance", ascending=False)

# Display the ten highest-ranked predictors.
importances.iloc[:10]

Unnamed: 0,Variable,Importance
5,PAY_0,0.330292
6,PAY_2,0.160583
7,PAY_3,0.086063
8,PAY_4,0.084908
9,PAY_5,0.069228
10,PAY_6,0.067789
0,LIMIT_BAL,0.035027
17,PAY_AMT1,0.034424
18,PAY_AMT2,0.02184
19,PAY_AMT3,0.01864


In [29]:
#Model comparison (objective verification)
#Compares models side by side.

# One (model name, test AUC) record per fitted model, in the order they were trained.
auc_by_model = [
    ("Logistic Regression", auc_logit),
    ("Decision Tree", auc_tree),
    ("Random Forest", auc_rf),
]
results = pd.DataFrame(auc_by_model, columns=["Model", "AUC"])

results

Unnamed: 0,Model,AUC
0,Logistic Regression,0.71503
1,Decision Tree,0.748869
2,Random Forest,0.775262


In [None]:
#Model ranking by test-set AUC: Random Forest > Decision Tree > Logistic Regression
#The logistic regression model achieved an AUC of 0.715, indicating reasonable baseline discriminatory power.
#The decision tree model improved performance to an AUC of 0.749, which suggests that it captures nonlinear effects the linear model misses.
#The random forest model performed best with an AUC of 0.775, highlighting the benefits of ensemble learning in credit risk prediction.
#These results illustrate a trade-off between interpretability and predictive accuracy: the more flexible, less interpretable models achieved the higher AUC.