In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [2]:
# Load the delinquency dataset from an Excel workbook. The path is relative,
# so it is resolved against the notebook's current working directory.
DATA_PATH = 'Updated_Delinquency_Dataset.xlsx'
df = pd.read_excel(DATA_PATH)

In [3]:
# Split the frame into predictors (X) and the label (y).
# 'Customer_ID' is removed together with the target so neither is used as a feature.
TARGET_COLUMN = 'Delinquent_Account'
X = df.drop(columns=['Customer_ID', TARGET_COLUMN])
y = df[TARGET_COLUMN]

In [4]:
# Hold out 20% of the rows as a test set. `stratify=y` asks the splitter to
# keep the class proportions of y in both partitions; the fixed seed makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [5]:
# Columns routed through the numeric preprocessing branch (impute + scale).
numerical_features = [
    'Missed_Payments',
    'Credit_Score',
    'Credit_Utilization',
    'Debt_to_Income_Ratio',
]

In [6]:
# Numeric branch: fill missing values with the column median, then standardise.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Only `numerical_features` are transformed and passed on. `remainder='drop'`
# is the ColumnTransformer default, spelled out here so it is visible that
# every other column of X is discarded before reaching the classifier.
preprocessor = ColumnTransformer(
    transformers=[('num', num_pipeline, numerical_features)],
    remainder='drop',
)

## Random Forest Fine-Tuning

In [7]:
# Random forest with a fixed seed; class_weight='balanced' lets scikit-learn
# reweight the classes from their frequencies in the training labels.
rf_classifier = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
)

# End-to-end estimator: preprocessing followed by the classifier. The step
# name 'classifier' is the prefix used by the keys of the tuning grid.
model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', rf_classifier),
    ]
)

In [8]:
# Hyper-parameter grid for the 'classifier' pipeline step
# (3 x 3 x 2 = 18 candidate combinations).
param_grid = {
    # number of trees in the forest
    'classifier__n_estimators': [100, 150, 200],
    # maximum depth of each tree
    'classifier__max_depth': [3, 5, 7],
    # minimum number of samples required at a leaf
    'classifier__min_samples_leaf': [5, 10],
}

In [9]:
# Exhaustive search over `param_grid`, ranking candidates by ROC AUC under
# 5-fold cross-validation. n_jobs=-1 uses all available cores; verbose=2
# makes the search log its progress while fitting.
grid_search_rf = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [10]:
# Run the grid search on the training split only; the test split stays unseen.
grid_search_rf.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [11]:
# Report the winning hyper-parameters and their score. `best_score_` is the
# mean cross-validated ROC AUC of the best candidate, computed on folds of
# the training split.
best_params = grid_search_rf.best_params_
best_cv_auc = grid_search_rf.best_score_

print("\nBest parameters found:")
print(best_params)
print(f"Best cross-validation (training) AUC score: {best_cv_auc}")


Best parameters found:
{'classifier__max_depth': 3, 'classifier__min_samples_leaf': 5, 'classifier__n_estimators': 200}
Best cross-validation (training) AUC score: 0.5052477994642175


In [12]:
# Evaluate the tuned model on the held-out test split.
# The banner names the estimator actually being evaluated: the search tunes
# a RandomForestClassifier pipeline, not a decision tree.
print("\n--- Evaluation of Best Random Forest Model on Test Set ---")

# best_estimator_ is the pipeline chosen by the grid search.
best_rf_model = grid_search_rf.best_estimator_

# Hard class predictions feed the report and confusion matrix.
rf_y_pred = best_rf_model.predict(X_test)
# Column 1 of predict_proba is the score for the second class
# (label 1 here, as the labels are 0 and 1); ROC AUC needs scores, not labels.
rf_y_pred_proba = best_rf_model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, rf_y_pred))
print(confusion_matrix(y_test, rf_y_pred))
print(f"Test Set AUC-ROC Score: {roc_auc_score(y_test, rf_y_pred_proba)}")


--- Evaluation of Best Decision Tree Model on Test Set ---
              precision    recall  f1-score   support

           0       0.84      0.64      0.73        84
           1       0.17      0.38      0.23        16

    accuracy                           0.60       100
   macro avg       0.51      0.51      0.48       100
weighted avg       0.74      0.60      0.65       100

[[54 30]
 [10  6]]
Test Set AUC-ROC Score: 0.46800595238095244
