In [1]:
import pandas as pd
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [2]:
# Load the delinquency dataset from the Excel workbook. The path is relative,
# so it is resolved against the notebook's current working directory.
DATASET_FILE = 'Updated_Delinquency_Dataset.xlsx'
df = pd.read_excel(DATASET_FILE)

In [3]:
# Separate the label from the predictors. Customer_ID is removed together with
# the target so that neither column ends up in the feature matrix.
TARGET_COLUMN = 'Delinquent_Account'
y = df[TARGET_COLUMN]
X = df.drop(columns=['Customer_ID', TARGET_COLUMN])

In [4]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions of the target the same in both splits; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [5]:
# Columns sent through the numeric preprocessing branch.
numerical_features = [
    'Age',
    'Income',
    'Credit_Score',
    'Credit_Utilization',
    'Loan_Balance',
    'Debt_to_Income_Ratio',
    'Account_Tenure',
    'Missed_Payments',
]

# Columns sent through the categorical branch: three descriptive attributes
# followed by the six Month_1 .. Month_6 columns.
categorical_features = ['Employment_Status', 'Credit_Card_Type', 'Location']
categorical_features += [f'Month_{month}' for month in range(1, 7)]

In [6]:
# Numeric branch: fill missing values with the column median, then
# standardize each column with StandardScaler.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

In [7]:
# Categorical branch: one-hot encode each column. With handle_unknown='ignore'
# a category that was not seen during fit does not raise at transform time.
# NOTE(review): unlike num_pipeline there is no imputation step here; confirm
# that the categorical columns contain no missing values.
cat_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [8]:
# Route each feature group through its own sub-pipeline and concatenate the
# results. No `remainder` is given, so ColumnTransformer's default ('drop')
# applies: any column of X that appears in neither list is discarded.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, numerical_features),
        ('cat', cat_pipeline, categorical_features),
    ]
)

# Logistic Regression Model

In [9]:
# Logistic-regression baseline: preprocessing -> SMOTE oversampling -> classifier.
#
# The pipeline gets its own unfitted copy of `preprocessor` via clone().
# A Pipeline without caching fits its steps in place, and the same
# `preprocessor` object is also handed to the other model pipelines in this
# notebook; without the copy, fitting any of them would silently refit the
# transformer this (already fitted) pipeline predicts with.
#
# imblearn's Pipeline is used instead of sklearn's because it accepts a
# sampler step; the sampler is applied while fitting only, never at predict time.
# NOTE(review): SMOTE interpolates between neighbouring rows, so the one-hot
# columns of synthetic samples can take fractional values; consider whether
# SMOTENC is a better fit for this feature mix.
logistic_model_pipeline = ImbPipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('smote', SMOTE(random_state=42)),
    ('classifier', LogisticRegression(random_state=42)),
])

In [10]:
# Fit preprocessing, SMOTE resampling and the logistic regression on the
# training split only. Kept as the cell's final expression so the fitted
# pipeline is displayed.
logistic_model_pipeline.fit(X=X_train, y=y_train)

[WinError 2] The system cannot find the file specified
  File "C:\Users\lionh\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "C:\Users\lionh\anaconda3\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\lionh\anaconda3\Lib\subprocess.py", line 1039, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, process_group)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [11]:
# Hard class predictions for the held-out rows.
y_pred = logistic_model_pipeline.predict(X_test)

# predict_proba returns one column per class; column index 1 is kept as the
# score for the second class (label 1 in the reports below).
logistic_proba_matrix = logistic_model_pipeline.predict_proba(X_test)
y_pred_proba = logistic_proba_matrix[:, 1]

In [12]:
# Test-set report for the logistic regression: per-class precision/recall/F1,
# the confusion matrix, and ROC AUC computed from the class-1 scores.
print("Initial Evaluation on Test Set:")
for label_metric in (classification_report, confusion_matrix):
    # Both metrics compare the true labels with the hard predictions.
    print(label_metric(y_test, y_pred))
print("AUC-ROC Score:", roc_auc_score(y_test, y_pred_proba))

Initial Evaluation on Test Set:
              precision    recall  f1-score   support

           0       0.83      0.63      0.72        84
           1       0.14      0.31      0.19        16

    accuracy                           0.58       100
   macro avg       0.48      0.47      0.45       100
weighted avg       0.72      0.58      0.63       100

[[53 31]
 [11  5]]
AUC-ROC Score: 0.4598214285714286


# Decision Tree Model

In [13]:
# Decision-tree model: preprocessing -> SMOTE oversampling -> depth-limited tree.
#
# The pipeline gets its own unfitted copy of `preprocessor` via clone().
# A Pipeline without caching fits its steps in place, and the same
# `preprocessor` object is already part of the fitted logistic-regression
# pipeline; without the copy, fitting this pipeline would refit the
# transformer that the earlier model depends on.
#
# imblearn's Pipeline is used because it accepts a sampler step; the sampler
# is applied while fitting only, never at predict time.
dt_classifier = DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=42)
dt_model_pipeline = ImbPipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('smote', SMOTE(random_state=42)),
    ('classifier', dt_classifier),
])

In [14]:
# Fit preprocessing, SMOTE resampling and the decision tree on the training
# split only. Kept as the cell's final expression so the fitted pipeline is
# displayed.
dt_model_pipeline.fit(X=X_train, y=y_train)

In [15]:
# Hard class predictions for the held-out rows.
dt_y_pred = dt_model_pipeline.predict(X_test)

# predict_proba returns one column per class; column index 1 is kept as the
# score for the second class (label 1 in the reports below).
dt_proba_matrix = dt_model_pipeline.predict_proba(X_test)
dt_y_pred_proba = dt_proba_matrix[:, 1]

In [16]:
# Test-set report for the decision tree: per-class precision/recall/F1,
# the confusion matrix, and ROC AUC computed from the class-1 scores.
print("Initial Evaluation on Test Set:")
for label_metric in (classification_report, confusion_matrix):
    # Both metrics compare the true labels with the hard predictions.
    print(label_metric(y_test, dt_y_pred))
print("AUC-ROC Score:", roc_auc_score(y_test, dt_y_pred_proba))

Initial Evaluation on Test Set:
              precision    recall  f1-score   support

           0       0.83      0.57      0.68        84
           1       0.14      0.38      0.21        16

    accuracy                           0.54       100
   macro avg       0.49      0.47      0.44       100
weighted avg       0.72      0.54      0.60       100

[[48 36]
 [10  6]]
AUC-ROC Score: 0.5093005952380952


# Random Forest Model

In [17]:
# Random-forest pipeline used as the estimator for the grid search:
# preprocessing -> SMOTE oversampling -> forest. The forest's hyper-parameters
# are left at their defaults here and tuned through the `classifier__*` grid.
rf_steps = [
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42)),
]
rf_model_pipeline = ImbPipeline(steps=rf_steps)

In [18]:
# Candidate values for the random forest, keyed by the bare parameter name.
rf_search_space = {
    'n_estimators': [100, 150],
    'max_depth': [5, 8, 10],
    'min_samples_leaf': [5, 10],
}

# Prefix every name with 'classifier__' so the values are routed to the
# pipeline step named 'classifier'.
param_grid = {
    f'classifier__{param_name}': candidates
    for param_name, candidates in rf_search_space.items()
}

In [19]:
# Exhaustive search over param_grid. Each candidate is scored by ROC AUC with
# 3-fold cross-validation; n_jobs=-1 asks for all available processors and
# verbose=2 prints progress messages while fitting.
rf_search_options = dict(cv=3, scoring='roc_auc', n_jobs=-1, verbose=2)
grid_search_rf = GridSearchCV(rf_model_pipeline, param_grid, **rf_search_options)

In [20]:
# Run the cross-validated search on the training split only; the test split
# stays untouched until the final evaluation. Kept as the cell's final
# expression so the fitted search object is displayed.
grid_search_rf.fit(X=X_train, y=y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


In [21]:
# Report the winning hyper-parameter combination (header and dict on separate
# lines) followed by its cross-validated ROC AUC.
print("Best parameters found:", grid_search_rf.best_params_, sep="\n")
print(f"Best cross-validation AUC score: {grid_search_rf.best_score_}")

Best parameters found:
{'classifier__max_depth': 10, 'classifier__min_samples_leaf': 5, 'classifier__n_estimators': 100}
Best cross-validation AUC score: 0.5462275819418676


In [22]:
print("\n--- Evaluation of Best Random Forest Model on Test Set ---")

# best_estimator_ is the pipeline with the best-scoring parameters from the
# search; it is used directly to score the held-out rows.
best_rf_model = grid_search_rf.best_estimator_
rf_y_pred = best_rf_model.predict(X_test)

# Column index 1 of predict_proba is kept as the score for the second class
# (label 1 in the report below).
rf_proba_matrix = best_rf_model.predict_proba(X_test)
rf_y_pred_proba = rf_proba_matrix[:, 1]

# Label-based metrics use the hard predictions; ROC AUC uses the scores.
for label_metric in (classification_report, confusion_matrix):
    print(label_metric(y_test, rf_y_pred))
print(f"Test Set AUC-ROC Score: {roc_auc_score(y_test, rf_y_pred_proba)}")


--- Evaluation of Best Random Forest Model on Test Set ---
              precision    recall  f1-score   support

           0       0.85      0.99      0.91        84
           1       0.50      0.06      0.11        16

    accuracy                           0.84       100
   macro avg       0.67      0.53      0.51       100
weighted avg       0.79      0.84      0.78       100

[[83  1]
 [15  1]]
Test Set AUC-ROC Score: 0.5022321428571428
