<h1>Bankruptcy Prediction Model


In [2]:
import pandas as pd
import numpy as np
import arff
from glob import glob

# Sklearn / Imputation / Scaling / Model
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report
)
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings("ignore")


In [3]:
# Load Dataset

from scipy.io import arff
import pandas as pd

def load_arff_file(path):
    """Load a single ARFF file and clean the data."""
    data, meta = arff.loadarff(path)
    df = pd.DataFrame(data)

    # Convert byte columns to numeric if needed
    for col in df.columns:
        if df[col].dtype == object:
            df[col] = df[col].apply(lambda x: x.decode() if isinstance(x, bytes) else x)
            try:
                df[col] = df[col].astype(float)
            except ValueError:
                pass

    # Ensure class column is integer (0/1)
    if 'class' in df.columns:
        df['class'] = df['class'].astype(int)
    else:
        raise ValueError(f"'class' column not found in {path}")
    return df

file_names = ["1year.arff", "2year.arff", "3year.arff", "4year.arff", "5year.arff"]
year_datasets = {}

for i, fname in enumerate(file_names, start=1):
    df = load_arff_file(fname)
    df['company_name'] = [f"Company_{j}" for j in range(1, len(df)+1)]
    year_datasets[i] = df

print("‚úÖ Loaded datasets for horizons:", list(year_datasets.keys()))


‚úÖ Loaded datasets for horizons: [1, 2, 3, 4, 5]


In [6]:
#TRAIN + VALIDATE + TEST PIPELINE
# ================================

models = {}
evals = {}

for horizon, df in year_datasets.items():
    print(f"\n{'='*50}")
    print(f"üèÅ TRAINING & EVALUATION: {horizon}-Year Horizon")
    print(f"{'='*50}")

    # -------------------------------
    # STEP 1: Prepare data
    # -------------------------------
    X = df.drop(columns=['class', 'company_name'])
    y = df['class'].astype(int)

    # 70-15-15 split (stratified)
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=0.15, random_state=42, stratify=y
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=0.1765,  # 0.1765 of 85% ‚âà 15% of total
        random_state=42, stratify=y_train_val
    )

    print(f"üìä Data Split Summary ‚Üí Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

    # -------------------------------
    # STEP 2: Handle class imbalance
    # -------------------------------
    pos_count = int(y_train.sum())
    neg_count = int(len(y_train) - pos_count)
    pos_weight = (neg_count / pos_count) if pos_count > 0 else 1.0

    # -------------------------------
    # STEP 3: Build model pipeline
    # -------------------------------
    pipe = Pipeline([
        ("imputer", IterativeImputer(random_state=42)),
        ("scaler", StandardScaler()),
        ("clf", LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            scale_pos_weight=pos_weight,
            objective="binary",
            random_state=42
        ))
    ])

    # -------------------------------
    # STEP 4: Train on TRAIN set
    # -------------------------------
    pipe.fit(X_train, y_train)
    models[horizon] = pipe

    # -------------------------------
    # STEP 5: Validate on VALIDATION set
    # -------------------------------
    val_pred = pipe.predict(X_val)
    val_proba = pipe.predict_proba(X_val)[:, 1]

    val_acc = accuracy_score(y_val, val_pred)
    val_prec = precision_score(y_val, val_pred, zero_division=0)
    val_rec = recall_score(y_val, val_pred, zero_division=0)
    val_f1 = f1_score(y_val, val_pred, zero_division=0)
    val_auc = roc_auc_score(y_val, val_proba)

    print(f"\nüìà Validation Metrics (15% data):")
    print(f"Accuracy : {val_acc:.3f}")
    print(f"Precision: {val_prec:.3f}")
    print(f"Recall   : {val_rec:.3f}")
    print(f"F1-Score : {val_f1:.3f}")
    print(f"AUC-ROC  : {val_auc:.3f}")


üèÅ TRAINING & EVALUATION: 1-Year Horizon
üìä Data Split Summary ‚Üí Train: 4917, Val: 1055, Test: 1055
[LightGBM] [Info] Number of positive: 189, number of negative: 4728
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003061 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4623
[LightGBM] [Info] Number of data points in the train set: 4917, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.038438 -> initscore=-3.219511
[LightGBM] [Info] Start training from score -3.219511

üìà Validation Metrics (15% data):
Accuracy : 0.977
Precision: 0.815
Recall   : 0.537
F1-Score : 0.647
AUC-ROC  : 0.875

üèÅ TRAINING & EVALUATION: 2-Year Horizon
üìä Data Split Summary ‚Üí Train: 7120, Val: 1527, Test: 1526
[LightGBM] [Info] Number of positive: 280, number of negative: 6840
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002670 seconds.
Yo

In [10]:
# STEP 6: Final evaluation on TEST set

y_pred = pipe.predict(X_test)
try:
    y_proba = pipe.predict_proba(X_test)[:, 1]
except Exception:
    y_proba = y_pred.astype(float)

# --- Safe metric helper ---
def safe_metric(fn, *args, **kwargs):
    try:
        return fn(*args, **kwargs)
    except Exception:
            return np.nan

# --- Compute metrics ---
acc = safe_metric(accuracy_score, y_test, y_pred)
prec = safe_metric(precision_score, y_test, y_pred, zero_division=0)
rec = safe_metric(recall_score, y_test, y_pred, zero_division=0)
f1 = safe_metric(f1_score, y_test, y_pred, zero_division=0)
auc = safe_metric(roc_auc_score, y_test, y_proba) if len(np.unique(y_test)) > 1 else np.nan
cm = confusion_matrix(y_test, y_pred)

# STEP 7: Print results cleanly

print(f"\n Final TEST Metrics (15% completely unseen data)")
print("-" * 34)
print(f"Accuracy : {acc:.3f}" if not np.isnan(acc) else "Accuracy : N/A")
print(f"Precision: {prec:.3f}" if not np.isnan(prec) else "Precision: N/A")
print(f"Recall   : {rec:.3f}" if not np.isnan(rec) else "Recall   : N/A")
print(f"F1-Score : {f1:.3f}" if not np.isnan(f1) else "F1-Score : N/A")
print(f"AUC-ROC  : {auc:.3f}" if not np.isnan(auc) else "AUC-ROC  : N/A")
print("Confusion Matrix (rows=True, cols=Pred):")
print(cm)
print("\nClassification Report:")
try:
    print(classification_report(y_test, y_pred, digits=3, zero_division=0))
except Exception:
    print("N/A (single-class y_test)")

# -------------------------------
# STEP 8: Save metrics summary
# -------------------------------
evals[horizon] = {
    "val_acc": val_acc,
    "val_f1": val_f1,
    "val_auc": val_auc,
    "test_acc": acc,
    "test_prec": prec,
    "test_rec": rec,
    "test_f1": f1,
    "test_auc": auc,
    "test_cm": cm
    }

# -------------------------------
#  All models complete

print("\nüèÅ All models trained, validated, and evaluated successfully!")

# ================================
#  OPTIONAL SUMMARY TABLE
# ================================
summary = pd.DataFrame.from_dict(evals, orient="index")[[
    "val_acc", "val_f1", "val_auc",
    "test_acc", "test_f1", "test_auc"
]]
print("\n================= PERFORMANCE SUMMARY =================")
print(summary.round(3).to_string())
print("========================================================")



 Final TEST Metrics (15% completely unseen data)
----------------------------------
Accuracy : 0.963
Precision: 0.822
Recall   : 0.597
F1-Score : 0.692
AUC-ROC  : 0.969
Confusion Matrix (rows=True, cols=Pred):
[[817   8]
 [ 25  37]]

Classification Report:
              precision    recall  f1-score   support

           0      0.970     0.990     0.980       825
           1      0.822     0.597     0.692        62

    accuracy                          0.963       887
   macro avg      0.896     0.794     0.836       887
weighted avg      0.960     0.963     0.960       887


üèÅ All models trained, validated, and evaluated successfully!

   val_acc  val_f1  val_auc  test_acc  test_f1  test_auc
5    0.957   0.642    0.951     0.963    0.692     0.969


In [9]:
#one line summary
while True:
    company_input = input("\nEnter company name (e.g. Company_25) or 'exit' to quit: ")
    if company_input.lower() == 'exit':
        break

    bankruptcy_year = None  # to store first predicted bankruptcy
    bankruptcy_proba = None

    for horizon, df in year_datasets.items():
        row = df[df['company_name'] == company_input]
        if row.empty:
            continue

        X_row = row.drop(columns=['class', 'company_name'])
        pred = models[horizon].predict(X_row)[0]
        proba = models[horizon].predict_proba(X_row)[0][1]

        if pred == 1:  # predicted bankrupt
            bankruptcy_year = horizon
            bankruptcy_proba = proba
            break  # stop at the earliest year of predicted bankruptcy

    if bankruptcy_year is not None:
        print(f"\n‚ö†Ô∏è {company_input} is predicted to go BANKRUPT within {bankruptcy_year} year(s).")
        print(f"   (Model confidence: {bankruptcy_proba:.2f})")
    else:
        print(f"\n‚úÖ {company_input} is predicted to REMAIN STABLE across all 1‚Äì5 year horizons.")



Enter company name (e.g. Company_25) or 'exit' to quit:  Company_10009



‚ö†Ô∏è Company_10009 is predicted to go BANKRUPT within 2 year(s).
   (Model confidence: 1.00)



Enter company name (e.g. Company_25) or 'exit' to quit:  Company_1



‚úÖ Company_1 is predicted to REMAIN STABLE across all 1‚Äì5 year horizons.



Enter company name (e.g. Company_25) or 'exit' to quit:  exit
