In [4]:
import pandas as pd
import numpy as np
import joblib

# 1) Import HistGradientBoostingClassifier, with a fallback for older scikit-learn
#    releases where the estimator was still experimental and had to be opted into.
try:
    from sklearn.ensemble import HistGradientBoostingClassifier
except ImportError:
    # Only a failed import should trigger the fallback. Any other error is left
    # to propagate instead of being hidden behind a second import attempt.
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
    from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    matthews_corrcoef, roc_auc_score, confusion_matrix, classification_report
)

# Target loader: prefers a 'target' column, otherwise falls back to the first column.
def load_target(path: str, header="infer") -> pd.Series:
    """Load a label column from a CSV file and cast it to ``int``.

    Column selection:
      * if the parsed frame has a column named ``'target'``, that column is used;
      * otherwise the first column is used.

    Args:
        path: Location of the CSV file.
        header: Forwarded to ``pandas.read_csv``. With the default (``"infer"``)
            the first line of the file is always consumed as a header, so a
            file that has NO header row would silently lose its first label.
            Pass ``header=None`` for such files.

    Returns:
        The selected column as an integer-typed Series.

    Raises:
        ValueError: propagated from ``astype(int)`` when the selected column
            cannot be converted to integers (e.g. missing or non-numeric values).
    """
    df = pd.read_csv(path, header=header)
    if 'target' in df.columns:
        labels = df['target']
    else:
        # No explicit 'target' column: treat the first column as the labels.
        labels = df.iloc[:, 0]
    return labels.astype(int)

In [5]:
# 2) Read the original (unscaled) feature tables for tree-based boosting,
#    then the matching label files.
X_train, X_test = (
    pd.read_csv(features_csv)
    for features_csv in ('X_train_original.csv', 'X_test_original.csv')
)
y_train, y_test = (
    load_target(labels_csv)
    for labels_csv in ('y_train.csv', 'y_test.csv')
)

# Quick visual sanity check that features and labels line up row-wise.
print(
    "Shapes:",
    f"  X_train: {X_train.shape}, y_train: {y_train.shape}",
    f"  X_test:  {X_test.shape},  y_test:  {y_test.shape}",
    sep="\n",
)

Shapes:
  X_train: (455, 30), y_train: (455,)
  X_test:  (114, 30),  y_test:  (114,)


In [6]:
# Basic integrity checks: abort early if any input holds missing or
# non-finite values. The checks run in this order: feature NaNs, label NaNs,
# then non-finite feature values.
if any(frame.isnull().values.any() for frame in (X_train, X_test)):
    raise ValueError("NaNs found in X_train/X_test. Please re-run preprocessing.")

if any(labels.isnull().any() for labels in (y_train, y_test)):
    raise ValueError("NaNs found in y_train/y_test. Please re-run preprocessing.")

if not all(np.isfinite(frame.values).all() for frame in (X_train, X_test)):
    raise ValueError("Non-finite values detected in features. Please re-run preprocessing.")

In [8]:
# 3) Fit a HistGradientBoostingClassifier (used as an XGBoost-like fallback)
# Hyperparameters are gathered in one mapping so they are easy to scan and tweak.
hgb_params = {
    "learning_rate": 0.05,
    "max_depth": 5,
    "max_iter": 300,
    "l2_regularization": 0.0,
    "random_state": 42,  # fixed seed so repeated runs use the same setting
}
hgb = HistGradientBoostingClassifier(**hgb_params)
hgb.fit(X_train, y_train)

In [9]:
# 4) Hard class predictions plus one continuous score per test row
y_pred = hgb.predict(X_test)

# Score source, in order of preference:
#   1. the second column of predict_proba,
#   2. decision_function output squashed into [0, 1] with a sigmoid,
#   3. the hard predictions cast to float (AUC may be less meaningful).
if hasattr(hgb, "predict_proba"):
    class_probabilities = hgb.predict_proba(X_test)
    y_score = class_probabilities[:, 1]
elif hasattr(hgb, "decision_function"):
    z = hgb.decision_function(X_test)
    exp_neg_z = np.exp(-z)
    y_score = 1.0 / (1.0 + exp_neg_z)
else:
    y_score = y_pred.astype(float)

In [10]:
# 5) Metrics
# Label-based metrics all share the (y_true, y_pred) call shape, so they are
# evaluated in one pass; AUC is separate because it uses the continuous scores.
acc, prec, rec, f1, mcc = (
    metric_fn(y_test, y_pred)
    for metric_fn in (
        accuracy_score,
        precision_score,
        recall_score,
        f1_score,
        matthews_corrcoef,
    )
)
auc = roc_auc_score(y_test, y_score)

# Headline numbers, one per line.
print(
    "\n=== HistGradientBoosting (XGBoost fallback) Performance (Original features) ===",
    f"Accuracy:  {acc:.4f}",
    f"Precision: {prec:.4f}",
    f"Recall:    {rec:.4f}",
    f"F1 Score:  {f1:.4f}",
    f"MCC:       {mcc:.4f}",
    f"AUC:       {auc:.4f}",
    sep="\n",
)

# Confusion matrix of true labels vs. predictions.
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

# Per-class breakdown.
print("\nClassification Report:", classification_report(y_test, y_pred, digits=4), sep="\n")


=== HistGradientBoosting (XGBoost fallback) Performance (Original features) ===
Accuracy:  0.9737
Precision: 1.0000
Recall:    0.9286
F1 Score:  0.9630
MCC:       0.9442
AUC:       0.9944

Confusion Matrix:
[[72  0]
 [ 3 39]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9600    1.0000    0.9796        72
           1     1.0000    0.9286    0.9630        42

    accuracy                         0.9737       114
   macro avg     0.9800    0.9643    0.9713       114
weighted avg     0.9747    0.9737    0.9735       114



In [11]:
# 6) Save model
# One constant for the output file, so the dump target and the confirmation
# message below cannot drift apart.
MODEL_PATH = 'hist_gb_model.pkl'

# NOTE: the file is a pickle; only load it back from a trusted location,
# because unpickling can execute arbitrary code.
joblib.dump(hgb, MODEL_PATH)
print(f"\nSaved: {MODEL_PATH}")


Saved: hist_gb_model.pkl
