In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix
from imblearn.over_sampling import SMOTE
import joblib

# 1. Load Dataset
data = pd.read_csv("kyphosis.csv")
data['Kyphosis'] = data['Kyphosis'].map({'present': 1, 'absent': 0})
X = data.drop('Kyphosis', axis=1)
y = data['Kyphosis']

print("Original dataset shape:", X.shape)
print("Class distribution:\n", y.value_counts())

# 2. Expand Dataset with SMOTE
sm = SMOTE(sampling_strategy={0: 64, 1: 136}, random_state=42)
X_res, y_res = sm.fit_resample(X, y)

print("\nExpanded dataset shape:", X_res.shape)
print("Class distribution after SMOTE:\n", y_res.value_counts())

# 3. Define Models
rf = RandomForestClassifier(n_estimators=200, max_depth=4, random_state=42)
gb = GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=3, random_state=42)

stacking_model = StackingClassifier(
    estimators=[('rf', rf), ('gb', gb)],
    final_estimator=LogisticRegression(max_iter=500),
    cv=5,
    passthrough=True,           # pass original features to meta-learner
    stack_method='predict_proba'  # use probabilities instead of hard labels
)

# 4. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.2, random_state=42, stratify=y_res
)
print("\nTrain set shape:", X_train.shape, "| Test set shape:", X_test.shape)

# 5. Train Models
rf.fit(X_train, y_train)
gb.fit(X_train, y_train)
stacking_model.fit(X_train, y_train)

# 6. Evaluate Models
def evaluate_model(model, X_test, y_test, name):
    y_pred = model.predict(X_test)
    # Use predict_proba for ROC AUC if available
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
    else:
        y_prob = y_pred

    acc = accuracy_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_prob)
    report_dict = classification_report(y_test, y_pred, output_dict=True)
    report_df = pd.DataFrame(report_dict).transpose()
    
    print(f"\n=== {name} ===")
    print(f"Accuracy: {acc:.3f}")
    print(f"ROC AUC: {roc:.3f}")
    print("\nClassification Report:")
    display(report_df)
    
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", cm)

# Evaluate each model
evaluate_model(rf, X_test, y_test, "Random Forest")
evaluate_model(gb, X_test, y_test, "Gradient Boosting")
evaluate_model(stacking_model, X_test, y_test, "Stacking Ensemble")

# 7. Save Models
joblib.dump(rf, "rf_model.pkl")
joblib.dump(gb, "gb_model.pkl")
joblib.dump(stacking_model, "stacking_model.pkl")
print("\nModels saved successfully: rf_model.pkl, gb_model.pkl, stacking_model.pkl")


Original dataset shape: (81, 3)
Class distribution:
 Kyphosis
0    64
1    17
Name: count, dtype: int64

Expanded dataset shape: (200, 3)
Class distribution after SMOTE:
 Kyphosis
1    136
0     64
Name: count, dtype: int64

Train set shape: (160, 3) | Test set shape: (40, 3)

=== Random Forest ===
Accuracy: 0.950
ROC AUC: 0.989

Classification Report:


Unnamed: 0,precision,recall,f1-score,support
0,0.923077,0.923077,0.923077,13.0
1,0.962963,0.962963,0.962963,27.0
accuracy,0.95,0.95,0.95,0.95
macro avg,0.94302,0.94302,0.94302,40.0
weighted avg,0.95,0.95,0.95,40.0


Confusion Matrix:
 [[12  1]
 [ 1 26]]

=== Gradient Boosting ===
Accuracy: 0.900
ROC AUC: 0.983

Classification Report:


Unnamed: 0,precision,recall,f1-score,support
0,0.846154,0.846154,0.846154,13.0
1,0.925926,0.925926,0.925926,27.0
accuracy,0.9,0.9,0.9,0.9
macro avg,0.88604,0.88604,0.88604,40.0
weighted avg,0.9,0.9,0.9,40.0


Confusion Matrix:
 [[11  2]
 [ 2 25]]

=== Stacking Ensemble ===
Accuracy: 0.925
ROC AUC: 0.957

Classification Report:


Unnamed: 0,precision,recall,f1-score,support
0,0.916667,0.846154,0.88,13.0
1,0.928571,0.962963,0.945455,27.0
accuracy,0.925,0.925,0.925,0.925
macro avg,0.922619,0.904558,0.912727,40.0
weighted avg,0.924702,0.925,0.924182,40.0


Confusion Matrix:
 [[11  2]
 [ 1 26]]

Models saved successfully: rf_model.pkl, gb_model.pkl, stacking_model.pkl
