In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score, precision_score,
    recall_score, f1_score, confusion_matrix
)
from sklearn.base import clone


Load data

In [2]:
TRAIN_FILE = r"C:\Users\admin\Downloads\archive (5)\mitbih_train.csv"
TEST_FILE  = r"C:\Users\admin\Downloads\archive (5)\mitbih_test.csv"

TARGET_COL = "Label"
N_SPLITS = 7
RANDOM_STATE = 42
N_CLASSES = 5

df_train = pd.read_csv(TRAIN_FILE, header=None)
df_test  = pd.read_csv(TEST_FILE, header=None)

print("Train shape:", df_train.shape)
print("Test shape :", df_test.shape)

X_train = df_train.iloc[:, :-1]
y_train = df_train.iloc[:, -1].astype(int).values

X_test = df_test.iloc[:, :-1]
y_test = df_test.iloc[:, -1].astype(int).values


Train shape: (87554, 188)
Test shape : (21892, 188)


Data augmentation

In [3]:
X_train = X_train + np.random.normal(0, 0.01, X_train.shape)


base models

In [4]:
base_models = {
    "ET": ExtraTreesClassifier(
        n_estimators=100,
        random_state=RANDOM_STATE,
        n_jobs=-1
    ),
    "RF": RandomForestClassifier(
        n_estimators=100,
        random_state=RANDOM_STATE,
        n_jobs=-1
    ),
    "LGB": LGBMClassifier(
        n_estimators=100,
        random_state=RANDOM_STATE
    ),
    "XGB": XGBClassifier(
        n_estimators=100,
        eval_metric="mlogloss",
        random_state=RANDOM_STATE,
        num_class=N_CLASSES
    )
}


K-fold

In [5]:
skf = StratifiedKFold(
    n_splits=N_SPLITS,
    shuffle=True,
    random_state=RANDOM_STATE
)


meta feature initialization

In [6]:
meta_features = {
    name: np.zeros((len(X_train), N_CLASSES))
    for name in base_models
}

fold_scores = {name: [] for name in base_models}


train base models and create meta features

In [7]:
for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    print(f"\nüîÅ Fold {fold+1}/{N_SPLITS}")

    X_tr, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train[tr_idx], y_train[val_idx]

    for name, model in base_models.items():
        model.fit(X_tr, y_tr)

        val_pred = model.predict(X_val)
        acc = accuracy_score(y_val, val_pred)
        fold_scores[name].append(acc)

        meta_features[name][val_idx, :] = model.predict_proba(X_val)

        print(f"‚úÖ {name} Accuracy: {acc:.4f}")



üîÅ Fold 1/7
‚úÖ ET Accuracy: 0.9748
‚úÖ RF Accuracy: 0.9723
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.063670 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 47685
[LightGBM] [Info] Number of data points in the train set: 75046, number of used features: 187
[LightGBM] [Info] Start training from score -0.189065
[LightGBM] [Info] Start training from score -3.673619
[LightGBM] [Info] Start training from score -2.716494
[LightGBM] [Info] Start training from score -4.915938
[LightGBM] [Info] Start training from score -2.611174
‚úÖ LGB Accuracy: 0.9773
‚úÖ XGB Accuracy: 0.9776

üîÅ Fold 2/7
‚úÖ ET Accuracy: 0.9721
‚úÖ RF Accuracy: 0.9704
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.083831 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 47685
[LightGBM] [Info] Number of data points in the train set: 75046, n

result of base model

In [8]:
results = pd.DataFrame(fold_scores)
results.loc["Mean"] = results.mean()
results.loc["Std"] = results.std()

print("\nüìä Base model CV results:")
print(results.round(4))

results.to_csv("stacking_base_model_scores_ECG.csv")



üìä Base model CV results:
          ET      RF     LGB     XGB
0     0.9748  0.9723  0.9773  0.9776
1     0.9721  0.9704  0.9751  0.9758
2     0.9711  0.9709  0.9759  0.9769
3     0.9747  0.9729  0.9771  0.9793
4     0.9734  0.9720  0.9760  0.9775
5     0.9721  0.9703  0.9749  0.9759
6     0.9717  0.9696  0.9735  0.9757
Mean  0.9729  0.9712  0.9757  0.9770
Std   0.0014  0.0011  0.0012  0.0012


create meta training set

In [9]:
meta_X = np.hstack([meta_features[name] for name in base_models])
meta_y = y_train

meta_df = pd.DataFrame(meta_X)
meta_df["label"] = meta_y
meta_df.to_csv("meta_train.csv", index=False)

print("\nüíæ Saved meta_train.csv")



üíæ Saved meta_train.csv


meta definition and training

In [10]:
meta_model = XGBClassifier(
    n_estimators=200,
    eval_metric="mlogloss",
    random_state=RANDOM_STATE,
    num_class=N_CLASSES
)

cv_scores = cross_val_score(
    meta_model,
    meta_X,
    meta_y,
    cv=5,
    scoring="accuracy"
)

print("\nüìä Meta-model CV:")
print(f"Mean: {cv_scores.mean():.4f} ¬± {cv_scores.std():.4f}")
meta_model.fit(meta_X, meta_y)


üìä Meta-model CV:
Mean: 0.9795 ¬± 0.0010


retrain base model on full data

In [11]:
final_base_models = {}

for name, base_model in base_models.items():
    model = clone(base_model)
    model.fit(X_train, y_train)
    final_base_models[name] = model


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.074024 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 47685
[LightGBM] [Info] Number of data points in the train set: 87554, number of used features: 187
[LightGBM] [Info] Start training from score -0.189069
[LightGBM] [Info] Start training from score -3.673398
[LightGBM] [Info] Start training from score -2.716469
[LightGBM] [Info] Start training from score -4.916982
[LightGBM] [Info] Start training from score -2.611126


evaluation on test set

In [12]:
meta_test = np.hstack([
    model.predict_proba(X_test)
    for model in final_base_models.values()
])

meta_pred_test = meta_model.predict(meta_test)

accuracy = accuracy_score(y_test, meta_pred_test)
precision = precision_score(y_test, meta_pred_test, average="macro")
recall = recall_score(y_test, meta_pred_test, average="macro")
f1 = f1_score(y_test, meta_pred_test, average="macro")

cm = confusion_matrix(y_test, meta_pred_test)

print("\nüìä TEST PERFORMANCE:")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1-score : {f1:.4f}")
print("\nConfusion Matrix:\n", cm)



üìä TEST PERFORMANCE:
Accuracy : 0.9802
Precision: 0.9240
Recall   : 0.8661
F1-score : 0.8927

Confusion Matrix:
 [[18014    50    33    10    11]
 [  144   407     3     0     2]
 [   64     5  1356    16     7]
 [   29     0    21   112     0]
 [   33     1     4     0  1570]]


In [None]:
joblib.dump(final_base_models, "final_base_models_ECG.pkl")
joblib.dump(meta_model, "meta_model_ECG.pkl")