In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import matplotlib.pyplot as plt

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'
column_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
data = pd.read_csv(url, names=column_names)

data.replace('?', np.nan, inplace=True)
data.dropna(inplace=True)
data = data.astype(float)

data['target'] = data['target'].apply(lambda x: 1 if x > 0 else 0)

X = data.drop('target', axis=1)
y = data['target']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

log_reg = LogisticRegression()
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# K-Fold Cross Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# arrays for stacking
train_meta = np.zeros((X_train.shape[0], 2))  # Two models' predictions
test_meta = np.zeros((X_test.shape[0], 2))

# conversion of y_train to numpy array is needed for proper indexing
y_train_array = y_train.values

for train_index, val_index in kf.split(X_train):
    X_fold_train, X_fold_val = X_train[train_index], X_train[val_index]
    y_fold_train, y_fold_val = y_train_array[train_index], y_train_array[val_index]
    
    # train first-level
    log_reg.fit(X_fold_train, y_fold_train)
    xgb_clf.fit(X_fold_train, y_fold_train)
    
    # predict
    train_meta[val_index, 0] = log_reg.predict(X_fold_val)
    train_meta[val_index, 1] = xgb_clf.predict(X_fold_val)
    
    test_meta[:, 0] += log_reg.predict(X_test)
    test_meta[:, 1] += xgb_clf.predict(X_test)

# average test set predictions
test_meta /= kf.get_n_splits()

# train second-level
meta_model = LogisticRegression()
meta_model.fit(train_meta, y_train_array)
y_pred_meta = meta_model.predict(test_meta)

# outcomes
accuracy_meta = accuracy_score(y_test, y_pred_meta)
precision_meta = precision_score(y_test, y_pred_meta)
print(f'Stacked Model Test Accuracy: {accuracy_meta * 100:.3f}%')
print(f'Stacked Model Test Precision: {precision_meta * 100:.3f}%')


Stacked Model Test Accuracy: 88.333%
Stacked Model Test Precision: 84.000%
