# Heart Disease Prediction - Binary Classification
## Playground Series S6E2 - Optimized Pipeline

**Goal**: Predict heart disease probability (AUC-ROC metric)
**Approach**: Feature engineering + Stacking + Blending
**Target**: Top 10 (~24 position)

## 1. Setup & Imports

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import roc_auc_score, roc_curve, classification_report

import lightgbm as lgb
from lightgbm import LGBMClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings('ignore')

print("âœ“ All imports successful")

✓ All imports successful


## 2. Data Loading & Paths

In [2]:
# Pick the data/output directories based on where the notebook is running.
import os
IS_KAGGLE = os.path.exists('/kaggle')  # presence of /kaggle is used as the Kaggle signal

# Each path is chosen independently from the same environment flag.
DATA_PATH = Path('/kaggle/input/playground-series-s6e2') if IS_KAGGLE else Path('./data')
OUTPUT_PATH = Path('/kaggle/working') if IS_KAGGLE else Path('./submissions')

print("Running on Kaggle" if IS_KAGGLE else "Running locally")
print(f"Data path: {DATA_PATH}")
print(f"Output path: {OUTPUT_PATH}")

Running locally
Data path: data
Output path: submissions


## 3. Load Data

In [3]:
# Read the three competition CSV files from DATA_PATH, in this order.
train_df, test_df, submission_sample = (
    pd.read_csv(DATA_PATH / csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Quick look at what was loaded: shapes, column names and the first rows.
print(f"Train shape: {train_df.shape}", f"Test shape: {test_df.shape}", sep="\n")
print(f"\nTrain columns: {list(train_df.columns)}")
print(f"\nFirst rows of train:")
print(train_df.head())

Train shape: (630000, 15)
Test shape: (270000, 14)

Train columns: ['id', 'Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol', 'FBS over 120', 'EKG results', 'Max HR', 'Exercise angina', 'ST depression', 'Slope of ST', 'Number of vessels fluro', 'Thallium', 'Heart Disease']

First rows of train:
   id  Age  Sex  Chest pain type   BP  Cholesterol  FBS over 120  EKG results  \
0   0   58    1                4  152          239             0            0   
1   1   52    1                1  125          325             0            2   
2   2   56    0                2  160          188             0            2   
3   3   44    0                3  134          229             0            2   
4   4   58    1                4  140          234             0            2   

   Max HR  Exercise angina  ST depression  Slope of ST  \
0     158                1            3.6            2   
1     171                0            0.0            1   
2     151                0            0.0

## 4. EDA & Feature Analysis

In [None]:
# Overview banner, followed by per-column dtypes and null counts.
print("=" * 60, "DATA OVERVIEW", "=" * 60, sep="\n")

print("\nData types:", train_df.dtypes, sep="\n")

print("\nMissing values:", train_df.isnull().sum(), sep="\n")

# Class balance of the target column.
# NOTE(review): `.mean()` below assumes 'Heart Disease' is numeric (e.g. 0/1);
# it would raise on string labels -- confirm the column dtype.
print("\nTarget variable distribution:", train_df['Heart Disease'].value_counts(), sep="\n")
print(f"\nTarget ratio: {train_df['Heart Disease'].mean():.4f} (% positive)")

# Split columns by dtype. 'id' and the target are excluded from the numeric
# predictors; they are the only names filtered out here.
numeric_features = [
    col
    for col in train_df.select_dtypes(include=['int64', 'float64']).columns
    if col not in ('id', 'Heart Disease')
]
categorical_features = train_df.select_dtypes(include=['object']).columns.tolist()

print(f"\nNumeric features ({len(numeric_features)}): {numeric_features[:5]}...")
print(f"Categorical features ({len(categorical_features)}): {categorical_features}")

## 5. Feature Correlations

In [None]:
# Absolute Pearson correlation of each numeric feature with the target,
# strongest first.
# NOTE(review): `.corr()` needs a numeric 'Heart Disease' column -- confirm
# the target dtype before relying on this cell.
correlations = (
    train_df[[*numeric_features, 'Heart Disease']]
    .corr()['Heart Disease']
    .drop('Heart Disease')
    .abs()
    .sort_values(ascending=False)
)

print("Feature Correlations with Target (absolute):", correlations, sep="\n")

# Summary statistics (count, mean, std, quantiles) of the numeric features.
print("\n" + "=" * 60, "NUMERIC FEATURE STATISTICS", "=" * 60, sep="\n")
print(train_df[numeric_features].describe())

## 6. Preprocessing

In [None]:
def preprocess_data(df, categorical_features, numeric_features, fit_scalers=True, scalers=None):
    """Label-encode categorical columns and standardize numeric columns.

    Works on a copy of ``df``; the input frame is left untouched.

    Args:
        df: Input DataFrame.
        categorical_features: Column names to label-encode. Names that are
            not present in ``df`` are skipped.
        numeric_features: Column names to standardize together with a single
            ``StandardScaler``.
        fit_scalers: When True, fit new transformers on ``df``. When False,
            reuse the fitted transformers passed in ``scalers``.
        scalers: Dict of fitted transformers keyed by categorical column
            name, plus the key ``'numeric'`` for the scaler. Only read when
            ``fit_scalers`` is False.

    Returns:
        Tuple ``(processed_df, scalers)``. In fit mode ``scalers`` is a newly
        built dict; otherwise it is the object that was passed in.
    """
    result = df.copy()
    # Only categorical columns that actually exist in this frame are handled.
    present_categoricals = [name for name in categorical_features if name in result.columns]

    if not fit_scalers:
        # Transform-only path: apply whatever fitted transformers are available.
        for name in present_categoricals:
            if name in scalers:
                result[name] = scalers[name].transform(result[name].astype(str))

        if numeric_features and 'numeric' in scalers:
            result[numeric_features] = scalers['numeric'].transform(result[numeric_features])

        return result, scalers

    # Fit path: build a fresh registry of fitted transformers.
    scalers = {}
    for name in present_categoricals:
        encoder = LabelEncoder()
        # Values are cast to str before encoding, as in the transform path.
        result[name] = encoder.fit_transform(result[name].astype(str))
        scalers[name] = encoder

    if numeric_features:
        numeric_scaler = StandardScaler()
        result[numeric_features] = numeric_scaler.fit_transform(result[numeric_features])
        scalers['numeric'] = numeric_scaler

    return result, scalers

# Fit the encoders/scaler on the training frame ('id' is dropped first).
train_processed, scalers = preprocess_data(
    train_df.drop(columns='id'),
    categorical_features,
    numeric_features,
    fit_scalers=True,
)

# Apply the transformers fitted above to the test frame.
test_processed, _ = preprocess_data(
    test_df.drop(columns='id'),
    categorical_features,
    numeric_features,
    fit_scalers=False,
    scalers=scalers,
)

print("âœ“ Preprocessing complete")
print(f"Train shape: {train_processed.shape}", f"Test shape: {test_processed.shape}", sep="\n")

## 7. Train/Val Split & Feature Preparation

In [None]:
# Features are every processed column except the target.
X_all = train_processed.drop(columns='Heart Disease')
y_all = train_processed['Heart Disease']

# The test frame is used as-is unless it happens to carry a target column.
if 'Heart Disease' in test_processed.columns:
    X_test = test_processed.drop(columns='Heart Disease')
else:
    X_test = test_processed

# Stratified 80/20 hold-out split used by the baseline model.
X_train, X_val, y_train, y_val = train_test_split(
    X_all,
    y_all,
    test_size=0.2,
    random_state=42,
    stratify=y_all,
)

print(
    f"Training set: {X_train.shape}",
    f"Validation set: {X_val.shape}",
    f"Test set: {X_test.shape}",
    sep="\n",
)
print(f"\nTarget ratio - Train: {y_train.mean():.4f}, Val: {y_val.mean():.4f}")

## 8. Baseline Model - Logistic Regression

In [None]:
print("=" * 60, "BASELINE MODEL - LOGISTIC REGRESSION", "=" * 60, sep="\n")

# Fit a logistic regression on the training part of the hold-out split.
baseline_model = LogisticRegression(max_iter=1000, random_state=42)
baseline_model.fit(X_train, y_train)

# Score each split using column 1 of predict_proba (the second class).
y_train_pred = baseline_model.predict_proba(X_train)[:, 1]
train_auc = roc_auc_score(y_train, y_train_pred)

y_val_pred = baseline_model.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(y_val, y_val_pred)

print(f"Training AUC-ROC: {train_auc:.4f}")
print(f"Validation AUC-ROC: {val_auc:.4f}")

## 9. Model Comparison - 5-Fold Cross-Validation

In [None]:
def auc_score(y_true, y_pred):
    """Return the ROC AUC of the scores ``y_pred`` against labels ``y_true``.

    Thin wrapper that forwards both arguments to ``roc_auc_score``.
    """
    score = roc_auc_score(y_true, y_pred)
    return score

def cross_validate_classifier(model_builder, X, y, name, folds=5):
    """Estimate a classifier's AUC-ROC with shuffled K-fold cross-validation.

    Args:
        model_builder: Zero-argument callable returning a fresh, unfitted
            classifier that exposes ``fit`` and ``predict_proba``.
        X: Feature DataFrame; rows are selected positionally with ``.iloc``.
        y: Target Series aligned row-for-row (by position) with ``X``.
        name: Label printed in front of the scores.
        folds: Number of cross-validation folds.

    Returns:
        Tuple ``(mean_score, std_score)`` of the per-fold AUC-ROC values.
    """
    # shuffle=True is required here: scikit-learn raises a ValueError when
    # random_state is set while shuffle=False, because the seed would have
    # no effect. Shuffling with a fixed seed keeps the folds reproducible.
    kf = KFold(n_splits=folds, shuffle=True, random_state=42)
    scores = []

    for train_idx, val_idx in kf.split(X):
        # A new model per fold so no state leaks between folds.
        model = model_builder()
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        val_preds = model.predict_proba(X.iloc[val_idx])[:, 1]
        scores.append(auc_score(y.iloc[val_idx], val_preds))

    mean_score = np.mean(scores)
    std_score = np.std(scores)
    print(f"{name:15s}: {mean_score:.4f} Â± {std_score:.4f}")
    return mean_score, std_score

# Section banner for the cross-validated model comparison.
print("=" * 60, "MODEL COMPARISON - 5-FOLD CROSS-VALIDATION", "=" * 60, sep="\n")

# Model builders
def build_lgb():
    """Return a new, unfitted ``LGBMClassifier`` with the notebook's settings."""
    # NOTE(review): LightGBM reportedly applies row subsampling only when
    # `subsample_freq` is > 0 -- confirm whether `subsample=0.8` is active.
    lgb_params = dict(
        n_estimators=200,
        learning_rate=0.08,
        num_leaves=80,
        max_depth=8,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=0.3,
        random_state=42,
        verbose=-1,
    )
    return LGBMClassifier(**lgb_params)

def build_xgb():
    """Return a new, unfitted ``XGBClassifier`` with the notebook's settings."""
    xgb_params = dict(
        n_estimators=250,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=0.5,
        random_state=42,
        verbosity=0,
    )
    return XGBClassifier(**xgb_params)

def build_cat():
    """Return a new, unfitted ``CatBoostClassifier`` with the notebook's settings."""
    cat_params = dict(
        iterations=200,
        learning_rate=0.08,
        depth=7,
        subsample=0.8,
        random_state=42,
        verbose=0,
    )
    return CatBoostClassifier(**cat_params)

# Cross-validate each gradient-boosting library on the full training set.
lgb_mean, lgb_std = cross_validate_classifier(build_lgb, X_all, y_all, "LightGBM")
xgb_mean, xgb_std = cross_validate_classifier(build_xgb, X_all, y_all, "XGBoost")
cat_mean, cat_std = cross_validate_classifier(build_cat, X_all, y_all, "CatBoost")

# LightGBM wins only if strictly better than both others; otherwise XGBoost
# wins if strictly better than CatBoost; CatBoost is the fallback.
if lgb_mean > xgb_mean and lgb_mean > cat_mean:
    best_model_name = 'LightGBM'
elif xgb_mean > cat_mean:
    best_model_name = 'XGBoost'
else:
    best_model_name = 'CatBoost'

print(f"\nBest model: {best_model_name}")

## 10. Feature Engineering

In [None]:
print("=" * 60, "FEATURE ENGINEERING", "=" * 60, sep="\n")

# Raw (unprocessed) predictor values for the rows of each split; the train
# and validation rows are looked up in train_df by the splits' index labels.
X_train_orig = train_df.loc[X_train.index].drop(columns=['id', 'Heart Disease'])
X_val_orig = train_df.loc[X_val.index].drop(columns=['id', 'Heart Disease'])
X_test_orig = test_df.drop(columns='id')

def create_features(df_orig, df_proc, feature_names=None):
    """Add interaction, polynomial and ratio features to a processed frame.

    The new columns are computed from the raw values in ``df_orig`` and
    attached to a copy of ``df_proc``; neither input is modified. Columns are
    chosen by their position in ``feature_names`` (first ones first), not by
    any correlation ranking.

    Args:
        df_orig: Frame holding the unscaled values. Its index should match
            ``df_proc`` because new columns are aligned on index labels.
        df_proc: Preprocessed frame that receives the extra columns.
        feature_names: Ordered candidate column names. Names missing from
            ``df_orig`` are ignored. Defaults to the module-level
            ``numeric_features`` list when omitted.

    Returns:
        A new DataFrame: ``df_proc`` plus, in this order,
        ``<a>_x_<b>`` products for pairs among the first four columns,
        ``<c>_sq`` / ``<c>_sqrt`` for the first three columns, and one
        ``<first>_div_<second>`` ratio.
    """
    if feature_names is None:
        # Backward-compatible default: the notebook-level feature list.
        feature_names = numeric_features

    df_eng = df_proc.copy()
    numeric_cols = [col for col in feature_names if col in df_orig.columns]

    # Pairwise products: each of the first three columns with every later
    # column among the first four (at most six products).
    leading = numeric_cols[:4]
    for position, col1 in enumerate(leading[:3]):
        for col2 in leading[position + 1:]:
            df_eng[f'{col1}_x_{col2}'] = df_orig[col1] * df_orig[col2]

    # Square and square root of the first three columns. abs() keeps the
    # square root defined for negative inputs.
    for col in numeric_cols[:3]:
        df_eng[f'{col}_sq'] = df_orig[col] ** 2
        df_eng[f'{col}_sqrt'] = np.sqrt(np.abs(df_orig[col]))

    # Ratio of the first column to the second; the small constant on the
    # absolute denominator avoids division by zero.
    if len(numeric_cols) >= 2:
        numerator, denominator = numeric_cols[0], numeric_cols[1]
        df_eng[f'{numerator}_div_{denominator}'] = (
            df_orig[numerator] / (np.abs(df_orig[denominator]) + 1e-5)
        )

    return df_eng

# Engineer features for each split from its (raw, processed) pair.
X_train_eng, X_val_eng, X_test_eng = (
    create_features(raw_frame, processed_frame)
    for raw_frame, processed_frame in (
        (X_train_orig, X_train),
        (X_val_orig, X_val),
        (X_test_orig, X_test),
    )
)

# Report how many columns the engineering step added.
print(f"Original features: {X_train.shape[1]}")
print(f"Engineered features: {X_train_eng.shape[1]}")
print(f"New features added: {X_train_eng.shape[1] - X_train.shape[1]}")

## 11. Feature Set Comparison

In [None]:
# Section banner for the original-vs-engineered feature comparison.
print("=" * 60, "FEATURE SET COMPARISON - 3-FOLD CV", "=" * 60, sep="\n")

def quick_cv(X, y, model_builder, folds=3):
    """Run a quick shuffled K-fold AUC-ROC estimate for one feature set.

    Args:
        X: Feature DataFrame; rows are selected positionally with ``.iloc``,
            so it must be in the same row order as ``y``.
        y: Target Series aligned by position with ``X``.
        model_builder: Zero-argument callable returning a fresh classifier
            that exposes ``fit`` and ``predict_proba``.
        folds: Number of cross-validation folds.

    Returns:
        Tuple ``(mean, std)`` of the per-fold AUC-ROC values.
    """
    # shuffle=True is required: scikit-learn raises a ValueError when
    # random_state is given while shuffle=False. The fixed seed keeps the
    # folds identical across the feature sets being compared.
    kf = KFold(n_splits=folds, shuffle=True, random_state=42)
    scores = []

    for train_idx, val_idx in kf.split(X):
        model = model_builder()
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        val_preds = model.predict_proba(X.iloc[val_idx])[:, 1]
        scores.append(auc_score(y.iloc[val_idx], val_preds))

    return np.mean(scores), np.std(scores)

# Compare feature sets
orig_mean, orig_std = quick_cv(X_all, y_all, build_lgb, folds=3)

# Rebuild the full engineered frame from its two splits. train_test_split
# shuffled the rows, so the concatenation is put back into y_all's row order:
# quick_cv pairs X and y by position (.iloc), and a mismatched order would
# score features against the wrong labels. pd.concat replaces
# DataFrame.append, which no longer exists in pandas 2.x.
X_all_eng = pd.concat([X_train_eng, X_val_eng], axis=0).loc[y_all.index]
eng_mean, eng_std = quick_cv(X_all_eng, y_all, build_lgb, folds=3)

print(f"Original features:   {orig_mean:.4f} Â± {orig_std:.4f} AUC-ROC")
print(f"Engineered features: {eng_mean:.4f} Â± {eng_std:.4f} AUC-ROC")
print(f"Improvement: {(eng_mean - orig_mean):.4f} AUC-ROC")

# Keep the engineered features only if they scored strictly better;
# otherwise fall back to the original columns under the same variable names
# so the later cells work unchanged.
if eng_mean > orig_mean:
    print("âœ“ Using engineered features")
else:
    print("âœ“ Using original features")
    X_train_eng, X_val_eng, X_test_eng = X_train.copy(), X_val.copy(), X_test.copy()

## 12. Stacking Ensemble - 5-Fold Meta-Features

In [None]:
print("="*60)
print("STACKING ENSEMBLE - 5-FOLD CV")
print("="*60)

# Combine train/val for full dataset stacking. X and y are concatenated in
# the same split order, so they stay aligned by position.
X_stack = pd.concat([X_train_eng, X_val_eng], axis=0)
y_stack = pd.concat([y_train, y_val], axis=0)

# Fold count is defined once and reused for the splitter, the progress
# message and the test-prediction averaging.
n_stack_folds = 5

# shuffle=True is required: scikit-learn raises a ValueError when
# random_state is set while shuffle=False.
kf_stack = KFold(n_splits=n_stack_folds, shuffle=True, random_state=42)
meta_train = np.zeros((len(X_stack), 2))  # out-of-fold predictions, one column per base model
meta_test = np.zeros((len(X_test_eng), 2))  # fold-averaged test predictions

for fold, (train_idx, val_idx) in enumerate(kf_stack.split(X_stack)):
    print(f"Fold {fold+1}/{n_stack_folds}...", end=" ")

    # Train base models on this fold's training rows
    lgb_base = build_lgb()
    lgb_base.fit(X_stack.iloc[train_idx], y_stack.iloc[train_idx])

    xgb_base = build_xgb()
    xgb_base.fit(X_stack.iloc[train_idx], y_stack.iloc[train_idx])

    # Meta-features: predictions on rows the fold's models did not train on
    meta_train[val_idx, 0] = lgb_base.predict_proba(X_stack.iloc[val_idx])[:, 1]
    meta_train[val_idx, 1] = xgb_base.predict_proba(X_stack.iloc[val_idx])[:, 1]

    # Each fold contributes an equal share to the averaged test predictions
    meta_test[:, 0] += lgb_base.predict_proba(X_test_eng)[:, 1] / n_stack_folds
    meta_test[:, 1] += xgb_base.predict_proba(X_test_eng)[:, 1] / n_stack_folds

    lgb_val_auc = auc_score(y_stack.iloc[val_idx], meta_train[val_idx, 0])
    xgb_val_auc = auc_score(y_stack.iloc[val_idx], meta_train[val_idx, 1])
    print(f"LGB={lgb_val_auc:.4f}, XGB={xgb_val_auc:.4f}")

# Train meta-learner on the out-of-fold predictions. Ridge is a regressor,
# so its outputs are real-valued scores rather than probabilities; that is
# sufficient for a ranking metric such as AUC.
meta_learner = Ridge(alpha=1.0)
meta_learner.fit(meta_train, y_stack)

# Meta-learner predictions
stack_train_preds = meta_learner.predict(meta_train)
stack_test_preds = meta_learner.predict(meta_test)

# This AUC is measured on the same rows the meta-learner was fitted on.
print(f"\nStack train AUC: {auc_score(y_stack, stack_train_preds):.4f}")
print(f"Meta-learner weights: LGB={meta_learner.coef_[0]:.4f}, XGB={meta_learner.coef_[1]:.4f}")

## 13. Blending - Final Ensemble

In [None]:
print("=" * 60, "BLENDING - WEIGHTED ENSEMBLE", "=" * 60, sep="\n")

# Refit both base learners on every labelled row.
lgb_final = build_lgb()
xgb_final = build_xgb()
lgb_final.fit(X_stack, y_stack)
xgb_final.fit(X_stack, y_stack)

# Test-set scores from column 1 of predict_proba for each final model.
lgb_preds_test = lgb_final.predict_proba(X_test_eng)[:, 1]
xgb_preds_test = xgb_final.predict_proba(X_test_eng)[:, 1]

# Fixed-weight blend: 0.4 LightGBM + 0.4 XGBoost + 0.2 stacked meta-learner.
blend_preds = 0.4 * lgb_preds_test + 0.4 * xgb_preds_test + 0.2 * stack_test_preds

# Sanity-check the blended predictions.
print(f"\nâœ“ Blend weights: LGB=0.4, XGB=0.4, Stacking=0.2")
print(f"Blend predictions shape: {blend_preds.shape}")
print(f"Blend prediction range: [{blend_preds.min():.4f}, {blend_preds.max():.4f}]")
print(f"Mean: {blend_preds.mean():.4f}, Std: {blend_preds.std():.4f}")

## 14. Final Submission

In [None]:
print("="*60)
print("FINAL SUBMISSION")
print("="*60)

# Create submission: reuse the sample file's layout and overwrite the
# target column with the blended predictions.
submission_final = submission_sample.copy()
submission_final['Heart Disease'] = blend_preds

# The local output directory (./submissions) is not guaranteed to exist, and
# to_csv does not create missing parent directories, so create it first.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
submission_final.to_csv(OUTPUT_PATH / 'submission.csv', index=False)

print(f"\nâœ“ Submission saved to submission.csv")
print(f"  Format: Stacking (LGB + XGB) + Blending")
print(f"  Features: {X_test_eng.shape[1]} total")
print(f"\nSubmission preview:")
print(submission_final.head(10))
print(f"\nSubmission stats:")
print(f"  Records: {len(submission_final)}")
print(f"  Mean: {blend_preds.mean():.4f}")
print(f"  Min:  {blend_preds.min():.4f}")
print(f"  Max:  {blend_preds.max():.4f}")

## 15. Summary

In [None]:
# Recap of every pipeline stage, using the scores computed in earlier cells.
print("\n" + "="*70)
print("FINAL SUMMARY - HEART DISEASE PREDICTION")
print("="*70)

print("\nðŸ“Š PIPELINE:")
print(f"  âœ“ Baseline Logistic Regression: {val_auc:.4f} AUC")
print(f"  âœ“ LightGBM (5-fold): {lgb_mean:.4f} Â± {lgb_std:.4f}")
print(f"  âœ“ XGBoost (5-fold): {xgb_mean:.4f} Â± {xgb_std:.4f}")
print(f"  âœ“ CatBoost (5-fold): {cat_mean:.4f} Â± {cat_std:.4f}")
print(f"  âœ“ Feature Engineering: {X_test_eng.shape[1]} features")
print(f"  âœ“ Stacking Ensemble: 5-fold meta-features + Ridge")
print(f"  âœ“ Blending: LGB 0.4 + XGB 0.4 + Stacking 0.2")

print("\nðŸŽ¯ SUBMISSION:")
# Report the real output location: it is /kaggle/working on Kaggle and
# ./submissions locally, so a fixed path would be wrong on Kaggle.
print(f"  Location: {OUTPUT_PATH / 'submission.csv'}")
print(f"  Records: {len(submission_final)}")
print(f"  Prediction range: [{blend_preds.min():.4f}, {blend_preds.max():.4f}]")
print(f"  Status: âœ… Ready for upload")

print("\n" + "="*70)