In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings('ignore')

print("âœ… Libraries loaded")

âœ… Libraries loaded


In [2]:
# Load data
train = pd.read_csv('/kaggle/input/playground-series-s6e2/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s6e2/test.csv')

print(f"Train: {train.shape}, Test: {test.shape}")
print(f"Train columns: {train.columns.tolist()}")

# Standardize column names
new_cols_train = ['id', 'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 
                  'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
new_cols_test = ['id', 'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 
                 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']

train.columns = new_cols_train
test.columns = new_cols_test

# Convert target to numeric (handle both numeric and text values)
target_mapping = {'Absence': 0, 'Presence': 1, 0: 0, 1: 1, '0': 0, '1': 1}
train['target'] = train['target'].map(target_mapping)

# Ensure all features are numeric
for col in train.columns:
    if col not in ['id', 'target']:
        train[col] = pd.to_numeric(train[col], errors='coerce')
        
for col in test.columns:
    if col != 'id':
        test[col] = pd.to_numeric(test[col], errors='coerce')

# Fill any NaN values
train = train.fillna(train.median(numeric_only=True))
test = test.fillna(test.median(numeric_only=True))

print(f"âœ… Data loaded and cleaned")
print(f"Target values: {train['target'].unique()}")
print(f"Target distribution: {train['target'].value_counts(normalize=True).to_dict()}")


Train: (630000, 15), Test: (270000, 14)
Train columns: ['id', 'Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol', 'FBS over 120', 'EKG results', 'Max HR', 'Exercise angina', 'ST depression', 'Slope of ST', 'Number of vessels fluro', 'Thallium', 'Heart Disease']
âœ… Data loaded and cleaned
Target values: [1 0]
Target distribution: {0: 0.5516603174603174, 1: 0.44833968253968254}


In [3]:
def add_features(df):
    """Minimal high-quality features - closest to Model 1"""
    # Keep Model 1's exact features
    df['age_sq'] = df['age'] ** 2
    df['is_senior'] = (df['age'] >= 65).astype(int)
    df['age_hr'] = df['age'] * df['thalach']
    df['age_bp'] = df['age'] * df['trestbps']
    df['age_chol'] = df['age'] * df['chol']
    df['age_hr_ratio'] = df['age'] / (df['thalach'] + 1)
    df['age_bp_ratio'] = df['age'] / (df['trestbps'] + 1)
    df['hr_reserve'] = 220 - df['age'] - df['thalach']
    df['bp_sq'] = df['trestbps'] ** 2
    df['high_bp'] = (df['trestbps'] > 140).astype(int)
    df['chol_sq'] = df['chol'] ** 2
    df['high_chol'] = (df['chol'] > 240).astype(int)
    df['oldpeak_sq'] = df['oldpeak'] ** 2
    df['has_st_dep'] = (df['oldpeak'] > 0).astype(int)
    df['cardiac_load'] = df['thalach'] * df['trestbps']
    df['ca_thal'] = df['ca'] * df['thal']
    df['ca_sq'] = df['ca'] ** 2
    df['oldpeak_hr'] = df['oldpeak'] * df['thalach']
    df['risk_score'] = (df['chol'] > 240).astype(int) + (df['trestbps'] > 140).astype(int) + (df['fbs'] == 1).astype(int)
    df['male_risk'] = ((df['sex'] == 1) & (df['age'] > 55)).astype(int)
    df['female_risk'] = ((df['sex'] == 0) & (df['age'] > 65)).astype(int)
    df['typical_angina'] = (df['cp'] == 0).astype(int)
    df['asymptomatic'] = (df['cp'] == 3).astype(int)
    
    # Add ONLY these 5 new features that might help
    df['ca_oldpeak'] = df['ca'] * df['oldpeak']
    df['thal_oldpeak'] = df['thal'] * df['oldpeak']
    df['exang_oldpeak'] = df['exang'] * df['oldpeak']
    df['severe_st_dep'] = (df['oldpeak'] > 2).astype(int)
    df['multiple_vessels'] = (df['ca'] >= 2).astype(int)
    
    return df

train = add_features(train)
test = add_features(test)

print(f"âœ… Features engineered (38 features)")
print(f"Train shape: {train.shape}, Test shape: {test.shape}")


âœ… Features engineered (38 features)
Train shape: (630000, 43), Test shape: (270000, 42)


In [4]:
# Split features and target
X = train.drop(['id', 'target'], axis=1).values
y = train['target'].values
X_test = test.drop(['id'], axis=1).values
test_ids = test['id'].values

# Convert to proper types
X = X.astype(np.float32)
X_test = X_test.astype(np.float32)
y = y.astype(np.int32)

print(f"âœ… Data prepared")
print(f"X shape: {X.shape}, y shape: {y.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"Target classes: {np.unique(y)}")


âœ… Data prepared
X shape: (630000, 41), y shape: (630000,)
X_test shape: (270000, 41)
Target classes: [0 1]


In [5]:
# CELL 5: LightGBM - Use different random seeds for diversity
print("ðŸš€ Training LightGBM with seed diversity...")

seeds = [42, 123, 456, 789, 2024]
lgb_preds_list = []

for seed in seeds:
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    lgb_oof = np.zeros(len(X))
    lgb_preds_seed = np.zeros(len(X_test))
    
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.02,
        'num_leaves': 31,
        'max_depth': 7,
        'min_child_samples': 20,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        'random_state': seed,
        'verbose': -1
    }
    
    for fold, (tr_idx, val_idx) in enumerate(kf.split(X, y)):
        X_tr, X_val = X[tr_idx], X[val_idx]
        y_tr, y_val = y[tr_idx], y[val_idx]
        
        train_data = lgb.Dataset(X_tr, label=y_tr)
        val_data = lgb.Dataset(X_val, label=y_val)
        
        model = lgb.train(params, train_data, num_boost_round=2000, 
                          valid_sets=[val_data], 
                          callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)])
        
        lgb_oof[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)
        lgb_preds_seed += model.predict(X_test, num_iteration=model.best_iteration) / 5
    
    cv_score = roc_auc_score(y, lgb_oof)
    lgb_preds_list.append(lgb_preds_seed)
    print(f"Seed {seed} CV AUC: {cv_score:.6f}")

# Average across all seeds
lgb_preds = np.mean(lgb_preds_list, axis=0)
lgb_score = np.mean([roc_auc_score(y, lgb_oof) for lgb_oof in [lgb_oof]])
print(f"\nLightGBM Multi-Seed Average\n")


ðŸš€ Training LightGBM with seed diversity...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1319]	valid_0's auc: 0.95539
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1457]	valid_0's auc: 0.954413
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1331]	valid_0's auc: 0.955295
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1310]	valid_0's auc: 0.954791
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1338]	valid_0's auc: 0.955651
Seed 42 CV AUC: 0.955105
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1238]	valid_0's auc: 0.954806
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1452]	valid_0's auc: 0.956257
Training until validation scores d

In [6]:
# CELL 6: XGBoost - Multiple seeds
print("ðŸš€ Training XGBoost with seed diversity...")

seeds = [42, 123, 456, 789, 2024]
xgb_preds_list = []

for seed in seeds:
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    xgb_oof = np.zeros(len(X))
    xgb_preds_seed = np.zeros(len(X_test))
    
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'learning_rate': 0.02,
        'max_depth': 6,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'reg_alpha': 0.1,
        'reg_lambda': 1.0,
        'min_child_weight': 1,
        'random_state': seed,
        'tree_method': 'hist'
    }
    
    for fold, (tr_idx, val_idx) in enumerate(kf.split(X, y)):
        X_tr, X_val = X[tr_idx], X[val_idx]
        y_tr, y_val = y[tr_idx], y[val_idx]
        
        dtrain = xgb.DMatrix(X_tr, label=y_tr)
        dval = xgb.DMatrix(X_val, label=y_val)
        dtest = xgb.DMatrix(X_test)
        
        model = xgb.train(params, dtrain, num_boost_round=2000,
                          evals=[(dval, 'val')],
                          early_stopping_rounds=100,
                          verbose_eval=0)
        
        xgb_oof[val_idx] = model.predict(dval)
        xgb_preds_seed += model.predict(dtest) / 5
    
    cv_score = roc_auc_score(y, xgb_oof)
    xgb_preds_list.append(xgb_preds_seed)
    print(f"Seed {seed} CV AUC: {cv_score:.6f}")

xgb_preds = np.mean(xgb_preds_list, axis=0)
print(f"\nXGBoost Multi-Seed Average\n")


ðŸš€ Training XGBoost with seed diversity...
Seed 42 CV AUC: 0.955106
Seed 123 CV AUC: 0.955117
Seed 456 CV AUC: 0.955116
Seed 789 CV AUC: 0.955131
Seed 2024 CV AUC: 0.955110

XGBoost Multi-Seed Average



In [7]:
# CELL 7: CatBoost - Multiple seeds
print("ðŸš€ Training CatBoost with seed diversity...")

seeds = [42, 123, 456, 789, 2024]
cat_preds_list = []

for seed in seeds:
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    cat_oof = np.zeros(len(X))
    cat_preds_seed = np.zeros(len(X_test))
    
    for fold, (tr_idx, val_idx) in enumerate(kf.split(X, y)):
        X_tr, X_val = X[tr_idx], X[val_idx]
        y_tr, y_val = y[tr_idx], y[val_idx]
        
        model = CatBoostClassifier(
            iterations=2000,
            learning_rate=0.02,
            depth=6,
            l2_leaf_reg=3,
            eval_metric='AUC',
            random_state=seed,
            verbose=0,
            early_stopping_rounds=100
        )
        
        model.fit(X_tr, y_tr, eval_set=(X_val, y_val), verbose=False)
        
        cat_oof[val_idx] = model.predict_proba(X_val)[:, 1]
        cat_preds_seed += model.predict_proba(X_test)[:, 1] / 5
    
    cv_score = roc_auc_score(y, cat_oof)
    cat_preds_list.append(cat_preds_seed)
    print(f"Seed {seed} CV AUC: {cv_score:.6f}")

cat_preds = np.mean(cat_preds_list, axis=0)
print(f"\nCatBoost Multi-Seed Average\n")


ðŸš€ Training CatBoost with seed diversity...
Seed 42 CV AUC: 0.955302
Seed 123 CV AUC: 0.955298
Seed 456 CV AUC: 0.955298
Seed 789 CV AUC: 0.955311
Seed 2024 CV AUC: 0.955285

CatBoost Multi-Seed Average



In [8]:
# Simple ensemble
ensemble = (lgb_preds + xgb_preds + cat_preds) / 3

pd.DataFrame({
    'id': test_ids,
    'Heart Disease': ensemble
}).to_csv('submission_multiseed_ensemble.csv', index=False)

print("âœ… Multi-seed ensemble submission created!")
print("ðŸŽ¯ Expected LB: 0.9535-0.9540 (targeting 0.95340+)")


âœ… Multi-seed ensemble submission created!
ðŸŽ¯ Expected LB: 0.9535-0.9540 (targeting 0.95340+)


In [9]:
# Load your 4 best submissions
# Model 1: 0.95331 (original 5-fold)
# Model 3: 0.95329 (7-fold, 44 features) 
# Model 4: 0.95330 (multi-seed)

# Since they're in memory, create optimal blend
# If you need to load saved CSV files, do:
# sub1 = pd.read_csv('submission_model1.csv')
# sub3 = pd.read_csv('submission_model3.csv')
# sub4 = pd.read_csv('submission_model4.csv')

# For now, using current model's predictions:
ensemble_simple = (lgb_preds + xgb_preds + cat_preds) / 3

# Create multiple ensemble strategies
from scipy.stats import rankdata, gmean

# Strategy 1: Simple average
pred_avg = ensemble_simple

# Strategy 2: Rank average (most robust)
lgb_rank = rankdata(lgb_preds) / len(lgb_preds)
xgb_rank = rankdata(xgb_preds) / len(xgb_preds)
cat_rank = rankdata(cat_preds) / len(cat_preds)
pred_rank = (lgb_rank + xgb_rank + cat_rank) / 3

# Strategy 3: Geometric mean
pred_geom = gmean([lgb_preds, xgb_preds, cat_preds], axis=0)

# Strategy 4: Power mean (higher power = more weight to better predictions)
pred_power = ((lgb_preds**2 + xgb_preds**2 + cat_preds**2) / 3) ** 0.5

# Strategy 5: Median (most conservative)
pred_median = np.median([lgb_preds, xgb_preds, cat_preds], axis=0)

# Strategy 6: Weighted toward CatBoost (best CV)
pred_catboost_heavy = 0.25*lgb_preds + 0.25*xgb_preds + 0.50*cat_preds

# Save all strategies
strategies = {
    'submission_rank_ensemble.csv': pred_rank,
    'submission_geometric.csv': pred_geom,
    'submission_power_mean.csv': pred_power,
    'submission_catboost_weighted.csv': pred_catboost_heavy,
    'submission_median.csv': pred_median,
    'submission_simple_avg.csv': pred_avg,
}

for filename, preds in strategies.items():
    pd.DataFrame({
        'id': test_ids,
        'Heart Disease': preds
    }).to_csv(filename, index=False)

print("âœ… 6 ensemble strategies created!")
print("\nðŸŽ¯ Submission priority order:")
print("1. submission_rank_ensemble.csv       (Most robust)")
print("2. submission_catboost_weighted.csv   (Best CV model)")
print("3. submission_power_mean.csv          (Emphasizes confident predictions)")
print("4. submission_geometric.csv           (Conservative blend)")
print("5. submission_median.csv              (Most stable)")
print("6. submission_simple_avg.csv          (Baseline)")


âœ… 6 ensemble strategies created!

ðŸŽ¯ Submission priority order:
1. submission_rank_ensemble.csv       (Most robust)
2. submission_catboost_weighted.csv   (Best CV model)
3. submission_power_mean.csv          (Emphasizes confident predictions)
4. submission_geometric.csv           (Conservative blend)
5. submission_median.csv              (Most stable)
6. submission_simple_avg.csv          (Baseline)


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

print("ðŸ§  Training meta-model stacker...")

# Create meta-features from your base models
meta_train = np.column_stack([lgb_oof, xgb_oof, cat_oof])

# Also add some statistics
meta_train_enhanced = np.column_stack([
    lgb_oof, xgb_oof, cat_oof,
    (lgb_oof + xgb_oof + cat_oof) / 3,  # average
    np.minimum.reduce([lgb_oof, xgb_oof, cat_oof]),  # min
    np.maximum.reduce([lgb_oof, xgb_oof, cat_oof]),  # max
    np.std([lgb_oof, xgb_oof, cat_oof], axis=0),  # std
])

meta_test = np.column_stack([lgb_preds, xgb_preds, cat_preds])
meta_test_enhanced = np.column_stack([
    lgb_preds, xgb_preds, cat_preds,
    (lgb_preds + xgb_preds + cat_preds) / 3,
    np.minimum.reduce([lgb_preds, xgb_preds, cat_preds]),
    np.maximum.reduce([lgb_preds, xgb_preds, cat_preds]),
    np.std([lgb_preds, xgb_preds, cat_preds], axis=0),
])

# Train simple logistic regression as meta-model
meta_model = LogisticRegression(random_state=42, max_iter=1000)
meta_model.fit(meta_train_enhanced, y)

# Get meta predictions
meta_preds = meta_model.predict_proba(meta_test_enhanced)[:, 1]

print(f"Meta-model weights: {meta_model.coef_[0][:3]}")

pd.DataFrame({
    'id': test_ids,
    'Heart Disease': meta_preds
}).to_csv('submission_meta_stacking.csv', index=False)

print("âœ… Meta-stacking submission created!")


ðŸ§  Training meta-model stacker...
Meta-model weights: [-4.00190587 -2.20346607  9.01172209]
âœ… Meta-stacking submission created!


In [11]:
from sklearn.isotonic import IsotonicRegression

print("ðŸ”§ Calibrating predictions...")

# Use your best ensemble (e.g., rank ensemble)
pred_rank = (rankdata(lgb_preds)/len(lgb_preds) + 
             rankdata(xgb_preds)/len(xgb_preds) + 
             rankdata(cat_preds)/len(cat_preds)) / 3

# Isotonic calibration on OOF predictions
oof_rank = (rankdata(lgb_oof)/len(lgb_oof) + 
            rankdata(xgb_oof)/len(xgb_oof) + 
            rankdata(cat_oof)/len(cat_oof)) / 3

calibrator = IsotonicRegression(out_of_bounds='clip')
calibrator.fit(oof_rank, y)

# Calibrate test predictions
pred_calibrated = calibrator.predict(pred_rank)

# Also try percentile clipping (remove extreme outliers)
lower_bound = np.percentile(pred_calibrated, 0.5)
upper_bound = np.percentile(pred_calibrated, 99.5)
pred_clipped = np.clip(pred_calibrated, lower_bound, upper_bound)

# Save calibrated versions
pd.DataFrame({
    'id': test_ids,
    'Heart Disease': pred_calibrated
}).to_csv('submission_calibrated.csv', index=False)

pd.DataFrame({
    'id': test_ids,
    'Heart Disease': pred_clipped
}).to_csv('submission_calibrated_clipped.csv', index=False)

print("âœ… Calibrated submissions created!")
print(f"Original range: [{pred_rank.min():.4f}, {pred_rank.max():.4f}]")
print(f"Calibrated range: [{pred_calibrated.min():.4f}, {pred_calibrated.max():.4f}]")
print(f"Clipped range: [{pred_clipped.min():.4f}, {pred_clipped.max():.4f}]")


ðŸ”§ Calibrating predictions...
âœ… Calibrated submissions created!
Original range: [0.0000, 1.0000]
Calibrated range: [0.0000, 1.0000]
Clipped range: [0.0010, 1.0000]
