In [1]:
import pandas as pd
import numpy as np

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, log_loss

In [2]:
# Kaggle input directory for this competition; the trailing slash is relied on
# because file names are appended to it directly.
path = '/kaggle/input/playground-series-s6e2/'

# Read the train split first, then the test split, from the same directory.
train, test = (
    pd.read_csv(path + csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Quick sanity check of (rows, columns) for each split.
print(train.shape, test.shape)

(630000, 15) (270000, 14)


In [3]:
# Fit the encoder on the raw target column and overwrite that column with the
# resulting integer codes. `le` is kept so the mapping can be inspected later.
le = LabelEncoder()
target_codes = le.fit_transform(train['Heart Disease'])
train['Heart Disease'] = target_codes

# NumPy array of the encoded target, used later by the CV splitter and metrics.
y = train['Heart Disease'].values

In [9]:
def engineer_features(df):
    """Return a copy of ``df`` with nine derived feature columns appended.

    The input frame is not modified. It must contain the columns ``Age``,
    ``Max HR``, ``BP``, ``Cholesterol`` and ``Slope of ST``.

    Added columns, in order:
      * ``HR_Reserve``   - (220 - Age) - Max HR
      * ``BP_Age``       - BP * Age
      * ``Chol_Age``     - Cholesterol * Age
      * ``HR_Age_Ratio`` - Max HR / (Age + 1)
      * ``Is_Elderly``   - 1 if Age > 60 else 0
      * ``Is_High_BP``   - 1 if BP > 140 else 0
      * ``Is_High_Chol`` - 1 if Cholesterol > 240 else 0
      * ``Is_Tachy``     - 1 if Max HR > 150 else 0
      * ``Slope_HR``     - Slope of ST * Max HR
    """
    out = df.copy()

    # Base columns every derived feature is computed from.
    age = out['Age']
    max_hr = out['Max HR']
    bp = out['BP']
    chol = out['Cholesterol']

    derived = {
        # Core numeric interactions
        'HR_Reserve': (220 - age) - max_hr,
        'BP_Age': bp * age,
        'Chol_Age': chol * age,
        # +1 in the denominator avoids dividing by an Age of zero
        'HR_Age_Ratio': max_hr / (age + 1),
        # Binary risk indicators (strict thresholds, stored as ints)
        'Is_Elderly': (age > 60).astype(int),
        'Is_High_BP': (bp > 140).astype(int),
        'Is_High_Chol': (chol > 240).astype(int),
        'Is_Tachy': (max_hr > 150).astype(int),
        # Categorical–numeric interaction
        'Slope_HR': out['Slope of ST'] * max_hr,
    }

    # Insert one column at a time so the column order matches the list above.
    for column_name, values in derived.items():
        out[column_name] = values

    return out

In [10]:
# Apply the same feature engineering to both splits (train first, then test).
train, test = map(engineer_features, (train, test))


In [11]:
# Feature matrices: drop the row identifier from both splits and the target
# from the training split.
X = train.drop(columns=['id', 'Heart Disease'])
X_test = test.drop(columns=['id'])

# Columns to be treated as categorical by CatBoost.
cat_cols = [
    'Sex',
    'Chest pain type',
    'FBS over 120',
    'EKG results',
    'Exercise angina',
    'Slope of ST',
    'Number of vessels fluro',
    'Thallium',
]

# Positional indices of the categorical columns within X; these are passed to
# CatBoost as `cat_features` at fit time. (CatBoost also accepts column names
# for DataFrame input, so using indices is a choice, not a requirement.)
cat_features_idx = list(map(X.columns.get_loc, cat_cols))


In [12]:
# Shuffled, class-stratified 10-way splitter with a fixed seed.
skf = StratifiedKFold(random_state=42, shuffle=True, n_splits=10)

# One out-of-fold probability slot per training row.
oof_cb = np.zeros(X.shape[0])
# Receives one test-set probability vector per fold.
test_preds = list()


In [13]:
def get_catboost(seed):
    """Build an unfitted ``CatBoostClassifier`` with this notebook's settings.

    Parameters
    ----------
    seed : int
        Forwarded as ``random_seed``; every other hyper-parameter is fixed.

    Returns
    -------
    CatBoostClassifier
        A new, untrained classifier.
    """
    params = dict(
        # Up to 8000 trees; early stopping after 400 rounds is also configured.
        iterations=8000,
        learning_rate=0.015,
        depth=10,
        # Optimise log-loss while tracking AUC as the evaluation metric.
        loss_function='Logloss',
        eval_metric='AUC',
        # Regularisation / randomisation settings.
        l2_leaf_reg=20,
        bagging_temperature=0.8,
        random_strength=1.1,
        # GPU training. The recorded run logged that AUC is not implemented
        # for GPU and that the metric period therefore defaults to 5.
        task_type='GPU',
        random_seed=seed,
        early_stopping_rounds=400,
        # Silence per-iteration training logs.
        verbose=0,
    )
    return CatBoostClassifier(**params)


In [14]:
print("ðŸš€ Training CatBoost (10-Fold OOF)...")

# Re-create the accumulators inside this cell so it is idempotent: without
# this, re-running the cell would append another set of fold predictions to the
# existing `test_preds` list and silently change the test-set average.
oof_cb = np.zeros(len(X))
test_preds = []

for fold, (t_idx, v_idx) in enumerate(skf.split(X, y)):
    # Positional split of features (DataFrame) and target (NumPy array).
    xt, xv = X.iloc[t_idx], X.iloc[v_idx]
    yt, yv = y[t_idx], y[v_idx]

    # The fold index doubles as the model's random seed, so each fold's model
    # is seeded differently but reproducibly.
    cb = get_catboost(fold)
    # NOTE(review): the validation fold passed as eval_set is also the fold
    # scored for OOF below; if it drives early stopping, the OOF metrics may be
    # slightly optimistic - confirm this is acceptable.
    cb.fit(
        xt, yt,
        eval_set=(xv, yv),
        cat_features=cat_features_idx
    )

    # Column 1 of predict_proba is taken as the positive-class probability.
    oof_cb[v_idx] = cb.predict_proba(xv)[:, 1]
    # Each fold's model also scores the full test set; these are averaged later.
    test_preds.append(cb.predict_proba(X_test)[:, 1])

    print(f" Fold {fold + 1} completed")


🚀 Training CatBoost (10-Fold OOF)...


Default metric period is 5 because AUC is/are not implemented for GPU


✅ Fold 1 completed


Default metric period is 5 because AUC is/are not implemented for GPU


✅ Fold 2 completed


Default metric period is 5 because AUC is/are not implemented for GPU


✅ Fold 3 completed


Default metric period is 5 because AUC is/are not implemented for GPU


✅ Fold 4 completed


Default metric period is 5 because AUC is/are not implemented for GPU


✅ Fold 5 completed


Default metric period is 5 because AUC is/are not implemented for GPU


✅ Fold 6 completed


Default metric period is 5 because AUC is/are not implemented for GPU


✅ Fold 7 completed


Default metric period is 5 because AUC is/are not implemented for GPU


✅ Fold 8 completed


Default metric period is 5 because AUC is/are not implemented for GPU


✅ Fold 9 completed


Default metric period is 5 because AUC is/are not implemented for GPU


✅ Fold 10 completed


In [15]:
# Keep the probabilities strictly inside (0, 1) before computing log-loss.
oof_cb = np.clip(oof_cb, 1e-6, 1 - 1e-6)

print("OOF LogLoss:", log_loss(y, oof_cb))

# Score 300 evenly spaced cut-offs in [0.3, 0.65]; a row is predicted positive
# when its OOF probability is strictly greater than the cut-off.
scored_cutoffs = [
    (accuracy_score(y, (oof_cb > cutoff).astype(int)), cutoff)
    for cutoff in np.linspace(0.3, 0.65, 300)
]

# The (0, 0) sentinel is the fallback when no cut-off scores above zero, and
# `max` returns the first maximal entry, so ties go to the lowest cut-off.
best_acc, best_thr = max([(0, 0), *scored_cutoffs], key=lambda pair: pair[0])

print(f"Best Threshold: {best_thr:.3f}")
print(f"OOF Accuracy : {best_acc:.4f}")

OOF LogLoss: 0.26932029381685574
Best Threshold: 0.495
OOF Accuracy : 0.8881


In [16]:
# Average the per-fold test probabilities into one prediction per test row.
final_test_probs = np.mean(test_preds, axis=0)

# Binarise with the OOF-selected threshold, using the same strict ">" as the
# threshold search.
# NOTE(review): these are the LabelEncoder integer codes, not the original
# target values or probabilities - confirm this matches the expected
# submission format (`le.inverse_transform` would restore the original labels).
above_threshold = final_test_probs > best_thr
final_labels = above_threshold.astype(int)

submission = pd.DataFrame({'id': test['id'], 'Heart Disease': final_labels})

# Write without the DataFrame index so the file has only the two columns.
out_file = 'submission.csv'
submission.to_csv(out_file, index=False)
print("submission.csv generated successfully!")


🎯 submission.csv generated successfully!
