In [1]:
# --- INSTALL LIBRARIES (If needed) ---
# Un-comment the line below if you get a "No module named catboost" error
# !pip install catboost xgboost lightgbm scikit-learn pandas numpy

import os
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import warnings

warnings.filterwarnings('ignore')
print("1. SEARCHING FOR DATA ")
train_path = None
test_path = None

# Search every folder in the input directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        path = os.path.join(dirname, filename)
        if 'train' in filename.lower() and filename.endswith('.csv'):
            train_path = path
        elif 'test' in filename.lower() and filename.endswith('.csv'):
            test_path = path

if not train_path or not test_path:
    print("CRITICAL ERROR: Could not find 'train.csv' or 'test.csv'.")
    print("Files found in /kaggle/input:")
    for dirname, _, filenames in os.walk('/kaggle/input'):
        for filename in filenames:
            print(os.path.join(dirname, filename))
    raise FileNotFoundError("Stopping execution because data was not found.")

print(f"Found Train: {train_path}")
print(f"Found Test:  {test_path}")

# Load the data
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# 2. AUTOMATIC TARGET DETECTION
print("\n 2. DETECTING TARGET COLUMN ")

# List of common names for the target column
candidate_names = ['Heart Disease', 'HeartDisease', 'target', 'Target', 'output', 'Output', 'class', 'Class']
target_col = None

# 1. Check if any candidate name exists exactly
for name in candidate_names:
    if name in train.columns:
        target_col = name
        break

# 2. If not found, search for partial matches (e.g. "Heart Disease " with a space)
if target_col is None:
    for col in train.columns:
        if 'heart' in col.lower() or 'target' in col.lower() or 'output' in col.lower():
            target_col = col
            break

if target_col is None:
    raise ValueError(f"Could not automatically find the target column. Available columns: {train.columns.tolist()}")

print(f"SUCCESS: Target column identified as '{target_col}'")

# 3. PREPROCESSING
print("\n 3. PREPROCESSING DATA ")

# Separate Features (X) and Target (y)
X = train.drop(columns=['id', target_col], errors='ignore')
y = train[target_col]
X_test = test.drop(columns=['id'], errors='ignore')

# FIX TARGET LABELS: Convert text (e.g., "Presence"/"Absence") to 1/0
if y.dtype == 'object':
    unique_vals = y.unique()
    print(f"Detected text labels in target: {unique_vals}")
    # We assume the less frequent one is usually the 'positive' case, 
    # but for safety, we map them explicitly if they match known patterns.
    if 'Presence' in unique_vals:
        mapping = {'Presence': 1, 'Absence': 0}
    elif 'Yes' in unique_vals:
        mapping = {'Yes': 1, 'No': 0}
    else:
        # Fallback: Map the second unique value to 1, first to 0
        mapping = {unique_vals[0]: 0, unique_vals[1]: 1}
    
    y = y.map(mapping)
    print(f"Mapped labels using: {mapping}")

# ENCODE CATEGORICAL COLUMNS
print("Encoding categorical features...")
for col in X.columns:
    # Check if column is text (object) in either train or test
    if X[col].dtype == 'object' or X_test[col].dtype == 'object':
        le = LabelEncoder()
        # Combine both to ensure we know all categories
        combined = pd.concat([X[col], X_test[col]], axis=0).astype(str)
        le.fit(combined)
        
        X[col] = le.transform(X[col].astype(str))
        X_test[col] = le.transform(X_test[col].astype(str))

# 4. TRAINING ENSEMBLE (XGB + LGBM + CAT)
print("\n 4. TRAINING ENSEMBLE ")

# Initialize Stratified K-Fold
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Arrays to store predictions
oof_preds = np.zeros(X.shape[0])
test_preds = np.zeros(X_test.shape[0])

# Define Models
xgb_model = xgb.XGBClassifier(n_estimators=1000, learning_rate=0.05, max_depth=6, subsample=0.8, n_jobs=-1, random_state=42)
lgb_model = lgb.LGBMClassifier(n_estimators=1000, learning_rate=0.05, num_leaves=31, n_jobs=-1, random_state=42, verbose=-1)
cat_model = CatBoostClassifier(iterations=1000, learning_rate=0.05, depth=6, verbose=0, random_state=42, allow_writing_files=False)

for fold, (train_idx, val_idx) in enumerate(folds.split(X, y)):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # Train XGBoost
    xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    p1 = xgb_model.predict_proba(X_val)[:, 1]
    t1 = xgb_model.predict_proba(X_test)[:, 1]

    # Train LightGBM
    lgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
    p2 = lgb_model.predict_proba(X_val)[:, 1]
    t2 = lgb_model.predict_proba(X_test)[:, 1]

    # Train CatBoost
    cat_model.fit(X_train, y_train, eval_set=(X_val, y_val))
    p3 = cat_model.predict_proba(X_val)[:, 1]
    t3 = cat_model.predict_proba(X_test)[:, 1]

    # Average predictions (Ensemble)
    avg_pred = (p1 + p2 + p3) / 3
    oof_preds[val_idx] = avg_pred
    test_preds += (t1 + t2 + t3) / 3 / folds.get_n_splits()
    
    # Calculate accuracy for this fold (using standard 0.5 threshold for display)
    fold_acc = accuracy_score(y_val, (avg_pred > 0.5).astype(int))
    print(f"Fold {fold+1} Accuracy: {fold_acc:.5f}")


# 5. THRESHOLD OPTIMIZATION
print("\n 5. OPTIMIZING THRESHOLD")
best_acc = 0
best_thresh = 0.5

# Test 1000 different thresholds to find the perfect cut-off
for thresh in np.linspace(0.2, 0.8, 1000):
    score = accuracy_score(y, (oof_preds > thresh).astype(int))
    if score > best_acc:
        best_acc = score
        best_thresh = thresh

print(f"BEST OOF ACCURACY: {best_acc:.5f}")
print(f"BEST THRESHOLD:    {best_thresh:.5f}")

# 6. GENERATE SUBMISSION
print("\n 6. SAVING SUBMISSION ")

# Apply the best threshold to the test predictions
final_labels = (test_preds > best_thresh).astype(int)

# Create DataFrame
submission = pd.DataFrame({
    'id': test['id'],
    target_col: final_labels
})

# Save to CSV
submission.to_csv('submission.csv', index=False)
print("SUCCESS: 'submission.csv' has been saved!")

1. SEARCHING FOR DATA 
Found Train: /kaggle/input/playground-series-s6e2/train.csv
Found Test:  /kaggle/input/playground-series-s6e2/test.csv

 2. DETECTING TARGET COLUMN 
SUCCESS: Target column identified as 'Heart Disease'

 3. PREPROCESSING DATA 
Detected text labels in target: ['Presence' 'Absence']
Mapped labels using: {'Presence': 1, 'Absence': 0}
Encoding categorical features...

 4. TRAINING ENSEMBLE 
Fold 1 Accuracy: 0.88986
Fold 2 Accuracy: 0.88737
Fold 3 Accuracy: 0.88918
Fold 4 Accuracy: 0.88788
Fold 5 Accuracy: 0.88888

 5. OPTIMIZING THRESHOLD
BEST OOF ACCURACY: 0.88871
BEST THRESHOLD:    0.50811

 6. SAVING SUBMISSION 
SUCCESS: 'submission.csv' has been saved!
