## Predicting Heart Diseases: 

### Steps to Follow: 
1) Load Data
2) Check Data Quality
3) Clean Data
    1) Check for missing Data 
    2) Review Data Distribution for Each Feature
4) Check Data Distribution for Target
5) Stratified K-Folds for Model Comparison
6) Choose Best Performing Model using AUC Score or F1 Score.
7) Hyperparameter Tuning
8) Look at Feature Importance
9) Get Predictions and Store in "submission.csv"

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.metrics import classification_report, f1_score

from sklearn.linear_model import LogisticRegression

import lightgbm as lgb
import xgboost as xgb
import catboost as ctb

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

import optuna

from sklearn.inspection import permutation_importance
import warnings
# Suppress every warning so library chatter does not clutter cell output.
warnings.filterwarnings("ignore")

# Apply seaborn's default theme to the plots drawn below.
sns.set_theme()

: 

## Load Data

In [None]:
class Config:
    """Locations of the competition CSV files read by this notebook."""

    # Training CSV (read into `train_data`).
    train_path = "/kaggle/input/playground-series-s6e2/train.csv"
    # Test CSV (read into `test_data` and again when building the submission).
    test_path = "/kaggle/input/playground-series-s6e2/test.csv"

In [None]:
# Read the train and test CSVs from the paths declared in Config.
train_data = pd.read_csv(Config.train_path)
test_data = pd.read_csv(Config.test_path)

# Last expression of the cell: preview the first rows of the training frame.
train_data.head()

In [None]:
# Report the number of rows in each split.
print("Total Train Samples:", len(train_data))
print("Total Test Samples:", len(test_data))

## Check Data Quality

In [None]:
# Check for missing data: number of null entries per column.
print("Null Counts for Each Column:\n")
display(train_data.isna().sum())

# Column dtypes and non-null counts. DataFrame.info() prints its summary
# itself and returns None, so it is called directly; wrapping it in display()
# would render a stray "None" underneath the summary.
print("Data Type for Each Column:\n")
train_data.info()

In [None]:
# Draw one histogram per feature column.
# `target` names the label column; `features` is every other column once the
# first column of the frame has been skipped.
target = "Heart Disease"
features = [column for column in train_data.columns[1:] if column != target]

for column in features:
    train_data[column].hist()
    plt.title(f"Distribution of {column}")
    plt.xlabel("Value")
    plt.ylabel("Counts")
    plt.show()

In [None]:
# Plot the distribution of the target column to check class balance.
train_data[target].hist()
plt.title(f"Distribution of {target}")
plt.xlabel("Value")
plt.ylabel("Counts")
plt.show()

## Stratified K-Folds for Model Comparison

In [None]:
def evaluate_model(ModelClass, X, y, splits=4, verbose=False, **kwargs):
    """Cross-validate a classifier and summarise its per-fold F1 scores.

    For every stratified fold a fresh ``StandardScaler`` is fitted on the
    training part only and applied to both parts, a new
    ``ModelClass(**kwargs)`` is trained, and the F1 score of its predictions
    on the held-out part is recorded.

    Args:
        ModelClass: Estimator class exposing ``fit`` and ``predict``.
        X: DataFrame containing the columns named in the module-level
            ``features`` list; only those columns are used.
        y: Encoded labels, positionally aligned with the rows of ``X``.
        splits: Number of stratified folds.
        verbose: When true, print the fold sizes, the fold's F1 score and a
            classification report (labelled via the module-level
            ``label_encoder``).
        **kwargs: Forwarded unchanged to ``ModelClass``.

    Returns:
        Tuple ``(mean, std)`` of the per-fold F1 scores.
    """
    # Work on the frame the caller passed in rather than a global one.
    X_features = X[features]
    # Plain array so that fold indices always select labels by position.
    y = np.asarray(y)

    skf = StratifiedKFold(n_splits=splits)
    f1_scores = []
    for i, (train_index, test_index) in enumerate(skf.split(X_features, y)):
        if verbose:
            print(f"Fold {i}:")
            print("Train Samples:", len(train_index))
            print("Test Samples:", len(test_index))

        # Fold indices are positional, hence .iloc (not label-based .loc).
        # The scaler is refitted per fold so no held-out statistics leak in.
        scaler = StandardScaler().set_output(transform="pandas")
        X_train = scaler.fit_transform(X_features.iloc[train_index])
        y_train = y[train_index]
        X_test = scaler.transform(X_features.iloc[test_index])
        y_test = y[test_index]

        model = ModelClass(**kwargs)
        model.fit(X_train, y_train)

        # Hard class predictions (not probabilities), as f1_score requires.
        y_pred = model.predict(X_test)

        score = f1_score(y_test, y_pred)
        f1_scores.append(score)

        if verbose:
            # Print the computed score, not the f1_score function object.
            print("F1-Score:", score)
            # The report is only built when it is actually going to be shown.
            print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

    return np.mean(f1_scores), np.std(f1_scores)

In [None]:
# Encode the target labels as integers for the classifiers.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train_data[target])

print('Total Classes:', len(label_encoder.classes_))
print("Classes:", label_encoder.classes_)

# XGBoost has no `class_weight` option, so passing it leaves the model
# unweighted while the other candidates are balanced. For a binary target the
# equivalent of "balanced" weighting is scale_pos_weight = #negatives / #positives.
class_counts = np.bincount(y)
xgb_pos_weight = class_counts[0] / class_counts[1]

# One entry per candidate model: the keys are evaluate_model's arguments, and
# any extra keys are forwarded to the model constructor.
exps = [
    {"ModelClass": LogisticRegression, "X": train_data, "y": y, "class_weight":"balanced"}, 
    {"ModelClass": GaussianNB, "X": train_data, "y": y},
    # {"ModelClass": RandomForestClassifier, "X": train_data, "y": y, "class_weight":"balanced"},
    # {"ModelClass": DecisionTreeClassifier, "X": train_data, "y": y, "class_weight":"balanced"},
    # {"ModelClass": SVC, "X": train_data, "y": y, "class_weight":"balanced"},
    # {"ModelClass": KNeighborsClassifier, "X": train_data, "y": y},
    # {"ModelClass": GradientBoostingClassifier, "X": train_data, "y": y},
    # CatBoost rejects more than one of its verbosity options (verbose /
    # logging_level / silent / verbose_eval) being set together, so only
    # logging_level is passed.
    {"ModelClass": ctb.CatBoostClassifier, "X": train_data, "y": y, "logging_level":"Silent"},
    {"ModelClass": lgb.LGBMClassifier, "X": train_data, "y": y, "class_weight":"balanced", "verbosity": 0}, 
    {"ModelClass": xgb.XGBClassifier, "X": train_data, "y": y, "scale_pos_weight": xgb_pos_weight, "verbosity": 0},
]

# Cross-validate every candidate and report mean / spread of its F1 scores.
for param in exps: 
    mean, std = evaluate_model(**param)
    print(f"Model: {param['ModelClass'].__name__}\nMean: {mean} | STD: {std}")

## Hyperparameter Tuning

In [None]:
# Shuffled, seeded folds shared by every trial of the study.
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

def objective(trial):
    """Optuna objective: mean cross-validated F1 of a LightGBM classifier.

    Samples one hyperparameter set from ``trial``, trains a fresh
    ``LGBMClassifier`` on each fold of the module-level ``skf`` (features are
    standardised with a scaler fitted on the training part of the fold), and
    scores its predictions on the validation part with F1. After each fold the
    running mean is reported to the trial so the study's pruner can stop it.

    Args:
        trial: The ``optuna`` trial supplying the hyperparameter suggestions.

    Returns:
        Mean F1 score over all folds.

    Raises:
        optuna.exceptions.TrialPruned: If the trial reports that it should be
            pruned after a fold.
    """
    # Settings held constant across all trials.
    fixed_params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "boosting_type": "gbdt",
        "verbosity": -1,
        "n_jobs": -1,
        "class_weight": "balanced",
    }
    # Search space; the suggest calls are kept in their original order.
    searched_params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 16, 512),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 75, 125),
    }
    params = {**fixed_params, **searched_params}

    feature_frame = train_data[features]
    fold_scores = []

    for fold_no, (train_idx, val_idx) in enumerate(skf.split(feature_frame, y), start=1):
        # Fit the scaler on the training part only, then apply it to both parts.
        fold_scaler = StandardScaler().set_output(transform="pandas")
        X_fit = fold_scaler.fit_transform(feature_frame.iloc[train_idx])
        X_val = fold_scaler.transform(feature_frame.iloc[val_idx])
        y_fit = y[train_idx]
        y_val = y[val_idx]

        booster = lgb.LGBMClassifier(**params)
        booster.fit(
            X_fit,
            y_fit,
            eval_set=[(X_val, y_val)],
            eval_metric="binary_logloss",
            callbacks=[
                lgb.early_stopping(100, verbose=False),
                lgb.log_evaluation(0),
            ],
        )

        fold_scores.append(f1_score(y_val, booster.predict(X_val)))

        # Report the running mean (step = number of folds done) for pruning.
        trial.report(np.mean(fold_scores), step=fold_no)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return np.mean(fold_scores)

In [None]:
# Study that maximises the objective's return value, using a seeded TPE
# sampler and a median pruner.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner()
)

# Run 100 trials of the objective.
study.optimize(objective, n_trials=100)

# Report the best objective value and the parameters that produced it.
print("Best F1:", study.best_value)
print("Best Params:", study.best_params)

## Retrain Best Model with Best Hyperparameters

In [None]:
# Hold out a stratified 10% of the training data for a final check.
X_train, X_test, y_train, y_test = train_test_split(train_data[features], y, test_size=0.1, random_state=42, stratify=y)

# Fit the scaler on the training part only; the same fitted scaler is reused
# later to transform the competition test set.
scaler = StandardScaler().set_output(transform="pandas")
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# NOTE(review): only class_weight, verbosity and study.best_params are passed
# here; the other fixed settings from objective's params dict (objective,
# metric, boosting_type, n_jobs) are left at the library defaults, and no
# early stopping is used — confirm this is intended.
model = lgb.LGBMClassifier(class_weight="balanced", verbosity=0, **study.best_params)
model.fit(X_train, y_train)

In [None]:
# F1 score of the retrained model on the held-out 10% split.
f1_score(y_test, model.predict(X_test))

## Evaluate Feature Importance

In [None]:
# Permutation importance of the retrained model on the held-out split,
# using 30 repeats per feature and a fixed seed.
# NOTE(review): no `scoring` argument is given, so the estimator's default
# scorer is used rather than F1 — confirm that is the intended metric.
perm_importance = permutation_importance(model, X_test, y_test, n_repeats=30, random_state=42, n_jobs=-1)
# One row per feature; assumes the column order of X_test matches `features`
# (X_test was built from train_data[features]).
perm_importance_df = pd.DataFrame({
    'Feature': features,
    'Importance Mean': perm_importance.importances_mean,
    'Importance Std': perm_importance.importances_std
})
print("\nPermutation Importance:")
# Most important features first.
print(perm_importance_df.sort_values(by='Importance Mean', ascending=False))

## Get Predictions and Store in "submission.csv"

In [None]:
# Show the original class labels in encoded order (index = encoded value).
print(label_encoder.classes_)

In [None]:
# Build the submission from the competition test set.
# NOTE(review): `test_data` already holds this same file; it is re-read here.
test_df = pd.read_csv(Config.test_path)
# Overwrites the hold-out X_test with the competition test features.
X_test = test_df[features]
# Reuse the scaler fitted on the training split of the retrain step.
X_test = scaler.transform(X_test)
y_prob = model.predict_proba(X_test)
# Column 1 of predict_proba is submitted as the prediction; presumably this is
# the probability of label_encoder.classes_[1] — verify it is the positive class.
submission = pd.DataFrame({
    "id": test_df["id"], 
    "Heart Disease": y_prob[:, 1]
})
submission.to_csv("submission.csv", index=False)

In [None]:
# Preview the first rows of the submission frame.
submission.head()