In [None]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import warnings
warnings.filterwarnings("ignore")


In [None]:

import os

# Preferred locations of the cleaned dataset, most specific first.
# If none of them exists, the raw 'heart.csv' in the working directory is read.
_preferred_paths = ('/mnt/data/heart_cleaned.csv', 'heart_cleaned.csv')
_csv_path = next(
    (path for path in _preferred_paths if os.path.exists(path)),
    'heart.csv',
)
df = pd.read_csv(_csv_path)

df.head()


In [None]:

# Feature engineering: derive age buckets and ratio features, impute missing
# values, encode categoricals and rescale numeric columns to [0, 1].
#
# NOTE: this cell mutates `df` in place and is not idempotent. After it runs,
# `age` is min-max scaled, so re-running it without re-loading the data would
# bin the scaled values instead of real ages.

# Left-closed bins so that each label matches its contents, e.g. '35-44' holds
# 35 <= age < 45. (With pandas' default right-closed bins, age 35 would land in
# '<35' and age 45 in '35-44'.) The open upper bound keeps very old ages in '65+'.
df['age_group'] = pd.cut(
    df['age'],
    bins=[0, 35, 45, 55, 65, np.inf],
    labels=['<35', '35-44', '45-54', '55-64', '65+'],
    right=False,
)

# Ratio features. A zero denominator yields +/-inf, which the scaler below
# rejects, so infinities are turned into NaN and imputed with the column mean.
if 'chol' in df.columns:
    df['chol_per_age'] = (df['chol'] / df['age']).replace([np.inf, -np.inf], np.nan)

if 'trestbps' in df.columns and 'chol' in df.columns:
    df['bp_chol_ratio'] = (df['trestbps'] / df['chol']).replace([np.inf, -np.inf], np.nan)

# Impute: mean for numeric columns, most frequent value for everything else.
for col in df.columns:
    if not df[col].isna().any():
        continue
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].fillna(df[col].mean())
    else:
        modes = df[col].mode()
        # An all-NaN column has no mode; leave it untouched instead of crashing.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

# Encode categoricals: binary columns become 0/1 labels, wider ones are one-hot
# encoded with the first level dropped. `age_group` therefore contributes one
# indicator column per bucket except '<35'.
cat_cols = df.select_dtypes(include=['object', 'category']).columns

for col in cat_cols:
    if df[col].nunique() <= 2:
        df[col] = LabelEncoder().fit_transform(df[col])
    else:
        df = pd.get_dummies(df, columns=[col], drop_first=True)

# Rescale numeric columns to [0, 1]. Min-max scaling is unaffected by a prior
# standardisation (both are affine maps), so a single MinMaxScaler pass gives
# the same result as StandardScaler followed by MinMaxScaler.
# NOTE(review): the scaler is fitted on every row before the train/test split
# in the next cell, so held-out statistics leak into the training features.
# NOTE(review): `age` (and `chol_per_age`) presumably remain among the features
# while the age-group indicators are used as targets - confirm this is intended.
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
if len(num_cols) > 0:
    df[num_cols] = MinMaxScaler().fit_transform(df[num_cols])

df.head()


In [None]:

# Positional layout expected after feature engineering:
# columns 0-15 are model inputs, columns 16-19 are the age-group indicators.
feature_frame = df.iloc[:, 0:16]
target_frame = df.iloc[:, 16:20]

# Ordered (unshuffled) split: the first 70% of rows train, the next 15% test.
# `random_state` has no effect while shuffle=False; it is kept so that turning
# shuffling on later stays reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    feature_frame,
    target_frame,
    train_size=0.7, test_size=0.15, random_state=42, shuffle=False
)

# Every row after the train and test blocks forms the validation set. The
# start row is derived from the actual split sizes rather than hard-coded, so
# the three sets stay disjoint for any dataset length (871 for 1025 rows).
validate_start = len(x_train) + len(x_test)
x_validate, y_validate = (
    feature_frame.iloc[validate_start:],
    target_frame.iloc[validate_start:],
)

assert len(x_validate) > 0, "validation split is empty - dataset too small for a 70/15/15 split"

def to_flat(arr2d, col_idx):
    """Return column `col_idx` (positional) of DataFrame `arr2d` as a new 1-D NumPy array."""
    column = arr2d.iloc[:, col_idx]
    return column.to_numpy(copy=True)

# For each split, build one flat label vector per target column
# (list position i corresponds to target column i).
y_train_groups, y_test_groups, y_validate_groups = (
    [to_flat(y_frame, pos) for pos in range(y_frame.shape[1])]
    for y_frame in (y_train, y_test, y_validate)
)


In [None]:

# Seed shared by every stochastic estimator so a fresh "Restart & Run All"
# reproduces the same rankings and metrics.
RANDOM_STATE = 42

# Name -> zero-argument factory. Factories (rather than instances) are stored
# so each target gets a freshly initialised, unfitted estimator.
model_constructors = {
    'Logistic': lambda: LogisticRegression(max_iter=1000),
    'DecisionTree': lambda: DecisionTreeClassifier(
        max_depth=200, max_leaf_nodes=50, random_state=RANDOM_STATE
    ),
    'RandomForest': lambda: RandomForestClassifier(
        max_depth=50, max_leaf_nodes=25, n_estimators=100, random_state=RANDOM_STATE
    ),
    'AdaBoost': lambda: AdaBoostClassifier(
        n_estimators=100, learning_rate=0.1, random_state=RANDOM_STATE
    ),
    'KNN': lambda: KNeighborsClassifier(n_neighbors=10, n_jobs=2),
    # probability=True fits an internal cross-validated calibration, which is
    # randomised - hence the seed here as well.
    'SVC': lambda: SVC(probability=True, random_state=RANDOM_STATE)
}


In [None]:

def metrics_from_binary(y_true, y_pred, y_prob=None):
    """Compute rounded binary-classification metrics.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_pred : array-like
        Predicted binary labels.
    y_prob : array-like, optional
        Scores or probabilities for the positive class, used only for ROC AUC.

    Returns
    -------
    dict
        Keys 'accuracy', 'precision', 'recall', 'f1', 'roc_auc', each rounded
        to 4 decimals. 'roc_auc' is None when `y_prob` is not given or the AUC
        is undefined (e.g. `y_true` contains a single class).
    """
    res = {
        'accuracy': round(accuracy_score(y_true, y_pred), 4),
        # zero_division=0 on all three: an undefined ratio is reported as 0
        # consistently, without emitting a warning.
        'precision': round(precision_score(y_true, y_pred, zero_division=0), 4),
        'recall': round(recall_score(y_true, y_pred, zero_division=0), 4),
        'f1': round(f1_score(y_true, y_pred, zero_division=0), 4),
        'roc_auc': None,
    }
    if y_prob is not None:
        try:
            auc = roc_auc_score(y_true, y_prob)
        except ValueError:
            # Raised when the AUC is undefined, e.g. only one class in y_true.
            # Only ValueError is caught so interrupts and genuine bugs surface.
            auc = None
        # Some scikit-learn versions may return NaN instead of raising; treat
        # that as "undefined" too so callers only ever see a number or None.
        if auc is not None and not np.isnan(auc):
            res['roc_auc'] = round(auc, 4)
    return res

def safe_probs(clf, X):
    """Return a 1-D array of positive-class scores in [0, 1] for `X`.

    The first available strategy is used:
    1. `predict_proba` - column 1 (the second class) is returned.
    2. `decision_function` - raw margins are squashed with a logistic sigmoid.
    3. `predict` - hard labels are cast to float.

    Parameters
    ----------
    clf : fitted estimator
        Object exposing at least one of the three methods above.
    X : array-like
        Samples to score; passed through to the estimator unchanged.
    """
    if hasattr(clf, 'predict_proba'):
        proba = np.asarray(clf.predict_proba(X))
        if proba.ndim == 2 and proba.shape[1] == 1:
            # Estimator was fitted on a single class, so there is no column 1.
            # Assumes 0/1 (or False/True) labels: the score is 1 when the only
            # class seen is the positive one, otherwise 0.
            classes = getattr(clf, 'classes_', None)
            only_positive = (
                classes is not None and len(classes) == 1 and bool(classes[0])
            )
            return np.full(proba.shape[0], 1.0 if only_positive else 0.0)
        return proba[:, 1]
    elif hasattr(clf, 'decision_function'):
        margins = np.asarray(clf.decision_function(X), dtype=float)
        # Numerically stable sigmoid: exp() only ever sees non-positive values,
        # so large-magnitude margins cannot overflow.
        decay = np.exp(-np.abs(margins))
        return np.where(margins >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    else:
        return clf.predict(X).astype(float)


In [None]:

# Per-target evaluation: for every age-group indicator, fit all candidate
# models, keep the three with the best validation ROC AUC, and compare them
# with three ensembles (plain average, soft voting, likelihood-weighted "BMA").
#
# NOTE: the top-3 selection and the BMA weights are both derived from the
# validation split, so the val_* columns are optimistic; the test_* columns
# are the unbiased comparison.
results_summary = {}

# Metric names reported per split, in output-column order.
METRIC_KEYS = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')

for grp_idx in range(len(y_train_groups)):

    print("\n------------------------------------------------------------")
    print(f'Age-group target index: {grp_idx}')
    print("------------------------------------------------------------")

    ytr = y_train_groups[grp_idx]
    yval = y_validate_groups[grp_idx]
    yte = y_test_groups[grp_idx]

    trained = {}
    val_probs = {}
    test_probs = {}
    val_metrics = {}

    # Fit every candidate; a model that cannot be fitted is reported and skipped.
    for name, ctor in model_constructors.items():
        clf = ctor()
        try:
            clf.fit(x_train, ytr)
        except Exception as e:
            print(f'WARNING: {name} failed:', e)
            continue

        trained[name] = clf
        val_probs[name] = safe_probs(clf, x_validate)
        test_probs[name] = safe_probs(clf, x_test)

        val_pred_label = (val_probs[name] >= 0.5).astype(int)
        val_metrics[name] = metrics_from_binary(yval, val_pred_label, val_probs[name])

    if not trained:
        # Nothing to rank or ensemble for this target.
        print(f'WARNING: no model could be fitted for target index {grp_idx}; skipping.')
        continue

    def auc_or_neg(name):
        """Sort key: validation ROC AUC, or -1 when it is undefined (None/NaN)."""
        auc = val_metrics[name]['roc_auc']
        if auc is None or np.isnan(auc):
            return -1
        return auc

    top3 = sorted(trained.keys(), key=auc_or_neg, reverse=True)[:3]
    print('Top-3 models:', top3)

    # Ensemble 1: unweighted mean of the top-3 probabilities.
    avg_val_prob = np.mean([val_probs[m] for m in top3], axis=0)
    avg_test_prob = np.mean([test_probs[m] for m in top3], axis=0)

    # Ensemble 2: soft-voting classifier refitted from fresh estimators.
    vote_estimators = [(m, model_constructors[m]()) for m in top3]
    try:
        vote_clf = VotingClassifier(vote_estimators, voting='soft')
        vote_clf.fit(x_train, ytr)
        vote_val_prob = vote_clf.predict_proba(x_validate)[:, 1]
        vote_test_prob = vote_clf.predict_proba(x_test)[:, 1]
    except Exception as e:
        # Best-effort fallback, but say so instead of hiding the failure.
        print('WARNING: VotingClassifier failed, using top-3 average instead:', e)
        vote_val_prob = avg_val_prob
        vote_test_prob = avg_test_prob

    # Ensemble 3: weights proportional to each model's validation likelihood.
    eps = 1e-12  # keeps log() finite for probabilities of exactly 0 or 1
    yval_float = np.asarray(yval, dtype=float)
    loglikes = {}

    for m in top3:
        p = np.clip(val_probs[m], eps, 1 - eps)
        loglikes[m] = np.sum(yval_float * np.log(p) + (1 - yval_float) * np.log(1 - p))

    ll_vals = np.array([loglikes[m] for m in top3])
    # Subtract the max before exponentiating so exp() cannot underflow to all zeros.
    w_unnorm = np.exp(ll_vals - ll_vals.max())
    weights = w_unnorm / w_unnorm.sum()

    bma_val_prob = weights @ np.vstack([val_probs[m] for m in top3])
    bma_test_prob = weights @ np.vstack([test_probs[m] for m in top3])

    def add_row(name, vp, tp):
        """Build one result row from validation (`vp`) and test (`tp`) probabilities.

        Labels are obtained by thresholding at 0.5; the metrics for each split
        are computed once and then spread over the val_*/test_* columns.
        """
        val_scores = metrics_from_binary(yval, (vp >= 0.5).astype(int), vp)
        test_scores = metrics_from_binary(yte, (tp >= 0.5).astype(int), tp)
        row = {'model': name}
        for key in METRIC_KEYS:
            row[f'val_{key}'] = val_scores[key]
        for key in METRIC_KEYS:
            row[f'test_{key}'] = test_scores[key]
        return row

    table = [add_row(m, val_probs[m], test_probs[m]) for m in top3]
    table.append(add_row('Average_top3', avg_val_prob, avg_test_prob))
    table.append(add_row('Voting_top3', vote_val_prob, vote_test_prob))
    table.append(add_row('BMA_top3', bma_val_prob, bma_test_prob))

    results_summary[f'group_{grp_idx}'] = {
        'top3': top3,
        'bma_weights': dict(zip(top3, weights)),
        'eval': pd.DataFrame(table).set_index('model')
    }

    display(results_summary[f'group_{grp_idx}']['eval'])
