In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
import seaborn as sns
#import pandas_profiling
from pydantic_settings import BaseSettings
%matplotlib inline
df = pd.read_csv('framingham_extended.csv')
df_hd = pd.read_csv('heart_disease_fixed.csv')

In [50]:
# Filling out missing values
df['BPMeds'].fillna(0, inplace = True)
df['glucose'].fillna(df.glucose.mean(), inplace = True)
df['totChol'].fillna(df.totChol.mean(), inplace = True)
df['education'].fillna(1, inplace = True)
df['BMI'].fillna(df.BMI.mean(), inplace = True)
df['heartRate'].fillna(df.heartRate.mean(), inplace = True)
df.isna().sum()
df['fbs'] = (df['glucose'] > 120).astype(int)  # engineer 'fbs' since it's not available

# Filling values for heart_disease dataset
df_hd['trestbps'].fillna(df_hd.trestbps.mean(), inplace = True)
df_hd['chol'].fillna(df_hd.chol.mean(), inplace = True)
df_hd['result'] = (df_hd['target'] > 0).astype(int)

# Fill in fbs column using sampling
df_hd['fbs'] = pd.to_numeric(df_hd['fbs'], errors='coerce')
fbs_dist = df_hd['fbs'].dropna().value_counts(normalize=True)
p_0 = fbs_dist.get(0.0, 0)
p_1 = fbs_dist.get(1.0, 0)
total = p_0 + p_1
p_0 /= total
p_1 /= total
num_missing = df_hd['fbs'].isna().sum()
random_fill = np.random.choice([0, 1], size=num_missing, p=[p_0, p_1])
df_hd.loc[df_hd['fbs'].isna(), 'fbs'] = random_fill

shared_features = ['age', 'sex', 'trestbps', 'chol', 'fbs']

df.rename(columns={
    'male': 'sex',
    'sysBP': 'trestbps',
    'totChol': 'chol',
    'heartRate': 'thalach'
}, inplace=True)

df['result'] = df['TenYearCHD'].astype(int)

df_combined_fram = pd.concat([df[shared_features+['result']], df_hd[shared_features+['result']]])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['BPMeds'].fillna(0, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['glucose'].fillna(df.glucose.mean(), inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting value

In [51]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# separate independent & dependent variables
X = df.loc[:,shared_features]  #independent columns
y = df.iloc[:,-1]    #target column i.e CHD

X_hd = df_hd.loc[:,shared_features]
y_hd = df_hd.loc[:,['result']].iloc[:,0]

X_concat_fram = df_combined_fram.loc[:,shared_features]
y_concat_fram = df_combined_fram.loc[:,['result']].iloc[:,0]




In [52]:
from sklearn.model_selection import train_test_split

#y = df['TenYearCHD'] #target variable
#X = df.drop(['TenYearCHD'], axis = 1) #features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

X_hd_train, X_hd_test, y_hd_train, y_hd_test = train_test_split(X_hd, y_hd, test_size=0.2)

X_concat_fram_train, X_concat_fram_test, y_concat_fram_train, y_concat_fram_test = train_test_split(X_concat_fram, y_concat_fram, test_size=0.2)

print (X_hd_train.shape, y_hd_train.shape)
print (X_hd_test.shape, y_hd_test.shape)
print (X_concat_fram_train.shape, y_concat_fram_train.shape)
print (X_concat_fram_test.shape, y_concat_fram_test.shape)

(4603, 5) (4603,)
(1151, 5) (1151,)
(736, 5) (736,)
(184, 5) (184,)
(5339, 5) (5339,)
(1335, 5) (1335,)


# ECE228 optimization: MLP

In [53]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np


# 1. Configurable MLP Model
class ConfigurableMLP(nn.Module):
    # Note this is supposed to be a binary classification task, bacause the target output "10 year CHD" is 
    # either 1 or 0
    def __init__(self, input_dim, hidden_dim, num_hidden_layers=2,
                 activation='relu', dropout_rate=0.0):
        super().__init__()

        # Choose activation function
        if activation == 'relu':
            activation_fn = nn.ReLU()
        elif activation == 'sigmoid':
            activation_fn = nn.Sigmoid()
        elif activation == 'tanh':
            activation_fn = nn.Tanh()
        else:
            raise ValueError(f"Unsupported activation: {activation}")

        layers = []

        # Input layer
        layers.append(nn.Linear(input_dim, hidden_dim))
        layers.append(activation_fn)

        # Hidden layers
        for _ in range(num_hidden_layers):
            if dropout_rate > 0:
                layers.append(nn.Dropout(dropout_rate))
            layers.append(nn.Linear(hidden_dim, hidden_dim))
            layers.append(activation_fn)

        # Output layer (for binary classification — change if needed)
        layers.append(nn.Linear(hidden_dim, 1))  # output logits
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)

# 2. train model
def train_model(
    X_train, y_train, X_val, y_val,
    input_dim,
    hidden_dim=64,
    num_hidden_layers=2,
    activation='relu',
    dropout_rate=0.0,
    learning_rate=1e-3,
    batch_size=32,
    epochs=100,
    use_l1=False,
    l1_lambda=1e-5,
    use_l2=False,
    l2_lambda=1e-4,
    early_stopping_patience=10,
    skip_training=False
):
    # Build model
    model = ConfigurableMLP(
        input_dim=input_dim,
        hidden_dim=hidden_dim,
        num_hidden_layers=num_hidden_layers,
        activation=activation,
        dropout_rate=dropout_rate if dropout_rate > 0 else 0.0
    )

    # Set optimizer
    optimizer = optim.Adam(model.parameters(), lr=learning_rate,
                           weight_decay=l2_lambda if use_l2 else 0.0)

    # Use BCEWithLogitsLoss for binary classification with logits
    criterion = nn.BCEWithLogitsLoss()

    # DataLoader setup
    train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=batch_size)

    best_val_loss = float('inf')
    patience_counter = early_stopping_patience
    best_model_state = None

    # skip training for baseline results
    if skip_training:
        print("[Baseline] Skipping training, evaluating untrained model...")
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for xb, yb in val_loader:
                output = model(xb)
                val_loss += criterion(output, yb.view(-1, 1)).item()
        val_loss /= len(val_loader)
        print(f"[Baseline] Untrained Model Val Loss: {val_loss:.4f}")
        return model


    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for xb, yb in train_loader:
            optimizer.zero_grad()
            output = model(xb)
            loss = criterion(output, yb.view(-1, 1))


            if use_l1:
                l1_norm = sum(p.abs().sum() for p in model.parameters())
                loss += l1_lambda * l1_norm

            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Validation loss
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for xb, yb in val_loader:
                output = model(xb)
                val_loss += criterion(output, yb.view(-1,1)).item()
        val_loss /= len(val_loader)

        print(f"Epoch {epoch+1}/{epochs}: Train Loss = {total_loss:.4f}, Val Loss = {val_loss:.4f}")

        # Early stopping logic
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = early_stopping_patience
            best_model_state = model.state_dict()
        else:
            patience_counter -= 1
            if patience_counter == 0:
                print("Early stopping triggered.")
                break

    # Load best model
    if best_model_state:
        model.load_state_dict(best_model_state)

    return model





In [54]:
from sklearn.preprocessing import StandardScaler

# Framingham dataset
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # Fit only on Framingham

X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

X_train, X_val, y_train, y_val = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)

# Heart Disease dataset — use the SAME SCALER, only transform
X_hd_scaled = scaler.transform(X_hd)
X_hd_tensor = torch.tensor(X_hd_scaled, dtype=torch.float32)
y_hd_tensor = torch.tensor(y_hd, dtype=torch.float32)

# Optional split
X_hd_train, X_hd_val, y_hd_train, y_hd_val = train_test_split(X_hd_tensor, y_hd_tensor, test_size=0.2, random_state=42)

# Concatenated dataset — use the SAME SCALER, only transform
X_concat_scaled = scaler.transform(X_concat_fram)
X_concat_tensor = torch.tensor(X_concat_scaled, dtype=torch.float32)
y_concat_tensor = torch.tensor(y_concat_fram.values, dtype=torch.float32)

# Optional split
X_concat_train, X_concat_val, y_concat_train, y_concat_val = train_test_split(X_concat_tensor, y_concat_tensor, test_size=0.2, random_state=42)


In [55]:
def evaluate_model(model, X, y):
    model.eval()
    with torch.no_grad():
        outputs = torch.sigmoid(model(X)).squeeze()
        preds = (outputs >= 0.5).int()
        accuracy = (preds == y.int()).float().mean().item()
    return accuracy


In [56]:
untrained_model = train_model(
    X_train, y_train,
    X_val, y_val,
    input_dim=X_train.shape[1],
    hidden_dim=128,
    num_hidden_layers=5,
    activation='relu',
    dropout_rate=0.2,
    learning_rate=1e-3,
    use_l1=False,
    use_l2=True,
    l1_lambda=1e-5,
    l2_lambda=1e-4,
    early_stopping_patience=100,
    skip_training=True
)

[Baseline] Skipping training, evaluating untrained model...
[Baseline] Untrained Model Val Loss: 0.6931


In [57]:
accuracy = evaluate_model(untrained_model, X_hd_val, y_hd_val)
print(f"Baseline Accuracy: {accuracy:.2%}")

Baseline Accuracy: 37.50%


In [58]:
fram_trained_model = train_model(
    X_train, y_train,
    X_val, y_val,
    input_dim=X_train.shape[1],
    hidden_dim=64,
    num_hidden_layers=3,
    activation='relu',
    dropout_rate=0.2,
    learning_rate=1e-3,
    use_l1=False,
    use_l2=True,
    l1_lambda=1e-5,
    l2_lambda=1e-4,
    early_stopping_patience=100,
    skip_training=False
)


Epoch 1/100: Train Loss = 90.7621, Val Loss = 0.6030
Epoch 2/100: Train Loss = 86.5722, Val Loss = 0.6022
Epoch 3/100: Train Loss = 86.0309, Val Loss = 0.5973
Epoch 4/100: Train Loss = 85.9419, Val Loss = 0.6000
Epoch 5/100: Train Loss = 85.4338, Val Loss = 0.5971
Epoch 6/100: Train Loss = 84.9017, Val Loss = 0.6130
Epoch 7/100: Train Loss = 84.5248, Val Loss = 0.5957
Epoch 8/100: Train Loss = 84.7702, Val Loss = 0.5963
Epoch 9/100: Train Loss = 84.1010, Val Loss = 0.6001
Epoch 10/100: Train Loss = 83.8347, Val Loss = 0.6009
Epoch 11/100: Train Loss = 83.7084, Val Loss = 0.5946
Epoch 12/100: Train Loss = 83.6793, Val Loss = 0.5975
Epoch 13/100: Train Loss = 83.4068, Val Loss = 0.5934
Epoch 14/100: Train Loss = 83.5902, Val Loss = 0.5957
Epoch 15/100: Train Loss = 83.2952, Val Loss = 0.5905
Epoch 16/100: Train Loss = 83.0200, Val Loss = 0.5971
Epoch 17/100: Train Loss = 82.6408, Val Loss = 0.5920
Epoch 18/100: Train Loss = 82.6798, Val Loss = 0.5959
Epoch 19/100: Train Loss = 82.7340, V

In [59]:
accuracy = evaluate_model(fram_trained_model, X_hd_val, y_hd_val)
print(f"Framingham Dataset Trained Accuracy: {accuracy:.2%}")

Framingham Dataset Trained Accuracy: 62.50%


In [60]:
hd_trained_model = train_model(
    X_hd_train, y_hd_train,
    X_hd_val, y_hd_val,
    input_dim=X_train.shape[1],
    hidden_dim=64,
    num_hidden_layers=3,
    activation='relu',
    dropout_rate=0.2,
    learning_rate=1e-3,
    use_l1=False,
    use_l2=True,
    l1_lambda=1e-5,
    l2_lambda=1e-4,
    early_stopping_patience=100,
    skip_training=False
)


Epoch 1/100: Train Loss = 17.6279, Val Loss = 0.6976
Epoch 2/100: Train Loss = 16.3699, Val Loss = 0.6598
Epoch 3/100: Train Loss = 15.5683, Val Loss = 0.6273
Epoch 4/100: Train Loss = 15.6492, Val Loss = 0.6232
Epoch 5/100: Train Loss = 15.6556, Val Loss = 0.6361
Epoch 6/100: Train Loss = 15.6263, Val Loss = 0.6261
Epoch 7/100: Train Loss = 15.3807, Val Loss = 0.6306
Epoch 8/100: Train Loss = 15.3327, Val Loss = 0.6282
Epoch 9/100: Train Loss = 15.3635, Val Loss = 0.6548
Epoch 10/100: Train Loss = 15.3792, Val Loss = 0.6288
Epoch 11/100: Train Loss = 15.2878, Val Loss = 0.6408
Epoch 12/100: Train Loss = 15.2848, Val Loss = 0.6460
Epoch 13/100: Train Loss = 15.1681, Val Loss = 0.6458
Epoch 14/100: Train Loss = 15.4038, Val Loss = 0.6412
Epoch 15/100: Train Loss = 15.3278, Val Loss = 0.6377
Epoch 16/100: Train Loss = 15.1740, Val Loss = 0.6292
Epoch 17/100: Train Loss = 15.2552, Val Loss = 0.6379
Epoch 18/100: Train Loss = 15.1128, Val Loss = 0.6247
Epoch 19/100: Train Loss = 15.0825, V

In [61]:
accuracy = evaluate_model(hd_trained_model, X_hd_val, y_hd_val)
print(f"Heart_Disease Dataset Trained Accuracy: {accuracy:.2%}")

Heart_Disease Dataset Trained Accuracy: 61.41%


In [62]:
print(torch.isnan(X_concat_train).any(), torch.isnan(y_concat_train).any())
print(torch.min(y_concat_train), torch.max(y_concat_train))
print(torch.min(X_concat_train), torch.max(X_concat_train))

tensor(False) tensor(False)
tensor(0.) tensor(1.)
tensor(-9.3399) tensor(620.0317)


In [None]:
concat_model = train_model(
    X_concat_train, y_concat_train, 
    X_concat_val, y_concat_val,
    input_dim=X_concat_train.shape[1],
    hidden_dim=64,
    num_hidden_layers=3,
    activation='relu',
    dropout_rate=0.2,
    learning_rate=1e-3,
    use_l1=False,
    use_l2=True,
    l1_lambda=1e-5,
    l2_lambda=1e-4,
    early_stopping_patience=10,
    skip_training=False
)

Epoch 1/100: Train Loss = 111.9721, Val Loss = 0.6158
Epoch 2/100: Train Loss = 104.9733, Val Loss = 0.6037
Epoch 3/100: Train Loss = 103.7364, Val Loss = 0.5989
Epoch 4/100: Train Loss = 103.2812, Val Loss = 0.5975
Epoch 5/100: Train Loss = 102.4427, Val Loss = 0.5935
Epoch 6/100: Train Loss = 102.3420, Val Loss = 0.5946
Epoch 7/100: Train Loss = 101.6116, Val Loss = 0.5939
Epoch 8/100: Train Loss = 101.5428, Val Loss = 0.5911
Epoch 9/100: Train Loss = 101.4839, Val Loss = 0.5895
Epoch 10/100: Train Loss = 101.0588, Val Loss = 0.5881
Epoch 11/100: Train Loss = 100.9332, Val Loss = 0.5889
Epoch 12/100: Train Loss = 100.4694, Val Loss = 0.5886
Epoch 13/100: Train Loss = 100.2637, Val Loss = 0.5875
Epoch 14/100: Train Loss = 100.0480, Val Loss = 0.5892
Epoch 15/100: Train Loss = 99.7842, Val Loss = 0.5869
Epoch 16/100: Train Loss = 100.3631, Val Loss = 0.5893
Epoch 17/100: Train Loss = 99.9973, Val Loss = 0.5882
Epoch 18/100: Train Loss = 99.5402, Val Loss = 0.5900
Epoch 19/100: Train Lo

In [64]:
accuracy = evaluate_model(concat_model, X_hd_val, y_hd_val)
print(f"Combining Datasets Trained Accuracy: {accuracy:.2%}")

Combining Datasets Trained Accuracy: 54.89%


In [70]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

try:
    df = pd.read_csv('framingham.csv')
except FileNotFoundError:
    print("Error: 'framingham.csv' not found. Please ensure the file is in the correct directory.")
    exit()

print("Handling missing values by mean imputation...")
for column in df.columns:
    if df[column].isnull().any():
        df[column] = df[column].fillna(df[column].mean())
print("Missing values handled.\n")

X = df.drop('TenYearCHD', axis=1)
y = df['TenYearCHD']
feature_names = X.columns

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=feature_names)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=feature_names)

print("## Determining Most Important Features (using RandomForest):")
rf_model_fi = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_fi.fit(X_train_scaled_df, y_train)
importances = rf_model_fi.feature_importances_
feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)
print(feature_importance_df)
print("\n" + "="*60 + "\n")

print("## Logistic Regression without SMOTE:")
log_reg_no_smote = LogisticRegression(random_state=42, solver='liblinear', max_iter=1000)
log_reg_no_smote.fit(X_train_scaled_df, y_train)
y_pred_lr_no_smote = log_reg_no_smote.predict(X_test_scaled_df)
accuracy_lr_no_smote = accuracy_score(y_test, y_pred_lr_no_smote)
print(f"Accuracy: {accuracy_lr_no_smote:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_lr_no_smote, zero_division=0))
print("\n" + "="*60 + "\n")
print("## Ensemble (Logistic Regression + KNN) without SMOTE:")
knn_no_smote = KNeighborsClassifier(n_neighbors=5)
knn_no_smote.fit(X_train_scaled_df, y_train)

ensemble_clf_no_smote = VotingClassifier(
    estimators=[('lr', log_reg_no_smote), ('knn', knn_no_smote)],
    voting='hard'
)
ensemble_clf_no_smote.fit(X_train_scaled_df, y_train)
y_pred_ensemble_no_smote = ensemble_clf_no_smote.predict(X_test_scaled_df)
accuracy_ensemble_no_smote = accuracy_score(y_test, y_pred_ensemble_no_smote)
print(f"Accuracy: {accuracy_ensemble_no_smote:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_ensemble_no_smote, zero_division=0))
print("\n" + "="*60 + "\n")

print("Class distribution in original training data:")
print(y_train.value_counts())

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled_df, y_train)

print("\nClass distribution in SMOTE training data:")
print(y_train_smote.value_counts())
print(f"Shape of training data before SMOTE: {X_train_scaled_df.shape}")
print(f"Shape of training data after SMOTE: {X_train_smote.shape}")
print("\n" + "="*60 + "\n")

print("## Logistic Regression with SMOTE:")
log_reg_smote = LogisticRegression(random_state=42, solver='liblinear', max_iter=1000)
log_reg_smote.fit(X_train_smote, y_train_smote)
y_pred_lr_smote = log_reg_smote.predict(X_test_scaled_df)
accuracy_lr_smote = accuracy_score(y_test, y_pred_lr_smote)
print(f"Accuracy: {accuracy_lr_smote:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_lr_smote, zero_division=0))
print("\n" + "="*60 + "\n")
print("## Ensemble (Logistic Regression + KNN) with SMOTE:")
knn_smote = KNeighborsClassifier(n_neighbors=5)
knn_smote.fit(X_train_smote, y_train_smote)

# log_reg_smote is already trained
ensemble_clf_smote = VotingClassifier(
    estimators=[('lr', log_reg_smote), ('knn', knn_smote)],
    voting='hard'
)

ensemble_clf_smote.fit(X_train_smote, y_train_smote)
y_pred_ensemble_smote = ensemble_clf_smote.predict(X_test_scaled_df)
accuracy_ensemble_smote = accuracy_score(y_test, y_pred_ensemble_smote)
print(f"Accuracy: {accuracy_ensemble_smote:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_ensemble_smote, zero_division=0))
print("\n" + "="*60 + "\n")

import torch
import numpy as np
from scipy.stats import mode
from sklearn.metrics import accuracy_score, classification_report

top_features = feature_importance_df['feature'].iloc[:5].tolist()
X_test_full = X_test_scaled_df
X_test_top5 = X_test_scaled_df[top_features]
X_test_full_tensor = torch.tensor(X_test_top5.values, dtype=torch.float32)

all_preds = []
with torch.no_grad():
    for m in [concat_model, hd_trained_model, fram_trained_model]:
        outputs = m(X_test_full_tensor)
        preds = outputs.argmax(dim=1)
        all_preds.append(preds.cpu().numpy())

all_preds.append(knn_smote.predict(X_test_full))
all_preds.append(log_reg_smote.predict(X_test_full))

all_preds = np.stack(all_preds, axis=1)
y_pred_ensemble = mode(all_preds, axis=1).mode.ravel()

accuracy = accuracy_score(y_test, y_pred_ensemble)
print("## Ensemble of All Models (including KNN and Logistic Regression) with SMOTE:")
print(f"{accuracy:.4f}")
print(classification_report(y_test, y_pred_ensemble, zero_division=0))

X_train_smote_df_save = pd.DataFrame(X_train_smote, columns=feature_names)
y_train_smote_series_save = pd.Series(y_train_smote, name='TenYearCHD')
framingham_extended_df = pd.concat([X_train_smote_df_save, y_train_smote_series_save], axis=1)
try:
    framingham_extended_df.to_csv('framingham_extended.csv', index=False)
    print(f"SMOTE-augmented training dataset saved to 'framingham_extended.csv'")
    print(f"Shape of the extended dataset: {framingham_extended_df.shape}")
except Exception as e:
    print(f"Error saving 'framingham_extended.csv': {e}")

Handling missing values by mean imputation...
Missing values handled.

## Determining Most Important Features (using RandomForest):
            feature  importance
10            sysBP    0.130597
12              BMI    0.127503
1               age    0.126334
9           totChol    0.123492
14          glucose    0.118363
11            diaBP    0.110702
13        heartRate    0.095294
4        cigsPerDay    0.050460
2         education    0.041157
0              male    0.018898
7      prevalentHyp    0.018123
3     currentSmoker    0.012965
5            BPMeds    0.011895
8          diabetes    0.008236
6   prevalentStroke    0.005980


## Logistic Regression without SMOTE:
Accuracy: 0.8443
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.99      0.91       719
           1       0.41      0.05      0.10       129

    accuracy                           0.84       848
   macro avg       0.63      0.52      0.51       848
weig