In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
import seaborn as sns
#import pandas_profiling
from pydantic_settings import BaseSettings
%matplotlib inline
df = pd.read_csv('framingham_extended.csv')
df_hd = pd.read_csv('heart_disease_fixed.csv')

In [23]:
# Filling out missing values
df['BPMeds'].fillna(0, inplace = True)
df['glucose'].fillna(df.glucose.mean(), inplace = True)
df['totChol'].fillna(df.totChol.mean(), inplace = True)
df['education'].fillna(1, inplace = True)
df['BMI'].fillna(df.BMI.mean(), inplace = True)
df['heartRate'].fillna(df.heartRate.mean(), inplace = True)
df.isna().sum()
df['fbs'] = (df['glucose'] > 120).astype(int)  # engineer 'fbs' since it's not available

# Filling values for heart_disease dataset
df_hd['trestbps'].fillna(df_hd.trestbps.mean(), inplace = True)
df_hd['chol'].fillna(df_hd.chol.mean(), inplace = True)
df_hd['result'] = (df_hd['target'] > 0).astype(int)

# Fill in fbs column using sampling
df_hd['fbs'] = pd.to_numeric(df_hd['fbs'], errors='coerce')
fbs_dist = df_hd['fbs'].dropna().value_counts(normalize=True)
p_0 = fbs_dist.get(0.0, 0)
p_1 = fbs_dist.get(1.0, 0)
total = p_0 + p_1
p_0 /= total
p_1 /= total
num_missing = df_hd['fbs'].isna().sum()
random_fill = np.random.choice([0, 1], size=num_missing, p=[p_0, p_1])
df_hd.loc[df_hd['fbs'].isna(), 'fbs'] = random_fill

shared_features = ['age', 'sex', 'trestbps', 'chol', 'fbs']

df.rename(columns={
    'male': 'sex',
    'sysBP': 'trestbps',
    'totChol': 'chol',
    'heartRate': 'thalach'
}, inplace=True)

df['result'] = df['TenYearCHD'].astype(int)

df_combined_fram = pd.concat([df[shared_features+['result']], df_hd[shared_features+['result']]])

In [24]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# separate independent & dependent variables
X = df.loc[:,shared_features]  #independent columns
y = df.iloc[:,-1]    #target column i.e CHD

X_hd = df_hd.loc[:,shared_features]
y_hd = df_hd.loc[:,['result']].iloc[:,0]

X_concat_fram = df_combined_fram.loc[:,shared_features]
y_concat_fram = df_combined_fram.loc[:,['result']].iloc[:,0]

In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

X_hd_train, X_hd_test, y_hd_train, y_hd_test = train_test_split(X_hd, y_hd, test_size=0.2)

X_concat_fram_train, X_concat_fram_test, y_concat_fram_train, y_concat_fram_test = train_test_split(X_concat_fram, y_concat_fram, test_size=0.2)

print (X_hd_train.shape, y_hd_train.shape)
print (X_hd_test.shape, y_hd_test.shape)
print (X_concat_fram_train.shape, y_concat_fram_train.shape)
print (X_concat_fram_test.shape, y_concat_fram_test.shape)

(4603, 5) (4603,)
(1151, 5) (1151,)
(736, 5) (736,)
(184, 5) (184,)
(5339, 5) (5339,)
(1335, 5) (1335,)


# ECE228 optimization: MLP

In [26]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np


# 1. Configurable MLP Model
class ConfigurableMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_hidden_layers=2,
                 activation='relu', dropout_rate=0.0):
        super().__init__()

        # Choose activation function
        if activation == 'relu':
            activation_fn = nn.ReLU()
        elif activation == 'sigmoid':
            activation_fn = nn.Sigmoid()
        elif activation == 'tanh':
            activation_fn = nn.Tanh()
        else:
            raise ValueError(f"Unsupported activation: {activation}")

        layers = []

        # Input layer
        layers.append(nn.Linear(input_dim, hidden_dim))
        layers.append(activation_fn)

        # Hidden layers
        for _ in range(num_hidden_layers):
            if dropout_rate > 0:
                layers.append(nn.Dropout(dropout_rate))
            layers.append(nn.Linear(hidden_dim, hidden_dim))
            layers.append(activation_fn)

        # Output layer (for binary classification — change if needed)
        layers.append(nn.Linear(hidden_dim, 1))  # output logits
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)

# 2. train model
def train_model(
    X_train, y_train, X_val, y_val,
    input_dim,
    hidden_dim=64,
    num_hidden_layers=2,
    activation='relu',
    dropout_rate=0.0,
    learning_rate=1e-3,
    batch_size=32,
    epochs=100,
    use_l1=False,
    l1_lambda=1e-5,
    use_l2=False,
    l2_lambda=1e-4,
    early_stopping_patience=10,
    skip_training=False
):
    # Build model
    model = ConfigurableMLP(
        input_dim=input_dim,
        hidden_dim=hidden_dim,
        num_hidden_layers=num_hidden_layers,
        activation=activation,
        dropout_rate=dropout_rate if dropout_rate > 0 else 0.0
    )

    # Set optimizer
    optimizer = optim.Adam(model.parameters(), lr=learning_rate,
                           weight_decay=l2_lambda if use_l2 else 0.0)

    # Use BCEWithLogitsLoss for binary classification with logits
    criterion = nn.BCEWithLogitsLoss()

    # DataLoader setup
    train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=batch_size)

    best_val_loss = float('inf')
    patience_counter = early_stopping_patience
    best_model_state = None

    # skip training for baseline results
    if skip_training:
        print("[Baseline] Skipping training, evaluating untrained model...")
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for xb, yb in val_loader:
                output = model(xb)
                val_loss += criterion(output, yb.view(-1, 1)).item()
        val_loss /= len(val_loader)
        print(f"[Baseline] Untrained Model Val Loss: {val_loss:.4f}")
        return model


    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for xb, yb in train_loader:
            optimizer.zero_grad()
            output = model(xb)
            loss = criterion(output, yb.view(-1, 1))


            if use_l1:
                l1_norm = sum(p.abs().sum() for p in model.parameters())
                loss += l1_lambda * l1_norm

            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Validation loss
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for xb, yb in val_loader:
                output = model(xb)
                val_loss += criterion(output, yb.view(-1,1)).item()
        val_loss /= len(val_loader)

        print(f"Epoch {epoch+1}/{epochs}: Train Loss = {total_loss:.4f}, Val Loss = {val_loss:.4f}")

        # Early stopping logic
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = early_stopping_patience
            best_model_state = model.state_dict()
        else:
            patience_counter -= 1
            if patience_counter == 0:
                print("Early stopping triggered.")
                break

    # Load best model
    if best_model_state:
        model.load_state_dict(best_model_state)

    return model





In [27]:
from sklearn.preprocessing import StandardScaler

# Framingham dataset
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # Fit only on Framingham

X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

X_train, X_val, y_train, y_val = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)

# Heart Disease dataset — use the SAME SCALER, only transform
X_hd_scaled = scaler.transform(X_hd)
X_hd_tensor = torch.tensor(X_hd_scaled, dtype=torch.float32)
y_hd_tensor = torch.tensor(y_hd, dtype=torch.float32)

# Optional split
X_hd_train, X_hd_val, y_hd_train, y_hd_val = train_test_split(X_hd_tensor, y_hd_tensor, test_size=0.2, random_state=42)

# Concatenated dataset — use the SAME SCALER, only transform
X_concat_scaled = scaler.transform(X_concat_fram)
X_concat_tensor = torch.tensor(X_concat_scaled, dtype=torch.float32)
y_concat_tensor = torch.tensor(y_concat_fram.values, dtype=torch.float32)

# Optional split
X_concat_train, X_concat_val, y_concat_train, y_concat_val = train_test_split(X_concat_tensor, y_concat_tensor, test_size=0.2, random_state=42)


In [28]:
def evaluate_model(model, X, y):
    model.eval()
    with torch.no_grad():
        outputs = torch.sigmoid(model(X)).squeeze()
        preds = (outputs >= 0.5).int()
        accuracy = (preds == y.int()).float().mean().item()
    return accuracy


In [29]:
untrained_model = train_model(
    X_train, y_train,
    X_val, y_val,
    input_dim=X_train.shape[1],
    hidden_dim=128,
    num_hidden_layers=5,
    activation='relu',
    dropout_rate=0.2,
    learning_rate=1e-3,
    use_l1=False,
    use_l2=True,
    l1_lambda=1e-5,
    l2_lambda=1e-4,
    early_stopping_patience=100,
    skip_training=True
)

[Baseline] Skipping training, evaluating untrained model...
[Baseline] Untrained Model Val Loss: 0.6932


In [30]:
accuracy = evaluate_model(untrained_model, X_hd_val, y_hd_val)
print(f"Baseline Accuracy: {accuracy:.2%}")

Baseline Accuracy: 62.50%


In [31]:
fram_trained_model = train_model(
    X_train, y_train,
    X_val, y_val,
    input_dim=X_train.shape[1],
    hidden_dim=64,
    num_hidden_layers=3,
    activation='relu',
    dropout_rate=0.2,
    learning_rate=1e-3,
    use_l1=False,
    use_l2=True,
    l1_lambda=1e-5,
    l2_lambda=1e-4,
    early_stopping_patience=100,
    skip_training=False
)


Epoch 1/100: Train Loss = 90.3212, Val Loss = 0.5966
Epoch 2/100: Train Loss = 86.2807, Val Loss = 0.5972
Epoch 3/100: Train Loss = 86.0327, Val Loss = 0.6005
Epoch 4/100: Train Loss = 85.6454, Val Loss = 0.5990
Epoch 5/100: Train Loss = 84.9117, Val Loss = 0.6015
Epoch 6/100: Train Loss = 85.1311, Val Loss = 0.6008
Epoch 7/100: Train Loss = 84.8212, Val Loss = 0.5966
Epoch 8/100: Train Loss = 83.9277, Val Loss = 0.5968
Epoch 9/100: Train Loss = 84.3032, Val Loss = 0.5990
Epoch 10/100: Train Loss = 84.3253, Val Loss = 0.5930
Epoch 11/100: Train Loss = 83.7851, Val Loss = 0.5957
Epoch 12/100: Train Loss = 83.4876, Val Loss = 0.5915
Epoch 13/100: Train Loss = 83.4653, Val Loss = 0.5950
Epoch 14/100: Train Loss = 83.6883, Val Loss = 0.5902
Epoch 15/100: Train Loss = 83.1642, Val Loss = 0.5950
Epoch 16/100: Train Loss = 82.9680, Val Loss = 0.5912
Epoch 17/100: Train Loss = 82.2477, Val Loss = 0.5861
Epoch 18/100: Train Loss = 82.5198, Val Loss = 0.5897
Epoch 19/100: Train Loss = 82.6290, V

In [32]:
accuracy = evaluate_model(fram_trained_model, X_hd_val, y_hd_val)
print(f"Framingham Dataset Trained Accuracy: {accuracy:.2%}")

Framingham Dataset Trained Accuracy: 62.50%


In [33]:
hd_trained_model = train_model(
    X_hd_train, y_hd_train,
    X_hd_val, y_hd_val,
    input_dim=X_train.shape[1],
    hidden_dim=64,
    num_hidden_layers=3,
    activation='relu',
    dropout_rate=0.2,
    learning_rate=1e-3,
    use_l1=False,
    use_l2=True,
    l1_lambda=1e-5,
    l2_lambda=1e-4,
    early_stopping_patience=100,
    skip_training=False
)


Epoch 1/100: Train Loss = 19.1004, Val Loss = 0.6327
Epoch 2/100: Train Loss = 16.0704, Val Loss = 0.6509
Epoch 3/100: Train Loss = 15.9951, Val Loss = 0.6382
Epoch 4/100: Train Loss = 15.7964, Val Loss = 0.6434
Epoch 5/100: Train Loss = 15.5925, Val Loss = 0.6444
Epoch 6/100: Train Loss = 15.6141, Val Loss = 0.6851
Epoch 7/100: Train Loss = 15.6357, Val Loss = 0.6347
Epoch 8/100: Train Loss = 15.3341, Val Loss = 0.6696
Epoch 9/100: Train Loss = 15.2322, Val Loss = 0.6375
Epoch 10/100: Train Loss = 15.4317, Val Loss = 0.6693
Epoch 11/100: Train Loss = 15.1548, Val Loss = 0.6496
Epoch 12/100: Train Loss = 15.4260, Val Loss = 0.6570
Epoch 13/100: Train Loss = 15.2905, Val Loss = 0.6397
Epoch 14/100: Train Loss = 15.2603, Val Loss = 0.6397
Epoch 15/100: Train Loss = 15.1736, Val Loss = 0.6412
Epoch 16/100: Train Loss = 15.4337, Val Loss = 0.6710
Epoch 17/100: Train Loss = 15.1890, Val Loss = 0.6537
Epoch 18/100: Train Loss = 15.3578, Val Loss = 0.6603
Epoch 19/100: Train Loss = 15.1481, V

In [34]:
accuracy = evaluate_model(hd_trained_model, X_hd_val, y_hd_val)
print(f"Heart_Disease Dataset Trained Accuracy: {accuracy:.2%}")

Heart_Disease Dataset Trained Accuracy: 57.07%


In [35]:
print(torch.isnan(X_concat_train).any(), torch.isnan(y_concat_train).any())
print(torch.min(y_concat_train), torch.max(y_concat_train))
print(torch.min(X_concat_train), torch.max(X_concat_train))

tensor(False) tensor(False)
tensor(0.) tensor(1.)
tensor(-9.3399) tensor(620.0317)


In [36]:
concat_model = train_model(
    X_concat_train, y_concat_train, 
    X_concat_val, y_concat_val,
    input_dim=X_concat_train.shape[1],
    hidden_dim=64,
    num_hidden_layers=3,
    activation='relu',
    dropout_rate=0.2,
    learning_rate=1e-3,
    use_l1=False,
    use_l2=True,
    l1_lambda=1e-5,
    l2_lambda=1e-4,
    early_stopping_patience=10,
    skip_training=False
)

Epoch 1/100: Train Loss = 113.2882, Val Loss = 0.6223
Epoch 2/100: Train Loss = 105.0922, Val Loss = 0.6036
Epoch 3/100: Train Loss = 104.1281, Val Loss = 0.5991
Epoch 4/100: Train Loss = 102.8736, Val Loss = 0.5968
Epoch 5/100: Train Loss = 102.4142, Val Loss = 0.5963
Epoch 6/100: Train Loss = 102.3316, Val Loss = 0.5898
Epoch 7/100: Train Loss = 101.8708, Val Loss = 0.5914
Epoch 8/100: Train Loss = 101.2962, Val Loss = 0.5943
Epoch 9/100: Train Loss = 101.5385, Val Loss = 0.5889
Epoch 10/100: Train Loss = 101.1128, Val Loss = 0.5901
Epoch 11/100: Train Loss = 101.6787, Val Loss = 0.5890
Epoch 12/100: Train Loss = 101.4259, Val Loss = 0.5876
Epoch 13/100: Train Loss = 100.8322, Val Loss = 0.5889
Epoch 14/100: Train Loss = 100.7858, Val Loss = 0.5847
Epoch 15/100: Train Loss = 100.9045, Val Loss = 0.5922
Epoch 16/100: Train Loss = 100.5446, Val Loss = 0.5910
Epoch 17/100: Train Loss = 100.4566, Val Loss = 0.5911
Epoch 18/100: Train Loss = 100.0802, Val Loss = 0.5866
Epoch 19/100: Train

In [37]:
accuracy = evaluate_model(concat_model, X_hd_val, y_hd_val)
print(f"Combining Datasets Trained Accuracy: {accuracy:.2%}")

Combining Datasets Trained Accuracy: 61.96%
