In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
import seaborn as sns
#import pandas_profiling
from pydantic_settings import BaseSettings
%matplotlib inline
df = pd.read_csv('framingham.csv')
df_hd = pd.read_csv('heart_disease_fixed.csv')

In [2]:
# Filling out missing values
df['BPMeds'].fillna(0, inplace = True)
df['glucose'].fillna(df.glucose.mean(), inplace = True)
df['totChol'].fillna(df.totChol.mean(), inplace = True)
df['education'].fillna(1, inplace = True)
df['BMI'].fillna(df.BMI.mean(), inplace = True)
df['heartRate'].fillna(df.heartRate.mean(), inplace = True)
df.isna().sum()
df['fbs'] = (df['glucose'] > 120).astype(int)  # engineer 'fbs' since it's not available

# Filling values for heart_disease dataset
df_hd['trestbps'].fillna(df_hd.trestbps.mean(), inplace = True)
df_hd['chol'].fillna(df_hd.chol.mean(), inplace = True)
df_hd['result'] = (df_hd['target'] > 0).astype(int)

# Fill in fbs column using sampling
df_hd['fbs'] = pd.to_numeric(df_hd['fbs'], errors='coerce')
fbs_dist = df_hd['fbs'].dropna().value_counts(normalize=True)
p_0 = fbs_dist.get(0.0, 0)
p_1 = fbs_dist.get(1.0, 0)
total = p_0 + p_1
p_0 /= total
p_1 /= total
num_missing = df_hd['fbs'].isna().sum()
random_fill = np.random.choice([0, 1], size=num_missing, p=[p_0, p_1])
df_hd.loc[df_hd['fbs'].isna(), 'fbs'] = random_fill

shared_features = ['age', 'sex', 'trestbps', 'chol', 'fbs']

df.rename(columns={
    'male': 'sex',
    'sysBP': 'trestbps',
    'totChol': 'chol',
    'heartRate': 'thalach'
}, inplace=True)

df['result'] = df['TenYearCHD'].astype(int)

df_combined_fram = pd.concat([df[shared_features+['result']], df_hd[shared_features+['result']]])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['BPMeds'].fillna(0, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['glucose'].fillna(df.glucose.mean(), inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting value

In [3]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# separate independent & dependent variables
X = df.loc[:,shared_features]  #independent columns
y = df.iloc[:,-1]    #target column i.e CHD

X_hd = df_hd.loc[:,shared_features]
y_hd = df_hd.loc[:,['result']].iloc[:,0]

X_concat_fram = df_combined_fram.loc[:,shared_features]
y_concat_fram = df_combined_fram.loc[:,['result']].iloc[:,0]




In [4]:
from sklearn.model_selection import train_test_split

#y = df['TenYearCHD'] #target variable
#X = df.drop(['TenYearCHD'], axis = 1) #features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

X_hd_train, X_hd_test, y_hd_train, y_hd_test = train_test_split(X_hd, y_hd, test_size=0.2)

X_concat_fram_train, X_concat_fram_test, y_concat_fram_train, y_concat_fram_test = train_test_split(X_concat_fram, y_concat_fram, test_size=0.2)

print (X_hd_train.shape, y_hd_train.shape)
print (X_hd_test.shape, y_hd_test.shape)
print (X_concat_fram_train.shape, y_concat_fram_train.shape)
print (X_concat_fram_test.shape, y_concat_fram_test.shape)

(3392, 5) (3392,)
(848, 5) (848,)
(736, 5) (736,)
(184, 5) (184,)
(4128, 5) (4128,)
(1032, 5) (1032,)


# ECE228 optimization: MLP

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np


# 1. Configurable MLP Model
class ConfigurableMLP(nn.Module):
    # Note this is supposed to be a binary classification task, bacause the target output "10 year CHD" is 
    # either 1 or 0
    def __init__(self, input_dim, hidden_dim, num_hidden_layers=2,
                 activation='relu', dropout_rate=0.0):
        super().__init__()

        # Choose activation function
        if activation == 'relu':
            activation_fn = nn.ReLU()
        elif activation == 'sigmoid':
            activation_fn = nn.Sigmoid()
        elif activation == 'tanh':
            activation_fn = nn.Tanh()
        else:
            raise ValueError(f"Unsupported activation: {activation}")

        layers = []

        # Input layer
        layers.append(nn.Linear(input_dim, hidden_dim))
        layers.append(activation_fn)

        # Hidden layers
        for _ in range(num_hidden_layers):
            if dropout_rate > 0:
                layers.append(nn.Dropout(dropout_rate))
            layers.append(nn.Linear(hidden_dim, hidden_dim))
            layers.append(activation_fn)

        # Output layer (for binary classification — change if needed)
        layers.append(nn.Linear(hidden_dim, 1))  # output logits
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)

# 2. train model
def train_model(
    X_train, y_train, X_val, y_val,
    input_dim,
    hidden_dim=64,
    num_hidden_layers=2,
    activation='relu',
    dropout_rate=0.0,
    learning_rate=1e-3,
    batch_size=32,
    epochs=100,
    use_l1=False,
    l1_lambda=1e-5,
    use_l2=False,
    l2_lambda=1e-4,
    early_stopping_patience=10,
    skip_training=False
):
    # Build model
    model = ConfigurableMLP(
        input_dim=input_dim,
        hidden_dim=hidden_dim,
        num_hidden_layers=num_hidden_layers,
        activation=activation,
        dropout_rate=dropout_rate if dropout_rate > 0 else 0.0
    )

    # Set optimizer
    optimizer = optim.Adam(model.parameters(), lr=learning_rate,
                           weight_decay=l2_lambda if use_l2 else 0.0)

    # Use BCEWithLogitsLoss for binary classification with logits
    criterion = nn.BCEWithLogitsLoss()

    # DataLoader setup
    train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=batch_size)

    best_val_loss = float('inf')
    patience_counter = early_stopping_patience
    best_model_state = None

    # skip training for baseline results
    if skip_training:
        print("[Baseline] Skipping training, evaluating untrained model...")
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for xb, yb in val_loader:
                output = model(xb)
                val_loss += criterion(output, yb.view(-1, 1)).item()
        val_loss /= len(val_loader)
        print(f"[Baseline] Untrained Model Val Loss: {val_loss:.4f}")
        return model


    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for xb, yb in train_loader:
            optimizer.zero_grad()
            output = model(xb)
            loss = criterion(output, yb.view(-1, 1))


            if use_l1:
                l1_norm = sum(p.abs().sum() for p in model.parameters())
                loss += l1_lambda * l1_norm

            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Validation loss
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for xb, yb in val_loader:
                output = model(xb)
                val_loss += criterion(output, yb.view(-1,1)).item()
        val_loss /= len(val_loader)

        print(f"Epoch {epoch+1}/{epochs}: Train Loss = {total_loss:.4f}, Val Loss = {val_loss:.4f}")

        # Early stopping logic
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = early_stopping_patience
            best_model_state = model.state_dict()
        else:
            patience_counter -= 1
            if patience_counter == 0:
                print("Early stopping triggered.")
                break

    # Load best model
    if best_model_state:
        model.load_state_dict(best_model_state)

    return model





In [6]:
from sklearn.preprocessing import StandardScaler

# Framingham dataset
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # Fit only on Framingham

X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

X_train, X_val, y_train, y_val = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)

# Heart Disease dataset — use the SAME SCALER, only transform
X_hd_scaled = scaler.transform(X_hd)
X_hd_tensor = torch.tensor(X_hd_scaled, dtype=torch.float32)
y_hd_tensor = torch.tensor(y_hd, dtype=torch.float32)

# Optional split
X_hd_train, X_hd_val, y_hd_train, y_hd_val = train_test_split(X_hd_tensor, y_hd_tensor, test_size=0.2, random_state=42)

# Concatenated dataset — use the SAME SCALER, only transform
X_concat_scaled = scaler.transform(X_concat_fram)
X_concat_tensor = torch.tensor(X_concat_scaled, dtype=torch.float32)
y_concat_tensor = torch.tensor(y_concat_fram.values, dtype=torch.float32)

# Optional split
X_concat_train, X_concat_val, y_concat_train, y_concat_val = train_test_split(X_concat_tensor, y_concat_tensor, test_size=0.2, random_state=42)


In [7]:
def evaluate_model(model, X, y):
    model.eval()
    with torch.no_grad():
        outputs = torch.sigmoid(model(X)).squeeze()
        preds = (outputs >= 0.5).int()
        accuracy = (preds == y.int()).float().mean().item()
    return accuracy


In [8]:
untrained_model = train_model(
    X_train, y_train,
    X_val, y_val,
    input_dim=X_train.shape[1],
    hidden_dim=64,
    num_hidden_layers=3,
    activation='relu',
    dropout_rate=0.2,
    learning_rate=1e-3,
    use_l1=False,
    use_l2=True,
    l1_lambda=1e-5,
    l2_lambda=1e-4,
    early_stopping_patience=10,
    skip_training=True
)

[Baseline] Skipping training, evaluating untrained model...
[Baseline] Untrained Model Val Loss: 0.6734


In [9]:
accuracy = evaluate_model(untrained_model, X_hd_val, y_hd_val)
print(f"Baseline Accuracy: {accuracy:.2%}")

Baseline Accuracy: 37.50%


In [10]:
fram_trained_model = train_model(
    X_train, y_train,
    X_val, y_val,
    input_dim=X_train.shape[1],
    hidden_dim=64,
    num_hidden_layers=3,
    activation='relu',
    dropout_rate=0.2,
    learning_rate=1e-3,
    use_l1=False,
    use_l2=True,
    l1_lambda=1e-5,
    l2_lambda=1e-4,
    early_stopping_patience=10,
    skip_training=False
)


Epoch 1/100: Train Loss = 49.3481, Val Loss = 0.3843
Epoch 2/100: Train Loss = 41.9267, Val Loss = 0.3816
Epoch 3/100: Train Loss = 42.4705, Val Loss = 0.3784
Epoch 4/100: Train Loss = 41.4811, Val Loss = 0.3804
Epoch 5/100: Train Loss = 41.5735, Val Loss = 0.3751
Epoch 6/100: Train Loss = 41.5623, Val Loss = 0.3759
Epoch 7/100: Train Loss = 41.1991, Val Loss = 0.3768
Epoch 8/100: Train Loss = 41.3729, Val Loss = 0.3759
Epoch 9/100: Train Loss = 41.0033, Val Loss = 0.3762
Epoch 10/100: Train Loss = 41.2843, Val Loss = 0.3775
Epoch 11/100: Train Loss = 41.1010, Val Loss = 0.3763
Epoch 12/100: Train Loss = 41.0032, Val Loss = 0.3833
Epoch 13/100: Train Loss = 41.0102, Val Loss = 0.3770
Epoch 14/100: Train Loss = 41.1156, Val Loss = 0.3771
Epoch 15/100: Train Loss = 40.8913, Val Loss = 0.3762
Early stopping triggered.


In [11]:
accuracy = evaluate_model(fram_trained_model, X_hd_val, y_hd_val)
print(f"Framingham Dataset Trained Accuracy: {accuracy:.2%}")

Framingham Dataset Trained Accuracy: 37.50%


In [12]:
hd_trained_model = train_model(
    X_hd_train, y_hd_train,
    X_hd_val, y_hd_val,
    input_dim=X_train.shape[1],
    hidden_dim=64,
    num_hidden_layers=3,
    activation='relu',
    dropout_rate=0.2,
    learning_rate=1e-3,
    use_l1=False,
    use_l2=True,
    l1_lambda=1e-5,
    l2_lambda=1e-4,
    early_stopping_patience=10,
    skip_training=False
)


Epoch 1/100: Train Loss = 15.7062, Val Loss = 0.6472
Epoch 2/100: Train Loss = 14.9668, Val Loss = 0.6040
Epoch 3/100: Train Loss = 14.5995, Val Loss = 0.6031
Epoch 4/100: Train Loss = 14.2314, Val Loss = 0.6010
Epoch 5/100: Train Loss = 13.9495, Val Loss = 0.5852
Epoch 6/100: Train Loss = 13.5278, Val Loss = 0.5902
Epoch 7/100: Train Loss = 13.4055, Val Loss = 0.6029
Epoch 8/100: Train Loss = 13.5146, Val Loss = 0.5861
Epoch 9/100: Train Loss = 13.4337, Val Loss = 0.5948
Epoch 10/100: Train Loss = 13.3234, Val Loss = 0.5945
Epoch 11/100: Train Loss = 13.1462, Val Loss = 0.6014
Epoch 12/100: Train Loss = 13.2307, Val Loss = 0.5947
Epoch 13/100: Train Loss = 13.0501, Val Loss = 0.6024
Epoch 14/100: Train Loss = 13.5495, Val Loss = 0.5849
Epoch 15/100: Train Loss = 13.2137, Val Loss = 0.5991
Epoch 16/100: Train Loss = 13.0916, Val Loss = 0.5996
Epoch 17/100: Train Loss = 13.1638, Val Loss = 0.5882
Epoch 18/100: Train Loss = 12.8896, Val Loss = 0.6025
Epoch 19/100: Train Loss = 13.1170, V

In [13]:
accuracy = evaluate_model(hd_trained_model, X_hd_val, y_hd_val)
print(f"Heart_Disease Dataset Trained Accuracy: {accuracy:.2%}")

Heart_Disease Dataset Trained Accuracy: 72.28%


In [14]:
print(torch.isnan(X_concat_train).any(), torch.isnan(y_concat_train).any())
print(torch.min(y_concat_train), torch.max(y_concat_train))
print(torch.min(X_concat_train), torch.max(X_concat_train))

tensor(False) tensor(False)
tensor(0.) tensor(1.)
tensor(-59.7535) tensor(10.3627)


In [15]:
concat_model = train_model(
    X_concat_train, y_concat_train, 
    X_concat_val, y_concat_val,
    input_dim=X_concat_train.shape[1],
    hidden_dim=64,
    num_hidden_layers=3,
    activation='relu',
    dropout_rate=0.2,
    learning_rate=1e-3,
    use_l1=False,
    use_l2=True,
    l1_lambda=1e-5,
    l2_lambda=1e-4,
    early_stopping_patience=10,
    skip_training=False
)

Epoch 1/100: Train Loss = 67.2950, Val Loss = 0.4714
Epoch 2/100: Train Loss = 59.6189, Val Loss = 0.4513
Epoch 3/100: Train Loss = 58.6660, Val Loss = 0.4448
Epoch 4/100: Train Loss = 58.6064, Val Loss = 0.4439
Epoch 5/100: Train Loss = 57.7741, Val Loss = 0.4354
Epoch 6/100: Train Loss = 57.4936, Val Loss = 0.4312
Epoch 7/100: Train Loss = 57.5634, Val Loss = 0.4339
Epoch 8/100: Train Loss = 57.8961, Val Loss = 0.4325
Epoch 9/100: Train Loss = 56.7379, Val Loss = 0.4359
Epoch 10/100: Train Loss = 57.1352, Val Loss = 0.4323
Epoch 11/100: Train Loss = 56.7633, Val Loss = 0.4316
Epoch 12/100: Train Loss = 57.1597, Val Loss = 0.4323
Epoch 13/100: Train Loss = 56.9338, Val Loss = 0.4341
Epoch 14/100: Train Loss = 56.6923, Val Loss = 0.4342
Epoch 15/100: Train Loss = 56.8582, Val Loss = 0.4319
Epoch 16/100: Train Loss = 57.0295, Val Loss = 0.4371
Early stopping triggered.


In [16]:
accuracy = evaluate_model(concat_model, X_hd_val, y_hd_val)
print(f"Combining Datasets Trained Accuracy: {accuracy:.2%}")

Combining Datasets Trained Accuracy: 58.15%
