In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
import seaborn as sns
#import pandas_profiling
from pydantic_settings import BaseSettings
%matplotlib inline
df = pd.read_csv('framingham_extended.csv')
df_hd = pd.read_csv('heart_disease_fixed.csv')

In [2]:
# Filling out missing values
df['BPMeds'].fillna(0, inplace = True)
df['glucose'].fillna(df.glucose.mean(), inplace = True)
df['totChol'].fillna(df.totChol.mean(), inplace = True)
df['education'].fillna(1, inplace = True)
df['BMI'].fillna(df.BMI.mean(), inplace = True)
df['heartRate'].fillna(df.heartRate.mean(), inplace = True)
df.isna().sum()
df['fbs'] = (df['glucose'] > 120).astype(int)  # engineer 'fbs' since it's not available

# Filling values for heart_disease dataset
df_hd['trestbps'].fillna(df_hd.trestbps.mean(), inplace = True)
df_hd['chol'].fillna(df_hd.chol.mean(), inplace = True)
df_hd['result'] = (df_hd['target'] > 0).astype(int)

# Fill in fbs column using sampling
df_hd['fbs'] = pd.to_numeric(df_hd['fbs'], errors='coerce')
fbs_dist = df_hd['fbs'].dropna().value_counts(normalize=True)
p_0 = fbs_dist.get(0.0, 0)
p_1 = fbs_dist.get(1.0, 0)
total = p_0 + p_1
p_0 /= total
p_1 /= total
num_missing = df_hd['fbs'].isna().sum()
random_fill = np.random.choice([0, 1], size=num_missing, p=[p_0, p_1])
df_hd.loc[df_hd['fbs'].isna(), 'fbs'] = random_fill

shared_features = ['age', 'sex', 'trestbps', 'chol', 'fbs']

df.rename(columns={
    'male': 'sex',
    'sysBP': 'trestbps',
    'totChol': 'chol',
    'heartRate': 'thalach'
}, inplace=True)

df['result'] = df['TenYearCHD'].astype(int)

df_combined_fram = pd.concat([df[shared_features+['result']], df_hd[shared_features+['result']]])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['BPMeds'].fillna(0, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['glucose'].fillna(df.glucose.mean(), inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting value

In [3]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# separate independent & dependent variables
X = df.loc[:,shared_features]  #independent columns
y = df.iloc[:,-1]    #target column i.e CHD

X_hd = df_hd.loc[:,shared_features]
y_hd = df_hd.loc[:,['result']].iloc[:,0]

X_concat_fram = df_combined_fram.loc[:,shared_features]
y_concat_fram = df_combined_fram.loc[:,['result']].iloc[:,0]




In [4]:
from sklearn.model_selection import train_test_split

#y = df['TenYearCHD'] #target variable
#X = df.drop(['TenYearCHD'], axis = 1) #features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

X_hd_train, X_hd_test, y_hd_train, y_hd_test = train_test_split(X_hd, y_hd, test_size=0.2)

X_concat_fram_train, X_concat_fram_test, y_concat_fram_train, y_concat_fram_test = train_test_split(X_concat_fram, y_concat_fram, test_size=0.2)

print (X_hd_train.shape, y_hd_train.shape)
print (X_hd_test.shape, y_hd_test.shape)
print (X_concat_fram_train.shape, y_concat_fram_train.shape)
print (X_concat_fram_test.shape, y_concat_fram_test.shape)

(4603, 5) (4603,)
(1151, 5) (1151,)
(736, 5) (736,)
(184, 5) (184,)
(5339, 5) (5339,)
(1335, 5) (1335,)


# ECE228 optimization: MLP

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np


# 1. Configurable MLP Model
class ConfigurableMLP(nn.Module):
    # Note this is supposed to be a binary classification task, bacause the target output "10 year CHD" is 
    # either 1 or 0
    def __init__(self, input_dim, hidden_dim, num_hidden_layers=2,
                 activation='relu', dropout_rate=0.0):
        super().__init__()

        # Choose activation function
        if activation == 'relu':
            activation_fn = nn.ReLU()
        elif activation == 'sigmoid':
            activation_fn = nn.Sigmoid()
        elif activation == 'tanh':
            activation_fn = nn.Tanh()
        else:
            raise ValueError(f"Unsupported activation: {activation}")

        layers = []

        # Input layer
        layers.append(nn.Linear(input_dim, hidden_dim))
        layers.append(activation_fn)

        # Hidden layers
        for _ in range(num_hidden_layers):
            if dropout_rate > 0:
                layers.append(nn.Dropout(dropout_rate))
            layers.append(nn.Linear(hidden_dim, hidden_dim))
            layers.append(activation_fn)

        # Output layer (for binary classification — change if needed)
        layers.append(nn.Linear(hidden_dim, 1))  # output logits
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)

# 2. train model
def train_model(
    X_train, y_train, X_val, y_val,
    input_dim,
    hidden_dim=64,
    num_hidden_layers=2,
    activation='relu',
    dropout_rate=0.0,
    learning_rate=1e-3,
    batch_size=32,
    epochs=100,
    use_l1=False,
    l1_lambda=1e-5,
    use_l2=False,
    l2_lambda=1e-4,
    early_stopping_patience=10,
    skip_training=False
):
    # Build model
    model = ConfigurableMLP(
        input_dim=input_dim,
        hidden_dim=hidden_dim,
        num_hidden_layers=num_hidden_layers,
        activation=activation,
        dropout_rate=dropout_rate if dropout_rate > 0 else 0.0
    )

    # Set optimizer
    optimizer = optim.Adam(model.parameters(), lr=learning_rate,
                           weight_decay=l2_lambda if use_l2 else 0.0)

    # Use BCEWithLogitsLoss for binary classification with logits
    criterion = nn.BCEWithLogitsLoss()

    # DataLoader setup
    train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=batch_size)

    best_val_loss = float('inf')
    patience_counter = early_stopping_patience
    best_model_state = None

    # skip training for baseline results
    if skip_training:
        print("[Baseline] Skipping training, evaluating untrained model...")
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for xb, yb in val_loader:
                output = model(xb)
                val_loss += criterion(output, yb.view(-1, 1)).item()
        val_loss /= len(val_loader)
        print(f"[Baseline] Untrained Model Val Loss: {val_loss:.4f}")
        return model


    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for xb, yb in train_loader:
            optimizer.zero_grad()
            output = model(xb)
            loss = criterion(output, yb.view(-1, 1))


            if use_l1:
                l1_norm = sum(p.abs().sum() for p in model.parameters())
                loss += l1_lambda * l1_norm

            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Validation loss
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for xb, yb in val_loader:
                output = model(xb)
                val_loss += criterion(output, yb.view(-1,1)).item()
        val_loss /= len(val_loader)

        print(f"Epoch {epoch+1}/{epochs}: Train Loss = {total_loss:.4f}, Val Loss = {val_loss:.4f}")

        # Early stopping logic
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = early_stopping_patience
            best_model_state = model.state_dict()
        else:
            patience_counter -= 1
            if patience_counter == 0:
                print("Early stopping triggered.")
                break

    # Load best model
    if best_model_state:
        model.load_state_dict(best_model_state)

    return model





In [6]:
from sklearn.preprocessing import StandardScaler

# Framingham dataset
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # Fit only on Framingham

X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

X_train, X_val, y_train, y_val = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)

# Heart Disease dataset — use the SAME SCALER, only transform
X_hd_scaled = scaler.transform(X_hd)
X_hd_tensor = torch.tensor(X_hd_scaled, dtype=torch.float32)
y_hd_tensor = torch.tensor(y_hd, dtype=torch.float32)

# Optional split
X_hd_train, X_hd_val, y_hd_train, y_hd_val = train_test_split(X_hd_tensor, y_hd_tensor, test_size=0.2, random_state=42)

# Concatenated dataset — use the SAME SCALER, only transform
X_concat_scaled = scaler.transform(X_concat_fram)
X_concat_tensor = torch.tensor(X_concat_scaled, dtype=torch.float32)
y_concat_tensor = torch.tensor(y_concat_fram.values, dtype=torch.float32)

# Optional split
X_concat_train, X_concat_val, y_concat_train, y_concat_val = train_test_split(X_concat_tensor, y_concat_tensor, test_size=0.2, random_state=42)


In [7]:
def evaluate_model(model, X, y):
    model.eval()
    with torch.no_grad():
        outputs = torch.sigmoid(model(X)).squeeze()
        preds = (outputs >= 0.5).int()
        accuracy = (preds == y.int()).float().mean().item()
    return accuracy


In [8]:
untrained_model = train_model(
    X_train, y_train,
    X_val, y_val,
    input_dim=X_train.shape[1],
    hidden_dim=64,
    num_hidden_layers=3,
    activation='relu',
    dropout_rate=0.2,
    learning_rate=1e-3,
    use_l1=False,
    use_l2=True,
    l1_lambda=1e-5,
    l2_lambda=1e-4,
    early_stopping_patience=10,
    skip_training=True
)

[Baseline] Skipping training, evaluating untrained model...
[Baseline] Untrained Model Val Loss: 0.6915


In [9]:
accuracy = evaluate_model(untrained_model, X_hd_val, y_hd_val)
print(f"Baseline Accuracy: {accuracy:.2%}")

Baseline Accuracy: 37.50%


In [25]:
fram_trained_model = train_model(
    X_train, y_train,
    X_val, y_val,
    input_dim=X_train.shape[1],
    hidden_dim=64,
    num_hidden_layers=3,
    activation='relu',
    dropout_rate=0.2,
    learning_rate=1e-3,
    use_l1=False,
    use_l2=True,
    l1_lambda=1e-5,
    l2_lambda=1e-4,
    early_stopping_patience=10,
    skip_training=False
)


Epoch 1/100: Train Loss = 91.0478, Val Loss = 0.6025
Epoch 2/100: Train Loss = 86.5467, Val Loss = 0.6009
Epoch 3/100: Train Loss = 86.1404, Val Loss = 0.5990
Epoch 4/100: Train Loss = 85.5784, Val Loss = 0.5989
Epoch 5/100: Train Loss = 84.8546, Val Loss = 0.6015
Epoch 6/100: Train Loss = 85.0186, Val Loss = 0.5986
Epoch 7/100: Train Loss = 84.9082, Val Loss = 0.5972
Epoch 8/100: Train Loss = 83.9163, Val Loss = 0.5971
Epoch 9/100: Train Loss = 84.0052, Val Loss = 0.5949
Epoch 10/100: Train Loss = 83.7731, Val Loss = 0.5945
Epoch 11/100: Train Loss = 83.7235, Val Loss = 0.5944
Epoch 12/100: Train Loss = 82.9720, Val Loss = 0.5931
Epoch 13/100: Train Loss = 82.8679, Val Loss = 0.6035
Epoch 14/100: Train Loss = 82.8945, Val Loss = 0.6048
Epoch 15/100: Train Loss = 83.0291, Val Loss = 0.5929
Epoch 16/100: Train Loss = 82.6071, Val Loss = 0.5962
Epoch 17/100: Train Loss = 82.4932, Val Loss = 0.5920
Epoch 18/100: Train Loss = 82.5254, Val Loss = 0.5932
Epoch 19/100: Train Loss = 82.0840, V

In [26]:
accuracy = evaluate_model(fram_trained_model, X_hd_val, y_hd_val)
print(f"Framingham Dataset Trained Accuracy: {accuracy:.2%}")

Framingham Dataset Trained Accuracy: 62.50%


In [35]:
hd_trained_model = train_model(
    X_hd_train, y_hd_train,
    X_hd_val, y_hd_val,
    input_dim=X_train.shape[1],
    hidden_dim=64,
    num_hidden_layers=3,
    activation='relu',
    dropout_rate=0.2,
    learning_rate=1e-3,
    use_l1=False,
    use_l2=True,
    l1_lambda=1e-5,
    l2_lambda=1e-4,
    early_stopping_patience=10,
    skip_training=False
)


Epoch 1/100: Train Loss = 19.2025, Val Loss = 0.6525
Epoch 2/100: Train Loss = 15.8703, Val Loss = 0.6857
Epoch 3/100: Train Loss = 15.8176, Val Loss = 0.7093
Epoch 4/100: Train Loss = 15.9485, Val Loss = 0.6266
Epoch 5/100: Train Loss = 15.6668, Val Loss = 0.6378
Epoch 6/100: Train Loss = 15.3651, Val Loss = 0.6417
Epoch 7/100: Train Loss = 15.4975, Val Loss = 0.6325
Epoch 8/100: Train Loss = 15.6853, Val Loss = 0.6361
Epoch 9/100: Train Loss = 15.5890, Val Loss = 0.6468
Epoch 10/100: Train Loss = 15.5735, Val Loss = 0.6325
Epoch 11/100: Train Loss = 15.4798, Val Loss = 0.6361
Epoch 12/100: Train Loss = 15.4368, Val Loss = 0.6474
Epoch 13/100: Train Loss = 15.4688, Val Loss = 0.6335
Epoch 14/100: Train Loss = 15.2243, Val Loss = 0.6407
Early stopping triggered.


In [36]:
accuracy = evaluate_model(hd_trained_model, X_hd_val, y_hd_val)
print(f"Heart_Disease Dataset Trained Accuracy: {accuracy:.2%}")

Heart_Disease Dataset Trained Accuracy: 55.98%


In [14]:
print(torch.isnan(X_concat_train).any(), torch.isnan(y_concat_train).any())
print(torch.min(y_concat_train), torch.max(y_concat_train))
print(torch.min(X_concat_train), torch.max(X_concat_train))

tensor(False) tensor(False)
tensor(0.) tensor(1.)
tensor(-9.3399) tensor(620.0317)


In [15]:
concat_model = train_model(
    X_concat_train, y_concat_train, 
    X_concat_val, y_concat_val,
    input_dim=X_concat_train.shape[1],
    hidden_dim=64,
    num_hidden_layers=3,
    activation='relu',
    dropout_rate=0.2,
    learning_rate=1e-3,
    use_l1=False,
    use_l2=True,
    l1_lambda=1e-5,
    l2_lambda=1e-4,
    early_stopping_patience=10,
    skip_training=False
)

Epoch 1/100: Train Loss = 114.0467, Val Loss = 0.6259
Epoch 2/100: Train Loss = 105.6265, Val Loss = 0.6123
Epoch 3/100: Train Loss = 104.2090, Val Loss = 0.6061
Epoch 4/100: Train Loss = 103.2867, Val Loss = 0.5964
Epoch 5/100: Train Loss = 102.7792, Val Loss = 0.5964
Epoch 6/100: Train Loss = 102.4325, Val Loss = 0.5935
Epoch 7/100: Train Loss = 101.6774, Val Loss = 0.5919
Epoch 8/100: Train Loss = 101.8833, Val Loss = 0.5881
Epoch 9/100: Train Loss = 101.8309, Val Loss = 0.5883
Epoch 10/100: Train Loss = 100.8807, Val Loss = 0.5863
Epoch 11/100: Train Loss = 100.7786, Val Loss = 0.5882
Epoch 12/100: Train Loss = 100.6362, Val Loss = 0.5886
Epoch 13/100: Train Loss = 100.8818, Val Loss = 0.5900
Epoch 14/100: Train Loss = 100.3747, Val Loss = 0.5865
Epoch 15/100: Train Loss = 100.3565, Val Loss = 0.5873
Epoch 16/100: Train Loss = 100.7458, Val Loss = 0.5847
Epoch 17/100: Train Loss = 99.9089, Val Loss = 0.5844
Epoch 18/100: Train Loss = 99.7747, Val Loss = 0.5854
Epoch 19/100: Train L

In [16]:
accuracy = evaluate_model(concat_model, X_hd_val, y_hd_val)
print(f"Combining Datasets Trained Accuracy: {accuracy:.2%}")

Combining Datasets Trained Accuracy: 63.04%
