In [10]:
import torch
import random
from torch.utils.data import DataLoader, Dataset

import torch.nn as nn
import torch.optim as optim
import pandas as pd

import numpy as np

from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics import average_precision_score

if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using device: MPS")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using device: CUDA")
else:
    device = torch.device("cpu")
    print("Using device: CPU")
print(f"Using device: {device}")

Using device: CUDA
Using device: cuda


In [11]:
def set_seed(seed):
    torch.manual_seed(seed)  # Sets the seed for CPU operations
    torch.cuda.manual_seed(seed)  # Sets the seed for CUDA GPU operations
    torch.cuda.manual_seed_all(seed)  # If using multiple GPUs
    random.seed(seed)  # Python's random library
    np.random.seed(seed)  # NumPy
    
    # For determinism in certain CUDA operations
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [12]:
set_seed(688)

expected_columns = [
    'Age', 'Gender', 'Height', 'Weight', 'Albumin', 'ALP', 'ALT', 'AST',
    'Bilirubin', 'BUN', 'Cholesterol', 'Creatinine', 'FiO2', 'DiasABP',
    'GCS', 'Glucose', 'HCO3', 'HCT', 'HR', 'K', 'Lactate', 'Mg', 'MAP',
    'MechVent', 'Na', 'NIDiasABP', 'NIMAP', 'NISysABP', 'PaCO2', 'PaO2',
    'pH', 'Platelets', 'RespRate', 'SaO2', 'SysABP', 'Temp', 'TroponinI',
    'TroponinT', 'Urine', 'WBC', 'RecordID'
]

# Load the data
outcomes_df_a = pd.read_csv('../../data/Outcomes-a.txt')
outcomes_df_a.set_index('RecordID', inplace=True)

outcomes_df_b = pd.read_csv('../../data/Outcomes-b.txt')
outcomes_df_b.set_index('RecordID', inplace=True)

outcomes_df_c = pd.read_csv('../../data/Outcomes-c.txt')
outcomes_df_c.set_index('RecordID', inplace=True)


# Define static parameters
static_params = ['Age','Gender','Height', 'Weight', 'RecordID']

# Load the parquet file
# 49 rows per patient with timestamps 0 to 48 ( 49 rows per patient)
df_a = pd.read_parquet("../../data/set-a-imputed-scaled.parquet")
df_a['Time'] = df_a['Time'].str[:2].astype(float)
df_b = pd.read_parquet("../../data/set-b-imputed-scaled.parquet")
df_b['Time'] = df_b['Time'].str[:2].astype(float)
df_c = pd.read_parquet("../../data/set-c-imputed-scaled.parquet")
df_c['Time'] = df_c['Time'].str[:2].astype(float)

# Merge labels into df_a
df_a = df_a.merge(outcomes_df_a, on='RecordID')
df_b = df_b.merge(outcomes_df_b, on='RecordID')
df_c = df_c.merge(outcomes_df_c, on='RecordID')

# Group data by RecordID to create sequences
grouped_a = df_a.groupby('RecordID')
grouped_b = df_b.groupby('RecordID')
grouped_c = df_c.groupby('RecordID')

In [13]:
# Prepare data and labels for the Transformer
data_a = []
labels_a = []
for record_id, group in grouped_a:
    group = group.sort_values(by='Time')  # Ensure rows are sorted by time
    # print(f"RecordID {record_id}: Columns in group = {group.columns.tolist()}, Number of columns = {len(group.columns.tolist())}")
    
    features = group.drop(columns=['RecordID', 'Time', 'SAPS-I','SOFA','Length_of_stay','Survival', 'In-hospital_death']).values
    
    label = group['In-hospital_death'].iloc[0]
    # Debug: Print the shape and first few rows of the features
    # print(f"RecordID {record_id}: Features shape = {features.shape}")
    # print(f"RecordID {record_id}: First row of features = {features[0]}")
    data_a.append(features)  # Drop non-feature columns
    labels_a.append(label)


    # print(f"RecordID {record_id}:")
    # print("Features shape:", features.shape)
    # print("First row of features:", features[0])  # Print the first row of features
    # print("Label:", label)
    # Debug: Print the shape of each patient's sequence
    # print(f"RecordID {record_id}: features shape = {features.shape}, label = {label}")

# print("Feature columns in training data:", df_a.drop(columns=['RecordID', 'Time', 'SAPS-I','SOFA','Length_of_stay','Survival', 'In-hospital_death']).columns.tolist())
# Convert data and labels to PyTorch tensors
data_a = torch.tensor(data_a, dtype=torch.float32)  # Shape: (num_patients, 49, 45)
labels_a = torch.tensor(labels_a, dtype=torch.float32)  # Shape: (num_patients,)

# Debug: Print the shapes of the tensors


data_b = []
labels_b = []
for record_id, group in grouped_b:
    group = group.sort_values(by='Time')  # Ensure rows are sorted by time
    features = group.drop(columns=['RecordID', 'Time', 'SAPS-I','SOFA','Length_of_stay','Survival', 'In-hospital_death']).values
    label = group['In-hospital_death'].iloc[0]
    data_b.append(features)  # Drop non-feature columns
    labels_b.append(label)

    # Debug: Print the shape of each patient's sequence
    # print(f"RecordID {record_id}: features shape = {features.shape}, label = {label}")
# Convert data and labels to PyTorch tensors
data_b = torch.tensor(data_b, dtype=torch.float32)  # Shape: (num_patients, 49, 45)
labels_b = torch.tensor(labels_b, dtype=torch.float32)  # Shape: (num_patients,)


data_c = []
labels_c = []
for record_id, group in grouped_c:
    group = group.sort_values(by='Time')  # Ensure rows are sorted by time
    features = group.drop(columns=['RecordID', 'Time', 'SAPS-I','SOFA','Length_of_stay','Survival', 'In-hospital_death']).values
    label = group['In-hospital_death'].iloc[0]
    data_c.append(features)  # Drop non-feature columns
    labels_c.append(label)

    # Debug: Print the shape of each patient's sequence
    # print(f"RecordID {record_id}: features shape = {features.shape}, label = {label}")
# Convert data and labels to PyTorch tensors
data_c = torch.tensor(data_c, dtype=torch.float32)  # Shape: (num_patients, 49, 45)
labels_c = torch.tensor(labels_c, dtype=torch.float32)  # Shape: (num_patients,)

print("First training sample:", data_a[0][0])
print("First validation sample:", data_b[0][0])
print(f"Total sequences in data_a: {len(data_a)}")
print(f"Total sequences in data_b: {len(data_b)}")
print(f"Total sequences in data_c: {len(data_c)}")

First training sample: tensor([ 1.0000e+00,  0.0000e+00, -5.8361e-01, -1.7973e-16,  6.5631e-14,
         9.3959e-16, -5.7303e-16,  8.7475e-16, -3.0446e-14, -8.2541e-16,
         1.0968e-12, -5.0793e-13, -6.3809e-13, -3.2301e-14,  3.3457e-14,
        -7.3069e-15, -7.0036e-15,  2.8909e-16,  1.2499e-15,  5.4436e-13,
         1.5882e-16,  2.7398e-16, -1.2818e-13, -1.2364e-15, -2.2249e-16,
        -2.0550e-16,  0.0000e+00,  0.0000e+00,  2.5846e-15, -1.6663e-15,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         4.0089e-12,  0.0000e+00,  1.1195e-14, -8.8764e-16,  3.9742e-15,
         0.0000e+00])
First validation sample: tensor([ 1.0000e+00,  1.0000e+00,  3.2761e-01, -5.4593e-03, -2.9277e-02,
         2.2058e-02,  5.4288e-03,  8.7770e-03, -9.5059e-03, -1.0698e-03,
         7.1603e-01, -7.1664e-03, -2.3243e-02,  4.2705e-03,  1.8854e-02,
         3.6200e-02,  3.9522e-02, -2.0674e-02, -1.3409e-02, -1.9250e-02,
        -1.5094e-03,  1.2054e-02,  1.0751e-01, -1.2245

In [14]:
class TransformerModel(nn.Module):
    def __init__(self, input_dim, d_model, nhead, num_layers, num_classes):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Linear(input_dim, d_model)
        self.positional_encoding = nn.Parameter(torch.zeros(1, 100, d_model))  # Adjust max sequence length as needed
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_layers)
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, x):
        # Add positional encoding
        x = self.embedding(x) + self.positional_encoding[:, :x.size(1), :]
        x = x.permute(1, 0, 2)  # Transformer expects (seq_len, batch_size, d_model)
        x = self.transformer(x, x)
        x = x.mean(dim=0)  # Global average pooling
        x = self.fc(x)
        return x

In [15]:
class TimeSeriesDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]


In [16]:
# Hyperparameters
input_dim = 41  # Number of features in time series
d_model = 128
nhead = 8
num_layers = 3
num_classes = 1  # Binary classification (True/False)
learning_rate = 0.00005
batch_size = 8
num_epochs = 10
weight_decay = 0.00016


# Create Dataset and DataLoader
dataset = TimeSeriesDataset(data_a, labels_a)
validate_dataset = TimeSeriesDataset(data_b, labels_b)
test_dataset = TimeSeriesDataset(data_c, labels_c)

dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
validate_dataloader = DataLoader(validate_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [17]:
# Initialize the Transformer model
model = TransformerModel(
    input_dim=41, 
    d_model=d_model, 
    nhead=nhead, 
    num_layers=num_layers, 
    num_classes=num_classes
).to(device)


criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)


best_auroc = 0.0  # Track the best AuROC on the validation set
best_model_state = None

for epoch in range(num_epochs):
    # Training phase
    model.train()
    train_loss = 0.0
    for batch_idx, (inputs, targets) in enumerate(dataloader):
        inputs, targets = inputs.to(device), targets.to(device)
        
        # Forward pass
        outputs = model(inputs)

        # Print actual vs. predicted labels
        predictions = torch.sigmoid(outputs).squeeze() > 0.5  # Convert logits to binary predictions
        # print(f"Batch {batch_idx}:")
        # print(f"Actual Labels   : {targets.cpu().int().tolist()}")
        # print(f"Predicted Labels: {predictions.cpu().int().tolist()}")


        loss = criterion(outputs.squeeze(), targets)
        train_loss += loss.item()

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Average training loss for the epoch
    train_loss /= len(dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {train_loss:.4f}")

    # Validation phase
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    all_val_targets = []
    all_val_predictions = []

    with torch.no_grad():
        for inputs, targets in validate_dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            
            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), targets)
            val_loss += loss.item()

            # Store predictions and targets for metrics
            probabilities = torch.sigmoid(outputs).squeeze()  # Convert logits to probabilities
            all_val_predictions.extend(probabilities.cpu().tolist())  # Store probabilities
            all_val_targets.extend(targets.cpu().tolist())  # Store actual labels

            # Calculate binary predictions for accuracy
            predictions = probabilities > 0.5  # Binary classification threshold
            correct += (predictions == targets).sum().item()
            total += targets.size(0)

    # Calculate validation metrics
    avg_val_loss = val_loss / len(validate_dataloader)
    val_accuracy = correct / total
    val_auroc = roc_auc_score(all_val_targets, all_val_predictions)
    val_auprc = average_precision_score(all_val_targets, all_val_predictions)

    print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")
    print(f"Validation AuROC: {val_auroc:.4f}, Validation AuPRC: {val_auprc:.4f}")

    # Save the best model based on validation AuROC
    if val_auroc > best_auroc:
        best_auroc = val_auroc
        best_model_state = model.state_dict()  # Save the model's state
        print(f"New best model found at epoch {epoch+1} with Validation AuROC: {val_auroc:.4f}")

# Save the best model to a file
if best_model_state is not None:
    torch.save(best_model_state, "best_model.pth")
    print(f"Best model saved with Validation AuROC: {best_auroc:.4f}")



Epoch [1/10], Training Loss: 0.3449
Validation Loss: 0.3319, Validation Accuracy: 0.8665
Validation AuROC: 0.8231, Validation AuPRC: 0.4456
New best model found at epoch 1 with Validation AuROC: 0.8231
Epoch [2/10], Training Loss: 0.3186
Validation Loss: 0.3225, Validation Accuracy: 0.8590
Validation AuROC: 0.8359, Validation AuPRC: 0.4636
New best model found at epoch 2 with Validation AuROC: 0.8359
Epoch [3/10], Training Loss: 0.2992
Validation Loss: 0.3227, Validation Accuracy: 0.8680
Validation AuROC: 0.8225, Validation AuPRC: 0.4631
Epoch [4/10], Training Loss: 0.2904
Validation Loss: 0.3325, Validation Accuracy: 0.8708
Validation AuROC: 0.8197, Validation AuPRC: 0.4534
Epoch [5/10], Training Loss: 0.2733
Validation Loss: 0.3239, Validation Accuracy: 0.8662
Validation AuROC: 0.8224, Validation AuPRC: 0.4565
Epoch [6/10], Training Loss: 0.2624
Validation Loss: 0.3288, Validation Accuracy: 0.8632
Validation AuROC: 0.8197, Validation AuPRC: 0.4414
Epoch [7/10], Training Loss: 0.2520


In [18]:
# Final evaluation on the test dataset
print("\nEvaluating on the test dataset...")
model.load_state_dict(torch.load("best_model.pth"))  # Load the best model
model.eval()
test_loss = 0.0
correct = 0
total = 0
all_test_targets = []
all_test_predictions = []

with torch.no_grad():
    for inputs, targets in test_dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), targets)
        test_loss += loss.item()

        # Store predictions and targets for metrics
        probabilities = torch.sigmoid(outputs).squeeze()  # Convert logits to probabilities
        all_test_predictions.extend(probabilities.cpu().tolist())  # Store probabilities
        all_test_targets.extend(targets.cpu().tolist())  # Store actual labels

        # Calculate binary predictions for accuracy
        predictions = probabilities > 0.5  # Binary classification threshold
        correct += (predictions == targets).sum().item()
        total += targets.size(0)

# Calculate test metrics
avg_test_loss = test_loss / len(test_dataloader)
test_accuracy = correct / total
test_auroc = roc_auc_score(all_test_targets, all_test_predictions)
test_auprc = average_precision_score(all_test_targets, all_test_predictions)

# Print final test performance
print(f"Test Loss: {avg_test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")
print(f"Test AuROC: {test_auroc:.4f}, Test AuPRC: {test_auprc:.4f}")


Evaluating on the test dataset...


  model.load_state_dict(torch.load("best_model.pth"))  # Load the best model


Test Loss: 0.4326, Test Accuracy: 0.8485
Test AuROC: 0.7884, Test AuPRC: 0.4115
