In [10]:
import random
import torch
from torch.utils.data import DataLoader, Dataset

import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics import average_precision_score
from torch.nn.utils.rnn import pad_sequence

if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using device: MPS")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using device: CUDA")
else:
    device = torch.device("cpu")
    print("Using device: CPU")
print(f"Using device: {device}")

Using device: CUDA
Using device: cuda


In [11]:
def set_seed(seed):
    torch.manual_seed(seed)  # Sets the seed for CPU operations
    torch.cuda.manual_seed(seed)  # Sets the seed for CUDA GPU operations
    torch.cuda.manual_seed_all(seed)  # If using multiple GPUs
    random.seed(seed)  # Python's random library
    np.random.seed(seed)  # NumPy
    
    # For determinism in certain CUDA operations
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [12]:
set_seed(688)

# Define expected columns
expected_columns = [
    'Age', 'Gender', 'Height', 'Weight', 'Albumin', 'ALP', 'ALT', 'AST',
    'Bilirubin', 'BUN', 'Cholesterol', 'Creatinine', 'FiO2', 'DiasABP',
    'GCS', 'Glucose', 'HCO3', 'HCT', 'HR', 'K', 'Lactate', 'Mg', 'MAP',
    'MechVent', 'Na', 'NIDiasABP', 'NIMAP', 'NISysABP', 'PaCO2', 'PaO2',
    'pH', 'Platelets', 'RespRate', 'SaO2', 'SysABP', 'Temp', 'TroponinI',
    'TroponinT', 'Urine', 'WBC', 'RecordID'
]

# Load the data
outcomes_df_a = pd.read_csv('../../data/Outcomes-a.txt')
outcomes_df_a.set_index('RecordID', inplace=True)

outcomes_df_b = pd.read_csv('../../data/Outcomes-b.txt')
outcomes_df_b.set_index('RecordID', inplace=True)

outcomes_df_c = pd.read_csv('../../data/Outcomes-c.txt')
outcomes_df_c.set_index('RecordID', inplace=True)

# Define static parameters
static_params = ['Age','Gender','Height', 'Weight', 'RecordID']

# Load the parquet file
# 49 rows per patient with timestamps 0 to 48 ( 49 rows per patient)
df_a = pd.read_parquet('../../data/set-a-triplet.parquet')
df_a['Time'] = df_a['Time'].astype(float)
df_b = pd.read_parquet('../../data/set-b-triplet.parquet')
df_b['Time'] = df_b['Time'].astype(float)
df_c = pd.read_parquet('../../data/set-c-triplet.parquet')
df_c['Time'] = df_c['Time'].astype(float)


# Merge labels into df
df_a = df_a.merge(outcomes_df_a, on='RecordID')
df_b = df_b.merge(outcomes_df_b, on='RecordID')
df_c = df_c.merge(outcomes_df_c, on='RecordID')


# Group data by RecordID to create sequences
grouped_a = df_a.groupby('RecordID')
grouped_b = df_b.groupby('RecordID')
grouped_c = df_c.groupby('RecordID')

In [13]:
# Prepare data and labels for the Transformer (data_a)
data_a = []
labels_a = []
sequence_lengths_a = []

for record_id, group in grouped_a:
    group = group.sort_values(by='Time')  # Ensure rows are sorted by time

    # Include 'Time' in the feature vector
    group['FeatureVector'] = group[['Time', 'Sensor', 'Value']].values.tolist()

    # Create a sequence of feature vectors for the patient
    features = torch.tensor(group['FeatureVector'].tolist(), dtype=torch.float32)  # Shape: (sequence_length, feature_dim)
    label = group['In-hospital_death'].iloc[0]  # Use the first label for the patient
    # print('features shape', features.shape)
    # print('feattures', features)
    data_a.append(features)
    labels_a.append(label)
    sequence_lengths_a.append(features.size(0))  # Store the original sequence length


# Pad sequences to the same length
# labels_a = labels_a[:8]
# data_a = data_a[:8]
padded_data_a = pad_sequence(data_a, batch_first=True, padding_value=0)  # Shape: (num_patients, max_sequence_length, feature_dim)
attention_mask_a = (padded_data_a.sum(dim=-1) != 0).float()  # Shape: (num_patients, max_sequence_length)
labels_a = torch.tensor(labels_a, dtype=torch.float32)  # Shape: (num_patients,)

print(f"Maximum sequence length: {max(sequence_lengths_a)}")
print(f"Sequence lengths: {sequence_lengths_a[:10]}")  # Print the first 10 sequence lengths
# Create an attention mask
attention_mask_a = (padded_data_a.sum(dim=-1) != 0).float()  # Shape: (num_patients, max_sequence_length)
print('data a shape', len(data_a))
print('padded data a shape', padded_data_a.shape)
print('attention mask a shape', attention_mask_a.shape)
print('attentionmask a', attention_mask_a)
# # Debug: Print the shapes of the tensors
# print(f"Shape of data_a tensor: {data_a.shape}")
# print(f"Shape of labels_a tensor: {labels_a.shape}")

# Prepare data and labels for the Transformer (data_b)
data_b = []
labels_b = []
sequence_lengths_b = []

for record_id, group in grouped_b:
    group = group.sort_values(by='Time')  # Ensure rows are sorted by time

    # Include 'Time' in the feature vector
    group['FeatureVector'] = group[['Time', 'Sensor', 'Value']].values.tolist()


    # Create a sequence of feature vectors for the patient
    features = torch.tensor(group['FeatureVector'].tolist(), dtype=torch.float32)  # Shape: (sequence_length, feature_dim)
    label = group['In-hospital_death'].iloc[0]  # Use the first label for the patient

    data_b.append(features)
    labels_b.append(label)
    sequence_lengths_b.append(features.size(0))  # Store the original sequence length

labels_b = labels_b[:8]
data_b = data_b[:8]
# Pad sequences to the same length
padded_data_b = pad_sequence(data_b, batch_first=True, padding_value=0)  # Shape: (num_patients, max_sequence_length, feature_dim)
labels_b = torch.tensor(labels_b, dtype=torch.float32)  # Shape: (num_patients,)
# Create an attention mask
attention_mask_b = (padded_data_b.sum(dim=-1) != 0).float()  # Shape: (num_patients, max_sequence_length)

# Prepare data and labels for the Transformer (data_c)
data_c = []
labels_c = []
sequence_lengths_c = []

for record_id, group in grouped_c:
    group = group.sort_values(by='Time')  # Ensure rows are sorted by time

    # Include 'Time' in the feature vector
    group['FeatureVector'] = group[['Time', 'Sensor', 'Value']].values.tolist()


    # Create a sequence of feature vectors for the patient
    features = torch.tensor(group['FeatureVector'].tolist(), dtype=torch.float32)  # Shape: (sequence_length, feature_dim)
    label = group['In-hospital_death'].iloc[0]  # Use the first label for the patient

    data_c.append(features)
    labels_c.append(label)
    sequence_lengths_c.append(features.size(0))  # Store the original sequence length

# Pad sequences to the same length
padded_data_c = pad_sequence(data_c, batch_first=True)  # Shape: (num_patients, max_sequence_length, feature_dim)
labels_c = torch.tensor(labels_c, dtype=torch.float32)  # Shape: (num_patients,)
# Create an attention mask
attention_mask_c = (padded_data_c.sum(dim=-1) != 0).float()  # Shape: (num_patients, max_sequence_length)


print(f"Total sequences in data_a: {len(data_a)}")
print(f"Total sequences in data_b: {len(data_b)}")
print(f"Total sequences in data_c: {len(data_c)}")

Maximum sequence length: 747
Sequence lengths: [261, 501, 445, 459, 305, 417, 451, 592, 368, 465]
data a shape 4000
padded data a shape torch.Size([4000, 747, 3])
attention mask a shape torch.Size([4000, 747])
attentionmask a tensor([[1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.]])
Total sequences in data_a: 4000
Total sequences in data_b: 8
Total sequences in data_c: 4000


In [14]:
class TimeSeriesDataset(Dataset):
    def __init__(self, data, labels, masks):
        self.data = data
        self.labels = labels
        self.masks = masks

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx], self.masks[idx]

In [15]:
class TransformerModel(nn.Module):
    def __init__(self, input_dim, d_model, nhead, num_layers, num_classes):
        super(TransformerModel, self).__init__()
        self.sensor_embed = nn.Embedding(41, d_model//2)
        
        # Continuous value projection
        self.value_proj = nn.Linear(2, d_model//2)  # For time and value
        self.embedding = nn.Linear(input_dim, d_model)
        encoder_layers = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)
        self.pooling = nn.AdaptiveAvgPool1d(1)  # Pool across the sequence dimension
        self.fc = nn.Linear(d_model, num_classes)


    def forward(self, x, mask):
        sensor_ids = x[:,:,1].long()  # Sensor indices
        time_values = x[:,:,[0,2]]    # Time and value
        
        # Get embeddings
        sensor_emb = self.sensor_embed(sensor_ids)  # [batch, seq_len, d_model//2]
        value_emb = self.value_proj(time_values)     # [batch, seq_len, d_model//2]
        
        # Combine embeddings
        x = torch.cat([sensor_emb, value_emb], dim=-1)
        
        # Transformer expects mask where 0 means ignore
        padding_mask = (mask == 0)
        x = self.transformer_encoder(x, src_key_padding_mask=(mask == 0))
        x = x.mean(dim=1)  # Average pooling across the sequence dimension
        x = self.fc(x)
        return x  # Apply sigmoid activation for binary classification

In [16]:
# Hyperparameters
input_dim = 3 # Number of features in time series
d_model = 256  #was at 64
nhead = 4
num_layers = 3
num_classes = 1  # Binary classification (True/False)
learning_rate = 0.0001
batch_size = 8
num_epochs = 10


# Create Dataset and DataLoader
# Create datasets
dataset = TimeSeriesDataset(padded_data_a, labels_a, attention_mask_a)
validate_dataset = TimeSeriesDataset(padded_data_b, labels_b, attention_mask_b)
test_dataset = TimeSeriesDataset(padded_data_c, labels_c, attention_mask_c)

# Create DataLoaders
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
validate_dataloader = DataLoader(validate_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)



In [17]:


# Initialize the Transformer model
model = TransformerModel(
    input_dim=input_dim, 
    d_model=d_model, 
    nhead=nhead, 
    num_layers=num_layers, 
    num_classes=num_classes
).to(device)


criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.0000)

# Initialize variables for tracking the best model
best_auroc = 0.0  # Track the best AuROC on the validation set
best_model_state = None

for epoch in range(num_epochs):
    # Training phase
    model.train()
    train_loss = 0.0
    for batch_idx, (inputs, targets, masks) in enumerate(dataloader):
        
        inputs, targets, masks = inputs.to(device), targets.to(device), masks.to(device)
        if torch.isnan(inputs).any() or torch.isinf(inputs).any():
            print(f"NaN or Inf detected in inputs at batch {batch_idx}")
        if torch.isnan(targets).any() or torch.isinf(targets).any():
            print(f"NaN or Inf detected in targets at batch {batch_idx}")

        # Forward pass
        outputs = model(inputs, masks)
        loss = criterion(outputs.squeeze(), targets)
        print(f"Batch {batch_idx}, Loss: {loss.item():.4f}")
        if torch.isnan(loss):
            print(f"NaN detected in loss at batch {batch_idx}")
            print(f"Logits: {outputs}")
            print(f"Targets: {targets}")
            break
        # Print actual vs. predicted labels
        predictions = torch.sigmoid(outputs).squeeze() > 0.5  # Convert logits to binary predictions
        print(f"Batch {batch_idx}:" )
        print(f"Outputs   : {outputs.cpu().tolist()}")
        print(f"Actual Labels   : {targets.cpu().int().tolist()}")
        print(f"Predicted Labels: {predictions.cpu().int().tolist()}")

        train_loss += loss.item()

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Average training loss for the epoch
    train_loss /= len(dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {train_loss:.4f}")

    # Validation phase
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    all_val_targets = []
    all_val_predictions = []

    with torch.no_grad():
        for inputs, targets, masks in validate_dataloader:
            inputs, targets, masks = inputs.to(device), targets.to(device), masks.to(device)
            
            # Forward pass
            outputs = model(inputs, masks)
            loss = criterion(outputs.squeeze(), targets)
            val_loss += loss.item()

            # Store predictions and targets for metrics
            probabilities = torch.sigmoid(outputs).squeeze()  # Convert logits to probabilities
            all_val_predictions.extend(probabilities.cpu().tolist())  # Store probabilities
            all_val_targets.extend(targets.cpu().tolist())  # Store actual labels

            # Calculate binary predictions for accuracy
            predictions = probabilities > 0.5  # Binary classification threshold
            correct += (predictions == targets).sum().item()
            total += targets.size(0)
    # Calculate validation metrics
    avg_val_loss = val_loss / len(validate_dataloader)
    val_accuracy = correct / total
    val_auroc = roc_auc_score(all_val_targets, all_val_predictions)
    val_auprc = average_precision_score(all_val_targets, all_val_predictions)

    print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")
    print(f"Validation AuROC: {val_auroc:.4f}, Validation AuPRC: {val_auprc:.4f}")

    # Save the best model based on validation AuROC
    if val_auroc > best_auroc:
        best_auroc = val_auroc
        best_model_state = model.state_dict()  # Save the model's state
        print(f"New best model found at epoch {epoch+1} with Validation AuROC: {val_auroc:.4f}")

# Save the best model to a file
if best_model_state is not None:
    torch.save(best_model_state, "best_model.pth")
    print(f"Best model saved with Validation AuROC: {best_auroc:.4f}")

Batch 0, Loss: 0.7081
Batch 0:
Outputs   : [[0.029996328055858612], [0.07343407720327377], [0.05752747505903244], [9.974092245101929e-05], [0.051449015736579895], [0.05335605889558792], [0.02643958479166031], [0.05647147446870804]]
Actual Labels   : [0, 0, 1, 0, 0, 0, 0, 0]
Predicted Labels: [1, 1, 1, 1, 1, 1, 1, 1]
Batch 1, Loss: 0.4053
Batch 1:
Outputs   : [[-1.444733738899231], [-1.3586562871932983], [-1.248844861984253], [-1.376766562461853], [-1.4181690216064453], [-1.4547995328903198], [-1.2913240194320679], [-1.4527316093444824]]
Actual Labels   : [1, 0, 0, 0, 0, 0, 0, 0]
Predicted Labels: [0, 0, 0, 0, 0, 0, 0, 0]
Batch 2, Loss: 0.0881
Batch 2:
Outputs   : [[-2.4764204025268555], [-2.2739038467407227], [-2.3810417652130127], [-2.3592441082000732], [-2.4037256240844727], [-2.3606138229370117], [-2.344684600830078], [-2.4910733699798584]]
Actual Labels   : [0, 0, 0, 0, 0, 0, 0, 0]
Predicted Labels: [0, 0, 0, 0, 0, 0, 0, 0]
Batch 3, Loss: 0.0351
Batch 3:
Outputs   : [[-3.3238954544

In [18]:
# Final evaluation on the test dataset
print("\nEvaluating on the test dataset...")
model.load_state_dict(torch.load("best_model.pth"))  # Load the best model
model.eval()
test_loss = 0.0
correct = 0
total = 0
all_test_targets = []
all_test_predictions = []

with torch.no_grad():
    for inputs, targets, masks in test_dataloader:
        inputs, targets, masks = inputs.to(device), targets.to(device), masks.to(device)
        
        # Forward pass
        outputs = model(inputs, masks)
        loss = criterion(outputs.squeeze(), targets)
        test_loss += loss.item()

        # Store predictions and targets for metrics
        probabilities = torch.sigmoid(outputs).squeeze()  # Convert logits to probabilities
        all_test_predictions.extend(probabilities.cpu().tolist())  # Store probabilities
        all_test_targets.extend(targets.cpu().tolist())  # Store actual labels

        # Calculate binary predictions for accuracy
        predictions = probabilities > 0.5  # Binary classification threshold
        correct += (predictions == targets).sum().item()
        total += targets.size(0)

# Calculate test metrics
avg_test_loss = test_loss / len(test_dataloader)
test_accuracy = correct / total
test_auroc = roc_auc_score(all_test_targets, all_test_predictions)
test_auprc = average_precision_score(all_test_targets, all_test_predictions)

# Print final test performance
print(f"Test Loss: {avg_test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")
print(f"Test AuROC: {test_auroc:.4f}, Test AuPRC: {test_auprc:.4f}")


Evaluating on the test dataset...


  model.load_state_dict(torch.load("best_model.pth"))  # Load the best model


Test Loss: 0.3628, Test Accuracy: 0.8510
Test AuROC: 0.7866, Test AuPRC: 0.4184
