# Transfomers

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

from project_1.config import PROJ_ROOT, PROCESSED_DATA_DIR

[32m2025-03-24 11:25:10.986[0m | [1mINFO    [0m | [36mproject_1.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/francescobondi/Desktop/stuff/ETH/FS25/ML for Healthcare/project-1-ml4hc[0m


In [2]:
# Load the data from Parquet files
sets_dict = {}
sets = ["a", "b", "c"]

for set_name in sets:
    directory = PROCESSED_DATA_DIR / f"set_{set_name}_final.parquet"
    temp_set = pd.read_parquet(directory)
    sets_dict[f"set_{set_name}"] = temp_set

# Define file names
file_names = ["Outcomes-a.txt", "Outcomes-b.txt", "Outcomes-c.txt"]

# Directory path
base_path = PROJ_ROOT / "data" / "data_1" / "predicting-mortality-of-icu-patients-the-physionet-computing-in-cardiology-challenge-2012-1.0.0"

# Read files into DataFrames containing all variables
outcomes_a, outcomes_b, outcomes_c = [pd.read_csv(base_path / name) for name in file_names]

# Extract only the "RecordID" and "In-hospital_death" column into separate DataFrames
death_a, death_b, death_c = [df[["RecordID", "In-hospital_death"]] for df in [outcomes_a, outcomes_b, outcomes_c]]
print(death_a.shape)

#CHECK for missing values in the outcome data
"""print(death_a.isnull().sum())
print(death_b.isnull().sum())
print(death_c.isnull().sum())"""
# Assure the loading was correct
print(sets_dict["set_a"].shape)
sets_dict["set_a"].head()

(4000, 2)
(183416, 43)


Unnamed: 0,RecordID,Time,Gender,Height,Weight,Age,Albumin,Cholesterol,DiasABP,HCO3,...,Urine,WBC,pH,MechVent,TroponinT,ALP,ALT,AST,Bilirubin,TroponinI
0,132539.0,2025-03-10 00:00:00,0.0,-0.950526,-0.23008,-0.596332,1.671639,-0.013487,-0.832594,-0.109176,...,11.571429,0.753623,1.125,0.0,1.923077,0.132075,-0.176471,0.450704,1.545455,0.285714
1,132539.0,2025-03-10 01:00:00,0.0,-0.950526,-0.23008,-0.596332,1.967793,0.172112,-0.608431,-0.109176,...,2.857143,-0.42029,0.125,0.0,-0.246154,0.0,-0.294118,0.43662,0.0,-0.126984
2,132539.0,2025-03-10 02:00:00,0.0,-0.950526,-0.23008,-0.596332,-1.734132,0.125712,0.848629,0.830987,...,-0.357143,-0.014493,-0.875,0.0,0.0,0.773585,-0.205882,-0.380282,0.181818,-0.095238
3,132539.0,2025-03-10 03:00:00,0.0,-0.950526,-0.23008,-0.596332,1.523562,0.38091,-0.832594,-0.579257,...,0.642857,0.188406,-0.375,0.0,0.215385,-0.698113,-0.588235,1.126761,-0.181818,4.650794
4,132539.0,2025-03-10 04:00:00,0.0,-0.950526,-0.23008,-0.596332,0.487023,-0.96468,1.483758,-0.814297,...,-0.142857,-1.144928,1.0,0.0,2.738462,-0.490566,-0.558824,-0.225352,0.363636,0.904762


# Preprocess Data

In [3]:
# Remove ICUType from the data
for set_name in sets:
    if "ICUType" in sets_dict[f"set_{set_name}"].columns:
        sets_dict[f"set_{set_name}"] = sets_dict[f"set_{set_name}"].drop(columns=["ICUType"])

In [4]:
# Only convert sets A and C

# Convert the sets to PyTorch tensors
features_cols = [col for col in sets_dict["set_a"].columns if col not in ["RecordID", "Time"]]

sequences_a = []
sequences_c = []
set_a = sets_dict["set_a"]
set_c = sets_dict["set_c"]
for record_id, group in set_a.groupby("RecordID"):
    seq = group[features_cols].to_numpy(dtype=np.float32)
    sequences_a.append(torch.tensor(seq))

for record_id, group in set_c.groupby("RecordID"):
    seq = group[features_cols].to_numpy(dtype=np.float32)
    sequences_c.append(torch.tensor(seq))

# Now sequences_a is a list of PyTorch tensors, of shape (48, 41) each (48 timesteps, 41 features)
sequences_a[0].shape, sequences_c[0].shape

(torch.Size([48, 40]), torch.Size([47, 40]))

In [5]:
from torch.nn.utils.rnn import pad_sequence

padded_sequences_a = pad_sequence(sequences_a, batch_first=True)
padded_sequences_c = pad_sequence(sequences_c, batch_first=True)

train_X = padded_sequences_a
train_y = torch.tensor(death_a["In-hospital_death"])

test_X = padded_sequences_c
test_y = torch.tensor(death_c["In-hospital_death"])

train_X.shape, train_y.shape, test_X.shape, test_y.shape

(torch.Size([4000, 49, 40]),
 torch.Size([4000]),
 torch.Size([4000, 49, 40]),
 torch.Size([4000]))

In [6]:
from torch.utils.data import DataLoader, TensorDataset

batch_size = 64

train_dataset = TensorDataset(train_X, train_y)
test_dataset = TensorDataset(test_X, test_y)

train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size)

# Basic Implementation of Transformers (and Positional Encodings)

In [7]:
import torch
import torch.nn as nn

class TransformerClassifier(nn.Module):
    def __init__(self, input_size, num_classes=1, nhead=4, num_layers=2, dim_feedforward=128, dropout=0.3):
        super().__init__()
        self.input_size = input_size

        # Project input features to model dimension
        self.embedding = nn.Linear(input_size, dim_feedforward)

        # Positional Encoding
        self.pos_encoder = PositionalEncoding(dim_feedforward, dropout)

        # Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=dim_feedforward,
            nhead=nhead,
            dim_feedforward=dim_feedforward * 2,
            dropout=dropout,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Final classifier
        self.fc = nn.Linear(dim_feedforward, num_classes)

    def forward(self, x):
        # x: (batch, seq_len, input_size)
        x = self.embedding(x)                # (batch, seq_len, d_model)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)      # (batch, seq_len, d_model)

        x = x.mean(dim=1)                    # mean pooling over time
        out = self.fc(x).squeeze()           # (batch,)
        return out


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=500):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)  # (max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(torch.log(torch.tensor(10000.0)) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [8]:
print("contains NaNs:", torch.isnan(train_X).any())
print("contains Infs:", torch.isinf(train_X).any())

contains NaNs: tensor(False)
contains Infs: tensor(False)


In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = TransformerClassifier(input_size=train_X.shape[2]).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

epochs = 10

for epoch in range(epochs):
    model.train()
    running_loss = 0.0

    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        optimizer.zero_grad()
        outputs = model(batch_X)

        # Sanity check shapes
        assert outputs.shape == batch_y.shape, f"Shape mismatch: {outputs.shape} vs {batch_y.shape}"

        # Convert to float
        batch_y = batch_y.float()
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs} | Loss: {running_loss / len(train_loader):.4f}")

  from .autonotebook import tqdm as notebook_tqdm


Epoch 1/10 | Loss: 0.3811
Epoch 2/10 | Loss: 0.3354
Epoch 3/10 | Loss: 0.3307
Epoch 4/10 | Loss: 0.3244
Epoch 5/10 | Loss: 0.3157
Epoch 6/10 | Loss: 0.3128
Epoch 7/10 | Loss: 0.3043
Epoch 8/10 | Loss: 0.3000
Epoch 9/10 | Loss: 0.2949
Epoch 10/10 | Loss: 0.2793


In [10]:
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score
import torch.nn.functional as F
import numpy as np

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)

        # Forward pass
        logits = model(batch_X)  # output is raw score (logits)
        probs = torch.sigmoid(logits)  # convert to [0, 1]

        all_preds.extend(probs.cpu().numpy())
        all_labels.extend(batch_y.cpu().numpy())

# Convert to numpy arrays
all_preds = np.array(all_preds)
all_labels = np.array(all_labels)

# Thresholding at 0.5 for classification
binary_preds = (all_preds >= 0.5).astype(int)

# Metrics
roc_auc = roc_auc_score(all_labels, all_preds)
auprc = average_precision_score(all_labels, all_preds)
accuracy = accuracy_score(all_labels, binary_preds)

# Report
print(f"\n📊 Evaluation Metrics for Transformer:")
print(f"🔹 ROC AUC:  {roc_auc:.3f}")
print(f"🔹 AUPRC:    {auprc:.3f}")
print(f"🔹 Accuracy: {accuracy:.3f}")


📊 Evaluation Metrics for Transformer:
🔹 ROC AUC:  0.823
🔹 AUPRC:    0.465
🔹 Accuracy: 0.856


# Q2.3 - Tokenizing

In [11]:
# Read the initial data in parquet format
sets_dict_initial = {}
sets = ["a", "b", "c"]

for set_name in sets:
    directory = PROCESSED_DATA_DIR / f"set_{set_name}.parquet"
    temp_set = pd.read_parquet(directory)
    sets_dict_initial[f"set_{set_name}"] = temp_set

print(sets_dict_initial["set_a"].shape)
sets_dict_initial["set_a"].head()

(183416, 43)


Unnamed: 0,RecordID,Time,Age,BUN,Creatinine,GCS,Gender,Glucose,HCO3,HCT,...,PaCO2,PaO2,pH,DiasABP,MAP,SaO2,SysABP,Lactate,Cholesterol,TroponinI
0,132539.0,2025-03-10 00:00:00,54.0,,,,0.0,,,,...,,,,,,,,,,
1,132539.0,2025-03-10 01:00:00,,,,15.0,,,,,...,,,,,,,,,,
2,132539.0,2025-03-10 02:00:00,,,,,,,,,...,,,,,,,,,,
3,132539.0,2025-03-10 03:00:00,,,,,,,,,...,,,,,,,,,,
4,132539.0,2025-03-10 04:00:00,,,,15.0,,,,33.7,...,,,,,,,,,,


# Create a DataFrame following Horn et al.

In [12]:
from sklearn.preprocessing import MinMaxScaler

def build_TZV_dataframe(original_df, base_time="2025-03-10 00:00:00", duration_hours=48):
    df = original_df.copy()

    # Convert time to datetime
    df["Time"] = pd.to_datetime(df["Time"])
    start_time = pd.to_datetime(base_time)
    end_time = start_time + pd.Timedelta(hours=duration_hours)

    # Normalize time into [0, 1]
    total_seconds = (end_time - start_time).total_seconds()
    df["T"] = (df["Time"] - start_time).dt.total_seconds() / total_seconds

    # Drop RecordID if not needed
    feature_cols = [col for col in df.columns if col not in ["RecordID", "Time", "T"]]

    # Scale each feature individually (min-max)
    scaler = MinMaxScaler()
    df_scaled = df[feature_cols].copy()
    df_scaled[feature_cols] = scaler.fit_transform(df_scaled[feature_cols])

    # Stack into long format
    long_df = df_scaled.melt(ignore_index=False, value_vars=feature_cols, var_name="Z", value_name="V")
    long_df = long_df.reset_index(drop=True)

    # Add corresponding scaled time
    repeated_T = np.repeat(df["T"].values, len(feature_cols))
    long_df["T"] = repeated_T

    # Remove NaNs (measurements not taken)
    long_df = long_df.dropna(subset=["V"])

    # Map feature names to indices
    feature_to_index = {feat: idx for idx, feat in enumerate(feature_cols)}
    long_df["Z"] = long_df["Z"].map(feature_to_index)

    # Reorder columns
    long_df = long_df[["T", "Z", "V"]].sort_values("T").reset_index(drop=True)

    return long_df, feature_to_index

In [13]:
# Build the TZV dataframes
TZV_a, feature_to_index_a = build_TZV_dataframe(sets_dict["set_a"])
TZV_b, feature_to_index_b = build_TZV_dataframe(sets_dict["set_b"])
TZV_c, feature_to_index_c = build_TZV_dataframe(sets_dict["set_c"])

print(TZV_a.shape)
TZV_a.head()

(7336640, 3)


Unnamed: 0,T,Z,V
0,0.0,0,0.0
1,0.0,9,0.547619
2,0.0,9,0.571429
3,0.0,9,0.57619
4,0.0,16,0.516364


In [14]:
# Count the number of entries in the dataframe that are not NaN, in specified columns
def count_non_nan_entries(df, columns):
    return df[columns].notnull().sum().sum()

specified_columns = [col for col in sets_dict["set_a"].columns if col not in ["RecordID", "Time"]]
non_nan_entries_a = count_non_nan_entries(sets_dict["set_a"], specified_columns)
print(f"Non-NaN entries in set A: {non_nan_entries_a}")

Non-NaN entries in set A: 7336640


Observe how the numbers match. We know have a row for each of the non NaN values in the original DataFrame. Let's go!

## Use this changed DataFrames to train a Transformer and then evaluate it

In [None]:
# Generate new DataLoader and DataSets
# What is the y??

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = TransformerClassifier(input_size=TZV_a.shape[1]).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

epochs = 10

for epoch in range(epochs):
    model.train()
    running_loss = 0.0

    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        optimizer.zero_grad()
        outputs = model(batch_X)

        # Sanity check shapes
        assert outputs.shape == batch_y.shape, f"Shape mismatch: {outputs.shape} vs {batch_y.shape}"

        # Convert to float
        batch_y = batch_y.float()
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs} | Loss: {running_loss / len(train_loader):.4f}")