# Question 2

In [28]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

from project_1.config import PROJ_ROOT, PROCESSED_DATA_DIR
from project_1.loading import *
from project_1.dataset import *

## Data Loading
For basic LSTM, we load the final datasets

In [29]:
set_a, set_b, set_c = load_final_data_without_ICU()
death_a, death_b, death_c = load_outcomes()

Shapes of the datasets:
Set A: (183416, 42) Set B: (183495, 42) Set C: (183711, 42)
Shapes of labels:
Set A: (4000, 2) Set B: (4000, 2) Set C: (4000, 2)


# Attempt 1 - LSTM - Model Implementation (Last State)
This basic implementation takes the last hidden state to be used for prediction

In [30]:
class LSTM_Model(nn.Module):
    def __init__(self, input_size, hidden_size=64, num_layers=2, num_classes=1, dropout=0.3):
        super(LSTM_Model, self).__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,       # 41 features per time step
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout
        )
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # x: (batch_size, seq_len, input_size)
        out, _ = self.lstm(x)           # out: (batch_size, seq_len, hidden_size)
        out = out[:, -1, :]             # Take last time step: (batch_size, hidden_size)
        out = self.fc(out)              # (batch_size, num_classes)
        return out.squeeze()            # (batch_size,) for BCEWithLogitsLoss

# Obtain TensorDatasets from Time Series data

In [31]:
train_dataset = create_dataset_from_timeseries(set_a, death_a["In-hospital_death"])
validation_dataset = create_dataset_from_timeseries(set_b, death_b["In-hospital_death"])
test_dataset = create_dataset_from_timeseries(set_c, death_c["In-hospital_death"])

train_dataset.tensors[0].shape # (batch_size, seq_len, input_size)

torch.Size([4000, 49, 40])

In [32]:
# Convert to DataLoader
from torch.utils.data import DataLoader
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

## Train Loop

In [33]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size = train_dataset.tensors[0].shape[-1]
model = LSTM_Model(input_size=input_size).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Call the trainig loop (default 10 epochs)
model = train_model_with_validation(model, train_loader, validation_loader, criterion, optimizer, device)

                                                                              

Epoch 1/10
  Train Loss: 0.4448 | AUCROC: 0.5839 | AUPRC: 0.1708
  Val   Loss: 0.3443 | AUCROC: 0.7706 | AUPRC: 0.4298



                                                                              

Epoch 2/10
  Train Loss: 0.3322 | AUCROC: 0.7925 | AUPRC: 0.4176
  Val   Loss: 0.3186 | AUCROC: 0.8301 | AUPRC: 0.4692



                                                                              

Epoch 3/10
  Train Loss: 0.3024 | AUCROC: 0.8429 | AUPRC: 0.4827
  Val   Loss: 0.3260 | AUCROC: 0.8360 | AUPRC: 0.4712



                                                                              

Epoch 4/10
  Train Loss: 0.2888 | AUCROC: 0.8588 | AUPRC: 0.5261
  Val   Loss: 0.3137 | AUCROC: 0.8370 | AUPRC: 0.4728



                                                                              

Epoch 5/10
  Train Loss: 0.2683 | AUCROC: 0.8826 | AUPRC: 0.5939
  Val   Loss: 0.3225 | AUCROC: 0.8322 | AUPRC: 0.4551



                                                                              

Epoch 6/10
  Train Loss: 0.2527 | AUCROC: 0.8981 | AUPRC: 0.6183
  Val   Loss: 0.3455 | AUCROC: 0.8176 | AUPRC: 0.4558



                                                                              

Epoch 7/10
  Train Loss: 0.2385 | AUCROC: 0.9087 | AUPRC: 0.6667
  Val   Loss: 0.3446 | AUCROC: 0.8135 | AUPRC: 0.4282



                                                                              

Epoch 8/10
  Train Loss: 0.2187 | AUCROC: 0.9217 | AUPRC: 0.7218
  Val   Loss: 0.3678 | AUCROC: 0.8134 | AUPRC: 0.4401



                                                                              

Epoch 9/10
  Train Loss: 0.2024 | AUCROC: 0.9335 | AUPRC: 0.7458
  Val   Loss: 0.3784 | AUCROC: 0.8160 | AUPRC: 0.4466



                                                                               

Epoch 10/10
  Train Loss: 0.1847 | AUCROC: 0.9438 | AUPRC: 0.8005
  Val   Loss: 0.3914 | AUCROC: 0.8094 | AUPRC: 0.4425





## Evaluation of Model

In [34]:
avg_loss, aucroc, auprc = evaluate_model(model, test_loader, criterion, device)
print(f"Test Loss: {avg_loss:.4f}, AUC-ROC: {aucroc:.4f}, AUC-PRC: {auprc:.4f}")

                                                                         

Evaluation - Loss: 0.4305 - AUCROC: 0.7935 - AUPRC: 0.4053
Test Loss: 0.4305, AUC-ROC: 0.7935, AUC-PRC: 0.4053


# Attempt 2 - LSTM - Model Implementation (Mean Pooling)

In [35]:
class LSTM_Model_Pooling(nn.Module):
    def __init__(self, input_size, hidden_size=64, num_layers=2, num_classes=1, dropout=0.3):
        super(LSTM_Model_Pooling, self).__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,       # 40 features per time step
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout
        )
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # x: (batch_size, seq_len, input_size)
        out, _ = self.lstm(x)           # out: (batch_size, seq_len, hidden_size)
        out = out.mean(dim=1)           # Pooling: (batch_size, hidden_size)   
        out = self.fc(out)              # (batch_size, num_classes)
        return out.squeeze()            # (batch_size,) for BCEWithLogitsLoss

In [36]:
# Use the previous data loaders and train the new model
model_pooling = LSTM_Model_Pooling(input_size=input_size).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model_pooling.parameters(), lr=0.001)

model_pooling = train_model_with_validation(model_pooling, train_loader, validation_loader, criterion, optimizer, device)

                                                                              

Epoch 1/10
  Train Loss: 0.4530 | AUCROC: 0.5249 | AUPRC: 0.1445
  Val   Loss: 0.3894 | AUCROC: 0.7208 | AUPRC: 0.3152



                                                                              

Epoch 2/10
  Train Loss: 0.3552 | AUCROC: 0.7490 | AUPRC: 0.3290
  Val   Loss: 0.3491 | AUCROC: 0.7720 | AUPRC: 0.3694



                                                                              

Epoch 3/10
  Train Loss: 0.3279 | AUCROC: 0.8019 | AUPRC: 0.4191
  Val   Loss: 0.3447 | AUCROC: 0.7939 | AUPRC: 0.3932



                                                                              

Epoch 4/10
  Train Loss: 0.3113 | AUCROC: 0.8295 | AUPRC: 0.4636
  Val   Loss: 0.3299 | AUCROC: 0.8112 | AUPRC: 0.4199



                                                                              

Epoch 5/10
  Train Loss: 0.2957 | AUCROC: 0.8514 | AUPRC: 0.5069
  Val   Loss: 0.3317 | AUCROC: 0.8122 | AUPRC: 0.4249



                                                                              

Epoch 6/10
  Train Loss: 0.2783 | AUCROC: 0.8711 | AUPRC: 0.5605
  Val   Loss: 0.3387 | AUCROC: 0.8116 | AUPRC: 0.4315



                                                                              

Epoch 7/10
  Train Loss: 0.2624 | AUCROC: 0.8892 | AUPRC: 0.5917
  Val   Loss: 0.3640 | AUCROC: 0.8048 | AUPRC: 0.4089



                                                                              

Epoch 8/10
  Train Loss: 0.2416 | AUCROC: 0.9081 | AUPRC: 0.6505
  Val   Loss: 0.3730 | AUCROC: 0.8057 | AUPRC: 0.4064



                                                                              

Epoch 9/10
  Train Loss: 0.2235 | AUCROC: 0.9221 | AUPRC: 0.6925
  Val   Loss: 0.3928 | AUCROC: 0.7947 | AUPRC: 0.3784



                                                                               

Epoch 10/10
  Train Loss: 0.2027 | AUCROC: 0.9345 | AUPRC: 0.7506
  Val   Loss: 0.4605 | AUCROC: 0.7880 | AUPRC: 0.3850



In [37]:
# Now evaluate the model
avg_loss, aucroc, auprc = evaluate_model(model_pooling, test_loader, criterion, device)
print(f"Test Loss: {avg_loss:.4f}, AUC-ROC: {aucroc:.4f}, AUC-PRC: {auprc:.4f}")

                                                                         

Evaluation - Loss: 0.4214 - AUCROC: 0.7782 - AUPRC: 0.3861
Test Loss: 0.4214, AUC-ROC: 0.7782, AUC-PRC: 0.3861




# LSTM - Max Pooling

In [50]:
class LSTM_Model_Max_Pooling(nn.Module):
    def __init__(self, input_size, hidden_size=64, num_layers=2, num_classes=1, dropout=0.3):
        super(LSTM_Model_Max_Pooling, self).__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,       # 40 features per time step
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout
        )
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # x: (batch_size, seq_len, input_size)
        out, _ = self.lstm(x)           # out: (batch_size, seq_len, hidden_size)
        out, _ = out.max(dim=1)           # Pooling: (batch_size, hidden_size)   
        out = self.fc(out)              # (batch_size, num_classes)
        return out.squeeze()            # (batch_size,) for BCEWithLogitsLoss

In [51]:
# Use the previous data loaders and train the new model
model_max_pooling = LSTM_Model_Max_Pooling(input_size=input_size).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model_max_pooling.parameters(), lr=0.001)

model_max_pooling = train_model_with_validation(model_max_pooling, train_loader, validation_loader, criterion, optimizer, device)

                                                                              

Epoch 1/10
  Train Loss: 0.4585 | AUCROC: 0.5158 | AUPRC: 0.1392
  Val   Loss: 0.3912 | AUCROC: 0.7228 | AUPRC: 0.3004



                                                                              

Epoch 2/10
  Train Loss: 0.3527 | AUCROC: 0.7596 | AUPRC: 0.3244
  Val   Loss: 0.3487 | AUCROC: 0.7926 | AUPRC: 0.3817



                                                                              

Epoch 3/10
  Train Loss: 0.3213 | AUCROC: 0.8123 | AUPRC: 0.4183
  Val   Loss: 0.3369 | AUCROC: 0.7984 | AUPRC: 0.3973



                                                                              

Epoch 4/10
  Train Loss: 0.3069 | AUCROC: 0.8350 | AUPRC: 0.4501
  Val   Loss: 0.3360 | AUCROC: 0.8107 | AUPRC: 0.4096



                                                                              

Epoch 5/10
  Train Loss: 0.2984 | AUCROC: 0.8496 | AUPRC: 0.4876
  Val   Loss: 0.3313 | AUCROC: 0.8146 | AUPRC: 0.4152



                                                                              

Epoch 6/10
  Train Loss: 0.2821 | AUCROC: 0.8707 | AUPRC: 0.5320
  Val   Loss: 0.3426 | AUCROC: 0.8116 | AUPRC: 0.4139



                                                                              

Epoch 7/10
  Train Loss: 0.2655 | AUCROC: 0.8859 | AUPRC: 0.5670
  Val   Loss: 0.3458 | AUCROC: 0.8151 | AUPRC: 0.4182



                                                                              

Epoch 8/10
  Train Loss: 0.2476 | AUCROC: 0.9038 | AUPRC: 0.6330
  Val   Loss: 0.3595 | AUCROC: 0.8103 | AUPRC: 0.4109



                                                                              

Epoch 9/10
  Train Loss: 0.2345 | AUCROC: 0.9136 | AUPRC: 0.6582
  Val   Loss: 0.3643 | AUCROC: 0.8095 | AUPRC: 0.4065



                                                                               

Epoch 10/10
  Train Loss: 0.2221 | AUCROC: 0.9222 | AUPRC: 0.7092
  Val   Loss: 0.3815 | AUCROC: 0.8077 | AUPRC: 0.4092



In [52]:
# Now evaluate the model
avg_loss, aucroc, auprc = evaluate_model(model_max_pooling, test_loader, criterion, device)
print(f"Test Loss: {avg_loss:.4f}, AUC-ROC: {aucroc:.4f}, AUC-PRC: {auprc:.4f}")

                                                                        

Evaluation - Loss: 0.3814 - AUCROC: 0.8089 - AUPRC: 0.4311
Test Loss: 0.3814, AUC-ROC: 0.8089, AUC-PRC: 0.4311




# Attempt 3 - Bidirectional LSTM - Model Implementation

In [38]:
class LSTM_Model_Bi(nn.Module):
    def __init__(self, input_size, hidden_size=64, num_layers=2, num_classes=1, dropout=0.3):
        super(LSTM_Model_Bi, self).__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,       # 41 features per time step
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True
        )
        self.fc = nn.Linear(hidden_size * 2, num_classes) # *2 for bidirectional

    def forward(self, x):
        # x: (batch_size, seq_len, input_size)
        out, _ = self.lstm(x)           # out: (batch_size, seq_len, hidden_size)
        out = out[:, -1, :]             # Take last time step: (batch_size, hidden_size)
        out = self.fc(out)              # (batch_size, num_classes)
        return out.squeeze()            # (batch_size,) for BCEWithLogitsLoss

In [39]:
# Train the model
model_bi = LSTM_Model_Bi(input_size=input_size).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model_bi.parameters(), lr=0.001)

model_bi = train_model_with_validation(model_bi, train_loader, validation_loader, criterion, optimizer, device)

                                                                              

Epoch 1/10
  Train Loss: 0.4357 | AUCROC: 0.6063 | AUPRC: 0.1974
  Val   Loss: 0.3376 | AUCROC: 0.7849 | AUPRC: 0.4241



                                                                              

Epoch 2/10
  Train Loss: 0.3227 | AUCROC: 0.8101 | AUPRC: 0.4333
  Val   Loss: 0.3157 | AUCROC: 0.8297 | AUPRC: 0.4548



                                                                              

Epoch 3/10
  Train Loss: 0.2944 | AUCROC: 0.8525 | AUPRC: 0.5175
  Val   Loss: 0.3113 | AUCROC: 0.8367 | AUPRC: 0.4732



                                                                              

Epoch 4/10
  Train Loss: 0.2767 | AUCROC: 0.8752 | AUPRC: 0.5510
  Val   Loss: 0.3242 | AUCROC: 0.8244 | AUPRC: 0.4472



                                                                              

Epoch 5/10
  Train Loss: 0.2643 | AUCROC: 0.8858 | AUPRC: 0.6020
  Val   Loss: 0.3300 | AUCROC: 0.8246 | AUPRC: 0.4332



                                                                              

Epoch 6/10
  Train Loss: 0.2456 | AUCROC: 0.9061 | AUPRC: 0.6318
  Val   Loss: 0.3292 | AUCROC: 0.8215 | AUPRC: 0.4475



                                                                              

Epoch 7/10
  Train Loss: 0.2266 | AUCROC: 0.9223 | AUPRC: 0.6769
  Val   Loss: 0.3692 | AUCROC: 0.8155 | AUPRC: 0.4376



                                                                              

Epoch 8/10
  Train Loss: 0.2147 | AUCROC: 0.9299 | AUPRC: 0.7253
  Val   Loss: 0.3747 | AUCROC: 0.8074 | AUPRC: 0.4172



                                                                              

Epoch 9/10
  Train Loss: 0.1957 | AUCROC: 0.9428 | AUPRC: 0.7626
  Val   Loss: 0.3631 | AUCROC: 0.8074 | AUPRC: 0.4196



                                                                               

Epoch 10/10
  Train Loss: 0.1675 | AUCROC: 0.9592 | AUPRC: 0.8227
  Val   Loss: 0.4653 | AUCROC: 0.7952 | AUPRC: 0.4122





In [40]:
# Now evaluate
avg_loss, aucroc, auprc = evaluate_model(model_bi, test_loader, criterion, device)
print(f"Test Loss: {avg_loss:.4f}, AUC-ROC: {aucroc:.4f}, AUC-PRC: {auprc:.4f}")

                                                                        

Evaluation - Loss: 0.4649 - AUCROC: 0.7875 - AUPRC: 0.4095
Test Loss: 0.4649, AUC-ROC: 0.7875, AUC-PRC: 0.4095




# Transformers - Model Implementation

In [41]:
import torch.nn

class TransformerClassifier(nn.Module):
    def __init__(self, input_size, num_classes=1, nhead=4, num_layers=2, dim_feedforward=128, dropout=0.3):
        super().__init__()
        self.input_size = input_size

        # Project input features to model dimension
        self.embedding = nn.Linear(input_size, dim_feedforward)

        # Positional Encoding
        self.pos_encoder = PositionalEncoding(dim_feedforward, dropout)

        # Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=dim_feedforward,
            nhead=nhead,
            dim_feedforward=dim_feedforward * 2,
            dropout=dropout,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Final classifier
        self.fc = nn.Linear(dim_feedforward, num_classes)

    def forward(self, x):
        # x: (batch, seq_len, input_size)

        if x.dim() == 2:
            x = x.unsqueeze(1) # (batch, 1, input_size)

        x = self.embedding(x)                # (batch, seq_len, d_model)
        #print("After embedding:", x.shape)  # Debug print
        x = self.pos_encoder(x)
        #print("After pos encoding:", x.shape)  # Debug print
        x = self.transformer_encoder(x)      # (batch, seq_len, d_model)
        #print("After transformer encoder:", x.shape)

        x = x.mean(dim=1)                    # mean pooling over time
        #print("After pooling:", x.shape)     # Debug print
        out = self.fc(x).squeeze()           # (batch,)
        #print("After fc:", out.shape)        # Debug print
        return out


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=500):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)  # (max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(torch.log(torch.tensor(10000.0)) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [42]:
# Train the Transformer model
model_transformer = TransformerClassifier(input_size=input_size).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model_transformer.parameters(), lr=0.001)

model_transformer = train_model_with_validation(model_transformer, train_loader, validation_loader, criterion, optimizer, device)

                                                                              

Epoch 1/10
  Train Loss: 0.3751 | AUCROC: 0.6917 | AUPRC: 0.2804
  Val   Loss: 0.3474 | AUCROC: 0.7974 | AUPRC: 0.3945



                                                                              

Epoch 2/10
  Train Loss: 0.3468 | AUCROC: 0.7645 | AUPRC: 0.3664
  Val   Loss: 0.3305 | AUCROC: 0.8070 | AUPRC: 0.4199



                                                                              

Epoch 3/10
  Train Loss: 0.3320 | AUCROC: 0.7975 | AUPRC: 0.3856
  Val   Loss: 0.3406 | AUCROC: 0.8108 | AUPRC: 0.4231



                                                                              

Epoch 4/10
  Train Loss: 0.3185 | AUCROC: 0.8179 | AUPRC: 0.4327
  Val   Loss: 0.3310 | AUCROC: 0.8222 | AUPRC: 0.4498



                                                                              

Epoch 5/10
  Train Loss: 0.3166 | AUCROC: 0.8187 | AUPRC: 0.4544
  Val   Loss: 0.3287 | AUCROC: 0.8185 | AUPRC: 0.4276



                                                                              

Epoch 6/10
  Train Loss: 0.3059 | AUCROC: 0.8349 | AUPRC: 0.4808
  Val   Loss: 0.3317 | AUCROC: 0.8225 | AUPRC: 0.4531



                                                                              

Epoch 7/10
  Train Loss: 0.3004 | AUCROC: 0.8457 | AUPRC: 0.4809
  Val   Loss: 0.4012 | AUCROC: 0.7772 | AUPRC: 0.4076



                                                                              

Epoch 8/10
  Train Loss: 0.2944 | AUCROC: 0.8518 | AUPRC: 0.5068
  Val   Loss: 0.4364 | AUCROC: 0.7420 | AUPRC: 0.3674



                                                                              

Epoch 9/10
  Train Loss: 0.2895 | AUCROC: 0.8585 | AUPRC: 0.5173
  Val   Loss: 0.3946 | AUCROC: 0.7857 | AUPRC: 0.3622



                                                                               

Epoch 10/10
  Train Loss: 0.2806 | AUCROC: 0.8673 | AUPRC: 0.5594
  Val   Loss: 0.4266 | AUCROC: 0.7459 | AUPRC: 0.3902





In [43]:
# Evaluate the model
avg_loss, aucroc, auprc = evaluate_model(model_transformer, test_loader, criterion, device)
print(f"Test Loss: {avg_loss:.4f}, AUC-ROC: {aucroc:.4f}, AUC-PRC: {auprc:.4f}")

                                                                        

Evaluation - Loss: 0.3494 - AUCROC: 0.8292 - AUPRC: 0.4783
Test Loss: 0.3494, AUC-ROC: 0.8292, AUC-PRC: 0.4783




# Q2.3 - Tokenizing

In [44]:
# For this part, we need to load the initial data
set_a_initial, set_b_initial, set_c_initial = load_basic_data()
set_a_initial.head()

Shapes of the datasets:
Set A: (183416, 43) Set B: (183495, 43) Set C: (183711, 43)


Unnamed: 0,RecordID,Time,Age,BUN,Creatinine,GCS,Gender,Glucose,HCO3,HCT,...,PaCO2,PaO2,pH,DiasABP,MAP,SaO2,SysABP,Lactate,Cholesterol,TroponinI
0,132539.0,2025-03-10 00:00:00,54.0,,,,0.0,,,,...,,,,,,,,,,
1,132539.0,2025-03-10 01:00:00,,,,15.0,,,,,...,,,,,,,,,,
2,132539.0,2025-03-10 02:00:00,,,,,,,,,...,,,,,,,,,,
3,132539.0,2025-03-10 03:00:00,,,,,,,,,...,,,,,,,,,,
4,132539.0,2025-03-10 04:00:00,,,,15.0,,,,33.7,...,,,,,,,,,,


## Create the TZV Dataframe (following Horn et al.)

In [45]:
from sklearn.preprocessing import MinMaxScaler

def build_TZV_dataframe(original_df, label_df, base_time="2025-03-10 00:00:00", duration_hours=48):
    """
    Build a long-format dataframe with columns [T, Z, V, y] from an original wide dataframe.
    
    Parameters:
        original_df (pd.DataFrame): DataFrame with columns [RecordID, Time, f1, f2, ..., f41].
        label_df (pd.DataFrame): DataFrame with columns [RecordID, y] containing the label for each RecordID.
        base_time (str): Base time used for normalizing the Time column.
        duration_hours (int): The duration (in hours) from base_time over which Time is normalized (here, 48 hours).
    
    Returns:
        long_df (pd.DataFrame): Long-format dataframe with columns:
                                T: normalized time [0, 1],
                                Z: index of the feature,
                                V: scaled measurement value,
                                y: label corresponding to RecordID.
        feature_to_index (dict): Mapping from original feature names to integer indices.
    """
    # Merge the labels with the original dataframe using RecordID.
    df = original_df.copy().merge(label_df, on="RecordID", how="left")
    
    # Convert Time to datetime and compute normalized time T.
    df["Time"] = pd.to_datetime(df["Time"])
    start_time = pd.to_datetime(base_time)
    end_time = start_time + pd.Timedelta(hours=duration_hours)
    total_seconds = (end_time - start_time).total_seconds()
    df["T"] = (df["Time"] - start_time).dt.total_seconds() / total_seconds
    
    # Identify feature columns: all columns except RecordID, Time, T, and y.
    feature_cols = [col for col in df.columns if col not in ["RecordID", "Time", "T", "In-hospital_death"]]
    
    # Scale each feature individually using MinMaxScaler.
    scaler = MinMaxScaler()
    df[feature_cols] = scaler.fit_transform(df[feature_cols])
    
    # Melt the dataframe from wide to long format.
    # The id_vars ("T" and "y") are preserved for each measurement.
    long_df = pd.melt(df, id_vars=["T", "In-hospital_death"], value_vars=feature_cols, 
                      var_name="Z", value_name="V")
    
    # Map feature names to indices for the "Z" column.
    feature_to_index = {feat: idx for idx, feat in enumerate(feature_cols)}
    long_df["Z"] = long_df["Z"].map(feature_to_index)
    
    # Sort the final dataframe by normalized time T and reset the index.
    long_df = long_df.sort_values("T").reset_index(drop=True)
    long_df = long_df.dropna(subset=["V"])
    
    return long_df, feature_to_index

In [46]:
# Build the TZV dataframes
TZV_a, feature_to_index_a = build_TZV_dataframe(set_a_initial, death_a)
TZV_b, feature_to_index_b = build_TZV_dataframe(set_b_initial, death_b)
TZV_c, feature_to_index_c = build_TZV_dataframe(set_c_initial, death_c)

print(TZV_a.shape)
TZV_a.head(10)

(1456736, 4)


Unnamed: 0,T,In-hospital_death,Z,V
0,0.0,0,0,0.52
417,0.0,0,34,0.16791
551,0.0,0,34,0.186567
565,0.0,1,8,0.528571
566,0.0,0,8,0.347619
1229,0.0,0,8,0.390476
1307,0.0,0,8,0.433333
1606,0.0,0,34,0.160448
1607,0.0,1,8,0.542857
1618,0.0,0,34,0.175373


In [47]:
# Check for the total number of not NaN values under some specified columns
selected_cols = [col for col in set_a_initial.columns if col not in ["RecordID", "Time"]]
set_a_initial[selected_cols].notna().sum().sum()

1456736

Checked that the number of not NaN values is the same as the rows of the new dataframe! Let's go
(We have to believe in this format)

## Train the TZV Format with a Transformer

In [48]:
# Remove the In-hospital_death column from the TZV dataframes, but save it
y_a = TZV_a.pop("In-hospital_death")
y_b = TZV_b.pop("In-hospital_death")
y_c = TZV_c.pop("In-hospital_death")

# Convert the TZV dataframes to PyTorch tensors
X_a = torch.tensor(TZV_a[["T", "Z", "V"]].values, dtype=torch.float32)
X_b = torch.tensor(TZV_b[["T", "Z", "V"]].values, dtype=torch.float32)
X_c = torch.tensor(TZV_c[["T", "Z", "V"]].values, dtype=torch.float32)
print(X_a.shape, X_b.shape, X_c.shape)

# Create the datasets and dataloaders
from torch.utils.data import TensorDataset

dataset_a = TensorDataset(X_a, torch.tensor(y_a.values, dtype=torch.float32))
dataset_b = TensorDataset(X_b, torch.tensor(y_b.values, dtype=torch.float32))
dataset_c = TensorDataset(X_c, torch.tensor(y_c.values, dtype=torch.float32))

loader_a = DataLoader(dataset_a, batch_size=64, shuffle=True)
loader_b = DataLoader(dataset_b, batch_size=64, shuffle=False)
loader_c = DataLoader(dataset_c, batch_size=64, shuffle=False)

torch.Size([1456736, 3]) torch.Size([1459862, 3]) torch.Size([1454964, 3])


In [49]:
model_tvz = TransformerClassifier(input_size=3).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model_tvz.parameters(), lr=0.001)

model_tvz = train_model_with_validation(model_tvz, loader_a, loader_b, criterion, optimizer, device)

                                                                                    

KeyboardInterrupt: 

Training takes a lot of time, it's 1 million of rows per table...

I could make a script.py and run it on Euler...