# Demo Notebook:
## Single Risk Survival Transformer For Causal Sequence Modelling 

Including time, tabular values

In [1]:
import os
from pathlib import Path
import sys
node_type = os.getenv('BB_CPU')
venv_dir = f'/rds/homes/g/gaddcz/Projects/CPRD/virtual-envTorch2.0-{node_type}'
venv_site_pkgs = Path(venv_dir) / 'lib' / f'python{sys.version_info.major}.{sys.version_info.minor}' / 'site-packages'
if venv_site_pkgs.exists():
    sys.path.insert(0, str(venv_site_pkgs))
    print(f"Added path '{venv_site_pkgs}' at start of search paths.")
else:
    print(f"Path '{venv_site_pkgs}' not found. Check that it exists and/or that it exists for node-type '{node_type}'.")

!pwd

%load_ext autoreload
%autoreload 2

Added path '/rds/homes/g/gaddcz/Projects/CPRD/virtual-envTorch2.0-icelake/lib/python3.10/site-packages' at start of search paths.
/rds/homes/g/gaddcz/Projects/CPRD/examples/modelling/SurvStreamGPT/notebooks/SingleRisk


In [2]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import logging
from pycox.evaluation import EvalSurv
from tqdm import tqdm
from hydra import compose, initialize
from omegaconf import OmegaConf
from CPRD.examples.modelling.SurvStreamGPT.experiment import run
from CPRD.data.foundational_loader import FoundationalDataModule
from CPRD.src.models.survival.task_heads.causal import SurvStreamGPTForCausalModelling

torch.manual_seed(1337)
torch.set_float32_matmul_precision('medium')

logging.basicConfig(level=logging.INFO)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# device = "cpu"    # if more informative debugging statements are needed
print(f"Using device: {device}.")

Using device: cuda.


# Demo Version of SurvStreamGPT

## Build configurations

In [3]:
# load the configuration file, override any settings 
with initialize(version_base=None, config_path="../../confs", job_name="testing_notebook"):
    cfg = compose(config_name="config_SingleRisk11M", overrides=[])


# cfg.data.batch_size = 16
# cfg.transformer.block_size = 32
# # cfg.transformer.n_layer = 10

In [4]:
print(OmegaConf.to_yaml(cfg))

is_decoder: true
data:
  batch_size: 64
  unk_freq_threshold: 0.0
  min_workers: 20
  global_diagnoses: false
  meta_information_path: /rds/projects/g/gokhalkm-optimal/OPTIMAL_MASTER_DATASET/data/polars/meta_information.pickle
experiment:
  project_name: SurvStreamGPT_${head.SurvLayer}
  run_id: PreTrain_${head.SurvLayer}_11M_${experiment.seed}
  train: true
  test: true
  verbose: true
  seed: 1337
  log: true
  log_dir: /rds/projects/s/subramaa-mum-predict/CharlesGadd_Oxford/FoundationModelOutput/
  ckpt_dir: /rds/projects/s/subramaa-mum-predict/CharlesGadd_Oxford/FoundationModelOutput/checkpoints/
optim:
  num_epochs: 1
  learning_rate: 0.0001
  val_check_interval: 1000
  early_stop: false
  early_stop_patience: 5
  log_every_n_steps: 20
  limit_val_batches: 0.05
  limit_test_batches: 0.05
transformer:
  block_type: Neo
  block_size: 128
  n_layer: 6
  n_head: 6
  n_embd: 384
  layer_norm_bias: false
  attention_type: global
  bias: true
  dropout: 0.0
  attention_dropout: 0.0
  res

In [5]:
 # TODO: define an env variable to fix for a local hpc environment issue, this shouldn't be needed
%env SLURM_NTASKS_PER_NODE=28      

# TODO: with above this trains, but due to widgets issue on hpc it does not print progress to notebook
# cfg.experiment.train = False
# cfg.experiment.test = False
# cfg.experiment.log = False
# model, dm = run(cfg)     


env: SLURM_NTASKS_PER_NODE=28


## Or define training process by hand
### Create data loader

In [6]:
# Build 
path_to_db = "/rds/projects/g/gokhalkm-optimal/OPTIMAL_MASTER_DATASET/data/"
dm = FoundationalDataModule(path_to_db=path_to_db,
                            load=True,
                            tokenizer="tabular",
                            batch_size=cfg.data.batch_size,
                            max_seq_length=cfg.transformer.block_size,
                            global_diagnoses=cfg.data.global_diagnoses,
                            freq_threshold=cfg.data.unk_freq_threshold,
                            min_workers=cfg.data.min_workers,
                            overwrite_meta_information=cfg.data.meta_information_path,
                           )

vocab_size = dm.train_set.tokenizer.vocab_size
print(f"{vocab_size} vocab elements")

# list of univariate measurements to model with Normal distribution
# Extract the measurements, using the fact that the diagnoses are all up upper case.
measurements_for_univariate_regression = [record for record in dm.tokenizer._event_counts["EVENT"] if record.upper() != record]
cfg.head.tokens_for_univariate_regression = dm.encode(measurements_for_univariate_regression) 
# display(measurements_for_univariate_regression)


INFO:root:Using meta information from /rds/projects/g/gokhalkm-optimal/OPTIMAL_MASTER_DATASET/data/polars/meta_information.pickle
INFO:root:Using train file-row count dictionary from /rds/projects/g/gokhalkm-optimal/OPTIMAL_MASTER_DATASET/data/polars/file_row_count_dict_train.pickle
INFO:root:Using test file-row count dictionary from /rds/projects/g/gokhalkm-optimal/OPTIMAL_MASTER_DATASET/data/polars/file_row_count_dict_test.pickle
INFO:root:Using val file-row count dictionary from /rds/projects/g/gokhalkm-optimal/OPTIMAL_MASTER_DATASET/data/polars/file_row_count_dict_val.pickle
INFO:root:Tokenzier created based on 3584.43M tokens
INFO:root:Using tabular tokenizer, created from meta information and containing 184 tokens
INFO:root:Loaded /rds/projects/g/gokhalkm-optimal/OPTIMAL_MASTER_DATASET/data/polars/split=train/ dataset, with 23,343,104 samples
INFO:root:Loaded /rds/projects/g/gokhalkm-optimal/OPTIMAL_MASTER_DATASET/data/polars/split=test/ dataset, with 1,263,168 samples
INFO:root:

184 vocab elements


In [8]:
# dm.train_set.view_sample(1000, report_time=True) # max_dynamic_events=120,

### Train

In [7]:
model = SurvStreamGPTForCausalModelling(cfg, vocab_size).to(device)
model = model.to(device)

loss_curves_train = []
loss_curves_train_surv = []
loss_curves_train_values = []

loss_curves_val = []
loss_curves_val_surv = []
loss_curves_val_values = []    
print(f"Training model with {sum(p.numel() for p in model.parameters())/1e6} M parameters")

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.optim.learning_rate)

best_val, epochs_since_best = np.inf, 0
for epoch in range(10):
    
    epoch_loss, epoch_surv_loss, epoch_values_loss = 0, 0, 0
    model.train()
    for i, batch in tqdm(enumerate(dm.train_dataloader()), desc=f"Training epoch {epoch}", total=len(dm.train_dataloader())):
        
            
        # evaluate the loss
        _, loss_dict, _ = model(tokens=batch['tokens'].to(device),
                                ages=batch['ages'].to(device),
                                values=batch['values'].to(device),
                                covariates=batch["static_covariates"].to(device),
                                attention_mask=batch['attention_mask'].to(device)
                               )
        optimizer.zero_grad(set_to_none=True)
        loss_dict["loss"].backward()
        optimizer.step()

        # record
        epoch_loss += loss_dict["loss"].item()            
        epoch_surv_loss += loss_dict["loss_desurv"].item()
        epoch_values_loss += loss_dict["loss_values"].item()
        
        if i > 1000:
            break
    
    epoch_loss /= i
    epoch_surv_loss /= i
    epoch_values_loss /= i
    loss_curves_train.append(epoch_loss)
    loss_curves_train_surv.append(epoch_surv_loss)
    loss_curves_train_values.append(epoch_values_loss)

    # evaluate the loss on val set
    with torch.no_grad(): 
        model.eval()
        if epoch % 1 == 0 or epoch == cfg.optim.num_epochs - 1:
            val_loss, val_surv_loss, val_values_loss = 0, 0, 0
            for j, batch in tqdm(enumerate(dm.val_dataloader()), desc=f"Validation epoch {epoch}", total=len(dm.val_dataloader())):
                if j > 100:
                    break
                _, loss_dict, _ = model(tokens=batch['tokens'].to(device),
                                        ages=batch['ages'].to(device),
                                        values=batch['values'].to(device),
                                        covariates=batch["static_covariates"].to(device),
                                        attention_mask=batch['attention_mask'].to(device)
                                       )
                # record
                val_loss += loss_dict["loss"].item()                    
                val_surv_loss += loss_dict["loss_desurv"].item()
                val_values_loss += loss_dict["loss_values"].item()
                
            val_loss /= j
            val_surv_loss /= j
            val_values_loss /= j
            loss_curves_val.append(val_loss)
            loss_curves_val_surv.append(val_surv_loss)
            loss_curves_val_values.append(val_values_loss)

            print(f"Epoch {epoch}:\tTrain loss {epoch_loss:.2f}: ({epoch_surv_loss:.2f}, {epoch_values_loss:.2f}). Val loss {val_loss:.2f}: ({val_surv_loss:.2f}, {val_values_loss:.2f})")          
            # TODO: Note not fully accurate as last batch is likely not the same size, will be fixed with lightning

        if val_loss >= best_val:
            epochs_since_best += 1
            if epochs_since_best >= 5:
                break
        else:
            best_val = val_loss
            epochs_since_best = 0

            # Save best seen model
            # torch.save(model.state_dict(), path_to_db + "polars/CR.pt")
            

INFO:root:Using Temporal Positional Encoding. This module uses the patient's age at an event within their time series.
INFO:root:Using Single-Risk DeSurvival head. This module predicts a separate survival curve for each possible future event
INFO:root:Internally scaling time in survival head by 1825 days
INFO:root:In generation forwarding DeSurv on the grid between [0.0, 1825.0]
INFO:root:with 1826 intervals of delta=1.0


Training model with 11.167512 M parameters


Training epoch 0:   0%|          | 1001/364736 [13:37<82:32:18,  1.22it/s]
Validation epoch 0:   1%|          | 101/19276 [00:38<2:01:40,  2.63it/s]


Epoch 0:	Train loss -1.89: (3.12, -6.90). Val loss -2.59: (2.86, -8.03)


Training epoch 1:   0%|          | 1001/364736 [09:57<60:20:38,  1.67it/s]
Validation epoch 1:   1%|          | 101/19276 [00:38<2:00:26,  2.65it/s]


Epoch 1:	Train loss -3.48: (2.62, -9.57). Val loss -3.37: (2.55, -9.29)


Training epoch 2:   0%|          | 1001/364736 [09:59<60:28:47,  1.67it/s]
Validation epoch 2:   1%|          | 101/19276 [00:37<1:58:57,  2.69it/s]


Epoch 2:	Train loss -4.24: (2.29, -10.77). Val loss -3.81: (2.20, -9.82)


Training epoch 3:   0%|          | 1001/364736 [09:59<60:33:24,  1.67it/s]
Validation epoch 3:   1%|          | 101/19276 [00:37<1:58:19,  2.70it/s]


Epoch 3:	Train loss -4.68: (2.05, -11.41). Val loss -4.18: (2.05, -10.41)


Training epoch 4:   0%|          | 1001/364736 [09:59<60:33:36,  1.67it/s]
Validation epoch 4:   1%|          | 101/19276 [00:37<1:58:50,  2.69it/s]


Epoch 4:	Train loss -5.11: (1.90, -12.12). Val loss -4.47: (1.93, -10.86)


Training epoch 5:   0%|          | 1001/364736 [10:01<60:43:12,  1.66it/s]
Validation epoch 5:   1%|          | 101/19276 [00:37<1:58:41,  2.69it/s]


Epoch 5:	Train loss -5.41: (1.82, -12.64). Val loss -4.02: (1.85, -9.89)


Training epoch 6:   0%|          | 1001/364736 [10:02<60:50:06,  1.66it/s]
Validation epoch 6:   1%|          | 101/19276 [00:37<1:59:04,  2.68it/s]


Epoch 6:	Train loss -5.62: (1.75, -12.98). Val loss -4.76: (1.78, -11.31)


Training epoch 7:   0%|          | 1001/364736 [10:01<60:45:37,  1.66it/s]
Validation epoch 7:   1%|          | 101/19276 [00:37<2:00:13,  2.66it/s]


Epoch 7:	Train loss -5.79: (1.69, -13.27). Val loss -5.00: (1.74, -11.73)


Training epoch 8:   0%|          | 1001/364736 [10:05<61:09:49,  1.65it/s]
Validation epoch 8:   1%|          | 101/19276 [00:38<2:01:38,  2.63it/s]


Epoch 8:	Train loss -5.92: (1.65, -13.50). Val loss -4.97: (1.70, -11.63)


Training epoch 9:   0%|          | 1001/364736 [10:10<61:37:13,  1.64it/s]
Validation epoch 9:   1%|          | 101/19276 [00:38<2:00:36,  2.65it/s]


Epoch 9:	Train loss -6.07: (1.59, -13.72). Val loss -4.98: (1.68, -11.65)


In [None]:
# Plot loss
plt.figure()
# Training
iterations = np.linspace(0, len(loss_curves_train), len(loss_curves_train)) * opt.eval_interval
plt.plot(iterations, loss_curves_train, label="train")
# Validation
iterations = np.linspace(0, len(loss_curves_val), len(loss_curves_val)) * opt.eval_interval
plt.plot(iterations, loss_curves_val, label="val", linestyle='dashed')
plt.legend()
plt.savefig(f"figs/loss.png")

# Plot DeSurv loss
plt.figure()
# Training
iterations = np.linspace(0, len(loss_curves_train_surv), len(loss_curves_train_surv)) * opt.eval_interval
plt.plot(iterations, loss_curves_train_surv, label="train")
# Validation
iterations = np.linspace(0, len(loss_curves_val_surv), len(loss_curves_val_surv)) * opt.eval_interval
plt.plot(iterations, loss_curves_val_surv, label="val", linestyle='dashed')
plt.legend()
plt.savefig(f"figs/loss_desurv.png")

# Plot value loss
plt.figure()
# Training
iterations = np.linspace(0, len(loss_curves_train_values), len(loss_curves_train_values)) * opt.eval_interval
plt.plot(iterations, loss_curves_train_values, label="train", )
# Validation
iterations = np.linspace(0, len(loss_curves_val_values), len(loss_curves_val_values)) * opt.eval_interval
plt.plot(iterations, loss_curves_val_values, label="val", linestyle='dashed')
plt.legend()
plt.savefig(f"figs/loss_val.png")

# Appendix: model architectures

In [10]:
display(model)

SurvStreamGPTForCausalModelling(
  (transformer): TTETransformer(
    (wpe): TemporalPositionalEncoding()
    (wte): DataEmbeddingLayer(
      (static_proj): Linear(in_features=16, out_features=384, bias=True)
      (dynamic_embedding_layer): SplitDynamicEmbeddingLayer(
        (cat_event_embed_layer): Embedding(184, 384, padding_idx=0)
        (cat_event_proj): Linear(in_features=384, out_features=384, bias=True)
        (num_value_embed_layer): EmbeddingBag(184, 384, mode='sum', padding_idx=0)
        (num_value_proj): Linear(in_features=384, out_features=384, bias=True)
      )
    )
    (drop): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0-5): 6 x Block(
        (ln_1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (attn): MultiHeadedSelfAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (k_proj): Linear(in_features=384, out_features=384, bias=False)
          (v_pr

In [11]:
!jupyter nbconvert --to html --no-input single_risk.ipynb

[NbConvertApp] Converting notebook single_risk.ipynb to html
[NbConvertApp] Writing 603283 bytes to single_risk.html
