In [1]:
import os
from pathlib import Path
import sys
# Make the node-specific virtual environment importable from this kernel.
# BB_CPU names the CPU architecture of the current node (e.g. "icelake");
# there is one virtual environment per architecture.
node_type = os.environ.get('BB_CPU')
venv_dir = f'/rds/homes/g/gaddcz/Projects/CPRD/virtual-env-{node_type}'

# The site-packages folder is keyed on the running interpreter's major.minor version.
py_tag = f'python{sys.version_info.major}.{sys.version_info.minor}'
venv_site_pkgs = Path(venv_dir, 'lib', py_tag, 'site-packages')

if not venv_site_pkgs.exists():
    print(f"Path '{venv_site_pkgs}' not found. Check that it exists and/or that it exists for node-type '{node_type}'.")
else:
    # Put it first so packages from the virtual environment win over any others on the path.
    sys.path.insert(0, str(venv_site_pkgs))
    print(f"Added path '{venv_site_pkgs}' at start of search paths.")

Added path '/rds/homes/g/gaddcz/Projects/CPRD/virtual-env-icelake/lib/python3.10/site-packages' at start of search paths.


In [2]:
import torch
from dataclasses import dataclass
from CPRD.data.foundational_loader import FoundationalDataModule
import logging
import time

# Configure the root logger at DEBUG level so that debug records (e.g. the
# SQL queries logged while the data module is built) appear in cell output.
logging.basicConfig(level=logging.DEBUG)
# Seeding and device selection are currently disabled:
# torch.manual_seed(1337)
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# print(device)

# Show the kernel's current working directory.
!pwd
# Enable autoreload in mode 2 so that edits to imported modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

ValueError: source code string cannot contain null bytes

In [None]:
# Set config to be the equivalent architecture of the Karpathy benchmark; however, they are not comparable tasks.
@dataclass
class DemoConfig:
    """Model configuration used throughout this notebook.

    Every attribute is an annotated dataclass field, so each one can be
    overridden through the constructor (e.g. ``DemoConfig(TTELayer="Geometric")``)
    and takes part in ``repr`` and equality comparison.
    """
    block_size: int = 128        # what is the maximum context length for predictions?
    n_layer: int = 6
    n_head: int = 6
    n_embd: int = 384
    bias: bool = True
    attention_type: str = "global"
    dropout: float = 0.0
    unk_freq_threshold: float = 0.0
    # The two attributes below were previously un-annotated, which made them plain
    # class attributes rather than dataclass fields: they could not be passed to
    # the constructor and were ignored by repr/eq.
    TTELayer: str = "Exponential"                                  # "Geometric"
    # NOTE(review): the expected type is not visible in this notebook (presumably a
    # collection of tokens); annotated loosely as `object` — narrow once confirmed.
    tokens_for_univariate_regression: object = None

config = DemoConfig()

@dataclass
class OptConfig:
    """Optimisation settings for this notebook.

    ``batch_size`` is forwarded to the data module; the remaining fields are
    presumably consumed by a training loop outside this section.
    """
    batch_size: int = 16            # samples per batch
    eval_interval: int = 1
    learning_rate: float = 0.0003
    epochs: int = 50


opt = OptConfig()

In [None]:
# Build the data module for the patients which fit a reduced set of criteria.
# path_to_db = "/rds/projects/g/gokhalkm-optimal/OPTIMAL_MASTER_DATASET/archive/Version2/"
path_to_db = "/rds/projects/g/gokhalkm-optimal/OPTIMAL_MASTER_DATASET/data/"

# Collect the keyword arguments first so the whole configuration reads as one table.
# Sequence length, batch size and the unknown-token threshold come from the
# config objects defined earlier in the notebook.
# NOTE(review): with load=False the logged output shows the Polars dataset being
# rebuilt and saved under path_to_db — presumably load=True reuses it; confirm.
dm_kwargs = dict(
    path_to_db=path_to_db,
    load=False,
    include_measurements=True,
    drop_missing_data=True,
    include_diagnoses=True,
    drop_empty_dynamic=True,
    preprocess_measurements=True,
    tokenizer="tabular",
    batch_size=opt.batch_size,
    max_seq_length=config.block_size,
    unk_freq_threshold=config.unk_freq_threshold,
    min_workers=1,
)
dm = FoundationalDataModule(**dm_kwargs)

# The vocabulary size is read from the tokenizer attached to the training split.
vocab_size = dm.train_set.tokenizer.vocab_size

# Summarise the size of each split and of the vocabulary.
print(f"{len(dm.train_set)} training patients")
print(f"{len(dm.val_set)} validation patients")
print(f"{len(dm.test_set)} test patients")
print(f"{vocab_size} vocab elements")

INFO:root:Building Polars dataset and saving to /rds/projects/g/gokhalkm-optimal/OPTIMAL_MASTER_DATASET/data/polars/
INFO:root:Connected to SQLite database
INFO:root:Chunking by unique practice ID: no inclusion conditions
DEBUG:root:Query: SELECT DISTINCT SUBSTR(PRACTICE_PATIENT_ID, 1, INSTR(PRACTICE_PATIENT_ID, '_') - 1) FROM static_table WHERE PRACTICE_PATIENT_ID LIKE '%_%'
INFO:root:Creating train/test/val splits using practice_patient_ids
INFO:root:Collecting meta information of training split for tokenization/standardisation
  0%|          | 0/1341 [00:00<?, ?it/s]DEBUG:root:p21199
DEBUG:root:Query: SELECT DISTINCT EVENT FROM diagnosis_table;
DEBUG:root:unique_diagnoses: ['AF', 'STROKE_HAEMRGIC', 'HYPERTENSION', 'MINFARCTION', 'PAD_STRICT', 'TYPE1DM', 'TYPE2DIABETES', 'CKDSTAGE3TO5', 'DEPRESSION', 'ANXIETY', 'BIPOLAR', 'EATINGDISORDERS', 'AUTISM', 'SUBSTANCEMISUSE', 'CHRONIC_LIVER_DISEASE_ALCOHOL', 'OTHER_CHRONIC_LIVER_DISEASE_OPTIMAL', 'ULCERATIVE_COLITIS', 'CROHNS_DISEASE', 'ALL

In [None]:
import time

import pyarrow.parquet as pq

# Benchmark: how long does it take to pull a single row out of the training
# split when reading the parquet files directly?
# The filter restricts the read to the rows of one practice.
dataset1 = pq.ParquetDataset(path_to_db + "polars/split=train/",
                             filters=[('PRACTICE_ID', '=', 'p20763')]
                             )

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock is adjusted.
start = time.perf_counter()   # starting time
df = dataset1.read().to_pandas()
# Keep only the row(s) whose row_nr is 100; this selection is part of the timed work.
df = df[df["row_nr"] == 100]
print(df)
print(time.perf_counter() - start)


In [None]:

# Time how long it takes to fetch one batch worth of samples by iterating the
# training dataset directly (no DataLoader involved).
# perf_counter() is used because it is a monotonic clock meant for intervals.
start = time.perf_counter()   # starting time
# Count from 1 so the loop stops immediately after the batch_size-th sample has
# been fetched. Counting from 0 fetched one extra sample inside the timed region
# before the break condition was reached.
for row_idx, row in enumerate(dm.train_set, start=1):
    if row_idx >= opt.batch_size:
        break
print(f"batch loaded in {time.perf_counter()-start} seconds")

In [None]:
# Time how long the training DataLoader takes to produce its first batch.
# perf_counter() is used because it is a monotonic clock meant for intervals.
start = time.perf_counter()   # starting time
# next(iter(...)) fetches exactly one batch and raises StopIteration if the
# loader is empty. The previous `for ...: break` form silently left `batch`
# unbound (or stale from an earlier run of this cell) in that case.
batch = next(iter(dm.train_dataloader()))
print(f"batch loaded in {time.perf_counter()-start} seconds")
    
# List every entry of the batch next to the shape of its value.
for key, entry in batch.items():
    print(f"{key}".ljust(20) + f"{entry.shape}")

# Decode the first sequence of the batch and print each decoded token beside
# its value.
# NOTE(review): assumes decode() returns one space-separated token per position
# so that tokens and values line up — confirm; zip() truncates silently otherwise.
tokens = batch["tokens"][0].tolist()
sentence = dm.decode(tokens).split(" ")
first_values = batch["values"][0].tolist()
for token, value in zip(sentence, first_values):
    print(f"{token}:".ljust(40) + f"{value}")