# Creating a BEHRT pre-training parquet dataset from the SurvivEHR pre-training dataset

In [1]:
import os
from pathlib import Path
import sys
# Select the virtual environment that matches this node: the node type is read
# from the BB_CPU environment variable and is part of the environment's directory name.
node_type = os.environ.get('BB_CPU')
venv_dir = f'/rds/homes/g/gaddcz/Projects/CPRD/virtual-envTorch2.0-{node_type}'

# site-packages lives under lib/python<major>.<minor>/ for the running interpreter.
_py_tag = f'python{sys.version_info.major}.{sys.version_info.minor}'
venv_site_pkgs = Path(venv_dir).joinpath('lib', _py_tag, 'site-packages')

if not venv_site_pkgs.exists():
    # Nothing is added to the search path in this case; only a hint is printed.
    print(f"Path '{venv_site_pkgs}' not found. Check that it exists and/or that it exists for node-type '{node_type}'.")
else:
    # Put the environment first so its packages take precedence on import.
    sys.path.insert(0, str(venv_site_pkgs))
    print(f"Added path '{venv_site_pkgs}' at start of search paths.")

!pwd

%load_ext autoreload
%autoreload 2

Added path '/rds/homes/g/gaddcz/Projects/CPRD/virtual-envTorch2.0-icelake/lib/python3.10/site-packages' at start of search paths.
/rds/homes/g/gaddcz/Projects/CPRD/examples/data/4_build_behrt_pre_training_dataset


In [8]:
import numpy as np
import polars as pl
import pandas as pd
import time
import logging
import pickle as pkl
from tqdm import tqdm

# Suppress every logging call of severity CRITICAL and below (i.e. all standard
# levels) for the rest of the kernel session. Any later `logging.*` call in this
# notebook, including warnings, therefore produces no output.
logging.disable(logging.CRITICAL)

from FastEHR.dataloader import FoundationalDataModule
from FastEHR.adapters.BEHRT import BehrtDFBuilder
from CPRD.examples.data.map_to_reduced_names import convert_event_names, EVENT_NAME_SHORT_MAP, EVENT_NAME_LONG_MAP

# Let polars show up to 300 rows when rendering a table before truncating the display.
pl.Config.set_tbl_rows(300)


polars.config.Config

# Load the same dataset used for pre-training SurvivEHR, but add an adapter for BEHRT

In [9]:
# Root directory of the pre-training dataset; also used by later cells as the
# base location for the BEHRT artefacts.
path_to_ds = "/rds/projects/g/gokhalkm-optimal/OPTIMAL_MASTER_DATASET/data/FoundationalModel/PreTrain/"

# Data module configured with the BEHRT adapter.
# NOTE(review): `load=True` presumably re-uses the dataset already stored at
# `path_to_ds` rather than rebuilding it from the SQLite database — confirm against FastEHR.
dm = FoundationalDataModule(
    path_to_db="/rds/projects/g/gokhalkm-optimal/OPTIMAL_MASTER_DATASET/data/FoundationalModel/cprd.db",
    path_to_ds=path_to_ds,
    load=True,
    adapter="BEHRT",
    min_workers=10,
    batch_size=128,
)

vocab_size = dm.train_set.tokenizer.vocab_size

# Summary: number of patients per split, then the vocabulary size (one item per line).
print(
    f"{len(dm.train_set)} training patients",
    f"{len(dm.val_set)} validation patients",
    f"{len(dm.test_set)} test patients",
    f"{vocab_size} vocab elements",
    sep="\n",
)

23613894 training patients
1426714 validation patients
1508320 test patients
265 vocab elements


## Get the tokenizer for the BEHRT dataset

In [10]:
# Show the BEHRT adapter's tokenizer: a mapping from token name to integer id,
# starting with the special tokens PAD, UNK, SEP, CLS and MASK.
display(dm.adapter.tokenizer)

{'PAD': 0,
 'UNK': 1,
 'SEP': 2,
 'CLS': 3,
 'MASK': 4,
 'ADDISONS_DISEASE': 5,
 'CYSTICFIBROSIS': 6,
 'SYSTEMIC_SCLEROSIS': 7,
 'SICKLE_CELL_DISEASE_V2': 8,
 'ADDISON_DISEASE': 9,
 'DOWNSSYNDROME': 10,
 'HAEMOCHROMATOSIS_V2': 11,
 'PLASMACELL_NEOPLASM_V2': 12,
 'SJOGRENSSYNDROME': 13,
 'SYSTEMIC_LUPUS_ERYTHEMATOSUS': 14,
 'HIVAIDS': 15,
 'PSORIATICARTHRITIS2021': 16,
 'MS': 17,
 'Plasma_N_terminal_pro_B_type_natriuretic_peptide_conc_70': 18,
 'LEUKAEMIA_PREVALENCEV2': 19,
 'N_terminal_pro_brain_natriuretic_peptide_level_67': 20,
 'ILD_SH': 21,
 'CHRONIC_LIVER_DISEASE_ALCOHOL': 22,
 'PERNICIOUSANAEMIA': 23,
 'MENIERESDISEASE': 24,
 'LYMPHOMA_PREVALENCE_V2': 25,
 'CROHNS_DISEASE': 26,
 'AllHIVdrugs_HIV': 27,
 'Plasma_B_natriuretic_peptide_level_69': 28,
 'CHRONICFATIGUESYNDROMEMM_V2': 29,
 'Plasma_pro_brain_natriuretic_peptide_level_64': 30,
 'STROKE_HAEMRGIC': 31,
 'PARKINSONS': 32,
 'AORTICANEURYSM_V2': 33,
 'BIPOLAR': 34,
 'BRONCHIECTASIS': 35,
 'ULCERATIVE_COLITIS': 36,
 'SCHIZOPHRE

# Create BEHRT dataset

We now have a usable dataloader for a BEHRT model. However, the original BEHRT code requires all data to be saved in a single dataframe. Here we demonstrate how to build it.

Note that this single-dataframe approach taken by BEHRT does not scale to the amount of pre-training data we have, so we do not proceed with pre-training BEHRT on this data.

In [11]:
# Save built tokenizer to file
# The mapping is wrapped under the 'token2idx' key before pickling.
# NOTE(review): presumably this is the layout the BEHRT code expects when it loads the vocabulary — confirm.
bert_vocab = {'token2idx': dm.adapter.tokenizer}

# Build the target with Path joins (robust to a missing trailing "/" in
# `path_to_ds`) and make sure the output directory exists before writing.
behrt_dir = Path(path_to_ds) / "BEHRT"
behrt_dir.mkdir(parents=True, exist_ok=True)

with open(behrt_dir / "token2idx.pkl", "wb") as f:
    pkl.dump(bert_vocab, f)

In [None]:
# Builder that collects batches via `add_batch(...)` and hands them back as a
# dataframe chunk on `flush()` (see the loop below in this cell).
# NOTE(review): the parameter meanings below are inferred from their names — confirm against FastEHR.adapters.BEHRT.
builder = BehrtDFBuilder(
    token_map=dm.adapter.tokenizer,  # same token -> id mapping that is saved to token2idx.pkl
    pad_token="PAD",
    class_token="CLS",
    sep_token="SEP",
    id_prefix="P",  # presumably the prefix of generated patient identifiers — TODO confirm
    zfill=7,  # presumably zero-pads the numeric part of the identifier to 7 digits — TODO confirm
    min_seq_len=2,  # presumably sequences shorter than this are skipped — TODO confirm
)

# Resolve and create the output location *before* the long pass over the
# training set, so a bad path fails immediately instead of after many hours.
# NOTE(review): this cell originally wrote to
# `data_dir + "adapted/BEHRT/pretrain/dataset.parquet"`, but `data_dir` is not
# defined anywhere in this notebook (NameError on a fresh kernel). The dataset is
# now written next to the saved tokenizer under `path_to_ds`; adjust if another
# location is intended.
behrt_parquet_path = Path(path_to_ds) / "BEHRT" / "dataset.parquet"
behrt_parquet_path.parent.mkdir(parents=True, exist_ok=True)

# Number of batches accumulated in the builder between two flushes.
FLUSH_EVERY = 10

# Create the dataloader once and reuse it for both iteration and the progress-bar total.
train_loader = dm.train_dataloader()

chunks = []
for idx_batch, batch in tqdm(
    enumerate(train_loader),
    desc="Creating BEHRT dataset",
    total=len(train_loader),
):

    builder.add_batch(batch["tokens"], batch["ages"])

    # Flush after every FLUSH_EVERY-th batch. The 1-based count avoids flushing a
    # chunk that holds only the very first batch.
    if (idx_batch + 1) % FLUSH_EVERY == 0:
        df_chunk = builder.flush()

        if not df_chunk.empty:
            chunks.append(df_chunk)

# Final flush: pick up whatever was added since the last periodic flush
final_chunk = builder.flush()
if not final_chunk.empty:
    chunks.append(final_chunk)

# Concatenate all chunks into the single dataframe and persist it
if chunks:
    df = pd.concat(chunks, ignore_index=True)
    df.to_parquet(behrt_parquet_path, index=False)

    # Quick sanity check: row count and the first row's fields
    print(len(df))
    print(df["patid"][0])
    print(df["caliber_id"][0])
    print(df["age"][0])
else:
    # Logging is globally disabled in the import cell
    # (`logging.disable(logging.CRITICAL)`), so `logging.warning` would be
    # silently dropped; print the message instead.
    print("No valid data")

Creating BEHRT dataset:   0%|          | 94/184484 [03:40<109:20:33,  2.13s/it]