In [1]:
import os
from pathlib import Path
import sys
node_type = os.getenv('BB_CPU')
venv_dir = f'/rds/homes/g/gaddcz/Projects/CPRD/virtual-env-{node_type}'
venv_site_pkgs = Path(venv_dir) / 'lib' / f'python{sys.version_info.major}.{sys.version_info.minor}' / 'site-packages'
if venv_site_pkgs.exists():
    sys.path.insert(0, str(venv_site_pkgs))
    print(f"Added path '{venv_site_pkgs}' at start of search paths.")
else:
    print(f"Path '{venv_site_pkgs}' not found. Check that it exists and/or that it exists for node-type '{node_type}'.")

%load_ext autoreload
%autoreload 2

%env SQLITE_TMPDIR=/rds/projects/g/gokhalkm-optimal/DataforCharles
%env TMPDIR=/rds/projects/g/gokhalkm-optimal/DataforCharles
!echo $SQLITE_TMPDIR
!echo $TMPDIR
!echo $USERPROFILE

Added path '/rds/homes/g/gaddcz/Projects/CPRD/virtual-env-icelake/lib/python3.10/site-packages' at start of search paths.
env: SQLITE_TMPDIR=/rds/projects/g/gokhalkm-optimal/DataforCharles
env: TMPDIR=/rds/projects/g/gokhalkm-optimal/DataforCharles
/rds/projects/g/gokhalkm-optimal/DataforCharles
/rds/projects/g/gokhalkm-optimal/DataforCharles



In [2]:
import pytorch_lightning
import torch
import numpy as np
import matplotlib.pyplot as plt
import random
import sqlite3
import logging
from CPRD.data.database.build_static_db import Static
from CPRD.data.database.build_diagnosis_db import Diagnoses
from CPRD.data.database.build_measurements_and_tests_db import Measurements

# Seed every random number generator imported by this notebook, not just torch,
# so that any stochastic step is repeatable.
SEED = 1337
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

logging.basicConfig(level=logging.INFO)

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(f"Using device: {device}.")

# NOTE: a `!export TMPDIR=...` line used to sit here. `!` commands run in a
# throwaway subshell, so that export never reached the kernel's environment;
# TMPDIR is the value set with `%env` in the first cell.
# NOTE(review): if .../DataforCharles/tmp was the intended location, set it with
# `%env` in the first cell after confirming that the directory exists.

# Show the working directory the kernel is running from.
print(os.getcwd())


Using device: cuda.
/rds/homes/g/gaddcz/Projects/CPRD/data/database


In [3]:
# SQLite database file handed to every table object and to the collector below.
PATH_TO_DB = (
    "/rds/projects/g/gokhalkm-optimal/OPTIMAL_MASTER_DATASET"
    "/archive/Version2/cprd.db"
)

## Create/load tables

In [8]:
# Raw input locations, all under the Version2 archive of the master dataset.
_VERSION2_DIR = "/rds/projects/g/gokhalkm-optimal/OPTIMAL_MASTER_DATASET/archive/Version2"
_BASELINE_CSV = f"{_VERSION2_DIR}/baseline/masterDataOptimal_v220230323015634.csv"

PATH_TO_STATIC = _BASELINE_CSV
# NOTE(review): the diagnosis table is pointed at the same baseline CSV as the
# static table - presumably intentional; confirm.
PATH_TO_DIAGNOSIS = _BASELINE_CSV
PATH_TO_MEASUREMENTS = f"{_VERSION2_DIR}/timeseries/measurement_and_tests/"

# Passed straight through to each table constructor; the next cell calls
# build_table() on every table when this is False.
load = False

# One object per table, all backed by the same database file.
static = Static(PATH_TO_DB, PATH_TO_STATIC, load=load)
diagnosis = Diagnoses(PATH_TO_DB, PATH_TO_DIAGNOSIS, load=load)
measurements = Measurements(PATH_TO_DB, PATH_TO_MEASUREMENTS, load=load)

INFO:root:Creating static_table
INFO:root:Creating diagnosis_table
INFO:root:Creating measurement_table


## Build tables if not loaded

In [9]:
# When `load` is False, call build_table() on each table object in turn;
# either way, print the table afterwards.
for table in (static, diagnosis, measurements):
    if not load:
        table.build_table(verbose=0, chunksize=1e5)
    print(table)

Building static table: 6it [00:13,  2.29s/it]
INFO:root:Creating indexes on static_table


Static table with 0.55M records.


Building diagnosis table: 6it [00:21,  3.51s/it]
INFO:root:Creating indexes on diagnosis_table


[(0, 'diagnosis_index', 0, 'c', 0)]
Diagnosis table with 1.19M records.


Adding 25-Hydroxyvitamin_D2_level_92                                  : 1it [00:00, 15.76it/s]
Adding 25-Hydroxyvitamin_D3_level_90                                  : 1it [00:00, 15.97it/s]
Adding AST_-_aspartate_transam._SGOT__46                              : 1it [00:00,  3.94it/s]
Adding AST_serum_level_47                                             : 2it [00:00,  2.01it/s]
Adding Albumin___creatinine_ratio_37                                  : 1it [00:00, 23.18it/s]
Adding Basophil_count_22                                              : 20it [00:11,  1.67it/s]
Adding Blood_calcium_level_38                                         : 1it [00:00, 99.46it/s]
Adding Blood_urea_28                                                  : 1it [00:00,  7.83it/s]
Adding Body_mass_index_3                                              : 22it [00:13,  1.63it/s]
Adding Brain_natriuretic_peptide_level_66                             : 1it [00:00, 67.14it/s]
Adding Calcium_adjusted_level_41                

[(0, 'measurement_PRACTICE_PATIENT_ID_idx', 0, 'c', 0)]
[(0, 'measurement_EVENT_idx', 0, 'c', 0), (1, 'measurement_PRACTICE_PATIENT_ID_idx', 0, 'c', 0)]
Measurement table with 79.02M records.


In [6]:
# Connect the diagnosis table object, commit on its connection, and print the object.
# NOTE(review): there is no disconnect() in this cell, so the connection
# presumably stays open until a later cell disconnects - confirm.
# NOTE(review): this cell's execution count (6) is lower than that of the build
# cell above (9), so the record count in its saved output may predate the build.
diagnosis.connect()
diagnosis.connection.commit()
print(diagnosis)

Diagnosis table with 0.00M records.


In [18]:
# Run a single PRAGMA against the database through the diagnosis table's
# connection and print whatever rows the statement returns.
# `temp_store = 1` asks SQLite to keep temporary tables and indices in a file.
# To list the schema objects instead, swap in: SELECT name FROM sqlite_master;
# NOTE(review): temp_store is a per-connection setting in SQLite; because this
# cell disconnects straight away, the setting presumably does not carry over to
# later connections - confirm how connect() manages connections.
sql_query = """PRAGMA temp_store = 1"""
diagnosis.connect()
try:
    diagnosis.cursor.execute(sql_query)
    result = diagnosis.cursor.fetchall()
    print(result)
finally:
    # Release the connection even if the statement or the fetch fails.
    diagnosis.disconnect()


[(0,)]


## Check query execution time

In [21]:
import time

# Drop the current connection and open a new one before timing.
measurements.disconnect()
measurements.connect()

# Time only the query itself (execute + fetch). perf_counter() is a monotonic
# clock intended for measuring intervals, and the timer is stopped before the
# rows are printed so that output cost is not counted.
query_start = time.perf_counter()
measurements.cursor.execute(""" SELECT
                                	*
                                FROM
                                	measurement_table
                                WHERE
                                    VALUE = '3720368640.0'  ;""")
matching_rows = measurements.cursor.fetchall()
eval_time = time.perf_counter() - query_start

print(matching_rows)
print(eval_time)

[('p20705_1570204020705', 'Red_blood_cell__RBC__count_10', 3720368640.0, '2013-02-06')]
7.521908521652222


# Collector demo

We can extract unique distinct column values from any table, and use these to chunk the data for processing

* We can add practice level conditions here,
    * e.g. practices that are in the North West region. 

In [12]:
from CPRD.data.dataset.collector import SQLiteDataCollector
import polars as pl
import logging 
logging.basicConfig(level=logging.INFO)

# Open a collector on the same SQLite database used by the table objects.
collector = SQLiteDataCollector(PATH_TO_DB)
collector.connect()

# Distinct PRACTICE_ID values from the static table, restricted by the
# HEALTH_AUTH condition below.
north_west_filter = ["HEALTH_AUTH = 'North West'"]
practice_ids = collector._extract_distinct(
    ["static_table"],
    "PRACTICE_ID",
    conditions=north_west_filter,
)

practice_count = len(practice_ids)
print(f"The first ten unique practice IDs: {practice_ids[:10]} from {practice_count} total")

The first ten unique practice IDs: [20515, 20551, 20713, 20524, 20397, 20684, 20655, 20530, 20659, 20758] from 13 total


## Given these practice IDs, we can chain this query to get the practice patient IDs in each practice

* We do this as only the static table has all practice_patient_id, patient_id, and practice_id stored and indexed.
    * This means we can very quickly find the keys needed to index on the other tables which only have practice_patient_id indexed.
* We can add patient level conditions here
    * For example, only male, people who died during a period, etc

In [13]:
# One inner list per practice: the distinct PRACTICE_PATIENT_ID values in the
# static table that match both the practice and the SEX = 'M' condition.
practice_patient_ids = [
    collector._extract_distinct(
        ["static_table"],
        "PRACTICE_PATIENT_ID",
        conditions=[f"PRACTICE_ID = '{practice_id}' AND SEX = 'M'"],
    )
    for practice_id in practice_ids
]

# Summarise the first practice, then the number of IDs found in every practice.
first_practice_patients = practice_patient_ids[0]
print(f"\nThe first five unique practice_patient IDs from practice {practice_ids[0]}:\n\t{first_practice_patients[:5]} from {len(first_practice_patients)} total")
male_counts = [len(patient_ids) for patient_ids in practice_patient_ids]
print(f"\nTotal male patients in each practice:\n\t {male_counts}")


The first five unique practice_patient IDs from practice 20515:
	['p20515_297522720515', 'p20515_5685611020515', 'p20515_297194820515', 'p20515_2251234720515', 'p20515_297307220515'] from 3352 total

Total male patients in each practice:
	 [3352, 11147, 3346, 6035, 6019, 2332, 11073, 7837, 2112, 8992, 9742, 7067, 7739]


### Create a generator which chunks the .db tables by practice_id or practice_patient_id and lazily batch using Polars (Rust)

* We can also choose to pass this method a list of conditions, one for each table.
   * For example, we may only want to collect measurements within some predefined list, period, etc.

In the first case, we may want to generate by individual level values. 

We pass in the list to generate over, which in this instance is the list of practice patient IDs belonging to the first chunk. Each call to the generator then yields one person.

In [14]:
# Generate over the patient IDs of the first practice, reading from the three
# tables keyed on PRACTICE_PATIENT_ID.
generator = collector._lazy_generate_by_distinct(practice_patient_ids[0], ["static_table", "measurement_table", "diagnosis_table"], "PRACTICE_PATIENT_ID")

# Take just the first chunk. Fail loudly if nothing is yielded, instead of
# leaving `lazy_table_frames_dict` unbound (or holding a value from an earlier
# run of the notebook).
first_chunk = next(iter(generator), None)
if first_chunk is None:
    raise RuntimeError("The generator yielded no chunks for the first practice's patient IDs.")
chunk_name, lazy_table_frames_dict = first_chunk

# Materialise each lazy frame and show its first rows.
for frame_key in ("lazy_static", "lazy_measurement", "lazy_diagnosis"):
    display(lazy_table_frames_dict[frame_key].collect().head())

PRACTICE_PATIENT_ID,PRACTICE_ID,PATIENT_ID,ETHNICITY,YEAR_OF_BIRTH,SEX,COUNTRY,HEALTH_AUTH,INDEX_DATE,START_DATE,END_DATE
str,i64,i64,str,str,str,str,str,str,str,str
"""p20515_2975227…",20515,297522720515,"""MISSING""","""1966-07-15""","""M""","""E""","""North West""","""2005-01-01""","""2005-01-01""","""2007-09-27"""


PRACTICE_PATIENT_ID,EVENT,VALUE,DATE
str,str,f64,str
"""p20515_2975227…","""Body_mass_inde…",25.1,"""1997-06-02"""
"""p20515_2975227…","""Body_mass_inde…",26.2,"""2004-02-03"""
"""p20515_2975227…","""Diastolic_bloo…",86.0,"""1997-06-02"""
"""p20515_2975227…","""Diastolic_bloo…",78.0,"""2004-02-03"""
"""p20515_2975227…","""Diastolic_bloo…",88.0,"""2006-04-24"""


PRACTICE_PATIENT_ID,EVENT,DATE
str,str,str
"""p20515_2975227…","""DEPRESSION""","""2002-05-17"""


Alternatively, we may wish to generate groups/batches of individuals for vectorisation. In that case we can pass in a list of lists. We generate over the outer list, and each call yields all the inner list values.

For example, in the below we have an outer list of all practices, and an inner list of patients within the practice. Consequently, each call to generate yields all records of all patients within a practice.

In [15]:
# Generate over the outer list (one inner list of patient IDs per practice),
# reading from the three tables keyed on PRACTICE_PATIENT_ID.
generator = collector._lazy_generate_by_distinct(practice_patient_ids, ["static_table", "measurement_table", "diagnosis_table"], "PRACTICE_PATIENT_ID")

# Take just the first chunk. Fail loudly if nothing is yielded; otherwise the
# displays below would silently show whatever `lazy_table_frames_dict` held
# from a previously executed cell.
first_chunk = next(iter(generator), None)
if first_chunk is None:
    raise RuntimeError("The generator yielded no chunks for the per-practice patient ID lists.")
chunk_name, lazy_table_frames_dict = first_chunk

# Materialise each lazy frame and show its first rows.
for frame_key in ("lazy_static", "lazy_measurement", "lazy_diagnosis"):
    display(lazy_table_frames_dict[frame_key].collect().head())


PRACTICE_PATIENT_ID,PRACTICE_ID,PATIENT_ID,ETHNICITY,YEAR_OF_BIRTH,SEX,COUNTRY,HEALTH_AUTH,INDEX_DATE,START_DATE,END_DATE
str,i64,i64,str,str,str,str,str,str,str,str
"""p20515_1099848…",20515,10998487420515,"""MISSING""","""1976-07-15""","""M""","""E""","""North West""","""2022-03-12""","""2022-03-12""","""2022-03-19"""
"""p20515_1037909…",20515,1037909720515,"""WHITE""","""1990-07-15""","""M""","""E""","""North West""","""2019-02-28""","""2019-02-28""","""2022-03-19"""
"""p20515_1066963…",20515,1066963220515,"""WHITE""","""1988-07-15""","""M""","""E""","""North West""","""2019-03-02""","""2019-03-02""","""2022-03-19"""
"""p20515_1066963…",20515,1066963320515,"""WHITE""","""1999-07-15""","""M""","""E""","""North West""","""2019-03-02""","""2019-03-02""","""2022-03-19"""
"""p20515_1066974…",20515,1066974920515,"""WHITE""","""1989-07-15""","""M""","""E""","""North West""","""2019-03-05""","""2019-03-05""","""2019-04-12"""


PRACTICE_PATIENT_ID,EVENT,VALUE,DATE
str,str,f64,str
"""p20515_1002242…","""AST_serum_leve…",27.0,"""2010-08-24"""
"""p20515_1002242…","""AST_serum_leve…",42.0,"""2011-06-03"""
"""p20515_1002242…","""AST_serum_leve…",29.0,"""2012-11-16"""
"""p20515_1002242…","""AST_serum_leve…",30.0,"""2013-11-30"""
"""p20515_1002242…","""AST_serum_leve…",28.0,"""2016-02-01"""


PRACTICE_PATIENT_ID,EVENT,DATE
str,str,str
"""p20515_1002242…","""ALLCA_NOBCC_VF…","""2017-10-01"""
"""p20515_1037909…","""DEPRESSION""","""2010-05-20"""
"""p20515_1037909…","""ANXIETY""","""2020-01-03"""
"""p20515_1037909…","""ALLCA_NOBCC_VF…","""2020-05-11"""
"""p20515_1037909…","""LYMPHOMA_PREVA…","""2020-04-02"""


### Test collation of generated lazy frames

In [16]:
# Pass the lazy frames of the most recently generated chunk to the collector's
# collation step; the dict entries are unpacked as keyword arguments. The result
# stays lazy until .collect() is called on it.
lazy_batch = collector._collate_lazy_tables(**lazy_table_frames_dict)

### View how tables are aligned

In [17]:
# Materialise the collated batch and show it.
batch = lazy_batch.collect()
display(batch)

# Take the first entry of the unique patient IDs and show the batch rows
# that belong to it.
unique_ids = list(batch["PRACTICE_PATIENT_ID"].unique())
first_identifier = unique_ids[0]
display(first_identifier)

patient_rows = batch.filter(pl.col("PRACTICE_PATIENT_ID") == first_identifier)
display(patient_rows)

# For comparison, show the diastolic blood pressure events straight from the
# measurement frame of the same chunk, before collation.
measurement_frame = lazy_table_frames_dict["lazy_measurement"].collect()
diastolic_rows = measurement_frame.filter(pl.col("EVENT") == "Diastolic_blood_pressure_5")
display(diastolic_rows)

PRACTICE_PATIENT_ID,VALUE,EVENT,DAYS_SINCE_BIRTH,PRACTICE_ID,PATIENT_ID,ETHNICITY,YEAR_OF_BIRTH,SEX,COUNTRY,HEALTH_AUTH,INDEX_DATE,START_DATE,END_DATE
str,list[f64],list[str],list[i64],str,str,str,datetime[μs],str,str,str,datetime[μs],datetime[μs],datetime[μs]
"""p20515_1002242…","[28.8, 90.0, … 140.0]","[""Body_mass_index_3"", ""Diastolic_blood_pressure_5"", … ""Systolic_blood_pressure_4""]","[21961, 21961, … 26533]","""p20515""","""1002242920515""","""WHITE""",1946-07-15 00:00:00,"""M""","""E""","""North West""",2019-02-22 00:00:00,2019-02-22 00:00:00,2020-11-13 00:00:00
"""p20515_1037909…","[null, null, … 110.0]","[""O_E_-_weight_2"", ""ATOPICECZEMA"", … ""Systolic_blood_pressure_4""]","[230, 1010, … 11272]","""p20515""","""1037909720515""","""WHITE""",1990-07-15 00:00:00,"""M""","""E""","""North West""",2019-02-28 00:00:00,2019-02-28 00:00:00,2022-03-19 00:00:00
"""p20515_1066963…","[25.2, 81.0, … null]","[""Body_mass_index_3"", ""Diastolic_blood_pressure_5"", … ""GOUT""]","[6409, 6409, … 12110]","""p20515""","""1066963220515""","""WHITE""",1988-07-15 00:00:00,"""M""","""E""","""North West""",2019-03-02 00:00:00,2019-03-02 00:00:00,2022-03-19 00:00:00
"""p20515_1066963…","[null, null, … 10.4]","[""ATOPICECZEMA"", ""ASTHMA_PUSHASTHMA"", … ""Total_white_cell_count_18""]","[644, 1876, … 8107]","""p20515""","""1066963320515""","""WHITE""",1999-07-15 00:00:00,"""M""","""E""","""North West""",2019-03-02 00:00:00,2019-03-02 00:00:00,2022-03-19 00:00:00
"""p20515_1066974…","[null, null, … 130.0]","[""ATOPICECZEMA"", ""ASTHMA_PUSHASTHMA"", … ""Systolic_blood_pressure_4""]","[1266, 2693, … 10468]","""p20515""","""1066974920515""","""WHITE""",1989-07-15 00:00:00,"""M""","""E""","""North West""",2019-03-05 00:00:00,2019-03-05 00:00:00,2019-04-12 00:00:00
"""p20515_1099618…","[18.3, null, … 53.0]","[""Body_mass_index_3"", ""Never_smoked_tobacco_85"", … ""O_E_-_weight_2""]","[9276, 9276, … 9276]","""p20515""","""10996184120515…","""WHITE""",1991-07-15 00:00:00,"""M""","""E""","""North West""",2022-03-15 00:00:00,2022-03-15 00:00:00,2022-03-19 00:00:00
"""p20515_1099848…","[null, null]","[""DEPRESSION"", ""ANXIETY""]","[14271, 14271]","""p20515""","""10998487420515…","""MISSING""",1976-07-15 00:00:00,"""M""","""E""","""North West""",2022-03-12 00:00:00,2022-03-12 00:00:00,2022-03-19 00:00:00
"""p20515_1137545…","[null, null, … 66.8]","[""ASTHMA_PUSHASTHMA"", ""ATOPICECZEMA"", … ""O_E_-_weight_2""]","[219, 219, … 8970]","""p20515""","""1137545520515""","""WHITE""",1997-07-15 00:00:00,"""M""","""E""","""North West""",2019-03-06 00:00:00,2019-03-06 00:00:00,2022-03-19 00:00:00
"""p20515_1142689…","[24.0, null, … 1.46]","[""Body_mass_index_3"", ""Ex_smoker_84"", … ""Serum_triglycerides_105""]","[9190, 9190, … 12214]","""p20515""","""1142689720515""","""WHITE""",1987-07-15 00:00:00,"""M""","""E""","""North West""",2019-03-07 00:00:00,2019-03-07 00:00:00,2021-06-19 00:00:00
"""p20515_1147235…","[23.1, null, … 317.0]","[""Body_mass_index_3"", ""Never_smoked_tobacco_85"", … ""Serum_vitamin_B12_79""]","[9987, 9987, … 12060]","""p20515""","""1147235520515""","""BLACK""",1988-07-15 00:00:00,"""M""","""E""","""North West""",2019-03-09 00:00:00,2019-03-09 00:00:00,2022-03-19 00:00:00


'p20515_297396920515'

PRACTICE_PATIENT_ID,VALUE,EVENT,DAYS_SINCE_BIRTH,PRACTICE_ID,PATIENT_ID,ETHNICITY,YEAR_OF_BIRTH,SEX,COUNTRY,HEALTH_AUTH,INDEX_DATE,START_DATE,END_DATE
str,list[f64],list[str],list[i64],str,str,str,datetime[μs],str,str,str,datetime[μs],datetime[μs],datetime[μs]
"""p20515_2973969…","[25.8, 80.0, … null]","[""Body_mass_index_3"", ""Diastolic_blood_pressure_5"", … ""ANXIETY""]","[9899, 9899, … 14850]","""p20515""","""297396920515""","""WHITE""",1979-07-15 00:00:00,"""M""","""E""","""North West""",2007-04-26 00:00:00,2007-04-26 00:00:00,2022-03-19 00:00:00


PRACTICE_PATIENT_ID,EVENT,VALUE,DATE
str,str,f64,str
"""p20515_1002242…","""Diastolic_bloo…",90.0,"""2006-08-30"""
"""p20515_1002242…","""Diastolic_bloo…",90.0,"""2008-09-04"""
"""p20515_1002242…","""Diastolic_bloo…",80.0,"""2008-09-19"""
"""p20515_1002242…","""Diastolic_bloo…",80.0,"""2008-10-03"""
"""p20515_1002242…","""Diastolic_bloo…",80.0,"""2009-07-23"""
"""p20515_1002242…","""Diastolic_bloo…",80.0,"""2010-08-13"""
"""p20515_1002242…","""Diastolic_bloo…",78.0,"""2011-06-02"""
"""p20515_1002242…","""Diastolic_bloo…",76.0,"""2012-11-15"""
"""p20515_1002242…","""Diastolic_bloo…",82.0,"""2013-08-30"""
"""p20515_1002242…","""Diastolic_bloo…",84.0,"""2013-11-29"""


# Demo wrapper

All of this is wrapped together (without the option to filter - add your own wrapper specific to your application)

In [18]:
# for _idx, (chunk_name, lazy_table_frames_dict) in enumerate(collector.generator()):
#     lazy_batch = collector._collate_lazy_tables(**lazy_table_frames_dict)
#     break
# print(_idx)
# print(lazy_batch.collect())

In [19]:
# Export this notebook (buildV2.ipynb) to HTML; --no-input leaves the code
# cells' input out of the rendered page.
!jupyter nbconvert --to html --no-input buildV2.ipynb

[NbConvertApp] Converting notebook buildV2.ipynb to html
[NbConvertApp] Writing 645516 bytes to buildV2.html
