In [None]:
!pip install exeteracovid

In [1]:
import os

# Paths of the dataset that is read (source_data) and the dataset that is written
# (dest_data). Each can be overridden with an environment variable so the notebook can
# run on another machine without editing this cell; the defaults are the original paths.
source_data = os.environ.get('EXETERA_SOURCE_DATA', '/home/ben/covid/ds_20201101_full.hdf5')
dest_data = os.environ.get('EXETERA_DEST_DATA', '/home/ben/covid/example_out.hdf5')

In [2]:
from exetera.core.session import Session
from exetera.core.utils import Timer

# All work begins by creating a Session. Entering it through 'with' means the datasets
# opened below are closed automatically when the block exits.
with Session() as s:
    # Several datasets can be open at once. Intermediate results are written to a
    # separate dataset here instead of being added to the base dataset.
    src = s.open_dataset(source_data, 'r', 'src')
    dest = s.open_dataset(dest_data, 'w', 'dest')

    # Convenience handles for the source 'patients' and 'assessments' groups
    s_ptnts = src['patients']
    s_asmts = src['assessments']

    # Destination group that receives the joined fields
    d_asmts = dest.create_group('assessments')

    # Joining several fields at once is easiest when driven by a sequence of field names
    ptnt_keys = ('age', 'weight_kg', 'height_cm')

    # The patient fields whose values are to be joined onto the assessments
    merge_sources = tuple([s.get(s_ptnts[key]) for key in ptnt_keys])

    # Matching empty fields, created in the destination group, that the joined values
    # are written to
    merge_sinks = tuple([s.get(s_ptnts[key]).create_like(d_asmts, key) for key in ptnt_keys])

    # Join keys: the patient 'id' field (primary key) and the assessment 'patient_id'
    # field (foreign key)
    p_ids = s.get(s_ptnts['id'])
    a_pids = s.get(s_asmts['patient_id'])

    # Time the merge; assessments are the left side, patients the right side
    with Timer("merging"):
        s.ordered_merge_left(
            left_on=a_pids,
            right_on=p_ids,
            right_field_sources=merge_sources,
            left_field_sinks=merge_sinks,
            right_unique=True,
        )

merging: completed in 388.443642616272 seconds


In [3]:
import numpy as np

# Re-open the destination dataset read-only to confirm that the joined fields were written
with Session() as s:
    dest = s.open_dataset(dest_data, 'r', 'dest')

    # Print the names of the fields now present in the destination 'assessments' group
    print(list(dest['assessments'].keys()))

    # Print each distinct joined 'age' value together with how often it occurs.
    # NOTE(review): the recorded output lists ages above 120 (e.g. 1819, 2020), so this
    # field appears to need cleaning before it is used in an analysis.
    age_values = s.get(dest['assessments']['age']).data[:]
    print(np.unique(age_values, return_counts=True))

['age', 'height_cm', 'weight_kg']
(array([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
         11,   12,   13,   14,   15,   16,   17,   18,   19,   20,   21,
         22,   23,   24,   25,   26,   27,   28,   29,   30,   31,   32,
         33,   34,   35,   36,   37,   38,   39,   40,   41,   42,   43,
         44,   45,   46,   47,   48,   49,   50,   51,   52,   53,   54,
         55,   56,   57,   58,   59,   60,   61,   62,   63,   64,   65,
         66,   67,   68,   69,   70,   71,   72,   73,   74,   75,   76,
         77,   78,   79,   80,   81,   82,   83,   84,   85,   86,   87,
         88,   89,   90,   91,   92,   93,   94,   95,   96,   97,   98,
         99,  100,  101,  102,  103,  104,  105,  106,  107,  108,  109,
        110,  111,  112,  113,  114,  115,  116,  117,  118,  119,  120,
       1819, 1821, 1823, 1827, 2001, 2019, 2020], dtype=uint32), array([  82969,  220138,  256975,  328739,  380326,  400474,  418018,
        428705,  476300,  5