In [None]:
!pip install exeteracovid

In [None]:
# Locations of the datasets used by the rest of this notebook.
# Replace both None placeholders with dataset paths before running.
source_data = None  # the dataset from which the patient and assessment data is read
dest_data = None  # the dataset to which the patient and assessment data is written

# Stop here with a clear message instead of failing later when the datasets are opened
if source_data is None or dest_data is None:
    raise ValueError("Set 'source_data' and 'dest_data' to dataset paths before running this notebook")

In [None]:
from exetera.core.session import Session
from exetera.core.utils import Timer

# All work begins by creating a Session. Entering it through 'with' means that the
# datasets opened inside the block are closed automatically when the block exits.
with Session() as s:
    # Several datasets can be open at the same time. Intermediate results are written
    # to a separate dataset here rather than being added to the base dataset.
    src = s.open_dataset(source_data, 'r', 'src')
    dest = s.open_dataset(dest_data, 'w', 'dest')

    # Convenience handles for the patients and assessments groups of the source data
    s_ptnts = src['patients']
    s_asmts = src['assessments']

    # Destination group that receives the joined fields
    d_asmts = dest.create_group('assessments')

    # Names of the patient fields that are joined onto the assessments
    ptnt_keys = ('age', 'weight_kg', 'height_cm')

    # For every key, collect the patient field to read from and create a matching
    # (still empty) field in the destination group to write the joined values to
    source_fields = []
    sink_fields = []
    for key in ptnt_keys:
        source_fields.append(s.get(s_ptnts[key]))
        sink_fields.append(s.get(s_ptnts[key]).create_like(d_asmts, key))
    merge_sources = tuple(source_fields)
    merge_sinks = tuple(sink_fields)

    # Primary key on the patient side, foreign key on the assessment side
    p_ids = s.get(s_ptnts['id'])
    a_pids = s.get(s_asmts['patient_id'])

    # Note, the merge may take a few minutes to complete!
    with Timer("merging"):
        s.ordered_merge_left(
            left_on=a_pids,
            right_on=p_ids,
            right_field_sources=merge_sources,
            left_field_sinks=merge_sinks,
            right_unique=True,
        )

In [None]:
import numpy as np
from matplotlib import pyplot as plt

# Check that the fields were written by comparing how often each age occurs among the
# patients with how often it occurs in the 'age' field joined onto the assessments
with Session() as s:
    src = s.open_dataset(source_data, 'r', 'src')
    dest = s.open_dataset(dest_data, 'r', 'dest')

    # Distinct age values (sorted) and the number of rows carrying each of them
    p_age, p_count = np.unique(s.get(src['patients']['age']).data[:], return_counts=True)
    a_age, a_count = np.unique(s.get(dest['assessments']['age']).data[:], return_counts=True)

    fig, ax = plt.subplots(3, 1, figsize=(12, 10))

    # Only the first 101 distinct ages are shown in each panel
    ax[0].set_xlabel("Age in years")
    ax[0].set_ylabel("Number of patients")
    ax[0].bar(p_age[:101], p_count[:101])

    ax[1].set_xlabel("Age in years")
    ax[1].set_ylabel("Number of assessments")
    ax[1].bar(a_age[:101], a_count[:101])

    # np.unique only reports values that actually occur, so the patient and assessment
    # results need not list the same ages at the same positions. Pair the counts by age
    # value instead of by position before taking the ratio; ages that are present on
    # only one side are left out of this panel.
    shared_age, a_idx, p_idx = np.intersect1d(
        a_age[:101], p_age[:101], assume_unique=True, return_indices=True)

    ax[2].set_xlabel("Age in years")
    ax[2].set_ylabel("Avg. assessments per patient")
    # Counts from np.unique are always >= 1, so the denominator is never zero
    ax[2].bar(shared_age, a_count[a_idx] / p_count[p_idx])

    # Keep the x-axis labels of the stacked panels from overlapping the panel below
    fig.tight_layout()