In [None]:
# default_exp preprocessing.transform

# preprocessing.transform
> Classes and functions to transform cleaned EHR dataset into `Patient` & `PatientList` objects that can be used for neural net training.

In [None]:
#hide
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:85% !important; }</style>"))

In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
#export
from lemonade.preprocessing.clean import *
from lemonade.preprocessing.vocab import *
from fastai.imports import *
from fastai import *
from datetime import date
import torch.multiprocessing as multiprocessing

In [None]:
#hide
from nbdev.showdoc import *

**Load Cleaned Data**

In [None]:
PATH_1K, PATH_10K

('./datasets/synthea/1K', './datasets/synthea/10K')

In [None]:
CONDITIONS

{'diabetes': '44054006||START',
 'stroke': '230690007||START',
 'alzheimers': '26929004||START',
 'coronary_heart': '53741008||START'}

In [None]:
train_dfs, valid_dfs, test_dfs = load_cleaned_ehrdata(PATH_1K)

In [None]:
#[patients, patient_demographics, observations, allergies, careplans, medications, imaging_studies, procedures, conditions, immunizations]
all_dfs = train_dfs
patients_df, patient_demographics_df, all_rec_dfs = all_dfs[0], all_dfs[1], all_dfs[2:]

In [None]:
# [display(df.head()) for df in all_dfs]

In [None]:
patients_df.head()

Unnamed: 0_level_0,patient,birthdate,diabetes_y,diabetes_age,stroke_y,stroke_age,alzheimers_y,alzheimers_age,coronary_heart_y,coronary_heart_age
indx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,e4361ed5-5ef7-407d-b5d3-576155ed66cd,1988-05-16,False,,False,,False,,False,
1,b28ca05e-a076-4868-a1af-168ae06b59ce,1990-06-12,False,,False,,False,,False,
2,7c2b8c3b-fa87-44e9-b868-01db98dad56c,1988-09-05,False,,False,,False,,False,
3,0731de61-1a8b-4135-83c0-2ace696bcac8,1934-11-27,False,,False,,False,,False,
4,a119d309-9cac-415a-b9bd-c0cf10f15beb,1980-12-23,False,,False,,False,,False,


In [None]:
vals = all_dfs[0].iloc[1].values
vals

array(['b28ca05e-a076-4868-a1af-168ae06b59ce', '1990-06-12', False, nan,
       False, nan, False, nan, False, nan], dtype=object)

In [None]:
vals[0], vals[1], vals[2], vals[4], vals[6], vals[8]

('b28ca05e-a076-4868-a1af-168ae06b59ce',
 '1990-06-12',
 False,
 False,
 False,
 False)

**Load Vocabs**

In [None]:
vocab_list_1K = EhrVocabList.load(PATH_1K)
obs_vocab, alg_vocab, crpl_vocab, med_vocab, img_vocab, proc_vocab, cnd_vocab, imm_vocab = vocab_list_1K.records_vocabs
bday, bmonth, byear, marital, race, ethnicity, gender, birthplace, city, state, zipcode  = vocab_list_1K.demographics_vocabs
age_mean, age_std = vocab_list_1K.age_mean, vocab_list_1K.age_std

In [None]:
#all emb dimensions to be passed into model to determine nh
demographics_dims, recs_dims, demographics_dims_width, recs_dims_width = get_all_emb_dims(EhrVocabList.load(PATH_1K))

In [None]:
demographics_dims

[(33, 8),
 (14, 7),
 (124, 11),
 (5, 5),
 (7, 6),
 (25, 8),
 (4, 5),
 (205, 13),
 (211, 13),
 (3, 5),
 (200, 13)]

In [None]:
recs_dims

[(550, 17),
 (27, 8),
 (54, 9),
 (224, 13),
 (11, 6),
 (128, 12),
 (201, 13),
 (20, 7)]

In [None]:
demographics_dims_width, recs_dims_width

(94, 85)

## Assemble Single Patient

In [None]:
tst_ptid='0731de61-1a8b-4135-83c0-2ace696bcac8'

In [None]:
tst_pt = patients_df[patients_df.patient == tst_ptid]
tst_pt

Unnamed: 0_level_0,patient,birthdate,diabetes_y,diabetes_age,stroke_y,stroke_age,alzheimers_y,alzheimers_age,coronary_heart_y,coronary_heart_age
indx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3,0731de61-1a8b-4135-83c0-2ace696bcac8,1934-11-27,False,,False,,False,,False,


In [None]:
demograph_vector = patient_demographics_df.loc[tst_ptid]
demograph_vector

birthdate          1934-11-27
marital                     M
race                    white
ethnicity          portuguese
gender                      F
birthplace             Boston
city               Foxborough
state           Massachusetts
zip                      2035
age_now_days            30670
Name: 0731de61-1a8b-4135-83c0-2ace696bcac8, dtype: object

In [None]:
tst_pt_birthdate = tst_pt.birthdate.values[0]
tst_pt_birthdate

'1934-11-27'

In [None]:
tst_pt_diabetes, tst_pt_stroke, tst_pt_alzheimers, tst_pt_coronaryheart = \
    tst_pt.diabetes_y.values[0], tst_pt.stroke_y.values[0], tst_pt.alzheimers_y.values[0], tst_pt.coronary_heart_y.values[0]

In [None]:
tst_pt_diabetes, tst_pt_stroke, tst_pt_alzheimers, tst_pt_coronaryheart

(False, False, False, False)

### Collate Codes & Offsets

- **Filter out for this patient** 
 - `rec_dfs[[ptid]]` + `demographics[[ptid]]`
 - ~~df already filtered by cutoff age~~
 - This is being done in `PatientList`, this is just for testing, **i.e. this will never be called**
- Index with double brackets, `[[ptid]]`, so that a dataframe is returned even when only a single row matches

In [None]:
def get_rec_dfs(all_rec_dfs, ptid):
    '''Collect this patient's rows from every record dataframe (testing helper; `PatientList` performs the same lookup itself)'''
    def _rows_for(df):
        # Double brackets keep the result a dataframe even when only one row matches
        try:
            return df.loc[[ptid]]
        except KeyError:
            # `ptid` is not a label in this dataframe's index: stand in an empty dataframe
            return pd.DataFrame()

    return [_rows_for(df) for df in all_rec_dfs]

In [None]:
%time rec_dfs = get_rec_dfs(all_rec_dfs, tst_ptid)

CPU times: user 29.5 ms, sys: 192 µs, total: 29.7 ms
Wall time: 29.4 ms


rec_dfs -- observations, allergies, careplans, medications, imaging_studies, procedures, conditions, immunizations

The following are empty for this ptid (to check and confirm, uncomment following code snippet and run)
- allergies - 1
- ~~imaging_studies - 4~~

In [None]:
# for rec_df, name in zip(rec_dfs, FILENAMES[1:]):
#     print(f'{name}: {rec_df.shape}')
#     display(rec_df.tail())

In [None]:
p_obs, p_alg, p_crpl, p_med, p_img, p_proc, p_cnd, p_immn = rec_dfs

- For empty rec dfs like alg seen above, we have to indicate that nothing was recorded for this particular record (say allergies)
    1. So the codes list will be `xxnone`s of length `age_span`
        - For example for `age_start`=10 and `age_stop`=35, we will get 25 `xxnone`
- For non-empty rec dfs
 1. Collate codes by year or month (depending on `age_in_months` value)
 2. Compute respective offsets

For using age in days or hours (for example in case of hospitalization or ICU datasets)
- This function will need to be modified (in addition to `insert_age()` in `preprocessing.clean`)
- For example for age in days, `insert_age()` will insert a column in each data frame with the age in days, which then can be used by this function to filter for the right `age_span` 

In [None]:
#exports
def collate_codes_offsts(rec_df, age_start, age_stop, age_in_months=False):
    """Build one patient's EmbeddingBag lookup codes and offsets for the ages in `range(age_start, age_stop)`.

    `rec_df` holds a single patient's records with a `code` column and an `age`
    column (or `age_months` when `age_in_months` is True). Every age step
    contributes either all codes recorded at that age or a single `'xxnone'`
    placeholder, and `offsts[k]` is the position in `codes` where step `k` starts.
    Returns `(codes, offsts)` with `len(offsts) == age_stop - age_start`.
    """
    age_span = age_stop - age_start
    if rec_df.empty:
        # Nothing recorded for this record type: one placeholder per age step
        codes, offsts = ['xxnone'] * age_span, list(range(age_span))
    else:
        codes, offsts = [], [0]
        age_col  = 'age_months' if age_in_months else 'age'
        last_age = age_stop - 1
        for age in range(age_start, age_stop):
            found = rec_df.code[getattr(rec_df, age_col) == age].values
            if len(found) > 0:
                codes.extend(found)
                n_added = len(found)
            else:
                codes.append('xxnone')
                n_added = 1
            # The final step starts no further bag, so it contributes no offset
            if age < last_age: offsts.append(offsts[-1] + n_added)

    assert len(offsts) == age_span
    return codes, offsts

**Tests**

Collate EHR codes for our test patient, from month 95 to month 105 

In [None]:
%time all_codes_offsts = [collate_codes_offsts(df, age_start=95, age_stop=105, age_in_months=True) for df in rec_dfs]

CPU times: user 17 ms, sys: 137 µs, total: 17.1 ms
Wall time: 16.9 ms


Observation codes for the above `age_span` are obtained as shown below.

In [None]:
obs_codes = all_codes_offsts[0][0]
obs_codes

['8302-2||136.6||cm||numeric',
 '72514-3||0.6||{score}||numeric',
 '29463-7||23.3||kg||numeric',
 '39156-5||12.5||kg/m2||numeric',
 '8462-4||82.4||mmHg||numeric',
 '8480-6||138.1||mmHg||numeric',
 '72166-2||Never smoker||xxxnan||text',
 'xxnone',
 'xxnone',
 'xxnone',
 '8302-2||138.6||cm||numeric',
 '72514-3||3.1||{score}||numeric',
 '29463-7||24.2||kg||numeric',
 '39156-5||12.6||kg/m2||numeric',
 '8462-4||70.2||mmHg||numeric',
 '8480-6||131.5||mmHg||numeric',
 '72166-2||Never smoker||xxxnan||text',
 'xxnone',
 'xxnone',
 'xxnone',
 'xxnone',
 'xxnone']

The corresponding offsets (to be used for EmbeddingBag lookups) are obtained as shown below.

In [None]:
obs_offsts = all_codes_offsts[0][1]
obs_offsts

[0, 7, 8, 9, 10, 17, 18, 19, 20, 21]

In [None]:
len(obs_codes), len(obs_offsts)

(22, 10)

Note that: no matter how many observations are recorded, the number of offsets will always be equal to the `age_span`

Collate EHR codes for our test patient, from 10 to 25 years

In [None]:
%time all_codes_offsts = [collate_codes_offsts(df, age_start=10, age_stop=25) for df in rec_dfs]

CPU times: user 25.7 ms, sys: 0 ns, total: 25.7 ms
Wall time: 25.2 ms


### Numericalize
- Once we have all the codes from the vocabs, we need to numericalize them 
- The argument passed to `vocab.numericalize()` must always be a list
- `codes` returned from `collate_codes_offsts()` is always a list even if a single item

In [None]:
# Collated observation codes for patient
obs_codes = all_codes_offsts[0][0]

In [None]:
obs_codes_num = obs_vocab.numericalize(all_codes_offsts[0][0]) #codes numericalized
obs_offsts = all_codes_offsts[0][1] #offsets

In [None]:
# first 10 observation codes for patient
obs_codes[:10]

['8302-2||150.8||cm||numeric',
 '72514-3||3.9||{score}||numeric',
 '29463-7||31.1||kg||numeric',
 '39156-5||13.7||kg/m2||numeric',
 '8462-4||70.1||mmHg||numeric',
 '8480-6||110.2||mmHg||numeric',
 '72166-2||Never smoker||xxxnan||text',
 '8302-2||157.9||cm||numeric',
 '72514-3||1.9||{score}||numeric',
 '29463-7||34.9||kg||numeric']

In [None]:
# the above observation codes numericalized
obs_codes_num[:10]

[5, 9, 13, 87, 72, 77, 497, 5, 8, 13]

In [None]:
len(obs_codes), len(obs_codes_num), len(obs_offsts)

(115, 115, 15)

In [None]:
assert len(obs_codes) == len(obs_codes_num)

In [None]:
# the above observation codes textified 
obs_vocab.textify(obs_codes_num[:10])

[('8302-2||158.45000000000002||cm||numeric', 'Body Height'),
 ('72514-3||4.95||{score}||numeric',
  'Pain severity - 0-10 verbal numeric rating [Score] - Reported'),
 ('29463-7||42.15||kg||numeric', 'Body Weight'),
 ('39156-5||9.1||kg/m2||numeric', 'Body Mass Index'),
 ('8462-4||70.0||mmHg||numeric', 'Diastolic Blood Pressure'),
 ('8480-6||100.0||mmHg||numeric', 'Systolic Blood Pressure'),
 ('72166-2||Never smoker||xxxnan||text', 'Tobacco smoking status NHIS'),
 ('8302-2||158.45000000000002||cm||numeric', 'Body Height'),
 ('72514-3||2.475||{score}||numeric',
  'Pain severity - 0-10 verbal numeric rating [Score] - Reported'),
 ('29463-7||42.15||kg||numeric', 'Body Weight')]

In [None]:
#procedures for test patient
proc_codes = all_codes_offsts[5][0]

In [None]:
proc_codes_num,proc_offsts = proc_vocab.numericalize(all_codes_offsts[5][0]), all_codes_offsts[5][1]

In [None]:
assert len(proc_codes) == len(proc_codes_num)

In [None]:
len(proc_codes), len(proc_codes_num), len(proc_offsts)

(105, 105, 15)

**Putting all this into a function**

In [None]:
#export
def get_codenums_offsts(rec_dfs, all_vocabs, age_start, age_stop, age_in_months):
    '''Numericalize one patient's record codes for an age span, returning `(all_codenums, all_offsts)`

    `rec_dfs` and `all_vocabs` are positionally aligned in the order: observations,
    allergies, careplans, medications, imaging studies, procedures, conditions,
    immunizations. Both returned lists follow that same order.
    '''
    collated = [collate_codes_offsts(df, age_start, age_stop, age_in_months) for df in rec_dfs]
    # Unpacking into eight names keeps the requirement of exactly eight vocabs explicit
    obs_v, alg_v, crpl_v, med_v, img_v, proc_v, cnd_v, imm_v = all_vocabs

    all_codenums, all_offsts = [], []
    for pos, vocab in enumerate((obs_v, alg_v, crpl_v, med_v, img_v, proc_v, cnd_v, imm_v)):
        all_codenums.append(vocab.numericalize(collated[pos][0]))
        all_offsts.append(collated[pos][1])

    return all_codenums, all_offsts

In [None]:
#export
def get_demographics(demograph_vector, demographics_vocabs, age_mean, age_std):
    '''Numericalize demographics and normalize age for a given patient

    `demograph_vector` is a `pd.Series` read strictly by position: birthdate,
    marital, race, ethnicity, gender, birthplace, city, state, zip, age in days.
    `demographics_vocabs` holds eleven vocabs in the order: birth day, birth
    month, birth year, then one per categorical field above.
    Missing values are replaced by `'xxnone'` before numericalizing.

    Returns `(demographics, age)`: the list of numericalized values (birth day,
    month, year, then the eight categorical fields) and the age standardized as
    `(age - age_mean) / age_std`.
    '''
    bday, bmonth, byear, marital, race, ethnicity, gender, birthplace, city, state, zipcode = demographics_vocabs
    vals = demograph_vector.fillna('xxnone')

    # Use `.iloc` for positional access: plain `series[int]` on a string-labelled
    # Series is deprecated in pandas and becomes a label lookup (KeyError) later.
    birthdate = pd.Timestamp(vals.iloc[0])

    demographics = []
    demographics.extend(bday.numericalize  ([birthdate.day]))
    demographics.extend(bmonth.numericalize([birthdate.month]))
    demographics.extend(byear.numericalize ([birthdate.year]))

    # Remaining categorical fields sit at positions 1..8, in vocab order
    categorical_vocabs = (marital, race, ethnicity, gender, birthplace, city, state, zipcode)
    for pos, vocab in enumerate(categorical_vocabs, start=1):
        demographics.extend(vocab.numericalize([vals.iloc[pos]]))

    age = (vals.iloc[9] - age_mean) / age_std

    return demographics, age

**Test - Codes & Offsets**

In [None]:
%time codenums, offsts = get_codenums_offsts(rec_dfs, vocab_list_1K.records_vocabs, age_start=0, age_stop=20, age_in_months=False)

CPU times: user 367 ms, sys: 0 ns, total: 367 ms
Wall time: 366 ms


In [None]:
med_num, med_o = codenums[3],offsts[3]

In [None]:
med_codes, med_offsts = collate_codes_offsts(p_med, age_start=0, age_stop=20)

In [None]:
assert len(med_num) == len(med_codes)
assert med_o == med_offsts

In [None]:
med_vocab.textify(med_num)[2]

('562251||START', {'Amoxicillin 250 MG / Clavulanate 125 MG Oral Tablet'})

In [None]:
med_codes

['xxnone',
 'xxnone',
 '562251||START',
 '562251||STOP',
 'xxnone',
 'xxnone',
 'xxnone',
 'xxnone',
 'xxnone',
 '313782||START',
 '313782||STOP',
 'xxnone',
 'xxnone',
 'xxnone',
 'xxnone',
 'xxnone',
 '313782||START',
 '313782||STOP',
 'xxnone',
 '562251||START',
 '562251||STOP',
 'xxnone',
 'xxnone',
 'xxnone']

In [None]:
med_offsts, med_o

([0, 1, 2, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 18, 19, 21, 22, 23],
 [0, 1, 2, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 18, 19, 21, 22, 23])

In [None]:
len(med_codes),len(med_offsts)

(24, 20)

In [None]:
alg_codes, alg_offsts = collate_codes_offsts(p_alg, age_start=0, age_stop=20)

In [None]:
alg_codes

['xxnone',
 'xxnone',
 'xxnone',
 'xxnone',
 'xxnone',
 'xxnone',
 'xxnone',
 'xxnone',
 'xxnone',
 'xxnone',
 'xxnone',
 'xxnone',
 'xxnone',
 'xxnone',
 'xxnone',
 'xxnone',
 'xxnone',
 'xxnone',
 'xxnone',
 'xxnone']

In [None]:
alg_offsts

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [None]:
obs_num, obs_o = codenums[0],offsts[0]

In [None]:
obs_codes, obs_offsts = collate_codes_offsts(p_obs, age_start=0, age_stop=20)

In [None]:
len(obs_codes), len(obs_offsts), len(obs_num), len(obs_o)

(236, 20, 236, 20)

In [None]:
assert len(obs_codes) == len(obs_num)
assert obs_o == obs_offsts

In [None]:
%time codenums1, offsts1 = get_codenums_offsts(rec_dfs, vocab_list_1K.records_vocabs, age_start=20, age_stop=220, age_in_months=True)

CPU times: user 547 ms, sys: 2.57 ms, total: 550 ms
Wall time: 546 ms


In [None]:
for codenum, offst in zip(codenums1, offsts1):
    print(len(codenum))
    assert len(offst) == 200

346
200
201
202
200
200
203
200


**Test - Demographics**

In [None]:
demograph_vector

birthdate          1934-11-27
marital                     M
race                    white
ethnicity          portuguese
gender                      F
birthplace             Boston
city               Foxborough
state           Massachusetts
zip                      2035
age_now_days            30670
Name: 0731de61-1a8b-4135-83c0-2ace696bcac8, dtype: object

In [None]:
get_demographics(demograph_vector, vocab_list_1K.demographics_vocabs, vocab_list_1K.age_mean, vocab_list_1K.age_std)

([28, 12, 36, 2, 2, 4, 3, 5, 4, 2, 4], 1.574773090930408)

In [None]:
dem_vector = ['1988-05-16','S',None,None,'F','North Adams','Marlborough','Massachusetts',1901,11141]

In [None]:
dem_vector = pd.Series(dem_vector)
dem_vector

0       1988-05-16
1                S
2             None
3             None
4                F
5      North Adams
6      Marlborough
7    Massachusetts
8             1901
9            11141
dtype: object

In [None]:
get_demographics(dem_vector, vocab_list_1K.demographics_vocabs, vocab_list_1K.age_mean, vocab_list_1K.age_std)

([17, 6, 90, 3, 0, 0, 3, 90, 8, 2, 48], -0.5053754917098182)

In [None]:
get_demographics(patient_demographics_df.loc['e8b8ebb1-a7b9-4ce8-8c6d-cfd74a284ebf'], vocab_list_1K.demographics_vocabs, \
                 vocab_list_1K.age_mean, vocab_list_1K.age_std) #has null

([20, 6, 112, 0, 2, 4, 3, 10, 9, 2, 9], -1.361550119558768)

In [None]:
# vocab_list_1K.demographics_vocabs[10].ctoi

## `ItemBase` - Patient

- Based on the concept of `ItemBase` as used in fastai v1.x, in our case a single patient

In [None]:
#export
class Patient():
    '''Holds every numericalized / transformed tensor plus the labels and ids for a single patient'''
    # Record-type prefixes, in the positional order used by `nums` and `offsts`
    _rec_types = ('obs', 'alg', 'crpl', 'med', 'img', 'proc', 'cnd', 'imm')

    def __init__(self, nums, offsts, demographics, age_now, birthdate, diabetes, stroke, alzheimers, coronaryheart, ptid):
        # `<rec>_nums` tensors first, then `<rec>_offsts` tensors, one pair per record type
        for pos, rec in enumerate(self._rec_types):
            setattr(self, f'{rec}_nums', torch.tensor(nums[pos]))
        for pos, rec in enumerate(self._rec_types):
            setattr(self, f'{rec}_offsts', torch.tensor(offsts[pos]))

        self.demographics = torch.tensor(demographics)
        self.age_now      = torch.tensor([age_now])

        # Identifiers and condition labels are stored untouched (not converted to tensors)
        self.ptid, self.birthdate = ptid, birthdate
        self.diabetes, self.stroke = diabetes, stroke
        self.alzheimers, self.coronaryheart = alzheimers, coronaryheart

    def __repr__(self):
        return f'ptid:{self.ptid}, birthdate:{self.birthdate}, diabetes:{self.diabetes}, device:{self.alg_nums.device}'

    @classmethod
    def create(cls, rec_dfs, demograph, vocablist, ptid, birthdate, diabetes, stroke, alzheimers, coronaryheart, age_start, age_stop, age_in_months):
        '''Numericalize a patient's records and demographics, then build the patient object from them'''
        codenums, offsts = get_codenums_offsts(rec_dfs, vocablist.records_vocabs, age_start, age_stop, age_in_months)
        demographics, age_now = get_demographics(demograph, vocablist.demographics_vocabs, vocablist.age_mean, vocablist.age_std)
        return cls(codenums, offsts, demographics, age_now, birthdate, diabetes, stroke, alzheimers, coronaryheart, ptid)

    def to_gpu(self):
        '''Move every tensor held by this patient to the GPU and return the patient itself'''
        tensor_attrs  = [f'{rec}_nums' for rec in self._rec_types]
        tensor_attrs += [f'{rec}_offsts' for rec in self._rec_types]
        tensor_attrs += ['demographics', 'age_now']
        for attr in tensor_attrs:
            setattr(self, attr, getattr(self, attr).cuda())

        return self

In [None]:
show_doc(Patient, title_level=3)

<h3 id="Patient" class="doc_header"><code>class</code> <code>Patient</code><a href="" class="source_link" style="float:right">[source]</a></h3>

> <code>Patient</code>(**`nums`**, **`offsts`**, **`demographics`**, **`age_now`**, **`birthdate`**, **`diabetes`**, **`stroke`**, **`alzheimers`**, **`coronaryheart`**, **`ptid`**)

Class defining a patient object that holds all numericalized / transformed data for a single patient

In [None]:
show_doc(Patient.create)

<h4 id="Patient.create" class="doc_header"><code>Patient.create</code><a href="__main__.py#L36" class="source_link" style="float:right">[source]</a></h4>

> <code>Patient.create</code>(**`rec_dfs`**, **`demograph`**, **`vocablist`**, **`ptid`**, **`birthdate`**, **`diabetes`**, **`stroke`**, **`alzheimers`**, **`coronaryheart`**, **`age_start`**, **`age_stop`**, **`age_in_months`**)

Lookup codes, numericalize and then create patient object - given a patient id

In [None]:
show_doc(Patient.to_gpu)

<h4 id="Patient.to_gpu" class="doc_header"><code>Patient.to_gpu</code><a href="__main__.py#L43" class="source_link" style="float:right">[source]</a></h4>

> <code>Patient.to_gpu</code>()

Put this patient object on GPU

**Tests**

In [None]:
%time p1 = Patient.create(rec_dfs, demograph_vector, vocab_list_1K, tst_ptid, tst_pt_birthdate, tst_pt_diabetes, \
                                  tst_pt_stroke, tst_pt_alzheimers, tst_pt_coronaryheart, age_start=0, age_stop=20, age_in_months=False)

CPU times: user 373 ms, sys: 8.08 ms, total: 381 ms
Wall time: 372 ms


In [None]:
len(p1.obs_nums)

236

In [None]:
assert len(p1.obs_nums) == len(obs_codes)

In [None]:
p1.obs_offsts

tensor([  0,  45,  63,  77,  91,  98, 116, 123, 130, 137, 144, 151, 169, 176,
        183, 190, 197, 215, 222, 229])

In [None]:
p1.diabetes, p1.ptid, p1.birthdate

(False, '0731de61-1a8b-4135-83c0-2ace696bcac8', '1934-11-27')

In [None]:
p1

ptid:0731de61-1a8b-4135-83c0-2ace696bcac8, birthdate:1934-11-27, diabetes:False, device:cpu

In [None]:
p1.demographics

tensor([28, 12, 36,  2,  2,  4,  3,  5,  4,  2,  4])

In [None]:
p1.age_now

tensor([1.5748], dtype=torch.float64)

In [None]:
p1.demographics.shape, p1.age_now.shape

(torch.Size([11]), torch.Size([1]))

In [None]:
torch.cat((p1.demographics, p1.age_now.type(torch.LongTensor)), dim=0)

tensor([28, 12, 36,  2,  2,  4,  3,  5,  4,  2,  4,  1])

In [None]:
p1.to_gpu()

ptid:0731de61-1a8b-4135-83c0-2ace696bcac8, birthdate:1934-11-27, diabetes:False, device:cuda:0

In [None]:
p1.diabetes, p1.ptid, p1.obs_offsts, p1.alg_nums

(False,
 '0731de61-1a8b-4135-83c0-2ace696bcac8',
 tensor([  0,  45,  63,  77,  91,  98, 116, 123, 130, 137, 144, 151, 169, 176,
         183, 190, 197, 215, 222, 229], device='cuda:0'),
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        device='cuda:0'))

In [None]:
p1.demographics, p1.age_now

(tensor([28, 12, 36,  2,  2,  4,  3,  5,  4,  2,  4], device='cuda:0'),
 tensor([1.5748], device='cuda:0', dtype=torch.float64))

## `ItemList` - PatientList

- Based on the concept of `ItemList` as used in fastai v1.x, which is a list of `ItemBase` objects
- In our case `PatientList` is a list of `Patient` objects

**Multiprocessing Implementation**

- Chunk total number of patients based on number of cores available on machine
- Send each chunk of patients into a core
    - Let the parallelized sub proc in each core load all data and do the heavy lifting
    - The main proc just sends a list of indxs (patients) to work on

In [None]:
#exports
def get_pckl_dir(path, split, age_start, age_stop, age_in_months):
    '''Build the directory path under which a transformed `PatientList` split is pickled

    The layout is `<path>/processed/<months|years>_<age_start>_to_<age_stop>/<split>`.
    '''
    age_unit = 'months' if age_in_months else 'years'
    return Path(f'{path}/processed/{age_unit}_{age_start}_to_{age_stop}/{split}')

In [None]:
#export 
multiprocessing.set_sharing_strategy('file_system')
cpu_cnt = int(multiprocessing.cpu_count())

class PatientList():
    '''A class to hold a list of `Patient` objects'''
    def __init__(self, pts, path, split, age_start, age_stop, age_in_months):
        self.items     = pts
        self.base_path = path
        self.split     = split 
        self.age_start = age_start
        self.age_stop  = age_stop
        self.age_type  = 'months' if age_in_months else 'years'

    def __len__(self): return len(self.items)
    def __iter__(self): return iter(self.items)
    def __getitem__(self, idx):
        '''Index by int, slice, list of ints, or boolean mask (python or numpy values)'''
        if isinstance(idx, (int, np.integer, slice)): return self.items[idx]
        idx = list(idx)
        # An empty selection has no first element to inspect, so answer it directly
        if len(idx) == 0: return []
        if isinstance(idx[0], (bool, np.bool_)):
            assert len(idx)==len(self) # bool mask
            return [o for m,o in zip(idx,self.items) if m]
        return [self.items[i] for i in idx]
    def __repr__(self):
        res  = f'{self.__class__.__name__} ({len(self)} items)\n'
        res += f'base path:{self.base_path}; split:{self.split}; age span:{self.age_stop - self.age_start} {self.age_type}\n'
        res += f'age_start:{self.age_start}; age_stop:{self.age_stop}; age_type:{self.age_type}\n'
        for item in self.items[:10]:
            res += f'{item.__repr__()}\n'
        if len(self)>10: res = res[:-1]+ '...]'
        return res

    @staticmethod
    def _chunk_indxs(total_pts, n_workers):
        '''Split `range(total_pts)` into consecutive index lists sized for `n_workers - 1` full chunks plus a remainder'''
        # Both guards matter: a single core would divide by zero, and fewer patients
        # than workers would give a chunk size of 0 (an invalid `range` step).
        chnk_sz = max(1, total_pts // max(1, n_workers - 1))
        return [list(range(start, min(start + chnk_sz, total_pts))) for start in range(0, total_pts, chnk_sz)]

    @staticmethod
    def _create_pts_chunk(indx_chnk, all_dfs, vocablist, pckl_dir, age_start, age_stop, age_in_months, verbose):
        '''Parallelized function to run on one core and transform a single chunk of patients and save

        `indx_chnk` lists row positions in `all_dfs[0]` (the patients dataframe);
        `all_dfs[1]` is the demographics dataframe and `all_dfs[2:]` are the record
        dataframes, all looked up by patient id. Returns the number of patients written.
        '''
        pts = []

        for indx in indx_chnk:
            vals = all_dfs[0].iloc[indx].values
            ptid, birthdate = vals[0], vals[1]
            # condition flags sit at the even positions; the odd ones in between are skipped
            diabetes, stroke, alzheimers, coronaryheart = vals[2], vals[4], vals[6], vals[8]

            rec_dfs = []
            for rec_df in all_dfs[2:]:
                try:
                    rec_dfs.append(rec_df.loc[[ptid]])
                except KeyError:
                    # no rows for this patient in this record type
                    rec_dfs.append(pd.DataFrame())

            demograph = all_dfs[1].loc[ptid]
            pts.append(Patient.create(rec_dfs, demograph, vocablist, ptid, birthdate, diabetes, stroke, alzheimers, coronaryheart, age_start, age_stop, age_in_months))

        # Open the file only once the whole chunk is transformed, so a failure above
        # cannot leave behind an empty `.ptlist` that `load` would later choke on.
        with open(f'{pckl_dir}/patients_{indx_chnk[0]}_{indx_chnk[-1]}.ptlist', 'wb') as pckl_f:
            pickle.dump(pts,pckl_f)
        if verbose: print(f'{multiprocessing.current_process().name}-- completed {len(indx_chnk)} patients')
        return len(pts)

    @classmethod
    def create_save(cls, all_dfs, vocablist, pckl_dir, age_start, age_stop, age_in_months, verbose=False):
        '''Function to parallelize (based on available CPU cores), transformation for all patients in given dataset and save `PatientList` object

        Creates `pckl_dir` if needed and writes one `.ptlist` pickle per chunk of
        patients into it. Returns nothing; prints the number of patients completed.
        '''
        pckl_dir.mkdir(parents=True, exist_ok=True)
        indx_chnks = cls._chunk_indxs(len(all_dfs[0]), cpu_cnt)

        all_chunks = []
        if indx_chnks:
            parallelize = partial(cls._create_pts_chunk, all_dfs=all_dfs, vocablist=vocablist, pckl_dir=pckl_dir, age_start=age_start, age_stop=age_stop, age_in_months=age_in_months, verbose=verbose)
            # The context manager tears the pool down even if a worker raises
            with multiprocessing.Pool(processes=min(cpu_cnt, len(indx_chnks))) as pool:
                all_chunks = pool.map(parallelize, indx_chnks)

        print(f'{sum(all_chunks)} total patients completed, saved patient list to {pckl_dir}')

    @classmethod
    def load(cls, path, split, age_start, age_stop, age_in_months):
        '''Load previously created `PatientList` object

        Reads every `*.ptlist` file in the directory given by `get_pckl_dir` and
        concatenates their contents.
        '''
        pckl_dir = get_pckl_dir(path, split, age_start, age_stop, age_in_months)
        ptlist = []
        # Sort so the patient order does not depend on the filesystem's listing order.
        # NOTE: unpickling can execute arbitrary code - only load directories written by `create_save`.
        for file in sorted(Path(pckl_dir).glob("*.ptlist")):
            with open(file, 'rb') as infile:
                ptlist.extend(pickle.load(infile))

        return(cls(ptlist, path, split, age_start, age_stop, age_in_months))

In [None]:
show_doc(PatientList, title_level=3)

<h3 id="PatientList" class="doc_header"><code>class</code> <code>PatientList</code><a href="" class="source_link" style="float:right">[source]</a></h3>

> <code>PatientList</code>(**`pts`**, **`path`**, **`split`**, **`age_start`**, **`age_stop`**, **`age_in_months`**)

A class to hold a list of [`Patient`](/lemonade/preprocessing_transform#Patient) objects

In [None]:
show_doc(PatientList.create_save)

<h4 id="PatientList.create_save" class="doc_header"><code>PatientList.create_save</code><a href="__main__.py#L57" class="source_link" style="float:right">[source]</a></h4>

> <code>PatientList.create_save</code>(**`all_dfs`**, **`vocablist`**, **`pckl_dir`**, **`age_start`**, **`age_stop`**, **`age_in_months`**, **`verbose`**=*`False`*)

Function to parellelize (based on available CPU cores), transformation for all patients in given dataset and save [`PatientList`](/lemonade/preprocessing_transform#PatientList) object

In [None]:
show_doc(PatientList._create_pts_chunk)

<h4 id="PatientList._create_pts_chunk" class="doc_header"><code>PatientList._create_pts_chunk</code><a href="__main__.py#L32" class="source_link" style="float:right">[source]</a></h4>

> <code>PatientList._create_pts_chunk</code>(**`indx_chnk`**, **`all_dfs`**, **`vocablist`**, **`pckl_dir`**, **`age_start`**, **`age_stop`**, **`age_in_months`**, **`verbose`**)

Parallelized function to run on one core and transform a single chunk of patients and save

In [None]:
show_doc(PatientList.load)

<h4 id="PatientList.load" class="doc_header"><code>PatientList.load</code><a href="__main__.py#L76" class="source_link" style="float:right">[source]</a></h4>

> <code>PatientList.load</code>(**`path`**, **`split`**, **`age_start`**, **`age_stop`**, **`age_in_months`**)

Load previously created [`PatientList`](/lemonade/preprocessing_transform#PatientList) object

**Tests**

In [None]:
tst_pckl_dir = get_pckl_dir(PATH_1K, split='train', age_start=0, age_stop=20, age_in_months=False)

In [None]:
tst_pckl_dir

Path('datasets/synthea/1K/processed/years_0_to_20/train')

In [None]:
%time PatientList.create_save(all_dfs, vocab_list_1K, tst_pckl_dir, age_start=0, age_stop=20, age_in_months=False)

664 total patients completed, saved patient list to datasets/synthea/1K/processed/years_0_to_20/train
CPU times: user 1.73 s, sys: 619 ms, total: 2.34 s
Wall time: 19.1 s


In [None]:
#export
def create_all_ptlists(path:Path, age_start:int, age_stop:int, age_in_months:bool, vocab_path:Path=None, verbose:bool=False, delete_existing:bool=True):
    '''Build and save one `PatientList` per split (train, valid, test) for the dataset at `path`.

    The vocab list is loaded from `vocab_path`, falling back to `path` when `vocab_path` is `None`.
    When `delete_existing` is set, any `*.ptlist` files already present in a split's directory
    (as returned by `get_pckl_dir`) are removed before that split is created again.
    '''
    split_dfs = load_cleaned_ehrdata(path)  # ordered as train_dfs, valid_dfs, test_dfs
    vocablist = EhrVocabList.load(path if vocab_path is None else vocab_path)

    for split, all_dfs in zip(('train', 'valid', 'test'), split_dfs):
        pckl_dir = get_pckl_dir(path, split, age_start, age_stop, age_in_months)
        # Nothing to clean up unless the caller asked for previously saved lists to be dropped
        stale_ptlists = Path(pckl_dir).glob("*.ptlist") if delete_existing else ()
        for old_ptlist in stale_ptlists:
            old_ptlist.unlink()
        PatientList.create_save(all_dfs, vocablist, pckl_dir, age_start, age_stop, age_in_months, verbose)

**Tests**

In [None]:
%time create_all_ptlists(PATH_1K, age_start=240, age_stop=360, age_in_months=True) #20 to 30 yrs in mos (seq_len = 120)

664 total patients completed, saved patient list to datasets/synthea/1K/processed/months_240_to_360/train
222 total patients completed, saved patient list to datasets/synthea/1K/processed/months_240_to_360/valid
222 total patients completed, saved patient list to datasets/synthea/1K/processed/months_240_to_360/test
CPU times: user 3.65 s, sys: 1.7 s, total: 5.35 s
Wall time: 24.4 s


**Other examples**

```python
%time create_all_ptlists(PATH_1K, age_start=0, age_stop=20, age_in_months=False, verbose=False)
```
```
664 total patients completed, saved patient list to datasets/synthea/1K/processed/years_0_to_20/train
222 total patients completed, saved patient list to datasets/synthea/1K/processed/years_0_to_20/valid
222 total patients completed, saved patient list to datasets/synthea/1K/processed/years_0_to_20/test
CPU times: user 3.66 s, sys: 1.98 s, total: 5.64 s
Wall time: 32.4 s
```
```python
%time create_all_ptlists(PATH_1K, age_start=0, age_stop=35, age_in_months=False, verbose=False)
```
```
664 total patients completed, saved patient list to datasets/synthea/1K/processed/years_0_to_35/train
222 total patients completed, saved patient list to datasets/synthea/1K/processed/years_0_to_35/valid
222 total patients completed, saved patient list to datasets/synthea/1K/processed/years_0_to_35/test
CPU times: user 3.66 s, sys: 1.93 s, total: 5.59 s
Wall time: 40.2 s
```
```python
%time create_all_ptlists(PATH_1K, age_start=120, age_stop=360, age_in_months=True) #10 to 30 yrs in mos (seq_len = 240)
```
```
664 total patients completed, saved patient list to datasets/synthea/1K/processed/months_120_to_360/train
222 total patients completed, saved patient list to datasets/synthea/1K/processed/months_120_to_360/valid
222 total patients completed, saved patient list to datasets/synthea/1K/processed/months_120_to_360/test
CPU times: user 3.84 s, sys: 1.98 s, total: 5.82 s
Wall time: 46.8 s
```

In [None]:
ptlist_1K = PatientList.load(PATH_1K, 'train', age_start=240, age_stop=360, age_in_months=True) 

In [None]:
len(ptlist_1K)

664

In [None]:
ptlist_1K

PatientList (664 items)
base path:./datasets/synthea/1K; split:train; age span:120 months
age_start:240; age_stop:360; age_type:months
ptid:12b42348-29bb-4cf4-b878-d419a4c8f067, birthdate:1951-09-04, diabetes:False, device:cpu
ptid:4c59df3b-042b-42c2-901a-3783d6d77919, birthdate:1979-09-18, diabetes:False, device:cpu
ptid:8f07e577-7ab4-4e89-ae98-6d394b3929a9, birthdate:1967-01-13, diabetes:False, device:cpu
ptid:653753d9-a52f-4ef3-b285-bdc3c671293c, birthdate:1971-12-26, diabetes:False, device:cpu
ptid:1b801001-0bf9-4ad9-b175-9de9bc2d905f, birthdate:1979-11-26, diabetes:False, device:cpu
ptid:da6203bc-bb24-4628-9aed-2b851571ee36, birthdate:1999-03-14, diabetes:False, device:cpu
ptid:b02cd0ff-e891-4caa-b69b-2da7cb267578, birthdate:1957-09-12, diabetes:True, device:cpu
ptid:49a859d3-318a-4212-864e-cf7d17f5032c, birthdate:2006-04-24, diabetes:False, device:cpu
ptid:cb9293c4-243f-427e-ba70-6834c5872be6, birthdate:1952-10-21, diabetes:True, device:cpu
ptid:cf82e51b-5c34-4010-b904-1583b009eb

In [None]:
# Length of one patient's `obs_nums` next to the length of another patient's `obs_offsts`.
# NOTE(review): the two indices differ (300 vs 200) - confirm whether the same patient was intended.
len(ptlist_1K.items[300].obs_nums), len(ptlist_1K.items[200].obs_offsts)

(170, 120)

In [None]:
ptlist_1K.items[300].obs_nums

tensor([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   5,   8,  14,  88,
         75,  79,  18,  25,  30,  36,  40,  42,  50,  55,  61,  65,  69,  95,
        497,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   5,   8,  14,  88,  75,  79,
         96, 497,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   5,   7,  14,  88,  74,
         80,  20,  24,  30,  35,  39,  46,  48,  53,  58,  65,  67,  94, 497,
          5,   8,  14,  88,  75,  81,  95, 497,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0])

In [None]:
ptlist_1K[300].proc_nums

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

## Do All Preprocessing

In [None]:
#export
def preprocess_ehr_dataset(path, today, valid_pct=0.2, test_pct=0.2, obs_vocab_buckets=5,
                           age_start=0, age_stop=20, age_in_months=False, vocab_path=None, from_raw_data=False):
    '''Run the whole preprocessing pipeline for the dataset at `path`.

    When `from_raw_data` is true, the raw dataset is first split and cleaned
    (`clean_raw_ehrdata`) and the vocab lists are created and saved (`EhrVocabList`);
    otherwise both steps are skipped. Patient lists are then created in either case
    through `create_all_ptlists`, which receives `vocab_path` as given.
    '''
    if not from_raw_data:
        print('Since data is pre-cleaned, skipping Cleaning, Splitting and Vocab-creation')
    else:
        print('------------------- Splitting and cleaning raw dataset -------------------')
        clean_raw_ehrdata(path, valid_pct, test_pct, today)
        print('------------------- Creating vocab lists -------------------')
        vocablist = EhrVocabList.create(path, num_buckets=obs_vocab_buckets)
        vocablist.save()

    print('------------------- Creating patient lists -------------------')
    create_all_ptlists(path, age_start, age_stop, age_in_months, vocab_path=vocab_path)

## Export -

In [None]:
#hide
from nbdev.export import *
notebook2script()

Converted 01_preprocessing_clean.ipynb.
Converted 02_preprocessing_vocab.ipynb.
Converted 03_preprocessing_transform.ipynb.
Converted 04_data.ipynb.
Converted 05_metrics.ipynb.
Converted index.ipynb.
