In [None]:
# default_exp preprocessing.vocab

# preprocessing.vocab

In [None]:
#hide
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:85% !important; }</style>"))

In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
#export
from lemonade.preprocessing.clean import *
from fastai.imports import *
from fastai import *
from datetime import date

In [None]:
#hide
from nbdev.showdoc import *

## `nn.Embedding` and `nn.EmbeddingBag`

**nn.Embedding**

In [None]:
emb1 = nn.Embedding(5,3)

In [None]:
emb1(torch.LongTensor([[0,1,2,3,4]]))

tensor([[[-0.8246, -0.4822, -0.9968],
         [-0.3546,  1.2317,  0.9929],
         [ 0.6679, -0.1507, -2.3095],
         [-1.6767, -0.4624, -0.6374],
         [ 0.7486, -1.2633, -0.1052]]], grad_fn=<EmbeddingBackward>)

An embedding matrix is a lookup table:
1. `emb1` above has 5 rows, one for each of the 5 elements (indices 0 to 4).
2. Looking up an element returns the vector (here of length 3) stored in that element's row.

Given this embedding matrix, looking up elements 1, 2 and 4 looks like this:

In [None]:
input = torch.LongTensor([[1,2,4]])

In [None]:
emb1(input)

tensor([[[-0.3546,  1.2317,  0.9929],
         [ 0.6679, -0.1507, -2.3095],
         [ 0.7486, -1.2633, -0.1052]]], grad_fn=<EmbeddingBackward>)

A batch of inputs is also possible (in this case a batch of 2, each looking up 3 elements).
- Note that every input in a batch must look up the same number of elements.

In [None]:
input = torch.LongTensor([[1,2,4],[0,3,2]])
# input = torch.LongTensor([[1,2,4],[0,3,2,1]]) # this will fail

In [None]:
emb1(input)

tensor([[[-0.3546,  1.2317,  0.9929],
         [ 0.6679, -0.1507, -2.3095],
         [ 0.7486, -1.2633, -0.1052]],

        [[-0.8246, -0.4822, -0.9968],
         [-1.6767, -0.4624, -0.6374],
         [ 0.6679, -0.1507, -2.3095]]], grad_fn=<EmbeddingBackward>)

**`nn.EmbeddingBag`**

In [None]:
embg1 = nn.EmbeddingBag(5,3)

Exactly the same input as in the case of `nn.Embedding` above (a batch of 2),
- but the result is averaged across the 3 elements of each input in the batch,
- resulting in an output of 2 vectors rather than 6 as above.

In [None]:
input = torch.LongTensor([[1,2,4],[0,3,2]]) # exactly same as above, but o/p is avg'd now

In [None]:
embg1(input)

tensor([[ 0.2607, -0.2411, -0.1735],
        [-0.4628,  0.1948,  0.2288]], grad_fn=<EmbeddingBagBackward>)

Another way to do this is to pass one flat input together with `offsets` (the start position of each input), rather than separating the inputs into 2 (or any number of) lists.

In [None]:
input = torch.LongTensor([1,2,4,0,3,2]) #same as above - 2 of same length 3
offsets = torch.LongTensor([0,3]) # output will be avg'd by default

In [None]:
embg1(input, offsets)

tensor([[ 0.2607, -0.2411, -0.1735],
        [-0.4628,  0.1948,  0.2288]], grad_fn=<EmbeddingBagBackward>)

In [None]:
input = torch.LongTensor([1,2,4,2,0,3,3,2]) #same as above - batch of 2 inputs, but of length 4 each
offsets = torch.LongTensor([0,4])

In [None]:
embg1(input, offsets) #avg'd 2 outputs one for each input batch i.e. avg'd across 4 in each batch

tensor([[ 0.3061, -0.6667,  0.0523],
        [-0.3239,  0.3875,  0.5752]], grad_fn=<EmbeddingBagBackward>)

**Different Sizes**

Offsets allow the inputs in a batch to have different lengths.

In [None]:
input = torch.LongTensor([1,2,4,2,0,3,3,2]) #same input as above but .. 
offsets = torch.LongTensor([0,3,5]) #this indicates - 3 batches of different lengths (0,1,2)(3,4)(5,6,7)

In [None]:
embg1(input, offsets)

tensor([[ 0.2607, -0.2411, -0.1735],
        [-0.7404, -0.1908, -0.4639],
        [ 0.2092, -0.0039,  1.3196]], grad_fn=<EmbeddingBagBackward>)

**Application to EHR Data**

Details here

## `itoc`, `ctoi`, `ctod`, `numericalize`, `textify`

I tried to extend the fastai vocabs, but found it easier to write these from scratch.

## Vocabs

In [None]:
code_dfs = load_ehr_vocabcodes(PATH_1K)

In [None]:
pt_codes, obs_codes, alg_codes, crpl_codes, med_codes, img_codes, proc_codes, cnd_codes, immn_codes = code_dfs

In [None]:
#export
class EhrVocab():
    '''Vocab class for most EHR datatypes

    Holds `itoc` (list: index -> code), `ctoi` (dict: code -> index) and, when
    descriptions are available, `ctod` (dict: code -> description(s)).
    Index 0 is always `xxnone` and index 1 is always `xxunk`.
    '''
    def __init__(self, itoc, ctoi, ctod=None):
        self.itoc = itoc
        self.ctoi = ctoi
        # `ctod` is deliberately left unset when there are no descriptions;
        # `textify` checks for it with `hasattr`
        if ctod is not None: self.ctod = ctod
        self.vocab_size = len(self.itoc)

    @classmethod
    def create(cls, codes_df):
        '''Create vocab object (itoc, ctoi and maybe ctod) from the codes df

        `codes_df` must have a `code` column (cast to `str` here) and may have a
        `desc` column. When `desc` exists, every code maps to the *set* of all
        descriptions recorded for it.
        '''
        desc_exists = 'desc' in codes_df.columns
        codes_df = codes_df.astype({'code':'str'})  # returns a new df, caller's df is untouched

        # special tokens first, then codes in order of first appearance
        itoc = ['xxnone', 'xxunk'] + list(codes_df.code.unique())
        ctoi = {code: i for i, code in enumerate(itoc)}
        if not desc_exists: return cls(itoc, ctoi)

        # Collect descriptions row by row. Looking them up with `codes_df.loc[code].desc`
        # returns a plain string when a code occurs only once, and `set()` of that
        # string would be a set of characters rather than a set holding the description.
        desc_sets = {}
        for code, desc in zip(codes_df['code'], codes_df['desc']):
            desc_sets.setdefault(code, set()).add(desc)

        ctod = {itoc[0]: "Nothing recorded", itoc[1]: "Unknown"}
        ctod.update(desc_sets)
        return cls(itoc, ctoi, ctod)

    def get_emb_dims(self, αd=0.5736):
        '''Get embedding dimensions as `(vocab_size, embedding_width)`

        The width is `round(6 * αd * vocab_size**0.25)`, so `αd` scales it.
        '''
        return self.vocab_size, round(6* αd * (self.vocab_size**0.25))

    def numericalize(self, codes, verbose=True, log_dir='./log'):
        '''Lookup and return indices for codes

        Codes are cast to `str` before lookup. Codes missing from the vocab map
        to the index of `xxunk`; when `verbose` is set they are also appended to
        `{log_dir}/{today}_numericalize_exceptions.log` (`log_dir` is created if needed).
        '''
        res, unknown = [], []
        for code in codes:
            idx = self.ctoi.get(str(code))
            if idx is None:
                idx = self.ctoi['xxunk']
                unknown.append(code)
            res.append(idx)

        if verbose and unknown:
            # the log directory may not exist yet - opening the file would fail without this
            os.makedirs(log_dir, exist_ok=True)
            today = date.today().strftime("%Y-%m-%d")
            logfile = os.path.join(log_dir, f'{today}_numericalize_exceptions.log')
            with open(logfile, 'a') as log:
                log.writelines(f'\ncode: {code}' for code in unknown)

        return res

    def textify(self, indxs):
        '''Lookup and return descriptions for codes

        Returns `(code, description)` tuples when the vocab has descriptions,
        otherwise just the codes.
        '''
        if hasattr(self, 'ctod'):
            return [ (self.itoc[i], self.ctod[self.itoc[i]]) for i in indxs ]
        return [ self.itoc[i] for i in indxs ]

In [None]:
show_doc(EhrVocab.create)

<h4 id="EhrVocab.create" class="doc_header"><code>EhrVocab.create</code><a href="__main__.py#L10" class="source_link" style="float:right">[source]</a></h4>

> <code>EhrVocab.create</code>(**`codes_df`**)

Create vocab object (itoc, ctoi and maybe ctod) from the codes df

In [None]:
show_doc(EhrVocab.numericalize)

<h4 id="EhrVocab.numericalize" class="doc_header"><code>EhrVocab.numericalize</code><a href="__main__.py#L35" class="source_link" style="float:right">[source]</a></h4>

> <code>EhrVocab.numericalize</code>(**`codes`**, **`verbose`**=*`True`*)

Lookup and return indices for codes

In [None]:
show_doc(EhrVocab.textify)

<h4 id="EhrVocab.textify" class="doc_header"><code>EhrVocab.textify</code><a href="__main__.py#L55" class="source_link" style="float:right">[source]</a></h4>

> <code>EhrVocab.textify</code>(**`indxs`**)

Lookup and return descriptions for codes

In [None]:
show_doc(EhrVocab.get_emb_dims)

<h4 id="EhrVocab.get_emb_dims" class="doc_header"><code>EhrVocab.get_emb_dims</code><a href="__main__.py#L31" class="source_link" style="float:right">[source]</a></h4>

> <code>EhrVocab.get_emb_dims</code>(**`αd`**=*`0.5736`*)

Get embedding dimensions

In [None]:
#export
class ObsVocab (EhrVocab):
    '''Special Vocab class for Observation codes

    The vocab is a dataframe (`vocab_df`) with columns `code, desc, value, units, type`;
    the row position is the vocab index. Row 0 is `xxnone` and row 1 is `xxunk`.
    Observation codes are passed around as `code||value||units||type` strings.
    '''
    def __init__(self, vocab_df):
        self.vocab_df = vocab_df
        self.vocab_size = len(vocab_df)

    def numericalize(self, codes, verbose=True, log_dir='./log'):
        '''Numericalize observation codes (return indices for codes)

        - `xxnone` / `xxunk` are looked up directly
        - `numeric` codes match on code, units and type, then pick the row with the closest value
        - all other codes need an exact match on code, value, units and type
        Codes without a match map to `xxunk`; when `verbose` is set they are also appended
        to `{log_dir}/{today}_numericalize_exceptions.log`.
        '''
        vocab_df = self.vocab_df
        logfile = None
        if verbose:
            os.makedirs(log_dir, exist_ok=True)  # also handles nested / already existing dirs
            today = date.today().strftime("%Y-%m-%d")
            # write into `log_dir` (this used to be hard-coded to './log' regardless of `log_dir`)
            logfile = os.path.join(log_dir, f'{today}_numericalize_exceptions.log')

        indxs = []
        for code in codes:
            if code in ['xxnone','xxunk']:
                indxs.extend(vocab_df[(vocab_df['code'] == code)].index.tolist())
                continue

            c,v,u,t = code.split('||')
            same_obs = (vocab_df['code'] == c) & (vocab_df['units'] == u) & (vocab_df['type'] == t)
            if t == 'numeric':
                filt_df = vocab_df[same_obs]
                # distance of every bucket value to the incoming value; `value` is an object
                # column (it also holds text values), so convert to a float array first
                dist = np.abs(filt_df['value'].to_numpy(dtype=float) - float(v))
                # argsort gives positions (not labels), hence `iloc`; `[:1]` keeps the closest row
                res = filt_df.iloc[np.argsort(dist, kind='stable')[:1]].index.tolist()
            else:
                res = vocab_df[same_obs & (vocab_df['value'] == v)].index.tolist()

            if len(res) == 0:
                indxs.extend(vocab_df[(vocab_df['code'] == 'xxunk')].index.tolist())
                if verbose:
                    with open(logfile, 'a') as log:
                        log.write(f'\ncode in ObsVocab: {code}')
            else:
                indxs.extend(res)
        assert len(codes) == len(indxs), "Possible bug, not all codes being numericalized"
        return indxs

    def textify(self, indxs):
        '''Textify observation codes (returns codes and descriptions)

        Returns one `(code||value||units||type, desc)` tuple per index.
        '''
        txts = []
        for i in indxs:
            c,d,v,u,t = self.vocab_df.iloc[i]
            # NOTE(review): only index 0 (xxnone) is returned as a bare code; index 1 (xxunk)
            # comes back as 'xxunk||xxunk||xxunk||xxunk' - confirm whether that is intended
            if i == 0: txts.append((c, d))
            else:      txts.append((f'{c}||{v}||{u}||{t}', d))
        assert len(indxs) == len(txts), "Possible bug, not all indxs being textified"
        return txts

    @classmethod
    def create(cls, obs_codes, num_buckets=5):
        '''Create vocab object from observation codes

        `obs_codes` needs the columns `orig_code, desc, value, units, type`.
        - `numeric` observations: for every (code, units) pair, `num_buckets` evenly spaced
          values between the min and max recorded value
        - `text` observations: for every (code, units) pair, one row per unique value
        '''
        numerics = obs_codes.loc[obs_codes['type'] == 'numeric',:].astype({'value':'float'})
        texts = obs_codes.loc[obs_codes['type'] == 'text',:]

        # special tokens always occupy the first two rows
        vocab_rows = [['xxnone','Nothing recorded','xxnone','xxnone','xxnone'],
                      ['xxunk','Unknown','xxunk','xxunk','xxunk']]

        # NOTE(review): `== unit` never matches NaN units - presumably missing units have
        # already been replaced by a placeholder upstream; confirm
        for code in numerics.orig_code.unique():
            this_code = numerics.loc[numerics['orig_code'] == code]
            for unit in this_code.units.unique():
                this_unit = this_code.loc[this_code['units'] == unit]
                desc = this_unit.desc.iloc[0]
                for val in np.linspace(this_unit.value.min(), this_unit.value.max(), num=num_buckets):
                    vocab_rows.append([code,desc,val,unit,'numeric'])

        for code in texts.orig_code.unique():
            this_code = texts.loc[texts['orig_code'] == code]
            for unit in this_code.units.unique():
                this_unit = this_code.loc[this_code['units'] == unit]
                desc = this_unit.desc.iloc[0]
                for val in this_unit.value.unique():
                    vocab_rows.append([code,desc,val,unit,'text'])

        obs_vocab = pd.DataFrame(data=vocab_rows, columns=['code','desc','value','units','type'])
        # every original code must have made it into the vocab (-2 for the special tokens)
        assert obs_codes.orig_code.nunique() == obs_vocab.code.nunique()-2, "Possible bug, obs_code nuniques don't match"
        return cls(obs_vocab)

In [None]:
show_doc(ObsVocab.create)

<h4 id="ObsVocab.create" class="doc_header"><code>ObsVocab.create</code><a href="__main__.py#L45" class="source_link" style="float:right">[source]</a></h4>

> <code>ObsVocab.create</code>(**`obs_codes`**, **`num_buckets`**=*`5`*)

Create vocab object from observation codes

In [None]:
show_doc(ObsVocab.numericalize)

<h4 id="ObsVocab.numericalize" class="doc_header"><code>ObsVocab.numericalize</code><a href="__main__.py#L8" class="source_link" style="float:right">[source]</a></h4>

> <code>ObsVocab.numericalize</code>(**`codes`**, **`verbose`**=*`True`*, **`log_dir`**=*`'./log'`*)

Numericalize observation codes (return indices for codes)

In [None]:
show_doc(ObsVocab.textify)

<h4 id="ObsVocab.textify" class="doc_header"><code>ObsVocab.textify</code><a href="__main__.py#L35" class="source_link" style="float:right">[source]</a></h4>

> <code>ObsVocab.textify</code>(**`indxs`**)

Textify observation codes (returns codes and descriptions)

**`numericalize()` Explanation**
- split the incoming concatenated `code||value||units||type` string
- for `numeric` codes, filter the vocab down to a result df based on everything except the value
- then do an `argsort()` on the absolute difference between the value column and the incoming value to determine the closest value
 - based on the example given in [pandas docs - cookbook](https://pandas.pydata.org/docs/user_guide/cookbook.html#building-criteria)
   - **the cookbook example that uses `loc` doesn't work; `iloc` [works](https://stackoverflow.com/questions/30112202/how-do-i-find-the-closest-values-in-a-pandas-series-to-an-input-number/53553226) instead**
 - `argsort()` - [Returns the indices that would sort this array](https://docs.scipy.org/doc/numpy/reference/generated/numpy.argsort.html#numpy.argsort)
 - `[:1]` on that returns the one row with the closest match; the index of that row is what we want

> Note about logging numericalize errors

In [None]:
obs_codes.head()

Unnamed: 0_level_0,orig_code,desc,value,units,type
indx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,8302-2,Body Height,48.8,cm,numeric
1,72514-3,Pain severity - 0-10 verbal numeric rating [Sc...,1.3,{score},numeric
2,29463-7,Body Weight,3.4,kg,numeric
3,6690-2,Leukocytes [#/volume] in Blood by Automated count,5.2,10*3/uL,numeric
4,789-8,Erythrocytes [#/volume] in Blood by Automated ...,5.2,10*6/uL,numeric


In [None]:
obs_vocab_obj = ObsVocab.create(obs_codes)

In [None]:
obs_vocab_obj.numericalize(['8302-2||200.3||cm||numeric', \
                            '72514-3||4||{score}||numeric', '33756-8||21.7||mm||numeric','29463-7||181.8||kg||numeric'])

[6, 9, 176, 16]

In [None]:
#Testing unknown code
obs_vocab_obj.numericalize(['blah-2||200.3||cm||numeric', \
                            '72514-3||4||{score}||numeric', '33756-8||21.7||mm||numeric','29463-7||181.8||kg||numeric'])

[1, 9, 176, 16]

In [None]:
obs_vocab_obj.textify([5, 8, 200, 15])

[('8302-2||158.45000000000002||cm||numeric', 'Body Height'),
 ('72514-3||2.475||{score}||numeric',
  'Pain severity - 0-10 verbal numeric rating [Score] - Reported'),
 ('10834-0||3.35||g/dL||numeric', 'Globulin'),
 ('29463-7||122.44999999999999||kg||numeric', 'Body Weight')]

In [None]:
obs_vocab_obj.numericalize(['32465-7||Normal size prostate||{nominal}||text',"80271-0||Positive Murphy's Sign||xxxnan||text",\
                          'xxnone'])

[545, 549, 0]

In [None]:
obs_vocab_obj.textify([545, 549, 0])

[('32465-7||Normal size prostate||{nominal}||text',
  'Physical findings of Prostate'),
 ("80271-0||Positive Murphy's Sign||xxxnan||text",
  'Physical findings of Abdomen by Palpation'),
 ('xxnone', 'Nothing recorded')]

In [None]:
obs_vocab_obj.numericalize(['xxnone','xxunk','72166-2||Never smoker||xxxnan||text'])

[0, 1, 497]

In [None]:
obs_vocab_obj.textify([0, 1, 2, 3, 497])

[('xxnone', 'Nothing recorded'),
 ('xxunk||xxunk||xxunk||xxunk', 'Unknown'),
 ('8302-2||44.6||cm||numeric', 'Body Height'),
 ('8302-2||82.55000000000001||cm||numeric', 'Body Height'),
 ('72166-2||Never smoker||xxxnan||text', 'Tobacco smoking status NHIS')]

## VocabList

In [None]:
#export
class EhrVocabList:
    '''Class to create and hold all vocab objects for an entire dataset

    - `demographics_vocabs`: vocabs for birth day, month, year and the patient demographic columns
    - `records_vocabs`: the observations vocab followed by one vocab per remaining record type
    - `age_mean`, `age_std`: mean and std of the patients' `age_now_days`
    - `path`: dataset path, vocabs are saved to / loaded from `{path}/processed`
    '''
    # patient columns that each get their own vocab (after day, month, year - in this order)
    _DEMOGRAPHICS_COLS = ('marital', 'race', 'ethnicity', 'gender', 'birthplace', 'city', 'state', 'zip')

    def __init__(self, demographics_vocabs, records_vocabs, age_mean, age_std, path):
        self.demographics_vocabs, self.records_vocabs, self.path = demographics_vocabs, records_vocabs, path
        self.age_mean, self.age_std = age_mean, age_std

    @staticmethod
    def _get_demographics_codes(pt_codes):
        '''Build one codes df per demographic vocab, plus mean and std of `age_now_days`'''
        this_year = pd.Timestamp.today().year
        code_dfs = [
            pd.DataFrame(range(1, 32), columns=['code']),               #31 days
            pd.DataFrame(range(1, 13), columns=['code']),               #12 months
            pd.DataFrame(range(1900, this_year + 1), columns=['code']), #years 1900 to now
        ]
        for col in EhrVocabList._DEMOGRAPHICS_COLS:
            code_dfs.append(pd.DataFrame(pt_codes[col].dropna().unique(), columns=['code']))
        return code_dfs, pt_codes.age_now_days.mean(), pt_codes.age_now_days.std()

    @classmethod
    def create(cls, path, num_buckets=5):
        '''Read all code dfs from the dataset path and create all vocab objects

        `num_buckets` is passed on to `ObsVocab.create` for the observations vocab.
        '''
        # first df holds patient codes, second observation codes, the rest the other record types
        pt_codes, obs_codes, *other_codes = load_ehr_vocabcodes(path)

        demographics_codes, age_mean, age_std = cls._get_demographics_codes(pt_codes)
        demographics_vocabs = [EhrVocab.create(codes_df) for codes_df in demographics_codes]
        records_vocabs = [ObsVocab.create(obs_codes, num_buckets)]
        records_vocabs.extend(EhrVocab.create(codes_df) for codes_df in other_codes)
        return cls(demographics_vocabs, records_vocabs, age_mean, age_std, path)

    def save(self):
        '''Save vocablist (containing all vocab objects for the dataset)

        Pickles this object to `{path}/processed/vocabs.vocablist`, creating the directory if needed.
        '''
        pckl_dir = Path(f'{self.path}/processed')
        pckl_dir.mkdir(parents=True, exist_ok=True)
        # `with` closes the file even if pickling fails
        with open(f'{pckl_dir}/vocabs.vocablist', 'wb') as pckl_f:
            pickle.dump(self, pckl_f)
        print(f'Saved vocab lists to {pckl_dir}')

    @classmethod
    def load(cls, path):
        '''Load previously created vocablist object (containing all vocab objects for the dataset)

        Reads `{path}/processed/vocabs.vocablist` as written by `save`.
        '''
        # NOTE: unpickling can execute arbitrary code - only load vocablists from trusted sources
        with open(f'{path}/processed/vocabs.vocablist','rb') as infile:
            return pickle.load(infile)

In [None]:
show_doc(EhrVocabList.create)

<h4 id="EhrVocabList.create" class="doc_header"><code>EhrVocabList.create</code><a href="__main__.py#L8" class="source_link" style="float:right">[source]</a></h4>

> <code>EhrVocabList.create</code>(**`path`**, **`num_buckets`**=*`5`*)

Read all code dfs from the dataset path and create all vocab objects

In [None]:
show_doc(EhrVocabList.save)

<h4 id="EhrVocabList.save" class="doc_header"><code>EhrVocabList.save</code><a href="__main__.py#L36" class="source_link" style="float:right">[source]</a></h4>

> <code>EhrVocabList.save</code>()

Save vocablist (containing all vocab objects for the dataset)

In [None]:
show_doc(EhrVocabList.load)

<h4 id="EhrVocabList.load" class="doc_header"><code>EhrVocabList.load</code><a href="__main__.py#L45" class="source_link" style="float:right">[source]</a></h4>

> <code>EhrVocabList.load</code>(**`path`**)

Load previously created vocablist object (containing all vocab objects for the dataset)

In [None]:
vocab_list_1K = EhrVocabList.create(PATH_1K)

In [None]:
vocab_list_1K.save()

Saved vocab lists to datasets/synthea/1K/processed


#### Tests

In [None]:
vl_1K = EhrVocabList.load(PATH_1K)
obs_vocab, alg_vocab, crpl_vocab, med_vocab, img_vocab, proc_vocab, cnd_vocab, imm_vocab = vl_1K.records_vocabs
bday, bmonth, byear, marital, race, ethnicity, gender, birthplace, city, state, zipcode  = vl_1K.demographics_vocabs

##### `records_vocabs`

In [None]:
obs_vocab.vocab_size

550

In [None]:
proc_vocab.numericalize(['xxnone','65200003','428191000124101'])

[0, 36, 2]

In [None]:
img_vocab.numericalize(['xxnone',344001])

[0, 7]

In [None]:
proc_vocab.numericalize(['65200003']), proc_vocab.numericalize([65200003])

([36], [36])

In [None]:
img_vocab.textify([0,1,2,3,4,5])

[('xxnone', 'Nothing recorded'),
 ('xxunk', 'Unknown'),
 ('51185008', {'Chest', 'Thoracic structure (body structure)'}),
 ('12921003', {'Pelvis'}),
 ('40983000', {'Arm'}),
 ('8205005', {'Wrist'})]

In [None]:
img_vocab.numericalize(['xxnone','xxunk', 51299004,51185008,12921003]) #0,1,6,2,3

[0, 1, 6, 2, 3]

In [None]:
obs_vocab.textify([0,1,2,3,4,5])

[('xxnone', 'Nothing recorded'),
 ('xxunk||xxunk||xxunk||xxunk', 'Unknown'),
 ('8302-2||44.6||cm||numeric', 'Body Height'),
 ('8302-2||82.55000000000001||cm||numeric', 'Body Height'),
 ('8302-2||120.5||cm||numeric', 'Body Height'),
 ('8302-2||158.45000000000002||cm||numeric', 'Body Height')]

In [None]:
obs_vocab.textify([200])

[('10834-0||3.35||g/dL||numeric', 'Globulin')]

In [None]:
#expected 6, 9, 201, 16
obs_vocab.numericalize(['8302-2||200.3||cm||numeric', \
                            '72514-3||4||{score}||numeric', '10834-0||3.7||g/dL||numeric','29463-7||181.8||kg||numeric'])

[6, 9, 201, 16]

In [None]:
obs_vocab.textify([50,150,250,350,450,548])

[('786-4||35.25||g/dL||numeric', 'MCHC [Mass/volume] by Automated count'),
 ('2093-3||266.275||mg/dL||numeric', 'Total Cholesterol'),
 ('6206-7||71.1||kU/L||numeric', 'Peanut IgE Ab in Serum'),
 ('20505-4||1.175||mg/dL||numeric',
  'Bilirubin.total [Mass/volume] in Urine by Test strip'),
 ('2075-0||107.6||mmol/L||numeric', 'Chloride'),
 ('46288-7||Surgical biopsy result abnormal||{nominal}||text',
  'US Guidance for biopsy of Prostate')]

In [None]:
med_vocab.textify([0,1,2,3,4])

[('xxnone', 'Nothing recorded'),
 ('xxunk', 'Unknown'),
 ('834061||START', {'Penicillin V Potassium 250 MG Oral Tablet'}),
 ('282464||START', {'Acetaminophen 160 MG Oral Tablet'}),
 ('313782||START', {'Acetaminophen 325 MG Oral Tablet'})]

In [None]:
med_vocab.itoc[:5]

['xxnone', 'xxunk', '834061||START', '282464||START', '313782||START']

In [None]:
med_vocab.numericalize(['xxnone', 'xxunk', '834061||START','282464||START', '313782||START', '749882||START']) #0,1,2,3,4,5

[0, 1, 2, 3, 4, 5]

In [None]:
med_vocab.numericalize(['834061||START'])

[2]

##### `demographics_vocabs`

In [None]:
for vocab in vl_1K.demographics_vocabs:
    print(vocab.get_emb_dims())

(33, 8)
(14, 7)
(124, 11)
(5, 5)
(7, 6)
(25, 8)
(4, 5)
(205, 13)
(211, 13)
(3, 5)
(200, 13)


In [None]:
bday.numericalize(['xxnone','xxunk', 1,10,31])

[0, 1, 2, 11, 32]

In [None]:
bday.textify([0, 1, 2, 11, 32])

['xxnone', 'xxunk', '1', '10', '31']

In [None]:
bmonth.textify([13])

['12']

In [None]:
byear.numericalize(['1942',1947,])

[44, 49]

In [None]:
byear.numericalize([1948])

[50]

In [None]:
marital.textify([0,1,2,3])

['xxnone', 'xxunk', 'M', 'S']

In [None]:
race.textify([0,1,2,3,4])

['xxnone', 'xxunk', 'white', 'asian', 'black']

In [None]:
vl_1K.age_mean, vl_1K.age_std

(15885.602409638554, 9388.271666254166)

## Get All Embedding Dimensions

In [None]:
#export
def get_all_emb_dims(EhrVocabList, αd=0.5736):
    '''Get embedding dimensions for all vocab objects of the dataset

    `EhrVocabList` is a vocab list *object* (it needs `demographics_vocabs` and `records_vocabs`).
    Returns the `(vocab_size, width)` tuples of the demographics vocabs, the same for the
    records vocabs, and the summed widths of each of the two groups.
    '''
    def _dims(vocabs):
        return [vocab.get_emb_dims(αd) for vocab in vocabs]

    def _total_width(dims):
        return sum(width for _, width in dims)

    demographics_dims = _dims(EhrVocabList.demographics_vocabs)
    recs_dims = _dims(EhrVocabList.records_vocabs)
    return demographics_dims, recs_dims, _total_width(demographics_dims), _total_width(recs_dims)

In [None]:
demographics_dims, recs_dims, demographics_dims_width, recs_dims_width = get_all_emb_dims(EhrVocabList.load(PATH_1K))

In [None]:
demographics_dims

[(33, 8),
 (14, 7),
 (124, 11),
 (5, 5),
 (7, 6),
 (25, 8),
 (4, 5),
 (205, 13),
 (211, 13),
 (3, 5),
 (200, 13)]

In [None]:
recs_dims

[(550, 17),
 (27, 8),
 (54, 9),
 (224, 13),
 (11, 6),
 (128, 12),
 (201, 13),
 (20, 7)]

In [None]:
demographics_dims_width, recs_dims_width

(94, 85)

In [None]:
demographics_dims, recs_dims, demographics_dims_width, recs_dims_width = get_all_emb_dims(EhrVocabList.load(PATH_1K), αd=10)

In [None]:
demographics_dims

[(33, 144),
 (14, 116),
 (124, 200),
 (5, 90),
 (7, 98),
 (25, 134),
 (4, 85),
 (205, 227),
 (211, 229),
 (3, 79),
 (200, 226)]

In [None]:
recs_dims

[(550, 291),
 (27, 137),
 (54, 163),
 (224, 232),
 (11, 109),
 (128, 202),
 (201, 226),
 (20, 127)]

In [None]:
demographics_dims_width, recs_dims_width

(1628, 1487)

## Export -

In [None]:
#hide
from nbdev.export import *
notebook2script()

Converted 01_preprocessing_clean.ipynb.
Converted 02_preprocessing_vocab.ipynb.
Converted index.ipynb.
