In [None]:
# default_exp preprocessing.vocab

# preprocessing.vocab

In [None]:
#hide
# `display` and `HTML` belong to the public `IPython.display` API; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Widen the notebook container to 85% of the browser window.
display(HTML("<style>.container { width:85% !important; }</style>"))

In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
#export
from lemonade.preprocessing import *

In [None]:
# Load the vocab code tables for the dataset located at PATH_1K.
# NOTE(review): `load_ehr_vocabcodes` and `PATH_1K` are presumably provided by the
# star import from `lemonade.preprocessing` -- TODO confirm.
code_dfs = load_ehr_vocabcodes(PATH_1K)

In [None]:
# Give each of the nine loaded code tables its own name (order matters).
(
    pt_codes,
    obs_codes,
    alg_codes,
    crpl_codes,
    med_codes,
    img_codes,
    proc_codes,
    cnd_codes,
    immn_codes,
) = code_dfs

## Vocabs

In [None]:
#export
class EhrVocab():
    """Vocabulary that maps codes to integer indices and back.

    Attributes:
        itoc: list of codes; a code's position in the list is its index.
        ctoi: dict mapping code -> index.
        ctod: dict mapping code -> description(s). Only set when descriptions
            were supplied, which is why `textify` checks for it with `hasattr`.
        vocab_size: number of entries in `itoc`.
    """
    def __init__(self, itoc, ctoi, ctod=None):
        self.itoc = itoc
        self.ctoi = ctoi
        # Deliberately left unset (rather than None) when no descriptions exist.
        if ctod is not None: self.ctod = ctod
        self.vocab_size = len(self.itoc)

    @classmethod
    def create(cls, codes_df):
        """Build a vocab from a dataframe with a `code` column and an optional `desc` column.

        Codes are converted to `str`. Index 0 is reserved for `'xxnone'` and index 1
        for `'xxunk'`; the remaining codes follow in order of first appearance.
        When `desc` exists, every real code maps to the *set* of descriptions
        recorded for it.
        """
        desc_exists = 'desc' in codes_df.columns
        codes_df = codes_df.astype({'code':'str'})
        itoc = ['xxnone', 'xxunk'] + list(codes_df.code.unique())
        ctoi = {code: i for i, code in enumerate(itoc)}

        if not desc_exists:
            return cls(itoc, ctoi)

        # Collect descriptions row by row. The previous `set(df.loc[code].desc)`
        # returned a scalar string for codes occurring exactly once, which turned
        # the description into a set of single characters.
        descs_by_code = {}
        for code, desc in zip(codes_df['code'], codes_df['desc']):
            descs_by_code.setdefault(code, set()).add(desc)

        ctod = {itoc[0]: "Nothing recorded", itoc[1]: "Unknown"}
        ctod.update(descs_by_code)
        return cls(itoc, ctoi, ctod)

    def get_emb_dims(self, αd=0.5736):
        """Return `(vocab_size, embedding_width)` with width = round(6 * αd * vocab_size**0.25)."""
        return self.vocab_size, round(6* αd * (self.vocab_size**0.25))

    def numericalize(self, codes, verbose=True):
        """Convert `codes` to a list of indices; unknown codes map to the `'xxunk'` index.

        Each code is looked up by its `str()` form. When `verbose` is true, unknown
        codes are appended to `./log/<YYYY-MM-DD>_numericalize_exceptions.log`.
        """
        res, unknown = [], []
        # Single pass, so one-shot iterables are handled correctly as well.
        for code in codes:
            idx = self.ctoi.get(str(code))
            if idx is None:
                idx = self.ctoi['xxunk']
                unknown.append(code)
            res.append(idx)

        if verbose and unknown:
            import os
            today = date.today().strftime("%Y-%m-%d")
            logfile = f'./log/{today}_numericalize_exceptions.log'
            # The log directory may not exist yet; without this the open() below fails.
            os.makedirs(os.path.dirname(logfile), exist_ok=True)
            with open(logfile, 'a') as log:
                log.writelines(f'\ncode: {code}' for code in unknown)

        return res

    def textify(self, indxs):
        """Convert indices back to codes, or to `(code, description)` tuples when descriptions exist."""
        if hasattr(self, 'ctod'):
            return [(self.itoc[i], self.ctod[self.itoc[i]]) for i in indxs]
        return [self.itoc[i] for i in indxs]

### Class `ObsVocab`

In [None]:
#export
class ObsVocab (EhrVocab):
    """Vocab for observation codes, backed by a dataframe instead of lookup dicts.

    `vocab_df` has the columns `code, desc, value, units, type` (see `create`);
    a row's index in that dataframe is the numericalized index. Codes are passed
    around as the concatenated string `code||value||units||type`.
    """
    def __init__(self, vocab_df):
        self.vocab_df = vocab_df
        self.vocab_size = len(vocab_df)

    def numericalize(self, codes, verbose=True):
        """Convert concatenated observation codes to vocab indices.

        `'xxnone'` / `'xxunk'` are matched on the `code` column alone. Numeric
        observations are matched on code, units and type and resolve to the row
        whose value is closest to the given value; all other observations need an
        exact match on code, value, units and type. Anything without a match maps
        to the `'xxunk'` row and, when `verbose` is true, is appended to
        `./log/<YYYY-MM-DD>_numericalize_exceptions.log`.
        """
        indxs, unknown = [], []
        for code in codes:
            if code in ['xxnone','xxunk']:
                indxs.extend(self.vocab_df[(self.vocab_df['code'] == code)].index.tolist())
                continue

            c,v,u,t = code.split('||')
            if t == 'numeric':
                filt_df = self.vocab_df[(self.vocab_df['code'] == c) & (self.vocab_df['units'] == u) & (self.vocab_df['type'] == t)]
                # argsort of the absolute differences; the first position is the closest value
                res = filt_df.iloc[(filt_df.value - float(v)).abs().argsort()[:1]].index.tolist()
            else:
                res = self.vocab_df[(self.vocab_df['code'] == c) & (self.vocab_df['value'] == v) & \
                                           (self.vocab_df['units'] == u) & (self.vocab_df['type'] == t)].index.tolist()

            if len(res) == 0:
                indxs.extend(self.vocab_df[(self.vocab_df['code'] == 'xxunk')].index.tolist())
                unknown.append(code)
            else:
                indxs.extend(res)

        if verbose and unknown:
            import os
            today = date.today().strftime("%Y-%m-%d")
            logfile = f'./log/{today}_numericalize_exceptions.log'
            # The log directory may not exist yet; without this the open() below fails.
            os.makedirs(os.path.dirname(logfile), exist_ok=True)
            with open(logfile, 'a') as log:
                log.writelines(f'\ncode in ObsVocab: {code}' for code in unknown)

        assert len(codes) == len(indxs), "Possible bug, not all codes being numericalized"
        return indxs

    def textify(self, indxs):
        """Convert indices to `(code_string, description)` tuples.

        Index 0 yields the bare code; every other index yields the concatenated
        `code||value||units||type` string.
        """
        txts = []
        for i in indxs:
            c,d,v,u,t = self.vocab_df.iloc[i]
            if i == 0: txts.append((c, d))
            else:      txts.append((f'{c}||{v}||{u}||{t}', d))
        assert len(indxs) == len(txts), "Possible bug, not all indxs being textified"
        return txts

    @classmethod
    def create(cls, obs_codes, num_buckets=5):
        """Build the observation vocab from `obs_codes`.

        Numeric observations get `num_buckets` evenly spaced values between the
        minimum and maximum seen for each (orig_code, units) pair; text
        observations get one row per distinct value of each (orig_code, units)
        pair. Rows 0 and 1 are the reserved `'xxnone'` and `'xxunk'` entries.
        """
        numerics = pd.DataFrame(obs_codes.loc[obs_codes['type'] == 'numeric',:])
        texts = pd.DataFrame(obs_codes.loc[obs_codes['type'] == 'text',:])
        # `copy=False` dropped: the keyword is deprecated in recent pandas and the result is the same.
        numerics = numerics.astype({'value':'float'})
        vocab_rows = []

        # NOTE(review): a missing (NaN) unit never compares equal to itself, so such
        # rows would produce an empty `this_unit` below -- confirm units are always populated.
        for code in numerics.orig_code.unique():
            this_code = numerics.loc[numerics['orig_code'] == code]
            for unit in this_code.units.unique():
                this_unit = this_code.loc[this_code['units'] == unit]
                for val in np.linspace(this_unit.value.min(), this_unit.value.max(), num=num_buckets):
                    vocab_rows.append([code,this_unit.desc.iloc[0],val,unit,'numeric'])

        for code in texts.orig_code.unique():
            this_code = texts.loc[texts['orig_code'] == code]
            for unit in this_code.units.unique():
                this_unit = this_code.loc[this_code['units'] == unit]
                for val in this_unit.value.unique():
                    vocab_rows.append([code,this_unit.desc.iloc[0],val,unit,'text'])

        vocab_rows.insert(0, ['xxnone','Nothing recorded','xxnone','xxnone','xxnone'])
        vocab_rows.insert(1, ['xxunk','Unknown','xxunk','xxunk','xxunk'])
        obs_vocab = pd.DataFrame(data=vocab_rows, columns=['code','desc','value','units','type'])
        assert obs_codes.orig_code.nunique() == obs_vocab.code.nunique()-2, "Possible bug, obs_code nuniques don't match"
        return cls(obs_vocab)

**`numericalize()` Explanation**
- split the incoming concatenated `code||value||units||type` string
- get a result_df based on everything except value
- then do an `argsort()` on the value column to determine closest value
 - based on example given in [pandas docs - cookbook](https://pandas.pydata.org/docs/user_guide/cookbook.html#building-criteria)
   - **the cookbook example that uses `loc` doesn't work; using `iloc` instead [works](https://stackoverflow.com/questions/30112202/how-do-i-find-the-closest-values-in-a-pandas-series-to-an-input-number/53553226)**
 - `argsort()` - [Returns the indices that would sort this array](https://docs.scipy.org/doc/numpy/reference/generated/numpy.argsort.html#numpy.argsort)
 - `[:1]` on that returns the one row with the closest match, index of that is what we want

## VocabList

In [None]:
#export
class EhrVocabList:
    """Container for all vocabs of a dataset plus the age normalisation statistics.

    Attributes:
        demographics_vocabs: list of vocabs for the demographic fields.
        records_vocabs: list of vocabs for the record types.
        age_mean, age_std: mean and standard deviation of `age_now_days`.
        path: dataset directory; vocabs are saved under `<path>/processed`.
    """
    def __init__(self, demographics_vocabs, records_vocabs, age_mean, age_std, path):
        self.demographics_vocabs, self.records_vocabs, self.path = demographics_vocabs, records_vocabs, path
        self.age_mean, self.age_std = age_mean, age_std

    @classmethod
    def create(cls, path, num_buckets=5):
        """Load the code tables at `path` and build every vocab from them.

        The first table feeds the demographics vocabs, the second is turned into
        an `ObsVocab` with `num_buckets` buckets, and each remaining table becomes
        an `EhrVocab`.
        """
        code_dfs = load_ehr_vocabcodes(path)

        def _get_demographics_codes(pt_codes):
            """Return the per-field code dataframes plus mean/std of `age_now_days`."""
            demo_dfs = [
                pd.DataFrame(range(1, 32), columns=['code']),                                 #31 days
                pd.DataFrame(range(1, 13), columns=['code']),                                 #12 months
                pd.DataFrame(range(1900, pd.Timestamp.today().year + 1), columns=['code']),   #years 1900 to now
            ]
            for col in ('marital', 'race', 'ethnicity', 'gender', 'birthplace', 'city', 'state', 'zip'):
                demo_dfs.append(pd.DataFrame(pt_codes[col].dropna().unique(), columns=['code']))
            return demo_dfs, pt_codes.age_now_days.mean(), pt_codes.age_now_days.std()

        demographics_codes, age_mean, age_std = _get_demographics_codes(code_dfs[0])
        demographics_vocabs = [EhrVocab.create(codes_df) for codes_df in demographics_codes]
        records_vocabs = [ObsVocab.create(code_dfs[1], num_buckets)]
        records_vocabs.extend(EhrVocab.create(codes_df) for codes_df in code_dfs[2:])
        return cls(demographics_vocabs, records_vocabs, age_mean, age_std, path)

    def save(self):
        """Pickle this object to `<path>/processed/vocabs.vocablist`, creating the directory if needed."""
        pckl_dir = Path(f'{self.path}/processed')
        pckl_dir.mkdir(parents=True, exist_ok=True)
        # `with` guarantees the handle is closed even if pickling raises.
        with open(f'{pckl_dir}/vocabs.vocablist', 'wb') as pckl_f:
            pickle.dump(self, pckl_f)
        print(f'Saved vocab lists to {pckl_dir}')

    @classmethod
    def load(cls, path):
        """Load the object previously written by `save` from `<path>/processed/vocabs.vocablist`.

        SECURITY: unpickling can execute arbitrary code -- only load files you trust.
        """
        with open(f'{path}/processed/vocabs.vocablist','rb') as infile:
            return pickle.load(infile)

## Getting Embedding Dimensions

In [None]:
#export
def get_all_emb_dims(EhrVocabList, αd=0.5736):
    """Collect embedding dimensions for every vocab in a vocab list.

    Returns a 4-tuple: the `get_emb_dims(αd)` results for the demographics vocabs,
    the same for the records vocabs, and the summed embedding widths (second
    element of each result) of the two groups.
    """
    demographics_dims = [v.get_emb_dims(αd) for v in EhrVocabList.demographics_vocabs]
    recs_dims = [v.get_emb_dims(αd) for v in EhrVocabList.records_vocabs]

    demographics_dims_width = sum(dims[1] for dims in demographics_dims)
    recs_dims_width = sum(dims[1] for dims in recs_dims)

    return demographics_dims, recs_dims, demographics_dims_width, recs_dims_width

#hide
## Export

In [None]:
#hide
# Run nbdev's notebook2script() to export the notebooks; the cell output lists
# each notebook that was converted.
from nbdev.export import *
notebook2script()

Converted 01_preprocessing_clean.ipynb.
Converted 02_preprocessing_vocabs.ipynb.
Converted index.ipynb.
