In [5]:
# Built-in libraries
import os
import re
import sys
import pickle
import warnings
from typing import List
warnings.filterwarnings("ignore")

# Installed libraries
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', None)
from torchtext.vocab import build_vocab_from_iterator
from gensim.models import Word2Vec
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Warnings are switched off (see `warnings.filterwarnings("ignore")` above) since they are not important for preprocessing


# Resolve the project root as the parent of the directory containing this file.
# Inside a notebook kernel `__file__` is undefined and raises NameError; in that
# case fall back to the parent of the current working directory.
try:
    root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
except NameError:
    root = os.path.dirname(os.getcwd())
# Make the project root importable; the membership check keeps re-running this
# cell from appending the same entry to sys.path again.
if root not in sys.path:
    sys.path.append(root)
print('Project root: {}'.format(root))

Project root: /Users/cmetzner/Desktop/Study/PhD/research/ORNL/Biostatistics and Multiscale System Modeling/attention_mechanisms


[nltk_data] Downloading package punkt to /Users/cmetzner/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cmetzner/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Custom libraries
def rel2abs(x: str, flag_proc: bool = True) -> str:
    """ Insert the decimal point into a relative (period-free) ICD-9 code

    Parameters
    ----------
    x : str
        relative ICD-9 code, i.e., the code written without a period
    flag_proc : bool; default=True
        True if `x` is a procedure code, False if it is a diagnosis code

    Returns
    -------
    str
        absolute ICD-9 code, e.g., procedure: XX.XX / diagnosis: XXX.XX / E-code: EXXX.X

    """
    # Number of leading characters that form the category part of the code:
    # 2 for procedures, 4 for 'E' diagnosis codes, 3 for all other diagnosis codes.
    if flag_proc:
        split_at = 2
    elif len(x) == 3:
        # Three-character diagnosis codes carry no sub-classification
        return x
    else:
        split_at = 4 if x[0] == 'E' else 3

    # Codes consisting of the category part only are returned without a period
    if len(x) == split_at:
        return x
    return f'{x[:split_at]}.{x[split_at:]}'
            
            
def clean_desc(text: str) -> List[str]:
    """
    Function that preprocesses the text code/category descriptions by cleaning, tokenizing, and removing
    english stopwords.
    This function requires to have the following "nltk" packages installed:
        - nltk.download('punkt')
        - nltk.download('stopwords')

    Parameter
    ----------
    text : str
        Input code descriptions

    Return
    ------
    List[str]
        Preprocessed tokens for respective code description.

    """

    # 1. Transform text to be lowercase
    text = text.lower()
    # 2. Replace all newline characters to be spaces
    text = text.replace('\n', ' ')
    # 3. Remove excessive whitespace
    text = re.sub(' +', ' ', text)
    # 4. Remove punctuations from string in a single pass (no regex escaping needed)
    text = text.translate(str.maketrans('', '', '.?!,#:;()%/-+=&_'))
    # 5. Tokenize string
    tokens = word_tokenize(text)
    # 6. Remove stopwords. The language is given explicitly: without an argument
    #    `stopwords.words()` returns the stopwords of every available language.
    #    The list is loaded once per call and held in a set for O(1) lookups.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in tokens if word not in english_stopwords]

# 1. Load Label Descriptions
- 1.1 ICD-9 procedure and diagnosis code descriptions (https://physionet.org/content/mimiciii/1.4/)
- SEER Cancer Pathology Report Classification tasks
    - Site
    - Subsite
    - Laterality
    - Grade
    - Histology
    - Behavior

## 1.1 MIMIC-III - ICD-9 Procedure and Diagnosis Code Descriptions

In [3]:
# Diagnosis code descriptions: keep only the code and its long title
diag_desc = (
    pd.read_csv(os.path.join(root, 'data', 'external', 'D_ICD_DIAGNOSES.csv'), dtype={'ICD9_CODE': str})
    .drop(columns=['ROW_ID', 'SHORT_TITLE'])
)

# Procedure code descriptions: keep only the code and its long title
proc_desc = (
    pd.read_csv(os.path.join(root, 'data', 'external', 'D_ICD_PROCEDURES.csv'), dtype={'ICD9_CODE': str})
    .drop(columns=['ROW_ID', 'SHORT_TITLE'])
)

# Insert the period into the relative codes so they become absolute codes
diag_desc['ICD9_CODE'] = diag_desc['ICD9_CODE'].apply(lambda code: rel2abs(code, flag_proc=False))
proc_desc['ICD9_CODE'] = proc_desc['ICD9_CODE'].apply(lambda code: rel2abs(code, flag_proc=True))

display(diag_desc.head(), proc_desc.head())

Unnamed: 0,ICD9_CODE,LONG_TITLE
0,11.66,"Tuberculous pneumonia [any form], tubercle bac..."
1,11.7,"Tuberculous pneumothorax, unspecified"
2,11.71,"Tuberculous pneumothorax, bacteriological or h..."
3,11.72,"Tuberculous pneumothorax, bacteriological or h..."
4,11.73,"Tuberculous pneumothorax, tubercle bacilli fou..."


Unnamed: 0,ICD9_CODE,LONG_TITLE
0,8.51,Canthotomy
1,8.52,Blepharorrhaphy
2,8.59,Other adjustment of lid position
3,8.61,Reconstruction of eyelid with skin flap or graft
4,8.62,Reconstruction of eyelid with mucous membrane ...


# 2. Load MIMIC-III Data

In [4]:
# Processed ICD-9 code assignments of the complete MIMIC-III dataset
codes = pd.read_csv(
    os.path.join(root, 'data', 'processed', 'ALL_CODES.csv'),
    dtype={'ICD9_CODE': str},
)
display(codes.head())

# Processed MimicFull dataset
MimicFull = pd.read_pickle(
    os.path.join(root, 'data', 'processed', 'data_MimicFull', 'DATA_MimicFull.pkl')
)
display(MimicFull.head())

Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_CODE,ICD9_TYPE
0,109,172335,403.01,diagnosis
1,109,172335,486.0,diagnosis
2,109,172335,582.81,diagnosis
3,109,172335,585.5,diagnosis
4,109,172335,425.4,diagnosis


Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_CODE,TEXT
0,3,145834,"[038.9, 263.9, 38.93, 410.71, 425.4, 427.5, 42...","[admission, date, :, deidentified, discharge, ..."
1,4,185777,"[041.11, 042, 136.3, 276.3, 33.23, 38.93, 571....","[admission, date, :, deidentified, discharge, ..."
2,6,107064,"[275.3, 276.6, 276.7, 285.9, 38.06, 39.57, 403...","[admission, date, :, deidentified, discharge, ..."
3,9,150750,"[276.5, 401.9, 428.0, 431, 507.0, 584.9, 96.04...","[admission, date, :, deidentified, discharge, ..."
4,10,184167,"[765.15, 765.25, 774.2, 96.6, 99.15, 99.83, V2...","[admission, date, :, deidentified, discharge, ..."


In [5]:
# Restrict the code table to hospital admissions that are part of MimicFull
MimicFull_hadm = MimicFull['HADM_ID'].unique().tolist()

# One row per distinct (code, type) pair occurring in those admissions
codes_full = (
    codes.loc[codes['HADM_ID'].isin(MimicFull_hadm), ['ICD9_CODE', 'ICD9_TYPE']]
    .drop_duplicates()
)

# Separate diagnosis codes from procedure codes
codes_diag = codes_full.loc[codes_full['ICD9_TYPE'] == 'diagnosis']
codes_proc = codes_full.loc[codes_full['ICD9_TYPE'] == 'procedure']

# 3. Retrieve available code descriptions

In [6]:
# Keep the descriptions whose code actually occurs in the MimicFull dataset
diag_desc_full = diag_desc.loc[diag_desc['ICD9_CODE'].isin(codes_diag['ICD9_CODE'].unique().tolist())]
proc_desc_full = proc_desc.loc[proc_desc['ICD9_CODE'].isin(codes_proc['ICD9_CODE'].unique().tolist())]

# Coverage: retrieved descriptions vs. distinct codes in the dataset
print(f'Number of retrieved diagnosis code descriptions: {diag_desc_full.shape[0]}/{codes_diag.shape[0]}')
print(f'Number of retrieved procedure code descriptions: {proc_desc_full.shape[0]}/{codes_proc.shape[0]}')
print()

print('Retrieved sample diagnosis codes with description:')
display(diag_desc_full.sample(n=10))
print('Retrieved sample procedure codes with description:')
display(proc_desc_full.sample(n=10))

Number of retrieved diagnosis code descriptions: 6776/6918
Number of retrieved procedure code descriptions: 1817/1989

Retrieved sample diagnosis codes with description:


Unnamed: 0,ICD9_CODE,LONG_TITLE
6692,649.43,"Epilepsy complicating pregnancy, childbirth, o..."
692,032.89,Other specified diphtheria
11822,E933.1,Antineoplastic and immunosuppressive drugs cau...
11723,935.2,Foreign body in stomach
12076,E937.9,Unspecified sedatives and hypnotics causing ad...
12714,815.03,Closed fracture of shaft of metacarpal bone(s)
4446,426.4,Right bundle branch block
2697,212.5,Benign neoplasm of mediastinum
2666,209.71,Secondary neuroendocrine tumor of distant lymp...
14260,851.46,Cerebellar or brain stem contusion without men...


Retrieved sample procedure codes with description:


Unnamed: 0,ICD9_CODE,LONG_TITLE
3439,86.89,Other repair and reconstruction of skin and su...
1966,75.52,Repair of current obstetric laceration of corp...
1507,52.83,Heterotransplant of pancreas
1586,54.98,Peritoneal dialysis
475,31.79,Other repair and plastic operations on trachea
2866,88.61,Phlebography of veins of head and neck using c...
472,31.73,Closure of other fistula of trachea
990,17.7,Intravenous infusion of clofarabine
554,35.98,Other operations on septa of heart
3496,87.53,Intraoperative cholangiogram


Let's take a look at the ICD-9 codes in MIMIC-III where we were unable to retrieve a respective code description.

In [7]:
# Diagnosis codes of the dataset for which no description was retrieved
diag_retrieved = codes_diag['ICD9_CODE'].isin(diag_desc_full['ICD9_CODE'].unique().tolist())
diag_desc_full_not = codes_diag.loc[~diag_retrieved]

print('Sample diagnosis codes unable to retrieve appropriate descriptions')
display(diag_desc_full_not.sort_values('ICD9_CODE').sample(n=10))

# Procedure codes of the dataset for which no description was retrieved
proc_retrieved = codes_proc['ICD9_CODE'].isin(proc_desc_full['ICD9_CODE'].unique().tolist())
proc_desc_full_not = codes_proc.loc[~proc_retrieved]

print('Sample procedure codes unable to retrieve appropriate descriptions')
display(proc_desc_full_not.sort_values('ICD9_CODE').sample(n=10))

Sample diagnosis codes unable to retrieve appropriate descriptions


Unnamed: 0,ICD9_CODE,ICD9_TYPE
46713,523.3,diagnosis
3919,729.9,diagnosis
108162,995.2,diagnosis
172949,621.3,diagnosis
633,444.0,diagnosis
832,519.1,diagnosis
71243,V51,diagnosis
11073,618.0,diagnosis
164821,523.4,diagnosis
568432,173.1,diagnosis


Sample procedure codes unable to retrieve appropriate descriptions


Unnamed: 0,ICD9_CODE,ICD9_TYPE
652354,16.0,procedure
768168,29.6,procedure
885201,71.6,procedure
673903,94.2,procedure
846233,83.0,procedure
657890,63.9,procedure
745172,65.0,procedure
715777,40.0,procedure
825625,82.0,procedure
651244,30.9,procedure


### Let's cross-compare with the original list of ICD-9 code descriptions (diagnoses and procedures) retrieved from CMS (https://www.cms.gov/Medicare/Coding/ICD9ProviderDiagnosticCodes/codes - Version 32 Full and Abbreviated Code Titles – Effective October 1, 2014 (ZIP))

Reasons why code descriptions were not retrieved
- Assigned code (in MIMIC-III) was overly generic and not specific enough (Shi et al. 2017)
    - See for diagnosis codes: 771.8 vs 771.81 / 771.82 / 771.83 / 771.89 (https://www.aapc.com/codes/icd9-codes-range/105/)
    - See for procedure codes: 72 vs. 72.0 / 72.1 (https://www.aapc.com/codes/icd9-codes-vol3-range/15/)

In [8]:
# Load additional ICD-9 code information (CMS version 32 code titles).
# Both code columns are read through a `str` converter so that the codes are
# handed to `rel2abs` as strings, consistently for diagnoses and procedures.
diag_desc2 = pd.read_excel(os.path.join(root, 'data', 'external', 'CMS32_DESC_LONG_SHORT_DX.xlsx'),
                           converters={'DIAGNOSIS CODE': str})
proc_desc2 = pd.read_excel(os.path.join(root, 'data', 'external', 'CMS32_DESC_LONG_SHORT_SG.xlsx'),
                           converters={'PROCEDURE CODE': str})

In [9]:
# Convert the relative CMS diagnosis codes to absolute codes and keep only the
# long description next to the code.
diag_desc2['ICD9_CODE'] = diag_desc2['DIAGNOSIS CODE'].apply(lambda code: rel2abs(code, flag_proc=False))
diag_desc2 = diag_desc2.drop(['DIAGNOSIS CODE', 'SHORT DESCRIPTION'], axis=1)
# Show a preview only: `display.max_rows` is set to None at the top of the
# notebook, so displaying the bare frame would render every row.
display(diag_desc2.head(10))

Unnamed: 0,LONG DESCRIPTION,ICD9_CODE
0,Cholera due to vibrio cholerae,001.0
1,Cholera due to vibrio cholerae el tor,001.1
2,"Cholera, unspecified",001.9
3,Typhoid fever,002.0
4,Paratyphoid fever A,002.1
5,Paratyphoid fever B,002.2
6,Paratyphoid fever C,002.3
7,"Paratyphoid fever, unspecified",002.9
8,Salmonella gastroenteritis,003.0
9,Salmonella septicemia,003.1


In [10]:
# Convert the relative CMS procedure codes to absolute codes and keep only the
# long description next to the code.
proc_desc2['ICD9_CODE'] = proc_desc2['PROCEDURE CODE'].apply(lambda code: rel2abs(code, flag_proc=True))
proc_desc2 = proc_desc2.drop(columns=['PROCEDURE CODE', 'SHORT DESCRIPTION'])
display(proc_desc2.head(n=10))

Unnamed: 0,LONG DESCRIPTION,ICD9_CODE
0,Therapeutic ultrasound of vessels of head and ...,0.01
1,Therapeutic ultrasound of heart,0.02
2,Therapeutic ultrasound of peripheral vascular ...,0.03
3,Other therapeutic ultrasound,0.09
4,Implantation of chemotherapeutic agent,0.1
5,Infusion of drotrecogin alfa (activated),0.11
6,Administration of inhaled nitric oxide,0.12
7,Injection or infusion of nesiritide,0.13
8,Injection or infusion of oxazolidinone class o...,0.14
9,High-dose infusion interleukin-2 [IL-2],0.15


In [11]:
# Stack the retrieved diagnosis and procedure descriptions into one table,
# ordered by code
descs = pd.concat([diag_desc_full, proc_desc_full])
descs = descs.sort_values(by='ICD9_CODE')

display(descs.head())
descs.shape

Unnamed: 0,ICD9_CODE,LONG_TITLE
68,3.0,Salmonella gastroenteritis
69,3.1,Salmonella septicemia
76,3.8,Other specified salmonella infections
77,3.9,"Salmonella infection, unspecified"
79,4.1,Shigella flexneri


(8593, 2)

In [12]:
# Tokenize and clean every available code description, then record its length
descs['desc_clean'] = descs['LONG_TITLE'].apply(lambda title: clean_desc(text=title))
descs = descs.drop(columns=['LONG_TITLE'])
descs['desc_len'] = descs['desc_clean'].apply(len)
max_len = descs['desc_len'].max()
print(f'Longest code description contains {max_len} tokens.')

Longest code description contains 32 tokens.


In [13]:
# Corpus for learning code-description embeddings: one document per label
# description, tagged with the ICD-9 code it describes
docs = descs.desc_clean.tolist()
tags = descs.ICD9_CODE.tolist()

# Compute Label Embedding Matrix using Doc2Vec

In [17]:
def _build_label_embedding_matrix(model, labels, dim):
    """Stack one embedding vector per label, in the order of `labels`.

    Parameters
    ----------
    model : Doc2Vec
        Trained model whose `dv` holds one vector per document tag (ICD-9 code).
    labels : sequence of str
        Labels (ICD-9 codes) defining the row order of the returned matrix.
    dim : int
        Embedding dimension; must equal the model's `vector_size`.

    Returns
    -------
    (np.ndarray, int)
        Matrix of shape (len(labels), dim) and the number of labels without a
        trained vector, whose rows were drawn from U(-0.05, 0.05) instead.
    """
    matrix = np.zeros((len(labels), dim))
    n_random = 0
    for row, label in enumerate(labels):
        try:
            vector = model.dv[label]
        except KeyError:
            # No description was part of the training corpus for this label:
            # create randomly initialized code description embedding vector
            n_random += 1
            vector = np.random.uniform(low=-0.05, high=0.05, size=(dim, ))
        matrix[row, :] = vector
    return matrix, n_random


# NOTE: pickle.load can execute arbitrary code - only load files produced by
# this project's own preprocessing.
with open(os.path.join(root, 'data', 'processed', 'data_MimicFull', 'l_codes_MimicFull.pkl'), 'rb') as f:
    labels_full = pickle.load(f)

with open(os.path.join(root, 'data', 'processed', 'data_Mimic50', 'l_codes_Mimic50.pkl'), 'rb') as f:
    labels_50 = pickle.load(f)

# Learn one document embedding per code description using Doc2Vec; every
# description is tagged with its ICD-9 code.
documents = [TaggedDocument(doc, [tag]) for doc, tag in zip(docs, tags)]
embedding_dims = [100, 200, 300]

# Make sure the output directory exists before any matrix is written
embedding_dir = os.path.join(root, 'data', 'processed', 'code_embeddings')
os.makedirs(embedding_dir, exist_ok=True)

for dim in embedding_dims:
    print()
    print(f'Create word embedding matrices with dimenions: {dim}')
    model = Doc2Vec(documents, vector_size=dim, window=2, min_count=1, workers=4)
    model_shape = model.dv.vectors.shape
    # `model.dv` holds document (code description) vectors, not word vectors
    print('Shape of document embedding matrix.')
    print(f'Number of document embeddings: {model_shape[0]}')
    print(f'Document embedding dimension: {model_shape[1]}')

    # Same procedure for the full label set and the 50 most frequent labels
    for dataset, labels in (('MimicFull', labels_full), ('Mimic50', labels_50)):
        print()
        print(f'Create label embedding matrix for {dataset}:')

        label_embedding_matrix, count = _build_label_embedding_matrix(model, labels, dim)

        print(f'Number of randomly initialized label description embedding vectors {count}.')

        with open(os.path.join(embedding_dir, f'code_embedding_matrix_{dataset}_{dim}.pkl'), 'wb') as f:
            pickle.dump(label_embedding_matrix, f)


Create word embedding matrices with dimenions: 100
Shape of document embedding matrix.
Number of word embeddings: 8593
Word embedding dimension: 100

Create label embedding matrix for MimicFull:
Number of randomly initialized label description embedding vectors 314.

Create label embedding matrix for Mimic50:
Number of randomly initialized label description embedding vectors 0.

Create word embedding matrices with dimenions: 200
Shape of document embedding matrix.
Number of word embeddings: 8593
Word embedding dimension: 200

Create label embedding matrix for MimicFull:
Number of randomly initialized label description embedding vectors 314.

Create label embedding matrix for Mimic50:
Number of randomly initialized label description embedding vectors 0.

Create word embedding matrices with dimenions: 300
Shape of document embedding matrix.
Number of word embeddings: 8593
Word embedding dimension: 300

Create label embedding matrix for MimicFull:
Number of randomly initialized label des

# Preprocess High-Level ICD-9 Code Categories

In [24]:
# Dictionary containing ICD-9 diagnosis (3 digits + V/E) / procedure codes (2 digits)
# Diagnosis codes: http://www.icd9data.com/2015/Volume1/default.htm
# Procedure codes: http://www.icd9data.com/2015/Volume3/default.htm
# Keys are inclusive code ranges written as 'lower-upper'; values are the
# high-level category descriptions. Diagnosis categories are listed first,
# followed by the two-digit procedure categories.

d_cat_desc = {
    '001-139': 'Infectious And Parasitic Diseases',
    '140-239': 'Neoplasms',
    '240-279': 'Endocrine, Nutritional And Metabolic Diseases, And Immunity Disorders',
    '280-289': 'Diseases Of The Blood And Blood-Forming Organs',
    '290-319': 'Mental Disorders',
    '320-389': 'Diseases Of The Nervous System And Sense Organs',
    '390-459': 'Diseases Of The Circulatory System',
    '460-519': 'Diseases Of The Respiratory System',
    '520-579': 'Diseases Of The Digestive System',
    '580-629': 'Diseases Of The Genitourinary System',
    '630-679': 'Complications Of Pregnancy, Childbirth, And The Puerperium',
    '680-709': 'Diseases Of The Skin And Subcutaneous Tissue',
    '710-739': 'Diseases Of The Musculoskeletal System And Connective Tissue',
    '740-759': 'Congenital Anomalies',
    '760-779': 'Certain Conditions Originating In The Perinatal Period',
    '780-799': 'Symptoms, Signs, And Ill-Defined Conditions',
    '800-999': 'Injury And Poisoning',
    'V01-V91': 'Supplementary Classification Of Factors Influencing Health Status And Contact With Health Services',
    'E000-E999': 'Supplementary Classification Of External Causes Of Injury And Poisoning',
    '00-00': 'Procedures And Interventions Not Elsewhere Classified',
    '01-05': 'Operations On The Nervous System',
    '06-07': 'Operations On The Endocrine System',
    '08-16': 'Operations On The Eye',
    '17-17': 'Other Miscellaneous Diagnostic And Therapeutic Procedures',
    '18-20': 'Operations On The Ear',
    '21-29': 'Operations On The Nose, Mouth, And Pharynx',
    '30-34': 'Operations On The Respiratory System',
    '35-39': 'Operations On The Cardiovascular System',
    '40-41': 'Operations On The Hemic And Lymphatic System',
    '42-54': 'Operations On The Digestive System',
    '55-59': 'Operations On The Urinary System',
    '60-64': 'Operations On The Male Genital Organs',
    '65-71': 'Operations On The Female Genital Organs',
    '72-75': 'Obstetrical Procedures',
    '76-84': 'Operations On The Musculoskeletal System',
    '85-86': 'Operations On The Integumentary System',
    '87-99': 'Miscellaneous Diagnostic And Therapeutic Procedures'
}

In [26]:
# One row per high-level category: its code range and its description
df_cat_desc = pd.DataFrame(list(d_cat_desc.items()), columns=['categories', 'description'])

# Tokenize and clean the category descriptions
df_cat_desc['desc_clean'] = df_cat_desc['description'].apply(lambda desc: clean_desc(text=desc))
display(df_cat_desc)


# Corpus for learning category embeddings: one document per category
# description, tagged with the category's code range
docs = df_cat_desc.desc_clean.tolist()
tags = df_cat_desc.categories.tolist()

Unnamed: 0,categories,description
0,001-139,Infectious And Parasitic Diseases
1,140-239,Neoplasms
2,240-279,"Endocrine, Nutritional And Metabolic Diseases,..."
3,280-289,Diseases Of The Blood And Blood-Forming Organs
4,290-319,Mental Disorders
5,320-389,Diseases Of The Nervous System And Sense Organs
6,390-459,Diseases Of The Circulatory System
7,460-519,Diseases Of The Respiratory System
8,520-579,Diseases Of The Digestive System
9,580-629,Diseases Of The Genitourinary System


In [31]:
# Learn one document embedding per high-level category description using
# Doc2Vec; every description is tagged with its category code range.
documents = [TaggedDocument(doc, [tag]) for doc, tag in zip(docs, tags)]
embedding_dims = [100, 200, 300]

# Make sure the output directory exists before any matrix is written
embedding_dir = os.path.join(root, 'data', 'processed', 'code_embeddings')
os.makedirs(embedding_dir, exist_ok=True)

for dim in embedding_dims:
    print()
    print(f'Create word embedding matrices with dimenions: {dim}')
    model = Doc2Vec(documents, vector_size=dim, window=2, min_count=1, workers=4)
    model_shape = model.dv.vectors.shape
    print('Shape of document embedding matrix.')
    print(f'Number of document embeddings: {model_shape[0]}')
    print(f'Document embedding dimension: {model_shape[1]}')

    print()
    print('Create label embedding matrix for MimicFull:')

    # Look every vector up by its tag so that row i is guaranteed to belong to
    # tags[i], instead of relying on the model's internal ordering of doctags.
    label_embedding_matrix = np.vstack([model.dv[tag] for tag in tags])
    with open(os.path.join(embedding_dir, f'cat_embedding_matrix_MimicFull_{dim}.pkl'), 'wb') as f:
        pickle.dump(label_embedding_matrix, f)


Create word embedding matrices with dimenions: 100
Shape of document embedding matrix.
Number of document embeddings: 37
Document embedding dimension: 100

Create label embedding matrix for MimicFull:

Create word embedding matrices with dimenions: 200
Shape of document embedding matrix.
Number of document embeddings: 37
Document embedding dimension: 200

Create label embedding matrix for MimicFull:

Create word embedding matrices with dimenions: 300
Shape of document embedding matrix.
Number of document embeddings: 37
Document embedding dimension: 300

Create label embedding matrix for MimicFull:


In [34]:
# Remove all categories that are not in 50 most frequent codes:
# Same 'lower-upper' range keys and descriptions as in `d_cat_desc`, restricted
# to 14 categories (10 diagnosis ranges followed by 4 procedure ranges).
dict_cat_desc_50 = {
    '001-139': 'Infectious And Parasitic Diseases',
    '240-279': 'Endocrine, Nutritional And Metabolic Diseases, And Immunity Disorders',
    '280-289': 'Diseases Of The Blood And Blood-Forming Organs',
    '290-319': 'Mental Disorders',
    '390-459': 'Diseases Of The Circulatory System',
    '460-519': 'Diseases Of The Respiratory System',
    '520-579': 'Diseases Of The Digestive System',
    '580-629': 'Diseases Of The Genitourinary System',
    '800-999': 'Injury And Poisoning',
    'V01-V91': 'Supplementary Classification Of Factors Influencing Health Status And Contact With Health Services',
    '30-34': 'Operations On The Respiratory System',
    '35-39': 'Operations On The Cardiovascular System',
    '42-54': 'Operations On The Digestive System',
    '87-99': 'Miscellaneous Diagnostic And Therapeutic Procedures'
}

In [35]:
# One row per category of the 50 most frequent codes: code range and description
df_cat_desc_50 = pd.DataFrame(list(dict_cat_desc_50.items()), columns=['categories', 'description'])
df_cat_desc_50

Unnamed: 0,categories,description
0,001-139,Infectious And Parasitic Diseases
1,240-279,"Endocrine, Nutritional And Metabolic Diseases,..."
2,280-289,Diseases Of The Blood And Blood-Forming Organs
3,290-319,Mental Disorders
4,390-459,Diseases Of The Circulatory System
5,460-519,Diseases Of The Respiratory System
6,520-579,Diseases Of The Digestive System
7,580-629,Diseases Of The Genitourinary System
8,800-999,Injury And Poisoning
9,V01-V91,Supplementary Classification Of Factors Influe...


In [36]:
# Tokenize and clean the category descriptions
df_cat_desc_50['desc_clean'] = df_cat_desc_50['description'].apply(lambda desc: clean_desc(text=desc))
display(df_cat_desc_50)


# Corpus for learning category embeddings: one document per category
# description, tagged with the category's code range
docs = df_cat_desc_50.desc_clean.tolist()
tags = df_cat_desc_50.categories.tolist()

Unnamed: 0,categories,description,desc_clean
0,001-139,Infectious And Parasitic Diseases,"[infectious, parasitic, diseases]"
1,240-279,"Endocrine, Nutritional And Metabolic Diseases,...","[endocrine, nutritional, metabolic, diseases, ..."
2,280-289,Diseases Of The Blood And Blood-Forming Organs,"[diseases, blood, bloodforming, organs]"
3,290-319,Mental Disorders,"[mental, disorders]"
4,390-459,Diseases Of The Circulatory System,"[diseases, circulatory, system]"
5,460-519,Diseases Of The Respiratory System,"[diseases, respiratory, system]"
6,520-579,Diseases Of The Digestive System,"[diseases, digestive, system]"
7,580-629,Diseases Of The Genitourinary System,"[diseases, genitourinary, system]"
8,800-999,Injury And Poisoning,"[injury, poisoning]"
9,V01-V91,Supplementary Classification Of Factors Influe...,"[supplementary, classification, factors, influ..."


In [37]:
# Learn one document embedding per category description (categories of the 50
# most frequent codes) using Doc2Vec; every description is tagged with its
# category code range.
documents = [TaggedDocument(doc, [tag]) for doc, tag in zip(docs, tags)]
embedding_dims = [100, 200, 300]

# Make sure the output directory exists before any matrix is written
embedding_dir = os.path.join(root, 'data', 'processed', 'code_embeddings')
os.makedirs(embedding_dir, exist_ok=True)

for dim in embedding_dims:
    print()
    print(f'Create word embedding matrices with dimenions: {dim}')
    model = Doc2Vec(documents, vector_size=dim, window=2, min_count=1, workers=4)
    model_shape = model.dv.vectors.shape
    print('Shape of document embedding matrix.')
    print(f'Number of sentence embeddings: {model_shape[0]}')
    print(f'Word embedding dimension: {model_shape[1]}')

    print()
    print('Create label embedding matrix for Mimic50:')
    # Look every vector up by its tag so that row i is guaranteed to belong to
    # tags[i], instead of relying on the model's internal ordering of doctags.
    label_embedding_matrix = np.vstack([model.dv[tag] for tag in tags])
    with open(os.path.join(embedding_dir, f'cat_embedding_matrix_Mimic50_{dim}.pkl'), 'wb') as f:
        pickle.dump(label_embedding_matrix, f)
        


Create word embedding matrices with dimenions: 100
Shape of document embedding matrix.
Number of sentence embeddings: 14
Word embedding dimension: 100

Create label embedding matrix for Mimic50:

Create word embedding matrices with dimenions: 200
Shape of document embedding matrix.
Number of sentence embeddings: 14
Word embedding dimension: 200

Create label embedding matrix for Mimic50:

Create word embedding matrices with dimenions: 300
Shape of document embedding matrix.
Number of sentence embeddings: 14
Word embedding dimension: 300

Create label embedding matrix for Mimic50:


# Retrieve Indices associated with categories

In [38]:
# High-level category descriptions taken from: http://www.icd9data.com/2015/Volume1/default.htm
# NOTE(review): this re-defines `d_cat_desc` with the same entries as the
# definition in the "Preprocess High-Level ICD-9 Code Categories" section;
# keep the two in sync or define the dictionary once.
# Keys are inclusive code ranges written as 'lower-upper'.
d_cat_desc = {
    '001-139': 'Infectious And Parasitic Diseases',
    '140-239': 'Neoplasms',
    '240-279': 'Endocrine, Nutritional And Metabolic Diseases, And Immunity Disorders',
    '280-289': 'Diseases Of The Blood And Blood-Forming Organs',
    '290-319': 'Mental Disorders',
    '320-389': 'Diseases Of The Nervous System And Sense Organs',
    '390-459': 'Diseases Of The Circulatory System',
    '460-519': 'Diseases Of The Respiratory System',
    '520-579': 'Diseases Of The Digestive System',
    '580-629': 'Diseases Of The Genitourinary System',
    '630-679': 'Complications Of Pregnancy, Childbirth, And The Puerperium',
    '680-709': 'Diseases Of The Skin And Subcutaneous Tissue',
    '710-739': 'Diseases Of The Musculoskeletal System And Connective Tissue',
    '740-759': 'Congenital Anomalies',
    '760-779': 'Certain Conditions Originating In The Perinatal Period',
    '780-799': 'Symptoms, Signs, And Ill-Defined Conditions',
    '800-999': 'Injury And Poisoning',
    'V01-V91': 'Supplementary Classification Of Factors Influencing Health Status And Contact With Health Services',
    'E000-E999': 'Supplementary Classification Of External Causes Of Injury And Poisoning',
    '00-00': 'Procedures And Interventions Not Elsewhere Classified',
    '01-05': 'Operations On The Nervous System',
    '06-07': 'Operations On The Endocrine System',
    '08-16': 'Operations On The Eye',
    '17-17': 'Other Miscellaneous Diagnostic And Therapeutic Procedures',
    '18-20': 'Operations On The Ear',
    '21-29': 'Operations On The Nose, Mouth, And Pharynx',
    '30-34': 'Operations On The Respiratory System',
    '35-39': 'Operations On The Cardiovascular System',
    '40-41': 'Operations On The Hemic And Lymphatic System',
    '42-54': 'Operations On The Digestive System',
    '55-59': 'Operations On The Urinary System',
    '60-64': 'Operations On The Male Genital Organs',
    '65-71': 'Operations On The Female Genital Organs',
    '72-75': 'Obstetrical Procedures',
    '76-84': 'Operations On The Musculoskeletal System',
    '85-86': 'Operations On The Integumentary System',
    '87-99': 'Miscellaneous Diagnostic And Therapeutic Procedures'
}

In [46]:
# One row per high-level category: its code range and its description
df_cat = pd.DataFrame(list(d_cat_desc.items()), columns=['categories', 'description'])

# Split each 'lower-upper' range key at the first dash into its two bounds
cat_bounds = df_cat['categories'].str.split('-', n=1, expand=True)
df_cat = df_cat.assign(cat_lower=cat_bounds[0], cat_upper=cat_bounds[1])
df_cat

Unnamed: 0,categories,description,cat_lower,cat_upper
0,001-139,Infectious And Parasitic Diseases,001,139
1,140-239,Neoplasms,140,239
2,240-279,"Endocrine, Nutritional And Metabolic Diseases,...",240,279
3,280-289,Diseases Of The Blood And Blood-Forming Organs,280,289
4,290-319,Mental Disorders,290,319
5,320-389,Diseases Of The Nervous System And Sense Organs,320,389
6,390-459,Diseases Of The Circulatory System,390,459
7,460-519,Diseases Of The Respiratory System,460,519
8,520-579,Diseases Of The Digestive System,520,579
9,580-629,Diseases Of The Genitourinary System,580,629


In [61]:
# Lower / upper category bounds as plain lists, in category order
lower_bounds = cat_bounds[0].tolist()
upper_bounds = cat_bounds[1].tolist()

In [62]:
# Inspect the parsed lower category bounds
lower_bounds

['001',
 '140',
 '240',
 '280',
 '290',
 '320',
 '390',
 '460',
 '520',
 '580',
 '630',
 '680',
 '710',
 '740',
 '760',
 '780',
 '800',
 'V01',
 'E000',
 '00',
 '01',
 '06',
 '08',
 '17',
 '18',
 '21',
 '30',
 '35',
 '40',
 '42',
 '55',
 '60',
 '65',
 '72',
 '76',
 '85',
 '87']

In [78]:
def _matching_categories(label, lowers, uppers):
    """Return the indices of all category ranges that contain `label`.

    A code is compared (as a string) only against bounds of matching width:
    procedure codes ('XX' or 'XX.…') against two-character bounds, diagnosis
    codes against three-character bounds via their first three characters and
    against four-character bounds ('E' codes) via their first four characters.

    Parameters
    ----------
    label : str
        Absolute ICD-9 code.
    lowers, uppers : sequence of str
        Inclusive lower / upper bounds of the categories, in category order.

    Returns
    -------
    List[int]
        Positions of the categories whose range includes the code.
    """
    has_period = '.' in label
    # Slicing (instead of label[2]) keeps very short labels from raising IndexError
    is_procedure = has_period and label[2:3] == '.'
    matches = []
    for idx, (lower, upper) in enumerate(zip(lowers, uppers)):
        width = len(lower)
        if is_procedure:
            key = label[:2] if width == 2 else None
        elif has_period:
            key = label[:width] if width in (3, 4) else None
        else:
            # Codes without sub-classification are compared as a whole
            key = label if (len(label) == width and width in (2, 3, 4)) else None
        if key is not None and lower <= key <= upper:
            matches.append(idx)
    return matches


# Position i of the mapping is meant to hold the category index of
# labels_full[i]; labels that match no or several categories would silently
# shift that alignment, so they are collected and reported after the loop.
cat2label_mapping_full = []
ambiguous_labels_full = []
for label in labels_full:
    matched = _matching_categories(label, lower_bounds, upper_bounds)
    for i in matched:
        print(f'Code: {label} -> Cat: {i}')
        cat2label_mapping_full.append(i)
    if len(matched) != 1:
        ambiguous_labels_full.append(label)

if ambiguous_labels_full:
    print(f'WARNING: {len(ambiguous_labels_full)} labels did not map to exactly one category '
          f'(mapping is no longer aligned with labels_full): {ambiguous_labels_full}')

Code: 003.0 -> Cat: 0
Code: 003.1 -> Cat: 0
Code: 003.8 -> Cat: 0
Code: 003.9 -> Cat: 0
Code: 004.1 -> Cat: 0
Code: 004.8 -> Cat: 0
Code: 004.9 -> Cat: 0
Code: 005.1 -> Cat: 0
Code: 005.81 -> Cat: 0
Code: 005.9 -> Cat: 0
Code: 007.1 -> Cat: 0
Code: 007.4 -> Cat: 0
Code: 008.04 -> Cat: 0
Code: 008.41 -> Cat: 0
Code: 008.43 -> Cat: 0
Code: 008.45 -> Cat: 0
Code: 008.47 -> Cat: 0
Code: 008.5 -> Cat: 0
Code: 008.61 -> Cat: 0
Code: 008.62 -> Cat: 0
Code: 008.63 -> Cat: 0
Code: 008.69 -> Cat: 0
Code: 008.8 -> Cat: 0
Code: 009.0 -> Cat: 0
Code: 009.1 -> Cat: 0
Code: 009.2 -> Cat: 0
Code: 009.3 -> Cat: 0
Code: 010.85 -> Cat: 0
Code: 011.23 -> Cat: 0
Code: 011.36 -> Cat: 0
Code: 011.64 -> Cat: 0
Code: 011.86 -> Cat: 0
Code: 011.90 -> Cat: 0
Code: 011.93 -> Cat: 0
Code: 011.94 -> Cat: 0
Code: 012.05 -> Cat: 0
Code: 012.15 -> Cat: 0
Code: 013.00 -> Cat: 0
Code: 013.04 -> Cat: 0
Code: 013.25 -> Cat: 0
Code: 013.30 -> Cat: 0
Code: 013.54 -> Cat: 0
Code: 014.02 -> Cat: 0
Code: 014.05 -> Cat: 0
Code:

In [83]:
# ICD-9 code ranges kept for the 50-most-frequent-codes label set, each paired
# with its description. Only ranges that contain at least one of those codes
# are listed. Insertion order is significant: the position of a range in this
# dict is later used as its category index.
d_cat_desc_50 = dict([
    # Diagnosis code ranges (three-character bounds)
    ('001-139', 'Infectious And Parasitic Diseases'),
    ('240-279', 'Endocrine, Nutritional And Metabolic Diseases, And Immunity Disorders'),
    ('280-289', 'Diseases Of The Blood And Blood-Forming Organs'),
    ('290-319', 'Mental Disorders'),
    ('390-459', 'Diseases Of The Circulatory System'),
    ('460-519', 'Diseases Of The Respiratory System'),
    ('520-579', 'Diseases Of The Digestive System'),
    ('580-629', 'Diseases Of The Genitourinary System'),
    ('800-999', 'Injury And Poisoning'),
    ('V01-V91', 'Supplementary Classification Of Factors Influencing Health Status And Contact With Health Services'),
    # Procedure code ranges (two-character bounds)
    ('30-34', 'Operations On The Respiratory System'),
    ('35-39', 'Operations On The Cardiovascular System'),
    ('42-54', 'Operations On The Digestive System'),
    ('87-99', 'Miscellaneous Diagnostic And Therapeutic Procedures'),
])

In [84]:
# Tabulate the retained categories: one row per code range, in dict order.
# The frame is built straight from the (range, description) pairs with explicit
# column names; `DataFrame.from_dict` is documented for dict input only and was
# previously handed a `dict_items` view.
df_cat_50 = pd.DataFrame(
    list(d_cat_desc_50.items()),
    columns=['categories', 'description'],
)

# Split every 'LOWER-UPPER' range on its first hyphen into two string columns
# (column 0 = lower bound, column 1 = upper bound).
cat_bounds_50 = df_cat_50['categories'].str.split('-', n=1, expand=True)
df_cat_50['cat_lower'] = cat_bounds_50[0]
df_cat_50['cat_upper'] = cat_bounds_50[1]

# Plain Python lists of the bounds; index i in both lists refers to category i.
lower_bounds_50 = cat_bounds_50[0].tolist()
upper_bounds_50 = cat_bounds_50[1].tolist()



In [85]:
def _category_stem(label: str, width: int):
    """Return the part of `label` that is comparable to a bound of `width` characters.

    Parameters
    ----------
    label : str
        code, with or without a decimal part (e.g., '38.93', '401.9', '311')
    width : int
        number of characters of the category bound the code is compared against

    Returns
    -------
    str or None
        leading characters of the code to compare lexicographically against the
        bounds; None if a bound of this width does not apply to the code

    """
    if '.' in label:
        if label[2:3] == '.':  # procedure code: two characters before the period
            return label[:2] if width == 2 else None
        # diagnosis code: three-character bounds, or four for codes starting with 'E'
        return label[:width] if width in (3, 4) else None
    # Code without a decimal part: compared as a whole against same-width bounds
    if len(label) == width and width in (2, 3, 4):
        return label
    return None


def _find_category(label: str, lower_bounds: List[str], upper_bounds: List[str]):
    """Return the index of the first category whose range contains `label`.

    Parameters
    ----------
    label : str
        code to categorize
    lower_bounds : List[str]
        lower bound of every category, position = category index
    upper_bounds : List[str]
        upper bound of every category, aligned with `lower_bounds`

    Returns
    -------
    int or None
        category index, or None if no category range contains the code

    """
    for idx, (lower, upper) in enumerate(zip(lower_bounds, upper_bounds)):
        stem = _category_stem(label, len(lower))
        if stem is not None and lower <= stem <= upper:
            return idx  # first match wins, so a code yields at most one entry
    return None


# One category index per code, in the order of `labels_50`.
cat2label_mapping_50 = []
unmatched_50 = []
for label in labels_50:
    cat_idx = _find_category(label, lower_bounds_50, upper_bounds_50)
    if cat_idx is None:
        unmatched_50.append(label)
        continue
    print(f'Code: {label} -> Cat: {cat_idx}')
    cat2label_mapping_50.append(cat_idx)

# A silently skipped code would shift every later position in the mapping,
# so fail loudly instead of persisting a misaligned list.
assert not unmatched_50, f'Codes without a category: {unmatched_50}'

Code: 038.9 -> Cat: 0
Code: 244.9 -> Cat: 1
Code: 250.00 -> Cat: 1
Code: 272.0 -> Cat: 1
Code: 272.4 -> Cat: 1
Code: 276.1 -> Cat: 1
Code: 276.2 -> Cat: 1
Code: 285.1 -> Cat: 2
Code: 285.9 -> Cat: 2
Code: 287.5 -> Cat: 2
Code: 305.1 -> Cat: 3
Code: 311 -> Cat: 3
Code: 33.24 -> Cat: 10
Code: 36.15 -> Cat: 11
Code: 37.22 -> Cat: 11
Code: 38.91 -> Cat: 11
Code: 38.93 -> Cat: 11
Code: 39.61 -> Cat: 11
Code: 39.95 -> Cat: 11
Code: 401.9 -> Cat: 4
Code: 403.90 -> Cat: 4
Code: 410.71 -> Cat: 4
Code: 412 -> Cat: 4
Code: 414.01 -> Cat: 4
Code: 424.0 -> Cat: 4
Code: 427.31 -> Cat: 4
Code: 428.0 -> Cat: 4
Code: 45.13 -> Cat: 12
Code: 486 -> Cat: 5
Code: 496 -> Cat: 5
Code: 507.0 -> Cat: 5
Code: 511.9 -> Cat: 5
Code: 518.81 -> Cat: 5
Code: 530.81 -> Cat: 6
Code: 584.9 -> Cat: 7
Code: 585.9 -> Cat: 7
Code: 599.0 -> Cat: 7
Code: 88.56 -> Cat: 13
Code: 88.72 -> Cat: 13
Code: 93.90 -> Cat: 13
Code: 96.04 -> Cat: 13
Code: 96.6 -> Cat: 13
Code: 96.71 -> Cat: 13
Code: 96.72 -> Cat: 13
Code: 99.04 -> Cat:

In [86]:
# Persist both code -> category index lists as pickles in the
# `data/processed/code_embeddings` directory below the project root.
mapping_dir = os.path.join(root, 'data', 'processed', 'code_embeddings')

full_mapping_path = os.path.join(mapping_dir, 'embedding_matrix_MimicFull_mapping.pkl')
with open(full_mapping_path, 'wb') as out_file:
    pickle.dump(cat2label_mapping_full, out_file)

top50_mapping_path = os.path.join(mapping_dir, 'embedding_matrix_Mimic50_mapping.pkl')
with open(top50_mapping_path, 'wb') as out_file:
    pickle.dump(cat2label_mapping_50, out_file)