In [None]:
"""
use UMAP to visualize trained ICD kge.
Color all broad codes from MIMIC by broad categories, and grey out the rest
-get entities.tsv from data_dir to create broad labels


we could also do a multipanel figure where we show the progression from KGE to some other representation post-training
"""

In [1]:
import numpy as np
import pandas as pd
import os

import umap
import umap.plot
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
%matplotlib inline


  warn("Tensorflow not installed; ParametricUMAP will be unavailable")


In [27]:
kge = np.load('/home/dc925/project/ckpts/RotatE_ICD9_2/ICD9_RotatE_entity.npy')

In [28]:
kge.shape

(20979, 128)

In [29]:
entities = pd.read_csv('/home/dc925/project/data/graphmimic/UMLS/ICD_KG/entities.tsv', sep='\t', header=None)
entities.columns = ['ID', 'ICD']

In [30]:
entities

Unnamed: 0,ID,ICD
0,0,130.4
1,1,E981.1
2,2,E981.0
3,3,863.1
4,4,863.2
...,...,...
20974,20974,383.81
20975,20975,543
20976,20976,V17.49
20977,20977,398


In [31]:
icd2id = pd.Series(entities['ID'].values, index=entities['ICD']).to_dict()

In [32]:
id2icd = {v:k for k,v in icd2id.items()}

In [33]:
icd_codes_mortality = pd.read_csv(
    '/home/dc925/project/data/graphmimic/mortality/icd_codes_mortality.txt', header=None)
icd_codes_readmission = pd.read_csv(
    '/home/dc925/project/data/graphmimic/readmission/icd_codes_readmission.txt', header=None)
icd_codes_mortality = icd_codes_mortality[0].tolist()
icd_codes_readmission = icd_codes_readmission[0].tolist()
icd_codes = set(icd_codes_mortality + icd_codes_readmission)

In [34]:
len(icd_codes)

1067

In [44]:
icd_codes = sorted(icd_codes)

In [35]:
i = 0
for c in icd_codes:
    if c in icd2id:
        continue
    else:
        print(c)
        i += 1

In [36]:
i

0

In [37]:
def get_code_category(code):
    if code[0] == 'E':
        category = 'E000-E999: External Causes Of Injury And Poisoning'
    elif code[0] == 'V':
        category = 'V01-V91: Factors Influencing Health Status and Contact with Health Services'
    elif code[0] == 8 or code[0] == 9:
        category = 'Injury and Poisoning'
    elif code[:2] in(['78', '79']):
        category = 'Symptoms, Signs, and Ill-Defined Conditions'
    elif code[:2] in(['76', '77']):
        category = 'Conditions Originating In The Perinatal Period'
    elif code[:2] in(['74', '75']):
        category = 'Congenital Abnormalities'
    elif code[:2] in(['71', '72', '73']):
        category = 'Diseases of the Musculoskeletal System and Connective Tissue'
    elif code[:2] in(['68', '69', '70']):
        category = 'Diseases of the Skin and Subcutaneous Tissue'
    elif code[:2] in(['63', '64', '65', '66', '67']):
        category = 'Complications of Pregnancy, Childbirth, and the Puerperium'
    elif code[:2] in(['58', '59', '60', '61', '62']):
        category = 'Diseases of the Genitourinary System'
    elif code[:2] in(['52', '53', '54', '55', '56', '57']):
        category = 'Diseases of the Digestive System'
    elif code[:2] in(['46', '47', '48', '49', '50', '51']):
        category = 'Diseases of the Respiratory System'
    elif code[:2] in(['39', '40', '41', '42', '43','44','45']):
        category = 'Diseases of the Circulatory System'
    elif code[:2] in(['32', '33', '34', '35', '36', '37', '38']):
        category = 'Diseases of the Nervous System and Sense Organs'
    elif code[:2] in(['29', '30', '31']):
        category = 'Mental Disorders'
    elif code[:2] in(['28']):
        category = 'Diseases of the Blood and Blood-Forming Organs'
    elif code[:2] in(['24', '25', '26', '27']):
        category = 'Endocrine, Nutritional and Metabolic Diseases, and Immunity Disorders'
    elif code[:2] in(['14', '15', '16', '17', '18', '19','20', '21', '22', '23']):
        category = 'Neoplasms'
    else:
        category = 'Infectious and Parasitic Diseases'
    return category
        
            
    

In [38]:
entities['category'] = entities['ICD'].apply(lambda x: get_code_category(x))

In [39]:
entities.category.value_counts()

Infectious and Parasitic Diseases                                              6301
Diseases of the Nervous System and Sense Organs                                1911
Complications of Pregnancy, Childbirth, and the Puerperium                     1511
E000-E999: External Causes Of Injury And Poisoning                             1459
Neoplasms                                                                      1421
V01-V91: Factors Influencing Health Status and Contact with Health Services    1295
Diseases of the Musculoskeletal System and Connective Tissue                   1001
Diseases of the Digestive System                                                966
Diseases of the Circulatory System                                              879
Symptoms, Signs, and Ill-Defined Conditions                                     620
Mental Disorders                                                                609
Diseases of the Genitourinary System                                        

In [40]:
entities

Unnamed: 0,ID,ICD,category
0,0,130.4,Infectious and Parasitic Diseases
1,1,E981.1,E000-E999: External Causes Of Injury And Poiso...
2,2,E981.0,E000-E999: External Causes Of Injury And Poiso...
3,3,863.1,Infectious and Parasitic Diseases
4,4,863.2,Infectious and Parasitic Diseases
...,...,...,...
20974,20974,383.81,Diseases of the Nervous System and Sense Organs
20975,20975,543,Diseases of the Digestive System
20976,20976,V17.49,V01-V91: Factors Influencing Health Status and...
20977,20977,398,Diseases of the Circulatory System


In [41]:
def draw_umap(n_neighbors=5, n_components=2, min_dist=0.1, metric='euclidean'):
    fit = umap.UMAP(n_components=n_components, n_neighbors=n_neighbors, min_dist=min_dist, metric=metric)
    mapper = fit.fit(kge)
    umap.plot.points(mapper, labels=entities['category'])
    
    

In [46]:
broad_idx = [icd2id[c] for c in icd_codes]

In [49]:
broad_kge = kge[broad_idx]

In [51]:
broad_kge.shape[1]

128

In [53]:
range(0)

range(0, 0)