In [1]:

"""
Visualization

This notebook is just for generating relevant files for visualizing (mainly, labels)
The actual visualization needs to be done in a .py script because jupyter can't hang

Also, for visualization purposes, need to subset the embeddings to those that are included in the mappings


Entity label types:
1. specific semantic types (semantic structure)
-semantic type visualization should only be done for a given broad semantic group, otherwise it's too cluttered
-pick a couple of broad semantic groups to zoom in on
2. broad semantic groups (semantic structure)
3. depth in hierarchies (hierarchical structure)

how to use hierarchy data to create depth (integer distance from root node) for each CUI:
1. subset MRHIER to snomed CUIs
2. count num of ancestors in path (average if multiple contexts, int)
3. look at distribution of depths and see if we need to bin


Relation labels:
just read in relation2oneormany.json and relation2broad.json and map using id2relation
"""

'\nVisualization\n\nPerhaps we could do largeviz for kge, snomed2vec, cui2vec with hierarchical labels to show kge captures it better\n\nAlso try doing relations visualization with labels=oneormany,broad,specific\n\n\n'

In [92]:
import numpy as np
import pandas as pd
import pickle
import os
import json

In [93]:
## Path configuration
# Root directory under which each embedding family (kge, snomed2vec, cui2vec) has its own folder.
embeddings_dir = '/home/dc925/project/data/embeddings'

# Pretrained baseline embedding tables, resolved relative to the root above.
snomed2vec_emb_file, cui2vec_emb_file = (
    os.path.join(embeddings_dir, relative_csv)
    for relative_csv in ('snomed2vec/snomed2vec.csv', 'cui2vec/cui2vec_pretrained.csv')
)

In [94]:
## Entity labels

def _read_semantic_table(csv_path):
    """Read a tab-separated semantic-info table and keep the first row seen for each CUI."""
    table = pd.read_csv(csv_path, sep='\t', index_col=0)
    return table.drop_duplicates(subset='CUI')


def _column_by_cui(table, column):
    """Return a {CUI: value} dict built from one column of a semantic-info table."""
    return table.set_index('CUI')[column].to_dict()


# semantic_info.csv holds both the specific semantic type (STY) and the semantic group (SemGroup)
semantic_df = _read_semantic_table('/home/dc925/project/clinical_kge/semantic_info.csv')
cui2sty = _column_by_cui(semantic_df, 'STY')
cui2sg = _column_by_cui(semantic_df, 'SemGroup')

# Same two mappings, built from the filtered table
# semantic_groups = ['CHEM', 'DISO', 'ANAT', 'PROC', 'CONC', 'DEVI', 'PHEN', 'PHYS']
filtered_semantic_df = _read_semantic_table('/home/dc925/project/clinical_kge/filtered_semantic_info.csv')
fcui2sty = _column_by_cui(filtered_semantic_df, 'STY')
fcui2sg = _column_by_cui(filtered_semantic_df, 'SemGroup')



# Hierarchy; create mappings for depth
# mrhier = pd.read_csv('/home/dc925/project/data/snomed/MRHIER.RRF', sep='|', header=None)[[0, 6]]
# mrhier = mrhier.dropna()

# mrhier.columns = ['CUI', 'PATH']
# mrhier['depth'] = mrhier['PATH'].apply(lambda x: len(x.split('.')))

# gp = mrhier.groupby('CUI')
# gp = gp.mean()
# depth = gp['depth'].apply(lambda x: round(x))
# cui2depth = depth.to_dict()
# cui2depth['C2720507'] = 1

In [95]:
# Number of CUIs in the filtered CUI -> semantic group mapping.
len(fcui2sg)

345483

In [115]:
# Load the trained RotatE model and pull out its entity embeddings and id -> entity lookup.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load trusted checkpoints.
with open('/home/dc925/project/data/embeddings/kge/RotatE.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
id2entity = model.graph.id2entity
entity_embeddings = model.solver.entity_embeddings

# Resolve entities once, in id order 0..len(id2entity)-1, so both label arrays share that order.
entity_cuis = [id2entity[entity_id] for entity_id in range(len(id2entity))]
sty_labels = np.array([fcui2sty[cui] for cui in entity_cuis])
sg_labels = np.array([fcui2sg[cui] for cui in entity_cuis])

# Write the label arrays into the kge embeddings folder.
np.save('/home/dc925/project/data/embeddings/kge/sty_labels.npy', sty_labels)
np.save('/home/dc925/project/data/embeddings/kge/sg_labels.npy', sg_labels)

In [116]:
# Inspect the shape of the entity embeddings taken from the loaded model.
entity_embeddings.shape

(293879, 512)

In [117]:
# Distinct semantic-group labels for the KGE entities, with how many entities fall in each.
np.unique(sg_labels, return_counts=True)

(array(['ANAT', 'CHEM', 'CONC', 'DEVI', 'DISO', 'PHEN', 'PHYS', 'PROC'],
       dtype='<U4'),
 array([ 33939,  40309,  13706,  12648, 126804,   2935,   3894,  59644]))

In [99]:
# l, c = np.unique(sty_labels, return_counts=True)
# dict(zip(l, c))

In [100]:
"""
PROC
'Therapeutic or Preventive Procedure': 33535
'Laboratory Procedure': 9745
'Health Care Activity': 6678
'Educational Activity': 527
'Diagnostic Procedure': 9126

CHEM
'Pharmacologic Substance': 8334
'Organic Chemical': 8483
'Amino Acid, Peptide, or Protein': 10540
'Clinical Drug': 9475
'Immunologic Factor': 1187
'Indicator, Reagent, or Diagnostic Aid': 846

"""

"\nPROC\n'Therapeutic or Preventive Procedure': 33535\n'Laboratory Procedure': 9745\n'Health Care Activity': 6678\n'Educational Activity': 527\n'Diagnostic Procedure': 9126\n\nCHEM\n'Pharmacologic Substance': 8334\n'Organic Chemical': 8483\n'Amino Acid, Peptide, or Protein': 10540\n'Clinical Drug': 9475\n'Immunologic Factor': 1187\n'Indicator, Reagent, or Diagnostic Aid': 846\n\n"

In [101]:
## Cui2vec
# Subset the pretrained cui2vec table to CUIs present in the filtered semantic mapping,
# then save the embedding matrix together with label arrays in the same row order.
cui2vec_df = pd.read_csv(cui2vec_emb_file, index_col=0)

# Select rows with a boolean mask instead of .loc[list_of_labels]: a label lookup returns
# every matching row for a repeated index value, which would misalign rows and labels.
in_mapping = cui2vec_df.index.isin(fcui2sg.keys())
new_indexes = cui2vec_df.index[in_mapping]

# Take both label kinds from the *filtered* mappings (as the KGE cell does) so that the
# semantic type and semantic group of a CUI always come from the same table row.
sty_labels = np.array([fcui2sty[cui] for cui in new_indexes])
sg_labels = np.array([fcui2sg[cui] for cui in new_indexes])
# depth_labels = np.array([cui2depth[cui] for cui in new_indexes])  # disabled: cui2depth is commented out above

cui2vec = cui2vec_df.loc[in_mapping].to_numpy()
assert len(cui2vec) == len(sty_labels) == len(sg_labels), 'embeddings and labels are misaligned'

np.save('/home/dc925/project/data/embeddings/cui2vec/cui2vec.npy', cui2vec)
np.save('/home/dc925/project/data/embeddings/cui2vec/sty_labels.npy', sty_labels)
np.save('/home/dc925/project/data/embeddings/cui2vec/sg_labels.npy', sg_labels)

In [102]:
# Shape of the cui2vec array after subsetting to CUIs found in fcui2sg.
cui2vec.shape

(45597, 500)

In [103]:
# sg_labels now holds the cui2vec labels; its length should equal the number of cui2vec rows.
sg_labels.shape

(45597,)

In [104]:
# Which semantic groups are represented among the retained cui2vec CUIs.
np.unique(sg_labels)

array(['ANAT', 'CHEM', 'CONC', 'DEVI', 'DISO', 'PHEN', 'PHYS', 'PROC'],
      dtype='<U4')

In [105]:
## Snomed2vec
# Subset the snomed2vec table (tab-separated, no header row) to CUIs present in the filtered
# semantic mapping, then save the embedding matrix together with label arrays in the same row order.
snomed2vec_df = pd.read_csv(snomed2vec_emb_file, sep='\t', index_col=0, header=None)

# Select rows with a boolean mask instead of .loc[list_of_labels]: a label lookup returns
# every matching row for a repeated index value, which would misalign rows and labels.
in_mapping = snomed2vec_df.index.isin(fcui2sg.keys())
new_indexes = snomed2vec_df.index[in_mapping]

# Take both label kinds from the *filtered* mappings (as the KGE cell does) so that the
# semantic type and semantic group of a CUI always come from the same table row.
sty_labels = np.array([fcui2sty[cui] for cui in new_indexes])
sg_labels = np.array([fcui2sg[cui] for cui in new_indexes])

snomed2vec = snomed2vec_df.loc[in_mapping].to_numpy()
assert len(snomed2vec) == len(sty_labels) == len(sg_labels), 'embeddings and labels are misaligned'

np.save('/home/dc925/project/data/embeddings/snomed2vec/snomed2vec.npy', snomed2vec)
np.save('/home/dc925/project/data/embeddings/snomed2vec/sty_labels.npy', sty_labels)
np.save('/home/dc925/project/data/embeddings/snomed2vec/sg_labels.npy', sg_labels)

In [106]:
# Shape of the snomed2vec array after subsetting to CUIs found in fcui2sg.
snomed2vec.shape

(189882, 200)

In [107]:
# sg_labels now holds the snomed2vec labels; its length should equal the number of snomed2vec rows.
sg_labels.shape

(189882,)

In [108]:
# Distinct semantic-group labels among the retained snomed2vec CUIs, with per-group counts.
np.unique(sg_labels, return_counts=True)

(array(['ANAT', 'CHEM', 'DISO', 'PROC'], dtype='<U4'),
 array([17962, 28988, 99295, 43637]))

In [109]:
## Relation labels

# Three JSON lookups, each keyed by relation name: broad category, one-or-many,
# and the sg_oneormany variant.
with open('/home/dc925/project/clinical_kge/data/case4/relation2broad.json', 'r') as broad_file:
    relation2broad = json.load(broad_file)
with open('/home/dc925/project/clinical_kge/data/case4/relation2oneormany.json', 'r') as oneormany_file:
    relation2oneormany = json.load(oneormany_file)
with open('/home/dc925/project/clinical_kge/data/case4/relation2sg_oneormany.json', 'r') as sg_oneormany_file:
    relation2sg_oneormany = json.load(sg_oneormany_file)

# Relation embeddings and the id -> relation lookup come from the model loaded earlier in the notebook.
id2relation = model.graph.id2relation
relation_embeddings = model.solver.relation_embeddings

# Resolve relation names once, in id order 0..len(id2relation)-1, so all label arrays share that order.
relation_names = [id2relation[relation_id] for relation_id in range(len(id2relation))]
broad_labels = np.array([relation2broad[name] for name in relation_names])
oneormany_labels = np.array([relation2oneormany[name] for name in relation_names])
sg_oneormany_labels = np.array([relation2sg_oneormany[name] for name in relation_names])

In [110]:
# Inspect the shape of the relation embeddings taken from the loaded model.
relation_embeddings.shape

(162, 512)

In [111]:
# One broad label per entry of id2relation.
broad_labels.shape

(162,)

In [112]:
# One one-or-many label per entry of id2relation.
oneormany_labels.shape

(162,)

In [113]:
# One sg_oneormany label per entry of id2relation.
sg_oneormany_labels.shape

(162,)

In [114]:
# Write each relation label array into the kge embeddings folder.
relation_label_outputs = (
    ('/home/dc925/project/data/embeddings/kge/broad_labels.npy', broad_labels),
    ('/home/dc925/project/data/embeddings/kge/oneormany_labels.npy', oneormany_labels),
    ('/home/dc925/project/data/embeddings/kge/sg_oneormany_labels.npy', sg_oneormany_labels),
)
for output_path, label_array in relation_label_outputs:
    np.save(output_path, label_array)