In [1]:
import os 
import numpy as np
import pandas as pd
import json
import string

# Preprocess

In [2]:
# NOTE(review): allow_pickle=True permits unpickling of object arrays, which can
# execute arbitrary code -- only load this archive if it comes from a trusted source.
ori = np.load('original_data.npz', allow_pickle=True)
print(ori.files)
# Pull out the two arrays used by the rest of the notebook.
mention_labels, mention_names = (
    ori[array_key] for array_key in ('mention_labels', 'mention_names')
)

['mention_indices', 'mention_labels', 'mention_names', 'concept_names', 'concept_indices']


In [25]:
# Disease-to-disease relation table; the loop below only reads its
# 'Disease_1' and 'Disease_2' columns.
relation_file = '../iBKH/Di_Di_res.csv'
relation = pd.read_csv(filepath_or_buffer=relation_file)
relation

Unnamed: 0,Disease_1,Disease_2,is_a,Resemble,Source
0,DOID:0001816,DOID:175,1,0,DO
1,DOID:175,DOID:176,1,0,DO
2,DOID:0002116,DOID:10124,1,0,DO
3,DOID:10124,DOID:5614,1,0,DO
4,DOID:0014667,DOID:4,1,0,DO
...,...,...,...,...,...
11067,DOID:219,DOID:8577,0,1,Hetionet
11068,DOID:2994,DOID:13499,0,1,Hetionet
11069,DOID:1793,DOID:10534,0,1,Hetionet
11070,DOID:219,DOID:3121,0,1,Hetionet


In [24]:
# Load the disease vocabulary with every column read as a string.
entity = pd.read_csv('../iBKH/disease_vocab.csv', dtype=str)
# Normalise names: cast to str, lower-case, then strip all ASCII punctuation.
# The translation table is built once and applied to the whole column.
entity['name'] = (
    entity['name']
    .astype(str)
    .str.lower()
    .str.translate(str.maketrans('', '', string.punctuation))
)
entity

Unnamed: 0,primary,name,do_id,kegg_id,pharmgkb_id,mesh_id,umls_cui,icd_10,icd_9,omim_id,iDISK_id
0,DOID:4,disease,DOID:4,,,D004194,C0012634,,,,DC0478447
1,DOID:0001816,angiosarcoma,DOID:0001816,H01557,PA444390,D006394,C0018923,,,,
2,DOID:175,vascular cancer,DOID:175,,,D019043,C0282607,,,,
3,DOID:0002116,pterygium,DOID:0002116,,,,C0033999,,,,
4,DOID:10124,corneal disease,DOID:10124,,PA165108965,D003316,C0010034,H18.9,371.9,,
...,...,...,...,...,...,...,...,...,...,...,...
19231,iDISK:DC0492572,bleeding tendency,,,,,C1458140,,,,DC0492572
19232,iDISK:DC0492815,skin abscess,,,,,C0149777,,,,DC0492815
19233,iDISK:DC0493114,cystitis recurrent,,,,,C0581366,,,,DC0493114
19234,iDISK:DC0493119,epstein,,,,,C1396851,,,,DC0493119


# Get related entities

In [26]:
def get_doid(concept, df_entity):
    """Return the primary identifier of the first vocabulary row named ``concept``.

    Parameters
    ----------
    concept : str
        Disease name to look up; compared for exact equality against
        ``df_entity['name']``.
    df_entity : pandas.DataFrame
        Vocabulary table with at least the columns ``'name'`` and ``'primary'``.

    Returns
    -------
    The ``'primary'`` value of the first matching row. If several rows share
    the same name, only the first one is used.

    Raises
    ------
    IndexError
        If no row has the requested name. The message names the missing
        concept so the failing label can be identified.
    """
    matches = df_entity.loc[df_entity['name'] == concept, 'primary']
    if matches.empty:
        # Same exception type as the bare .iloc[0] would raise, but with context.
        raise IndexError(f"no entity named {concept!r} found in the vocabulary")
    return matches.iloc[0]

In [27]:
def get_name(doid, df_entity):
    """Return the name of the first vocabulary row whose primary id is ``doid``.

    Parameters
    ----------
    doid : str
        Identifier to look up; compared for exact equality against
        ``df_entity['primary']``.
    df_entity : pandas.DataFrame
        Vocabulary table with at least the columns ``'primary'`` and ``'name'``.

    Returns
    -------
    The ``'name'`` value of the first matching row. If several rows share the
    same primary id, only the first one is used.

    Raises
    ------
    IndexError
        If no row has the requested identifier. The message names the missing
        identifier so the offending relation row can be traced.
    """
    matches = df_entity.loc[df_entity['primary'] == doid, 'name']
    if matches.empty:
        # Same exception type as the bare .iloc[0] would raise, but with context.
        raise IndexError(f"no entity with primary id {doid!r} found in the vocabulary")
    return matches.iloc[0]

In [28]:
def get_related_disease_items(label, df_entity, df_relation):
    """Collect the names of all diseases one relation row away from ``label``.

    ``label`` is resolved to an identifier with ``get_doid``. Every row of
    ``df_relation`` holding that identifier in ``'Disease_1'`` or
    ``'Disease_2'`` contributes the disease from the opposite column, which is
    translated back to a name with ``get_name``. No other column of
    ``df_relation`` is consulted.

    Parameters
    ----------
    label : str
        Disease name as it appears in ``df_entity['name']``.
    df_entity : pandas.DataFrame
        Vocabulary table passed through to ``get_doid`` / ``get_name``.
    df_relation : pandas.DataFrame
        Edge table with ``'Disease_1'`` and ``'Disease_2'`` columns.

    Returns
    -------
    set
        Names of the directly related diseases; empty if no row matches.
    """
    label_doid = get_doid(label, df_entity)
    neighbour_names = set()
    # Scan the edge table twice: label on the left-hand side, then on the right.
    for own_col, other_col in (('Disease_1', 'Disease_2'), ('Disease_2', 'Disease_1')):
        hit_rows = df_relation[own_col].isin([label_doid])
        for doid in df_relation[hit_rows][other_col]:
            # A set ignores duplicates, so names can be added unconditionally.
            neighbour_names.add(get_name(doid, df_entity))
    return neighbour_names
# Try the lookup on a single label and display the resulting set.
get_related_disease_items(
    'tuberculous pneumothorax',
    df_entity=entity,
    df_relation=relation,
)

{'pneumothorax'}

In [18]:
# relevance = pd.DataFrame(columns=['mention_names', 'relevance_0', 'relevance_1', 'relevance_2'])
# for i in range(len(mention_labels[:10])):
#     mention_name = mention_names[i]
#     mention_label = mention_labels[i]
#     zero_hop_set = {mention_label}
#     one_hop_set = get_related_disease_items(mention_label, entity, relation)   # get a set
#     two_hop_set = set()
#     for item in one_hop_set:
#         # print(item)
#         temp_two_hop_set = get_related_disease_items(item, entity, relation)
#         two_hop_set = two_hop_set | temp_two_hop_set
#     two_hop_set = two_hop_set - one_hop_set - zero_hop_set
#     print(mention_name)
#     print(zero_hop_set)
#     print(one_hop_set)
#     print(two_hop_set)
#     relevance.loc[len(relevance)] = [mention_name, zero_hop_set, one_hop_set, two_hop_set]

tuberculous pneumothorax unspecified
{'tuberculous pneumothorax'}
{'pneumothorax'}
{'spontaneous tension pneumothorax', 'pleural disease', 'primary spontaneous pneumothorax', 'hemopneumothorax'}
pulmonary tuberculosis unspecified unspecified
{'pulmonary tuberculosis'}
{'tuberculosis'}
{'ocular tuberculosis', 'cardiac tuberculosis', 'multidrugresistant tuberculosis', 'extrapulmonary tuberculosis', 'primary bacterial infectious disease'}
tuberculous pleurisy unspecified
{'pleural tuberculosis'}
{'extrapulmonary tuberculosis'}
{'central nervous system tuberculosis', 'tuberculosis', 'lymph node tuberculosis', 'skeletal tuberculosis', 'miliary tuberculosis', 'pericardial tuberculosis', 'abdominal tuberculosis', 'urogenital tuberculosis'}
tuberculous meningitis unspecified
{'tuberculous meningitis nos'}
set()
set()
cholera due to vibrio cholerae
{'cholera'}
{'primary bacterial infectious disease'}
{'rhinoscleroma', 'relapsing fever', 'gonorrhea', 'bejel', 'granuloma inguinale', 'tetanus', 's

In [30]:
# Build one record per mention: the label itself (zero hops), its direct
# neighbours in the relation table (one hop), and the neighbours of those
# neighbours that are not already in the first two groups (two hops).
all_dict = {}
for i, mention_label in enumerate(mention_labels):
    print(f'Start index {i}, total length {len(mention_labels)}')
    mention_name = mention_names[i]
    zero_hop_set = {mention_label}
    one_hop_set = get_related_disease_items(mention_label, entity, relation)
    # Pool the neighbours of every one-hop disease.
    two_hop_set = set()
    for item in one_hop_set:
        two_hop_set |= get_related_disease_items(item, entity, relation)
    # Keep only names that are strictly two hops away.
    two_hop_set -= one_hop_set | zero_hop_set
    # Sets are converted to lists so the record can be serialised to JSON.
    all_dict[i] = {
        'mention_name': mention_name,
        'zero_hop_list': list(zero_hop_set),
        'one_hop_list': list(one_hop_set),
        'two_hop_list': list(two_hop_set),
    }

Start index 0, total length 1493
Start index 1, total length 1493
Start index 2, total length 1493
Start index 3, total length 1493
Start index 4, total length 1493
Start index 5, total length 1493
Start index 6, total length 1493
Start index 7, total length 1493
Start index 8, total length 1493
Start index 9, total length 1493
Start index 10, total length 1493
Start index 11, total length 1493
Start index 12, total length 1493
Start index 13, total length 1493
Start index 14, total length 1493
Start index 15, total length 1493
Start index 16, total length 1493
Start index 17, total length 1493
Start index 18, total length 1493
Start index 19, total length 1493
Start index 20, total length 1493
Start index 21, total length 1493
Start index 22, total length 1493
Start index 23, total length 1493
Start index 24, total length 1493
Start index 25, total length 1493
Start index 26, total length 1493
Start index 27, total length 1493
Start index 28, total length 1493
Start index 29, total le

In [33]:
# Persist the records. JSON object keys are always strings, so the integer
# indices used as keys in all_dict are written out as '0', '1', ...
with open('relevance.json', 'w', encoding='utf-8') as json_file:
    json.dump(obj=all_dict, fp=json_file)

In [32]:
# Spot-check the record built for the first mention (integer key 0).
all_dict[0]

{'mention_name': 'tuberculous pneumothorax unspecified',
 'zero_hop_list': ['tuberculous pneumothorax'],
 'one_hop_list': ['pneumothorax'],
 'two_hop_list': ['spontaneous tension pneumothorax',
  'primary spontaneous pneumothorax',
  'hemopneumothorax',
  'pleural disease']}

# Load the JSON file and read it back

In [34]:
# Read the saved records back from disk into a plain dict.
with open('relevance.json', 'r', encoding='utf-8') as json_file:
    data = json.loads(json_file.read())

In [42]:
# The record key is the string '1450' here: data was loaded from JSON, whose
# object keys are strings.
data['1450']['two_hop_list']

['hypospadias',
 'disease',
 'caudal regression syndrome',
 'partial cryptophthalmia',
 'poland syndrome',
 'congenital symblepharon',
 'visceral heterotaxy',
 'gastroschisis',
 'renalhepaticpancreatic dysplasia',
 'bladder exstrophyepispadiascloacal exstrophy complex',
 'imperforate anus',
 'developmental cardiac valvular defect',
 'klippelfeil syndrome',
 'congenital nervous system abnormality',
 'orofacial cleft',
 'omphalocele',
 'polydactyly',
 'radioulnar synostosis',
 'silverrussell syndrome',
 'neural tube defect',
 'complete cryptophthalmia',
 'cleft palatelateral synechia syndrome',
 'meckels diverticulum',
 'agnathiaotocephaly complex']