In [1]:
import json
import umls_api
import os
import yaml
from tqdm import tqdm
import pandas as pd

In [2]:
# Load the notebook settings from config.yaml into PARAM.
with open("config.yaml", "r") as stream:
    try:
        PARAM = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        # Printing the error and carrying on would leave PARAM undefined, so the
        # first cell that reads it would fail with an unrelated NameError.
        # Stop here, keeping the parser error attached as the cause.
        raise RuntimeError("config.yaml could not be parsed as YAML") from exc

In [3]:
# Token handed to every umls_api call in this notebook.
# An empty config file makes yaml.safe_load return None, and a missing or blank
# entry would otherwise be passed straight to the API calls; fail here instead.
umls_token = PARAM.get("umls_token") if isinstance(PARAM, dict) else None
if not umls_token:
    raise KeyError("umls_token is missing or empty in config.yaml")

In [4]:
def _codes_to_umls(conditions, column):
    """Index the UMLS ids of the conditions table by the codes listed in ``column``.

    Each non-missing cell of ``column`` is split on "|" into references; the
    text between the first and second "/" of a reference is used as the key
    (a reference without "/" raises IndexError, as before).

    Parameters:
        conditions: DataFrame holding a "UMLS" column and ``column``.
        column: name of the column with the "|"-separated references.

    Returns:
        dict mapping each code to the list of "UMLS" values of the rows that
        reference it, in row order.
    """
    mapping = {}
    for _, row in tqdm(conditions.iterrows(), total=len(conditions), desc=column):
        cell = row[column]
        if pd.isna(cell):
            # Rows without any reference in this column contribute nothing.
            continue
        for reference in str(cell).split("|"):
            code = reference.split("/")[1]
            mapping.setdefault(code, []).append(row["UMLS"])
    return mapping


conditions = pd.read_csv("neo4j/conditions.tsv", sep="\t")

hpo_condition = _codes_to_umls(conditions, "HPO")
msh_condition = _codes_to_umls(conditions, "MSH")


261it [00:00, 11043.21it/s]


In [5]:
# Inspect the mapping built from conditions.tsv: HPO code -> list of UMLS ids that reference it.
hpo_condition

{'HP:0033678': ['UMLS/C0948089'],
 'HP:0003765': ['UMLS/C0033860'],
 'HP:0000822': ['UMLS/C0020538'],
 'HP:0002013': ['UMLS/C0042963'],
 'HP:0002018': ['UMLS/C0027497'],
 'HP:0011123': ['UMLS/C0011603'],
 'HP:0012384': ['UMLS/C0035455'],
 'HP:0002099': ['UMLS/C0004096'],
 'HP:0004808': ['UMLS/C0023467'],
 'HP:0006536': ['UMLS/C0001883'],
 'HP:0006775': ['UMLS/C0026764'],
 'HP:0001250': ['UMLS/C0014544', 'UMLS/C0036572'],
 'HP:0005978': ['UMLS/C0011860'],
 'HP:0000716': ['UMLS/C0011581'],
 'HP:0007378': ['UMLS/C0017185'],
 'HP:0000939': ['UMLS/C0029456'],
 'HP:0000479': ['UMLS/C0035309'],
 'HP:0000488': ['UMLS/C0035309'],
 'HP:0011034': ['UMLS/C0002726'],
 'HP:0002076': ['UMLS/C0149931'],
 'HP:0007269': ['UMLS/C0026847'],
 'HP:0002037': ['UMLS/C0021390'],
 'HP:0020172': ['UMLS/C0041755'],
 'HP:0001635': ['UMLS/C0018801'],
 'HP:0002664': ['UMLS/C0027651', 'UMLS/C0006826'],
 'HP:0100242': ['UMLS/C1261473'],
 'HP:0100806': ['UMLS/C0036690'],
 'HP:0100634': ['UMLS/C0206754'],
 'HP:0004326':

In [6]:
# Inspect the mapping built from conditions.tsv: MSH code -> list of UMLS ids that reference it.
msh_condition

{'D054058': ['UMLS/C0948089'],
 'D013708': ['UMLS/C0039504'],
 'D011565': ['UMLS/C0033860'],
 'D006973': ['UMLS/C0020538'],
 'D014839': ['UMLS/C0042963'],
 'D009325': ['UMLS/C0027497'],
 'D003872': ['UMLS/C0011603'],
 'D012220': ['UMLS/C0035455'],
 'D001249': ['UMLS/C0004096'],
 'D015470': ['UMLS/C0023467'],
 'D000402': ['UMLS/C0001883'],
 'D009101': ['UMLS/C0026764'],
 'D004827': ['UMLS/C0014544', 'UMLS/C0236018'],
 'D006984': ['UMLS/C0020564'],
 'D003924': ['UMLS/C0011860'],
 'D003866': ['UMLS/C0011581'],
 'D001419': ['UMLS/C0004611'],
 'D003141': ['UMLS/C0009450'],
 'D012893': ['UMLS/C0851578'],
 'D001284': ['UMLS/C0333641'],
 'D005770': ['UMLS/C0017185'],
 'D010024': ['UMLS/C0029456'],
 'D010300': ['UMLS/C0030567'],
 'D012164': ['UMLS/C0035309'],
 'D000686': ['UMLS/C0002726'],
 'D003928': ['UMLS/C0011881'],
 'D008881': ['UMLS/C0149931'],
 'D009134': ['UMLS/C0026847'],
 'D015212': ['UMLS/C0021390'],
 'D000152': ['UMLS/C0001144'],
 'D064420': ['UMLS/C0041755'],
 'D006509': ['UMLS/C00

In [7]:
# Collected from the API for every HPO code found in the conditions table:
hpo_name = {}        # r[1] -> r[2] of each returned triple (written out as HPO / name)
hpo_from_to = []     # unique (r[0], r[1]) pairs, kept in first-seen order
umls_to_hpo = set()  # (UMLS id, HPO code) pairs

# The same (from, to) pair can come back for several starting terms
# (presumably when their ancestor chains overlap -- the helper is not visible
# here); remember what was already stored so each edge is exported once.
seen_edges = set()

print (len(hpo_condition))
for hpo in tqdm(hpo_condition):

    results = umls_api.recursive_get_parent_HPO(hpo, umls_token)

    for r in results:
        edge = (r[0], r[1])
        if edge not in seen_edges:
            seen_edges.add(edge)
            hpo_from_to.append(edge)
        hpo_name[r[1]] = r[2]

    # Link every condition that references this HPO code to it.
    umls_to_hpo.update((u, hpo) for u in hpo_condition[hpo])
        

137


  0%|          | 0/137 [00:00<?, ?it/s]

100%|██████████| 137/137 [06:17<00:00,  2.76s/it]


In [8]:
# Serialise the collected HPO (from, to) pairs as a two-column TSV.
edge_rows = [src + "\t" + dst + "\n" for src, dst in hpo_from_to]
content = "from\tto\n" + "".join(edge_rows)

with open("neo4j/hpo_is_a.tsv", "w") as f:
    f.write(content)

In [9]:
# Write one "UMLS id <TAB> HPO code" line per collected pair, after the header.
pair_rows = [umls_id + "\t" + hpo_code + "\n" for umls_id, hpo_code in umls_to_hpo]
content = "from\tto\n" + "".join(pair_rows)

with open("neo4j/condition_is_hpo.tsv", "w") as f:
    f.write(content)

In [10]:
# Dump the HPO id -> name lookup as a two-column TSV.
name_rows = [hpo_id + "\t" + label + "\n" for hpo_id, label in hpo_name.items()]
content = "HPO\tname\n" + "".join(name_rows)

with open("neo4j/condition_hpo_name.tsv", "w") as f:
    f.write(content)

In [11]:
# Show msh_condition (MSH code -> list of UMLS ids) again before it is used for the MSH queries.
msh_condition

{'D054058': ['UMLS/C0948089'],
 'D013708': ['UMLS/C0039504'],
 'D011565': ['UMLS/C0033860'],
 'D006973': ['UMLS/C0020538'],
 'D014839': ['UMLS/C0042963'],
 'D009325': ['UMLS/C0027497'],
 'D003872': ['UMLS/C0011603'],
 'D012220': ['UMLS/C0035455'],
 'D001249': ['UMLS/C0004096'],
 'D015470': ['UMLS/C0023467'],
 'D000402': ['UMLS/C0001883'],
 'D009101': ['UMLS/C0026764'],
 'D004827': ['UMLS/C0014544', 'UMLS/C0236018'],
 'D006984': ['UMLS/C0020564'],
 'D003924': ['UMLS/C0011860'],
 'D003866': ['UMLS/C0011581'],
 'D001419': ['UMLS/C0004611'],
 'D003141': ['UMLS/C0009450'],
 'D012893': ['UMLS/C0851578'],
 'D001284': ['UMLS/C0333641'],
 'D005770': ['UMLS/C0017185'],
 'D010024': ['UMLS/C0029456'],
 'D010300': ['UMLS/C0030567'],
 'D012164': ['UMLS/C0035309'],
 'D000686': ['UMLS/C0002726'],
 'D003928': ['UMLS/C0011881'],
 'D008881': ['UMLS/C0149931'],
 'D009134': ['UMLS/C0026847'],
 'D015212': ['UMLS/C0021390'],
 'D000152': ['UMLS/C0001144'],
 'D064420': ['UMLS/C0041755'],
 'D006509': ['UMLS/C00

In [18]:
print (len(msh_condition))

# The results of this cell are exported under the header "may_be_treated_by"
# to neo4j/msh_may_be_treated_by.tsv, so that is the relation to request.
# The query previously asked for "may_be_prevented_by", which stored
# prevention relations under the treatment label.
RELATION_QUERY = "includeAdditionalRelationLabels=may_be_treated_by"

msh_treated = {}      # MSH code -> set of r[1] values returned for that code
umls_to_msh = set()   # (UMLS id, MSH code) pairs

for msh_id in tqdm(msh_condition):

    results = umls_api.get_all_items("MSH", msh_id, RELATION_QUERY, umls_token)

    # Dict keys are unique, so every msh_id starts from a fresh set; codes with
    # no result still get an (empty) entry, as before.
    msh_treated[msh_id] = {r[1] for r in results}

    # Link every condition that references this MSH code to it.
    umls_to_msh.update((u, msh_id) for u in msh_condition[msh_id])

225


100%|██████████| 225/225 [03:33<00:00,  1.06it/s]


In [19]:
# Write one line per MSH code: the code, a tab, then its related items joined by "|".
lines = ["MSH\tmay_be_treated_by\n"]
for m, related in msh_treated.items():
    # msh_treated values are sets; joining them directly emits the items in
    # hash order, which differs between kernel runs. Sort so the file is
    # reproducible and diffs cleanly.
    lines.append(m + "\t" + "|".join(sorted(related)) + "\n")
content = "".join(lines)

with open("neo4j/msh_may_be_treated_by.tsv", "w") as f:
    f.write(content)

In [17]:
# Show msh_condition (MSH code -> list of UMLS ids) once more; no cell in this notebook modifies it after it is built.
msh_condition

{'D054058': ['UMLS/C0948089'],
 'D013708': ['UMLS/C0039504'],
 'D011565': ['UMLS/C0033860'],
 'D006973': ['UMLS/C0020538'],
 'D014839': ['UMLS/C0042963'],
 'D009325': ['UMLS/C0027497'],
 'D003872': ['UMLS/C0011603'],
 'D012220': ['UMLS/C0035455'],
 'D001249': ['UMLS/C0004096'],
 'D015470': ['UMLS/C0023467'],
 'D000402': ['UMLS/C0001883'],
 'D009101': ['UMLS/C0026764'],
 'D004827': ['UMLS/C0014544', 'UMLS/C0236018'],
 'D006984': ['UMLS/C0020564'],
 'D003924': ['UMLS/C0011860'],
 'D003866': ['UMLS/C0011581'],
 'D001419': ['UMLS/C0004611'],
 'D003141': ['UMLS/C0009450'],
 'D012893': ['UMLS/C0851578'],
 'D001284': ['UMLS/C0333641'],
 'D005770': ['UMLS/C0017185'],
 'D010024': ['UMLS/C0029456'],
 'D010300': ['UMLS/C0030567'],
 'D012164': ['UMLS/C0035309'],
 'D000686': ['UMLS/C0002726'],
 'D003928': ['UMLS/C0011881'],
 'D008881': ['UMLS/C0149931'],
 'D009134': ['UMLS/C0026847'],
 'D015212': ['UMLS/C0021390'],
 'D000152': ['UMLS/C0001144'],
 'D064420': ['UMLS/C0041755'],
 'D006509': ['UMLS/C00

In [20]:
# Write one "UMLS id <TAB> MSH code" line per collected pair, after the header.
pair_rows = [umls_id + "\t" + msh_code + "\n" for umls_id, msh_code in umls_to_msh]
content = "from\tto\n" + "".join(pair_rows)

with open("neo4j/condition_is_msh.tsv", "w") as f:
    f.write(content)