In [1]:
import json
import umls_api
import os
import yaml
from tqdm import tqdm
import pandas as pd

In [2]:
with open("config.yaml", "r") as stream:
    try:
        PARAM = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)

In [3]:
umls_token = PARAM["umls_token"]

In [4]:
hpo_condition = {}
msh_condition = {}

msh_name = {}
hpo_name = {}

conditions = pd.read_csv("neo4j/conditions.tsv", sep="\t")



for i, row in tqdm(conditions.iterrows()):


    condition = row["name"]
    umls = row["UMLS"]
    
    if str(row["HPO"]) != "nan":
        hpos = str(row["HPO"]).split("|")

        for hpo in hpos:
            h = hpo.split("/")[1]
            if h not in hpo_condition:
                hpo_condition[h] = []
            hpo_condition[h].append(umls)

            hpo_name[hpo] = condition

    if str(row["MSH"]) != "nan":
        mshs = str(row["MSH"]).split("|")
    
        for msh in mshs:
            m = msh.split("/")[1]
            if m not in msh_condition:
                msh_condition[m] = []
            msh_condition[m].append(umls)

            msh_name[msh] = condition

print (len(hpo_name))


266it [00:00, 16943.34it/s]

141





In [5]:
hpo_condition

{'HP:0002725': ['UMLS/C0024141'],
 'HP:0003765': ['UMLS/C0033860'],
 'HP:0000822': ['UMLS/C0020538'],
 'HP:0002018': ['UMLS/C0027497'],
 'HP:0002013': ['UMLS/C0042963'],
 'HP:0011123': ['UMLS/C0011603'],
 'HP:0012384': ['UMLS/C0035455'],
 'HP:0002099': ['UMLS/C0004096'],
 'HP:0033678': ['UMLS/C0948089'],
 'HP:0006536': ['UMLS/C0001883'],
 'HP:0001250': ['UMLS/C0014544', 'UMLS/C0036572'],
 'HP:0007302': ['UMLS/C0005586'],
 'HP:0100651': ['UMLS/C0011854'],
 'HP:0000716': ['UMLS/C0011581'],
 'HP:0005406': ['UMLS/C1835686'],
 'HP:0002621': ['UMLS/C0004153'],
 'HP:0002635': ['UMLS/C0004153'],
 'HP:0020110': ['UMLS/C0016658'],
 'HP:0000939': ['UMLS/C0029456'],
 'HP:0012452': ['UMLS/C0035258'],
 'HP:0002511': ['UMLS/C0002395'],
 'HP:0005978': ['UMLS/C0011860'],
 'HP:0002076': ['UMLS/C0149931'],
 'HP:0033345': ['UMLS/C0027796'],
 'HP:0001635': ['UMLS/C0018801'],
 'HP:0002664': ['UMLS/C0027651', 'UMLS/C0006826'],
 'HP:0025428': ['UMLS/C0006266'],
 'HP:0001370': ['UMLS/C0003873'],
 'HP:0000388':

In [6]:
msh_condition

{'D000152': ['UMLS/C0001144'],
 'D008180': ['UMLS/C0024141'],
 'D011565': ['UMLS/C0033860'],
 'D006973': ['UMLS/C0020538'],
 'D009325': ['UMLS/C0027497'],
 'D014839': ['UMLS/C0042963'],
 'D003872': ['UMLS/C0011603'],
 'D012220': ['UMLS/C0035455'],
 'D001249': ['UMLS/C0004096'],
 'D054058': ['UMLS/C0948089'],
 'D004617': ['UMLS/C0013922'],
 'D000402': ['UMLS/C0001883'],
 'D013927': ['UMLS/C0040053'],
 'D019964': ['UMLS/C0525045'],
 'D004827': ['UMLS/C0014544', 'UMLS/C0236018'],
 'D001714': ['UMLS/C0005586'],
 'D015228': ['UMLS/C0020557'],
 'D003922': ['UMLS/C0011854'],
 'D003866': ['UMLS/C0011581'],
 'D003141': ['UMLS/C0009450'],
 'D001419': ['UMLS/C0004611'],
 'D050197': ['UMLS/C0004153'],
 'D050723': ['UMLS/C0016658'],
 'D010024': ['UMLS/C0029456'],
 'D010300': ['UMLS/C0030567'],
 'D012148': ['UMLS/C0035258'],
 'D000544': ['UMLS/C0002395'],
 'D003924': ['UMLS/C0011860'],
 'D008881': ['UMLS/C0149931'],
 'D012893': ['UMLS/C0851578'],
 'D014917': ['UMLS/C0043167'],
 'D004165': ['UMLS/C00

In [7]:

hpo_from_to = []

umls_to_hpo = set()


print (len(hpo_condition))
for hpo in tqdm(hpo_condition):

    results = umls_api.recursive_get_parent_HPO(hpo, umls_token)

    for r in results:
        hpo_from_to.append((r[0], r[1]))

        if r[1] not in hpo_condition:
            hpo_name[f"HPO/{r[1]}"] = r[2]

    for u in hpo_condition[hpo]:
        umls_to_hpo.add((u, hpo))

print (len(hpo_name))

141


100%|██████████| 141/141 [06:07<00:00,  2.61s/it]

383





In [8]:
umls_to_hpo

{('UMLS/C0001883', 'HP:0006536'),
 ('UMLS/C0002170', 'HP:0001596'),
 ('UMLS/C0002170', 'HP:0002293'),
 ('UMLS/C0002395', 'HP:0002511'),
 ('UMLS/C0002453', 'HP:0000141'),
 ('UMLS/C0002726', 'HP:0011034'),
 ('UMLS/C0002871', 'HP:0001903'),
 ('UMLS/C0003090', 'HP:0031013'),
 ('UMLS/C0003811', 'HP:0011675'),
 ('UMLS/C0003873', 'HP:0001370'),
 ('UMLS/C0004096', 'HP:0002099'),
 ('UMLS/C0004153', 'HP:0002621'),
 ('UMLS/C0004153', 'HP:0002635'),
 ('UMLS/C0004238', 'HP:0005110'),
 ('UMLS/C0004364', 'HP:0002960'),
 ('UMLS/C0005586', 'HP:0007302'),
 ('UMLS/C0006266', 'HP:0025428'),
 ('UMLS/C0006267', 'HP:0002110'),
 ('UMLS/C0006277', 'HP:0012387'),
 ('UMLS/C0006625', 'HP:0004326'),
 ('UMLS/C0006826', 'HP:0002664'),
 ('UMLS/C0007095', 'HP:0100570'),
 ('UMLS/C0007097', 'HP:0030731'),
 ('UMLS/C0007131', 'HP:0030358'),
 ('UMLS/C0007134', 'HP:0005584'),
 ('UMLS/C0007222', 'HP:0001626'),
 ('UMLS/C0007787', 'HP:0002326'),
 ('UMLS/C0008370', 'HP:0001396'),
 ('UMLS/C0009319', 'HP:0002583'),
 ('UMLS/C00094

In [9]:
content = "from\tto\n"

for h in hpo_from_to:
    content += f"HPO/{h[0]}" + "\t" + f"HPO/{h[1]}" + "\n"


with open("neo4j/hpo_is_a.tsv", "w") as f:
    f.write(content)

In [10]:
content = "from\tto\n"

for h in umls_to_hpo:
    content += h[0] + "\t" + f"HPO/{h[1]}" + "\n"


with open("neo4j/condition_is_hpo.tsv", "w") as f:
    f.write(content)

In [11]:
content = "HPO\tname\n"
for h in hpo_name:
    content += h + "\t" + hpo_name[h] + "\n"

with open("neo4j/condition_hpo_name.tsv", "w") as f:
    f.write(content)

In [12]:
msh_condition

{'D000152': ['UMLS/C0001144'],
 'D008180': ['UMLS/C0024141'],
 'D011565': ['UMLS/C0033860'],
 'D006973': ['UMLS/C0020538'],
 'D009325': ['UMLS/C0027497'],
 'D014839': ['UMLS/C0042963'],
 'D003872': ['UMLS/C0011603'],
 'D012220': ['UMLS/C0035455'],
 'D001249': ['UMLS/C0004096'],
 'D054058': ['UMLS/C0948089'],
 'D004617': ['UMLS/C0013922'],
 'D000402': ['UMLS/C0001883'],
 'D013927': ['UMLS/C0040053'],
 'D019964': ['UMLS/C0525045'],
 'D004827': ['UMLS/C0014544', 'UMLS/C0236018'],
 'D001714': ['UMLS/C0005586'],
 'D015228': ['UMLS/C0020557'],
 'D003922': ['UMLS/C0011854'],
 'D003866': ['UMLS/C0011581'],
 'D003141': ['UMLS/C0009450'],
 'D001419': ['UMLS/C0004611'],
 'D050197': ['UMLS/C0004153'],
 'D050723': ['UMLS/C0016658'],
 'D010024': ['UMLS/C0029456'],
 'D010300': ['UMLS/C0030567'],
 'D012148': ['UMLS/C0035258'],
 'D000544': ['UMLS/C0002395'],
 'D003924': ['UMLS/C0011860'],
 'D008881': ['UMLS/C0149931'],
 'D012893': ['UMLS/C0851578'],
 'D014917': ['UMLS/C0043167'],
 'D004165': ['UMLS/C00

In [13]:
print (len(msh_condition))

msh_treated = {}

umls_to_msh_condition = set()

for msh_id in tqdm(msh_condition):

    #print (msh_id)
    results = umls_api.get_all_items("MSH", msh_id, "includeAdditionalRelationLabels=may_be_prevented_by", umls_token)

    if msh_id not in msh_treated:
        msh_treated[msh_id] = set()
    for r in results:
        msh_treated[msh_id].add(r[1])
    
    for u in msh_condition[msh_id]:
        umls_to_msh_condition.add((u, msh_id))

228


100%|██████████| 228/228 [03:49<00:00,  1.01s/it]


In [14]:
content = "MSH\tmay_be_treated_by\n"
for m in msh_treated:
    content += f"MSH/{m}" + "\t" + "|".join(msh_treated[m]) + "\n"

with open("neo4j/msh_may_be_treated_by.tsv", "w") as f:
    f.write(content)

In [15]:
msh_condition

{'D000152': ['UMLS/C0001144'],
 'D008180': ['UMLS/C0024141'],
 'D011565': ['UMLS/C0033860'],
 'D006973': ['UMLS/C0020538'],
 'D009325': ['UMLS/C0027497'],
 'D014839': ['UMLS/C0042963'],
 'D003872': ['UMLS/C0011603'],
 'D012220': ['UMLS/C0035455'],
 'D001249': ['UMLS/C0004096'],
 'D054058': ['UMLS/C0948089'],
 'D004617': ['UMLS/C0013922'],
 'D000402': ['UMLS/C0001883'],
 'D013927': ['UMLS/C0040053'],
 'D019964': ['UMLS/C0525045'],
 'D004827': ['UMLS/C0014544', 'UMLS/C0236018'],
 'D001714': ['UMLS/C0005586'],
 'D015228': ['UMLS/C0020557'],
 'D003922': ['UMLS/C0011854'],
 'D003866': ['UMLS/C0011581'],
 'D003141': ['UMLS/C0009450'],
 'D001419': ['UMLS/C0004611'],
 'D050197': ['UMLS/C0004153'],
 'D050723': ['UMLS/C0016658'],
 'D010024': ['UMLS/C0029456'],
 'D010300': ['UMLS/C0030567'],
 'D012148': ['UMLS/C0035258'],
 'D000544': ['UMLS/C0002395'],
 'D003924': ['UMLS/C0011860'],
 'D008881': ['UMLS/C0149931'],
 'D012893': ['UMLS/C0851578'],
 'D014917': ['UMLS/C0043167'],
 'D004165': ['UMLS/C00

In [16]:
content = "from\tto\n"

for h in umls_to_msh_condition:
    content += h[0] + "\t" + f"MSH/{h[1]}" + "\n"


with open("neo4j/condition_is_msh.tsv", "w") as f:
    f.write(content)

In [17]:
content = "MSH\tname\n"
for m in msh_name:
    content += m + "\t" + msh_name[m] + "\n"

with open("neo4j/condition_msh_name.tsv", "w") as f:
    f.write(content)

In [18]:
msh_medicine = {}

conditions = pd.read_csv("neo4j/medicines.tsv", sep="\t")

msh_name_medicine = {}
for i, row in tqdm(conditions.iterrows()):

    umls = row["UMLS"]

    if str(row["MSH"]) != "nan":
        mshs = str(row["MSH"]).split("|")
    
        for msh in mshs:
            m = msh.split("/")[1]
            if m not in msh_medicine:
                msh_medicine[m] = []
            msh_medicine[m].append(umls)

            msh_name_medicine[m] = row["name"]


254it [00:00, 15647.40it/s]


In [19]:
print (len(msh_medicine))

msh_isa = {}

umls_to_msh_medicine = set()

for msh_id in tqdm(msh_medicine):

    #print (msh_id)
    results = umls_api.get_all_items("MSH", msh_id, "includeAdditionalRelationLabels=isa", umls_token)

    if msh_id not in msh_isa:
        msh_isa[msh_id] = {}
    for r in results:
        msh_isa[msh_id][r[0]] = r[1]
    
    for u in msh_medicine[msh_id]:
        umls_to_msh_medicine.add((u, msh_id))

219


100%|██████████| 219/219 [03:37<00:00,  1.01it/s]


In [20]:
node_content = "MSH\tname\n"

edge_content = "from\tto\n"

for m in msh_isa:
    node_content += f"MSH/{m}" + "\t" + msh_name_medicine[m] + "\n"

    for is_a in msh_isa[m]:
        node_content +=  f"MSH/{is_a}" + "\t" + msh_isa[m][is_a] + "\n"
        edge_content += f"MSH/{m}" + "\t" + f"MSH/{is_a}" + "\n"

print (node_content)
#print (edge_content)

with open("neo4j/medicine_msh_name.tsv", "w") as f:
    f.write(node_content)

with open("neo4j/medicine_msh_is_a.tsv", "w") as f:
    f.write(edge_content)


MSH	name
MSH/C511911	belimumab
MSH/D007166	Immunosuppressive Agents
MSH/C055085	calcipotriene
MSH/D003879	Dermatologic Agents
MSH/C466951	benzoyl peroxide / clindamycin
MSH/D000077261	carvedilol
MSH/D058668	Adrenergic alpha-1 Receptor Antagonists
MSH/D002121	Calcium Channel Blockers
MSH/D000959	Antihypertensive Agents
MSH/D000975	Antioxidants
MSH/D014665	Vasodilator Agents
MSH/D000319	Adrenergic beta-Antagonists
MSH/D002990	clobetasol
MSH/D005938	Glucocorticoids
MSH/D000893	Anti-Inflammatory Agents
MSH/C523187	fluticasone furoate
MSH/D000068298	fluticasone
MSH/D003879	Dermatologic Agents
MSH/D018926	Anti-Allergic Agents
MSH/D001993	Bronchodilator Agents
MSH/D000893	Anti-Inflammatory Agents
MSH/D000068297	Fluticasone Propionate, Salmeterol Xinafoate Drug Combination
MSH/D005938	Glucocorticoids
MSH/D001993	Bronchodilator Agents
MSH/D013566	Sympathomimetics
MSH/D000077425	fondaparinux sodium
MSH/D065427	Factor Xa Inhibitors
MSH/D000077213	lamotrigine
MSH/D026941	Sodium Channel Blockers
MS

In [21]:
content = "from\tto\n"

for h in umls_to_msh_medicine:
    content += h[0] + "\t" + f"MSH/{h[1]}" + "\n"


with open("neo4j/medicine_is_msh.tsv", "w") as f:
    f.write(content)