In [1]:
import json
import umls_api
import os
import yaml
from tqdm import tqdm
import pandas as pd

In [2]:
with open("config.yaml", "r") as stream:
    try:
        PARAM = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)

In [3]:
umls_token = PARAM["umls_token"]

In [4]:
hpo_condition = {}
msh_condition = {}

msh_name = {}
hpo_name = {}

conditions = pd.read_csv("neo4j/conditions.tsv", sep="\t")



for i, row in tqdm(conditions.iterrows()):


    condition = row["name"]
    umls = row["UMLS"]
    
    if str(row["HPO"]) != "nan":
        hpos = str(row["HPO"]).split("|")

        for hpo in hpos:
            h = hpo.split("/")[1]
            if h not in hpo_condition:
                hpo_condition[h] = []
            hpo_condition[h].append(umls)

            hpo_name[hpo] = condition

    if str(row["MSH"]) != "nan":
        mshs = str(row["MSH"]).split("|")
    
        for msh in mshs:
            m = msh.split("/")[1]
            if m not in msh_condition:
                msh_condition[m] = []
            msh_condition[m].append(umls)

            msh_name[msh] = condition

print (len(hpo_name))


279it [00:00, 15635.75it/s]

144





In [5]:
hpo_condition

{'HP:0002725': ['UMLS/C0024141'],
 'HP:0003765': ['UMLS/C0033860'],
 'HP:0000822': ['UMLS/C0020538'],
 'HP:0002013': ['UMLS/C0042963'],
 'HP:0011123': ['UMLS/C0011603'],
 'HP:0006510': ['UMLS/C0024117'],
 'HP:0012384': ['UMLS/C0035455'],
 'HP:0002099': ['UMLS/C0004096'],
 'HP:0033678': ['UMLS/C0948089'],
 'HP:0002204': ['UMLS/C0034065'],
 'HP:0001250': ['UMLS/C0014544', 'UMLS/C0036572'],
 'HP:0007302': ['UMLS/C0005586'],
 'HP:0100651': ['UMLS/C0011854'],
 'HP:0000716': ['UMLS/C0011581'],
 'HP:0005406': ['UMLS/C1835686'],
 'HP:0002621': ['UMLS/C0004153'],
 'HP:0002635': ['UMLS/C0004153'],
 'HP:0020110': ['UMLS/C0016658'],
 'HP:0000939': ['UMLS/C0029456'],
 'HP:0012452': ['UMLS/C0035258'],
 'HP:0002511': ['UMLS/C0002395'],
 'HP:0005978': ['UMLS/C0011860'],
 'HP:0002076': ['UMLS/C0149931'],
 'HP:0001635': ['UMLS/C0018802', 'UMLS/C0018801'],
 'HP:0100787': ['UMLS/C0033578'],
 'HP:0025428': ['UMLS/C0006266'],
 'HP:0001370': ['UMLS/C0003873'],
 'HP:0100582': ['UMLS/C0027430'],
 'HP:0011947':

In [6]:
msh_condition

{'D000152': ['UMLS/C0001144'],
 'D008180': ['UMLS/C0024141'],
 'D011565': ['UMLS/C0033860'],
 'D006973': ['UMLS/C0020538'],
 'D014839': ['UMLS/C0042963'],
 'D003872': ['UMLS/C0011603'],
 'D029424': ['UMLS/C0024117'],
 'D012220': ['UMLS/C0035455'],
 'D001249': ['UMLS/C0004096'],
 'D054058': ['UMLS/C0948089'],
 'D011655': ['UMLS/C0034065'],
 'D013927': ['UMLS/C0040053'],
 'D019964': ['UMLS/C0525045'],
 'D004827': ['UMLS/C0014544', 'UMLS/C0236018'],
 'D001714': ['UMLS/C0005586'],
 'D015228': ['UMLS/C0020557'],
 'D003922': ['UMLS/C0011854'],
 'D003866': ['UMLS/C0011581', 'UMLS/C0041696'],
 'D001424': ['UMLS/C0004623'],
 'D050197': ['UMLS/C0004153'],
 'D050723': ['UMLS/C0016658'],
 'D010024': ['UMLS/C0029456'],
 'D010300': ['UMLS/C0030567'],
 'D012148': ['UMLS/C0035258'],
 'D000544': ['UMLS/C0002395'],
 'D003924': ['UMLS/C0011860'],
 'D008881': ['UMLS/C0149931'],
 'D012893': ['UMLS/C0851578'],
 'D030361': ['UMLS/C0343641'],
 'D013290': ['UMLS/C0038395'],
 'D014917': ['UMLS/C0043167'],
 'D00

In [7]:

hpo_from_to = []

umls_to_hpo = set()


print (len(hpo_condition))
for hpo in tqdm(hpo_condition):

    results = umls_api.recursive_get_parent_HPO(hpo, umls_token)

    for r in results:
        hpo_from_to.append((r[0], r[1]))

        if r[1] not in hpo_condition:
            hpo_name[f"HPO/{r[1]}"] = r[2]

    for u in hpo_condition[hpo]:
        umls_to_hpo.add((u, hpo))

print (len(hpo_name))

144


100%|██████████| 144/144 [06:33<00:00,  2.73s/it]

405





In [8]:
umls_to_hpo

{('UMLS/C0002170', 'HP:0001596'),
 ('UMLS/C0002170', 'HP:0002293'),
 ('UMLS/C0002395', 'HP:0002511'),
 ('UMLS/C0002453', 'HP:0000141'),
 ('UMLS/C0002726', 'HP:0011034'),
 ('UMLS/C0002871', 'HP:0001903'),
 ('UMLS/C0003811', 'HP:0011675'),
 ('UMLS/C0003873', 'HP:0001370'),
 ('UMLS/C0004096', 'HP:0002099'),
 ('UMLS/C0004153', 'HP:0002621'),
 ('UMLS/C0004153', 'HP:0002635'),
 ('UMLS/C0004238', 'HP:0005110'),
 ('UMLS/C0004364', 'HP:0002960'),
 ('UMLS/C0005586', 'HP:0007302'),
 ('UMLS/C0006266', 'HP:0025428'),
 ('UMLS/C0006267', 'HP:0002110'),
 ('UMLS/C0006277', 'HP:0012387'),
 ('UMLS/C0006625', 'HP:0004326'),
 ('UMLS/C0006826', 'HP:0002664'),
 ('UMLS/C0007095', 'HP:0100570'),
 ('UMLS/C0007097', 'HP:0030731'),
 ('UMLS/C0007131', 'HP:0030358'),
 ('UMLS/C0007134', 'HP:0005584'),
 ('UMLS/C0007222', 'HP:0001626'),
 ('UMLS/C0007787', 'HP:0002326'),
 ('UMLS/C0008370', 'HP:0001396'),
 ('UMLS/C0008372', 'HP:0001406'),
 ('UMLS/C0009319', 'HP:0002583'),
 ('UMLS/C0009404', 'HP:0100834'),
 ('UMLS/C00097

In [9]:
content = "from\tto\n"

for h in hpo_from_to:
    content += f"HPO/{h[0]}" + "\t" + f"HPO/{h[1]}" + "\n"


with open("neo4j/hpo_is_a.tsv", "w") as f:
    f.write(content)

In [10]:
content = "from\tto\n"

for h in umls_to_hpo:
    content += h[0] + "\t" + f"HPO/{h[1]}" + "\n"


with open("neo4j/condition_is_hpo.tsv", "w") as f:
    f.write(content)

In [11]:
content = "HPO\tname\n"
for h in hpo_name:
    content += h + "\t" + hpo_name[h] + "\n"

with open("neo4j/condition_hpo_name.tsv", "w") as f:
    f.write(content)

In [12]:
msh_condition

{'D000152': ['UMLS/C0001144'],
 'D008180': ['UMLS/C0024141'],
 'D011565': ['UMLS/C0033860'],
 'D006973': ['UMLS/C0020538'],
 'D014839': ['UMLS/C0042963'],
 'D003872': ['UMLS/C0011603'],
 'D029424': ['UMLS/C0024117'],
 'D012220': ['UMLS/C0035455'],
 'D001249': ['UMLS/C0004096'],
 'D054058': ['UMLS/C0948089'],
 'D011655': ['UMLS/C0034065'],
 'D013927': ['UMLS/C0040053'],
 'D019964': ['UMLS/C0525045'],
 'D004827': ['UMLS/C0014544', 'UMLS/C0236018'],
 'D001714': ['UMLS/C0005586'],
 'D015228': ['UMLS/C0020557'],
 'D003922': ['UMLS/C0011854'],
 'D003866': ['UMLS/C0011581', 'UMLS/C0041696'],
 'D001424': ['UMLS/C0004623'],
 'D050197': ['UMLS/C0004153'],
 'D050723': ['UMLS/C0016658'],
 'D010024': ['UMLS/C0029456'],
 'D010300': ['UMLS/C0030567'],
 'D012148': ['UMLS/C0035258'],
 'D000544': ['UMLS/C0002395'],
 'D003924': ['UMLS/C0011860'],
 'D008881': ['UMLS/C0149931'],
 'D012893': ['UMLS/C0851578'],
 'D030361': ['UMLS/C0343641'],
 'D013290': ['UMLS/C0038395'],
 'D014917': ['UMLS/C0043167'],
 'D00

In [13]:
print (len(msh_condition))

msh_treated = {}

umls_to_msh_condition = set()

for msh_id in tqdm(msh_condition):

    #print (msh_id)
    results = umls_api.get_all_items("MSH", msh_id, "includeAdditionalRelationLabels=may_be_prevented_by", umls_token)

    if msh_id not in msh_treated:
        msh_treated[msh_id] = set()
    for r in results:
        msh_treated[msh_id].add(r[1])
    
    for u in msh_condition[msh_id]:
        umls_to_msh_condition.add((u, msh_id))

240


100%|██████████| 240/240 [04:21<00:00,  1.09s/it]


In [14]:
content = "MSH\tmay_be_treated_by\n"
for m in msh_treated:
    content += f"MSH/{m}" + "\t" + "|".join(msh_treated[m]) + "\n"

with open("neo4j/msh_may_be_treated_by.tsv", "w") as f:
    f.write(content)

In [15]:
msh_condition

{'D000152': ['UMLS/C0001144'],
 'D008180': ['UMLS/C0024141'],
 'D011565': ['UMLS/C0033860'],
 'D006973': ['UMLS/C0020538'],
 'D014839': ['UMLS/C0042963'],
 'D003872': ['UMLS/C0011603'],
 'D029424': ['UMLS/C0024117'],
 'D012220': ['UMLS/C0035455'],
 'D001249': ['UMLS/C0004096'],
 'D054058': ['UMLS/C0948089'],
 'D011655': ['UMLS/C0034065'],
 'D013927': ['UMLS/C0040053'],
 'D019964': ['UMLS/C0525045'],
 'D004827': ['UMLS/C0014544', 'UMLS/C0236018'],
 'D001714': ['UMLS/C0005586'],
 'D015228': ['UMLS/C0020557'],
 'D003922': ['UMLS/C0011854'],
 'D003866': ['UMLS/C0011581', 'UMLS/C0041696'],
 'D001424': ['UMLS/C0004623'],
 'D050197': ['UMLS/C0004153'],
 'D050723': ['UMLS/C0016658'],
 'D010024': ['UMLS/C0029456'],
 'D010300': ['UMLS/C0030567'],
 'D012148': ['UMLS/C0035258'],
 'D000544': ['UMLS/C0002395'],
 'D003924': ['UMLS/C0011860'],
 'D008881': ['UMLS/C0149931'],
 'D012893': ['UMLS/C0851578'],
 'D030361': ['UMLS/C0343641'],
 'D013290': ['UMLS/C0038395'],
 'D014917': ['UMLS/C0043167'],
 'D00

In [16]:
content = "from\tto\n"

for h in umls_to_msh_condition:
    content += h[0] + "\t" + f"MSH/{h[1]}" + "\n"


with open("neo4j/condition_is_msh.tsv", "w") as f:
    f.write(content)

In [17]:
content = "MSH\tname\n"
for m in msh_name:
    content += m + "\t" + msh_name[m] + "\n"

with open("neo4j/condition_msh_name.tsv", "w") as f:
    f.write(content)

In [18]:
msh_medicine = {}

conditions = pd.read_csv("neo4j/medicines.tsv", sep="\t")

msh_name_medicine = {}
for i, row in tqdm(conditions.iterrows()):

    umls = row["UMLS"]

    if str(row["MSH"]) != "nan":
        mshs = str(row["MSH"]).split("|")
    
        for msh in mshs:
            m = msh.split("/")[1]
            if m not in msh_medicine:
                msh_medicine[m] = []
            msh_medicine[m].append(umls)

            msh_name_medicine[m] = row["name"]


254it [00:00, 15527.67it/s]


In [19]:
msh_mapped_to = {}
valid_gsk = {}

content = "name\tUMLS\tMSH\tHPO\tRXNORM\tother_name\n"

gsk_is_a = set()

lines = open("cryptic_medicines.txt").readlines()

for line in tqdm(lines):
    gsk = line.strip()
    umls = umls_api.search(gsk, umls_token)

    if umls != "" and umls != None:
        msh = umls_api.get_MSH(umls, umls_token)
        name = ""
        
        if msh != "" and msh != None:
            msh_name_medicine[msh] = gsk

            name = umls_api.get_name(msh, umls_token)
            is_mapped_to = umls_api.get_all_items("MSH", msh, "includeAdditionalRelationLabels=mapped_to", umls_token)

            if msh not in msh_mapped_to:
                msh_mapped_to[msh] = []
            for m in is_mapped_to:
                msh_mapped_to[msh].append(m[0])
                msh_name_medicine[m[0]] = m[1]
            
            gsk_is_a.add((f"UMLS/{umls}", f"MSH/{msh}"))
        
        else:
            msh = ""


        content += f"{gsk}\tUMLS/{umls}\tMSH/{msh}\t\t\t{name}\n"

        valid_gsk[gsk] = umls
   
with open("neo4j/medicines_cryptic.tsv", "w") as f:
    f.write(content)

content = "from\tto\n"
with open("neo4j/gsk_is_a.tsv", "w") as f:
    for g in gsk_is_a:
        content += g[0] + "\t" + g[1] + "\n"
    f.write(content)

100%|██████████| 196/196 [07:06<00:00,  2.18s/it]


In [20]:
trial_cryptic_drugs = pd.read_csv("trial_cryptic_drugs.tsv", sep="\t")

content = "from\tto\n"

for i, row in tqdm(trial_cryptic_drugs.iterrows()):
    gsk = row["to"]
    trial = row["from"]

    if gsk in valid_gsk:
        content += f"{trial}\tUMLS/{valid_gsk[gsk]}\n"


with open("neo4j/trial_cryptic_drugs.tsv", "w") as f:
    f.write(content)

370it [00:00, 6411.16it/s]


In [21]:
msh_mapped_to

{'C549469': ['D013450', 'D000634'],
 'C581038': ['D013256'],
 'C512301': ['D014508', 'D011759'],
 'C000625977': ['D007211', 'D010069'],
 'C576936': ['D013449', 'D000420'],
 'D000077185': [],
 'C000593393': ['D001549', 'D008782'],
 'C558853': ['D010793', 'D010880', 'D010793', 'D010880'],
 'C558852': ['D009281', 'D010880', 'D009281', 'D010880'],
 'C562323': ['D001549', 'D011759'],
 'C561922': ['D001549'],
 'C000608195': ['D061067'],
 'C587902': ['D010069', 'D008698'],
 'C583947': ['D006877', 'D019086'],
 'C557997': ['D053961'],
 'C000604998': ['D011761'],
 'C568078': ['D007189', 'D014230'],
 'C587791': ['D007093', 'D011804'],
 'C000594852': ['D011993'],
 'C584220': ['D001552'],
 'C588298': ['D001897', 'D013449'],
 'C000604188': ['D014970'],
 'C000590132': ['D061067'],
 'C561573': ['D001562', 'D013876'],
 'C000594007': ['D053961', 'D010080'],
 'C582664': ['D061067'],
 'C409004': ['D013440'],
 'C545367': ['D014230', 'D000241'],
 'C000602898': ['D010649', 'D010880'],
 'C557781': ['D001572',

In [22]:
msh_name_medicine

{'C511911': 'belimumab',
 'C055085': 'calcipotriene',
 'C466951': 'benzoyl peroxide / clindamycin',
 'D000077261': 'carvedilol',
 'D002990': 'clobetasol',
 'C523187': 'fluticasone furoate',
 'D000068298': 'fluticasone',
 'D000068297': 'Fluticasone Propionate, Salmeterol Xinafoate Drug Combination',
 'D000077425': 'fondaparinux sodium',
 'D000077213': 'lamotrigine',
 'C550701': 'otelixizumab',
 'D017374': 'paroxetine',
 'C508887': 'retapamulin',
 'C101866': 'ezogabine',
 'C000592856': 'rilapladib',
 'C561716': 'ronacaleret',
 'C046649': 'ropinirole',
 'D000077154': 'rosiglitazone',
 'C000611385': 'naproxen / sumatriptan',
 'C086827': 'tazarotene',
 'C534550': 'vestipitant',
 'C510352': 'human papillomavirus vaccine, L1 type 16, 18',
 'D022242': 'Polyvalent pneumococcal vaccine',
 'D004166': 'diphtheria antitoxin',
 'D013745': 'tetanus toxoid vaccine, inactivated',
 'D011055': 'Oral Poliovirus Vaccine',
 'C044237': 'Haemophilus influenzae type b polysaccharide vaccine',
 'C092872': 'stre

In [23]:
print (len(msh_medicine))

msh_isa = {}


umls_to_msh_medicine = set()

for msh_id in tqdm(msh_medicine):

    #print (msh_id)
    results = umls_api.get_all_items("MSH", msh_id, "includeAdditionalRelationLabels=isa", umls_token)

    if msh_id not in msh_isa:
        msh_isa[msh_id] = []
    for r in results:
        msh_isa[msh_id].append(r[0])

        msh_name_medicine[r[0]] = r[1]


    results = umls_api.get_all_items("MSH", msh_id, "includeAdditionalRelationLabels=mapped_to", umls_token)

    if msh_id not in msh_mapped_to:
        msh_mapped_to[msh_id] = []
    for r in results:
        msh_mapped_to[msh_id].append(r[0])

        msh_name_medicine[r[0]] = r[1]
    
    for u in msh_medicine[msh_id]:
        umls_to_msh_medicine.add((u, msh_id))

219


100%|██████████| 219/219 [06:37<00:00,  1.82s/it]


In [24]:
edge_content = "from\tto\n"

for m in msh_isa:
    for is_a in msh_isa[m]:
        edge_content += f"MSH/{m}" + "\t" + f"MSH/{is_a}" + "\n"

#print (edge_content)

# with open("neo4j/medicine_msh_name.tsv", "w") as f:
#     f.write(node_content)

with open("neo4j/medicine_msh_is_a.tsv", "w") as f:
    f.write(edge_content)


edge_content = "from\tto\n"

for m in msh_mapped_to:
    for mapped_to in msh_mapped_to[m]:
        edge_content += f"MSH/{m}" + "\t" + f"MSH/{mapped_to}" + "\n"

with open("neo4j/medicine_msh_mapped_to.tsv", "w") as f:
    f.write(edge_content)


node_content = "MSH\tname\n"

for msh in msh_name_medicine:
    node_content += f"MSH/{msh}" + "\t" + msh_name_medicine[msh] + "\n"

with open("neo4j/medicine_msh_name.tsv", "w") as f:
    f.write(node_content)

In [25]:
content = "from\tto\n"

for h in umls_to_msh_medicine:
    content += h[0] + "\t" + f"MSH/{h[1]}" + "\n"


with open("neo4j/medicine_is_msh.tsv", "w") as f:
    f.write(content)