In [1]:
import json
import os
import re

import pandas as pd
import yaml
from tqdm import tqdm

import umls_api

In [2]:
# Load the notebook settings from config.yaml into PARAM.
# A malformed file is fatal: later cells index into PARAM, so the YAML error is
# re-raised with context instead of being printed and leaving PARAM undefined.
with open("config.yaml", "r", encoding="utf-8") as stream:
    try:
        PARAM = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        raise ValueError(f"config.yaml is not valid YAML: {exc}") from exc

In [3]:
# Token passed to every umls_api call in this notebook; read from the
# "umls_token" entry of config.yaml so it is not hard-coded here.
umls_token = PARAM["umls_token"]

In [4]:
# Directory that receives the node/relationship files written by later cells.
output_folder = "neo4j"
# Create it up front: open(..., 'w') does not create missing parent directories,
# so the first export would otherwise fail on a fresh checkout.
os.makedirs(output_folder, exist_ok=True)

In [5]:
# Load the first source table. Later cells read its salt_composition,
# drug_interactions and side_effects columns.
df = pd.read_csv('medicine_data.csv')
df.head()

Unnamed: 0,sub_category,product_name,salt_composition,product_price,product_manufactured,medicine_desc,side_effects,drug_interactions
0,Human Insulin Basal,Human Insulatard 40IU/ml Suspension for Injection,Insulin Isophane (40IU),₹133.93,Novo Nordisk India Pvt Ltd,Human Insulatard 40IU/ml Suspension for Inject...,"Hypoglycemia (low blood glucose level),Injecti...","{""drug"": [""Benazepril"", ""Captopril"", ""Enalapri..."
1,Human Insulin Basal,Insulin 40IU/ml Injection,Insulin Isophane (40IU),₹121.91,Sun Pharmaceutical Industries Ltd,Insulin 40IU/ml Injection is used to improve b...,"Hypoglycemia (low blood glucose level),Injecti...","{""drug"": [""Benazepril"", ""Captopril"", ""Enalapri..."
2,Human Insulin Basal,Huminsulin N 40IU/ml Injection,Insulin Isophane (40IU),₹133.45,Eli Lilly and Company India Pvt Ltd,Huminsulin N 40IU/ml Injection is used to impr...,"Hypoglycemia (low blood glucose level),Injecti...","{""drug"": [""Benazepril"", ""Captopril"", ""Enalapri..."
3,Human Insulin Basal,Insugen-N 40IU/ml Injection,Insulin Isophane (40IU),₹133.36,Biocon,Insugen-N 40IU/ml Injection is used to improve...,"Hypoglycemia (low blood glucose level),Injecti...","{""drug"": [""Benazepril"", ""Captopril"", ""Enalapri..."
4,Human Insulin Basal,Insulatard 100IU/ml Flexpen,Insulin Isophane (100IU/ml),₹401.03,Novo Nordisk India Pvt Ltd,Insulatard 100IU/ml Flexpen is used to improve...,"Hypoglycemia (low blood glucose level),Injecti...","{""drug"": [""Benazepril"", ""Captopril"", ""Enalapri..."


In [6]:
# Collect the drugs that take part in at least one interaction and the
# interaction edges between them.
#   drugs     -> set of drug names (both endpoints of every interaction)
#   relations -> set of (name_a, name_b, effect) tuples with name_a <= name_b
drugs = set()
relations = set()

for _, row in df.iterrows():
    # Drop the parenthesised part of the composition, e.g. "Insulin Isophane (40IU)".
    # Raw string: "\(" in a plain literal is an invalid escape sequence.
    drug = re.sub(r'\(.+?\)', '', row['salt_composition']).strip()
    drug_interactions = json.loads(row['drug_interactions'])

    interactive_drugs = drug_interactions["drug"]
    interaction = drug_interactions["effect"]

    # The two lists are paired positionally; zip stops at the shorter one.
    for other, effect in zip(interactive_drugs, interaction):
        other = other.strip()

        drugs.add(other)
        drugs.add(drug)

        # Order the endpoints so that (A, B) and (B, A) collapse into one edge.
        smaller = min(drug, other)
        bigger = max(drug, other)

        relations.add((smaller, bigger, effect.strip()))
   


In [7]:
# Dump the drug names as a single-column table with a "name" header row.
content = "name\n" + "".join(name + '\n' for name in drugs)

with open('drugs.tsv', 'w') as f:
    f.write(content)

In [8]:
# Export the interaction edges as a tab-separated from/to/effect table.
edge_lines = (
    "\t".join((source, target, effect)) + '\n'
    for source, target, effect in relations
)
content = "from\tto\teffect\n" + "".join(edge_lines)

with open(f'{output_folder}/INTERACTS_WITH.tsv', 'w') as f:
    f.write(content)

In [9]:
# drug name -> {'side_effects': set of side-effect phrases}, built from
# medicine_data.csv for the drugs that appear in `drugs`.
drug_details = {}

# Spelling found in this table -> spelling to store instead.
typos = {"Tremor": "Tremors"}

for _, row in df.iterrows():
    # Raw string: "\(" in a plain literal is an invalid escape sequence.
    drug = re.sub(r'\(.+?\)', '', row['salt_composition']).strip()

    if drug in drugs:
        # Split on commas that are not inside parentheses, so an entry such as
        # "Flushing (sense of warmth in the face, ears, neck and trunk)" stays whole.
        effects = {x.strip() for x in re.split(r",(?![^(]*\))", row['side_effects'])}

        for wrong, right in typos.items():
            if wrong in effects:
                effects.remove(wrong)
                effects.add(right)

        # Several rows can share one salt composition. Merge their side effects
        # instead of letting the last row overwrite the earlier ones.
        drug_details.setdefault(drug, {'side_effects': set()})['side_effects'] |= effects


In [10]:
# Load the second source table. Later cells read its Composition and
# Side_effects columns.
df_11100 = pd.read_csv('Medicine_Details.csv')
df_11100.head()

Unnamed: 0,Medicine Name,Composition,Uses,Side_effects,Image URL,Manufacturer,Excellent Review %,Average Review %,Poor Review %
0,Avastin 400mg Injection,Bevacizumab (400mg),Cancer of colon and rectum Non-small cell lun...,Rectal bleeding Taste change Headache Noseblee...,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Roche Products India Pvt Ltd,22,56,22
1,Augmentin 625 Duo Tablet,Amoxycillin (500mg) + Clavulanic Acid (125mg),Treatment of Bacterial infections,Vomiting Nausea Diarrhea Mucocutaneous candidi...,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Glaxo SmithKline Pharmaceuticals Ltd,47,35,18
2,Azithral 500 Tablet,Azithromycin (500mg),Treatment of Bacterial infections,Nausea Abdominal pain Diarrhea,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Alembic Pharmaceuticals Ltd,39,40,21
3,Ascoril LS Syrup,Ambroxol (30mg/5ml) + Levosalbutamol (1mg/5ml)...,Treatment of Cough with mucus,Nausea Vomiting Diarrhea Upset stomach Stomach...,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Glenmark Pharmaceuticals Ltd,24,41,35
4,Aciloc 150 Tablet,Ranitidine (150mg),Treatment of Gastroesophageal reflux disease (...,Headache Diarrhea Gastrointestinal disturbance,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Cadila Pharmaceuticals Ltd,34,37,29


In [11]:
# Preview: print the cleaned composition and the parsed side effects for the
# first 9 rows, to check the parsing before running it on the whole table.
for _, row in df_11100.head(9).iterrows():
    # Raw string: "\(" in a plain literal is an invalid escape sequence.
    drug = re.sub(r'\(.+?\)', '', row['Composition']).strip()

    # Side_effects is one string with no delimiter between phrases; each phrase
    # is taken to run from one capital letter up to the next.
    Side_effects = [x.strip() for x in re.findall(r"[A-Z][^A-Z]*", row['Side_effects'])]

    print(drug, Side_effects)

Bevacizumab ['Rectal bleeding', 'Taste change', 'Headache', 'Nosebleeds', 'Back pain', 'Dry skin', 'High blood pressure', 'Protein in urine', 'Inflammation of the nose']
Amoxycillin   +  Clavulanic Acid ['Vomiting', 'Nausea', 'Diarrhea', 'Mucocutaneous candidiasis']
Azithromycin ['Nausea', 'Abdominal pain', 'Diarrhea']
Ambroxol  + Levosalbutamol  + Guaifenesin ['Nausea', 'Vomiting', 'Diarrhea', 'Upset stomach', 'Stomach pain', 'Allergic reaction', 'Dizziness', 'Headache', 'Rash', 'Hives', 'Tremors', 'Palpitations', 'Muscle cramp', 'Increased heart rate']
Ranitidine ['Headache', 'Diarrhea', 'Gastrointestinal disturbance']
Fexofenadine ['Headache', 'Drowsiness', 'Dizziness', 'Nausea']
Pheniramine ['Sedation']
Donepezil ['Common cold', 'Urinary incontinence', 'Rash', 'Nausea', 'Diarrhea', 'Insomnia difficulty in sleeping', 'Weight loss', 'Accidental injury']
Amoxycillin   +  Clavulanic Acid ['Vomiting', 'Nausea', 'Diarrhea', 'Mucocutaneous candidiasis']


In [12]:
# Phrases containing spaced-out capital letters would be cut apart by the
# capital-letter split used on Side_effects; map each one to a form that has a
# single leading capital. Later cells extend this dict.
typos = dict(
    [
        ("Abnormal E C G", "Abnormal ecg"),
        ("Increased creatine phosphokinase C P K", "Increased creatine phosphokinase cpk"),
        ("Increased L D L cholesterol level in blood", "Increased ldl cholesterol level in blood"),
    ]
)

In [13]:


# Extend `typos` by comparing the two tables: when a phrase already stored in
# drug_details and a phrase parsed from Medicine_Details.csv consist of exactly
# the same words (they differ only in punctuation such as parentheses or
# commas), map the newly parsed spelling to the stored one.
for _, row in df_11100.iterrows():
    # Raw string: "\(" in a plain literal is an invalid escape sequence.
    drug = re.sub(r'\(.+?\)', '', row['Composition']).strip()

    if drug in drugs:
        effect_str = row['Side_effects']

        # Apply every replacement known so far, including the ones learned from
        # earlier rows of this loop.
        for wrong, right in typos.items():
            effect_str = effect_str.replace(wrong, right)

        new_effects = {x.strip() for x in re.findall(r"[A-Z][^A-Z]*", effect_str)}

        if drug in drug_details:
            # Word set of every parsed phrase, computed once per row instead of
            # once per (old phrase, new phrase) pair.
            new_word_sets = [
                (phrase, set(re.findall(r'\b\w+\b', phrase))) for phrase in new_effects
            ]

            for old_phrase in drug_details[drug]["side_effects"]:
                if old_phrase not in new_effects:
                    old_words = set(re.findall(r'\b\w+\b', old_phrase))

                    for phrase, words in new_word_sets:
                        if old_words == words:
                            typos[phrase] = old_phrase


print(typos)

{'Abnormal E C G': 'Abnormal ecg', 'Increased creatine phosphokinase C P K': 'Increased creatine phosphokinase cpk', 'Increased L D L cholesterol level in blood': 'Increased ldl cholesterol level in blood', 'Insomnia difficulty in sleeping': 'Insomnia (difficulty in sleeping)', 'Orthostatic hypotension sudden lowering of blood pressure on standing': 'Orthostatic hypotension (sudden lowering of blood pressure on standing)', 'Nasal congestion stuffy nose': 'Nasal congestion (stuffy nose)', 'Paresthesia tingling or pricking sensation': 'Paresthesia (tingling or pricking sensation)', 'Hypoglycemia low blood glucose level': 'Hypoglycemia (low blood glucose level)', 'Balance disorder loss of balance': 'Balance disorder (loss of balance)', 'Akathisia inability to stay still': 'Akathisia (inability to stay still)', 'Flushing sense of warmth in the face ears neck and trunk': 'Flushing (sense of warmth in the face, ears, neck and trunk)', 'Arrhythmia irregular heartbeats': 'Arrhythmia (irregular

In [14]:


# Merge the side effects parsed from Medicine_Details.csv into drug_details,
# then write one JSON record per interacting drug to drug_details_3.jsonl.
for idx, row in df_11100.iterrows():
    # Raw string: "\(" in a plain literal is an invalid escape sequence.
    drug = re.sub(r'\(.+?\)', '', row['Composition']).strip()

    if drug in drugs:
        effect_str = row['Side_effects']

        # Apply the replacements first so the capital-letter split below sees
        # the corrected spellings.
        for wrong, right in typos.items():
            effect_str = effect_str.replace(wrong, right)

        effects = {x.strip() for x in re.findall(r"[A-Z][^A-Z]*", effect_str)}

        # Very short fragments suggest the split broke a phrase apart; print
        # the row so it can be inspected.
        for e in effects:
            if len(e) < 3:
                print(idx, drug, effect_str)

        drug_details.setdefault(drug, {'side_effects': set()})["side_effects"] |= effects

no_effects_marker = "No common side effects seen"
lines = []

# Sort drugs and side effects so the file is identical from run to run; plain
# set iteration order changes with string hash randomisation.
for drug in sorted(drugs):
    if drug in drug_details:
        side_effects = drug_details[drug]['side_effects']
        # The marker is only meaningful when it is the sole entry.
        if no_effects_marker in side_effects and len(side_effects) > 1:
            side_effects.remove(no_effects_marker)
        lines.append(json.dumps({"name": drug, "side_effects": sorted(side_effects)}) + '\n')
    else:
        lines.append(json.dumps({"name": drug, "side_effects": []}) + "\n")

content = "".join(lines)

with open('drug_details_3.jsonl', 'w') as f:
    f.write(content)

In [28]:
# Accumulators filled by the UMLS lookup cell that follows.
# drug name -> {"side_effects", "UMLS_ID", "RXNORM_id"}
drug_id = {}

# concept ID -> concept name; each set holds (drug name, concept ID) pairs
conditions = {}
may_treat = set()
may_prevent = set()
contraindicated_with_disease = set()

# concept ID -> name for mechanism-of-action results, plus (drug name, ID) pairs
actions = {}
has_mechanism_of_action = set()

# concept ID -> name for structural-class results, plus (drug name, ID) pairs
structures = {}
has_structural_class = set()

# concept ID -> name for therapeutic-class results, plus (drug name, ID) pairs
therapeutics = {}
has_therapeutic_class = set()

# Drug names already processed by the lookup cell; it skips these.
done = []

In [29]:


# For every drug in drug_details_3.jsonl, look up its UMLS and RXNORM
# identifiers and, when both exist, fetch six kinds of related concepts.
#
# Each entry: (relation label sent to the API,
#              dict to fill with concept ID -> concept name,
#              set to fill with (drug name, concept ID) pairs).
# Assumes each item returned by get_all_items is an (ID, name) pair, as the
# export cells treat it that way - TODO confirm against umls_api.
relation_targets = (
    ("may_treat", conditions, may_treat),
    ("may_prevent", conditions, may_prevent),
    ("contraindicated_with_disease", conditions, contraindicated_with_disease),
    ("has_mechanism_of_action", actions, has_mechanism_of_action),
    ("has_structural_class", structures, has_structural_class),
    ("has_therapeutic_class", therapeutics, has_therapeutic_class),
)

with open('drug_details_3.jsonl', 'r') as f:
    for line in tqdm(f.readlines()):
        drug = json.loads(line)
        drug_name = drug['name']
        side_effects = drug['side_effects']

        # `done` lives in another cell, so it outlives a run of this one;
        # presumably this lets an interrupted run resume without repeating lookups.
        if drug_name in done:
            continue

        drug_id[drug_name] = {"side_effects": side_effects, "UMLS_ID": "", "RXNORM_id": ""}

        UMLS_ID = umls_api.search(drug_name, umls_token)
        if UMLS_ID:
            drug_id[drug_name]["UMLS_ID"] = UMLS_ID

            RXNORM_id = umls_api.get_all_id_pages("RXNORM", UMLS_ID, umls_token)
            if RXNORM_id:
                drug_id[drug_name]["RXNORM_id"] = RXNORM_id

                for label, nodes, edges in relation_targets:
                    results = umls_api.get_all_items(
                        "RXNORM",
                        RXNORM_id,
                        f"includeAdditionalRelationLabels={label}",
                        umls_token,
                    )
                    for m in results:
                        nodes[m[0]] = m[1]
                        edges.add((drug_name, m[0]))

        done.append(drug_name)



  0%|          | 0/499 [00:00<?, ?it/s]

100%|██████████| 499/499 [58:52<00:00,  7.08s/it] 


In [41]:
# Write the enriched drug records (name, identifiers, side effects) as JSON
# lines, one drug per line, in drug_id insertion order.
records = (
    {
        "name": name,
        "UMLS_ID": info["UMLS_ID"],
        "RXNORM_id": info["RXNORM_id"],
        "side_effects": info["side_effects"],
    }
    for name, info in drug_id.items()
)
content = "".join(json.dumps(record) + '\n' for record in records)

with open('drug_details_4.jsonl', 'w') as f:
    f.write(content)

In [42]:
# Export the condition nodes as JSON lines.
content = "".join(
    json.dumps({"name": conditions[concept_id], "ID": concept_id}) + '\n'
    for concept_id in conditions
)

with open(f'{output_folder}/condition.jsonl', 'w') as f:
    f.write(content)

# Export the three drug -> condition relationships, each as a from/to TSV.
condition_edge_files = (
    (f'{output_folder}/MAY_TREAT.tsv', may_treat),
    (f'{output_folder}/MAY_PREVENT.tsv', may_prevent),
    (f'{output_folder}/CONTRAINDICATED_WITH_DISEASE.tsv', contraindicated_with_disease),
)

for path, edges in condition_edge_files:
    content = "from\tto\n" + "".join(
        "\t".join((source, target)) + '\n' for source, target in edges
    )

    with open(path, 'w') as f:
        f.write(content)


In [43]:
# Export the structural-class and therapeutic-class nodes (JSON lines) and the
# drug -> class relationships (from/to TSV). For each kind the node file is
# written first, then its relationship file.
class_exports = (
    (
        structures,
        f'{output_folder}/structures.jsonl',
        has_structural_class,
        f'{output_folder}/HAS_STRUCTURAL_CLASS.tsv',
    ),
    (
        therapeutics,
        f'{output_folder}/therapeutics.jsonl',
        has_therapeutic_class,
        f'{output_folder}/HAS_THERAPEUTIC_CLASS.tsv',
    ),
)

for nodes, node_path, edges, edge_path in class_exports:
    content = "".join(
        json.dumps({"name": nodes[concept_id], "ID": concept_id}) + '\n'
        for concept_id in nodes
    )

    with open(node_path, 'w') as f:
        f.write(content)

    content = "from\tto\n" + "".join(
        "\t".join((source, target)) + '\n' for source, target in edges
    )

    with open(edge_path, 'w') as f:
        f.write(content)

In [44]:
# Export the mechanism-of-action nodes as JSON lines.
node_lines = [
    json.dumps({"name": actions[concept_id], "ID": concept_id}) + '\n'
    for concept_id in actions
]
content = "".join(node_lines)

with open(f'{output_folder}/actions.jsonl', 'w') as f:
    f.write(content)

# Export the drug -> mechanism-of-action relationships as a from/to TSV.
edge_lines = [
    "\t".join((source, target)) + '\n' for source, target in has_mechanism_of_action
]
content = "from\tto\n" + "".join(edge_lines)

with open(f'{output_folder}/HAS_MECHANISM_OF_ACTION.tsv', 'w') as f:
    f.write(content)