In [1]:
import pandas as pd
import numpy as np
import re
import json
import healthcare_nlp

from subprocess import check_output
from io import BytesIO

In [2]:
# Load the all-sponsor study listing (tab-separated export) into a DataFrame.
clinical_trials = pd.read_csv(
    'clinical_study_data_request_com_all_sponsor.tsv',
    delimiter="\t",
)

In [3]:
# Directory that receives the TSV files written by later cells
# (they open f"{output_folder}/<name>.tsv" for writing, so it must already exist).
output_folder = "neo4j"

In [4]:
# Preview the first rows to check the column names and value formats.
clinical_trials.head()

Unnamed: 0,Posting ID,Sponsor,Study Title,Medicine or Vaccine (generic name),Sponsor Identification Number,Medical Condition,Phase,Link to Sponsor Study Registry,Link to study details on ClinicalTrials.gov (if available),Link to study details on EudraCT (if available),Analysis-ready dataset,Annotated case report form,Clinical study report,Dataset specifications,Protocol with any amendments,Raw dataset,Reporting and analysis plan,Additional Information about the data and documents available for this study,Date Added to this Site
0,1,GSK,"U0289-401: An Evaluator Blinded, 8 Week, Split...",benzoyl peroxide/salicylic acid,STF114550,Acne Vulgaris,Phase 4,https://www.gsk-studyregister.com/en/trial-det...,http://clinicaltrials.gov/show/NCT01706250,,N,Y,Y,Y,N,Y,N,An analysis-ready dataset and a reporting and ...,1-Aug-2015
1,3,GSK,"GSK1550188, A randomised, single-blind, placeb...",belimumab,BEL114243,Systemic Lupus Erythematosus,Phase 1,https://www.gsk-studyregister.com/en/trial-det...,http://clinicaltrials.gov/show/NCT01381536,,N,Y,Y,Y,N,Y,Y,An analysis-ready dataset is not available for...,1-Jul-2015
2,4,GSK,"A Multicenter, Randomized, Double-Blind, Phase...","calcipotriol, calcipotriene",STF114741,Psoriasis,Phase 3,https://www.gsk-studyregister.com/en/trial-det...,http://clinicaltrials.gov/show/NCT00688519,,Y,Y,Y,Y,Y,Y,Y,,1-Jul-2015
3,8,GSK,Two-week Study to Determine and Compare the To...,benzoyl peroxide/clindamycin phosphate,STF114546,Acne Vulgaris,Phase 4,https://www.gsk-studyregister.com/en/trial-det...,http://clinicaltrials.gov/show/NCT01015638,,N,N,Y,Y,Y,Y,N,An analysis-ready dataset and reporting and an...,1-Aug-2014
4,9,GSK,"A single-blind, randomized, comparative split-...",benzoyl peroxide/clindamycin phosphate,STF114547,Acne Vulgaris,Phase 4,https://www.gsk-studyregister.com/en/trial-det...,http://clinicaltrials.gov/show/NCT00964223,,Y,Y,Y,Y,Y,N,Y,A raw dataset is not available for this study.,1-Aug-2014


In [5]:
# Number of distinct raw medicine/vaccine strings (a missing value counts as one entry).
clinical_trials["Medicine or Vaccine (generic name)"].nunique(dropna=False)


485

In [6]:
# Number of distinct raw medical-condition strings (a missing value counts as one entry).
clinical_trials["Medical Condition"].nunique(dropna=False)

402

In [7]:
# Hand-curated normalisation table for condition names.
#
# Keys are condition strings as they look after `.strip().lower().capitalize()`
# (the normalisation applied by the cells that consult this table); values are the
# replacement name to use instead. An empty-string value means "drop this term":
# the consuming cells skip entries whose mapped value is "".
#
# NOTE(review): the variable name contains a typo ("abnornaml"); it is referenced
# from several cells, so renaming it has to be done everywhere at once.
condition_abnornaml_list_mapping = {"Alzheimers disease": "Alzheimer's disease",
                                    "Dementia with parkinson’s disease": "Parkinson's disease dementia",
                                    "Dementia with lewy bodies": "Dementia",
                                    "Renal cell carcinoma (rcc)": "Renal cell carcinoma",
                                    "Influenza, human": "Influenza",
                                    "Healthy subjects": "",
                                    "Healthy volunteers": "",
                                    "Anklyosing spondylitis": "Ankylosing spondylitis",
                                    "Antifungal agents": "",
                                    "Partial-onset seizures": "Partial onset seizures",
                                    "Relapse-remitting multiple sclerosis": "Multiple sclerosis",
                                    "Relapsing remitting multiple sclerosis": "Multiple sclerosis",
                                    "Relapsing multiple sclerosis": "Multiple sclerosis",
                                    "Relapsing-remitting multiple sclerosis": "Multiple sclerosis",
                                    "Skin care": "",
                                    "Solid tumors": "Solid tumours",
                                    "Aids": "AIDS",
                                    "Nutritional status": "",
                                    "Nutritional supplement": "",
                                    "Arthritis, rheumatoid": "Rheumatoid arthritis",
                                    "Myocardial perfusion imaging": "",
                                    "Neoplasms, breast": "Breast neoplasms",
                                    "Neoplasms, ovarian": "Ovarian neoplasms",
                                    "Fasciolasis": "Fascioliasis",
                                    "Neoplasms, colorectal": "Colorectal Neoplasms",
                                    "Neoplasms, gastrointestinal tract": "Gastrointestinal Neoplasms",
                                    "Herpes simplex": "Herpes simplex infection",
                                    "Pulmonary disease, chronic obstructive": "Chronic Obstructive Airway Disease",
                                    "Skin diseases": "Dermatologic disorders",
                                    "Skin infections, bacterial": "Bacterial skin infections",
                                    "Infection, human immunodeficiency virus": "Human immunodeficiency virus infection",
                                    "Infection, human immunodeficiency virus i": "Human immunodeficiency virus infection",
                                    "Infections, bacterial": "Bacterial infections",
                                    "Infections, herpesviridae": "Herpesviridae infections",
                                    "Infections, human immunodeficiency virus and herpesviridae": "Human immunodeficiency virus infection and herpesviridae infections",
                                    "Infections, meningococcal": "Meningococcal infections",
                                    "Infections, papillomavirus": "Papillomavirus infections",
                                    "Infections, respiratory syncytial virus": "Respiratory syncytial virus infections",
                                    "Infections, respiratory tract": "Respiratory tract infections",
                                    "Infections, rotavirus": "Rotavirus infections",
                                    "Infections, staphylococcal": "Staphylococcal infections",
                                    "Infections, streptococcal": "Streptococcal infections",
                                    "Dermatitis, atopic": "Atopic dermatitis",
                                    "Dermatitis, chronic": "Chronic dermatitis",
                                    "Dermatitis, seborrheic": "Seborrheic dermatitis",
                                    "Embolism, pulmonary": "Pulmonary embolism",
                                    "Epilepsy, tonic-clonic": "Tonic-clonic epilepsy",
                                    "Epilepsy, partial": "Partial epilepsy",
                                    "Scleroderma, systemic": "Systemic scleroderma",
                                    "Sinusitis, acute": "Acute sinusitis",
                                    "Spondylitis, ankylosing": "Ankylosing spondylitis",
                                    "Lymphoma, follicular": "Follicular lymphoma",
                                    "Lymphoma, mantle-cell": "Mantle-cell lymphoma",
                                    "Lymphoma, non-hodgkin": "Non-hodgkin lymphoma",
                                    "Diabetes mellitus, type 1": "Type 1 diabetes mellitus",
                                    "Diabetes mellitus, type 2": "Type 2 diabetes mellitus",
                                    "Fibrillation, atrial": "Atrial fibrillation",
                                    "Cholestasis, intrahepatic": "Intrahepatic cholestasis",
                                    "Atrophy, geographic": "Geographic atrophy",
                                    "Atrophy, muscular": "Muscular atrophy",
                                    "Carcinoma, hepatocellular": "Hepatocellular carcinoma",
                                    "Carcinoma, midline": "Midline carcinoma",
                                    "Carcinoma, renal cell": "Renal cell carcinoma",
                                    "Carcinoma, small cell": "Small cell carcinoma",
                                    "Cancer, advanced gastric": "Advanced gastric cancer",
                                    "Cancer, breast": "Breast cancer",
                                    "Hypertrophy, left ventricular": "Left ventricular hypertrophy",
                                    "Metastases, brain": "Brain metastases",
                                    "Neoplasms, prostate": "Prostate neoplasms",
                                    "Oedema, pulmonary": "Pulmonary edema",
                                    "Osteoporosis, male": "Male osteoporosis",
                                    "Osteoporosis, postmenopausal": "Postmenopausal osteoporosis",
                                    "Pain, neuropathic": "Neuropathic pain",
                                    "Psoriasis, nail": "Nail psoriasis",
                                    "Psoriasis, palmoplantar": "Palmoplantar psoriasis",
                                    "Retinopathy, diabetic": "Diabetic retinopathy",
                                    "Sarcoma, soft tissue": "Soft tissue sarcoma",
                                    "Spasticity, post-stroke": "Post-stroke spasticity",
                                    "Rhinitis, vasomotor": "Vasomotor rhinitis",
                                    "Polyps, nasal": "Nasal polyps",
                                    "Oesophagitis, eosinophilic": "Eosinophilic esophagitis",
                                    "Neuropathy, diabetic": "Diabetic neuropathy",
                                    "Neuralgia, postherpetic": "Postherpetic neuralgia",
                                    "Lung cancer, non-small cell": "Non-small cell lung cancer",
                                    "Lung injury, acute": "Acute lung injury",
                                    "Hypertension, pulmonary": "Pulmonary hypertension",
                                    "Hyperreactivity, bronchial": "Bronchial hyperreactivity",
                                    "Dysentery, bacillary": "Bacillary dysentery",
                                    "Depressive disorder, major": "Major depressive disorder",
                                    "Arrhythmia, cardiac": "Cardiac arrhythmia",
                                    "Constriction, bronchial": "Bronchial constriction",
                                    "Heart failure, congestive": "Congestive heart failure",
                                    # NOTE(review): the value below ends in "microalbuminuria heart failure",
                                    # which looks unintended — confirm the wanted wording.
                                    "Heart failure, congestive and microalbuminuria": "Congestive heart failure and microalbuminuria heart failure",
                                    "Leukaemia, lymphocytic, chronic": "Chronic lymphocytic leukaemia",
                                    "Leukaemia, myelocytic, acute": "Acute myelocytic leukaemia",
                                    "Influenza a virus, h1n1 subtype": "H1N1 subtype influenza A virus",
                                    "Ischaemic attack, transient": "Transient ischaemic attack",
                                    "Bronchitis, chronic": "Chronic bronchitis",
                                    # NOTE(review): the other relapsing-remitting MS spellings above map to
                                    # "Multiple sclerosis", but this inverted form maps to the longer name and
                                    # the table is applied only once (no chained lookup) — confirm intent.
                                    "Multiple sclerosis, relapsing-remitting": "Relapsing-remitting multiple sclerosis",
                                    "Nausea and vomiting, postoperative": "Postoperative nausea and vomiting",
                                    "Nausea and vomiting, chemotherapy-induced": "Chemotherapy-induced nausea and vomiting",
                                    "Neoplasms, head and neck": "Head neoplasms and neck neoplasms",
                                    "Obstetric labour, premature": "Premature obstetric labour",
                                    }

In [8]:
# Collect the distinct, normalised condition names and write them one per line
# to conditions.txt.
unique_conditions = set()

for combination in clinical_trials["Medical Condition"].unique():
    for piece in combination.split("; "):
        term = piece.strip().lower().capitalize()
        replacement = condition_abnornaml_list_mapping.get(term)

        if replacement is not None:
            # Known term: use the curated name; an empty replacement drops the term.
            if replacement != "":
                unique_conditions.add(replacement)
            continue

        # Unmapped terms are kept unless they mention a transplant.
        if "transplant" not in term:
            unique_conditions.add(term)

conditions = sorted(unique_conditions)

content = "".join(f"{name}\n" for name in conditions)

with open("conditions.txt", "w") as conditions_file:
    conditions_file.write(content)

Next, the condition names written to `conditions.txt` are sent to Google Cloud (GCP) for named-entity recognition (NER).

In [9]:
def _condition_entity(preferred_term, umls_id, msh_id, hpo_ids=()):
    """Build one hand-curated entity record.

    The record has the keys preferredTerms / entityids / HPO / MSH / RXNORM,
    in that order; RXNORM is always empty for these condition entries.
    A fresh dict and fresh lists are created on every call.
    """
    return {
        "preferredTerms": preferred_term,
        "entityids": umls_id,
        "HPO": list(hpo_ids),
        "MSH": [msh_id],
        "RXNORM": [],
    }


# Manual overrides, keyed by condition name; each value is the list of entity
# records to use for that name. Passed to healthcare_nlp.get_json by a later cell.
condition_corrections = {
    "Measles": [
        _condition_entity("Measles", "UMLS/C0025007", "MSH/D008457"),
    ],
    "Measles, mumps, rubella": [
        _condition_entity("Measles", "UMLS/C0025007", "MSH/D008457"),
        _condition_entity("Mumps", "UMLS/C0026780", "MSH/D009107"),
        _condition_entity("Rubella", "UMLS/C0035920", "MSH/D012409"),
    ],
    "Tetanus": [
        _condition_entity("Tetanus", "UMLS/C0039614", "MSH/D013742"),
    ],
    "Tuberculosis": [
        _condition_entity("Tuberculosis", "UMLS/C0041296", "MSH/D014376"),
    ],
    "Fascioliasis": [
        _condition_entity("Fascioliasis", "UMLS/C0015652", "MSH/D005211"),
    ],
    "Neoplasms": [
        _condition_entity("Neoplasms", "UMLS/C0027651", "MSH/D009369", hpo_ids=["HPO/HP:0002664"]),
    ],
    "Mumps": [
        _condition_entity("Mumps", "UMLS/C0026780", "MSH/D009107"),
    ],
}

In [10]:
# Scratch check: character count of one mapped condition name
# (presumably used while sanity-checking the offset arithmetic — TODO confirm or delete).
len("Chronic Obstructive Airway Disease".strip())

34

In [11]:
# Build (begin, end, text) tuples for every line of conditions.txt. Each span is
# one character longer than the term itself, reserving room for the ";" separator
# that follows the term when the terms are joined into one request string.
offset_text = []

begin = 0
with open('conditions.txt') as conditions_file:
    for raw_line in conditions_file:
        term = raw_line.strip()
        end = begin + len(term) + 1  # +1 for the ; character
        offset_text.append((begin, end, term))
        begin = end

In [12]:
# Inspect the (begin, end, text) tuples held in offset_text.
offset_text

[(0, 5, 'AIDS'),
 (5, 25, 'Acellular pertussis'),
 (25, 39, 'Acne vulgaris'),
 (39, 63, 'Acute coronary syndrome'),
 (63, 81, 'Acute lung injury'),
 (81, 108, 'Acute myelocytic leukaemia'),
 (108, 131, 'Acute myeloid leukemia'),
 (131, 147, 'Acute sinusitis'),
 (147, 172, 'Adrenocortical carcinoma'),
 (172, 195, 'Advanced breast cancer'),
 (195, 219, 'Advanced gastric cancer'),
 (219, 230, 'Alcoholism'),
 (230, 254, 'Allergic conjunctivitis'),
 (254, 263, 'Alopecia'),
 (263, 283, "Alzheimer's disease"),
 (283, 295, 'Amyloidosis'),
 (295, 303, 'Anaemia'),
 (303, 313, 'Analgesia'),
 (313, 336, 'Ankylosing spondylitis'),
 (336, 354, 'Anxiety disorders'),
 (354, 361, 'Asthma'),
 (361, 381, 'Asthma and rhinitis'),
 (381, 397, 'Atherosclerosis'),
 (397, 415, 'Atopic dermatitis'),
 (415, 435, 'Atrial fibrillation'),
 (435, 483, 'Attention deficit hyperactivity disorder (adhd)'),
 (483, 503, 'Autoimmune diseases'),
 (503,
  633,
  'B-cell acute lymphoblastic leukemia, relapsed b-cell acute lym

In [13]:
# Ask the gcloud CLI for an access token; its stdout is decoded as UTF-8 and
# surrounding whitespace is stripped.
token_command = ["gcloud", "auth", "print-access-token"]
gcp_token = check_output(token_command, encoding='UTF-8').strip()

In [14]:
# Run NER over the condition names and accumulate the results as JSON Lines in `content`.
#
# The names in offset_text are joined with ";" and sent in batches that stay under
# MAX_REQUEST_CHARS characters per request. Changes relative to the earlier version
# of this cell:
#   * each batch hands get_json offsets relative to the start of THAT batch's text.
#     Previously every batch received the global offset_text, whose offsets only
#     line up with the request text for the first batch.
#     NOTE(review): assumes get_json uses the offsets to map entity positions in the
#     response back to the source terms — confirm against healthcare_nlp.get_json.
#   * the raw response of every batch is saved, not just the last one;
#   * a batch that yields no records no longer adds a blank line to `content`
#     (a blank line cannot be parsed when the JSONL file is read back);
#   * the duplicated "flush" code is gone: batches are built first, then processed.
# With a single batch (total text under the limit) the produced `content`,
# `test_text` and raw file are the same as before.
MAX_REQUEST_CHARS = 10000

# 1) Group consecutive offset_text entries into batches that fit the budget.
batches = []
current_batch = []
current_len = 0  # length of the batch text, counting one ";" per term
for entry in offset_text:
    term = entry[2]
    if current_batch and current_len + len(term) >= MAX_REQUEST_CHARS:
        batches.append(current_batch)
        current_batch, current_len = [], 0
    current_batch.append(entry)
    current_len += len(term) + 1
if current_batch:
    batches.append(current_batch)

# 2) Send each batch and collect the per-term records.
test_text = ""
raw_responses = []
jsonl_lines = []

for batch in batches:
    test_text = ";".join(entry[2] for entry in batch)

    result = healthcare_nlp.get_healthcare_json(test_text, gcp_token)
    raw_responses.append(result)

    json_form = json.loads(result)

    # Shift the spans so that the first term of this batch starts at offset 0,
    # matching the text that was actually sent in this request.
    batch_start = batch[0][0]
    batch_offsets = [(begin - batch_start, end - batch_start, text) for begin, end, text in batch]

    temp = healthcare_nlp.get_json(json_form, "PROBLEM", condition_corrections, batch_offsets)

    jsonl_lines.extend(json.dumps(record) for record in temp)

content = "".join(line + "\n" for line in jsonl_lines)

# Keep the raw API responses for inspection (one response after another).
if raw_responses:
    with open("condition_gcp_raw.json", "w") as f:
        f.write("\n".join(raw_responses))

{'UMLS/C0001144': {'preferredTerm': 'Acne Vulgaris', 'vocabularyCodes': ['MSH/D000152', 'MTH/NOCODE', 'NCI/C27195']}, 'UMLS/C0001175': {'preferredTerm': 'Acquired Immunodeficiency Syndrome', 'vocabularyCodes': ['LNC/LA10430-9', 'MEDLINEPLUS/1', 'MSH/D000163', 'MTH/NOCODE', 'NCI/C2851']}, 'UMLS/C0001519': {'preferredTerm': 'Adie Syndrome', 'vocabularyCodes': ['MSH/D000270', 'MTH/NOCODE', 'NCI/C34357', 'OMIM/103100']}, 'UMLS/C0001529': {'preferredTerm': 'Adiposis Dolorosa', 'vocabularyCodes': ['MSH/D000274', 'NCI/C84540', 'OMIM/103200']}, 'UMLS/C0001617': {'preferredTerm': 'Adrenal Cortex Hormones', 'vocabularyCodes': ['LNC/LP31653-6', 'MEDLINEPLUS/4557', 'MSH/D000305', 'MTH/NOCODE', 'NCI/C211', 'NCI/C2322', 'VANDF/4021625']}, 'UMLS/C0001683': {'preferredTerm': 'Advance Directives', 'vocabularyCodes': ['LNC/LP133261-0', 'LNC/LP74455-4', 'LNC/MTHU021127', 'LNC/MTHU047657', 'MEDLINEPLUS/4151', 'MSH/D016223', 'MTH/NOCODE', 'NCI/C93142']}, 'UMLS/C0001815': {'preferredTerm': 'Primary Myelofib

In [15]:
# Show the ";"-joined request text most recently sent to the NER service.
test_text

"AIDS;Acellular pertussis;Acne vulgaris;Acute coronary syndrome;Acute lung injury;Acute myelocytic leukaemia;Acute myeloid leukemia;Acute sinusitis;Adrenocortical carcinoma;Advanced breast cancer;Advanced gastric cancer;Alcoholism;Allergic conjunctivitis;Alopecia;Alzheimer's disease;Amyloidosis;Anaemia;Analgesia;Ankylosing spondylitis;Anxiety disorders;Asthma;Asthma and rhinitis;Atherosclerosis;Atopic dermatitis;Atrial fibrillation;Attention deficit hyperactivity disorder (adhd);Autoimmune diseases;B-cell acute lymphoblastic leukemia, relapsed b-cell acute lymphoblastic leukemia, refractory b-cell acute lymphoblastic leukemia;Bacillary dysentery;Bacterial infections;Bacterial skin infections;Bipolar disorder;Bipolar i disorder;Bladder pain;Brain metastases;Breast cancer;Breast neoplasms;Bronchial constriction;Bronchial hyperreactivity;Bronchiectasis;Bronchospasm;Cachexia;Cancer;Cancer|neoplasms;Carcinoid tumor;Cardiac arrhythmia;Cardiovascular disease;Cataract;Cerebrovascular accident;

In [16]:
# Spot check: show the eight characters at positions 5161-5168 of the request text.
test_text[5161:5169]

';Osteoar'

In [17]:
# Show the (begin, end, text) tuples again, for comparison with the slice above.
offset_text

[(0, 5, 'AIDS'),
 (5, 25, 'Acellular pertussis'),
 (25, 39, 'Acne vulgaris'),
 (39, 63, 'Acute coronary syndrome'),
 (63, 81, 'Acute lung injury'),
 (81, 108, 'Acute myelocytic leukaemia'),
 (108, 131, 'Acute myeloid leukemia'),
 (131, 147, 'Acute sinusitis'),
 (147, 172, 'Adrenocortical carcinoma'),
 (172, 195, 'Advanced breast cancer'),
 (195, 219, 'Advanced gastric cancer'),
 (219, 230, 'Alcoholism'),
 (230, 254, 'Allergic conjunctivitis'),
 (254, 263, 'Alopecia'),
 (263, 283, "Alzheimer's disease"),
 (283, 295, 'Amyloidosis'),
 (295, 303, 'Anaemia'),
 (303, 313, 'Analgesia'),
 (313, 336, 'Ankylosing spondylitis'),
 (336, 354, 'Anxiety disorders'),
 (354, 361, 'Asthma'),
 (361, 381, 'Asthma and rhinitis'),
 (381, 397, 'Atherosclerosis'),
 (397, 415, 'Atopic dermatitis'),
 (415, 435, 'Atrial fibrillation'),
 (435, 483, 'Attention deficit hyperactivity disorder (adhd)'),
 (483, 503, 'Autoimmune diseases'),
 (503,
  633,
  'B-cell acute lymphoblastic leukemia, relapsed b-cell acute lym

Note: the GCP Healthcare NLP API requires an input of at least 10 characters.

In [18]:
# Persist the accumulated condition NER records (one JSON document per line).
with open('condition_gcp_ner.jsonl', 'w') as jsonl_out:
    jsonl_out.write(content)

In [19]:
# Reload the condition NER records: each line is a JSON object whose first
# key/value pair becomes one entry of the lookup.
condition_gcp_nlp = {}

with open('condition_gcp_ner.jsonl', 'r') as json_file:
    for raw_line in json_file:
        record = json.loads(raw_line)
        first_key, first_value = list(record.items())[0]
        condition_gcp_nlp[first_key] = first_value

condition_gcp_nlp

{'AIDS': [{'preferredTerms': 'Acquired Immunodeficiency Syndrome',
   'entityids': 'UMLS/C0001175',
   'HPO': [],
   'MSH': ['MSH/D000163'],
   'RXNORM': []}],
 'Acellular pertussis': [{'preferredTerms': 'Pertussis',
   'entityids': 'UMLS/C0043167',
   'HPO': [],
   'MSH': ['MSH/D014917'],
   'RXNORM': []}],
 'Acne vulgaris': [{'preferredTerms': 'Acne Vulgaris',
   'entityids': 'UMLS/C0001144',
   'HPO': [],
   'MSH': ['MSH/D000152'],
   'RXNORM': []}],
 'Acute coronary syndrome': [{'preferredTerms': 'Acute Coronary Syndrome',
   'entityids': 'UMLS/C0948089',
   'HPO': ['HPO/HP:0033678'],
   'MSH': ['MSH/D054058'],
   'RXNORM': []}],
 'Acute lung injury': [{'preferredTerms': 'Traumatic injury',
   'entityids': 'UMLS/C3263723',
   'HPO': [],
   'MSH': ['MSH/D014947'],
   'RXNORM': []}],
 'Acute myelocytic leukaemia': [{'preferredTerms': 'Leukemia, Myelocytic, Acute',
   'entityids': 'UMLS/C0023467',
   'HPO': ['HPO/HP:0004808'],
   'MSH': ['MSH/D015470'],
   'RXNORM': []}],
 'Acute myel

In [20]:
# Link every trial (by "Posting ID") to the entities recognised for each of its
# conditions, and collect the attributes of each distinct entity.
trial_conditions = []

condition_nodes = {}

for _, trial in clinical_trials.iterrows():
    for raw_term in trial["Medical Condition"].split("; "):
        term = raw_term.strip().lower().capitalize()

        # Curated name if the term is in the mapping (possibly "", which matches
        # nothing below), otherwise the term itself.
        query = condition_abnornaml_list_mapping.get(term, term)

        if query not in condition_gcp_nlp:
            continue

        for entity in condition_gcp_nlp[query]:
            if "entityids" not in entity:
                continue

            entity_id = entity["entityids"]
            trial_conditions.append((trial["Posting ID"], entity_id))

            # A later occurrence of the same entity id overwrites the earlier one.
            condition_nodes[entity_id] = {
                "name": entity["preferredTerms"],
                "HPO": entity["HPO"],
                "MSH": entity["MSH"],
                "RXNORM": entity["RXNORM"],
            }
        

In [21]:
# Write the trial -> condition edges as a two-column TSV.
edge_lines = ["from\tto\n"]
edge_lines.extend(f"{posting_id}\t{entity_id}\n" for posting_id, entity_id in trial_conditions)
content = "".join(edge_lines)

with open(f"{output_folder}/trial_conditions.tsv", "w") as tsv_out:
    tsv_out.write(content)

In [22]:
# Write the condition nodes as TSV; multi-valued code lists are "|"-joined.
node_lines = ["UMLS\tname\tHPO\tMSH\tRXNORM\n"]
for umls_id, node in condition_nodes.items():
    hpo_codes = '|'.join(node['HPO'])
    msh_codes = '|'.join(node['MSH'])
    rxnorm_codes = '|'.join(node['RXNORM'])
    node_lines.append(f"{umls_id}\t{node['name']}\t{hpo_codes}\t{msh_codes}\t{rxnorm_codes}\n")
content = "".join(node_lines)

with open(f"{output_folder}/conditions.tsv", "w") as tsv_out:
    tsv_out.write(content)

In [23]:
# Hand-curated fixes for medicine names. Keys are names as they look after
# `.strip().lower().capitalize()`; values are the replacement, and an empty
# replacement means the name is dropped by the cells that consult this table.
_medicine_name_fixes = [
    ("Nan", ""),  # "Nan" is what a missing cell becomes after str() and capitalize()
    ("Lanalumab", "lanadelumab"),
    ("Papzopanib", "pazopanib"),
    ("Indacterol", "indacaterol"),
    ("Seralaxin", "Serelaxin"),
    ("Haemophilus influenzae type b", "Haemophilus influenzae type b vaccine"),
]

medicine_abnornaml_list_mapping = dict(_medicine_name_fixes)

In [24]:
# Matches one or more ASCII letters immediately followed by one or more digits.
# It is applied with .match(), i.e. anchored at the start of the name, to detect
# code-style compound identifiers such as the "GSK1550188" seen in study titles.
rx_cryptic_drug = re.compile(r'[A-Za-z]+\d+')

In [25]:
# Split the raw medicine strings into individual names and sort them into two
# buckets: regular names (medicines.txt, de-duplicated and sorted) and code-style
# identifiers (cryptic_medicines.txt, in encounter order, duplicates kept).
medicine_names = set()
cryptic_drugs = []

for cell_value in clinical_trials["Medicine or Vaccine (generic name)"].unique():
    for piece in str(cell_value).split(";"):
        name = piece.strip().lower().capitalize()

        if name in medicine_abnornaml_list_mapping:
            # Curated name wins; an empty replacement drops the entry.
            corrected = medicine_abnornaml_list_mapping[name]
            if corrected != "":
                medicine_names.add(corrected)
        elif rx_cryptic_drug.match(name) is None or "vaccine" in name:
            # Ordinary name, or a code-like name that is explicitly a vaccine.
            medicine_names.add(name)
        else:
            # Code-style identifier(s); combinations are separated by "/".
            cryptic_drugs.extend(part.strip() for part in name.split("/"))

medicine = sorted(medicine_names)

content = "".join(f"{name}\n" for name in medicine)

with open("medicines.txt", "w") as medicines_file:
    medicines_file.write(content)

content = "".join(f"{code}\n" for code in cryptic_drugs)

with open("cryptic_medicines.txt", "w") as cryptic_file:
    cryptic_file.write(content)




In [26]:

# Manual overrides for medicine names, keyed by name; each value is the list of
# entity records (preferredTerms / entityids / HPO / MSH / RXNORM) to use for it.
# Passed to healthcare_nlp.get_json by a later cell.
medicine_corrections = {
    "Benzoyl peroxide/clindamycin": [{"preferredTerms": "benzoyl peroxide / clindamycin", "entityids": "UMLS/C1720375", "HPO": [], "MSH": ["MSH/C466951"], "RXNORM": ["RXNORM/646781"]}],
    "Casopitant": [{"preferredTerms": "casopitant", "entityids": "UMLS/C2347566", "HPO": [], "MSH": [], "RXNORM": []}],
    "Alisporivir": [{"preferredTerms": "alisporivir", "entityids": "UMLS/C1567950", "HPO": [], "MSH": ["MSH/C499715"], "RXNORM": []}],
    # NOTE(review): this record is byte-identical to the "Iboctadekin" one below
    # (Recombinant Interleukin-18), which looks like a copy-paste slip for a
    # different drug — verify the correct identifiers. Also, "Indacterol" is the
    # misspelling that medicine_abnornaml_list_mapping rewrites to "indacaterol",
    # so this key may never be looked up — confirm against get_json.
    "Indacterol": [{"preferredTerms": "Recombinant Interleukin-18", "entityids": "UMLS/C1527127", "HPO": [], "MSH": [], "RXNORM": []}],
    "Iboctadekin": [{"preferredTerms": "Recombinant Interleukin-18", "entityids": "UMLS/C1527127", "HPO": [], "MSH": [], "RXNORM": []}],
    "Human papillomavirus types 16 and 18 vaccine": [{"preferredTerms": "human papillomavirus vaccine, L1 type 16, 18", "entityids": "UMLS/C1721788", "HPO": [], "MSH": ["MSH/C510352"], "RXNORM": []}]
}

In [27]:
# Build (begin, end, text) tuples for every line of medicines.txt. The span
# length is the raw line length (line terminator included); the stored text is
# the stripped line.
offset_text = []

begin = 0
with open('medicines.txt') as medicines_file:
    for raw_line in medicines_file:
        end = begin + len(raw_line)
        offset_text.append((begin, end, raw_line.strip()))
        begin = end

In [28]:
# Run NER over the medicine names and accumulate the results as JSON Lines in `content`.
#
# The names in offset_text are joined with ";" and sent in batches that stay under
# MAX_REQUEST_CHARS characters per request. Fixes relative to the earlier version
# of this cell:
#   * every batch's response is parsed with json.loads before being handed to
#     get_json. Previously the in-loop flush skipped that step, so get_json saw
#     whatever `json_form` was left in the kernel (e.g. from the conditions cell),
#     or raised NameError on a fresh kernel;
#   * each batch hands get_json offsets relative to the start of THAT batch's text
#     instead of the global offset_text.
#     NOTE(review): assumes get_json uses the offsets to map entity positions in the
#     response back to the source terms — confirm against healthcare_nlp.get_json;
#   * a batch that yields no records no longer adds a blank line to `content`
#     (a blank line cannot be parsed when the JSONL file is read back);
#   * the duplicated "flush" code is gone: batches are built first, then processed.
# With a single batch (total text under the limit) `content` and `test_text`
# come out the same as before.
gcp_token = check_output(["gcloud", "auth", "print-access-token"], encoding='UTF-8').strip()

MAX_REQUEST_CHARS = 10000

# 1) Group consecutive offset_text entries into batches that fit the budget.
batches = []
current_batch = []
current_len = 0  # length of the batch text, counting one ";" per term
for entry in offset_text:
    term = entry[2]
    if current_batch and current_len + len(term) >= MAX_REQUEST_CHARS:
        batches.append(current_batch)
        current_batch, current_len = [], 0
    current_batch.append(entry)
    current_len += len(term) + 1
if current_batch:
    batches.append(current_batch)

# 2) Send each batch and collect the per-term records.
test_text = ""
jsonl_lines = []

for batch in batches:
    test_text = ";".join(entry[2] for entry in batch)

    result = healthcare_nlp.get_healthcare_json(test_text, gcp_token)

    json_form = json.loads(result)

    # Shift the spans so that the first term of this batch starts at offset 0,
    # matching the text that was actually sent in this request.
    batch_start = batch[0][0]
    batch_offsets = [(begin - batch_start, end - batch_start, text) for begin, end, text in batch]

    temp = healthcare_nlp.get_json(json_form, "MEDICINE", medicine_corrections, batch_offsets)

    jsonl_lines.extend(json.dumps(record) for record in temp)

content = "".join(line + "\n" for line in jsonl_lines)

{'UMLS/C0001927': {'preferredTerm': 'albuterol', 'vocabularyCodes': ['LNC/LP17843-1', 'LNC/MTHU013596', 'MSH/D000420', 'MTH/NOCODE', 'NCI/C215', 'RXNORM/435', 'VANDF/4018796']}, 'UMLS/C0003330': {'preferredTerm': 'Antigens, Fungal', 'vocabularyCodes': ['MSH/D000946']}, 'UMLS/C0003341': {'preferredTerm': 'Antigens, Tumor-Associated, Carbohydrate', 'vocabularyCodes': ['MSH/D015295']}, 'UMLS/C0003372': {'preferredTerm': 'Antilymphocyte Serum', 'vocabularyCodes': ['MSH/D000961', 'MTH/NOCODE', 'NCI/C62577']}, 'UMLS/C0003442': {'preferredTerm': 'lymphocyte immune globulin, anti-thymocyte globulin', 'vocabularyCodes': ['MSH/D000961', 'MTH/NOCODE', 'NCI/C278', 'RXNORM/1011', 'VANDF/4018097', 'VANDF/4022194']}, 'UMLS/C0004057': {'preferredTerm': 'aspirin', 'vocabularyCodes': ['LNC/LA26702-3', 'MSH/D001241', 'MTH/NOCODE', 'MTH/U000319', 'MTH/U000320', 'NCI/C287', 'RXNORM/1191', 'VANDF/4017536']}, 'UMLS/C0005088': {'preferredTerm': 'benzoyl peroxide', 'vocabularyCodes': ['MSH/D001585', 'MTH/NOCOD

In [29]:
# Persist the accumulated medicine NER records (one JSON document per line).
with open('medicine_gcp_ner.jsonl', 'w') as jsonl_out:
    jsonl_out.write(content)

In [30]:
# Reload the medicine NER records: each line is a JSON object whose first
# key/value pair becomes one entry of the lookup.
medicine_gcp_nlp = {}

with open('medicine_gcp_ner.jsonl', 'r') as json_file:
    for raw_line in json_file:
        record = json.loads(raw_line)
        first_key, first_value = list(record.items())[0]
        medicine_gcp_nlp[first_key] = first_value

len(medicine_gcp_nlp)

286

In [31]:
# Link every trial (by "Posting ID") to:
#   * the code-style drug identifiers it mentions (trial_cryptic_drugs), and
#   * the entities recognised for each of its medicine names (trial_medicines),
# and collect the attributes of each distinct medicine entity (medicine_nodes).
#
# The raw cell is split on ";" and each piece is stripped — the same rule used
# when the medicine list was built. Splitting on "; " here (as before) left
# entries separated by a bare ";" glued together, so they never matched the
# recognised names and the trial silently lost those links.
trial_medicines = []
medicine_nodes = {}
trial_cryptic_drugs = []

# Set lookup instead of scanning the cryptic_drugs list for every name part.
cryptic_drug_lookup = set(cryptic_drugs)

for i, row in clinical_trials.iterrows():
    for piece in str(row["Medicine or Vaccine (generic name)"]).split(";"):
        x = piece.strip().lower().capitalize()

        # Combination products are written "a/b"; test each part separately.
        for gsk in x.split("/"):
            code = gsk.strip()
            if code in cryptic_drug_lookup:
                trial_cryptic_drugs.append((row["Posting ID"], code))

        # Curated name if the term is in the mapping (possibly "", which is then
        # simply looked up like any other name), otherwise the term itself.
        query = medicine_abnornaml_list_mapping.get(x, x)

        for entity in medicine_gcp_nlp.get(query, []):
            if "entityids" not in entity:
                continue

            trial_medicines.append((row["Posting ID"], entity["entityids"]))

            # A later occurrence of the same entity id overwrites the earlier one.
            medicine_nodes[entity["entityids"]] = {
                "name": entity["preferredTerms"],
                "HPO": entity["HPO"],
                "MSH": entity["MSH"],
                "RXNORM": entity["RXNORM"],
            }

In [32]:
# Write the trial -> code-style drug edges as a two-column TSV.
# NOTE(review): unlike the other TSV exports this file is written to the working
# directory, not under output_folder — confirm that is intended.
edge_lines = ["from\tto\n"]
edge_lines.extend(f"{posting_id}\t{drug_code}\n" for posting_id, drug_code in trial_cryptic_drugs)
content = "".join(edge_lines)

with open("trial_cryptic_drugs.tsv", "w") as tsv_out:
    tsv_out.write(content)

In [33]:
# Render the trial -> medicine edges as TSV text (written to disk by the next cell).
edge_lines = ["from\tto\n"]
edge_lines.extend(f"{posting_id}\t{entity_id}\n" for posting_id, entity_id in trial_medicines)
content = "".join(edge_lines)

In [34]:
# Write the current `content` to the trial -> medicine edge file.
with open(f"{output_folder}/trial_medicines.tsv", "w") as tsv_out:
    tsv_out.write(content)

In [35]:
# Write the medicine nodes as TSV; multi-valued code lists are "|"-joined and
# the trailing other_name column is left empty.
node_lines = ["UMLS\tname\tHPO\tMSH\tRXNORM\tother_name\n"]
for umls_id, node in medicine_nodes.items():
    hpo_codes = '|'.join(node['HPO'])
    msh_codes = '|'.join(node['MSH'])
    rxnorm_codes = '|'.join(node['RXNORM'])
    node_lines.append(f"{umls_id}\t{node['name']}\t{hpo_codes}\t{msh_codes}\t{rxnorm_codes}\t\n")
content = "".join(node_lines)

with open(f"{output_folder}/medicines.tsv", "w") as tsv_out:
    tsv_out.write(content)

In [36]:
# List the distinct values of the Sponsor column.
clinical_trials["Sponsor"].unique()

array(['GSK', 'ViiV', 'Novartis', 'Astellas', 'Eisai', 'Grunenthal',
       'Teva', 'Ono'], dtype=object)

In [37]:
# Export the sponsor nodes and the trial -> sponsor edges.
#
# `sponsors` stays a set (other code may rely on that), but it is written out
# in sorted order: iterating a set of strings directly yields an order that can
# change from one kernel run to the next, so sponsors.tsv would otherwise differ
# between otherwise identical runs.
sponsors = set()

trial_sponsors = []

for i, row in clinical_trials.iterrows():
    sponsor = str(row["Sponsor"])  # str() so a missing sponsor becomes the text "nan"

    sponsors.add(sponsor)

    trial_sponsors.append((row["Posting ID"], sponsor))

# Edges: one "posting id <TAB> sponsor" row per trial.
content = "from\tto\n" + "".join(f"{posting_id}\t{name}\n" for posting_id, name in trial_sponsors)

with open(f"{output_folder}/trial_sponsors.tsv", "w") as f:
    f.write(content)

# Nodes: one sponsor name per row, alphabetically.
content = "name\n" + "".join(f"{name}\n" for name in sorted(sponsors))

with open(f"{output_folder}/sponsors.tsv", "w") as f:
    f.write(content)