In [40]:
import pandas as pd
import numpy as np
import re

In [41]:
# Load the tab-separated study listing into a DataFrame.
clinical_trials = pd.read_csv(
    "clinical_study_data_request_com_all_sponsor.tsv",
    sep="\t",
)

In [42]:
# Preview the first five rows to check the columns were parsed as expected.
clinical_trials.head(n=5)

Unnamed: 0,Posting ID,Sponsor,Study Title,Medicine or Vaccine (generic name),Sponsor Identification Number,Medical Condition,Phase,Link to Sponsor Study Registry,Link to study details on ClinicalTrials.gov (if available),Link to study details on EudraCT (if available),Analysis-ready dataset,Annotated case report form,Clinical study report,Dataset specifications,Protocol with any amendments,Raw dataset,Reporting and analysis plan,Additional Information about the data and documents available for this study,Date Added to this Site
0,1,GSK,"U0289-401: An Evaluator Blinded, 8 Week, Split...",benzoyl peroxide/salicylic acid,STF114550,Acne Vulgaris,Phase 4,https://www.gsk-studyregister.com/en/trial-det...,http://clinicaltrials.gov/show/NCT01706250,,N,Y,Y,Y,N,Y,N,An analysis-ready dataset and a reporting and ...,1-Aug-2015
1,3,GSK,"GSK1550188, A randomised, single-blind, placeb...",belimumab,BEL114243,Systemic Lupus Erythematosus,Phase 1,https://www.gsk-studyregister.com/en/trial-det...,http://clinicaltrials.gov/show/NCT01381536,,N,Y,Y,Y,N,Y,Y,An analysis-ready dataset is not available for...,1-Jul-2015
2,4,GSK,"A Multicenter, Randomized, Double-Blind, Phase...","calcipotriol, calcipotriene",STF114741,Psoriasis,Phase 3,https://www.gsk-studyregister.com/en/trial-det...,http://clinicaltrials.gov/show/NCT00688519,,Y,Y,Y,Y,Y,Y,Y,,1-Jul-2015
3,8,GSK,Two-week Study to Determine and Compare the To...,benzoyl peroxide/clindamycin phosphate,STF114546,Acne Vulgaris,Phase 4,https://www.gsk-studyregister.com/en/trial-det...,http://clinicaltrials.gov/show/NCT01015638,,N,N,Y,Y,Y,Y,N,An analysis-ready dataset and reporting and an...,1-Aug-2014
4,9,GSK,"A single-blind, randomized, comparative split-...",benzoyl peroxide/clindamycin phosphate,STF114547,Acne Vulgaris,Phase 4,https://www.gsk-studyregister.com/en/trial-det...,http://clinicaltrials.gov/show/NCT00964223,,Y,Y,Y,Y,Y,N,Y,A raw dataset is not available for this study.,1-Aug-2014


In [43]:
# Number of distinct raw medicine/vaccine strings; a missing value counts as one entry.
clinical_trials["Medicine or Vaccine (generic name)"].nunique(dropna=False)


485

In [44]:
# Number of distinct raw medical-condition strings; a missing value counts as one entry.
clinical_trials["Medical Condition"].nunique(dropna=False)

402

In [45]:
# Manual overrides for condition names.
#
# Keys are written in the normalised form the notebook looks up
# (`strip().lower().capitalize()`), so they must match the data character for
# character -- including the mis-encoded apostrophe in the Parkinson's key.
# A value of "" marks a term that is dropped instead of being renamed.
condition_abnornaml_list_mapping = {
    # Spelling / naming fixes
    "Alzheimers disease": "Alzheimer's disease",
    "Dementia with parkinsonâ€™s disease": "Parkinson's disease dementia",
    "Dementia with lewy bodies": "Dementia",
    "Renal cell carcinoma (rcc)": "Renal cell carcinoma",
    "Influenza, human": "Influenza",
    # Dropped terms
    "Healthy subjects": "",
    "Healthy volunteers": "",
    "Anklyosing spondylitis": "Ankylosing spondylitis",
    "Antifungal agents": "",
    "Partial-onset seizures": "Partial onset seizures",
    # Every multiple-sclerosis variant collapses to one label
    "Relapse-remitting multiple sclerosis": "Multiple sclerosis",
    "Relapsing remitting multiple sclerosis": "Multiple sclerosis",
    "Relapsing multiple sclerosis": "Multiple sclerosis",
    "Relapsing-remitting multiple sclerosis": "Multiple sclerosis",
    "Skin care": "",
    "Solid tumors": "Solid tumours",
    # capitalize() lower-cases the acronym, so restore it here
    "Aids": "AIDS",
    "Nutritional status": "",
    "Nutritional supplement": "",
}

In [46]:
# Build the sorted list of distinct condition names and write it, one name per
# line, to conditions.txt.
#
# Each raw "Medical Condition" value may hold several conditions separated by
# "; ". Every fragment is normalised with strip().lower().capitalize() and then:
#   * replaced (or dropped, when the replacement is "") if it has an entry in
#     condition_abnornaml_list_mapping;
#   * dropped if it mentions "transplant";
#   * kept unchanged otherwise.
unique_conditions = set()

for combination in clinical_trials["Medical Condition"].unique():
    for fragment in combination.split("; "):
        name = fragment.strip().lower().capitalize()

        if name in condition_abnornaml_list_mapping:
            replacement = condition_abnornaml_list_mapping[name]
            if replacement != "":
                unique_conditions.add(replacement)
        # capitalize() upper-cases the first letter, so the check must ignore
        # case; otherwise names that *start* with "Transplant" slip through.
        elif "transplant" in name.lower():
            continue
        else:
            unique_conditions.add(name)

conditions = sorted(unique_conditions)

# One condition per line, each terminated by a newline.
content = "".join(name + "\n" for name in conditions)

with open("conditions.txt", "w") as f:
    f.write(content)

Next, run named-entity recognition (NER) on these terms with GCP Healthcare NLP; see `healthcare_nlp_experiment.ipynb`.

Note: GCP Healthcare NLP requires the input to be at least 10 characters long.

In [69]:
# Load the NER results for the condition names (tab-separated) and preview them.
condition_gcp_nlp = pd.read_csv(
    "condition_gcp_ner.tsv",
    sep="\t",
)
condition_gcp_nlp.head(n=5)

Unnamed: 0,text,preferredTerm,entityId,confidence,HPO,MSH
0,AIDS,Acquired Immunodeficiency Syndrome;AID - Artif...,UMLS/C0001175;UMLS/C0021588,0.82486,,MSH/D000163;MSH/D007316
1,Acellular pertussis,Pertussis,UMLS/C0043167,0.988182,,MSH/D014917
2,Acne vulgaris,Acne Vulgaris,UMLS/C0001144,0.970287,,MSH/D000152
3,Acute coronary syndrome,Acute Coronary Syndrome,UMLS/C0948089,0.891674,HPO/HP:0033678,MSH/D054058
4,Acute myeloid leukemia,"Leukemia, Myelocytic, Acute",UMLS/C0023467,0.834782,HPO/HP:0004808,MSH/D015470


In [70]:
# NOTE(review): `math` is not used in any cell visible here -- confirm before removing.
import math

# Index the NER results by the condition text that was submitted.
#
# Each annotation column may hold several ";"-separated candidates; only the
# first candidate is kept. Missing values become the string 'nan', which
# later cells test for explicitly.
condition_nlp_mapping = {
    ner_row["text"]: {
        field: str(ner_row[field]).split(";")[0]
        for field in ("preferredTerm", "entityId", "HPO", "MSH")
    }
    for _, ner_row in condition_gcp_nlp.iterrows()
}



In [71]:
# Display the whole mapping for a visual spot-check (the output is long).
condition_nlp_mapping

{'AIDS': {'preferredTerm': 'Acquired Immunodeficiency Syndrome',
  'entityId': 'UMLS/C0001175',
  'HPO': 'nan',
  'MSH': 'MSH/D000163'},
 'Acellular pertussis': {'preferredTerm': 'Pertussis',
  'entityId': 'UMLS/C0043167',
  'HPO': 'nan',
  'MSH': 'MSH/D014917'},
 'Acne vulgaris': {'preferredTerm': 'Acne Vulgaris',
  'entityId': 'UMLS/C0001144',
  'HPO': 'nan',
  'MSH': 'MSH/D000152'},
 'Acute coronary syndrome': {'preferredTerm': 'Acute Coronary Syndrome',
  'entityId': 'UMLS/C0948089',
  'HPO': 'HPO/HP:0033678',
  'MSH': 'MSH/D054058'},
 'Acute myeloid leukemia': {'preferredTerm': 'Leukemia, Myelocytic, Acute',
  'entityId': 'UMLS/C0023467',
  'HPO': 'HPO/HP:0004808',
  'MSH': 'MSH/D015470'},
 'Adrenocortical carcinoma': {'preferredTerm': 'Adrenocortical carcinoma',
  'entityId': 'UMLS/C0206686',
  'HPO': 'HPO/HP:0006744',
  'MSH': 'MSH/D018268'},
 'Advanced breast cancer': {'preferredTerm': 'Malignant Neoplasms',
  'entityId': 'UMLS/C0006826',
  'HPO': 'HPO/HP:0002664',
  'MSH': 'MS

In [72]:
# Build (Posting ID, entityId) pairs linking each trial to the entities
# recognised for its conditions.
trial_conditions = []

for _, trial in clinical_trials.iterrows():
    for fragment in trial["Medical Condition"].split("; "):
        name = fragment.strip().lower().capitalize()

        # A manual override wins; an override of "" yields an empty lookup key.
        query = condition_abnornaml_list_mapping.get(name, name)

        annotation = condition_nlp_mapping.get(query)
        if annotation is None:
            continue  # term was never annotated
        if annotation["entityId"] == 'nan':
            continue  # annotated, but no entity id was recorded

        trial_conditions.append((trial["Posting ID"], annotation["entityId"]))
        

In [73]:
# Serialise the trial -> condition pairs as TSV text with a "from / to" header.
content = "from\tto\n" + "".join(
    f"{posting_id}\t{entity_id}\n" for posting_id, entity_id in trial_conditions
)

In [74]:
# Write the TSV text held in `content` to disk.
with open("trial_conditions.tsv", "w") as tsv_file:
    tsv_file.write(content)

In [53]:
# Manual overrides for medicine names; a value of "" means "drop this term".
# "Nan" is what a missing value becomes after str() and capitalize().
medicine_abnornaml_list_mapping = {
    "Nan": "",
}

In [54]:
# Pattern for code-like names: one or more ASCII letters immediately followed
# by one or more digits.
rx_cryptic_drug = re.compile(
    r'[A-Za-z]+\d+'
)

In [55]:
# Split the raw medicine/vaccine strings into individual names and sort them
# into two files:
#   * medicines.txt          - ordinary names, plus code-like names that
#                              mention "vaccine";
#   * cryptic_medicines.txt  - names that begin with a letters+digits code.
#
# Both lists are de-duplicated and sorted so that each name appears once per
# file, however many raw strings it occurs in.
medicine_names = set()
cryptic_names = set()

for m in clinical_trials["Medicine or Vaccine (generic name)"].unique():
    # str() turns a missing value into "nan" -> "Nan", which the override
    # mapping then drops.
    for fragment in str(m).split(";"):
        name = fragment.strip().lower().capitalize()

        if name in medicine_abnornaml_list_mapping:
            replacement = medicine_abnornaml_list_mapping[name]
            if replacement != "":
                medicine_names.add(replacement)
        elif rx_cryptic_drug.match(name) is None or "vaccine" in name:
            medicine_names.add(name)
        else:
            cryptic_names.add(name)

medicine = sorted(medicine_names)
cryptic_drugs = sorted(cryptic_names)

# One name per line, each terminated by a newline.
content = "".join(name + "\n" for name in medicine)

with open("medicines.txt", "w") as f:
    f.write(content)

content = "".join(name + "\n" for name in cryptic_drugs)

with open("cryptic_medicines.txt", "w") as f:
    f.write(content)


In [56]:
# Load the NER results for the medicine names (tab-separated) and preview them.
medicine_gcp_nlp = pd.read_csv(
    "medicine_gcp_ner.tsv",
    sep="\t",
)
medicine_gcp_nlp.head(n=5)

Unnamed: 0,text,preferredTerm,entityId,confidence,HPO,MSH
0,10-valent pneumococcal polysaccharide and non-...,"pneumococcal polysaccharide, type 37;Polyvalen...",UMLS/C0245576;UMLS/C0305065;UMLS/C1098021;UMLS...,0.519008,,MSH/C058381;MSH/D022242;MSH/C437065;MSH/C437065
1,12-valent pneumococcal polysaccharide and non-...,"pneumococcal polysaccharide, type 37;Polyvalen...",UMLS/C0245576;UMLS/C0305065;UMLS/C1098021;UMLS...,0.609726,,MSH/C058381;MSH/D022242;MSH/C437065;MSH/C437065
2,Abacavir,abacavir,UMLS/C0663655,0.960383,,MSH/C106538
3,Abacavir/lamivudine,abacavir / lamivudine,UMLS/C1613391,0.5521,,MSH/C492871
4,Abacavir/lamivudine/zidovudine,abacavir,UMLS/C0663655,0.95807,,MSH/C106538


In [57]:

# Index the NER results by the medicine text that was submitted.
#
# Each annotation column may hold several ";"-separated candidates; only the
# first candidate is kept. Missing values become the string 'nan', which
# later cells test for explicitly.
medicine_nlp_mapping = {
    ner_row["text"]: {
        field: str(ner_row[field]).split(";")[0]
        for field in ("preferredTerm", "entityId", "HPO", "MSH")
    }
    for _, ner_row in medicine_gcp_nlp.iterrows()
}

In [59]:
# Build (Posting ID, entityId) pairs linking each trial to the entities
# recognised for its medicines.
trial_medicines = []

for _, trial in clinical_trials.iterrows():
    # Split on a bare ";" (surrounding whitespace is stripped below). This
    # matches how the names written to medicines.txt were split, so every
    # name that was sent for annotation can be found again here, whether or
    # not the raw separator was followed by a space.
    # str() turns a missing value into "nan" -> "Nan", which the override
    # mapping then maps to "".
    for fragment in str(trial["Medicine or Vaccine (generic name)"]).split(";"):
        name = fragment.strip().lower().capitalize()

        # A manual override wins; an override of "" yields an empty lookup key.
        query = medicine_abnornaml_list_mapping.get(name, name)

        annotation = medicine_nlp_mapping.get(query)
        if annotation is None:
            continue  # term was never annotated
        if annotation["entityId"] == 'nan':
            continue  # annotated, but no entity id was recorded

        trial_medicines.append((trial["Posting ID"], annotation["entityId"]))

In [62]:
# Serialise the trial -> medicine pairs as TSV text with a "from / to" header.
content = "from\tto\n" + "".join(
    f"{posting_id}\t{entity_id}\n" for posting_id, entity_id in trial_medicines
)

In [63]:
# Write the TSV text held in `content` to disk.
with open("trial_medicines.tsv", "w") as tsv_file:
    tsv_file.write(content)