In [36]:
import pandas as pd
import numpy as np

In [37]:
clinical_trials = pd.read_csv('clinical_study_data_request_com_all_sponsor.tsv', sep="\t")

In [38]:
clinical_trials.head()

Unnamed: 0,Posting ID,Sponsor,Study Title,Medicine or Vaccine (generic name),Sponsor Identification Number,Medical Condition,Phase,Link to Sponsor Study Registry,Link to study details on ClinicalTrials.gov (if available),Link to study details on EudraCT (if available),Analysis-ready dataset,Annotated case report form,Clinical study report,Dataset specifications,Protocol with any amendments,Raw dataset,Reporting and analysis plan,Additional Information about the data and documents available for this study,Date Added to this Site
0,1,GSK,"U0289-401: An Evaluator Blinded, 8 Week, Split...",benzoyl peroxide/salicylic acid,STF114550,Acne Vulgaris,Phase 4,https://www.gsk-studyregister.com/en/trial-det...,http://clinicaltrials.gov/show/NCT01706250,,N,Y,Y,Y,N,Y,N,An analysis-ready dataset and a reporting and ...,1-Aug-2015
1,3,GSK,"GSK1550188, A randomised, single-blind, placeb...",belimumab,BEL114243,Systemic Lupus Erythematosus,Phase 1,https://www.gsk-studyregister.com/en/trial-det...,http://clinicaltrials.gov/show/NCT01381536,,N,Y,Y,Y,N,Y,Y,An analysis-ready dataset is not available for...,1-Jul-2015
2,4,GSK,"A Multicenter, Randomized, Double-Blind, Phase...","calcipotriol, calcipotriene",STF114741,Psoriasis,Phase 3,https://www.gsk-studyregister.com/en/trial-det...,http://clinicaltrials.gov/show/NCT00688519,,Y,Y,Y,Y,Y,Y,Y,,1-Jul-2015
3,8,GSK,Two-week Study to Determine and Compare the To...,benzoyl peroxide/clindamycin phosphate,STF114546,Acne Vulgaris,Phase 4,https://www.gsk-studyregister.com/en/trial-det...,http://clinicaltrials.gov/show/NCT01015638,,N,N,Y,Y,Y,Y,N,An analysis-ready dataset and reporting and an...,1-Aug-2014
4,9,GSK,"A single-blind, randomized, comparative split-...",benzoyl peroxide/clindamycin phosphate,STF114547,Acne Vulgaris,Phase 4,https://www.gsk-studyregister.com/en/trial-det...,http://clinicaltrials.gov/show/NCT00964223,,Y,Y,Y,Y,Y,N,Y,A raw dataset is not available for this study.,1-Aug-2014


In [39]:
len(clinical_trials["Medicine or Vaccine (generic name)"].unique())


485

In [40]:
len(clinical_trials["Medical Condition"].unique())

402

In [41]:
condition_abnornaml_list_mapping = {"alzheimers disease": "alzheimer's disease",
                                    "dementia with parkinson’s disease": "parkinson's disease dementia", 
                                    "dementia with lewy bodies": "dementia",
                                    "renal cell carcinoma (rcc)": "renal cell carcinoma",
                                    "influenza, human": "influenza",
                                    "healthy subjects": "",
                                    "healthy volunteers": "",
                                    "anklyosing spondylitis": "ankylosing spondylitis",
                                    "antifungal agents": "",
                                    "partial-onset seizures": "partial onset seizures",
                                    "relapse-remitting multiple sclerosis": "multiple sclerosis",
                                    "relapsing remitting multiple sclerosis": "multiple sclerosis",
                                    "relapsing multiple sclerosis": "multiple sclerosis",
                                    "relapsing-remitting multiple sclerosis": "multiple sclerosis",
                                    "skin care": "",
                                    "solid tumors": "solid tumours",
                                    "aids": "AIDS",
                                    "nutritional status": "",					
                                    "nutritional supplement": ""
                                    }

In [42]:
conditions = []

for combination in clinical_trials["Medical Condition"].unique():
    for x in combination.split("; "):
        x = x.strip().lower()
        if x in condition_abnornaml_list_mapping:
            if condition_abnornaml_list_mapping[x] != "":
                conditions.append(condition_abnornaml_list_mapping[x])
        elif "transplant" in x:
            pass
        else:
            conditions.append(x)

conditions = sorted(list(set(conditions)))

content = ""
for c in conditions:
    content += c + "\n"

with open("conditions.txt", "w") as f:
    f.write(content)

Here, we need to head to gcp and do some ner. healthcare_nlp_experiment.ipynb

GCP healthcare NLP requires at least 10 characters

In [44]:
condition_gcp_nlp = pd.read_csv("condition_gcp_ner.tsv", sep="\t")
condition_gcp_nlp.head()

Unnamed: 0,text,preferredTerm,entityId,confidence,HPO,MSH
0,AIDS,Acquired Immunodeficiency Syndrome;AID - Artif...,UMLS/C0001175;UMLS/C0021588,0.863419,,MSH/D000163;MSH/D007316
1,acellular pertussis,Pertussis,UMLS/C0043167,0.986963,,MSH/D014917
2,acne vulgaris,Acne Vulgaris,UMLS/C0001144,0.980972,,MSH/D000152
3,acute coronary syndrome,Acute Coronary Syndrome,UMLS/C0948089,0.91688,HPO/HP:0033678,MSH/D054058
4,acute myeloid leukemia,"Leukemia, Myelocytic, Acute",UMLS/C0023467,0.812012,HPO/HP:0004808,MSH/D015470


In [47]:
condition_nlp_mapping = {}
for i, row in condition_gcp_nlp.iterrows():
    temp = {}
    for h in ["preferredTerm", "entityId", "HPO", "MSH"]:
        if ";" in str(row[h]):
            temp[h] = row[h].split(";")[0]
        else:
            temp[h] = row[h]
    condition_nlp_mapping[row["text"]] = temp



In [48]:
condition_nlp_mapping

{'AIDS': {'preferredTerm': 'Acquired Immunodeficiency Syndrome',
  'entityId': 'UMLS/C0001175',
  'HPO': nan,
  'MSH': 'MSH/D000163'},
 'acellular pertussis': {'preferredTerm': 'Pertussis',
  'entityId': 'UMLS/C0043167',
  'HPO': nan,
  'MSH': 'MSH/D014917'},
 'acne vulgaris': {'preferredTerm': 'Acne Vulgaris',
  'entityId': 'UMLS/C0001144',
  'HPO': nan,
  'MSH': 'MSH/D000152'},
 'acute coronary syndrome': {'preferredTerm': 'Acute Coronary Syndrome',
  'entityId': 'UMLS/C0948089',
  'HPO': 'HPO/HP:0033678',
  'MSH': 'MSH/D054058'},
 'acute myeloid leukemia': {'preferredTerm': 'Leukemia, Myelocytic, Acute',
  'entityId': 'UMLS/C0023467',
  'HPO': 'HPO/HP:0004808',
  'MSH': 'MSH/D015470'},
 'adrenocortical carcinoma': {'preferredTerm': 'Adrenocortical carcinoma',
  'entityId': 'UMLS/C0206686',
  'HPO': 'HPO/HP:0006744',
  'MSH': 'MSH/D018268'},
 'advanced breast cancer': {'preferredTerm': 'Malignant Neoplasms',
  'entityId': 'UMLS/C0006826',
  'HPO': 'HPO/HP:0002664',
  'MSH': 'MSH/D009

In [50]:
trial_conditions = []

for i, row in clinical_trials.iterrows():
    for x in row["Medical Condition"].split("; "):
        x = x.strip().lower()

        query = ""
        if x in condition_abnornaml_list_mapping:
            if condition_abnornaml_list_mapping[x] != "":
                query = condition_abnornaml_list_mapping[x]
        else:
            query = x

        if query in condition_nlp_mapping:
            trial_conditions.append((row["Posting ID"], condition_nlp_mapping[query]["entityId"]))

In [51]:
trial_conditions

[(1, 'UMLS/C0001144'),
 (3, 'UMLS/C0024141'),
 (4, 'UMLS/C0033860'),
 (8, 'UMLS/C0001144'),
 (9, 'UMLS/C0001144'),
 (10, 'UMLS/C0001144'),
 (11, 'UMLS/C0001144'),
 (12, 'UMLS/C0033860'),
 (13, 'UMLS/C0020538'),
 (14, 'UMLS/C0020538'),
 (16, 'UMLS/C0027497'),
 (19, 'UMLS/C0001144'),
 (20, 'UMLS/C0011603'),
 (21, 'UMLS/C0024115'),
 (22, 'UMLS/C0024115'),
 (23, 'UMLS/C0024115'),
 (32, 'UMLS/C0035455'),
 (33, 'UMLS/C0035455'),
 (34, 'UMLS/C0035455'),
 (35, 'UMLS/C0035455'),
 (36, 'UMLS/C0035455'),
 (37, 'UMLS/C0035455'),
 (38, 'UMLS/C0035455'),
 (39, 'UMLS/C0035455'),
 (40, 'UMLS/C0004096'),
 (41, 'UMLS/C0004096'),
 (42, 'UMLS/C0004096'),
 (43, 'UMLS/C0024115'),
 (44, 'UMLS/C0024115'),
 (45, 'UMLS/C0024115'),
 (46, 'UMLS/C0024115'),
 (47, 'UMLS/C0004096'),
 (48, 'UMLS/C0948089'),
 (49, 'UMLS/C0013922'),
 (50, 'UMLS/C0040053'),
 (51, 'UMLS/C0035455'),
 (52, 'UMLS/C0004096'),
 (53, 'UMLS/C0525045'),
 (54, 'UMLS/C0014544'),
 (55, 'UMLS/C0005586'),
 (56, 'UMLS/C0005586'),
 (74, 'UMLS/C0020557'