In [1]:
import os
import urllib
import zipfile
import xml.etree.ElementTree
import gzip
import io
import re

import pandas

In [None]:
# Run to download all trials as xml
# The assert below deliberately aborts this cell, so running the notebook
# top to bottom stops here instead of starting the download.
assert False # long operation, comment out if desired
# Build the search URL; the only query parameter sent is resultsxml=true.
query = {'resultsxml': 'true'}
# NOTE(review): only `import urllib` is visible at the top of the notebook;
# this line needs the urllib.parse submodule to be loaded -- confirm.
query_str = urllib.parse.urlencode(query)
query_url = 'http://clinicaltrials.gov/search?{}'.format(query_str) 
zip_path = 'download/resultsxml.zip'
# IPython shell escape: {zip_path} and {query_url} are interpolated from the
# Python variables above, and the command's output is kept in `capture`.
capture = ! wget --timestamping --output-document "{zip_path}" "{query_url}"

In [2]:
# Load the MeSH descriptor and supplemental term tables (tab-separated,
# pinned to a fixed commit of the dhimmel/mesh repository)
url = 'https://raw.githubusercontent.com/dhimmel/mesh/a7036a37302973b15ab949aab4056d9bc062910e/data/descriptor-terms.tsv'
desc_df = pandas.read_csv(url, sep='\t')

url = 'https://raw.githubusercontent.com/dhimmel/mesh/a7036a37302973b15ab949aab4056d9bc062910e/data/supplemental-terms.tsv'
supp_df = pandas.read_csv(url, sep='\t')

# A term name may not occur in both tables, otherwise the merged lookup
# below would silently keep only one of the two identifiers
assert set(desc_df.TermName).isdisjoint(supp_df.TermName)

# Lookup from MeSH term name to its unique identifier:
# descriptor terms first, then supplemental record terms
mesh_name_to_id = {}
mesh_name_to_id.update(zip(desc_df.TermName, desc_df.DescriptorUI))
mesh_name_to_id.update(zip(supp_df.TermName, supp_df.SupplementalRecordUI))

In [3]:
def map_term_to_mesh(name):
    """Return the MeSH unique identifier for a term name, or None.

    The name is looked up verbatim in ``mesh_name_to_id`` first. If that
    fails, the lookup is retried with the first character lower-cased.

    Parameters
    ----------
    name : str or None
        Term name to look up. ``None`` and the empty string are tolerated
        (an XML element without text yields ``None``) and count as unmatched.

    Returns
    -------
    The identifier stored in ``mesh_name_to_id``, or ``None`` when no match
    is found. Unmatched names are reported on stdout.
    """
    ui = mesh_name_to_id.get(name)
    if ui is not None:
        return ui
    # Guard the indexing below: name[0] would raise on '' or None.
    if name:
        first_lower = name[0].lower() + name[1:]
        ui = mesh_name_to_id.get(first_lower)
        if ui is not None:
            return ui
    print('MeSH term not matched', name)
    return None

In [4]:
def map_terms_to_mesh(names):
    """Map an iterable of term names to MeSH identifiers, dropping misses.

    Parameters
    ----------
    names : iterable of str
        Term names; any iterable (including a generator) is accepted.

    Returns
    -------
    list
        Identifiers returned by ``map_term_to_mesh`` in input order, with
        the names that could not be matched (``None`` results) left out.
    """
    uis = (map_term_to_mesh(name) for name in names)
    # Explicit identity test: the previous filter((None).__ne__, ...) relied
    # on the truthiness of NotImplemented, which newer Pythons reject.
    return [ui for ui in uis if ui is not None]

In [5]:
def _element_texts(study, path):
    """Return the non-empty text of every element of ``study`` matching ``path``."""
    return [element.text for element in study.findall(path) if element.text]


def parse_study_xml(study):
    """Extract the fields of interest from one clinical study XML document.

    Parameters
    ----------
    study : xml.etree.ElementTree.ElementTree or Element
        Parsed study; only ``findtext`` and ``findall`` are used on it.

    Returns
    -------
    pandas.Series
        Object-dtype series with, in order: ``nct_id``, ``study_type``,
        ``brief_title``, ``brief_summary``, ``overall_status``,
        ``start_date``, ``phase`` (each a string, or ``None`` when the element
        is absent; ``brief_summary`` defaults to ``''``), followed by the
        list-valued ``conditions``, ``intervention_drugs``,
        ``mesh_conditions`` and ``mesh_interventions``.

    Elements without text are skipped in the list-valued fields, so the
    lists only ever contain strings.
    """
    brief_summary = study.findtext('brief_summary/textblock', '')
    record = {
        'nct_id': study.findtext('id_info/nct_id'),
        'study_type': study.findtext('study_type'),
        'brief_title': study.findtext('brief_title'),
        # Collapse the hard line breaks (and their indentation) into spaces
        'brief_summary': re.sub(r' *\n *', ' ', brief_summary).strip(),
        'overall_status': study.findtext('overall_status'),
        'start_date': study.findtext('start_date'),
        'phase': study.findtext('phase'),
        'conditions': _element_texts(study, 'condition'),
        # Only interventions whose type is "Drug" contribute a name
        'intervention_drugs': _element_texts(
            study, 'intervention[intervention_type="Drug"]/intervention_name'),
        'mesh_conditions': map_terms_to_mesh(
            _element_texts(study, 'condition_browse/mesh_term')),
        'mesh_interventions': map_terms_to_mesh(
            _element_texts(study, 'intervention_browse/mesh_term')),
    }
    # Build the series in one step with an explicit dtype instead of
    # enlarging an empty, dtype-less Series one label at a time.
    return pandas.Series(record, dtype=object)

In [6]:
# Parse every trial XML document stored in the downloaded archive
zip_path = 'download/resultsxml.zip'
studies = []

with zipfile.ZipFile(zip_path) as open_zip:
    filenames = open_zip.namelist()
    for filename in filenames:
        # One archive member per trial: parse it and keep the extracted fields
        with open_zip.open(filename) as open_xml:
            studies.append(parse_study_xml(xml.etree.ElementTree.parse(open_xml)))

# One row per trial; the cell displays the number of trials read
study_df = pandas.DataFrame(studies)
len(study_df)

190954

In [7]:
# Preview the first rows of the parsed trials table
study_df.head()

Unnamed: 0,nct_id,study_type,brief_title,brief_summary,overall_status,start_date,phase,conditions,intervention_drugs,mesh_conditions,mesh_interventions
0,NCT01137227,Observational,Factors Associated With Physical Inactivity Am...,Despite the acknowledgment that physical activ...,"Active, not recruiting",April 2010,,"[Physical Inactivity, Obesity, Overweight]",[],[],[]
1,NCT00852527,Observational,Evaluation of VA's Traumatic Brain Injury (TBI...,The purpose of the proposed study is to determ...,Completed,June 2009,,[Traumatic Brain Injury],[],[D001930],[]
2,NCT00704327,Observational,Evaluating the Impact of Cerebral Ischemic And...,The purpose of this study is to evaluate wheth...,Recruiting,February 2008,,[Cognitive Decline],[],[],[]
3,NCT00005027,Interventional,Rebeccamycin Analog in Treating Patients With ...,RATIONALE: Drugs used in chemotherapy use diff...,Completed,June 2000,Phase 2,[Kidney Cancer],[becatecarin],"[D002292, D007680]",[]
4,NCT00305227,Interventional,Intravaginal LACTIN-V for Prevention of Recurr...,Recurrent urinary tract infections (RUTIS) con...,Completed,March 2006,Phase 2,[Urinary Tract Infection],"[Lactin-V, Placebo]","[D003141, D007239, D014552]",[]


In [8]:
# Save clinical trials, pipe delimiting plural fields
# Work on a copy so study_df keeps its list-valued columns
write_df = study_df.copy()
plural_columns = ['conditions', 'intervention_drugs', 'mesh_conditions', 'mesh_interventions']
for column in plural_columns:
    # Each cell holds a list of strings; flatten it to a single 'a|b|c' string
    write_df[column] = write_df[column].map('|'.join)

# Pin the text encoding: without it gzip.open falls back to the platform's
# locale encoding, so non-ASCII text could fail to write or differ by machine.
with gzip.open('data/results.tsv.gz', 'wt', encoding='utf-8') as write_file:
    write_df.to_csv(write_file, sep='\t', index=False)