# Extract indications from clinical trials on ClinicalTrials.gov

+ [documentation](https://clinicaltrials.gov/ct2/help/how-read-study "How to read a study record")

In [7]:
import os
import collections
import urllib
import urllib.request
import zipfile
import xml.etree.ElementTree
import gzip
import io
import re
import random

import pandas

## Download XML-formatted study records and save randomly sampled studies

In [2]:
# # Run to download all trials as xml
# query = {'resultsxml': 'true'}
# query_str = urllib.parse.urlencode(query)
# query_url = 'http://clinicaltrials.gov/search?{}'.format(query_str) 
# zip_path = 'download/resultsxml.zip'
# urllib.request.urlretrieve(query_url, zip_path)

In [3]:
def zip_reader(path, max_records=None):
    """
    Iterate over the XML records stored in a zip archive such as
    `download/resultsxml.zip`.

    path: location of the zip archive to read.
    max_records: maximum number of records to yield. `None` (the default)
        reads every member of the archive; zero or a negative number
        yields nothing.

    Yields `(filename, tree)` tuples, where `filename` is the member name
    inside the archive and `tree` is the `xml.etree.ElementTree.ElementTree`
    parsed from that member.
    """
    with zipfile.ZipFile(path) as open_zip:
        filenames = open_zip.namelist()
        if max_records is not None:
            # Truncate up front so a limit of zero reads nothing at all;
            # clamp at zero because a negative slice bound would instead
            # drop members from the end of the list.
            filenames = filenames[:max(max_records, 0)]
        for filename in filenames:
            with open_zip.open(filename) as open_xml:
                yield filename, xml.etree.ElementTree.parse(open_xml)

In [4]:
%%time
# Prepare sample xml files
# Seed the generator so the same records are selected on every run.
random.seed(0)
path = 'download/resultsxml.zip'
# Create the output directory up front; `tree.write` does not create
# missing parent directories and would otherwise fail on the first sample.
sample_dir = os.path.join('download', 'sample')
os.makedirs(sample_dir, exist_ok=True)
for filename, tree in zip_reader(path):
    # Keep each record independently with probability 1e-4
    if random.random() < 1e-4:
        sample_path = os.path.join(sample_dir, filename)
        tree.write(sample_path)

CPU times: user 5min 5s, sys: 2.46 s, total: 5min 8s
Wall time: 5min 8s


## The code below is still in progress

In [5]:
# Commit of the dhimmel/mesh repository to read the term tables from
commit = '9e16dfdca6c6d32cf8d1dcb4149c86be58a1a029'

# Load the MeSH descriptor terms
url = 'https://github.com/dhimmel/mesh/blob/{}/data/descriptor-terms.tsv?raw=true'.format(commit)
desc_df = pandas.read_table(url)

# Load the MeSH supplemental terms
url = 'https://github.com/dhimmel/mesh/blob/{}/data/supplemental-terms.tsv?raw=true'.format(commit)
supp_df = pandas.read_table(url)

# No term name may occur in both tables, otherwise the merged lookup
# below would silently keep only the supplemental identifier.
assert set(desc_df.TermName).isdisjoint(set(supp_df.TermName))

# Map every MeSH term name to its unique identifier: descriptor terms
# first, then the supplemental terms added on top.
mesh_name_to_id = dict(zip(desc_df.TermName, desc_df.DescriptorUI))
mesh_name_to_id.update(zip(supp_df.TermName, supp_df.SupplementalRecordUI))

In [8]:
# Tally of the term names that could not be mapped to a MeSH identifier
unmatched_terms = collections.Counter()

def get_mesh_id(name):
    """
    Look up the MeSH identifier for a term name in `mesh_name_to_id`.

    The exact name is tried first, followed by the name with its first
    character lowercased. A name that matches neither is tallied in
    `unmatched_terms` and `None` is returned. An empty or `None` name
    (e.g. the text of an empty XML element) is treated as unmatched
    rather than raising.
    """
    # Match by name
    mesh_id = mesh_name_to_id.get(name)
    # Match by name with first letter lowercase; the truthiness guard
    # keeps `name[0]` from failing on '' or None.
    if mesh_id is None and name:
        mesh_id = mesh_name_to_id.get(name[0].lower() + name[1:])
    # Record the miss; `None` is returned for unmatched names
    if mesh_id is None:
        unmatched_terms[name] += 1
    return mesh_id

def get_mesh_ids(names):
    """
    Convert term names to MeSH identifiers with `get_mesh_id`, keeping the
    input order and leaving out every name that has no match.
    """
    matched = []
    for term_name in names:
        identifier = get_mesh_id(term_name)
        if identifier is None:
            continue
        matched.append(identifier)
    return matched

In [None]:
def parse_study_xml(study):
    """
    Extract selected fields from one parsed clinical study record.

    study: a parsed XML study record that supports `findtext` and `findall`
        (an `xml.etree.ElementTree` tree or element).

    Returns a `pandas.Series` with the scalar fields `nct_id`, `study_type`,
    `brief_title`, `brief_summary`, `overall_status`, `start_date` and
    `phase` (`None` when the element is absent, except `brief_summary`
    which falls back to an empty string), and the list-valued fields
    `conditions`, `intervention_drugs`, `mesh_conditions` and
    `mesh_interventions`. The two `mesh_*` fields hold the identifiers
    returned by `get_mesh_ids` for the study's browse terms.
    """
    brief_summary = study.findtext('brief_summary/textblock', '')
    drug_name_path = 'intervention[intervention_type="Drug"]/intervention_name'
    record = {
        'nct_id': study.findtext('id_info/nct_id'),
        'study_type': study.findtext('study_type'),
        'brief_title': study.findtext('brief_title'),
        # Collapse line breaks and the spaces around them into single spaces
        'brief_summary': re.sub(r' *\n *', ' ', brief_summary).strip(),
        'overall_status': study.findtext('overall_status'),
        'start_date': study.findtext('start_date'),
        'phase': study.findtext('phase'),
        'conditions': [x.text for x in study.findall('condition')],
        # Only interventions whose type is "Drug" contribute a name
        'intervention_drugs': [x.text for x in study.findall(drug_name_path)],
        'mesh_conditions': get_mesh_ids(
            x.text for x in study.findall('condition_browse/mesh_term')),
        'mesh_interventions': get_mesh_ids(
            x.text for x in study.findall('intervention_browse/mesh_term')),
    }
    # Build the Series in one step; object dtype keeps strings, None and
    # lists as-is and avoids growing a dtype-less empty Series key by key.
    return pandas.Series(record, dtype=object)

In [None]:
# Read studies from zipfile
studies = list()
zip_path = 'download/resultsxml.zip'

# Parse every record in the archive into one Series per study
for filename, element_tree in zip_reader(zip_path):
    series = parse_study_xml(element_tree)
    studies.append(series)

# One row per study, one column per parsed field
study_df = pandas.DataFrame(studies)
len(study_df)

In [None]:
# Preview the first rows of the parsed study table
study_df.head()

In [None]:
# Save clinical trials, pipe delimiting plural fields
# Columns whose cells hold a list of strings rather than a single value
plural_columns = ['conditions', 'intervention_drugs', 'mesh_conditions', 'mesh_interventions']

# Work on a copy so `study_df` keeps its list-valued cells
write_df = study_df.copy()
for column in plural_columns:
    write_df[column] = write_df[column].map('|'.join)

# Write a gzip-compressed, tab-separated table without the index column
with gzip.open('data/results.tsv.gz', 'wt') as write_file:
    write_df.to_csv(write_file, sep='\t', index=False)