# Clinical trials: Parse all XML files


In [1]:
#Data: clinicaltrials.gov

In [2]:
# reset variables and turn off autosave
%reset
%autosave 0

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


Autosave disabled


In [3]:
import zipfile
import sys
import time
import json
import re
import os
import xml.etree.ElementTree as ET
import shutil
import pandas as pd
from collections import Counter 

In [4]:
# Print messages to the terminal

sys.stdout = open('/dev/stdout', 'w')


## Setup: create paths and folders

In [5]:
'''
Create paths and folders
'''

# Folders with xml files
path_to_all_xml_trials = os.path.abspath('../data/all_trials/')

# Test folder ~100 files
path_to_test_folder = os.path.abspath('../data/test/')

#Remove old files from test folder
shutil.rmtree(path_to_test_folder) 

# Path for json folder
path_to_json_file = os.path.abspath('../data/json/')

# Variable for all parsed files
all_parsed_files = []

print('\nPaths and folders created\n')

In [6]:
# Create folders

create_folders_errors = []

def create_folders(paths=[]):
    for p in paths:
        try:
            os.mkdir(p)
            if p:
                print('{} created'.format(p))
        except IOError as e:
            create_folders_errors.append(e)
            pass

    print('\nSetup folders created with exceptions: {}\n'.format(create_folders_errors))
    
all_folders = [path_to_all_xml_trials, path_to_test_folder, path_to_json_file]
    
# create folders
create_folders(all_folders)

### Create test folder with 50 random files

In [7]:
'''
Select 50 random files from all trials and save them in test folder
'''
files_for_test = 200

import random

# Pending: If files in test folder, delete them
random_files = []
random_files = random.sample(os.listdir(path_to_all_xml_trials), files_for_test)
print('Random files: {}'.format(len(random_files)))

for f in random_files:
    src = os.path.join(path_to_all_xml_trials, f)
    shutil.copy(src, path_to_test_folder)


In [8]:
print(random_files[0:5])

## Parse all xml files

In [7]:
'''
Parse all xml files and save them in a all_parsed_files variable.
Slow. Checks for xml files
'''

def parse_xml_files(path_to_folder):
    for filename in os.listdir(path_to_folder):
        if not filename.endswith('.xml'): continue
        fullname = os.path.join(path_to_folder, filename)
        all_parsed_files.append(ET.parse(fullname).getroot())
        
        # Checking message to the terminal
        print("{} file parsed\n".format(fullname))

    print('Number of files parsed: {}\n'.format(len(all_parsed_files)))
        
    return all_parsed_files

In [8]:
# Run function
all_parsed_files.clear()
%time parse_xml_files(path_to_all_xml_trials)


[<Element 'clinical_study' at 0x119b217c8>,
 <Element 'clinical_study' at 0x11ab9e638>,
 <Element 'clinical_study' at 0x11abae6d8>,
 <Element 'clinical_study' at 0x11abba4f8>,
 <Element 'clinical_study' at 0x11abc8098>,
 <Element 'clinical_study' at 0x11abd34a8>,
 <Element 'clinical_study' at 0x11abe13b8>,
 <Element 'clinical_study' at 0x11abf14f8>,
 <Element 'clinical_study' at 0x11b092188>,
 <Element 'clinical_study' at 0x11b09ac28>,
 <Element 'clinical_study' at 0x11b0a6c28>,
 <Element 'clinical_study' at 0x11b0b2548>,
 <Element 'clinical_study' at 0x11b0c3638>,
 <Element 'clinical_study' at 0x11b0d7458>,
 <Element 'clinical_study' at 0x11b0fa818>,
 <Element 'clinical_study' at 0x11b10be08>,
 <Element 'clinical_study' at 0x11b119d68>,
 <Element 'clinical_study' at 0x11b12b278>,
 <Element 'clinical_study' at 0x11b149ae8>,
 <Element 'clinical_study' at 0x11b178598>,
 <Element 'clinical_study' at 0x11b1839f8>,
 <Element 'clinical_study' at 0x11b192408>,
 <Element 'clinical_study' at 0x

In [9]:
print(len(all_parsed_files))

## Extract tags from parsed trials

In [10]:
# Dictionary to hold all data
all_data_dictionary = {}

# Final json file
json_file = '/all_trials_json' #name json file  

print('\nCreated dictionary and path to json file\n')


### Parse tags in files

In [11]:
'''
Find values by unique tags in XML files 
Save them in a dictionary 
Export them to a json file
'''

def create_dictionary_from_tag(tag, parsed_files = all_parsed_files, name_dictionary = all_data_dictionary):
    
    # Variable to store values
    keys = []
    values = []
    
    # Iterate through all xml parsed files and tags, and append data to values
    for n in parsed_files:
        for i in n.iter(tag):
            if i.text != 0:
                values.append(i.text)
            else:
                values.append('nan') #get values
                # Checking message to the terminal
            print("{} file parsed\n".format(i.text))
    
    # Create dictionary and set tags as keys
    name_dictionary.setdefault(tag, [])
    
    # Append values to dictionary
    for i in values:
        name_dictionary[tag].append(i)
        print("{} file added to the dictionary\n".format(i))
        
        
# Execute function only with unique values
%time create_dictionary_from_tag('nct_id')
%time create_dictionary_from_tag('study_first_submitted')
%time create_dictionary_from_tag('source')
%time create_dictionary_from_tag('brief_title')

print('Dictionary created')


In [12]:
'''
Check number of values for each key before creating a dataframe
'''

for key, value in all_data_dictionary.items():
    print(key, len(list(filter(bool, value))))

In [13]:
# Append first condition to dictionary
# Add Nan for missing values

all_data_dictionary.setdefault('condition', [])

for n in all_parsed_files:
        value_conditions = n.find('condition')
        if n.find('condition') is not None:
            all_data_dictionary['condition'].append(value_conditions.text)
        else:
            all_data_dictionary['condition'].append('None')


In [15]:
print(all_data_dictionary.keys())
# all_data_dictionary['condition']

In [16]:
'''
Check number of values for each key before creating a dataframe
'''

#Pending: check only condition 

for key, value in all_data_dictionary.items():
    print(key, len(list(filter(bool, value))))


## Dump results in a json file

In [17]:
# Dump dictionary into a JSON file
with open('{}{}.json'.format(path_to_json_file, json_file), 'w') as fp:
    json.dump(all_data_dictionary, fp)
    print('JSON file created\n')

    json_size = round(os.path.getsize(path_to_json_file + json_file + '.json') / 1000000, 2)
    print("JSON file: {} Mb".format(json_size))

## Import json file in a dataframe

In [18]:
# Import json into a dataframe
file = '{}{}.json'.format(path_to_json_file, json_file)

df = pd.read_json(file)


In [20]:
# Create a new column: study_first_submitted as dates

df['dates'] = pd.to_datetime(df['study_first_submitted'])

In [21]:
df.describe()

Unnamed: 0,nct_id,study_first_submitted,source,brief_title,condition,dates
count,286328,286328,286328,286328,286328,286328
unique,286328,6055,18304,284435,47596,6055
top,NCT02703272,"September 13, 2005",National Institutes of Health Clinical Center ...,[Trial of device that is not approved or clear...,Healthy,2005-09-13 00:00:00
freq,1,1700,4272,798,6274,1700
first,,,,,,1999-09-17 00:00:00
last,,,,,,2018-10-04 00:00:00


In [23]:
df = df.sort_values(by ='dates')

In [29]:
df.info()

In [25]:
df.head(10)

Unnamed: 0,nct_id,study_first_submitted,source,brief_title,condition,dates
46038,NCT00004640,"September 17, 1999",University of Washington,"""Clinical Trials to Enhance Elders' Oral Healt...",Tooth Loss,1999-09-17
150757,NCT00004639,"September 17, 1999",University of Florida,Cleft Palate Surgery and Speech Development,Cleft Lip,1999-09-17
43264,NCT00000341,"September 20, 1999",National Institute on Drug Abuse (NIDA),Evaluation of Liquid vs. Tablet Buprenorphine - 6,Opioid-Related Disorders,1999-09-20
195656,NCT00000289,"September 20, 1999",National Institute on Drug Abuse (NIDA),Role of Metabolites in Nicotine Dependence (3)...,Tobacco Use Disorder,1999-09-20
528,NCT00000227,"September 20, 1999",University of Vermont,Alternate-Day Buprenorphine Administration. Ph...,Opioid-Related Disorders,1999-09-20
142427,NCT00000254,"September 20, 1999",University of Chicago,Isoflurane at Subanesthetic Concentrations - 6,Opioid-Related Disorders,1999-09-20
124206,NCT00000257,"September 20, 1999",University of Chicago,Effects of Alcohol History on Effects of Nitro...,Alcohol-Related Disorders,1999-09-20
89379,NCT00000251,"September 20, 1999",University of Chicago,Effects of Subanesthetic Concentrations of Iso...,Opioid-Related Disorders,1999-09-20
213966,NCT00000275,"September 20, 1999",National Institute on Drug Abuse (NIDA),Cocaine Abuse and ADHD - 10,Cocaine-Related Disorders,1999-09-20
262794,NCT00000188,"September 20, 1999",University of Pennsylvania,Selegiline in Treatment of Cocaine Dependence - 2,Cocaine-Related Disorders,1999-09-20


In [26]:
df.tail(10)

Unnamed: 0,nct_id,study_first_submitted,source,brief_title,condition,dates
8829,NCT03697902,"October 4, 2018",Hospital Universitario Dr. Jose E. Gonzalez,Influence of Imaging Angle in Measurement of C...,Fetal Cisterna Magna Length,2018-10-04
117070,NCT03697967,"October 4, 2018",Maisonneuve-Rosemont Hospital,Supine vs Prone Position at Birth Before Cord ...,Newborn; Anemia,2018-10-04
71927,NCT03698266,"October 4, 2018",Queen's University,Is Needle Knife Fistulotomy An Effective First...,ERCP,2018-10-04
243900,NCT03697707,"October 4, 2018",DCPrime BV,Efficacy and Safety of Immunotherapy With Allo...,Acute Myeloid Leukemia in Remission,2018-10-04
198496,NCT03697824,"October 4, 2018",GlaxoSmithKline,"Clinical Trial of Safety, Tolerability and Ant...",Neoplasms,2018-10-04
26842,NCT03697915,"October 4, 2018",Istanbul Bilgi University,Implementing Lift System for Early Mobilization,Inpatient Facililty Diagnoses,2018-10-04
40693,NCT03698175,"October 4, 2018",University of Oxford,Effects of a Brief Mental Exercise on Emotiona...,Three Good Things,2018-10-04
177526,NCT03698292,"October 4, 2018",Ain Shams University,Itopride in Feeding Intolerance of Critically-...,Critically-ill Patients,2018-10-04
17299,NCT03697889,"October 4, 2018",Johnson & Johnson Consumer and Personal Produc...,A Study to Assess Bioquivalence Between a Nove...,Healthy,2018-10-04
171511,NCT03697980,"October 4, 2018",Medtronic Cardiac Rhythm and Heart Failure,"Apogee, Addendum to the Destination Therapy Po...",Chronic Heart Failure,2018-10-04
