# Clinical trials: Parse all XML files


In [1]:
#Data: clinicaltrials.gov

In [2]:
# reset variables and turn off autosave
%reset
%autosave 0

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


Autosave disabled


In [16]:
import zipfile
import sys
import time
import json
import re
import os
import xml.etree.ElementTree as ET
import shutil
import pandas as pd
from collections import Counter 

In [17]:
# Print messages to the terminal

# sys.stdout = open('/dev/stdout', 'w')


## Setup: create paths and folders

In [18]:
'''
Create paths and folders
'''

# Folders with xml files
path_to_all_xml_trials = os.path.abspath('../data/all_trials/')

# Test folder ~100 files
path_to_test_folder = os.path.abspath('../data/test/')

#Remove old files from test folder
try:
    shutil.rmtree(path_to_test_folder) 
except FileNotFoundError:
    pass

# Path for json folder
path_to_json_file = os.path.abspath('../data/json/')

# Variable for all parsed files
all_parsed_files = []

print('\nPaths and folders created\n')


Paths and folders created



In [19]:
# Create folders

create_folders_errors = []

def create_folders(paths=[]):
    for p in paths:
        try:
            os.mkdir(p)
            if p:
                print('{} created'.format(p))
        except IOError as e:
            create_folders_errors.append(e)
            pass

    print('\nSetup folders created with exceptions: {}\n'.format(create_folders_errors))
    
all_folders = [path_to_all_xml_trials, path_to_test_folder, path_to_json_file]
    
# create folders
create_folders(all_folders)

/Users/cmserna/Sites/clinical trials/mvp/data/test created

Setup folders created with exceptions: [FileExistsError(17, 'File exists'), FileExistsError(17, 'File exists')]



### Create test folder with n random files

In [20]:
'''
Select n random files from all trials and save them in test folder
'''
files_for_test = 1000

import random

# Pending: If files in test folder, delete them
random_files = []
random_files = random.sample(os.listdir(path_to_all_xml_trials), files_for_test)
print('Random files: {}'.format(len(random_files)))

for f in random_files:
    src = os.path.join(path_to_all_xml_trials, f)
    shutil.copy(src, path_to_test_folder)


Random files: 1000


In [21]:
print(random_files[0:5])

['NCT00313196.xml', 'NCT03562676.xml', 'NCT01731444.xml', 'NCT03508128.xml', 'NCT00513994.xml']


## Parse xml files

In [22]:
'''
Parse all xml files and save them in a all_parsed_files variable.
Slow. Checks for xml files
'''

def parse_xml_files(path_to_folder):
    for filename in os.listdir(path_to_folder):
#         if not filename.endswith('.xml'): continue
        fullname = os.path.join(path_to_folder, filename)
        all_parsed_files.append(ET.parse(fullname).getroot())
        
        # Checking message to the terminal
        print("{} file parsed\n".format(fullname))

    print('Number of files parsed: {}\n'.format(len(all_parsed_files)))
        
    return all_parsed_files

In [23]:
# Run function
all_parsed_files.clear()
%time parse_xml_files(path_to_test_folder)


/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00202488.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02907567.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01647750.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT03016897.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02684539.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT03258567.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01153581.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01923142.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02942563.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02027220.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00445003.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00342251.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00879853.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT03169946.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00876590.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT03709225.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02350517.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02768519.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00482729.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT03739866.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02605252.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT03562819.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02943811.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02848950.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01133158.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00889187.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01650129.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02638909.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00281255.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01934101.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01241370.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02500433.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00063297.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT03042286.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00774995.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02766530.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT03345758.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00326963.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02729012.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT03641118.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00362700.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00543595.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00461708.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT03510338.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02684409.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01919463.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02812927.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02680860.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00556049.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00790049.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00526357.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT03708224.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00078962.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01577225.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00628498.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01532037.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00074802.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02373774.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00829348.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01822652.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp

[<Element 'clinical_study' at 0x1153a9868>,
 <Element 'clinical_study' at 0x1153af2c8>,
 <Element 'clinical_study' at 0x1153bbae8>,
 <Element 'clinical_study' at 0x1155f68b8>,
 <Element 'clinical_study' at 0x115606868>,
 <Element 'clinical_study' at 0x11560b1d8>,
 <Element 'clinical_study' at 0x1156251d8>,
 <Element 'clinical_study' at 0x11576cd68>,
 <Element 'clinical_study' at 0x1157784f8>,
 <Element 'clinical_study' at 0x1157873b8>,
 <Element 'clinical_study' at 0x11591a368>,
 <Element 'clinical_study' at 0x115832bd8>,
 <Element 'clinical_study' at 0x115837368>,
 <Element 'clinical_study' at 0x115856368>,
 <Element 'clinical_study' at 0x11585ef48>,
 <Element 'clinical_study' at 0x115b3d0e8>,
 <Element 'clinical_study' at 0x115b4c0e8>,
 <Element 'clinical_study' at 0x115b4ec28>,
 <Element 'clinical_study' at 0x115b636d8>,
 <Element 'clinical_study' at 0x115b74098>,
 <Element 'clinical_study' at 0x10ac1c9f8>,
 <Element 'clinical_study' at 0x10ac235e8>,
 <Element 'clinical_study' at 0x

In [24]:
print(len(all_parsed_files))

1000


## Extract tags from parsed trials

In [25]:
# Dictionary to hold all data
all_data_dictionary = {}

# Final json file
json_file = '/all_trials_json' #name json file  

print('\nCreated dictionary and path to json file\n')



Created dictionary and path to json file



### Parse tags in files

In [26]:
'''
Find values by unique tags in XML files 
Save them in a dictionary 
Export them to a json file
'''

def create_dictionary_from_tag(tag, parsed_files = all_parsed_files, name_dictionary = all_data_dictionary):
    
    # Variable to store values
    keys = []
    values = []
    
    # Iterate through all xml parsed files and tags, and append data to values
    for n in parsed_files:
        for i in n.iter(tag):
            if i.text != 0:
                values.append(i.text)
            else:
                values.append('nan') #get values
                # Checking message to the terminal
            print("{} file parsed\n".format(i.text))
    
    # Create dictionary and set tags as keys
    name_dictionary.setdefault(tag, [])
    
    # Append values to dictionary
    for i in values:
        name_dictionary[tag].append(i)
        print("{} file added to the dictionary\n".format(i))
        
        
# Execute function only with unique values
%time create_dictionary_from_tag('nct_id')
%time create_dictionary_from_tag('study_first_submitted')
%time create_dictionary_from_tag('agency')
%time create_dictionary_from_tag('brief_title')

print('Dictionary created')


NCT00202488 file parsed

NCT02907567 file parsed

NCT01647750 file parsed

NCT03016897 file parsed

NCT02684539 file parsed

NCT03258567 file parsed

NCT01153581 file parsed

NCT01923142 file parsed

NCT02942563 file parsed

NCT02027220 file parsed

NCT00445003 file parsed

NCT00342251 file parsed

NCT02913664 file parsed

NCT02384447 file parsed

NCT03664479 file parsed

NCT03599570 file parsed

NCT00172510 file parsed

NCT00239967 file parsed

NCT02501616 file parsed

NCT02339844 file parsed

NCT02865057 file parsed

NCT02846467 file parsed

NCT01454804 file parsed

NCT03423563 file parsed

NCT02185508 file parsed

NCT00994370 file parsed

NCT00226304 file parsed

NCT02598037 file parsed

NCT02483403 file parsed

NCT00298415 file parsed

NCT02430805 file parsed

NCT00591656 file parsed

NCT01492062 file parsed

NCT00833118 file parsed

NCT01546233 file parsed

NCT02348515 file parsed

NCT00167323 file parsed

NCT03033550 file parsed

NCT00310674 file parsed

NCT03329599 file parsed




April 15, 2011 file parsed

September 2, 2005 file parsed

September 10, 2014 file parsed

September 20, 2011 file parsed

July 25, 2016 file parsed

May 3, 2010 file parsed

September 18, 2014 file parsed

October 5, 2011 file parsed

April 22, 2010 file parsed

November 2, 1999 file parsed

March 5, 2016 file parsed

April 16, 2016 file parsed

November 1, 1999 file parsed

August 27, 2015 file parsed

December 19, 2007 file parsed

June 11, 2013 file parsed

May 30, 2018 file parsed

June 24, 2014 file parsed

August 31, 2011 file parsed

January 6, 2015 file parsed

February 8, 2016 file parsed

October 22, 2015 file parsed

August 5, 2015 file parsed

March 2, 2018 file parsed

July 11, 2001 file parsed

November 4, 2014 file parsed

August 1, 2016 file parsed

June 1, 2007 file parsed

April 20, 2015 file parsed

August 28, 2017 file parsed

June 26, 2017 file parsed

March 7, 2016 file parsed

June 24, 2018 file parsed

August 10, 2015 file parsed

September 12, 2005 file parse


June 6, 2014 file added to the dictionary

March 23, 2016 file added to the dictionary

April 28, 2013 file added to the dictionary

October 29, 2009 file added to the dictionary

October 4, 2017 file added to the dictionary

January 15, 2018 file added to the dictionary

November 2, 1999 file added to the dictionary

July 28, 2014 file added to the dictionary

May 15, 2017 file added to the dictionary

December 23, 2013 file added to the dictionary

December 11, 2008 file added to the dictionary

May 8, 2017 file added to the dictionary

December 11, 2008 file added to the dictionary

January 13, 2009 file added to the dictionary

March 10, 2011 file added to the dictionary

April 17, 2018 file added to the dictionary

May 3, 2002 file added to the dictionary

December 5, 2012 file added to the dictionary

June 27, 2018 file added to the dictionary

March 17, 2018 file added to the dictionary

February 11, 2008 file added to the dictionary

July 1, 2015 file added to the dictionary




ErinoakKids Centre for Treatment and Development file parsed

McMaster Children's Hospital file parsed

Biovotec AS file parsed

National Cancer Institute (NCI) file parsed

Nanjing University School of Medicine file parsed

Braintree Laboratories file parsed

Dana-Farber Cancer Institute file parsed

University of Chicago file parsed

University of Wisconsin, Madison file parsed

University of Texas Southwestern Medical Center file parsed

Universidade Federal do Rio Grande do Norte file parsed

Universidade Estadual da Paraiba file parsed

Istituto Auxologico Italiano file parsed

Icahn School of Medicine at Mount Sinai file parsed

H. Lee Moffitt Cancer Center and Research Institute file parsed

Yonsei University file parsed

Ambu A/S file parsed

Wake Forest University Health Sciences file parsed

National Cancer Institute (NCI) file parsed

Seoul National University Hospital file parsed

Asan Medical Center file parsed

Dankook University file parsed

International Tuberculosis R

Amgen file added to the dictionary

University of North Carolina, Chapel Hill file added to the dictionary

Rare Diseases Clinical Research Network file added to the dictionary

National Center for Research Resources (NCRR) file added to the dictionary

National Institutes of Health (NIH) file added to the dictionary

Merck Sharp & Dohme Corp. file added to the dictionary

University of Oklahoma file added to the dictionary

Yale University file added to the dictionary

Kochi University file added to the dictionary

Northumbria University file added to the dictionary

Ginsana SA file added to the dictionary

MediQuest Therapeutics file added to the dictionary

Jaeb Center for Health Research file added to the dictionary

Eisai Inc. file added to the dictionary

Halozyme Therapeutics file added to the dictionary

New York State Psychiatric Institute file added to the dictionary

Pfizer file added to the dictionary

Institute of Liver and Biliary Sciences, India file added to the diction


Walking Aids in the Management of Knee Osteoarthritis file added to the dictionary

Evaluation of the Efficacy of Intramuscular Islet Autograft After Extensive Pancreatectomy file added to the dictionary

A Study Evaluating GDC-0980 Administered Once Weekly in Patients With Refractory Solid Tumors or Non-Hodgkin's Lymphoma file added to the dictionary

Treatment of Diabetic Macular Edema With Aflibercept in Subjects Previously Treated With Ranibizumab or Bevacizumab file added to the dictionary

A Four-Week Study Comparing Acetaminophen Extended Release and Rofecoxib in the Treatment of Osteoarthritis of the Knee file added to the dictionary

Paroxetine for Comorbid Social Anxiety Disorder and Alcoholism file added to the dictionary

MRA With Feraheme in HHT file added to the dictionary

Treatment of Natural Killer/T Cell Lymphoma-I/II file added to the dictionary

The Effect of Rifaximin on Portal Vein Thrombosis file added to the dictionary

A Relative Bioavailability Study of 50 mg

In [27]:
'''
Check number of values for each key before creating a dataframe
'''

for key, value in all_data_dictionary.items():
    print(key, len(list(filter(bool, value))))

nct_id 1000
study_first_submitted 1000
agency 1650
brief_title 1000


In [28]:
# Function for adding new tag

def add_new_tags(new_key):
    all_data_dictionary.setdefault(new_key, [])

    for n in all_parsed_files:
            value_conditions = n.find(new_key)
            if n.find(new_key) is not None:
                all_data_dictionary[new_key].append(value_conditions.text)
            else:
                all_data_dictionary[new_key].append('None')

    print('{} tag added'.format(new_key))


In [29]:
add_new_tags('condition')
add_new_tags('detailed_description/textblock')
add_new_tags('brief_summary/source')
add_new_tags('brief_summary/textblock')

condition tag added
detailed_description/textblock tag added
brief_summary/source tag added
brief_summary/textblock tag added


In [30]:
'''
Check keys and number of values for each one
'''

print(all_data_dictionary.keys())

for key, value in all_data_dictionary.items():
    print(key, len(list(filter(bool, value))))


dict_keys(['nct_id', 'study_first_submitted', 'agency', 'brief_title', 'condition', 'detailed_description/textblock', 'brief_summary/source', 'brief_summary/textblock'])
nct_id 1000
study_first_submitted 1000
agency 1650
brief_title 1000
condition 1000
detailed_description/textblock 1000
brief_summary/source 1000
brief_summary/textblock 1000


## Dump results in a json file

In [None]:
# Dump dictionary into a JSON file
with open('{}{}.json'.format(path_to_json_file, json_file), 'w') as fp:
    json.dump(all_data_dictionary, fp)
    print('\nJSON file created\n')

    json_size = round(os.path.getsize(path_to_json_file + json_file + '.json') / 1000000, 2)
    print("JSON file: {} Mb".format(json_size))

## Import json file in a dataframe

In [None]:
# Import json into a dataframe
file = '{}{}.json'.format(path_to_json_file, json_file)

df = pd.read_json(file)


In [None]:
df.head()

In [None]:
# rename columns & delete \n values

In [None]:
df['detailed_description/textblock'][0]

In [None]:
# Create a new column: study_first_submitted as dates

df['dates'] = pd.to_datetime(df['study_first_submitted'])

In [None]:
df.head()