# Clinical trials: Parse all XML files


In [1]:
#Data: clinicaltrials.gov

In [2]:
# reset variables and turn off autosave
# %reset
# %autosave 0

In [3]:
import zipfile
import sys
import time
import json
import re
import os
import xml.etree.ElementTree as ET
import shutil
import pandas as pd
from collections import Counter 

In [4]:
# Print messages to the terminal

# sys.stdout = open('/dev/stdout', 'w')


## Setup: create paths and folders

In [5]:
'''
Create paths and folders
'''

# Folders with xml files
path_to_all_xml_trials = os.path.abspath('../data/all_trials/')

# Test folder ~100 files
path_to_test_folder = os.path.abspath('../data/test/')

#Remove old files from test folder
try:
    shutil.rmtree(path_to_test_folder) 
except FileNotFoundError:
    pass

# Path for json folder
path_to_json_file = os.path.abspath('../data/json/')

# Variable for all parsed files
all_parsed_files = []

print('\nPaths and folders created\n')


Paths and folders created



In [6]:
# Create folders

create_folders_errors = []

def create_folders(paths=[]):
    for p in paths:
        try:
            os.mkdir(p)
            if p:
                print('{} created'.format(p))
        except IOError as e:
            create_folders_errors.append(e)
            pass

    print('\nSetup folders created with exceptions: {}\n'.format(create_folders_errors))
    
all_folders = [path_to_all_xml_trials, path_to_test_folder, path_to_json_file]
    
# create folders
create_folders(all_folders)

/Users/cmserna/Sites/clinical trials/mvp/data/test created

Setup folders created with exceptions: [FileExistsError(17, 'File exists'), FileExistsError(17, 'File exists')]



### Create test folder with n random files

In [7]:
'''
Select n random files from all trials and save them in test folder
'''
files_for_test = 1000

import random

# Pending: If files in test folder, delete them
random_files = []
random_files = random.sample(os.listdir(path_to_all_xml_trials), files_for_test)
print('Random files: {}'.format(len(random_files)))

for f in random_files:
    src = os.path.join(path_to_all_xml_trials, f)
    shutil.copy(src, path_to_test_folder)


Random files: 1000


In [8]:
print(random_files[0:5])

['NCT03703011.xml', 'NCT00718315.xml', 'NCT03652753.xml', 'NCT03403504.xml', 'NCT01145014.xml']


## Parse xml files

In [9]:
'''
Parse all xml files and save them in a all_parsed_files variable.
Slow. Checks for xml files
'''

def parse_xml_files(path_to_folder):
    for filename in os.listdir(path_to_folder):
#         if not filename.endswith('.xml'): continue
        fullname = os.path.join(path_to_folder, filename)
        all_parsed_files.append(ET.parse(fullname).getroot())
        
        # Checking message to the terminal
        print("{} file parsed\n".format(fullname))

    print('Number of files parsed: {}\n'.format(len(all_parsed_files)))
        
    return all_parsed_files

In [10]:
# Run function
all_parsed_files.clear()
%time parse_xml_files(path_to_test_folder)


/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02802423.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02278575.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00921440.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT03706820.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00913380.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02079610.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT03103295.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01998529.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02785419.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT03372629.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01600157.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01466530.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT03729583.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02423317.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00624520.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02976012.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02982187.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02717234.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02524652.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00471770.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01230892.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00237224.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02198560.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00462917.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00784719.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02368288.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT03266198.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02978885.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT03602170.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00978965.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT03652753.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01022866.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT03172182.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00534664.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00371163.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00352547.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp


/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00718315.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01886755.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02306200.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01210898.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00003585.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00023127.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00056433.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01145014.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01987271.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01806948.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01993589.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT03727516.xml file parsed

/Users/cmserna/Sites/clinical trials/mv

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00359346.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02466594.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01838525.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02792491.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT03204539.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00352950.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00259636.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01627236.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01660633.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00461864.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01421953.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT03171532.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02001818.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00126360.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01926145.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01847222.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02474043.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02618109.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02226562.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00389077.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02215369.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01171313.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00385671.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT03582826.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00612105.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02149251.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT03409224.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01862107.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00743821.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02750995.xml file parsed

Number of files parsed: 1000

CPU times: user 1.07 s, sys: 170 ms, total: 1.24 s
Wall time: 1.23 s


[<Element 'clinical_study' at 0x107980f98>,
 <Element 'clinical_study' at 0x1116db9f8>,
 <Element 'clinical_study' at 0x1116eca48>,
 <Element 'clinical_study' at 0x1116fb228>,
 <Element 'clinical_study' at 0x111701958>,
 <Element 'clinical_study' at 0x111840a98>,
 <Element 'clinical_study' at 0x1120d8548>,
 <Element 'clinical_study' at 0x1120df9a8>,
 <Element 'clinical_study' at 0x1120ef688>,
 <Element 'clinical_study' at 0x1120f8e58>,
 <Element 'clinical_study' at 0x112468318>,
 <Element 'clinical_study' at 0x112477a48>,
 <Element 'clinical_study' at 0x11248b638>,
 <Element 'clinical_study' at 0x112493c78>,
 <Element 'clinical_study' at 0x1124a3098>,
 <Element 'clinical_study' at 0x111d33f98>,
 <Element 'clinical_study' at 0x111d416d8>,
 <Element 'clinical_study' at 0x111d48e58>,
 <Element 'clinical_study' at 0x111d5e368>,
 <Element 'clinical_study' at 0x111d6a2c8>,
 <Element 'clinical_study' at 0x1118fd368>,
 <Element 'clinical_study' at 0x1119130e8>,
 <Element 'clinical_study' at 0x

In [11]:
print(len(all_parsed_files))

1000


## Extract tags from parsed trials

In [12]:
# Dictionary to hold all data
all_data_dictionary = {}

# Final json file
json_file = '/all_trials_json' #name json file  

print('\nCreated dictionary and path to json file\n')



Created dictionary and path to json file



### Parse tags in files

In [13]:
'''
Find values by unique tags in XML files 
Save them in a dictionary 
Export them to a json file
'''

def create_dictionary_from_tag(tag, parsed_files = all_parsed_files, name_dictionary = all_data_dictionary):
    
    # Variable to store values
    keys = []
    values = []
    
    # Iterate through all xml parsed files and tags, and append data to values
    for n in parsed_files:
        for i in n.iter(tag):
            if i.text != 0:
                values.append(i.text)
            else:
                values.append('nan') #get values
                # Checking message to the terminal
            print("{} file parsed\n".format(i.text))
    
    # Create dictionary and set tags as keys
    name_dictionary.setdefault(tag, [])
    
    # Append values to dictionary
    for i in values:
        name_dictionary[tag].append(i)
        print("{} file added to the dictionary\n".format(i))
        
        
# Execute function only with unique values
%time create_dictionary_from_tag('nct_id')
%time create_dictionary_from_tag('study_first_submitted')
%time create_dictionary_from_tag('source')
%time create_dictionary_from_tag('brief_title')
%time create_dictionary_from_tag('study_type')

print('Dictionary created')


NCT02802423 file parsed

NCT02278575 file parsed

NCT00921440 file parsed

NCT03706820 file parsed

NCT00913380 file parsed

NCT02079610 file parsed

NCT03103295 file parsed

NCT01998529 file parsed

NCT02785419 file parsed

NCT03372629 file parsed

NCT01600157 file parsed

NCT01466530 file parsed

NCT03195036 file parsed

NCT02941042 file parsed

NCT03040804 file parsed

NCT02704715 file parsed

NCT00881543 file parsed

NCT03334734 file parsed

NCT02019537 file parsed

NCT01781273 file parsed

NCT02772640 file parsed

NCT00582400 file parsed

NCT01480102 file parsed

NCT00158886 file parsed

NCT02390817 file parsed

NCT01643330 file parsed

NCT02515461 file parsed

NCT02386709 file parsed

NCT01933542 file parsed

NCT01373944 file parsed

NCT01547104 file parsed

NCT00452634 file parsed

NCT03561584 file parsed

NCT03580746 file parsed

NCT03142828 file parsed

NCT03442413 file parsed

NCT00247260 file parsed

NCT02221011 file parsed

NCT00005996 file parsed

NCT01860495 file parsed




February 28, 2018 file parsed

May 20, 2014 file parsed

November 21, 2016 file parsed

October 14, 2014 file parsed

March 17, 2010 file parsed

August 28, 2012 file parsed

February 14, 2017 file parsed

October 21, 2016 file parsed

April 24, 2007 file parsed

December 20, 2007 file parsed

November 30, 2009 file parsed

August 25, 2015 file parsed

March 13, 2017 file parsed

September 28, 2018 file parsed

September 2, 2013 file parsed

September 28, 2005 file parsed

February 17, 2011 file parsed

May 6, 2006 file parsed

April 24, 2014 file parsed

April 5, 2002 file parsed

January 30, 2009 file parsed

November 12, 2009 file parsed

October 8, 2012 file parsed

February 24, 2000 file parsed

November 21, 2014 file parsed

May 10, 2016 file parsed

July 22, 2016 file parsed

August 2, 2011 file parsed

February 25, 2014 file parsed

February 4, 2017 file parsed

November 14, 2016 file parsed

July 15, 2015 file parsed

February 25, 2016 file parsed

July 7, 2016 file parsed

M

January 15, 2014 file added to the dictionary

August 19, 2009 file added to the dictionary

February 27, 2013 file added to the dictionary

November 1, 1999 file added to the dictionary

April 15, 2008 file added to the dictionary

September 5, 2008 file added to the dictionary

March 24, 2014 file added to the dictionary

October 15, 2003 file added to the dictionary

May 30, 2015 file added to the dictionary

July 10, 2016 file added to the dictionary

April 22, 2015 file added to the dictionary

June 4, 2018 file added to the dictionary

March 31, 2015 file added to the dictionary

April 21, 2008 file added to the dictionary

August 5, 2013 file added to the dictionary

April 11, 2006 file added to the dictionary

December 10, 2014 file added to the dictionary

October 25, 2015 file added to the dictionary

May 3, 2017 file added to the dictionary

February 10, 2015 file added to the dictionary

June 26, 2013 file added to the dictionary

March 14, 2007 file added to the dictionary


University of Aarhus file added to the dictionary

VA Office of Research and Development file added to the dictionary

Louis Stokes VA Medical Center file added to the dictionary

University of California, Irvine file added to the dictionary

George Mason University file added to the dictionary

Loma Linda University file added to the dictionary

University of California, San Francisco file added to the dictionary

Société Française de Recherche et de Médecine du Sommeil file added to the dictionary

The George Institute for Global Health, China file added to the dictionary

Rutgers, The State University of New Jersey file added to the dictionary

Aesculap, Inc. file added to the dictionary

Boehringer Ingelheim file added to the dictionary

University of Bologna file added to the dictionary

Ente Ospedaliero Cantonale, Bellinzona file added to the dictionary

Pfizer file added to the dictionary

IRCCS San Raffaele file added to the dictionary

University of Aarhus file added to the d

Multidisciplinary Approach for Treat To Target In Rheumatoid Arthritis file parsed

Interobserver Agreement of Endoscopic Ultrasonography and Endoscopic Sonoelastography in the Evaluation of Lymph Nodes file parsed

Study to Explore Pharmacokinetics and Pharmacodynamics of a Single Rising Dose of BI 135585 XX file parsed

Insulin Resistance in Severely Obese Patients file parsed

Cluster Randomized Trial of Peer Health Education in Malaria in The Gambia file parsed

Insulin Clamp Ancillary Study for Assessment of Insulin Resistance file parsed

Interleukin (IL)-13 as a Marker in Pediatrics Asthma file parsed

A Phase 1 Study of OCV-C02 in Patients With Advanced or Relapsed Colorectal Cancer file parsed

Combination Chemotherapy Following Surgery in Treating Patients With Advanced Bladder Cancer file parsed

Retrospective Brazilian Study of Fulvestrant in Advanced Breast Cancer file parsed

Treatment of Malignant Vertebral Fractures With Percutaneous Balloon Kyphoplasty. file parsed

Th


Observational [Patient Registry] file parsed

Interventional file parsed

Interventional file parsed

Observational file parsed

Interventional file parsed

Observational file parsed

Interventional file parsed

Interventional file parsed

Observational file parsed

Interventional file parsed

Interventional file parsed

Observational file parsed

Interventional file parsed

Interventional file parsed

Interventional file parsed

Interventional file parsed

Interventional file parsed

Interventional file parsed

Interventional file parsed

Interventional file parsed

Interventional file parsed

Interventional file parsed

Interventional file parsed

Interventional file parsed

Observational file parsed

Interventional file parsed

Interventional file parsed

Interventional file parsed

Observational file parsed

Interventional file parsed

Interventional file parsed

Observational file parsed

Interventional file parsed

Observational file parsed

Interventional file parsed

Intervent

In [14]:
'''
Check number of values for each key before creating a dataframe
'''

for key, value in all_data_dictionary.items():
    print(key, len(list(filter(bool, value))))

nct_id 1000
study_first_submitted 1000
source 1000
brief_title 1000
study_type 1000


In [15]:
# Function for adding new tag

def add_new_tags(new_key):
    all_data_dictionary.setdefault(new_key, [])

    for n in all_parsed_files:
            value_conditions = n.find(new_key)
            if n.find(new_key) is not None:
                all_data_dictionary[new_key].append(value_conditions.text)
            else:
                all_data_dictionary[new_key].append('None')

    print('{} tag added'.format(new_key))


In [16]:
add_new_tags('condition')
add_new_tags('condition_browse/mesh_term')
add_new_tags('intervention_browse/mesh_term')
add_new_tags('detailed_description/textblock')
add_new_tags('brief_summary/textblock')
add_new_tags('location/facility/address/city')
add_new_tags('location/facility/address/country')
add_new_tags('location/facility/address/zip')

condition tag added
condition_browse/mesh_term tag added
intervention_browse/mesh_term tag added
detailed_description/textblock tag added
brief_summary/textblock tag added
location/facility/address/city tag added
location/facility/address/country tag added
location/facility/address/zip tag added


In [17]:
'''
Check keys and number of values for each one
'''

print(all_data_dictionary.keys())

for key, value in all_data_dictionary.items():
    print(key, len(list(filter(bool, value))))


dict_keys(['nct_id', 'study_first_submitted', 'source', 'brief_title', 'study_type', 'condition', 'condition_browse/mesh_term', 'intervention_browse/mesh_term', 'detailed_description/textblock', 'brief_summary/textblock', 'location/facility/address/city', 'location/facility/address/country', 'location/facility/address/zip'])
nct_id 1000
study_first_submitted 1000
source 1000
brief_title 1000
study_type 1000
condition 1000
condition_browse/mesh_term 1000
intervention_browse/mesh_term 1000
detailed_description/textblock 1000
brief_summary/textblock 1000
location/facility/address/city 1000
location/facility/address/country 1000
location/facility/address/zip 1000


## Dump results in a json file

In [18]:
# Dump dictionary into a JSON file
with open('{}{}.json'.format(path_to_json_file, json_file), 'w') as fp:
    json.dump(all_data_dictionary, fp)
    print('\nJSON file created\n')

    json_size = round(os.path.getsize(path_to_json_file + json_file + '.json') / 1000000, 2)
    print("JSON file: {} Mb".format(json_size))


JSON file created

JSON file: 2.27 Mb


## Test: Import json file in a dataframe

In [19]:
# Import json into a dataframe
file = '{}{}.json'.format(path_to_json_file, json_file)

df = pd.read_json(file)


In [20]:
df.head()

Unnamed: 0,nct_id,study_first_submitted,source,brief_title,study_type,condition,condition_browse/mesh_term,intervention_browse/mesh_term,detailed_description/textblock,brief_summary/textblock,location/facility/address/city,location/facility/address/country,location/facility/address/zip
0,NCT02802423,"June 13, 2016","BioLite, Inc.","A Phase I/II, Open Label Study to Evaluate the...",Interventional,Triple Negative Breast Cancer,Breast Neoplasms,Docetaxel,,\n The primary purpose of this study is t...,,,
1,NCT02278575,"October 28, 2014",Vastra Gotaland Region,Atenativ Effect on Uterine Blood Flow and Pree...,Interventional,Antithrombin III Deficiency,Pre-Eclampsia,Antithrombins,\n The study will be an open controlled p...,\n The study will be an open controlled p...,Gothenburg,Sweden,41685
2,NCT00921440,"June 15, 2009",University of Cologne,Computed Tomography Coronary Angiography (CTCA...,Observational,Coronary Artery Disease,Coronary Artery Disease,,\n A total of 50 patients prospectively u...,\n The investigators' rationale was to ev...,,,
3,NCT03706820,"September 29, 2018",Aristotle University Of Thessaloniki,Exercise Hemodynamics in Patients With Pulmona...,Observational,Exercise Pulmonary Hypertension,"Hypertension, Pulmonary",,\n Patients with fibrotic pulmonary disea...,\n The study evaluates the rest and exerc...,Thessaloniki,Greece,57010
4,NCT00913380,"May 15, 2009",Seoul National University Bundang Hospital,Diagnosis of Acute Appendicitis: Low-dose Comp...,Interventional,Appendicitis,Appendicitis,,\n Acute appendicitis is a very common di...,\n The purpose of this study is to determ...,Seongnam,"Korea, Republic of",463-707


In [22]:
df['location/facility/address/city'].nunique()

453