# Clinical trials: Parse all XML files


In [145]:
#Data: clinicaltrials.gov
#Source: wget https://clinicaltrials.gov/AllPublicXML.zip
#Date: September 14, 2018 [9pm]

In [146]:
# reset variables and turn off autosave
%reset
%autosave 0

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [147]:
import zipfile
import sys
import time
import json
import re
import os
import xml.etree.ElementTree as ET
import shutil
import pandas as pd
from collections import Counter 

In [148]:
# Print messages to the terminal

sys.stdout = open('/dev/stdout', 'w')


In [None]:
''' GO TO IMPORT JSON FILE INTO A DATAFRAME
    IF JSON FILE AVAILABLE'''

## Import all xml files from folder

In [149]:
'''
Parse a folder of xml files and save them in a all_parsed_files variable.
Slow function that checks for xml files
'''

def parse_xml_files(path_to_folder):
    start = time.time()
    
    for filename in os.listdir(path_to_folder):
        if not filename.endswith('.xml'): continue
        fullname = os.path.join(path_to_folder, filename)
        all_parsed_files.append(ET.parse(fullname).getroot())
        
        # Checking message to the terminal
        print("{} file parsed\n".format(fullname))
        
    end = time.time()
    performance = round(end - start)

    print('Number of files parsed: {}'.format(len(all_parsed_files)))
    print('Files parsed in: {} seconds'.format(str(performance)))
        
    return all_parsed_files

In [150]:
'''
Create paths and folders
Execute parsing function
'''

# Folders with xml files
xml_trials_path = 'data/all_trials/'

# Test folder ~280 files
test_folder = 'data/test/'

# Variable for all parsed files
all_parsed_files = []


# Run function
%time parse_xml_files(xml_trials_path)


'''
Estimated: 
Number of files parsed: 284374
Files parsed in: 1989 seconds
CPU times: user 6min 41s, sys: 13min 27s, total: 20min 9s
Wall time: 33min 9s
'''

[<Element 'clinical_study' at 0x11d61c3b8>,
 <Element 'clinical_study' at 0x11d61c138>,
 <Element 'clinical_study' at 0x11e4893b8>,
 <Element 'clinical_study' at 0x11e668e08>,
 <Element 'clinical_study' at 0x10cc532c8>,
 <Element 'clinical_study' at 0x11e1925e8>,
 <Element 'clinical_study' at 0x11e1989f8>,
 <Element 'clinical_study' at 0x11e1a1908>,
 <Element 'clinical_study' at 0x11e1aaa48>,
 <Element 'clinical_study' at 0x11e1f26d8>,
 <Element 'clinical_study' at 0x11e1fd1d8>,
 <Element 'clinical_study' at 0x11e2041d8>,
 <Element 'clinical_study' at 0x11e24ca98>,
 <Element 'clinical_study' at 0x11e25db88>,
 <Element 'clinical_study' at 0x11e2699a8>,
 <Element 'clinical_study' at 0x11e28bcc8>,
 <Element 'clinical_study' at 0x11e29d318>,
 <Element 'clinical_study' at 0x11e2ac278>,
 <Element 'clinical_study' at 0x11e2b4728>,
 <Element 'clinical_study' at 0x11e2cff98>,
 <Element 'clinical_study' at 0x11e2f99f8>,
 <Element 'clinical_study' at 0x11e303e58>,
 <Element 'clinical_study' at 0x

## Parse all xml files and dump results in a single json file

In [151]:
'''
Create folder and path for json file
'''

json_folder = 'data/json/'

try:
    os.mkdir(json_folder)
    if name:
        print("Folder {} created".format(name))
except IOError as e:
    print(e)
    pass


In [152]:
# Variables for parsing function: path and dictionary

all_data_dictionary = {}

json_file = 'all_trials_json' #name json file  


## Method 1: Function to parse tags in files

In [153]:
# Slow. ~30 minutes to parse all files checking for only two tags
# All tags need to be in the doc or will throw an error

'''
Find values by tag in XML files, save them in a dictionary, and export them to a json file
'''

def create_dictionary_from_tag(tag, parsed_files = all_parsed_files, name_dictionary = all_data_dictionary):
    
    # Variable to store values
    keys = []
    values = []
    
    # Iterate through all xml parsed files and tags, and append data to values
    for n in parsed_files:
        for i in n.iter(tag):
            if i.text != 0:
                values.append(i.text)
            else:
                values.append('nan') #get values
                # Checking message to the terminal
            print("{} file parsed\n".format(i.text))
    
    # Create dictionary and set tags as keys
    name_dictionary.setdefault(tag, [])
    
    # Append values to dictionary
    for i in values:
        name_dictionary[tag].append(i)
        print("{} file added to the dictionary\n".format(i))
    
    # Dump dictionary into a JSON file
    with open('{}{}.json'.format(json_folder, json_file), 'w') as fp:
        json.dump(all_data_dictionary, fp)
        print('JSON file created\n')
        
        json_size = round(os.path.getsize(json_folder + json_file + '.json') / 1000000, 2)
        print("JSON file: {} Mb".format(json_size))
    
        

In [154]:
# Execute function only with unique values
%time create_dictionary_from_tag('nct_id')
%time create_dictionary_from_tag('study_first_submitted')
%time create_dictionary_from_tag('source')

'''JSON file: 21.58 Mb
CPU times: user 28.7 s, sys: 1min 51s, total: 2min 20s
Wall time: 4min 47s'''

In [None]:
'''
# Tags potentially with multiple or missing values
'official title'
'url'
'detailed_description'
'brief_summary'

''''

## >> Import json file in a dataframe

In [157]:
'''
Check number of values for each key before creating a dataframe
'''
%time 
for key, value in all_data_dictionary.items():
    print(key, len(list(filter(bool, value))))


In [155]:
# Import json into a dataframe

df = pd.read_json(json_folder + json_file + '.json')


ValueError: arrays must all be same length

In [None]:
print(df.columns)
print(df.describe())

In [143]:
df

Unnamed: 0,nct_id,study_first_submitted,source,brief_summary
0,NCT00000233,"September 20, 1999",National Institute on Drug Abuse (NIDA),\n
1,NCT00000227,"September 20, 1999",University of Vermont,\n
2,NCT00000145,"September 23, 1999",National Eye Institute (NEI),\n
3,NCT00000151,"September 23, 1999",National Eye Institute (NEI),\n
4,NCT00000179,"October 29, 1999",National Institute on Aging (NIA),\n
5,NCT00000192,"September 20, 1999",Yale University,\n
6,NCT00000347,"September 20, 1999",National Institute on Drug Abuse (NIDA),\n
7,NCT00000353,"September 20, 1999",National Institute on Drug Abuse (NIDA),\n
8,NCT00000384,"November 2, 1999",University of Pennsylvania,\n
9,NCT00000390,"January 17, 2000",National Institute of Mental Health (NIMH),\n
