# Clinical trials: Parse all XML files


In [1]:
#Data: clinicaltrials.gov

In [2]:
# reset variables and turn off autosave
%reset
%autosave 0

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


Autosave disabled


In [3]:
import zipfile
import sys
import time
import json
import re
import os
import xml.etree.ElementTree as ET
import shutil
import pandas as pd
from collections import Counter 

In [4]:
# Print messages to the terminal
# Rebinds sys.stdout to a new file object opened on '/dev/stdout', so every
# print() executed after this cell writes to that file object.
# NOTE(review): '/dev/stdout' is a Unix-style device path; presumably this
# notebook is only meant to run on Linux/macOS - confirm.

sys.stdout = open('/dev/stdout', 'w')


## Setup: create paths and folders

In [7]:
# Create paths and folders
#
# Each location is turned into an absolute path once, so later cells can use
# these names directly.

# Folder with the xml files of all trials
path_to_all_xml_trials = os.path.abspath('../data/all_trials/')

# Small test folder (~100 files)
path_to_test_folder = os.path.abspath('../data/test/')

# Folder for the json output
path_to_json_file = os.path.abspath('../data/json/')

# Collects every parsed file
all_parsed_files = list()


In [8]:
# Create folders

def create_folders(paths=None):
    """Create every folder listed in `paths`, including missing parents.

    Parameters
    ----------
    paths : iterable of str, optional
        Folder paths to create. ``None`` (the default) or an empty iterable
        creates nothing.

    Returns
    -------
    None

    Notes
    -----
    A folder that already exists is reported and left untouched, so the cell
    can be re-run safely. Any other OS error is printed and the remaining
    paths are still processed.
    """
    # `None` instead of `[]` as default: a list default would be one shared
    # object reused across all calls.
    for p in paths or []:
        if os.path.isdir(p):
            print('{} already exists'.format(p))
            continue
        try:
            # makedirs (unlike mkdir) also creates missing parent folders,
            # e.g. '../data' on a fresh checkout.
            os.makedirs(p)
            print('{} created'.format(p))
        except OSError as e:
            print(e)

    print('\nSetup folders created\n')
    
# Folders to create up front: all trials, the test sample, and the json output
all_folders = [
    path_to_all_xml_trials,
    path_to_test_folder,
    path_to_json_file,
]

create_folders(all_folders)

In [None]:
'''
Create the test folder by copying 100 random files from all trials
'''

import random


def create_test_folder(src_dir, dest_dir, n_files=100, seed=None):
    """Copy a random sample of files from `src_dir` into `dest_dir`.

    Parameters
    ----------
    src_dir : str
        Folder to sample from. Only regular files are considered.
    dest_dir : str
        Folder receiving the copies; created if it does not exist.
    n_files : int, optional
        Maximum number of files to copy. If `src_dir` holds fewer files,
        all of them are copied.
    seed : int, optional
        Seed for the random generator; pass a value for a reproducible sample.

    Returns
    -------
    list of str
        Names of the files that were copied.
    """
    # Sorted so that a given seed always selects the same files
    # (os.listdir returns names in arbitrary order).
    candidates = sorted(
        name for name in os.listdir(src_dir)
        if os.path.isfile(os.path.join(src_dir, name))
    )

    # random.sample raises ValueError when asked for more items than exist
    sample_size = min(n_files, len(candidates))
    chosen = random.Random(seed).sample(candidates, sample_size)

    os.makedirs(dest_dir, exist_ok=True)
    for fname in chosen:
        # shutil.copyfile needs the full destination file path, not a folder
        shutil.copyfile(os.path.join(src_dir, fname), os.path.join(dest_dir, fname))

    return chosen


filenames = create_test_folder(path_to_all_xml_trials, path_to_test_folder)
print('{} files copied to {}'.format(len(filenames), path_to_test_folder))




## Parse all xml files

In [None]:
def parse_xml_files(path_to_folder):
    """Parse all xml files in a folder and store their roots in `all_parsed_files`.

    Slow on the full data set. Files whose name does not end in '.xml' are
    skipped.

    Parameters
    ----------
    path_to_folder : str
        Folder containing the xml files.

    Returns
    -------
    list
        The module-level `all_parsed_files` list, with one root element
        appended per parsed file. The list is appended to, not reset, so
        calling the function twice adds the files twice.
    """
    # Start the clock here; the elapsed time is reported once parsing is done
    start = time.time()

    # Sorted so the files are parsed in the same order on every run
    for filename in sorted(os.listdir(path_to_folder)):
        if not filename.endswith('.xml'):
            continue
        fullname = os.path.join(path_to_folder, filename)
        all_parsed_files.append(ET.parse(fullname).getroot())

        # Checking message to the terminal
        print("{} file parsed\n".format(fullname))

    performance = round(time.time() - start)

    print('Number of files parsed: {}'.format(len(all_parsed_files)))
    print('Files parsed in: {} seconds'.format(str(performance)))

    return all_parsed_files

In [None]:
# Run function
%time parse_xml_files(xml_trials_path)

'''
Estimated: 
Number of files parsed: 284374
Files parsed in: 1989 seconds
CPU times: user 6min 41s, sys: 13min 27s, total: 20min 9s
Wall time: 33min 9s
'''

## Parse all xml files and dump results in a single json file

In [None]:
'''
Create folder and path for json file
'''

# The trailing slash is required: later cells build the file path by plain
# string concatenation (json_folder + json_file + '.json').
# NOTE(review): this path is relative to the working directory, while the
# setup cell uses '../data/json/' for path_to_json_file - confirm which one
# is intended.
json_folder = 'data/json/'

if os.path.isdir(json_folder):
    # Keeps the cell re-runnable without an error message
    print("Folder {} already exists".format(json_folder))
else:
    try:
        # makedirs also creates the missing 'data' parent folder
        os.makedirs(json_folder)
        print("Folder {} created".format(json_folder))
    except OSError as e:
        print(e)


In [None]:
# Shared inputs for the parsing function: output file name and result dictionary

json_file = 'all_trials_json'  # name of the json file, without extension

all_data_dictionary = dict()


## Method 1: Function to parse tags in files

In [None]:
# Slow. ~30 minutes to parse all files checking for only two tags
# A tag that is absent from a file adds no value for that file, so the lists
# stored under different tags can end up with different lengths.

def create_dictionary_from_tag(tag, parsed_files=None, name_dictionary=None):
    """Find values by tag in XML files, save them in a dictionary, and export them to a json file.

    Parameters
    ----------
    tag : str
        Tag name to look for in every parsed file.
    parsed_files : iterable of xml.etree.ElementTree.Element, optional
        Parsed XML roots to search. Defaults to the module-level
        `all_parsed_files`, looked up at call time.
    name_dictionary : dict, optional
        Dictionary that receives the values under the key `tag`. Defaults to
        the module-level `all_data_dictionary`, looked up at call time.

    Returns
    -------
    None

    Notes
    -----
    Values are appended to any list already stored under `tag`, so calling the
    function twice with the same tag stores the values twice. An element
    without text is stored as the string 'nan'. The JSON file named by the
    module-level `json_folder` and `json_file` is rewritten on every call with
    the full content of `name_dictionary`.
    """
    # Defaults are resolved here rather than in the signature so that the
    # function always sees the current module-level objects.
    if parsed_files is None:
        parsed_files = all_parsed_files
    if name_dictionary is None:
        name_dictionary = all_data_dictionary

    # Iterate through all xml parsed files and tags, and collect the values
    values = []
    for root in parsed_files:
        for element in root.iter(tag):
            # element.text is None for an empty element; store a placeholder
            value = element.text if element.text is not None else 'nan'
            values.append(value)
            # Checking message to the terminal
            print("{} file parsed\n".format(value))

    # Create the key if needed and append the values to the dictionary
    name_dictionary.setdefault(tag, [])
    for value in values:
        name_dictionary[tag].append(value)
        print("{} file added to the dictionary\n".format(value))

    # Dump the dictionary that was filled above into a JSON file
    json_path = '{}{}.json'.format(json_folder, json_file)
    with open(json_path, 'w') as fp:
        json.dump(name_dictionary, fp)
    print('JSON file created\n')

    # Measured after the file is closed so buffered data is already on disk
    json_size = round(os.path.getsize(json_path) / 1000000, 2)
    print("JSON file: {} Mb".format(json_size))
    
        

In [None]:
# Execute function only with unique values
# (presumably: tags expected to hold a single value per file - confirm).
# Every call reopens the same json file in write mode and dumps the dictionary
# collected so far, so the file grows with each tag.
%time create_dictionary_from_tag('nct_id')
%time create_dictionary_from_tag('study_first_submitted')
%time create_dictionary_from_tag('source')

# Presumably the output recorded from an earlier run:
'''JSON file: 21.58 Mb
CPU times: user 28.7 s, sys: 1min 51s, total: 2min 20s
Wall time: 4min 47s'''

In [None]:
'''
# Tags potentially with multiple or missing values
'official_title'
'url'
'detailed_description'
'brief_summary'

'''

## Import the json file into a dataframe

In [None]:
'''
Check number of values for each key before creating a dataframe
'''
%time 
for key, value in all_data_dictionary.items():
    print(key, len(list(filter(bool, value))))


In [None]:
# Load the exported json file into a pandas dataframe

df = pd.read_json('{}{}.json'.format(json_folder, json_file))


In [None]:
# Quick overview: column labels first, then summary statistics
for overview in (df.columns, df.describe()):
    print(overview)

In [None]:
# Show only the first rows; the full frame has one row per parsed trial
df.head()