# Clinical trials: Parse all XML files


In [None]:
#Data: clinicaltrials.gov
#Source: wget https://clinicaltrials.gov/AllPublicXML.zip
#Date: September 14, 2018 [9pm]

In [None]:
# reset variables if needed
# %reset

In [None]:
import zipfile
import sys
import time
import json
import re
import os
import xml.etree.ElementTree as ET
import shutil
import pandas as pd
from collections import Counter 

In [None]:
# Print messages to the terminal
# Rebinds sys.stdout to a new writable handle on /dev/stdout, so every later
# print() call in this session writes to that handle.
# NOTE(review): presumably meant to send the per-file progress messages to the
# terminal running the kernel instead of the notebook output; the path only
# exists on Unix-like systems and the handle is never closed -- confirm.

sys.stdout = open('/dev/stdout', 'w')


## Import all xml files from folder

In [None]:
def parse_xml_files(path_to_folder):
    """Parse every ``.xml`` file in a folder and collect the root elements.

    Each root element is appended to the module-level ``all_parsed_files``
    list, which must exist before this function is called. Entries whose
    name does not end in ``.xml`` are ignored. Every file is fully parsed
    into memory, so this is slow on large folders.

    Args:
        path_to_folder: Directory that contains the XML files.

    Returns:
        The module-level ``all_parsed_files`` list, including anything it
        already held before the call.
    """
    start = time.time()

    # os.listdir returns entries in arbitrary order; sorting makes the order
    # of the parsed trees reproducible between runs.
    for filename in sorted(os.listdir(path_to_folder)):
        if not filename.endswith('.xml'):
            continue
        fullname = os.path.join(path_to_folder, filename)

        try:
            root = ET.parse(fullname).getroot()
        except ET.ParseError as error:
            # A single malformed file must not abort a run over the whole folder.
            print("{} skipped, not well-formed XML: {}\n".format(fullname, error))
            continue

        all_parsed_files.append(root)

        # Checking message to the terminal
        print("{} file parsed\n".format(fullname))

    performance = round(time.time() - start)

    print('Number of files parsed: {}'.format(len(all_parsed_files)))
    print('Files parsed in: {} seconds'.format(performance))

    return all_parsed_files

# Folders with xml files
xml_trials_path = 'data/all_trials/' # folder with all xml files
test_folder = 'data/test/' # use for debugging

# Global list that parse_xml_files appends every parsed root element to;
# it has to exist before the function is called
all_parsed_files = []

# Run function (%time is the IPython magic that reports how long the call took)
%time parse_xml_files(xml_trials_path)

In [None]:
'''
Number of files parsed: 284374
Files parsed in: 2001 seconds
CPU times: user 6min 52s, sys: 13min 4s, total: 19min 56s
Wall time: 33min 21s
'''

In [None]:
'''
Alternative to parse_xml_files, written as a plain loop:
Parse a folder of xml files and save them in the all_parsed_files variable
'''

xml_trials = 'data/all_trials/'

all_parsed_files = []

# Sorted so that the order of the parsed trees is reproducible.
xml_file_names = sorted(name for name in os.listdir(xml_trials) if name.endswith('.xml'))

start = time.time()

for xml in xml_file_names:
    # Store the parsed root element, not the file name: the later cells call
    # .iter(tag) on every item of all_parsed_files, which a plain string
    # does not support.
    all_parsed_files.append(ET.parse(os.path.join(xml_trials, xml)).getroot())
    print('File parsed {}'.format(xml))

# One timing for the whole loop instead of one %time report per file.
print('Files parsed in: {} seconds'.format(round(time.time() - start)))
print('Number of files in data_trials_xml: {}'.format(len(all_parsed_files)))
print('First files: {}'.format(xml_file_names[0:5]))

In [None]:
#Resulting variable

# all_parsed_files

## Parse all xml files and dump results in a json file

In [None]:
# Create folder for json file

json_folder = 'data/json/'

try:
    # makedirs also creates the parent 'data/' folder when it is missing.
    os.makedirs(json_folder)
except FileExistsError:
    # Re-running the cell is expected; an existing folder is not an error.
    print("Folder {} already exists".format(json_folder))
except OSError as e:
    # Best effort: report any other failure (e.g. permissions) and carry on.
    print(e)
else:
    print("Folder {} created".format(json_folder))


In [None]:
# Shared state used by the tag-parsing function

# Base name of the JSON file to write (folder and ".json" suffix are added
# where the file is opened)
json_file = 'all_trials_json'

# Accumulator: each tag name maps to the list of values collected for it
all_data_dictionary = dict()


In [None]:
# Slow. ~30 minutes to parse all files checking for only two tags

def create_dictionary_from_tag(tag, parsed_files=None, name_dictionary=None):
    """Collect the text of every ``tag`` element and export the result to JSON.

    The text of each matching element is appended to the list stored under
    ``tag`` in ``name_dictionary`` (the list is created when missing, and
    values from earlier calls are kept). The whole dictionary is then
    written to ``<json_folder><json_file>.json``; both names are read from
    the module-level variables.

    Args:
        tag: Name of the XML tag to search for.
        parsed_files: Iterable of parsed XML root elements. Defaults to the
            module-level ``all_parsed_files`` as it is at call time.
        name_dictionary: Dictionary to update in place and to dump. Defaults
            to the module-level ``all_data_dictionary`` as it is at call time.

    Returns:
        None. Results are available in ``name_dictionary`` and in the JSON file.
    """
    # Resolve the defaults at call time so that re-running the cells which
    # rebind these globals is picked up without redefining this function.
    if parsed_files is None:
        parsed_files = all_parsed_files
    if name_dictionary is None:
        name_dictionary = all_data_dictionary

    values = []

    for root in parsed_files:
        for element in root.iter(tag):
            values.append(element.text)  # get values
            # Checking message to the terminal
            print("{} file parsed\n".format(element.text))

    # Create the key on first use, then append to whatever is already stored.
    stored_values = name_dictionary.setdefault(tag, [])

    for value in values:
        stored_values.append(value)
        print("{} file added to the dictionary\n".format(value))

    json_path = '{}{}.json'.format(json_folder, json_file)

    # Dump the dictionary that was actually updated, not the global one.
    with open(json_path, 'w') as fp:
        json.dump(name_dictionary, fp)
    print('JSON file created\n')

    # Measure only after the file is closed, otherwise buffered data is not
    # yet on disk and the reported size is too small.
    json_size = round(os.path.getsize(json_path) / 1000000, 2)
    print("JSON file: {} Mb".format(json_size))

In [None]:
# Execute function

%time create_dictionary_from_tag('nct_id')
%time create_dictionary_from_tag('nct_id')
%time create_dictionary_from_tag('brief_title')
%time create_dictionary_from_tag('sponsor')
%time create_dictionary_from_tag('textblock')
%time create_dictionary_from_tag('intervention_name')
%time create_dictionary_from_tag('intervention_name')

In [None]:
# Collect the values of several tags from several files in one dictionary

tags_dictionary = {}

def parsing_several_tags(name_dictionary, tag, parsed_files):
    """Collect the text of every element that matches one of several tags.

    Args:
        name_dictionary: Dictionary updated in place. Each element tag found
            maps to the list of text values collected for it; values are
            appended to a list that is already stored under that tag.
        tag: A single tag name or an iterable of tag names.
        parsed_files: Parsed XML root elements. Iterated once per tag, so
            pass a list rather than a one-shot iterator.

    Returns:
        The same ``name_dictionary`` object that was passed in.
    """
    # A bare string would otherwise be iterated character by character.
    tag_names = [tag] if isinstance(tag, str) else tag

    for tag_name in tag_names:
        for root in parsed_files:
            for element in root.iter(tag_name):
                # Keep every value in a list; a flat tag -> text mapping
                # would retain only the last value found for each tag.
                name_dictionary.setdefault(element.tag, []).append(element.text)

                print("{} file parsed\n".format(element.text))

    print("Keys and Values from several tags & files stored")

    return name_dictionary


list_of_tags = ['nct_id','country']

# Keep the returned dictionary; discarding it can leave tags_dictionary empty.
tags_dictionary = parsing_several_tags(tags_dictionary, list_of_tags, all_parsed_files)
print('Tags stored: {}'.format(sorted(tags_dictionary)))

## Import json file in a dataframe

In [None]:
# Import json into a dataframe
# The JSON file maps each tag to a list of values, and those lists can differ
# in length; pd.read_json rejects columns of unequal length. Wrapping each
# list in a Series lets pandas pad the shorter columns with NaN instead.
# NOTE: rows are aligned by position only, so the values on one row do not
# necessarily belong to the same trial.
with open(json_folder + json_file + '.json') as fp:
    df = pd.DataFrame({tag: pd.Series(values, dtype=object)
                       for tag, values in json.load(fp).items()})

In [None]:
# Show the column labels of the dataframe
df.columns

In [None]:
# Show the rows labelled 0 through 5 (.loc slices by label and includes both ends)
df.loc[0:5]