# Clinical trials: Parse all XML files


In [57]:
#Data: clinicaltrials.gov
#Source: wget https://clinicaltrials.gov/AllPublicXML.zip
#Date: September 14, 2018 [9pm]

In [58]:
# reset variables if needed
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [59]:
import zipfile
import sys
import time
import json
import re
import os
import xml.etree.ElementTree as ET
import shutil
import pandas as pd
from collections import Counter 

In [60]:
# Print messages to the terminal

sys.stdout = open('/dev/stdout', 'w')


## Import all xml files from folder

In [61]:
'''
Parse a folder of xml files and save them in a all_parsed_files variable.
Slow function that checks for xml files
'''

def parse_xml_files(path_to_folder):
    start = time.time()
    
    for filename in os.listdir(path_to_folder):
        if not filename.endswith('.xml'): continue
        fullname = os.path.join(path_to_folder, filename)
        all_parsed_files.append(ET.parse(fullname).getroot())
        
        # Checking message to the terminal
        print("{} file parsed\n".format(fullname))
        
    end = time.time()
    performance = round(end - start)

    print('Number of files parsed: {}'.format(len(all_parsed_files)))
    print('Files parsed in: {} seconds'.format(str(performance)))
        
    return all_parsed_files

In [62]:
'''
Execute function
'''

# Folders with xml files
xml_trials_path = 'data/all_trials/' #all xml files
test_folder = 'data/test/' #use for debugging

# Variable for all parsed files
all_parsed_files = []

# Run function
%time parse_xml_files(test_folder)

[<Element 'clinical_study' at 0x117705958>,
 <Element 'clinical_study' at 0x108263228>,
 <Element 'clinical_study' at 0x117903ea8>,
 <Element 'clinical_study' at 0x1177639f8>]

## Parse all xml files and dump results in a single json file

In [63]:
# Create folder for json file

json_folder = 'data/json/'

try:
    os.mkdir(json_folder)
    if name:
        print("Folder {} created".format(name))
except IOError as e:
    print(e)
    pass


In [64]:
# Variables for parsing function: path and dictionary

all_data_dictionary = {}

json_file = 'all_trials_json' #name json file  


### Function to parse tags in files (slow)

In [65]:
# Slow. ~30 minutes to parse all files checking for only two tags

'''
Find values by tag in XML files, save them in a dictionary, and export them to a json file
'''

def create_dictionary_from_tag(tag, parsed_files = all_parsed_files, name_dictionary = all_data_dictionary):
    
    # Variable to store values
    values = []
    
    # Iterate through all xml parsed files and tags, and append data to values
    for n in parsed_files:
        for i in n.iter(tag):
            values.append(i.text) #get values
            # Checking message to the terminal
            print("{} file parsed\n".format(i.text))
    
    # Create dictionary and set tags as keys
    name_dictionary.setdefault(tag, [])
    
    # Append values to dictionary
    for i in values:
        name_dictionary[tag].append(i)
        print("{} file added to the dictionary\n".format(i))
    
    # Dump dictionary into a JSON file
    with open('{}{}.json'.format(json_folder, json_file), 'w') as fp:
        json.dump(all_data_dictionary, fp)
        print('JSON file created\n')
        
        json_size = round(os.path.getsize(json_folder + json_file + '.json') / 1000000, 2)
        print("JSON file: {} Mb".format(json_size))
        
    return name_dictionary
        
        

In [66]:
# Execute function
%time create_dictionary_from_tag('nct_id')
%time create_dictionary_from_tag('condition')

{'nct_id': ['NCT00000106', 'NCT00000104', 'NCT00000105', 'NCT00000102'],
 'condition': ['Rheumatic Diseases',
  'Lead Poisoning',
  'Cancer',
  'Congenital Adrenal Hyperplasia']}

In [70]:
# New additions to the dictionary
%time create_dictionary_from_tag('brief_title')

{'nct_id': ['NCT00000106', 'NCT00000104', 'NCT00000105', 'NCT00000102'],
 'condition': ['Rheumatic Diseases',
  'Lead Poisoning',
  'Cancer',
  'Congenital Adrenal Hyperplasia'],
 'brief_title': ['41.8 Degree Centigrade Whole Body Hyperthermia for the Treatment of Rheumatoid Diseases',
  'Does Lead Burden Alter Neuropsychological Development?',
  'Vaccination With Tetanus and KLH to Assess Immune Responses.',
  'Congenital Adrenal Hyperplasia: Calcium Channels as Therapeutic Targets']}

## Import json file in a dataframe

In [71]:
# Import json into a dataframe

df = pd.read_json(json_folder + json_file + '.json')


In [72]:
df.columns

Index(['nct_id', 'condition', 'brief_title'], dtype='object')

In [74]:
df

Unnamed: 0,nct_id,condition,brief_title
0,NCT00000106,Rheumatic Diseases,41.8 Degree Centigrade Whole Body Hyperthermia...
1,NCT00000104,Lead Poisoning,Does Lead Burden Alter Neuropsychological Deve...
2,NCT00000105,Cancer,Vaccination With Tetanus and KLH to Assess Imm...
3,NCT00000102,Congenital Adrenal Hyperplasia,Congenital Adrenal Hyperplasia: Calcium Channe...
