# Clinical trials: Parse all XML files


In [1]:
# Data source: https://clinicaltrials.gov

In [2]:
# reset variables and turn off autosave
%reset
%autosave 0

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


Autosave disabled


In [3]:
import json
import os
import random
import re
import shutil
import sys
import time
import xml.etree.ElementTree as ET
import zipfile
from collections import Counter

import pandas as pd

In [4]:
# Print messages to the terminal.
# sys.__stdout__ is the stream the interpreter started with, so no new handle to the
# POSIX-only /dev/stdout path is opened (and left unclosed) each time this cell runs.
# NOTE(review): assumes the kernel was launched from a terminal; if sys.__stdout__
# is None the current stream is kept.
sys.stdout = sys.__stdout__ or sys.stdout


## Setup: create paths and folders

In [5]:
# Paths used throughout the notebook, plus the shared list of parsed trials.

# Folder holding every downloaded XML trial
path_to_all_xml_trials = os.path.abspath(os.path.join('..', 'data', 'all_trials'))

# Small test folder (a random subset of the trials)
path_to_test_folder = os.path.abspath(os.path.join('..', 'data', 'test'))

# Folder that receives the exported JSON
path_to_json_file = os.path.abspath(os.path.join('..', 'data', 'json'))

# Root elements of all parsed XML files are collected here
all_parsed_files = list()


In [6]:
# Create folders

# Errors raised while creating folders; reported in the summary message
create_folders_errors = []

def create_folders(paths=()):
    '''
    Create every directory in `paths`, including any missing parent directories.

    A directory that already exists is left untouched and is not treated as an
    error, so the cell can be re-run safely. Empty entries are skipped. Any
    OSError raised while creating a directory is appended to the module-level
    create_folders_errors list and the loop moves on to the next path.

    Parameters
    ----------
    paths : iterable of str
        Directory paths to create. Defaults to an empty tuple.
    '''
    for p in paths:
        if not p:
            continue
        # Remember whether the folder was there before, so "created" is only
        # printed for folders this call actually made
        already_there = os.path.isdir(p)
        try:
            os.makedirs(p, exist_ok=True)
        except OSError as e:
            create_folders_errors.append(e)
            continue
        if not already_there:
            print('{} created'.format(p))

    print('\nSetup folders created with exceptions: {}'.format(create_folders_errors))
    
# Every directory the notebook works with, created in one call
all_folders = [
    path_to_all_xml_trials,
    path_to_test_folder,
    path_to_json_file,
]
create_folders(all_folders)

### Create test folder with 50 random files

In [7]:
# Select up to 50 random XML files from all trials and copy them to the test folder.

# Sample size, and the seed that makes the selection repeatable between runs
N_TEST_FILES = 50
RANDOM_SEED = 42

# Pending: If files in test folder, delete them

# Only .xml entries are candidates. Sorting removes the dependence on the
# arbitrary order in which os.listdir returns names, so the seeded sample is stable.
xml_candidates = sorted(
    f for f in os.listdir(path_to_all_xml_trials) if f.endswith('.xml')
)

# random.sample raises ValueError when asked for more items than exist,
# so the sample size is capped at the number of available files
random_files = random.Random(RANDOM_SEED).sample(
    xml_candidates, min(N_TEST_FILES, len(xml_candidates))
)
print('Random files: {}'.format(len(random_files)))

for f in random_files:
    src = os.path.join(path_to_all_xml_trials, f)
    shutil.copy(src, path_to_test_folder)


## Parse all xml files

In [8]:
def parse_xml_files(path_to_folder):
    '''
    Parse every .xml file in a folder and append each root element to the
    module-level all_parsed_files list. Slow on large folders.

    Files are visited in sorted name order. A file that is not well-formed XML
    is reported and skipped, so one bad file does not abort the whole run.

    Parameters
    ----------
    path_to_folder : str
        Folder whose .xml files are parsed; other entries are ignored.

    Returns
    -------
    list
        all_parsed_files, the same list object the roots were appended to.
    '''
    for filename in sorted(os.listdir(path_to_folder)):
        if not filename.endswith('.xml'):
            continue
        fullname = os.path.join(path_to_folder, filename)
        try:
            root = ET.parse(fullname).getroot()
        except ET.ParseError as e:
            print("{} skipped, not well-formed XML: {}\n".format(fullname, e))
            continue
        all_parsed_files.append(root)

        # Checking message to the terminal
        print("{} file parsed\n".format(fullname))

    # The count covers everything in the shared list, including earlier calls
    print('Number of files parsed: {}'.format(len(all_parsed_files)))

    return all_parsed_files

In [11]:
# Empty the shared list in place before parsing, then confirm it holds nothing
del all_parsed_files[:]
print(len(all_parsed_files))

In [12]:
# Run function
%time parse_xml_files(path_to_test_folder)


[<Element 'clinical_study' at 0x10d90d0e8>,
 <Element 'clinical_study' at 0x12145f868>,
 <Element 'clinical_study' at 0x12144f9a8>,
 <Element 'clinical_study' at 0x12143a0e8>,
 <Element 'clinical_study' at 0x121431ea8>,
 <Element 'clinical_study' at 0x121437a48>,
 <Element 'clinical_study' at 0x1212a53b8>,
 <Element 'clinical_study' at 0x1212baea8>,
 <Element 'clinical_study' at 0x1212c03b8>,
 <Element 'clinical_study' at 0x1212c79f8>,
 <Element 'clinical_study' at 0x1203dfdb8>,
 <Element 'clinical_study' at 0x1211f7db8>,
 <Element 'clinical_study' at 0x1211ece08>,
 <Element 'clinical_study' at 0x1211eeb88>,
 <Element 'clinical_study' at 0x1212154a8>,
 <Element 'clinical_study' at 0x11f175a48>,
 <Element 'clinical_study' at 0x11f143e58>,
 <Element 'clinical_study' at 0x11f1536d8>,
 <Element 'clinical_study' at 0x1213bb9a8>,
 <Element 'clinical_study' at 0x1213ac548>,
 <Element 'clinical_study' at 0x1213aa048>,
 <Element 'clinical_study' at 0x11f297638>,
 <Element 'clinical_study' at 0x

## Extract tags from parsed trials

In [13]:
# Container for all extracted data, keyed by tag name
all_data_dictionary = dict()

# Base name (without extension) of the final JSON file
json_file = 'all_trials_json'


### Parse tags in files

In [24]:
def create_dictionary_from_tag(tag, parsed_files=None, name_dictionary=None):
    '''
    Find values by tag in the parsed XML files and save them in a dictionary.

    Every element named `tag` in every parsed file contributes one value, which
    is appended to the list stored under name_dictionary[tag] (the list is
    created on first use). An element with no text, or only whitespace, is
    stored as the string 'nan'.

    Note: a file without a `tag` element contributes nothing and a file with
    several contributes several, so the lists for different tags can differ
    in length.

    Parameters
    ----------
    tag : str
        Name of the XML element to collect.
    parsed_files : iterable of xml.etree.ElementTree.Element, optional
        Root elements to search. Defaults to the module-level all_parsed_files,
        looked up when the function is called rather than when it is defined.
    name_dictionary : dict, optional
        Dictionary to fill. Defaults to the module-level all_data_dictionary,
        looked up when the function is called.
    '''
    # Resolve the defaults at call time so rebinding the globals is picked up
    if parsed_files is None:
        parsed_files = all_parsed_files
    if name_dictionary is None:
        name_dictionary = all_data_dictionary

    # Iterate through all xml parsed files and tags, and collect the values
    values = []
    for n in parsed_files:
        for i in n.iter(tag):
            text = i.text
            # Element.text is None for an empty element; it is never equal to 0
            value = text if text is not None and text.strip() else 'nan'
            values.append(value)
            # Checking message to the terminal
            print("{} file parsed\n".format(value))

    # Create the entry for this tag if needed, then append the values
    bucket = name_dictionary.setdefault(tag, [])
    for value in values:
        bucket.append(value)
        print("{} file added to the dictionary\n".format(value))
        

In [25]:
# Execute function only with unique values
%time create_dictionary_from_tag('nct_id')
%time create_dictionary_from_tag('study_first_submitted')
%time create_dictionary_from_tag('source')
%time create_dictionary_from_tag('brief_title')

In [16]:
# Store the first condition of every parsed trial in the dictionary
key = 'condition_test'

# findtext returns None when a trial has no <condition> element and '' when the
# element is empty; both are stored as 'nan' instead of raising AttributeError.
# Assigning a freshly built list (rather than appending to the existing one)
# keeps the values from doubling when this cell is run again.
all_data_dictionary[key] = [
    (n.findtext('condition') or 'nan') for n in all_parsed_files
]
    

In [18]:
# Show which keys the dictionary holds so far
print(all_data_dictionary.keys())

## Dump results in a json file

In [None]:
# Dump dictionary into a JSON file
# os.path.join supplies the separator between folder and file name; plain
# concatenation dropped it because os.path.abspath strips the trailing slash.
json_path = os.path.join(path_to_json_file, json_file + '.json')

with open(json_path, 'w') as fp:
    json.dump(all_data_dictionary, fp)
print('JSON file created\n')

# Measure the size only after the file is closed, so all buffered data is on disk
json_size = round(os.path.getsize(json_path) / 1000000, 2)
print("JSON file: {} Mb".format(json_size))

## Import json file in a dataframe

In [19]:
'''
Check number of values for each key before creating a dataframe
'''
%time 
for key, value in all_data_dictionary.items():
    print(key, len(list(filter(bool, value))))


In [21]:
# Import json into a dataframe
# The JSON file is written to the json folder, not the test folder, and
# os.path.join adds the path separator that string concatenation left out.
df = pd.read_json(os.path.join(path_to_json_file, json_file + '.json'))


ValueError: Expected object or value

In [None]:
# Column labels first, then summary statistics, one per line
print(df.columns, df.describe(), sep='\n')

In [None]:
# Preview the first rows instead of rendering the whole dataframe
df.head()