# Clinical trials: Parse all XML files


In [1]:
#Data: clinicaltrials.gov

In [67]:
# reset variables and turn off autosave
%reset
%autosave 0

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [2]:
import zipfile
import sys
import time
import json
import re
import os
import xml.etree.ElementTree as ET
import shutil
import pandas as pd
from collections import Counter 

In [3]:
# Print messages to the terminal

sys.stdout = open('/dev/stdout', 'w')


## Setup: create paths and folders

In [4]:
'''
Create paths and folders
'''

# Folders with xml files
path_to_all_xml_trials = os.path.abspath('../data/all_trials/')

# Test folder ~100 files
path_to_test_folder = os.path.abspath('../data/test/')

#Remove old files from test folder
try:
    shutil.rmtree(path_to_test_folder) 
except FileNotFoundError:
    pass

# Path for json folder
path_to_json_file = os.path.abspath('../data/json/')

# Variable for all parsed files
all_parsed_files = []

print('\nPaths and folders created\n')

In [5]:
# Create folders

create_folders_errors = []

def create_folders(paths=[]):
    for p in paths:
        try:
            os.mkdir(p)
            if p:
                print('{} created'.format(p))
        except IOError as e:
            create_folders_errors.append(e)
            pass

    print('\nSetup folders created with exceptions: {}\n'.format(create_folders_errors))
    
all_folders = [path_to_all_xml_trials, path_to_test_folder, path_to_json_file]
    
# create folders
create_folders(all_folders)

### Create test folder with n random files

In [6]:
'''
Select n random files from all trials and save them in test folder
'''
files_for_test = 10000

import random

# Pending: If files in test folder, delete them
random_files = []
random_files = random.sample(os.listdir(path_to_all_xml_trials), files_for_test)
print('Random files: {}'.format(len(random_files)))

for f in random_files:
    src = os.path.join(path_to_all_xml_trials, f)
    shutil.copy(src, path_to_test_folder)


In [7]:
print(random_files[0:5])

## Parse all xml files

In [8]:
'''
Parse all xml files and save them in a all_parsed_files variable.
Slow. Checks for xml files
'''

def parse_xml_files(path_to_folder):
    for filename in os.listdir(path_to_folder):
#         if not filename.endswith('.xml'): continue
        fullname = os.path.join(path_to_folder, filename)
        all_parsed_files.append(ET.parse(fullname).getroot())
        
        # Checking message to the terminal
        print("{} file parsed\n".format(fullname))

    print('Number of files parsed: {}\n'.format(len(all_parsed_files)))
        
    return all_parsed_files

In [9]:
# Run function
all_parsed_files.clear()
%time parse_xml_files(path_to_test_folder)


[<Element 'clinical_study' at 0x120444ef8>,
 <Element 'clinical_study' at 0x120686688>,
 <Element 'clinical_study' at 0x1209197c8>,
 <Element 'clinical_study' at 0x1206d6188>,
 <Element 'clinical_study' at 0x1208559a8>,
 <Element 'clinical_study' at 0x1205a35e8>,
 <Element 'clinical_study' at 0x1209644f8>,
 <Element 'clinical_study' at 0x120d2f7c8>,
 <Element 'clinical_study' at 0x1214f0db8>,
 <Element 'clinical_study' at 0x120560138>,
 <Element 'clinical_study' at 0x120864e58>,
 <Element 'clinical_study' at 0x120b4bae8>,
 <Element 'clinical_study' at 0x120be2188>,
 <Element 'clinical_study' at 0x120c08638>,
 <Element 'clinical_study' at 0x120e2d318>,
 <Element 'clinical_study' at 0x1210a7138>,
 <Element 'clinical_study' at 0x120f4f138>,
 <Element 'clinical_study' at 0x120f63bd8>,
 <Element 'clinical_study' at 0x121159c28>,
 <Element 'clinical_study' at 0x121174bd8>,
 <Element 'clinical_study' at 0x120da3ea8>,
 <Element 'clinical_study' at 0x1204b6638>,
 <Element 'clinical_study' at 0x

In [10]:
print(len(all_parsed_files))

## Extract tags from parsed trials

In [11]:
# Dictionary to hold all data
all_data_dictionary = {}

# Final json file
json_file = '/all_trials_json' #name json file  

print('\nCreated dictionary and path to json file\n')


### Parse tags in files

In [12]:
'''
Find values by unique tags in XML files 
Save them in a dictionary 
Export them to a json file
'''

def create_dictionary_from_tag(tag, parsed_files = all_parsed_files, name_dictionary = all_data_dictionary):
    
    # Variable to store values
    keys = []
    values = []
    
    # Iterate through all xml parsed files and tags, and append data to values
    for n in parsed_files:
        for i in n.iter(tag):
            if i.text != 0:
                values.append(i.text)
            else:
                values.append('nan') #get values
                # Checking message to the terminal
            print("{} file parsed\n".format(i.text))
    
    # Create dictionary and set tags as keys
    name_dictionary.setdefault(tag, [])
    
    # Append values to dictionary
    for i in values:
        name_dictionary[tag].append(i)
        print("{} file added to the dictionary\n".format(i))
        
        
# Execute function only with unique values
%time create_dictionary_from_tag('nct_id')
%time create_dictionary_from_tag('study_first_submitted')
%time create_dictionary_from_tag('source')
%time create_dictionary_from_tag('brief_title')

print('Dictionary created')


In [13]:
'''
Check number of values for each key before creating a dataframe
'''

for key, value in all_data_dictionary.items():
    print(key, len(list(filter(bool, value))))

In [14]:
# Function for adding new tag

def add_new_tags(new_key):
    all_data_dictionary.setdefault(new_key, [])

    for n in all_parsed_files:
            value_conditions = n.find(new_key)
            if n.find(new_key) is not None:
                all_data_dictionary[new_key].append(value_conditions.text)
            else:
                all_data_dictionary[new_key].append('None')

    print('{} tag added'.format(new_key))


In [15]:
add_new_tags('condition')
add_new_tags('detailed_description/textblock')
add_new_tags('brief_summary/textblock')

In [16]:
'''
Check keys and number of values for each one
'''

print(all_data_dictionary.keys())

for key, value in all_data_dictionary.items():
    print(key, len(list(filter(bool, value))))


## Dump results in a json file

In [17]:
# Dump dictionary into a JSON file
with open('{}{}.json'.format(path_to_json_file, json_file), 'w') as fp:
    json.dump(all_data_dictionary, fp)
    print('\nJSON file created\n')

    json_size = round(os.path.getsize(path_to_json_file + json_file + '.json') / 1000000, 2)
    print("JSON file: {} Mb".format(json_size))

## Import json file in a dataframe

In [102]:
# Import json into a dataframe
file = '{}{}.json'.format(path_to_json_file, json_file)

df = pd.read_json(file)


In [103]:
df.head()

Unnamed: 0,nct_id,study_first_submitted,source,brief_title,condition,detailed_description/textblock,brief_summary/textblock
0,NCT02776787,"May 5, 2016",University of Washington,DEBUT: Diverticulitis Evaluation of Patient Bu...,Diverticulitis,\n This study will examine both clinical ...,\n Half of all Americans over 60 years of...
1,NCT00797472,"November 24, 2008",American Scitech International,Study Comparing R-mabHD and a Combination of A...,Hodgkin's Disease,\n The primary objective is to compare th...,\n 120 subjects with Hodgkin's disease wi...
2,NCT03499821,"April 9, 2018",Staar Surgical Company,Clinical Performance of a Phakic Intraocular L...,Presbyopia,"\n This is a prospective, open-labeled, m...",\n This study will evaluate the clinical ...
3,NCT01224028,"October 18, 2010",Astellas Pharma Inc,A Study to Evaluate the Efficacy and Safety of...,IgA Nephropathy,,\n This study is to evaluate efficacy and...
4,NCT02279680,"June 23, 2014","University Hospital, Caen",Associative Memory in Adults With an Autism Sp...,Autistic Disorder,,\n The objective of this multidisciplinar...


In [104]:
# rename columns & delete \n values

In [105]:
df['detailed_description/textblock'][0]

"\n      This study will examine both clinical and non-clinical reasons for pursuing surgery, taking\n      into account previously unexplored areas in surgical decision-making including the\n      comprehensive impact of this disease, as well as factors that impact a surgeon's willingness\n      to recommend surgery. Ultimately, this patient-centered research approach will help us\n      understand the impact of treatment decisions on a patient's quality of life, work, and\n      clinical symptoms, and will ultimately help to inform our recommendations for the treatment\n      of patients with diverticulitis.\n\n      The DEBUT study was developed from questions raised by our Washington State Surgical\n      Collaborative, (SCOAP Colon and Rectal Surgery Work Group) and will be carried out in clinics\n      and hospitals across Washington State, Oregon, and California. The investigators invite\n      patients to participate in research activities designed to identify the impact of\n  

In [106]:
# Create a new column: study_first_submitted as dates

df['dates'] = pd.to_datetime(df['study_first_submitted'])

In [107]:
df.head()

Unnamed: 0,nct_id,study_first_submitted,source,brief_title,condition,detailed_description/textblock,brief_summary/textblock,dates
0,NCT02776787,"May 5, 2016",University of Washington,DEBUT: Diverticulitis Evaluation of Patient Bu...,Diverticulitis,\n This study will examine both clinical ...,\n Half of all Americans over 60 years of...,2016-05-05
1,NCT00797472,"November 24, 2008",American Scitech International,Study Comparing R-mabHD and a Combination of A...,Hodgkin's Disease,\n The primary objective is to compare th...,\n 120 subjects with Hodgkin's disease wi...,2008-11-24
2,NCT03499821,"April 9, 2018",Staar Surgical Company,Clinical Performance of a Phakic Intraocular L...,Presbyopia,"\n This is a prospective, open-labeled, m...",\n This study will evaluate the clinical ...,2018-04-09
3,NCT01224028,"October 18, 2010",Astellas Pharma Inc,A Study to Evaluate the Efficacy and Safety of...,IgA Nephropathy,,\n This study is to evaluate efficacy and...,2010-10-18
4,NCT02279680,"June 23, 2014","University Hospital, Caen",Associative Memory in Adults With an Autism Sp...,Autistic Disorder,,\n The objective of this multidisciplinar...,2014-06-23
