# Clinical trials: Parse all XML files


In [1]:
#Data: clinicaltrials.gov

In [2]:
# reset variables and turn off autosave
%reset
%autosave 0

Once deleted, variables cannot be recovered. Proceed (y/[n])? 
Nothing done.


Autosave disabled


In [3]:
import zipfile
import sys
import time
import json
import re
import os
import xml.etree.ElementTree as ET
import shutil
import pandas as pd
from collections import Counter 

In [4]:
# Print messages to the terminal

# sys.stdout = open('/dev/stdout', 'w')


## Setup: create paths and folders

In [5]:
'''
Create paths and folders
'''

# Folders with xml files
path_to_all_xml_trials = os.path.abspath('../data/all_trials/')

# Test folder ~100 files
path_to_test_folder = os.path.abspath('../data/test/')

#Remove old files from test folder
try:
    shutil.rmtree(path_to_test_folder) 
except FileNotFoundError:
    pass

# Path for json folder
path_to_json_file = os.path.abspath('../data/json/')

# Variable for all parsed files
all_parsed_files = []

print('\nPaths and folders created\n')


Paths and folders created



In [6]:
# Create folders

create_folders_errors = []

def create_folders(paths=[]):
    for p in paths:
        try:
            os.mkdir(p)
            if p:
                print('{} created'.format(p))
        except IOError as e:
            create_folders_errors.append(e)
            pass

    print('\nSetup folders created with exceptions: {}\n'.format(create_folders_errors))
    
all_folders = [path_to_all_xml_trials, path_to_test_folder, path_to_json_file]
    
# create folders
create_folders(all_folders)

/Users/cmserna/Sites/clinical trials/mvp/data/test created

Setup folders created with exceptions: [FileExistsError(17, 'File exists'), FileExistsError(17, 'File exists')]



### Create test folder with n random files

In [7]:
'''
Select n random files from all trials and save them in test folder
'''
files_for_test = 50

import random

# Pending: If files in test folder, delete them
random_files = []
random_files = random.sample(os.listdir(path_to_all_xml_trials), files_for_test)
print('Random files: {}'.format(len(random_files)))

for f in random_files:
    src = os.path.join(path_to_all_xml_trials, f)
    shutil.copy(src, path_to_test_folder)


Random files: 50


In [8]:
print(random_files[0:5])

['NCT03717584.xml', 'NCT02796625.xml', 'NCT03531125.xml', 'NCT02405351.xml', 'NCT00038025.xml']


## Parse xml files

In [9]:
'''
Parse all xml files and save them in a all_parsed_files variable.
Slow. Checks for xml files
'''

def parse_xml_files(path_to_folder):
    for filename in os.listdir(path_to_folder):
#         if not filename.endswith('.xml'): continue
        fullname = os.path.join(path_to_folder, filename)
        all_parsed_files.append(ET.parse(fullname).getroot())
        
        # Checking message to the terminal
        print("{} file parsed\n".format(fullname))

    print('Number of files parsed: {}\n'.format(len(all_parsed_files)))
        
    return all_parsed_files

In [10]:
# Run function
all_parsed_files.clear()
%time parse_xml_files(path_to_test_folder)


/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02750033.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01587092.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT01813695.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT03647059.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00038025.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00976976.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02946996.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00934011.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT03377725.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT03559907.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT02178631.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp/data/test/NCT00677040.xml file parsed

/Users/cmserna/Sites/clinical trials/mvp

[<Element 'clinical_study' at 0x1167d36d8>,
 <Element 'clinical_study' at 0x1167e20e8>,
 <Element 'clinical_study' at 0x1167ec098>,
 <Element 'clinical_study' at 0x1167d3688>,
 <Element 'clinical_study' at 0x1175d49f8>,
 <Element 'clinical_study' at 0x1175dd048>,
 <Element 'clinical_study' at 0x1175e1f98>,
 <Element 'clinical_study' at 0x1175ee138>,
 <Element 'clinical_study' at 0x1175f9688>,
 <Element 'clinical_study' at 0x1177894a8>,
 <Element 'clinical_study' at 0x117790a48>,
 <Element 'clinical_study' at 0x11779a548>,
 <Element 'clinical_study' at 0x1177be8b8>,
 <Element 'clinical_study' at 0x1177c7ae8>,
 <Element 'clinical_study' at 0x10c698e58>,
 <Element 'clinical_study' at 0x10c6aa8b8>,
 <Element 'clinical_study' at 0x10c6b6688>,
 <Element 'clinical_study' at 0x10c6c1408>,
 <Element 'clinical_study' at 0x116d729a8>,
 <Element 'clinical_study' at 0x116d81868>,
 <Element 'clinical_study' at 0x116d97368>,
 <Element 'clinical_study' at 0x116da4b38>,
 <Element 'clinical_study' at 0x

In [11]:
print(len(all_parsed_files))

50


## Extract tags from parsed trials

In [12]:
# Dictionary to hold all data
all_data_dictionary = {}

# Final json file
json_file = '/all_trials_json' #name json file  

print('\nCreated dictionary and path to json file\n')



Created dictionary and path to json file



### Parse tags in files

In [13]:
'''
Find values by unique tags in XML files 
Save them in a dictionary 
Export them to a json file
'''

def create_dictionary_from_tag(tag, parsed_files = all_parsed_files, name_dictionary = all_data_dictionary):
    
    # Variable to store values
    keys = []
    values = []
    
    # Iterate through all xml parsed files and tags, and append data to values
    for n in parsed_files:
        for i in n.iter(tag):
            if i.text != 0:
                values.append(i.text)
            else:
                values.append('nan') #get values
                # Checking message to the terminal
            print("{} file parsed\n".format(i.text))
    
    # Create dictionary and set tags as keys
    name_dictionary.setdefault(tag, [])
    
    # Append values to dictionary
    for i in values:
        name_dictionary[tag].append(i)
        print("{} file added to the dictionary\n".format(i))
        
        
# Execute function only with unique values
%time create_dictionary_from_tag('nct_id')
%time create_dictionary_from_tag('study_first_submitted')
%time create_dictionary_from_tag('source')
%time create_dictionary_from_tag('brief_title')

print('Dictionary created')


NCT02750033 file parsed

NCT01587092 file parsed

NCT01813695 file parsed

NCT03647059 file parsed

NCT00038025 file parsed

NCT00976976 file parsed

NCT02946996 file parsed

NCT00934011 file parsed

NCT03377725 file parsed

NCT03559907 file parsed

NCT02178631 file parsed

NCT00677040 file parsed

NCT00781729 file parsed

NCT00954928 file parsed

NCT00740415 file parsed

NCT03623347 file parsed

NCT00023504 file parsed

NCT03745404 file parsed

NCT03418181 file parsed

NCT03651713 file parsed

NCT00002734 file parsed

NCT01916772 file parsed

NCT03717584 file parsed

NCT02796625 file parsed

NCT03531125 file parsed

NCT02667756 file parsed

NCT01265745 file parsed

NCT01289301 file parsed

NCT00253292 file parsed

NCT03614078 file parsed

NCT02862392 file parsed

NCT03494946 file parsed

NCT03608527 file parsed

NCT02405351 file parsed

NCT02506894 file parsed

NCT01267045 file parsed

NCT03348436 file parsed

NCT03364725 file parsed

NCT02430012 file parsed

NCT00919711 file parsed



In [14]:
'''
Check number of values for each key before creating a dataframe
'''

for key, value in all_data_dictionary.items():
    print(key, len(list(filter(bool, value))))

nct_id 50
study_first_submitted 50
source 50
brief_title 50


In [15]:
# Function for adding new tag

def add_new_tags(new_key):
    all_data_dictionary.setdefault(new_key, [])

    for n in all_parsed_files:
            value_conditions = n.find(new_key)
            if n.find(new_key) is not None:
                all_data_dictionary[new_key].append(value_conditions.text)
            else:
                all_data_dictionary[new_key].append('None')

    print('{} tag added'.format(new_key))


In [16]:
add_new_tags('condition')
add_new_tags('detailed_description/textblock')
add_new_tags('brief_summary/textblock')

condition tag added
detailed_description/textblock tag added
brief_summary/textblock tag added


In [17]:
'''
Check keys and number of values for each one
'''

print(all_data_dictionary.keys())

for key, value in all_data_dictionary.items():
    print(key, len(list(filter(bool, value))))


dict_keys(['nct_id', 'study_first_submitted', 'source', 'brief_title', 'condition', 'detailed_description/textblock', 'brief_summary/textblock'])
nct_id 50
study_first_submitted 50
source 50
brief_title 50
condition 50
detailed_description/textblock 50
brief_summary/textblock 50


## Dump results in a json file

In [18]:
# Dump dictionary into a JSON file
with open('{}{}.json'.format(path_to_json_file, json_file), 'w') as fp:
    json.dump(all_data_dictionary, fp)
    print('\nJSON file created\n')

    json_size = round(os.path.getsize(path_to_json_file + json_file + '.json') / 1000000, 2)
    print("JSON file: {} Mb".format(json_size))


JSON file created

JSON file: 0.08 Mb


## Test: Import json file in a dataframe

In [19]:
# Import json into a dataframe
file = '{}{}.json'.format(path_to_json_file, json_file)

df = pd.read_json(file)


In [20]:
df.head()

Unnamed: 0,nct_id,study_first_submitted,source,brief_title,condition,detailed_description/textblock,brief_summary/textblock
0,NCT02750033,"April 20, 2016",Seton Healthcare Family,Intraoperative Margin Assessment During Mohs S...,Squamous Cell Carcinoma (SCC),\n Acquire intraoperative MMS measurement...,\n The research team will develop an intr...
1,NCT01587092,"April 25, 2012",Pennington Biomedical Research Center,Workstation Pilot Study,Obesity,\n Eligible participants will be randomiz...,\n This is a pilot study to assess the fe...
2,NCT01813695,"March 11, 2013","Children's Hospital Medical Center, Cincinnati",Preemptive Genotyping and Pain Management,Pain,\n Purpose: To determine the feasibility ...,\n The purpose of this study is to see if...
3,NCT03647059,"August 9, 2018","Shanghai General Hospital, Shanghai Jiao Tong ...",Rapid Assessment of Donor Liver Quality,"Transplant; Failure, Liver",,\n Aims：\n\n 1. Conduct multi-cent...
4,NCT00038025,"May 24, 2002",M.D. Anderson Cancer Center,A Study Of Deoxycoformycin(DCF)/Pentostatin In...,Peripheral T-cell Lymphoma,\n Deoxycoformycin(DCF)/Pentostatin is a ...,\n The purpose of this study is to determ...


In [21]:
# rename columns & delete \n values

In [22]:
df['detailed_description/textblock'][0]

'\n      Acquire intraoperative MMS measurements in vivo. After assessing this approach on excised\n      tissues, MMS measurements will be aquired directly on the patient from the NMSC excision\n      site. MMS data will be acquired on patients being treated for NMSC at the Austin Dermatologic\n      Surgery Center, the surgical site for the dermatology practice of Seton/University of Texas\n      Physicians group. Similar to the measurements on freshly excised tissues, MMS data will be\n      acquired in a grid pattern on the excision site. The site will be blotted with gauze to\n      remove residual blood prior to the measurement, and continuously blotted as needed until all\n      measurements have been taken. The handheld probe of the MMS enables assessment of both the\n      wound periphery and deeper layers of tissue to determine if any tumor is remaining. For this\n      initial pilot study, we plan to take measurements on 10 patients (5 BCC, 5 SCC), along with\n      correspo

In [23]:
# Create a new column: study_first_submitted as dates

df['dates'] = pd.to_datetime(df['study_first_submitted'])

In [24]:
df.head()

Unnamed: 0,nct_id,study_first_submitted,source,brief_title,condition,detailed_description/textblock,brief_summary/textblock,dates
0,NCT02750033,"April 20, 2016",Seton Healthcare Family,Intraoperative Margin Assessment During Mohs S...,Squamous Cell Carcinoma (SCC),\n Acquire intraoperative MMS measurement...,\n The research team will develop an intr...,2016-04-20
1,NCT01587092,"April 25, 2012",Pennington Biomedical Research Center,Workstation Pilot Study,Obesity,\n Eligible participants will be randomiz...,\n This is a pilot study to assess the fe...,2012-04-25
2,NCT01813695,"March 11, 2013","Children's Hospital Medical Center, Cincinnati",Preemptive Genotyping and Pain Management,Pain,\n Purpose: To determine the feasibility ...,\n The purpose of this study is to see if...,2013-03-11
3,NCT03647059,"August 9, 2018","Shanghai General Hospital, Shanghai Jiao Tong ...",Rapid Assessment of Donor Liver Quality,"Transplant; Failure, Liver",,\n Aims：\n\n 1. Conduct multi-cent...,2018-08-09
4,NCT00038025,"May 24, 2002",M.D. Anderson Cancer Center,A Study Of Deoxycoformycin(DCF)/Pentostatin In...,Peripheral T-cell Lymphoma,\n Deoxycoformycin(DCF)/Pentostatin is a ...,\n The purpose of this study is to determ...,2002-05-24
