## Parse a selection of tags from XML files and export them as JSON
- Use the random function to define the number of trials to test
- Check the folder variables for all data

In [None]:
import json
import os
import xml.etree.ElementTree as ET
import shutil
import sys
import pandas as pd

In [None]:
"""
Define the data paths and the shared list of parsed files.
"""

# Relative locations, in order: folder with all xml trials, folder with the
# random sample of xml files for testing and debugging, json folder
# [json_test for debugging]
(
    path_to_all_xml_trials,
    path_to_test_folder,
    path_to_json_file,
) = (
    os.path.abspath(relative_path)
    for relative_path in (
        '../data/all_trials/',
        '../data/test/',
        '../data/json_test/',
    )
)

# Variable for all parsed files
all_parsed_files = list()

# Substitute the first entry with the main xml folder
all_folders = [path_to_all_xml_trials, path_to_json_file, path_to_test_folder]

print('\nPaths and folders created\n')

In [None]:
# Function for creating folders

# Errors raised while creating folders, accumulated across every call
create_folders_errors = []


def create_folders(paths=None):
    '''
    Create each folder in paths, recording failures instead of raising.

    paths: iterable of folder paths to create. None (the default) means
        there is nothing to create.

    Missing parent folders are created as well. A folder that cannot be
    created (for example because it already exists) does not stop the loop:
    its OSError is appended to the module-level create_folders_errors list.
    The summary is printed once, after every path has been tried.
    '''
    for p in paths or ():
        try:
            os.makedirs(p)
        except OSError as e:
            create_folders_errors.append(e)
        else:
            print('{} created'.format(p))

    print('\nFolders created. Except: {}\n'.format(create_folders_errors))


In [None]:
# Create every folder listed in all_folders
create_folders(all_folders)

### Define a random sample of XML files for debugging and testing

In [None]:
'''
Select n random files from all trials and save them in test folder
'''
import random

files_for_test = 1000

# Pending: If files in test folder, delete them
available_files = os.listdir(path_to_all_xml_trials)

# random.sample raises ValueError when asked for more items than exist,
# so never request more files than the folder holds
random_files = random.sample(available_files, min(files_for_test, len(available_files)))
print('Random files: {}'.format(len(random_files)))

# Copy every sampled file into the test folder
for f in random_files:
    src = os.path.join(path_to_all_xml_trials, f)
    shutil.copy(src, path_to_test_folder)

In [None]:
# Peek at the first five sampled file names
print(random_files[:5])

In [None]:
def parse_xml_files(path_to_folder):
    '''
    Parse the xml files in path_to_folder and save them in all_parsed_files.
    Slow: prints one message per parsed file.

    Entries whose name does not end in ".xml" (any letter case) are skipped,
    so stray files in the folder are not handed to the XML parser.

    path_to_folder: folder whose entries are listed with os.listdir.

    Returns the module-level all_parsed_files list, which keeps growing
    across calls.
    '''
    for filename in os.listdir(path_to_folder):
        # Check for xml files
        if not filename.lower().endswith('.xml'):
            continue

        fullname = os.path.join(path_to_folder, filename)
        all_parsed_files.append(ET.parse(fullname).getroot())

        # Checking message to the terminal
        print("{} file parsed\n".format(fullname))

    return all_parsed_files

In [None]:
##
#substitute argument with main xml folder with all trials
##
%%capture
parse_xml_files(path_to_test_folder)


In [None]:
"""
Values found by unique tags in the XML files are saved in a dictionary,
which is exported to a json file.
"""

all_data_dictionary = dict()

# Both banner messages go out through one print call, separated by the same
# newline that two consecutive prints would produce
print('\n-------------------------\n', '\nCreated all_data_dictionary\n', sep='\n')

In [None]:
# Function to create dictionary from tags
def create_dictionary_from_tag(tag, parsed_files=all_parsed_files, name_dictionary=all_data_dictionary):
    '''
    Collect the text of every `tag` element and append it to name_dictionary[tag].

    tag: tag name looked up with Element.iter in each parsed file.
    parsed_files: parsed XML roots to search; defaults to the module-level
        all_parsed_files list (bound when the function is defined).
    name_dictionary: dictionary updated in place; defaults to the
        module-level all_data_dictionary (bound when the function is defined).

    Elements without text are stored as 'nan'. One message is printed per
    matching element. Nothing is returned.
    '''
    # Variable to store values
    values = []

    # Iterate through all xml parsed files and tags, and append data to values
    for n in parsed_files:
        for i in n.iter(tag):
            # Element.text is a string or None (never 0), so test for None
            # to actually reach the 'nan' fallback
            values.append(i.text if i.text is not None else 'nan')
            print("{} file parsed\n".format(i.text))

    # Create the key on first use, then append this call's values
    name_dictionary.setdefault(tag, []).extend(values)

In [None]:
%%capture
#First batch
create_dictionary_from_tag('nct_id')
create_dictionary_from_tag('study_first_submitted')
create_dictionary_from_tag('source')
create_dictionary_from_tag('brief_title')
create_dictionary_from_tag('overall_status')
create_dictionary_from_tag('verification_date')
create_dictionary_from_tag('study_type')
create_dictionary_from_tag('study_first_posted')
create_dictionary_from_tag('last_update_submitted')
create_dictionary_from_tag('last_update_posted')
create_dictionary_from_tag('phase')

In [None]:
def check_values_key():
    '''
    Print, for every key of all_data_dictionary, how many of its values are truthy.
    '''
    print('\nChecking values by key\n')

    # Count the non-empty values stored under each key
    for tag_name in all_data_dictionary:
        filled = sum(1 for entry in all_data_dictionary[tag_name] if entry)
        print(tag_name, filled)

In [None]:
# The number of values per key needs to match for the JSON file
check_values_key()

In [None]:
# Function for adding nested tags

def add_new_tags(new_key):
    '''
    Append one entry per parsed file to all_data_dictionary[new_key].

    new_key: tag or path handed to Element.find on each parsed file.

    The text of the first match is stored, or the string 'None' when the
    file has no match. A confirmation message is printed at the end.
    '''
    column = all_data_dictionary.setdefault(new_key, [])

    for root in all_parsed_files:
        match = root.find(new_key)
        column.append('None' if match is None else match.text)

    print('{} tag added'.format(new_key))

In [None]:
# Nested tags added with add_new_tags, in this order
nested_tags = (
    'condition',
    'condition_browse/mesh_term',
    'intervention_browse/mesh_term',
    'detailed_description/textblock',
    'brief_summary/textblock',
    'location/facility/address/city',
    'location/facility/address/country',
    'location/facility/address/zip',
    'sponsors/lead_sponsor/agency',
    'sponsors/lead_sponsor/agency_class',
    'study_design_info/allocation',
    'study_design_info/intervention_model',
    'study_design_info/primary_purpose',
    # 'reference/citation',
)
for nested_tag in nested_tags:
    add_new_tags(nested_tag)

In [None]:
# Print the number of non-empty values stored under each key
check_values_key()

### Put all data from the dictionary into a DataFrame, filling missing values

In [None]:
# One column per key; each list of values is wrapped in a Series first
df = pd.DataFrame({key: pd.Series(values) for key, values in all_data_dictionary.items()})

In [None]:
# Show 10 randomly chosen rows of the DataFrame
df.sample(10)

In [None]:
# Make sure the output folder exists before writing: the folders created
# earlier in the notebook do not include ../data/json
os.makedirs('../data/json', exist_ok=True)
df.to_json('../data/json/sample_json.json')

In [None]:
# Load the json file into a new DataFrame (df_json)
df_json = pd.read_json('../data/json/sample_json.json')

In [None]:
# Dimensions (rows, columns) of the DataFrame loaded from json
df_json.shape

In [None]:
# First rows of the DataFrame loaded from json
df_json.head()

In [None]:
# Total number of elements (rows x columns) in the DataFrame built from the dictionary
df.size