# Clinical trials: setup & data importing


In [1]:
#Data: clinicaltrials.gov
#Source: wget https://clinicaltrials.gov/AllPublicXML.zip
#Date: September 14, 2018 [9pm]

In [2]:
# reset variables if needed
# %reset

In [3]:
import zipfile
import sys
import time
import json
import re
import os
import xml.etree.ElementTree as ET
import shutil
import pandas as pd
from collections import Counter 

In [4]:
# Print messages to the terminal

sys.stdout = open('/dev/stdout', 'w')


## Parse one random trial

In [5]:
# Save all paths to xml files in a new variable
# Not dependent of unzip script

xml_trials = '../data/all_trials/'

data_xml = []

for xml in os.listdir(xml_trials):
    data_xml.append(xml)

print('Number of files in data_trials_xml: {}'.format(len(data_xml)))
print('First files: {}'.format(data_xml[0:5]))

In [6]:
'''
START HERE FOR PARSING A NEW RANDOME FILE
'''

'\nSTART HERE FOR PARSING A NEW RANDOME FILE\n'

In [7]:
#parse random xml file
import random
n = random.randint(0, 1000)

trial = xml_trials + data_xml[n] #full path
tree = ET.parse(trial).getroot()
print("Random: {}\n".format(n))
print('File: {}\n'.format(data_xml[n]))

In [8]:
print('\n----------- FILE ----------\n')
print(ET.tostring(tree).decode())

In [None]:

'''GO TO METHOD 4 FOR PARSING THE FILE'''


## Method 1: Store keys & values from xml file in list

In [11]:
#Create Keys & Values with all info from the parsed trial
keys_trial = []
value_trial = []

for nct in tree.iter('nct_id'):
    id_trial = nct.text
    
for tag in tree:
    for i in tag:
        keys_trial.append(tag.tag)
        keys_trial.append(i.tag)
        value_trial.append(tag.text)
        value_trial.append(i.text)
        
print("Random: {}".format(n))
print("Trial id: {}".format(id_trial))
print("Number of keys: {}".format(len(keys_trial)))

In [12]:
# Check id, keys, values
print("Trial id: {}\n".format(id_trial))
for i, n in zip(keys_trial, value_trial):
    print('Key: {}, Value: {}'.format(i, n))

### Zip lists into a dictionary and save it as a json file
- Use only for files with no duplicated keys

In [13]:
# Save data in a dictionary

trial_dict = dict(zip(keys_trial, value_trial))


# Dump dictionary into a json file

with open('{}{}.json'.format(json_folder, id_trial), 'w') as fp:
    json.dump(trial_dict, fp)
    print('JSON file created\n')

 
print("Keys in list: {}".format(len(keys_trial)))
print('Keys in dictionary: {}\n'.format(len(trial_dict.keys())))
# print('Dictionary:\n{}'.format(trial_dict))

NameError: name 'json_folder' is not defined

## Method 2: keep all duplicated keys

In [None]:
counts = Counter(keys_trial)
for value, count in counts.most_common():
    if count > 1:
        print('{},{}'.format(value, count))

In [None]:
for s,num in counts.items():
    if num > 1: 
        for suffix in range(1, num + 1): # suffix starts at 1 and increases by 1 each time
            keys_trial[keys_trial.index(s)] = s + str(suffix) # replace each appearance of s

In [None]:
# Check all keys are preserved
print("Number of keys in xml file: {}".format(len(keys_trial)))
print('Number of keys after changing names: {}'.format(len(keys_trial)))

In [None]:
# Create dictionary
full_trial_dict = dict(zip(keys_trial, value_trial))

print("Keys in list: {}".format(len(keys_trial)))
print('Keys in dictionary: {}\n'.format(len(full_trial_dict.keys())))
print('ID: {}\n'.format(full_trial_dict['nct_id']))
# print('Dictionary:\n{}'.format(full_trial_dict))

In [None]:
# create dataframe from dictionary
df_dict = pd.DataFrame.from_dict(full_trial_dict, orient='index', columns=[id_trial])

pd.set_option('display.max_columns', 100)
# display(df_dict)

In [None]:
# Dump dataframe into a json file

json_folder = 'data/json/'
df_dict.to_json('{}{}.json'.format(json_folder, id_trial))
print('JSON file created\n')

In [None]:
# Check: Import json file into a dataframe
file_location = json_folder+id_trial
df_trial = pd.read_json(file_location+'.json')

print('File location: {}'.format(file_location))
print('Trial id: {}'.format(id_trial))
print('Dataframe shape: {}'.format(df_trial.shape))
print("Keys in list: {}".format(len(keys_trial)))

In [None]:
pd.set_option('display.max_rows', 200)
display(df_trial)

In [None]:
# check basic data
# Slice it
print(df_trial[id_trial]['affiliation'])
print(df_trial[id_trial]['country'])

## Method 3: Parse only selected fields and save them in a dictionary

In [None]:
# Check XML fields are available
tree[0:]

In [None]:
final_dictionary = {}

def parsing_specific_tags(dictionary, tag):
    k = []
    v = []
    
    for i in tree.iter(tag):
        k.append(i.tag)
        v.append(i.text)
    print('Key: {}, Value: {}'.format(k, v))
    
    for n, h in zip(k, v):
        dictionary[n] = h
    
    return dictionary

parsing_specific_tags(final_dictionary, 'nct_id')
parsing_specific_tags(final_dictionary, 'country')

## Method 4: iterate over several tags and save info in a dictionary

In [9]:
def parsing_several_tags(dictionary, tag, nested, condition):
    k = []
    v = []
    
    # append first condition
    k.append(condition)
    value_condition = tree.find(condition)
    v.append(value_condition.text)
    
    # append all condition tags
#     for c in tree.findall(condition):
#         print('Conditions: {} : {}'.format(condition, c.text))
    
    # append list of tags
    for t in tag:
        for i in tree.iter(t):
            k.append(i.tag)
            v.append(i.text)
     
    # append list of nested tags
    for n in nested:
        for child in tree.findall(n + '/textblock'):
            k.append(n)
            v.append(child.text)
            
    # zip data into a dictionary
    dictionary = dict(zip(k, v))
    
    return dictionary


# Variables for the function
tags_dictionary = {}

condition = 'condition'

list_of_tags = ['nct_id', 
                'download_date',
                'url', 
                'brief_title',
                'acronym',
                'official_title',
                'agency',
                'source',
               'overall_status',
                'start_date',
                'phase',
                'study_type',
                'allocation',
                'study_first_submitted',
                'verification_date',
                'mesh_term'
               ]

nested_tags = ['brief_summary',
              'detailed_description']


parsing_several_tags(tags_dictionary, list_of_tags, nested_tags, condition)

{'condition': 'HIV Infections',
 'nct_id': 'NCT00575315',
 'download_date': 'ClinicalTrials.gov processed this data on May 17, 2019',
 'url': 'https://clinicaltrials.gov/show/NCT00575315',
 'brief_title': 'HIV-HCV Coinfection: Impact of Immune Dysfunction',
 'official_title': 'HIV-HCV Coinfection: Impact of Immune Dysfunction',
 'agency': 'Virginia Commonwealth University',
 'source': 'Virginia Commonwealth University',
 'overall_status': 'Completed',
 'start_date': 'July 2004',
 'study_type': 'Observational',
 'study_first_submitted': 'December 14, 2007',
 'verification_date': 'August 2014',
 'mesh_term': 'Immune System Diseases',
 'brief_summary': '\n      Effective therapy for human immunodeficiency virus (HIV) infection has markedly prolonged\n      survival in infected individuals. As a result, other diseases are now becoming clinically\n      significant. Approximately 30% of HIV infected patients are co-infected with hepatitis C\n      virus (HCV) which is now the leading co-mor