# Clinical trials: setup & data importing


In [1]:
#Data: clinicaltrials.gov
#Source: wget https://clinicaltrials.gov/AllPublicXML.zip
#Date: September 14, 2018 [9pm]

In [3]:
# reset variables if needed
# %reset

In [4]:
import zipfile
import time
import json
import re
import os
import xml.etree.ElementTree as ET
import shutil
import pandas as pd
from collections import Counter 

## Parse one random trial

In [5]:
# Save all paths to xml files in a new variable
# Not dependent of unzip script

xml_trials = 'data/all_trials/'

data_xml = []

for xml in os.listdir(xml_trials):
    data_xml.append(xml)

print('Number of files in data_trials_xml: {}'.format(len(data_xml)))
print('First files: {}'.format(data_xml[0:5]))

Number of files in data_trials_xml: 284374
First files: ['NCT00391586.xml', 'NCT03472664.xml', 'NCT02443948.xml', 'NCT02818309.xml', 'NCT00092989.xml']


In [6]:
#parse random xml file
import random
n = random.randint(0, 150000)

trial = xml_trials + data_xml[n] #full path
tree = ET.parse(trial).getroot()
print("Random: {}\n".format(n))
print('File: {}\n'.format(data_xml[n]))

Random: 124252

File: NCT02004106.xml



In [7]:
ET.tostring(tree)

b'<clinical_study>\n  \n  <required_header>\n    <download_date>ClinicalTrials.gov processed this data on September 14, 2018</download_date>\n    <link_text>Link to the current ClinicalTrials.gov record.</link_text>\n    <url>https://clinicaltrials.gov/show/NCT02004106</url>\n  </required_header>\n  <id_info>\n    <org_study_id>BP28920</org_study_id>\n    <secondary_id>2013-003041-41</secondary_id>\n    <nct_id>NCT02004106</nct_id>\n  </id_info>\n  <brief_title>A Study to Evaluate Safety, Pharmacokinetics, and Efficacy of RO6895882 in Participants With Advanced and/or Metastatic Solid Tumors</brief_title>\n  <official_title>An Open-Label, Multi-Center, Dose-Escalation, Phase I Study With an Expansion Phase, to Evaluate Safety, Pharmacokinetics and Therapeutic Activity of RO6895882, an Immunocytokine, Consisting of a Variant of Interleukin-2 (IL-2v) Targeting Carcinoembryonic Antigen (CEA) Administered Intravenously, in Patients With Advanced and/or Metastatic Solid Tumors</official_tit

## Method 1: Store keys & values from xml file in list

In [None]:
#Create Keys & Values with all info from the parsed trial
keys_trial = []
value_trial = []

for nct in tree.iter('nct_id'):
    id_trial = nct.text
    
for tag in tree:
    for i in tag:
        keys_trial.append(tag.tag)
        keys_trial.append(i.tag)
        value_trial.append(tag.text)
        value_trial.append(i.text)
        
print("Random: {}".format(n))
print("Trial id: {}".format(id_trial))
print("Number of keys: {}".format(len(keys_trial)))

In [None]:
# Check id, keys, values
print("Trial id: {}\n".format(id_trial))
for i, n in zip(keys_trial, value_trial):
    print('Key: {}, Value: {}'.format(i, n))

### Zip lists into a dictionary and save it as a json file
- Use only for files with no duplicated keys

In [None]:
# Save data in a dictionary

trial_dict = dict(zip(keys_trial, value_trial))


# Dump dictionary into a json file

with open('{}{}.json'.format(json_folder, id_trial), 'w') as fp:
    json.dump(trial_dict, fp)
    print('JSON file created\n')

 
print("Keys in list: {}".format(len(keys_trial)))
print('Keys in dictionary: {}\n'.format(len(trial_dict.keys())))
# print('Dictionary:\n{}'.format(trial_dict))

## Method 2: keep all duplicated keys

In [None]:
counts = Counter(keys_trial)
for value, count in counts.most_common():
    if count > 1:
        print('{},{}'.format(value, count))

In [None]:
for s,num in counts.items():
    if num > 1: 
        for suffix in range(1, num + 1): # suffix starts at 1 and increases by 1 each time
            keys_trial[keys_trial.index(s)] = s + str(suffix) # replace each appearance of s

In [None]:
# Check all keys are preserved
print("Number of keys in xml file: {}".format(len(keys_trial)))
print('Number of keys after changing names: {}'.format(len(keys_trial)))

In [None]:
# Create dictionary
full_trial_dict = dict(zip(keys_trial, value_trial))

print("Keys in list: {}".format(len(keys_trial)))
print('Keys in dictionary: {}\n'.format(len(full_trial_dict.keys())))
print('ID: {}\n'.format(full_trial_dict['nct_id']))
# print('Dictionary:\n{}'.format(full_trial_dict))

In [None]:
# create dataframe from dictionary
df_dict = pd.DataFrame.from_dict(full_trial_dict, orient='index', columns=[id_trial])

pd.set_option('display.max_columns', 100)
# display(df_dict)

In [None]:
# Dump dataframe into a json file

json_folder = 'data/json/'
df_dict.to_json('{}{}.json'.format(json_folder, id_trial))
print('JSON file created\n')

In [None]:
# Check: Import json file into a dataframe
file_location = json_folder+id_trial
df_trial = pd.read_json(file_location+'.json')

print('File location: {}'.format(file_location))
print('Trial id: {}'.format(id_trial))
print('Dataframe shape: {}'.format(df_trial.shape))
print("Keys in list: {}".format(len(keys_trial)))

In [None]:
pd.set_option('display.max_rows', 200)
display(df_trial)

In [None]:
# check basic data
# Slice it
print(df_trial[id_trial]['affiliation'])
print(df_trial[id_trial]['country'])

## Method 3: Parse only selected fields and save them in a dictionary

In [None]:
# Check XML fields are available
tree[0:]

In [None]:
final_dictionary = {}

def parsing_specific_tags(dictionary, tag):
    k = []
    v = []
    
    for i in tree.iter(tag):
        k.append(i.tag)
        v.append(i.text)
    print('Key: {}, Value: {}'.format(k, v))
    
    for n, h in zip(k, v):
        dictionary[n] = h
    
    return dictionary

parsing_specific_tags(final_dictionary, 'nct_id')
parsing_specific_tags(final_dictionary, 'country')

## Method 4: iterate over several tags and save info in a dictionary

In [8]:
tags_dictionary = {}

def parsing_several_tags(dictionary = tags_dictionary, tag = []):
    k = []
    v = []
    
    for p in tag:
        for i in tree.iter(p):
            k.append(i.tag)
            v.append(i.text)
            
    print('Keys: {}, Values: {}'.format(k, v))
    
    dictionary = dict(zip(k, v))
    
    return dictionary


list_of_tags = ['nct_id','country']

parsing_several_tags(tags_dictionary, list_of_tags)

Keys: ['nct_id', 'country', 'country', 'country', 'country', 'country', 'country', 'country', 'country', 'country', 'country', 'country', 'country', 'country', 'country', 'country', 'country', 'country', 'country'], Values: ['NCT02004106', 'United States', 'Denmark', 'Finland', 'France', 'Netherlands', 'Netherlands', 'Spain', 'Spain', 'Switzerland', 'United Kingdom', 'Denmark', 'Finland', 'France', 'Netherlands', 'Spain', 'Switzerland', 'United Kingdom', 'United States']


{'nct_id': 'NCT02004106', 'country': 'United States'}