In [1]:
#Useful references for building this scraper
#https://medium.freecodecamp.org/how-to-scrape-websites-with-python-and-beautifulsoup-5946935d93fe
#https://www.dataquest.io/blog/web-scraping-beautifulsoup/

from requests import get
from bs4 import BeautifulSoup
import re
from time import sleep
from time import time
from random import randint
from IPython.core.display import clear_output
from warnings import warn
import pandas as pd

In [2]:
#We will need the base EUCTR URL throughout the project
euctr_base_url = 'https://www.clinicaltrialsregister.eu'

#For testing, we pull page 1 of the advanced search URL that produces only trials that have results

url = 'https://www.clinicaltrialsregister.eu/ctr-search/search?query=&resultsstatus=trials-with-results&page1'
response = get(url, verify = False)
html = response.content

#what does our parsed html look like?
soup = BeautifulSoup(html, "html.parser")
print(soup.prettify())


<!DOCTYPE html>
<html>
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="IE=8" http-equiv="X-UA-Compatible">
   <meta content="" name="description"/>
   <link href="/ctr-search/css/960.css" rel="stylesheet" type="text/css"/>
   <link href="/ctr-search/css/reset.css" rel="stylesheet" type="text/css"/>
   <link href="/ctr-search/css/text.css" rel="stylesheet" type="text/css"/>
   <link href="/ctr-search/css/print.css" media="print" rel="stylesheet" type="text/css"/>
   <link href="/ctr-search/css/jquery-ui.css" rel="stylesheet" type="text/css"/>
   <link href="/ctr-search/css/styles.css" rel="stylesheet" type="text/css"/>
   <link href="/ctr-search/css/nav.css" rel="stylesheet" type="text/css"/>
   <script type="text/javascript">
    var ctx = "/ctr-search"
   </script>
   <script src="/ctr-search/js/jquery.js" type="text/javascript">
   </script>
   <script src="/ctr-search/js/jquery.ui.datepicker.js" type="text/javascript">
   </script>
   



In [3]:
#Each trial appears in the search as a series of tables. There are 20 on a full page of search results.
#This extracts all the trial tables from the larger HTML
tables = soup.find_all('table', {'class': 'result'})

In [4]:
#Quick check to make sure everything looks correct
print(type(tables))
print(len(tables))

<class 'bs4.element.ResultSet'>
20


In [5]:
#Looking at just to tables HTML so we can start extracting what we need
print(tables)

[<table class="result">
<tr>
<td style="width: 33%">
<input name="enchx" style="display:none" title="Mark this result for download" type="checkbox" value="2015-001216-35">
<span class="label">EudraCT Number:</span> 2015-001216-35
				</input></td>
<td style="width: 33%"><span class="label">Sponsor Protocol Number:</span> CONCERTAATT4086</td>
<td style="width: 33%"><span class="label">Start Date<span class="info-tip startdatetip">*</span>:</span> 2015-04-24</td>
</tr>
<tr class="even">
<td colspan="3">
<span class="label">Sponsor Name:</span>Johnson &amp; Johnson Taiwan Ltd
					
				</td>
</tr>
<tr>
<td colspan="3"><span class="label">Full Title:</span>  From Immediate-release MPH to OROS MPH: The Impact Upon Family of Children and Adolescents With ADHD </td>
</tr>
<tr class="even">
<td colspan="3"><span class="label">Medical condition:</span> Attention Deficit Hyperactivity Disorder</td>
</tr>
<tr>
<td colspan="3">
<table class="meddra">
<tr>
<td class="label" rowspan="2" style="wi

In [6]:
#Experiment with just the first search result
first_trial = tables[0]

In [7]:
#Extracts the EudraCT number from the first trial, then prints to test that it extracted correctly
first_trial_id = first_trial.input.get('value')
print(len(first_trial_id))
print(first_trial_id)

14
2015-001216-35


In [8]:
#Extracts the part of the URL that leads to the results URL and appends it to the base URL
first_results_link = euctr_base_url + first_trial.find_all('a')[-1].get('href')
print(first_results_link)

https://www.clinicaltrialsregister.eu/ctr-search/trial/2015-001216-35/results


In [9]:
#checking the next trial for testing to make sure above code still works
second_trial = tables[1]

In [10]:
second_trial_id = second_trial.input.get('value')
print(len(second_trial_id))
print(second_trial_id)

14
2004-000086-35


In [11]:
second_results_link = euctr_base_url + second_trial.find_all('a')[-1].get('href')
print(second_results_link)

https://www.clinicaltrialsregister.eu/ctr-search/trial/2004-000086-35/results


In [12]:
#blank lists for collectiong all trial ids and results urls from the first page of results. That's all we need for now.
trial_ids_first_page = []
results_urls_first_page = []

In [13]:
#Takes the above testing for getting tht data from single trials and generalizes it to a loop and then makes sure it worked
for table in tables:
    trial_id = table.input.get('value')
    trial_ids_first_page.append(trial_id)
    url = euctr_base_url + table.find_all('a')[-1].get('href')
    results_urls_first_page.append(url)
print(trial_ids_first_page)
print(results_urls_first_page)

['2015-001216-35', '2004-000086-35', '2007-002462-35', '2006-000870-63', '2004-000087-27', '2008-005768-15', '2007-004169-16', '2007-002476-33', '2007-002474-60', '2008-005769-71', '2009-013412-13', '2009-011105-17', '2005-004782-41', '2008-003789-24', '2009-012355-15', '2008-005767-34', '2009-011271-78', '2005-003632-22', '2004-002854-78', '2009-015721-36']
['https://www.clinicaltrialsregister.eu/ctr-search/trial/2015-001216-35/results', 'https://www.clinicaltrialsregister.eu/ctr-search/trial/2004-000086-35/results', 'https://www.clinicaltrialsregister.eu/ctr-search/trial/2007-002462-35/results', 'https://www.clinicaltrialsregister.eu/ctr-search/trial/2006-000870-63/results', 'https://www.clinicaltrialsregister.eu/ctr-search/trial/2004-000087-27/results', 'https://www.clinicaltrialsregister.eu/ctr-search/trial/2008-005768-15/results', 'https://www.clinicaltrialsregister.eu/ctr-search/trial/2007-004169-16/results', 'https://www.clinicaltrialsregister.eu/ctr-search/trial/2007-002476-33/

In [14]:
#For when this scrape us run on the full EUCTR, we will need to know how many pages long the "has results" search result is.
#This is how we extract that information. For lack of a better method, this uses a regular expression.
number_of_pages = soup.find('div', {'class': 'margin-bottom: 6px;'})
max_page_link = str(number_of_pages.find_all('a')[-1])
max_page = re.findall(r'\d+', max_page_link)[0]
print(max_page)

574


In [15]:
#using the method from the link at the beginning of this notebook, 
#we use the testing above to create a test crawler that will run on the first 5 pages of search results
pages = [str(i) for i in range(1,6)]
print(pages)
print('https://www.clinicaltrialsregister.eu/ctr-search/search?query=&resultsstatus=trials-with-results&page=' + pages[0])

['1', '2', '3', '4', '5']
https://www.clinicaltrialsregister.eu/ctr-search/search?query=&resultsstatus=trials-with-results&page=1


In [16]:
trial_ids = []
results_urls = []

In [17]:
start_time = time()
requests = 0

#for each of the first 5 pages of results
for page in pages:
    
    #make this request
    response = get('https://www.clinicaltrialsregister.eu/ctr-search/search?query=&resultsstatus=trials-with-results&page=' + page, verify = False)
    
    #pause to look like a human
    sleep(randint(0,1)) #this can likely be reduced quite a bit. Perhaps to just 1,2 or even just 1
    
    #mointor the requests to ensure everything is working
    requests += 1
    elapsed_time = time() - start_time
    print('Request: {}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
    clear_output(wait = True)
    
    # Throw a warning for a non-200 status code
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(requests, response.status_code))

    #Break the looop if we exceed the number of requests which will need to change when i do full scrape
    if requests > 5:
        warn('Number of requests was greater than expected.')  
        break 
    
    #Parse the requests
    page_html = BeautifulSoup(response.text, 'html.parser')
    
    #select all the trial tables
    trial_tables = page_html.find_all('table', {'class': 'result'})
    
    #get the trial id and the trial url for each thing
    for trial_table in trial_tables:
        trial_id = trial_table.input.get('value')
        trial_ids.append(trial_id)
        url = euctr_base_url + trial_table.find_all('a')[-1].get('href')
        results_urls.append(url)
    



Request: 5; Frequency: 1.250327389730399 requests/s


In [19]:
#It works! We now can extract all the trial IDs that appear in the search results for a results only search.
print(trial_ids)

['2015-001216-35', '2004-000086-35', '2007-002462-35', '2006-000870-63', '2004-000087-27', '2008-005768-15', '2007-004169-16', '2007-002476-33', '2007-002474-60', '2008-005769-71', '2009-013412-13', '2009-011105-17', '2005-004782-41', '2008-003789-24', '2009-012355-15', '2008-005767-34', '2009-011271-78', '2005-003632-22', '2004-002854-78', '2009-015721-36', '2010-019051-21', '2010-018331-18', '2005-000888-26', '2007-003766-17', '2008-005777-35', '2010-023482-21', '2008-006443-39', '2009-011672-29', '2005-006161-13', '2005-000859-15', '2005-004662-16', '2009-013326-17', '2006-005144-84', '2007-006611-23', '2008-005774-13', '2009-012218-30', '2009-010662-28', '2009-015595-10', '2009-012923-27', '2006-001378-26', '2006-004093-27', '2006-001518-34', '2008-005776-27', '2010-022337-29', '2008-001524-31', '2007-002716-26', '2006-006529-25', '2006-004565-33', '2007-001471-11', '2010-023636-17', '2008-008259-41', '2009-009986-32', '2011-006066-40', '2010-024100-10', '2011-004983-32', '2011-004

In [20]:
#For the next step, we first need to create the components of the actual results URL that we can stick the above trial_ids into
results_base_url = 'https://www.clinicaltrialsregister.eu/ctr-search/trial/'
results_end_url = '/results'

In [21]:
#So far, I have only ever come across 2 ways an EUCTR results page can look. It will either be a page with tabular results
#or a page with a synopsis. They each have slightly different data so we need to do some testing with both.
#Examples of each type below

#tabular
results_test_url_1 = 'https://www.clinicaltrialsregister.eu/ctr-search/trial/2015-001216-35/results'

#synopsis
results_test_url_2 = 'https://www.clinicaltrialsregister.eu/ctr-search/trial/2004-000086-35/results'

In [23]:
#Extracting and parsing the tabular example

results_1 = get(results_test_url_1, verify = False)
results_1_html = results_1.content

soup1 = BeautifulSoup(results_1_html, "html.parser")
print(soup1.prettify())



<!DOCTYPE html>
<html>
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="IE=8" http-equiv="X-UA-Compatible">
   <meta content="" name="description"/>
   <link href="/ctr-search/css/960.css" rel="stylesheet" type="text/css"/>
   <link href="/ctr-search/css/reset.css" rel="stylesheet" type="text/css"/>
   <link href="/ctr-search/css/text.css" rel="stylesheet" type="text/css"/>
   <link href="/ctr-search/css/print.css" media="print" rel="stylesheet" type="text/css"/>
   <link href="/ctr-search/css/jquery-ui.css" rel="stylesheet" type="text/css"/>
   <link href="/ctr-search/css/styles.css" rel="stylesheet" type="text/css"/>
   <link href="/ctr-search/css/nav.css" rel="stylesheet" type="text/css"/>
   <script type="text/javascript">
    var ctx = "/ctr-search"
   </script>
   <script src="/ctr-search/js/jquery.js" type="text/javascript">
   </script>
   <script src="/ctr-search/js/jquery.ui.datepicker.js" type="text/javascript">
   </script>
   

In [None]:
#It appears that the table containing the information we want on the trial results is always the 5th one down on a page (zero indexed)

results_table_1 = soup1.find_all('table')[4]
print(results_table_1)

In [None]:
#this is how we get the trial id
r1_trial_id = results_table_1.find_all('a')[0].get_text()
print(r1_trial_id)

In [None]:
#this is a sub-extraction of the section that contains everything else we need
r1_tds = results_table_1.find_all('td', class_ = 'valueColumn')
print(r1_tds)

In [24]:
#creating a function that gets the piece of data from the part of the table we want, and cleans it up
def tds_strip(td_table, td):
    return td_table[td].div.get_text().strip()


In [None]:
#global end of trial date
print(tds_strip(r1_tds,3))

In [None]:
#First version publication date
print(tds_strip(r1_tds,7))

In [None]:
#This version publication date
print(tds_strip(r1_tds,6))

In [None]:
#current version
rd1_version = r1_tds[5].get_text().strip()
print(rd1_version)

In [25]:
#now we can move on to the synopsis results page for testing. First we parse.

results_2 = get(results_test_url_2, verify = False)
results_2_html = results_2.content

soup2 = BeautifulSoup(results_2_html, "html.parser")
print(soup2.prettify())

<!DOCTYPE html>
<html>
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="IE=8" http-equiv="X-UA-Compatible">
   <meta content="" name="description"/>
   <link href="/ctr-search/css/960.css" rel="stylesheet" type="text/css"/>
   <link href="/ctr-search/css/reset.css" rel="stylesheet" type="text/css"/>
   <link href="/ctr-search/css/text.css" rel="stylesheet" type="text/css"/>
   <link href="/ctr-search/css/print.css" media="print" rel="stylesheet" type="text/css"/>
   <link href="/ctr-search/css/jquery-ui.css" rel="stylesheet" type="text/css"/>
   <link href="/ctr-search/css/styles.css" rel="stylesheet" type="text/css"/>
   <link href="/ctr-search/css/nav.css" rel="stylesheet" type="text/css"/>
   <script type="text/javascript">
    var ctx = "/ctr-search"
   </script>
   <script src="/ctr-search/js/jquery.js" type="text/javascript">
   </script>
   <script src="/ctr-search/js/jquery.ui.datepicker.js" type="text/javascript">
   </script>
   



In [71]:
#this text is only available when there is only a synopsis. This will become useful later on as we will use it as part of our conditional
synopsis_text_test = soup3.find('div', id = 'synopsisLegislationNote')
print(synopsis_text_test)

if synopsis_text_test is not None:
    print(0)
else:
    print(1)

None
1


In [62]:
#once again, we want the 5th table on the page
results_table_2 = soup2.find_all('table')[4]
#print(results_table_2)

In [33]:
#getting the trial id
r2_trial_id = results_table_2.find_all('a')[0].get_text()
print(r2_trial_id)

2004-000086-35


In [35]:
#getting the sub-extraction for the test of the data 
#and then using our function from before to extract and clean up at the locations for the data we want
r2_tds = results_table_2.find_all('td', class_ = 'valueColumn')
#print(r2_tds)

In [36]:
#Global end of trial date
print(tds_strip(r2_tds,3))

20 Dec 2005


In [None]:
#This version publication date
print(tds_strip(r2_tds,10))

In [None]:
#First version publication date
print(tds_strip(r2_tds,11))

In [None]:
#current version
rd2_version = r2_tds[9].get_text().strip()
print(rd2_version)

In [None]:
#One extra thing to extract here is the presense of the link to synopsis. This is what will allow us to differentiate 
#what type of page we are scraping when we run the full crawler so we know which information indexes to use
r2_tds_lc = results_table_2.find_all('td', class_ = 'labelColumn')
print(r2_tds_lc)

In [None]:
#This will turn into our results type indicator
rd2_attachment = r2_tds_lc[-1].div.get_text().strip()
print(rd2_attachment)

In [48]:
#on testing for full run of scrape came across a URL that has both a synopsis link and tabular results. Need to adjust crawler to account for this.
#the url for testing is https://www.clinicaltrialsregister.eu/ctr-search/trial/2011-005336-25/results

results_test_url_3 = 'https://www.clinicaltrialsregister.eu/ctr-search/trial/2011-005336-25/results'
results_3 = get(results_test_url_3, verify = False)
results_3_html = results_3.content

soup3 = BeautifulSoup(results_3_html, "html.parser")
print(soup3.prettify())

#it looks like we can just use the tabular logic and look for the "legislation language" text to sort this into the right place



<!DOCTYPE html>
<html>
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="IE=8" http-equiv="X-UA-Compatible">
   <meta content="" name="description"/>
   <link href="/ctr-search/css/960.css" rel="stylesheet" type="text/css"/>
   <link href="/ctr-search/css/reset.css" rel="stylesheet" type="text/css"/>
   <link href="/ctr-search/css/text.css" rel="stylesheet" type="text/css"/>
   <link href="/ctr-search/css/print.css" media="print" rel="stylesheet" type="text/css"/>
   <link href="/ctr-search/css/jquery-ui.css" rel="stylesheet" type="text/css"/>
   <link href="/ctr-search/css/styles.css" rel="stylesheet" type="text/css"/>
   <link href="/ctr-search/css/nav.css" rel="stylesheet" type="text/css"/>
   <script type="text/javascript">
    var ctx = "/ctr-search"
   </script>
   <script src="/ctr-search/js/jquery.js" type="text/javascript">
   </script>
   <script src="/ctr-search/js/jquery.ui.datepicker.js" type="text/javascript">
   </script>
   

In [72]:
#creating the lists for our data
results_trial_id = []
global_end_of_trial_date = []
first_publication_date = []
current_publication_date = []
results_version = []
results_type = []

In [73]:
#a quick test on 11 trial ids to run in the crawler below
test_results = trial_ids[0:11]
print(test_results)

['2015-001216-35', '2004-000086-35', '2007-002462-35', '2006-000870-63', '2004-000087-27', '2008-005768-15', '2007-004169-16', '2007-002476-33', '2007-002474-60', '2008-005769-71', '2009-013412-13']


In [74]:
start_time_2 = time()
requests_2 = 0

#for each of the first 5 pages of results
for test_result in test_results:
    
    #make this request
    response = get(results_base_url + test_result + results_end_url, verify = False)
    
    #pause to look like a human
    sleep(randint(0,1))
    
    #mointor the requests to ensure everything is working
    requests_2 += 1
    elapsed_time = time() - start_time_2
    print('Request: {}; Frequency: {} requests/s'.format(requests_2, requests_2/elapsed_time))
    clear_output(wait = True)
    
    # Throw a warning for a non-200 status code
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(requests_2, response.status_code))

    #Break the looop if we exceed the number of requests which will need to change when i do full scrape
    if requests_2 > 100:
        warn('Number of requests was greater than expected.')  
        break 
    
    #Parse the requests
    page_html = BeautifulSoup(response.text, 'html.parser')
    
    #select all the results tables
    leg_text = page_html.find('div', id = 'synopsisLegislationNote')
    trial_tables = page_html.find_all('table')[4]
    td_value = trial_tables.find_all('td', class_ = 'valueColumn')
    td_label = trial_tables.find_all('td', class_ = 'labelColumn') 
    
    if td_label[-1].div.get_text().strip() == 'Summary report(s)' and leg_text is not None:
        trial_id = trial_tables.find_all('a')[0].get_text()
        results_trial_id.append(trial_id)
        global_end_date = tds_strip(td_value,3)
        global_end_of_trial_date.append(global_end_date)
        first_pub = tds_strip(td_value,11)
        first_publication_date.append(first_pub)
        current_pub = tds_strip(td_value,10)
        current_publication_date.append(current_pub)
        version = td_value[9].get_text().strip()
        results_version.append(version)
        results_type.append("Document")
    
    elif td_label[-1].div.get_text().strip() == 'Summary report(s)' and leg_text is None:
        trial_id = trial_tables.find_all('a')[0].get_text()
        results_trial_id.append(trial_id)
        global_end_date = tds_strip(td_value,3)
        global_end_of_trial_date.append(global_end_date)
        first_pub = tds_strip(td_value,7)
        first_publication_date.append(first_pub)
        current_pub = tds_strip(td_value,6)
        current_publication_date.append(current_pub)
        version = td_value[5].get_text().strip()
        results_version.append(version)
        results_type.append("Mixed")
        
    else:
        trial_id = trial_tables.find_all('a')[0].get_text()
        results_trial_id.append(trial_id)
        global_end_date = tds_strip(td_value,3)
        global_end_of_trial_date.append(global_end_date)
        first_pub = tds_strip(td_value,7)
        first_publication_date.append(first_pub)
        current_pub = tds_strip(td_value,6)
        current_publication_date.append(current_pub)
        version = td_value[5].get_text().strip()
        results_version.append(version)
        results_type.append("Tabular")



Request: 11; Frequency: 1.3153778951276698 requests/s


In [75]:
#lets print a few of our lists just to check that it looked ok
print(results_trial_id)
print(global_end_of_trial_date)

['2015-001216-35', '2004-000086-35', '2007-002462-35', '2006-000870-63', '2004-000087-27', '2008-005768-15', '2007-004169-16', '2007-002476-33', '2007-002474-60', '2008-005769-71', '2009-013412-13']
['23 Jul 2008', '20 Dec 2005', '27 Aug 2008', '16 Apr 2010', '27 Dec 2005', '23 Aug 2010', '14 Sep 2010', '23 Jul 2008', '20 Jun 2008', '31 May 2010', '02 Dec 2010']


In [76]:
#now lets make a dataframe to check how everything turned out

test_df = pd.DataFrame({'trial_id': results_trial_id,
                       'global_trial_end_date': global_end_of_trial_date,
                       'first_pub_date': first_publication_date,
                       'current_pub_date': current_publication_date,
                       'version': results_version,
                       'results_type': results_type})

print(test_df)

          trial_id global_trial_end_date first_pub_date current_pub_date  \
0   2015-001216-35           23 Jul 2008    29 Jan 2016      01 Jul 2016   
1   2004-000086-35           20 Dec 2005    30 Jan 2016      30 Jan 2016   
2   2007-002462-35           27 Aug 2008    30 Jan 2016      30 Jan 2016   
3   2006-000870-63           16 Apr 2010    30 Jan 2016      30 Jan 2016   
4   2004-000087-27           27 Dec 2005    30 Jan 2016      30 Jan 2016   
5   2008-005768-15           23 Aug 2010    30 Jan 2016      30 Jan 2016   
6   2007-004169-16           14 Sep 2010    30 Jan 2016      30 Jan 2016   
7   2007-002476-33           23 Jul 2008    30 Jan 2016      30 Jan 2016   
8   2007-002474-60           20 Jun 2008    30 Jan 2016      30 Jan 2016   
9   2008-005769-71           31 May 2010    30 Jan 2016      30 Jan 2016   
10  2009-013412-13           02 Dec 2010    30 Jan 2016      30 Jan 2016   

        version results_type  
0   v2(current)      Tabular  
1   v1(current)     Docum

In [None]:
test_df.dtypes

In [None]:
#next things to do
# 1. Get the dates to act like dates
# 2. Make a much more condensed version of the crawlers without all the testing
# 3. Run them in full and make sure they work!

In [77]:
test_df.head()

Unnamed: 0,trial_id,global_trial_end_date,first_pub_date,current_pub_date,version,results_type
0,2015-001216-35,23 Jul 2008,29 Jan 2016,01 Jul 2016,v2(current),Tabular
1,2004-000086-35,20 Dec 2005,30 Jan 2016,30 Jan 2016,v1(current),Document
2,2007-002462-35,27 Aug 2008,30 Jan 2016,30 Jan 2016,v1(current),Document
3,2006-000870-63,16 Apr 2010,30 Jan 2016,30 Jan 2016,v1(current),Document
4,2004-000087-27,27 Dec 2005,30 Jan 2016,30 Jan 2016,v1(current),Document
