In [None]:
#Useful references for building this scraper
#https://medium.freecodecamp.org/how-to-scrape-websites-with-python-and-beautifulsoup-5946935d93fe
#https://www.dataquest.io/blog/web-scraping-beautifulsoup/

from requests import get
from bs4 import BeautifulSoup
import re
from time import sleep
from time import time
from random import randint
from IPython.core.display import clear_output
from warnings import warn
import pandas as pd

In [None]:
#We will need the base EUCTR URL throughout the project
euctr_base_url = 'https://www.clinicaltrialsregister.eu'

#For testing, we pull page 1 of the advanced search URL that produces only trials that have results.
#The page number is passed as 'page=<n>', the same form the crawler further down uses.
url = 'https://www.clinicaltrialsregister.eu/ctr-search/search?query=&resultsstatus=trials-with-results&page=1'

#NOTE(review): verify = False switches off TLS certificate verification for this request -
#confirm it is still needed for this site before running the full scrape.
response = get(url, verify = False)
html = response.content

#what does our parsed html look like?
soup = BeautifulSoup(html, "html.parser")
print(soup.prettify())


In [None]:
#Each trial appears in the search results as its own table with class "result".
#There are 20 on a full page of search results.
#This extracts all of those trial tables from the larger HTML
tables = soup.find_all('table', {'class': 'result'})

In [None]:
#Quick check to make sure everything looks correct:
#the type of object find_all gave back, and how many trial tables it found
print(type(tables))
print(len(tables))

In [None]:
#Looking at just the tables' HTML so we can start extracting what we need
print(tables)

In [None]:
#Experiment with just the first search result (the first trial table on the page)
first_trial = tables[0]

In [None]:
#Extracts the EudraCT number from the first trial (the 'value' attribute of the table's first <input> tag),
#then prints its length and the id itself to test that it extracted correctly
first_trial_id = first_trial.input.get('value')
print(len(first_trial_id))
print(first_trial_id)

In [None]:
#Takes the href of the last link in the trial's table (the path that leads to the results page)
#and appends it to the base URL to get the full results URL
first_results_link = euctr_base_url + first_trial.find_all('a')[-1].get('href')
print(first_results_link)

In [None]:
#Checking the next trial on the page to make sure the extraction code above still works
second_trial = tables[1]

In [None]:
#Same EudraCT number extraction as for the first trial, applied to the second one
second_trial_id = second_trial.input.get('value')
print(len(second_trial_id))
print(second_trial_id)

In [None]:
#Same results URL construction as for the first trial, applied to the second one
second_results_link = euctr_base_url + second_trial.find_all('a')[-1].get('href')
print(second_results_link)

In [None]:
#blank lists for collecting all trial ids and results urls from the first page of results. That's all we need for now.
trial_ids_first_page = []
results_urls_first_page = []

In [None]:
#Generalizes the single-trial extraction tested above into a loop over every trial table on the page,
#then prints both lists to make sure it worked
for table in tables:
    #pull both pieces of data out of this trial's table
    trial_id = table.input.get('value')
    url = euctr_base_url + table.find_all('a')[-1].get('href')

    #store them in the first-page lists
    trial_ids_first_page.append(trial_id)
    results_urls_first_page.append(url)

print(trial_ids_first_page)
print(results_urls_first_page)

In [None]:
#When this scrape is run on the full EUCTR, we will need to know how many pages long the "has results" search result is.
#This is how we extract that information. For lack of a better method, this uses a regular expression:
#take the last link inside the matched div and keep the first run of digits found in that link's HTML.
#NOTE(review): the lookup matches a div whose *class* attribute is literally 'margin-bottom: 6px;', which reads
#like a style value - confirm against the live page HTML before changing it. If no div matches, find() returns
#None and the next line fails.
number_of_pages = soup.find('div', {'class': 'margin-bottom: 6px;'})
max_page_link = str(number_of_pages.find_all('a')[-1])
max_page = re.findall(r'\d+', max_page_link)[0]
print(max_page)

In [None]:
#Following the method from the links at the beginning of this notebook, we use the testing above
#to create a test crawler that will run on the first 5 pages of search results.
#The page numbers are kept as strings so they can be appended straight onto the search URL.
pages = list(map(str, range(1, 6)))
print(pages)
print('https://www.clinicaltrialsregister.eu/ctr-search/search?query=&resultsstatus=trials-with-results&page=' + pages[0])

In [None]:
#blank lists that the test crawler below fills with the trial ids and results urls from every page it visits
trial_ids = []
results_urls = []

In [None]:
#Test crawler: walks the search result pages listed in `pages` and collects every trial id and results url.
#The two lists are (re)created here so that re-running this cell does not append duplicates.
trial_ids = []
results_urls = []

#search URL without the page number, which gets appended for each request
search_page_url = 'https://www.clinicaltrialsregister.eu/ctr-search/search?query=&resultsstatus=trials-with-results&page='

#safety cap on the number of requests; this will need to change for the full scrape
max_requests = len(pages)

start_time = time()
requests = 0 #request counter (only `get` is imported from the requests library, so the name is free)

#for each of the first 5 pages of results
for page in pages:

    #make this request
    #NOTE(review): verify = False switches off TLS certificate verification - confirm it is still needed
    response = get(search_page_url + page, verify = False)

    #pause to look like a human
    sleep(randint(1,4)) #this can likely be reduced quite a bit. Perhaps to just 1,2 or even just 1

    #monitor the requests to ensure everything is working
    requests += 1
    elapsed_time = time() - start_time
    print('Request: {}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
    clear_output(wait = True)

    #Break the loop if we exceed the expected number of requests
    if requests > max_requests:
        warn('Number of requests was greater than expected.')
        break

    #Throw a warning for a non-200 status code and skip the page: an error page holds no trial tables to parse
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(requests, response.status_code))
        continue

    #Parse the response
    page_html = BeautifulSoup(response.text, 'html.parser')

    #select all the trial tables
    trial_tables = page_html.find_all('table', {'class': 'result'})

    #get the trial id and the results url for each trial on the page
    for trial_table in trial_tables:
        trial_id = trial_table.input.get('value')
        url = euctr_base_url + trial_table.find_all('a')[-1].get('href')
        trial_ids.append(trial_id)
        results_urls.append(url)
    

In [None]:
#It works! We can now extract all the trial IDs that appear in the search results for a results-only search.
print(trial_ids)

In [None]:
#For the next step, we first need the two fixed parts of a results URL; a trial id from trial_ids goes between them:
#results_base_url + <trial id> + results_end_url
results_base_url = 'https://www.clinicaltrialsregister.eu/ctr-search/trial/'
results_end_url = '/results'

In [None]:
#So far, I have only ever come across 2 ways an EUCTR results page can look. It will either be a page with tabular results
#or a page with a synopsis. Each lays its data out slightly differently, so we need to do some testing with both.
#One example of each type is below

#tabular
results_test_url_1 = 'https://www.clinicaltrialsregister.eu/ctr-search/trial/2015-001216-35/results'

#synopsis
results_test_url_2 = 'https://www.clinicaltrialsregister.eu/ctr-search/trial/2004-000086-35/results'

In [None]:
#Fetching and parsing the tabular example, then printing the parsed HTML to look it over
#NOTE(review): verify = False switches off TLS certificate verification - confirm it is still needed

results_1 = get(results_test_url_1, verify = False)
results_1_html = results_1.content

soup2 = BeautifulSoup(results_1_html, "html.parser")
print(soup2.prettify())

In [None]:
#It appears that the table containing the information we want on the trial results is always the 5th table on the page,
#which is index 4 when counting from zero

results_table_1 = soup2.find_all('table')[4]
print(results_table_1)

In [None]:
#this is how we get the trial id: it is the text of the first link in the results table
r1_trial_id = results_table_1.find_all('a')[0].get_text()
print(r1_trial_id)

In [None]:
#this is a sub-extraction of the section that contains everything else we need:
#every <td> with class 'valueColumn' in the results table, which we then pick from by position
r1_tds = results_table_1.find_all('td', class_ = 'valueColumn')
print(r1_tds)

In [None]:
#creating a function that gets the piece of data from the part of the table we want, and cleans it up
def tds_strip(td_table, td):
    """Return the cleaned-up text held in one cell of a results table.

    td_table -- indexable collection of table cells (in this notebook, the 'valueColumn' cells)
    td       -- position of the wanted cell within td_table

    The text is read from the cell's <div> and has its leading and trailing whitespace removed.
    """
    cell = td_table[td]
    raw_text = cell.div.get_text()
    return raw_text.strip()


In [None]:
#global end of trial date (value cell 3 on the tabular page)
print(tds_strip(r1_tds,3))

In [None]:
#First version publication date (value cell 7 on the tabular page)
print(tds_strip(r1_tds,7))

In [None]:
#This version publication date (value cell 6 on the tabular page)
print(tds_strip(r1_tds,6))

In [None]:
#current version (value cell 5 on the tabular page)
#this one is read from the cell itself rather than through tds_strip, which reads the cell's <div>
rd1_version = r1_tds[5].get_text().strip()
print(rd1_version)

In [None]:
#now we can move on to the synopsis results page for testing. First we fetch and parse it.
#NOTE(review): verify = False switches off TLS certificate verification - confirm it is still needed

results_2 = get(results_test_url_2, verify = False)
results_2_html = results_2.content

soup3 = BeautifulSoup(results_2_html, "html.parser")
print(soup3.prettify())

In [None]:
#once again, we want the 5th table on the page (index 4 when counting from zero)
results_table_2 = soup3.find_all('table')[4]
print(results_table_2)

In [None]:
#getting the trial id, again from the text of the first link in the results table
r2_trial_id = results_table_2.find_all('a')[0].get_text()
print(r2_trial_id)

In [None]:
#getting the sub-extraction (the 'valueColumn' cells) that holds the rest of the data;
#the cells below use our function from before to extract and clean up the values at the positions we want
r2_tds = results_table_2.find_all('td', class_ = 'valueColumn')
print(r2_tds)

In [None]:
#Global end of trial date (value cell 3, the same position as on the tabular page)
print(tds_strip(r2_tds,3))

In [None]:
#This version publication date (value cell 10 on the synopsis page; it was cell 6 on the tabular page)
print(tds_strip(r2_tds,10))

In [None]:
#First version publication date (value cell 11 on the synopsis page; it was cell 7 on the tabular page)
print(tds_strip(r2_tds,11))

In [None]:
#current version (value cell 9 on the synopsis page; it was cell 5 on the tabular page)
rd2_version = r2_tds[9].get_text().strip()
print(rd2_version)

In [None]:
#One extra thing to extract here is the presence of the link to the synopsis. This is what will allow us to differentiate
#what type of page we are scraping when we run the full crawler, so we know which cell positions to use.
#The label text lives in the 'labelColumn' cells rather than the 'valueColumn' ones.
r2_tds_lc = results_table_2.find_all('td', class_ = 'labelColumn')
print(r2_tds_lc)

In [None]:
#This will turn into our results type indicator: the text of the last label cell in the table.
#The crawler below compares this text with 'Summary report(s)' to recognise a synopsis page.
rd2_attachment = r2_tds_lc[-1].div.get_text().strip()
print(rd2_attachment)

In [None]:
#creating the lists for our data: one list per field, each of which gets one entry per trial
#so that they line up as the columns of a dataframe later on
results_trial_id = []
global_end_of_trial_date = []
first_publication_date = []
current_publication_date = []
results_version = []
results_type = []

In [None]:
#a quick test sample of the first 11 trial ids to run through the crawler below
test_results = trial_ids[0:11]
print(test_results)

In [None]:
#Test crawler for the results pages: visits the results page of every trial id in test_results and
#records its id, global end of trial date, first and current publication dates, version and results type.
#The lists are (re)created here so that re-running this cell does not append duplicate rows.
results_trial_id = []
global_end_of_trial_date = []
first_publication_date = []
current_publication_date = []
results_version = []
results_type = []

#Position of each field among the 'valueColumn' cells for the two page layouts worked out above.
#The end date sits in the same place on both; the other fields sit at higher positions on a synopsis ("Document") page.
value_positions = {
    'Document': {'end_date': 3, 'first_pub': 11, 'current_pub': 10, 'version': 9},
    'Tabular': {'end_date': 3, 'first_pub': 7, 'current_pub': 6, 'version': 5},
}

#safety cap on the number of requests; this will need to change for the full scrape
max_requests_2 = 100

start_time_2 = time()
requests_2 = 0

#for each trial id in the test sample
for test_result in test_results:

    #make this request
    #NOTE(review): verify = False switches off TLS certificate verification - confirm it is still needed
    response = get(results_base_url + test_result + results_end_url, verify = False)

    #pause to look like a human
    sleep(randint(1,4))

    #monitor the requests to ensure everything is working
    requests_2 += 1
    elapsed_time = time() - start_time_2
    print('Request: {}; Frequency: {} requests/s'.format(requests_2, requests_2/elapsed_time))
    clear_output(wait = True)

    #Break the loop if we exceed the expected number of requests
    if requests_2 > max_requests_2:
        warn('Number of requests was greater than expected.')
        break

    #Throw a warning for a non-200 status code and move on: an error page has no results table to read
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(requests_2, response.status_code))
        continue

    #Parse the response
    page_html = BeautifulSoup(response.text, 'html.parser')

    #the results table is the 5th table on the page; skip the trial if the page does not have one
    page_tables = page_html.find_all('table')
    if len(page_tables) < 5:
        warn('Request: {}; no results table found for trial {}'.format(requests_2, test_result))
        continue

    #select the results table (a single table, despite the plural name) and its value/label cells
    trial_tables = page_tables[4]
    td_value = trial_tables.find_all('td', class_ = 'valueColumn')
    td_label = trial_tables.find_all('td', class_ = 'labelColumn')

    #the last label tells us which layout this page uses, and therefore which cell positions apply
    if td_label[-1].div.get_text().strip() == 'Summary report(s)':
        page_type = 'Document'
    else:
        page_type = 'Tabular'
    positions = value_positions[page_type]

    #extract every field first, so an error part-way through a page cannot leave the lists with different lengths
    trial_id = trial_tables.find_all('a')[0].get_text()
    global_end_date = tds_strip(td_value, positions['end_date'])
    first_pub = tds_strip(td_value, positions['first_pub'])
    current_pub = tds_strip(td_value, positions['current_pub'])
    version = td_value[positions['version']].get_text().strip()

    #then store the complete row
    results_trial_id.append(trial_id)
    global_end_of_trial_date.append(global_end_date)
    first_publication_date.append(first_pub)
    current_publication_date.append(current_pub)
    results_version.append(version)
    results_type.append(page_type)

In [None]:
#let's print a couple of our lists (trial ids and global end dates) just to check that they look ok
print(results_trial_id)
print(global_end_of_trial_date)

In [None]:
#now let's gather the scraped lists into a dataframe, one column per list, to check how everything turned out

test_columns = {
    'trial_id': results_trial_id,
    'global_trial_end_date': global_end_of_trial_date,
    'first_pub_date': first_publication_date,
    'current_pub_date': current_publication_date,
    'version': results_version,
    'results_type': results_type,
}
test_df = pd.DataFrame(test_columns)

print(test_df)

In [None]:
#check what type each column came out as (the to-do list below includes getting the dates to act like dates)
test_df.dtypes

In [None]:
#next things to do
# 1. Get the dates to act like dates
# 2. Make a much more condensed version of the crawlers without all the testing
# 3. Run them in full and make sure they work!