# Loading main EMA page

In [2]:
import os
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [36]:
# Local directory under which one sub-folder per medicine is created.
# NOTE(review): looks like a hardcoded absolute Windows path -- consider making it configurable.
BASE_PATH = "C:/Users/"

# Site root and the path of the medicine pages; they are concatenated with the
# medicine name to build the page URL, and BASE_URL is also prefixed to every
# link scraped from a page.
BASE_URL = "https://www.ema.europa.eu/"
MAIN_PAGE = "en/medicines/human/EPAR/"

# Base functions

In [79]:
def try_get_request(url, tries=1, max_tries=5):
    """Fetch ``url`` with ``requests.get``, retrying until an HTTP 200 arrives.

    Between two attempts the function sleeps ``tries * 5`` seconds, so the
    pause grows linearly with the attempt number. Network-level failures
    (``requests.RequestException``) are retried the same way as non-200
    answers.

    Args:
        url: Address to request.
        tries: Number assigned to the first attempt (1 = start from scratch).
        max_tries: Highest attempt number that is still executed.

    Returns:
        The response of the last attempt. Callers must check ``status_code``:
        a non-200 response is returned once the attempts are exhausted.

    Raises:
        ValueError: If ``tries`` is greater than ``max_tries``, i.e. no
            attempt would be made at all.
        requests.RequestException: If the last attempt failed at the network
            level, so there is no response to return.
    """
    if tries > max_tries:
        raise ValueError(
            f"tries ({tries}) must not exceed max_tries ({max_tries})"
        )

    response = None
    last_error = None

    while tries <= max_tries:
        try:
            # A timeout keeps a stalled server from blocking the notebook forever.
            response = requests.get(url, timeout=30)
            last_error = None
        except requests.RequestException as error:
            response, last_error = None, error

        if response is not None and response.status_code == 200:
            break

        # Only wait (and announce a retry) when another attempt really follows.
        if tries < max_tries:
            print(f"\t>>> Failed to get content. Waiting {tries*5} seconds to retry...")
            time.sleep(tries*5)
        tries += 1

    if response is None:
        raise last_error
    return response

In [115]:
def _pdf_file_name(url):
    """Derive a local ``.pdf`` file name from the last path segment of ``url``."""
    name = url.rsplit('/', 1)[-1]
    if name.endswith('.pdf'):
        return name

    # Text trailing the ".pdf" marker is moved into parentheses in front of the
    # extension, e.g. "doc.pdf-0" becomes "doc(-0).pdf".
    stem, marker, trailer = name.partition('.pdf')
    if marker:
        return f"{stem}({trailer}).pdf"

    # No ".pdf" in the segment at all: just append the extension.
    return name + '.pdf'


def download_pdf(store_path, id, soup):
    """Download every document linked from one section of a parsed page.

    Args:
        store_path: Existing directory the files are written to.
        id: HTML ``id`` of the page section to scan for document links.
        soup: Parsed page (``BeautifulSoup`` object) to search in.

    Returns:
        None. Files are written to ``store_path`` and progress is printed.
        A section that is absent from the page is reported and skipped.
    """
    results = soup.find(id=id)
    if results is None:
        print(f"\t>>> Section '{id}' not found on the page.")
        return

    # The class string is matched exactly as written, so it has to mirror the
    # markup of the document links.
    elements = results.find_all("a",
                            class_ = "standalone align-self-top d-inline-block mt-3-5 mt-md-0 flex-shrink-0",
                            href = True)
    # urljoin copes with root-relative ("/en/...") and absolute hrefs alike.
    urls = [urljoin(BASE_URL, element['href']) for element in elements]

    print(f"\t>>> Found {len(urls)} documents.")

    for url in urls:
        name = _pdf_file_name(url)
        pdf_path = os.path.join(store_path, name)
        response = try_get_request(url)

        if response.status_code == 200:
            with open(pdf_path, 'wb') as file:
                file.write(response.content)
            print(f"\t>>> {name} downloaded succesfully!")
        else:
            print("\t>>> Unable to extract the content.")
        # Pause between downloads.
        time.sleep(2)

# Get the documents from the main page

The PDF documents are grouped into page sections, each identified by an HTML `id`:
* Medicine overview and risk-management plan summaries -> **id = "ema-inpage-item-overview"**
* Product information -> **id = "ema-inpage-item-product-info"**
* Assessment history -> **id = "ema-inpage-item-assessment-history"**
* More information on the drug -> **id = "ema-inpage-item-related-medicines"**

In [123]:
def get_ema_docs(drug_name):
    """Download the documents published on the EMA page of ``drug_name``.

    A folder named after the drug is created under ``BASE_PATH``; the drug's
    main page is fetched and the documents of its overview, product
    information and assessment-history sections are saved there. Every link
    found in the related-medicines section is then opened and the documents
    of its decision section are saved to the same folder.

    Args:
        drug_name: Name appended to the main-page URL; also used as the
            folder name.

    Returns:
        None. Progress is printed; if the main page cannot be read the
        function reports it and stops.
    """
    # Create a folder for the drug documents (including missing parents)
    print("Creating drug folder")
    path = os.path.join(BASE_PATH, drug_name)
    os.makedirs(path, exist_ok=True)
    print("\t>>> Done!")

    # Get the main EMA webpage content
    print(f"Reading {drug_name} main page")
    page = try_get_request(BASE_URL + MAIN_PAGE + drug_name)
    if page.status_code != 200:
        # Parsing an error page would only fail later with a confusing message.
        print(f"\t>>> Unable to read the main page (HTTP {page.status_code}).")
        return
    soup = BeautifulSoup(page.content, "html.parser")
    print("\t>>> Done!")

    # Sections of the main page whose documents are downloaded directly
    main_sections = (
        ("Downloading medicine overview documents", "ema-inpage-item-overview"),
        ("Downloading product information documents", "ema-inpage-item-product-info"),
        ("Downloading documents of assessment history", "ema-inpage-item-assessment-history"),
    )
    for message, section_id in main_sections:
        print(message)
        download_pdf(path, section_id, soup)
        print("\t>>> Done!")

    # Read the additional information of the drug
    print("Downloading paediatric investigation plans")
    related = soup.find(id="ema-inpage-item-related-medicines")
    # A page without that section simply contributes no additional links.
    elements = related.find_all("a", href=True) if related is not None else []
    urls = [urljoin(BASE_URL, element['href']) for element in elements]
    print(f"\t>>> Found {len(urls)} additional links.")

    decision_id = "ema-inpage-item-decision"
    for url in urls:
        sub_page = try_get_request(url)
        if sub_page.status_code != 200:
            print("\t>>> Unable to extract the content.")
            continue
        sub_soup = BeautifulSoup(sub_page.content, "html.parser")
        # Not every linked page necessarily carries a decision section.
        if sub_soup.find(id=decision_id) is None:
            print(f"\t>>> No decision documents on {url}.")
            continue
        download_pdf(path, decision_id, sub_soup)
    print("\t>>> Done!")

    print("EMA documents completed!")

In [121]:
# Medicines to process; each name is passed to get_ema_docs, which uses it
# both in the page URL and as the name of the local folder.
drug_names = ["stelara", "tremfya"]

# Process the medicines one after another, separating their logs with a blank line.
for drug_name in drug_names:
    print(f"Looking for EMA documents on {drug_name}:")
    get_ema_docs(drug_name)
    print()

Looking for EMA documents on stelara:
Creating drug folder
	>>> Done!
Reading stelara main page
	>>> Done!
Downloading medicine overview documents
	>>> Found 2 documents.
	>>> stelara-epar-medicine-overview_en.pdf downloaded succesfully!
	>>> stelara-epar-risk-management-plan-summary_en.pdf downloaded succesfully!
	>>> Done!
Downloading product information documents
	>>> Found 3 documents.
	>>> stelara-epar-product-information_en.pdf downloaded succesfully!
	>>> stelara-epar-all-authorised-presentations_en.pdf downloaded succesfully!
	>>> stelara-epar-conditions-imposed-member-states-safe-and-effective-use-annex-iv_en.pdf downloaded succesfully!
	>>> Done!
Downloading documents of assessment history
	>>> Found 20 documents.
	>>> stelara-epar-procedural-steps-taken-scientific-information-after-authorisation_en.pdf downloaded succesfully!
	>>> stelara-h-c-958-p46-055-epar-assessment-report_en.pdf downloaded succesfully!
	>>> stelara-h-c-psusa-00003085-202112-scientific-conclusions-and-gr