## Downloading National Action Plans (NAPs)

In [None]:
# Loading packages
import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Scrapes the publication listing page below and saves every linked PDF into
# output_dir, naming each file "<Country>_<YYYY-MM-DD>.pdf" from the item's
# title and publication date.

# URL of the webpage containing the PDF links
url = "https://www.who.int/teams/surveillance-prevention-control-AMR/national-action-plan-monitoring-evaluation/library-of-national-action-plans"

# Directory where PDFs will be saved
output_dir = "/Users/giovana/Documents/LSE/PP4B5 Capstone/NAPs"
os.makedirs(output_dir, exist_ok=True)

# Seconds to wait on each HTTP request, so a stalled connection cannot hang
# the script indefinitely
request_timeout = 60

# Month name -> two-digit month number (built once instead of per item)
month_dict = {
    "January": "01", "February": "02", "March": "03", "April": "04",
    "May": "05", "June": "06", "July": "07", "August": "08",
    "September": "09", "October": "10", "November": "11", "December": "12",
}

# Matches dates written as "<day> <month name> <year>", e.g. "5 March 2021"
date_pattern = re.compile(r"(\d{1,2}) (\w+) (\d{4})")

# Fetching the webpage content; an HTTP error here stops the script, since
# there is nothing to download without the listing page
response = requests.get(url, timeout=request_timeout)
response.raise_for_status()

# Parsing the HTML content
soup = BeautifulSoup(response.text, "html.parser")

# Finding all publication items
publication_items = soup.find_all("div", class_="sf-publications-item__body")

# Downloading each PDF
for item in publication_items:
    # Extracting date; either the date <div> or its <span> may be absent
    date_div = item.find("div", class_="sf-publications-item__date")
    date_span = date_div.find("span") if date_div else None
    date_text = date_span.get_text(strip=True) if date_span else "Unknown Date"

    # Converting date to format YYYY-MM-DD
    match = date_pattern.search(date_text)
    if match:
        day, month, year = match.groups()
        # Unrecognised month names fall back to "00"
        formatted_date = f"{year}-{month_dict.get(month, '00')}-{day.zfill(2)}"
    else:
        formatted_date = "Unknown_Date"

    # Extracting country name from document title
    title_element = item.find("h3", class_="sf-publications-item__title")
    title_text = title_element.get_text(strip=True) if title_element else "Unknown_Country"

    # Extracting only the country name (everything before ":"), trimming
    # surrounding whitespace so it does not turn into a stray underscore
    country_name = title_text.split(":")[0].strip().replace(" ", "_")
    # Path separators in a title would otherwise be read as directories
    country_name = re.sub(r"[\\/]", "_", country_name)

    # Finding the download link; items without one are skipped
    pdf_link = item.find("a", class_="download-url")
    pdf_href = pdf_link.get("href") if pdf_link else None
    if not pdf_href:
        continue

    # Ensuring URL is absolute: urljoin resolves relative hrefs against the
    # page URL and leaves already-absolute URLs unchanged
    pdf_url = urljoin(url, pdf_href)

    # Constructing the new filename
    # NOTE: two items with the same country and date map to the same path,
    # so the later download overwrites the earlier one
    pdf_name = f"{country_name}_{formatted_date}.pdf"
    pdf_path = os.path.join(output_dir, pdf_name)

    print(f"Downloading: {pdf_url}")

    # Downloading the file; a failed request is reported and skipped so one
    # bad link neither aborts the run nor saves an error page as a PDF
    try:
        pdf_response = requests.get(pdf_url, timeout=request_timeout)
        pdf_response.raise_for_status()
    except requests.RequestException as error:
        print(f"Failed to download {pdf_url}: {error}")
        continue

    with open(pdf_path, "wb") as pdf_file:
        pdf_file.write(pdf_response.content)

    print(f"Saved to: {pdf_path}")

print("Download complete.")
