TODO:
- Revise the code so that it reports any errors and flags when expected content is missing

# Notebook to scrape data

Emilio Lehoucq, ChatGPT, and GitHub Copilot

## Import libraries

In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import re
from pprint import pprint

## Get the URL with the content for each pope

In [2]:
# URL for the list of popes
URL_LIST_POPES = "https://www.vatican.va/holy_father/index.htm"

# Make a request to the Vatican website to get the list of popes.
# requests has no default timeout, so set one: a stalled connection then
# raises instead of hanging the notebook forever.
request = requests.get(URL_LIST_POPES, timeout=30)

# Fail loudly on an HTTP error instead of silently parsing an error page
request.raise_for_status()

# Create a BeautifulSoup object to parse the HTML using the built-in Python parser
soup = BeautifulSoup(request.content, "html.parser")

# Full URLs of each pope's English main page
# (https://www.vatican.va/content/<slug>/en.html), in page order.
# Only <a> tags that actually carry an href attribute are considered.
pope_urls = [
    a["href"]
    for a in soup.find_all("a", href=True)
    if a["href"].startswith("https://www.vatican.va/content/")
    and a["href"].endswith("/en.html")
]

# An empty list means the page layout or link format changed; stop here so the
# following cells do not run on nothing and quietly produce empty results.
if not pope_urls:
    raise RuntimeError(
        f"No pope URLs were found on {URL_LIST_POPES}; "
        "the page structure or link format may have changed."
    )

# Print the list of full URLs
print(pope_urls)

['https://www.vatican.va/content/leo-xiv/en.html', 'https://www.vatican.va/content/francesco/en.html', 'https://www.vatican.va/content/benedict-xvi/en.html', 'https://www.vatican.va/content/john-paul-ii/en.html', 'https://www.vatican.va/content/john-paul-i/en.html', 'https://www.vatican.va/content/paul-vi/en.html', 'https://www.vatican.va/content/john-xxiii/en.html', 'https://www.vatican.va/content/pius-xii/en.html', 'https://www.vatican.va/content/pius-xi/en.html', 'https://www.vatican.va/content/benedict-xv/en.html', 'https://www.vatican.va/content/pius-x/en.html', 'https://www.vatican.va/content/leo-xiii/en.html']


## Get the content available for each pope

In [3]:
# Dictionary to store {pope slug: BeautifulSoup object}
pope_soups = {}

# Every URL that could not be processed, mapped to the reason it failed.
# Collected so that ALL problems are reported together, not just the first.
failed_urls = {}

# Nothing to fetch means an earlier step went wrong; say so explicitly.
if not pope_urls:
    raise RuntimeError("pope_urls is empty: there are no pope pages to fetch.")

# Loop through each pope URL
for url in pope_urls:
    # Extract the pope slug from the URL
    path_parts = urlparse(url).path.split('/')  # ['', 'content', '<slug>', 'en.html']
    if len(path_parts) < 4 or not path_parts[2]:
        failed_urls[url] = "unexpected URL shape; could not extract a pope slug"
        continue
    pope_slug = path_parts[2]                   # Get just the slug

    # Fetch the HTML content. The timeout keeps a stalled connection from
    # hanging the notebook; HTTP error statuses are turned into exceptions.
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as error:
        failed_urls[url] = f"{type(error).__name__}: {error}"
        continue

    # Parse the HTML content into a BeautifulSoup object
    soup = BeautifulSoup(response.content, "html.parser")

    # Store it in the dictionary with the slug as the key
    pope_soups[pope_slug] = soup

# Report every page that is missing, then stop: continuing with a partial
# set of popes would silently drop content from everything downstream.
if failed_urls:
    details = "\n".join(f"  - {url}: {reason}" for url, reason in failed_urls.items())
    raise RuntimeError(
        f"Could not load {len(failed_urls)} of {len(pope_urls)} pope pages:\n{details}"
    )

# Print the keys
print(pope_soups.keys())

# Print an example
print(pope_soups.get("francesco"))

dict_keys(['leo-xiv', 'francesco', 'benedict-xvi', 'john-paul-ii', 'john-paul-i', 'paul-vi', 'john-xxiii', 'pius-xii', 'pius-xi', 'benedict-xv', 'pius-x', 'leo-xiii'])

<!DOCTYPE HTML>

<html lang="en">
<head>
<meta charset="utf-8"/>
<!-- BEGIN: SEO -->
<title>Francis</title>
<meta content="" name="description"/>
<meta content="" name="keywords"/>
<link href="https://www.vatican.va/content/francesco/en.html" rel="canonical"/>
<meta content="index,follow" name="robots"/>
<!-- BEGIN: OG -->
<meta content="Francis" property="og:title"/>
<meta content="https://www.vatican.va/content/francesco/en.html" property="og:url"/>
<meta content="website" property="og:type"/>
<!-- END OG-->
<!-- END: SEO -->
<!-- Google Tag Manager -->
<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.paren

## Get the URLs of all the content available for each pope

In [4]:
def _normalize(base_url: str, href: str) -> str:
    """Return ``href`` as an absolute URL, resolved against ``base_url``.

    A relative ``href`` is joined onto ``base_url``; an ``href`` that is
    already absolute is returned as it is.
    """
    absolute_url = urljoin(base_url, href)
    return absolute_url

def build_simple_content_map(pope_soups: dict) -> dict:
    """
    Return {pope_slug: {content_type: [urls...]}} using only the left nav menu.

    For each top-level menu item (a content "type" such as "Speeches" or
    "Homilies"), collect every link found in its nested <ul> (years,
    subtypes, etc.) as absolute URLs, order-preserved and deduplicated.
    A type with no nested links maps to a one-element list holding its
    own top-level link.

    Args:
        pope_soups: Mapping of pope slug to the parsed BeautifulSoup
            document of that pope's page.

    Returns:
        A dict with one entry per slug in ``pope_soups``. A pope whose page
        has no recognizable navigation menu maps to an empty dict.

    Warns:
        UserWarning: when a page has no navigation menu, or when the menu
            yields no content types, so missing content is not silent.
    """
    # Imported here so this function does not rely on the notebook's import cell.
    import warnings

    # Final result: {slug: {type: [urls...]}}
    result = {}

    # Loop through every pope we already parsed into BeautifulSoup objects
    for slug, soup in pope_soups.items():
        # Start a fresh dict for this pope
        type_map = {}
        result[slug] = type_map

        # Pick a reliable base URL: the canonical link when it has a non-empty
        # href, otherwise the pope's English main page.
        canonical = soup.find("link", rel="canonical")
        canonical_href = canonical.get("href") if canonical is not None else None
        base_url = canonical_href or f"https://www.vatican.va/content/{slug}/en.html"

        # Find the left navigation that lists content types (Angelus, Speeches, etc.)
        # Prefer the accordion sidenav; fall back to any #accordionmenu element.
        menu_root = (
            soup.select_one(".holyfatherAccordionSidenav #accordionmenu")
            or soup.select_one("#accordionmenu")
        )
        if not menu_root:
            # Keep going with the other popes, but make the gap visible.
            warnings.warn(
                f"No navigation menu found for pope '{slug}'; "
                "no content was collected for this pope.",
                stacklevel=2,
            )
            continue

        # The root can be a <div id="accordionmenu"> with an inner <ul>; grab the first <ul>.
        ul = menu_root.find("ul") or menu_root

        # Iterate only the top-level <li> items; these are the content "types".
        for li in ul.find_all("li", recursive=False):
            # The first <a href> inside the <li> is taken as the label link.
            a_top = li.find("a", href=True)
            if not a_top:
                # No link at all in this item: nothing to record.
                continue

            # Use the visible text of the top link as the content type name
            content_type = a_top.get_text(" ", strip=True)
            if not content_type:
                # A link without visible text cannot name a content type.
                continue

            # Collect ALL links inside nested <ul> elements under this type
            # (not the top link). dict.fromkeys drops duplicates while keeping
            # first-seen order.
            urls = list(
                dict.fromkeys(
                    _normalize(base_url, a["href"]) for a in li.select("ul a[href]")
                )
            )

            # If there were NO nested links (the type is a single index page),
            # fall back to the top link itself so the type isn't empty.
            if not urls:
                urls = [_normalize(base_url, a_top["href"])]

            # Save into our result under this pope's slug
            type_map[content_type] = urls

        # A menu that produced nothing usually means the markup changed.
        if not type_map:
            warnings.warn(
                f"Navigation menu for pope '{slug}' contained no content types; "
                "the page structure may have changed.",
                stacklevel=2,
            )

    return result

# Build the simple content map
simple_map = build_simple_content_map(pope_soups)

# Print the simple map
pprint(simple_map)

{'benedict-xv': {'Apostolic Constitutions': ['http://www.vatican.va/content/benedict-xv/en/apost-constitutions.index.html'],
                 'Apostolic Exhortations': ['http://www.vatican.va/content/benedict-xv/en/apost_exhortations.index.html'],
                 'Apostolic Letters': ['http://www.vatican.va/content/benedict-xv/en/apost_letters.index.html'],
                 'Biography': ['http://www.vatican.va/content/benedict-xv/en/biography.index.html'],
                 'Briefs': ['http://www.vatican.va/content/benedict-xv/en/briefs.index.html'],
                 'Bulls': ['http://www.vatican.va/content/benedict-xv/en/bulls.index.html'],
                 'Encyclicals': ['http://www.vatican.va/content/benedict-xv/en/encyclicals.index.html'],
                 'Homilies': ['http://www.vatican.va/content/benedict-xv/en/homilies.index.html'],
                 'Letters': ['http://www.vatican.va/content/benedict-xv/en/letters.index.html',
                             'http://www.vatican.v