In [12]:
import requests
from bs4 import BeautifulSoup
import os
import re
import time

# --- Configuration ---
START_POPE_NAME = "Pius IV"
END_POPE_NAME = "John Paul II"
WIKIPEDIA_URL = "https://en.wikipedia.org/wiki/List_of_popes"
OUTPUT_FOLDER = "pope_images_indexed" # Changed folder name slightly
USER_AGENT = "PopeImageDownloader/1.1 (https://example.com/bot; myemail@example.com) PythonRequests/2.x"

# --- Helper Functions ---
def sanitize_filename(name):
    """Removes invalid characters for filenames and replaces spaces."""
    name = re.sub(r'[\\/*?:"<>|]', "", name)
    name = name.replace(" ", "_")
    return name

def get_full_image_url(thumb_url):
    """Converts a Wikipedia thumbnail URL to a full image URL."""
    if not thumb_url:
        return None
    if not thumb_url.startswith("//"):
        return None

    full_url = "https:" + thumb_url
    if "/thumb/" in full_url:
        parts = full_url.split('/')
        try:
            thumb_index = parts.index("thumb")
            image_path_segments = parts[thumb_index+1 : thumb_index+4]
            base_url_segments = parts[:thumb_index]
            full_image_url = "/".join(base_url_segments + image_path_segments)
            return full_image_url
        except (ValueError, IndexError) as e:
            print(f"    Could not parse thumbnail URL: {full_url} - {e}")
            return None
    return full_url

# --- Main Script ---
if __name__ == "__main__":
    if not os.path.exists(OUTPUT_FOLDER):
        os.makedirs(OUTPUT_FOLDER)

    print(f"Fetching popes list from {WIKIPEDIA_URL}...")
    headers = {'User-Agent': USER_AGENT}
    try:
        response = requests.get(WIKIPEDIA_URL, headers=headers)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        exit()

    soup = BeautifulSoup(response.content, 'html.parser')
    pope_tables = soup.find_all('table', class_='wikitable')

    if not pope_tables:
        print("No wikitable found. Wikipedia structure might have changed.")
        exit()

    found_start_pope = False
    download_count = 0
    file_index = 0 # Initialize file index for naming

    print(f"Searching for popes from {START_POPE_NAME} to {END_POPE_NAME}...")

    # Use 'break_all_loops' to exit outer loops when end pope is found
    break_all_loops = False

    for table in pope_tables:
        if break_all_loops:
            break
        rows = table.find_all('tr')
        for row in rows:
            cells = row.find_all('td')
            
            if len(cells) < 4:
                continue

            pope_name_tag = None
            pope_name_text = ""
            
            # Try cell index 3 (common for tables with portraits)
            name_cell_index = 3
            if len(cells) > name_cell_index:
                name_tag_container = cells[name_cell_index]
                bold_tag = name_tag_container.find('b')
                if bold_tag:
                    pope_name_tag = bold_tag.find('a')
                    if pope_name_tag:
                        pope_name_text = pope_name_tag.get_text(strip=True)
                    elif bold_tag.find(text=True, recursive=False):
                        pope_name_text = bold_tag.get_text(strip=True)
            
            # Fallback for tables where name might be in cell index 2
            if not pope_name_text and len(cells) > 2:
                name_cell_index = 2
                name_tag_container = cells[name_cell_index]
                bold_tag = name_tag_container.find('b')
                if bold_tag:
                    pope_name_tag = bold_tag.find('a')
                    if pope_name_tag:
                        pope_name_text = pope_name_tag.get_text(strip=True)
                    elif bold_tag.find(text=True, recursive=False):
                         pope_name_text = bold_tag.get_text(strip=True)

            pope_name_text_cleaned = re.sub(r"^(St |St\. |Bl |Bl\. |Ven\. |Servant of God )", "", pope_name_text)

            if not pope_name_text_cleaned:
                continue

            # --- Range Check ---
            if not found_start_pope and pope_name_text_cleaned == START_POPE_NAME:
                found_start_pope = True
                print(f"\nFound start pope: {pope_name_text_cleaned}")

            if not found_start_pope:
                continue
            
            # --- Pope is in range, increment index ---
            file_index += 1

            # --- Image Extraction ---
            image_url = None
            portrait_cell_index = 2 # Image is typically in cell index 2 for tables with portraits
            if len(cells) > portrait_cell_index:
                img_tag = cells[portrait_cell_index].find('img')
                if img_tag and img_tag.has_attr('src'):
                    thumbnail_src = img_tag['src']
                    image_url = get_full_image_url(thumbnail_src)

            if image_url:
                # Add index to the processing message
                print(f"  Processing ({file_index}): {pope_name_text_cleaned}")
                print(f"    Found image URL: {image_url}")
                
                # Prepend index to the filename
                filename = f"{file_index}_{sanitize_filename(pope_name_text_cleaned)}.jpg"
                filepath = os.path.join(OUTPUT_FOLDER, filename)

                try:
                    img_response = requests.get(image_url, headers=headers, stream=True)
                    img_response.raise_for_status()
                    with open(filepath, 'wb') as f:
                        for chunk in img_response.iter_content(chunk_size=8192):
                            f.write(chunk)
                    print(f"    Downloaded: {filepath}")
                    download_count += 1
                    time.sleep(0.5) 
                except requests.exceptions.RequestException as e:
                    print(f"    Error downloading image for {pope_name_text_cleaned}: {e}")
                except IOError as e:
                    print(f"    Error saving image {filepath}: {e}")
            else:
                # Add index to the message even if no image
                print(f"  Processing ({file_index}): {pope_name_text_cleaned} (No image found or URL parse failed)")

            if pope_name_text_cleaned == END_POPE_NAME:
                print(f"\nReached end pope: {pope_name_text_cleaned}. Stopping.")
                break_all_loops = True # Signal to break outer loop as well
                break # Stop processing rows in this table
        
    print(f"\nFinished. Downloaded {download_count} images to '{OUTPUT_FOLDER}'.")

Fetching popes list from https://en.wikipedia.org/wiki/List_of_popes...
Searching for popes from Pius IV to John Paul II...

Found start pope: Pius IV
  Processing (1): Pius IV
    Found image URL: https://upload.wikimedia.org/wikipedia/commons/b/be/Portrait_of_Pope_Pius_IV%2C_three-quarter-length%2C_seated_at_a_draped_table_%28Circle_of_Scipione_Pulzone%29.jpg


  elif bold_tag.find(text=True, recursive=False):
  elif bold_tag.find(text=True, recursive=False):


    Downloaded: pope_images_indexed/1_Pius_IV.jpg
  Processing (2): Pius V
    Found image URL: https://upload.wikimedia.org/wikipedia/commons/2/28/Bartolomeo_Passarotti_-_Pius_V.jpg
    Downloaded: pope_images_indexed/2_Pius_V.jpg
  Processing (3): Gregory XIII
    Found image URL: https://upload.wikimedia.org/wikipedia/commons/5/52/Pope_Gregory_XIII_portrait.jpg
    Downloaded: pope_images_indexed/3_Gregory_XIII.jpg
  Processing (4): Sixtus V
    Found image URL: https://upload.wikimedia.org/wikipedia/commons/e/ed/Portrait_of_Pope_Sixtus_V.jpg
    Downloaded: pope_images_indexed/4_Sixtus_V.jpg
  Processing (5): Urban VII
    Found image URL: https://upload.wikimedia.org/wikipedia/commons/e/ec/Papa_Urbano_VII_%28Pope_Urban_VII%29.jpg
    Downloaded: pope_images_indexed/5_Urban_VII.jpg
  Processing (6): Gregory XIV
    Found image URL: https://upload.wikimedia.org/wikipedia/commons/f/f4/Roman_School_%E2%80%93_Portrait_of_Pope_Gregory_XIV_%2816th_Century%29.jpg
    Downloaded: pope_imag