# Wikisource Sublink Extractor

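This notebook collects every subpage link found on a Wikisource page (same-site links whose path begins with the page's own path followed by `/`) and then shows how to extract the main text of one of those subpages.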

In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, unquote
import pandas as pd
from tqdm import tqdm
import time

In [2]:
def _select_sublinks(hrefs: list[str], base_url: str) -> list[str]:
    """
    Pick the subpage URLs of ``base_url`` out of a list of raw ``href`` values.

    A link qualifies when, after being resolved against ``base_url``, it is on
    the same host and its decoded path starts with the decoded base path
    followed by ``/``. Fragments are stripped, duplicates are merged and the
    result is returned sorted.
    """
    parsed_base = urlparse(base_url)
    # Drop trailing slashes first: otherwise a base URL such as ".../Title/"
    # would produce the prefix ".../Title//", which no real link matches.
    subpage_prefix = unquote(parsed_base.path).rstrip('/') + '/'

    sublinks = set()
    for href in hrefs:
        # Convert relative URLs to absolute
        full_url = urljoin(base_url, href)
        parsed_link = urlparse(full_url)

        # Only links on the same host can be subpages
        if parsed_link.netloc != parsed_base.netloc:
            continue

        # urlparse keeps the fragment out of .path, so checking the "/" prefix
        # on the decoded path is all that is needed here.
        if not unquote(parsed_link.path).startswith(subpage_prefix):
            continue

        # Strip any "#section" so anchors into one page collapse to one URL
        clean_url = full_url.split('#', 1)[0]

        # A link that resolves back to the base page itself is not a subpage
        if clean_url != base_url:
            sublinks.add(clean_url)

    return sorted(sublinks)


def get_wikisource_sublinks(base_url: str) -> list[str]:
    """
    Extract all sublinks from a Wikisource page that start with the base URL path.

    The page is downloaded once; every ``<a href=...>`` on it is resolved
    against ``base_url`` and kept only if it is a subpage (same host, path
    beginning with the base path plus ``/``). A trailing ``/`` on ``base_url``
    is tolerated.

    Args:
        base_url: The main Wikisource page URL (e.g., https://en.wikisource.org/wiki/The_Elements_of_Euclid...)

    Returns:
        Sorted list of unique subpage URLs with fragments removed. An empty
        list is returned (and the error printed) if the request fails.
    """
    # NOTE(review): this is a browser-style User-Agent; Wikimedia's User-Agent
    # policy asks scripts to identify themselves with contact details —
    # consider replacing it before running this at any volume.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }

    try:
        response = requests.get(base_url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {base_url}: {e}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all links on the page and hand their targets to the pure filter
    hrefs = [a_tag['href'] for a_tag in soup.find_all('a', href=True)]
    return _select_sublinks(hrefs, base_url)

## Example Usage

In [30]:
# Example target: the Wikisource index page of Euclid's Elements
base_url = (
    "https://en.wikisource.org/wiki/"
    "The_Elements_of_Euclid_for_the_Use_of_Schools_and_Colleges"
)

# Echo the target, followed by an 80-character rule
print(f"Base URL: {base_url}")
print("=" * 80)

Base URL: https://en.wikisource.org/wiki/The_Elements_of_Euclid_for_the_Use_of_Schools_and_Colleges


In [31]:
# Method 1: keep only strict subpages, i.e. URLs of the form <base URL>/<something>
print("\nMethod 1: Strict subpages")
sublinks = get_wikisource_sublinks(base_url)
print(f"Found {len(sublinks)} sublinks")

# Preview at most the first 10 links, then report how many were left out
shown = sublinks[:10]
for link in shown:
    print(f"  - {link}")
if len(shown) < len(sublinks):
    print(f"  ... and {len(sublinks) - 10} more")


Method 1: Strict subpages
Found 14 sublinks
  - https://en.wikisource.org/wiki/The_Elements_of_Euclid_for_the_Use_of_Schools_and_Colleges/Appendix
  - https://en.wikisource.org/wiki/The_Elements_of_Euclid_for_the_Use_of_Schools_and_Colleges/Book_I.
  - https://en.wikisource.org/wiki/The_Elements_of_Euclid_for_the_Use_of_Schools_and_Colleges/Book_II
  - https://en.wikisource.org/wiki/The_Elements_of_Euclid_for_the_Use_of_Schools_and_Colleges/Book_III
  - https://en.wikisource.org/wiki/The_Elements_of_Euclid_for_the_Use_of_Schools_and_Colleges/Book_IV
  - https://en.wikisource.org/wiki/The_Elements_of_Euclid_for_the_Use_of_Schools_and_Colleges/Book_V
  - https://en.wikisource.org/wiki/The_Elements_of_Euclid_for_the_Use_of_Schools_and_Colleges/Book_VI
  - https://en.wikisource.org/wiki/The_Elements_of_Euclid_for_the_Use_of_Schools_and_Colleges/Book_XI
  - https://en.wikisource.org/wiki/The_Elements_of_Euclid_for_the_Use_of_Schools_and_Colleges/Book_XII
  - https://en.wikisource.org/wiki/

## Extract Content from Sublinks

In [None]:
import requests
from bs4 import BeautifulSoup
import time
import random

def extract_wikisource_content(url: str, retry_count: int = 3) -> str | None:
    """
    Extract the main text content from a Wikisource page with rate limiting.
    
    Args:
        url: Wikisource page URL
        retry_count: Number of retries on failure
    
    Returns:
        Extracted text content or None if failed
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    
    for attempt in range(retry_count):
        try:
            # Add delay between requests (2-4 seconds random)
            if attempt > 0:
                wait_time = random.uniform(5, 10)  # Longer wait on retry
                print(f"Retry {attempt}, waiting {wait_time:.1f}s...")
                time.sleep(wait_time)
            else:
                time.sleep(random.uniform(2, 4))  # Normal delay
            
            response = requests.get(url, headers=headers, timeout=30)
            
            # Handle rate limiting
            if response.status_code == 429:
                wait_time = int(response.headers.get('Retry-After', 60))
                print(f"Rate limited. Waiting {wait_time}s...")
                time.sleep(wait_time)
                continue
            
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'html.parser')
            main_content = soup.find('div', {'class': 'mw-parser-output'})
            
            if main_content:
                # Remove navigation elements, edit links, etc.
                for element in main_content.find_all(['script', 'style', 'nav']):
                    element.decompose()
                
                text = main_content.get_text(separator='\n', strip=True)
                return text
            return None
        
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 429:
                print(f"Rate limited on attempt {attempt + 1}")
                time.sleep(2)  # Wait 1 minute
                continue
            else:
                print(f"HTTP Error extracting {url}: {e}")
                return None
        
        except Exception as e:
            print(f"Error extracting {url}: {e}")
            if attempt < retry_count - 1:
                continue
            return None
    
    return None




In [34]:
# Show the first collected sublink; it is the URL passed to the extractor in the next cell.
# Relies on `sublinks` from the "Method 1" cell and raises IndexError if that list is empty.
sublinks[0]

'https://en.wikisource.org/wiki/The_Elements_of_Euclid_for_the_Use_of_Schools_and_Colleges/Appendix'

In [33]:
# Download the first sublink and print its extracted text in full
# (prints "None" if extraction failed). The call pauses 2-4 s before requesting.
print(extract_wikisource_content(sublinks[0]))

←
Notes on Euclid's Elements
The Elements of Euclid for the Use of Schools and Colleges
(
1872
)
by
Isaac Todhunter
Appendix
Exercises in Euclid
→
1994106
The Elements of Euclid for the Use of Schools and Colleges
— Appendix
1872
Isaac Todhunter
​
APPENDIX.
This
Appendix consists of a collection of important propositions which will be found useful, both as affording geometrical exercises, and as exhibiting results which are often required in mathematical investigations. The student will have no difficulty in drawing for himself the requisite figures in the cases where they are not given.
​
1.
The sum of the squares on the sides of a triangle
is equal to twice the square on half the base, together with
twice the square on the straight line which joins the vertex
to the middle point of the base.
Let
ABC
be a triangle; and let
D
be the middle point of the base
AB
. Draw
CE
perpendicular to the base
meeting it at
E
; then
B
may be either in
AB
or in
AB
produced.
First, let
B
coincide with
