# Wikisource PDF Extractor - Demo

This notebook demonstrates how to download a single book from Wikisource as a PDF.

In [1]:
import requests
from urllib.parse import quote
import os

## Download a single title from English Wikisource

Let's download "The Canterbury Tales" by Geoffrey Chaucer (14th century).

In [6]:
# Work to fetch and the Wikisource language edition to fetch it from
title, language = "The Canterbury Tales", "en"

# Folder for output files; exist_ok=True means no error if it already exists,
# so the cell can be re-run safely
output_dir = "./output"
os.makedirs(output_dir, exist_ok=True)

In [8]:
# Wikisource REST API pattern:
#   https://{lang}.wikisource.org/api/rest_v1/page/pdf/{title}
# Spaces are turned into underscores first; quote(..., safe="") then
# percent-encodes every remaining reserved character (including "/").
page_slug = title.replace(" ", "_")
encoded_title = quote(page_slug, safe="")
pdf_url = f"https://{language}.wikisource.org/api/rest_v1/page/pdf/{encoded_title}"

# Bare last expression so the notebook displays the resulting URL
pdf_url

'https://en.wikisource.org/api/rest_v1/page/pdf/The_Canterbury_Tales'

https://en.wikisource.org/api/rest_v1/page/pdf/The_Canterbury_Tales


In [17]:
import requests
from urllib.parse import quote

# Work to download and the Wikisource language edition to fetch it from
title = "The Canterbury Tales"
language = "en"

# REST endpoint pattern: https://{lang}.wikisource.org/api/rest_v1/page/pdf/{title}
# Spaces become underscores; safe="" percent-encodes every other reserved character.
encoded_title = quote(title.replace(" ", "_"), safe="")
pdf_url = f"https://{language}.wikisource.org/api/rest_v1/page/pdf/{encoded_title}"
print(pdf_url)

# Add headers to mimic a browser request
# NOTE(review): Wikimedia's User-Agent policy asks clients to send a descriptive
# User-Agent with contact information rather than a copied browser string --
# consider replacing this value before using the notebook beyond a demo.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Download the PDF with headers.
# requests has no default timeout, so without one a stalled connection would
# block the kernel indefinitely; on expiry requests raises a Timeout exception.
response = requests.get(pdf_url, headers=headers, timeout=120)

if response.status_code == 200:
    # The whole body is held in memory and written in one go, in binary mode
    with open(f"{title}.pdf", "wb") as f:
        f.write(response.content)
    print(f"✓ Downloaded: {title}.pdf")
else:
    # Any non-200 status is reported; nothing is written to disk
    print(f"✗ Error {response.status_code}: {response.reason}")

https://en.wikisource.org/api/rest_v1/page/pdf/The_Canterbury_Tales
✓ Downloaded: The Canterbury Tales.pdf


In [None]:
# Example ws-export URL requesting a PDF (format=pdf, lang=en) of the page
# "The_Complete_Works_of_Geoffrey_Chaucer/Volume_4" (%2F is the encoded "/"):
#https://ws-export.wmcloud.org/?format=pdf&lang=en&page=The_Complete_Works_of_Geoffrey_Chaucer%2FVolume_4

In [None]:
# Example ws-export PDF URLs, keyed by work title.
# Previously each URL was assigned to `pdf_url` in turn, so only the last one
# survived, and the Iliad entry was missing its closing quote (SyntaxError).
example_pdf_urls = {
    # Pride and Prejudice by Jane Austen
    "Pride and Prejudice": "https://ws-export.wmcloud.org/?format=pdf&lang=en&page=Pride_and_Prejudice",
    # Alice's Adventures in Wonderland by Lewis Carroll (%27 is the encoded apostrophe)
    "Alice's Adventures in Wonderland": "https://ws-export.wmcloud.org/?format=pdf&lang=en&page=Alice%27s_Adventures_in_Wonderland",
    # Frankenstein by Mary Shelley
    "Frankenstein": "https://ws-export.wmcloud.org/?format=pdf&lang=en&page=Frankenstein",
    # The Adventures of Sherlock Holmes
    "The Adventures of Sherlock Holmes": "https://ws-export.wmcloud.org/?format=pdf&lang=en&page=The_Adventures_of_Sherlock_Holmes",
    # Moby-Dick by Herman Melville
    "Moby-Dick": "https://ws-export.wmcloud.org/?format=pdf&lang=en&page=Moby-Dick",
    # The Picture of Dorian Gray by Oscar Wilde
    "The Picture of Dorian Gray": "https://ws-export.wmcloud.org/?format=pdf&lang=en&page=The_Picture_of_Dorian_Gray",
    # A Tale of Two Cities by Charles Dickens
    "A Tale of Two Cities": "https://ws-export.wmcloud.org/?format=pdf&lang=en&page=A_Tale_of_Two_Cities",
    # The Iliad by Homer (the page name selects the "(Pope)" version)
    "The Iliad": "https://ws-export.wmcloud.org/?format=pdf&lang=en&page=The_Iliad_(Pope)",
    # Les Misérables (French; lang=fr, %C3%A9 is the encoded "é")
    "Les Misérables": "https://ws-export.wmcloud.org/?format=pdf&lang=fr&page=Les_Mis%C3%A9rables",
}

# Keep `pdf_url` bound to the same value the original cell left behind
# (the last assignment, Les Misérables) so later cells see no change.
pdf_url = example_pdf_urls["Les Misérables"]

In [None]:
# ws-export PDF URL for the page "Moby-Dick_(1851)_US_edition".
# Stored as a string: a bare URL is not valid Python and made this cell fail
# with a SyntaxError. A separate name is used so `pdf_url` is left untouched.
moby_dick_us_edition_url = "https://ws-export.wmcloud.org/?format=pdf&lang=en&page=Moby-Dick_(1851)_US_edition"

100%|██████████| 100/100 [00:41<00:00,  2.42it/s]


## Download Frankenstein, First Edition (1818)

This section exports the full text of the work from Wikisource as a PDF through the ws-export service.

In [None]:
import os
import requests
from urllib.parse import quote

# Wikisource page to export:
# Frankenstein, or the Modern Prometheus (First Edition, 1818)
page_name = "Frankenstein,_or_the_Modern_Prometheus_(First_Edition,_1818)"

# safe="" leaves no reserved character unescaped, so the commas and
# parentheses in the page name are percent-encoded for the query string
encoded_page = quote(page_name, safe="")

# ws-export endpoint asking for a PDF of the English-language page
pdf_url = f"https://ws-export.wmcloud.org/?format=pdf&lang=en&page={encoded_page}"
print(f"PDF URL: {pdf_url}")

# Destination folder; created on demand, no error if it already exists
output_dir = "./output"
os.makedirs(output_dir, exist_ok=True)

# Request headers carrying a browser-style User-Agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

print("Downloading Frankenstein (First Edition, 1818)...")
# Wait at most 120 seconds before requests gives up on the server
response = requests.get(pdf_url, headers=headers, timeout=120)

if response.status_code != 200:
    # Failure path: report the status, then up to 500 characters of the body
    print(f"Error {response.status_code}: {response.reason}")
    body_preview = response.text[:500] or "No response body"
    print(body_preview)
else:
    # Success path: write the raw bytes to disk and report the size in MB
    output_path = os.path.join(output_dir, "Frankenstein_First_Edition_1818.pdf")
    with open(output_path, "wb") as f:
        f.write(response.content)
    print(f"Downloaded: {output_path}")
    print(f"File size: {len(response.content) / 1024 / 1024:.2f} MB")