In [1]:
import requests

In [2]:
from bs4 import BeautifulSoup

In [3]:
import os
import re

In [4]:
from urllib.parse import urljoin

In [9]:
# Configuration
BASE_URL = "https://www.gutenberg.org/ebooks/search/?sort_order=downloads"
NUM_BOOKS = 2000  # Number of books to download
BOOK_DIR = "gutenberg_books"
BOOKS_PER_PAGE = 25  # Number of books per page (Gutenberg default is 25)

# Create directory for books if it doesn't exist
os.makedirs(BOOK_DIR, exist_ok=True)

# Function to download a page
def download_page(url):
    response = requests.get(url)
    response.raise_for_status()
    return response.text

# Function to extract book URLs from the search page
def extract_book_urls(search_page_html):
    soup = BeautifulSoup(search_page_html, "html.parser")
    book_links = soup.find_all("a", href=re.compile(r"^/ebooks/\d+$"))
    book_urls = ["https://www.gutenberg.org" + link['href'] for link in book_links]
    return list(set(book_urls))

# Function to extract text file URL from the book page
def extract_text_url(book_page_html, book_url):
    soup = BeautifulSoup(book_page_html, "html.parser")
    link = soup.find("a", href=re.compile(r".*\.txt"))
    if link and "Plain Text UTF-8" in link.text:
        return urljoin(book_url, link['href'])  # Use urljoin to construct a full URL
    return None

# Function to download a book
def download_book(text_url, book_dir):
    file_name = os.path.basename(text_url)
    file_path = os.path.join(book_dir, file_name)
    response = requests.get(text_url)
    response.raise_for_status()
    with open(file_path, 'wb') as file:
        file.write(response.content)
    print(f"Downloaded {file_name}")

# Main logic
def main():
    downloaded_books = 0
    page = 1

    while downloaded_books < NUM_BOOKS:
        print(f"Downloading search page {page}...")
        search_url = f"{BASE_URL}&start_index={(page - 1) * BOOKS_PER_PAGE + 1}"
        search_page_html = download_page(search_url)

        print("Extracting book URLs...")
        book_urls = extract_book_urls(search_page_html)

        if not book_urls:
            print("No more books found.")
            break

        for book_url in book_urls:
            if downloaded_books >= NUM_BOOKS:
                break

            print(f"Processing {book_url}...")
            book_page_html = download_page(book_url)
            text_url = extract_text_url(book_page_html, book_url)

            if text_url:
                download_book(text_url, BOOK_DIR)
                downloaded_books += 1
            else:
                print(f"No suitable text file found for {book_url}")

        page += 1

if __name__ == "__main__":
    main()

Downloading search page 1...
Extracting book URLs...
Processing https://www.gutenberg.org/ebooks/16389...
Downloaded 16389.txt.utf-8
Processing https://www.gutenberg.org/ebooks/2641...
Downloaded 2641.txt.utf-8
Processing https://www.gutenberg.org/ebooks/11...
Downloaded 11.txt.utf-8
Processing https://www.gutenberg.org/ebooks/37106...
Downloaded 37106.txt.utf-8
Processing https://www.gutenberg.org/ebooks/2600...
Downloaded 2600.txt.utf-8
Processing https://www.gutenberg.org/ebooks/5200...
Downloaded 5200.txt.utf-8
Processing https://www.gutenberg.org/ebooks/1513...
Downloaded 1513.txt.utf-8
Processing https://www.gutenberg.org/ebooks/2160...
Downloaded 2160.txt.utf-8
Processing https://www.gutenberg.org/ebooks/33283...
No suitable text file found for https://www.gutenberg.org/ebooks/33283
Processing https://www.gutenberg.org/ebooks/145...
Downloaded 145.txt.utf-8
Processing https://www.gutenberg.org/ebooks/1259...
Downloaded 1259.txt.utf-8
Processing https://www.gutenberg.org/ebooks/8

HTTPError: 400 Client Error: Bad Request for url: https://www.gutenberg.org/ebooks/search/?sort_order=downloads&start_index=1001