In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re
import json
from IPython.display import display, HTML

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml',
    'Accept-Language': 'en-US,en;q=0.9',
}

def get_bookshelves():
    """
    Get the numbered bookshelves listed on the Project Gutenberg bookshelf index

    Returns:
    dict: Mapping of bookshelf name to its absolute URL. Empty when the
          index page could not be fetched or parsed.
    """
    url = "https://www.gutenberg.org/ebooks/bookshelf/"

    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        # Parse the raw bytes instead of response.text: BeautifulSoup then
        # detects the charset from the document itself, whereas requests'
        # header-based guess can turn non-ASCII shelf names into mojibake.
        soup = BeautifulSoup(response.content, 'html.parser')
        bookshelf_links = {}

        # Find all bookshelf links
        for link in soup.select('li a[href^="/ebooks/bookshelf/"]'):
            name = link.text.strip()
            href = link['href']
            bookshelf_id = href.split('/')[-1]

            # Only include numeric bookshelf IDs, and skip links without any
            # visible text: an empty name is useless as a lookup key.
            if name and bookshelf_id.isdigit():
                bookshelf_links[name] = f"https://www.gutenberg.org{href}"

        print(f"Found {len(bookshelf_links)} bookshelves.")
        return bookshelf_links

    except Exception as e:
        # Best effort: callers treat an empty dict as "nothing available".
        print(f"Error getting bookshelves: {e}")
        return {}

def _parse_book_entry(book_element):
    """Extract title, url, id and author from one ``li.booklink`` element."""
    book_data = {}

    # Get title
    title_element = book_element.select_one('span.title')
    if title_element:
        book_data['title'] = title_element.text.strip()

    # Get book URL and ID (the ID is the last path segment of the link)
    link_element = book_element.select_one('a[href^="/ebooks/"]')
    if link_element:
        book_url = link_element['href']
        book_data['url'] = f"https://www.gutenberg.org{book_url}"
        book_data['id'] = book_url.split('/')[-1]

    # Get author
    author_element = book_element.select_one('span.subtitle')
    if author_element:
        book_data['author'] = author_element.text.strip()

    return book_data


def get_books_from_bookshelf(bookshelf_url, limit=500, delay_range=(1, 3)):
    """
    Collect the books listed on a bookshelf, following its result pages

    Parameters:
    bookshelf_url (str): Absolute URL of the bookshelf
    limit (int): Maximum number of books to return
    delay_range (tuple): Range of seconds to wait between page requests;
                         doubled while waiting to retry after a failure

    Returns:
    list: One dict per book with a 'title' key and, when present on the
          page, 'url', 'id' and 'author' keys. At most `limit` entries.
    """
    books = []
    page = 1
    failures = 0
    max_failures = 5

    # Keep pagination valid even if the bookshelf URL already has a query.
    separator = '&' if '?' in bookshelf_url else '?'

    print(f"Retrieving books from: {bookshelf_url}")

    while len(books) < limit:
        try:
            # Construct URL for pagination (25 results per page)
            if page == 1:
                page_url = bookshelf_url
            else:
                page_url = f"{bookshelf_url}{separator}start_index={1 + (page-1)*25}"

            print(f"Fetching page {page}... ({len(books)} books collected so far)")

            response = requests.get(page_url, headers=headers, timeout=30)
            response.raise_for_status()

            # Parse raw bytes so BeautifulSoup detects the charset from the
            # document itself rather than relying on requests' header-based
            # guess, which can garble non-ASCII titles and author names.
            soup = BeautifulSoup(response.content, 'html.parser')

            # Find all book entries on the page
            book_elements = soup.select('li.booklink')

            if not book_elements:
                print("No more books found on this page.")
                break

            # Parse the whole page before touching `books`, so an error in
            # the middle of a page cannot leave duplicates behind when the
            # same page is retried. Only entries with a title are kept.
            page_books = [
                entry
                for entry in map(_parse_book_entry, book_elements)
                if 'title' in entry
            ]
            books.extend(page_books[:limit - len(books)])

            # Reset failure counter on success
            failures = 0

            # Stop here once the limit is reached: there is no next request,
            # so there is nothing to wait for.
            if len(books) >= limit:
                break

            # Check if there's a next page
            next_link = soup.select_one('a[title="Go to the next page of results."]')
            if not next_link:
                print("No more pages available.")
                break

            # Move to the next page
            page += 1

            # Respect rate limits with a random delay
            delay = random.uniform(delay_range[0], delay_range[1])
            print(f"Waiting {delay:.2f} seconds before next request...")
            time.sleep(delay)

        except Exception as e:
            # `page` is not advanced here, so the same page is retried.
            failures += 1
            print(f"Error on page {page}: {type(e).__name__}: {str(e)}")
            print(f"Attempt {failures} of {max_failures}")

            if failures >= max_failures:
                print("Too many consecutive failures. Stopping.")
                break

            # Increase delay after failure
            delay = random.uniform(delay_range[0] * 2, delay_range[1] * 2)
            print(f"Waiting {delay:.2f} seconds before retrying...")
            time.sleep(delay)

    print(f"Total books collected from this bookshelf: {len(books)}")
    return books[:limit]  # Ensure we don't exceed the limit

def get_book_details(book_id, delay_range=(0.5, 1.5)):
    """
    Get detailed information about a specific book

    Parameters:
    book_id (str): ID of the book
    delay_range (tuple): Range of seconds to wait after the request,
                         whether it succeeded or failed

    Returns:
    dict: Detailed book information with 'id', 'title' and 'download_links',
          plus 'author', 'language' and 'subjects' when found on the page.
          On any error, {'id': book_id, 'error': <message>} instead.
    """
    # Imported locally so this function does not depend on an edit to the
    # file's import block.
    from urllib.parse import urljoin

    url = f"https://www.gutenberg.org/ebooks/{book_id}"

    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        # Parse raw bytes so BeautifulSoup detects the charset from the
        # document itself rather than relying on requests' header-based
        # guess, which can garble non-ASCII titles and author names.
        soup = BeautifulSoup(response.content, 'html.parser')

        # Get basic metadata
        details = {'id': book_id}
        title_element = soup.select_one('h1[itemprop="name"]')
        details['title'] = title_element.text.strip() if title_element else "Unknown"

        # Get author
        author_element = soup.select_one('a[itemprop="creator"]')
        if author_element:
            details['author'] = author_element.text.strip()

        # Get language
        language_element = soup.select_one('tr:has(th:-soup-contains("Language")) td')
        if language_element:
            details['language'] = language_element.text.strip()

        # Get subjects/categories
        subject_elements = soup.select('td[property="dcterms:subject"] a')
        if subject_elements:
            details['subjects'] = [s.text.strip() for s in subject_elements]

        # Get download links, keyed by the link text. Only links whose text
        # contains a '.' are kept (same filter as before).
        download_links = {}
        for link in soup.select('table.files a'):
            if 'href' in link.attrs and '.' in link.text:
                format_type = link.text.strip()
                # urljoin resolves root-relative hrefs exactly like plain
                # concatenation did, and also handles absolute or
                # path-relative hrefs, which concatenation would mangle.
                download_links[format_type] = urljoin(url, link['href'])

        details['download_links'] = download_links

        return details

    except Exception as e:
        print(f"Error getting details for book {book_id}: {type(e).__name__}: {str(e)}")
        return {'id': book_id, 'error': str(e)}

    finally:
        # Respect rate limits after failures too, so a run of errors does
        # not turn into back-to-back requests.
        delay = random.uniform(delay_range[0], delay_range[1])
        time.sleep(delay)

def _filename_slug(text):
    """Turn a bookshelf name into a lowercase, filesystem-safe filename part."""
    # Collapse every run of characters other than letters, digits, '_', '.'
    # and '-' into a single underscore. Names such as
    # "Browsing: Business/Management" would otherwise put a path separator
    # (and a colon, which Windows rejects) into the filename.
    slug = re.sub(r'[^\w.-]+', '_', text.strip()).strip('_').lower()
    return slug or 'bookshelf'


def get_books(bookshelf_name=None, book_count=500, get_details=False, sample_size=50):
    """
    Scrape books from one bookshelf, optionally enrich them, and save a CSV

    Parameters:
    bookshelf_name (str): Name of the bookshelf to scrape. When missing or
                          not found, a random bookshelf is used instead.
    book_count (int): Maximum number of books to collect
    get_details (bool): Whether to fetch each book's detail page
    sample_size (int): Maximum number of books to fetch details for

    Returns:
    tuple: (DataFrame of the collected books, dict of all bookshelves).
           The DataFrame is empty when no bookshelves or books were found.
    """
    # Get all available bookshelves
    bookshelves = get_bookshelves()

    if not bookshelves:
        print("No bookshelves found. Exiting.")
        return pd.DataFrame(), bookshelves

    # If no specific bookshelf provided, pick one at random
    if not bookshelf_name or bookshelf_name not in bookshelves:
        if not bookshelf_name:
            print("No bookshelf specified. Selecting a random bookshelf.")
        else:
            print(f"Bookshelf '{bookshelf_name}' not found. Selecting a random bookshelf.")

        bookshelf_name = random.choice(list(bookshelves))

    print(f"Selected bookshelf: {bookshelf_name}")
    bookshelf_url = bookshelves[bookshelf_name]

    # Get books from the selected bookshelf
    books = get_books_from_bookshelf(bookshelf_url, limit=book_count)

    if not books:
        print("No books found. Exiting.")
        return pd.DataFrame(), bookshelves

    # Get detailed information for a sample of books if requested
    if get_details:
        print(f"Getting detailed information for {min(sample_size, len(books))} books...")

        # The slice holds the same dict objects as `books`, so the updates
        # below show up in the final DataFrame.
        sample_books = books[:sample_size]

        for i, book in enumerate(sample_books):
            title = book.get('title', 'Unknown')
            print(f"Getting details for book {i+1}/{len(sample_books)}: {title}")

            # Entries scraped without a link have no 'id'; skip them
            # instead of failing the whole run with a KeyError.
            book_id = book.get('id')
            if not book_id:
                print(f"No ID available for '{title}'. Skipping details.")
                continue

            details = get_book_details(book_id)

            # Update the book with detailed information
            for key, value in details.items():
                if key != 'id':  # Skip the ID as we already have it
                    book[key] = value

    # Convert to DataFrame for easier viewing and export
    df = pd.DataFrame(books)

    # Save to CSV
    filename = f"gutenberg_{_filename_slug(bookshelf_name)}_{len(books)}_books.csv"
    df.to_csv(filename, index=False)
    print(f"Books saved to {filename}")

    return df, bookshelves

# Example 1: scrape up to 500 books from a named bookshelf and preview them.
# The second return value (all known bookshelves) is reused further down.
fiction_df, all_bookshelves = get_books(
    bookshelf_name="Browsing: Fiction",
    book_count=500,
)
display(fiction_df.head(10))

# Example 2: Get 500 books from a random bookshelf
# random_df, _ = get_books(book_count=500)
# display(random_df.head(10))

# Example 3: Get 100 books with detailed information
# detailed_df, _ = get_books(bookshelf_name="Science Fiction", book_count=100, get_details=True, sample_size=25)
# display(detailed_df.head(10))

# For reference: list every bookshelf name in sorted order.
print("\nAvailable Bookshelves:")
for name in sorted(all_bookshelves):
    print("- " + name)

Found 402 bookshelves.
Selected bookshelf: Browsing: Fiction
Retrieving books from: https://www.gutenberg.org/ebooks/bookshelf/486
Fetching page 1... (0 books collected so far)
Waiting 1.16 seconds before next request...
Fetching page 2... (25 books collected so far)
Waiting 1.46 seconds before next request...
Fetching page 3... (50 books collected so far)
Waiting 1.12 seconds before next request...
Fetching page 4... (75 books collected so far)
Waiting 2.18 seconds before next request...
Fetching page 5... (100 books collected so far)
Waiting 1.06 seconds before next request...
Fetching page 6... (125 books collected so far)
Waiting 1.74 seconds before next request...
Fetching page 7... (150 books collected so far)
Waiting 2.55 seconds before next request...
Fetching page 8... (175 books collected so far)
Waiting 1.07 seconds before next request...
Fetching page 9... (200 books collected so far)
Waiting 2.97 seconds before next request...
Fetching page 10... (225 books collected so far)

Unnamed: 0,title,url,id,author
0,"Frankenstein; Or, The Modern Prometheus",https://www.gutenberg.org/ebooks/84,84,Mary Wollstonecraft Shelley
1,"Moby Dick; Or, The Whale",https://www.gutenberg.org/ebooks/2701,2701,Herman Melville
2,Romeo and Juliet,https://www.gutenberg.org/ebooks/1513,1513,William Shakespeare
3,Pride and Prejudice,https://www.gutenberg.org/ebooks/1342,1342,Jane Austen
4,Alice's Adventures in Wonderland,https://www.gutenberg.org/ebooks/11,11,Lewis Carroll
5,The Great Gatsby,https://www.gutenberg.org/ebooks/64317,64317,F. Scott Fitzgerald
6,A Doll's House : a play,https://www.gutenberg.org/ebooks/2542,2542,Henrik Ibsen
7,The Complete Works of William Shakespeare,https://www.gutenberg.org/ebooks/100,100,William Shakespeare
8,Middlemarch,https://www.gutenberg.org/ebooks/145,145,George Eliot
9,A Room with a View,https://www.gutenberg.org/ebooks/2641,2641,E. M. Forster



Available Bookshelves:
- 6 Best Loved Spanish Literary Classics
- Adventure
- Africa
- African American Writers
- Ainslee's
- American Revolutionary War
- Anarchism
- Animal
- Animals-Domestic
- Animals-Wild
- Animals-Wild-Birds
- Animals-Wild-Insects
- Animals-Wild-Mammals
- Animals-Wild-Reptiles and Amphibians
- Animals-Wild-Trapping
- Anthropology
- Archaeology
- Architecture
- Argentina
- Armour's Monthly Cook Book
- Art
- Arthurian Legends
- Astounding Stories
- Astronomy
- Atheism
- Australia
- Bahá'í Faith
- Banned Books List from the American Library Association
- Banned Books from Anne Haight's list
- Barnavännen
- Best Books Ever Listings
- Bestsellers, American, 1895-1923
- Bibliomania
- Biographies
- Biology
- Bird-Lore
- Birds, Illustrated by Color Photography
- Blackwood's Edinburgh Magazine
- Boer War
- Botany
- British Law
- Browsing: Archaeology
- Browsing: Architecture
- Browsing: Art & Photography
- Browsing: Biographies
- Browsing: Business/Management
- Browsing