In [5]:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re
from IPython.display import display, HTML

# Request headers sent with every HTTP request made in this cell.
# The User-Agent is a desktop Chrome string, split across lines for readability.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml",
    "Accept-Language": "en-US,en;q=0.9",
}

def _list_page_url(list_url, page):
    """Return the URL of result page ``page`` (1-based) for ``list_url``."""
    if page == 1:
        return list_url
    # A URL that already carries a query string (e.g. a search URL) must be
    # extended with '&'; a second '?' would corrupt the existing parameters.
    separator = '&' if '?' in list_url else '?'
    return f"{list_url}{separator}page={page}"


def _list_absolute_url(href):
    """Prefix a site-relative link with the Goodreads origin."""
    if href.startswith('http'):
        return href
    return f"https://www.goodreads.com{href}"


def _select_list_entries(soup):
    """Return the book entry elements of a page, trying each known layout in turn."""
    legacy_selectors = (
        'tr[itemtype="http://schema.org/Book"]',
        'tr.bookalike',
        'div.bookTitle',
    )
    for selector in legacy_selectors:
        found = soup.select(selector)
        if found:
            return found

    print("No more books found on this page. Trying alternative selectors...")
    found = soup.select('div.BookListItem, article.BookListItem')
    if found:
        return found

    print("No books found with alternative selectors either.")
    return soup.select('div[class*="Book"]')


def _parse_list_entry(book_element):
    """Build a book dict from one entry element; empty if nothing was recognised."""
    book_data = {}

    is_table_row = book_element.name == 'tr'
    if is_table_row:
        # Table-based layout
        title_selector = 'a.bookTitle'
        author_selector = 'a.authorName'
    elif book_element.name in ('div', 'article'):
        # div/article-based layout
        title_selector = 'a[href*="/book/show/"], h3 a, .BookTitle a'
        author_selector = 'a.AuthorName, span.AuthorName a, a[href*="/author/show/"]'
    else:
        return book_data

    title_element = book_element.select_one(title_selector)
    if title_element is not None:
        book_data['title'] = title_element.text.strip()
        # .get() instead of ['href']: an anchor without href must not abort
        # the whole page.
        href = title_element.get('href')
        if href:
            book_data['url'] = _list_absolute_url(href)

    author_element = book_element.select_one(author_selector)
    if author_element is not None:
        book_data['author'] = author_element.text.strip()

    if is_table_row:
        rating_element = book_element.select_one('span.minirating')
        if rating_element is not None:
            rating_text = rating_element.text.strip()
            rating_match = re.search(r'(\d+\.\d+)', rating_text)
            if rating_match:
                book_data['rating'] = float(rating_match.group(1))

            rating_count_match = re.search(r'(\d+(?:,\d+)*) ratings', rating_text)
            if rating_count_match:
                # Stored as int so it matches what get_books_from_genre emits.
                book_data['rating_count'] = int(rating_count_match.group(1).replace(',', ''))
    else:
        rating_element = book_element.select_one('span.RatingStars__RatingsValue, span[class*="RatingValue"]')
        if rating_element is not None:
            try:
                book_data['rating'] = float(rating_element.text.strip())
            except ValueError:
                # Non-numeric rating text: leave the rating out.
                pass

    # Extract book ID from URL if available
    if 'title' in book_data and 'url' in book_data:
        id_match = re.search(r'/show/(\d+)', book_data['url'])
        if id_match:
            book_data['id'] = id_match.group(1)

    return book_data


def get_books_from_list(list_url, limit=500, delay_range=(2, 5)):
    """
    Get books from a paginated Goodreads list (or search results) page

    Pages are fetched one at a time until `limit` books are collected, no
    "next page" link is found, or too many consecutive requests fail.

    Parameters:
    list_url (str): URL of the first page; may already contain a query string
    limit (int): Maximum number of books to retrieve
    delay_range (tuple): Range of seconds to wait between pagination requests
                         (doubled after a failed request)

    Returns:
    list: List of book dicts. Each has 'title'; 'url', 'id', 'author',
          'rating' (float) and 'rating_count' (int) are present when found.
    """
    books = []
    page = 1
    failures = 0
    max_failures = 5

    print(f"Retrieving books from: {list_url}")

    while len(books) < limit:
        page_url = _list_page_url(list_url, page)
        print(f"Fetching page {page}... ({len(books)} books collected so far)")

        # Only the network call is retried; parsing happens after a successful
        # fetch, so a retry can never re-append books from the same page.
        try:
            response = requests.get(page_url, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            failures += 1
            print(f"Error on page {page}: {type(e).__name__}: {str(e)}")
            print(f"Attempt {failures} of {max_failures}")

            if failures >= max_failures:
                print("Too many consecutive failures. Stopping.")
                break

            # Increase delay after failure
            delay = random.uniform(delay_range[0] * 2, delay_range[1] * 2)
            print(f"Waiting {delay:.2f} seconds before retrying...")
            time.sleep(delay)
            continue

        # Reset failure counter on success
        failures = 0

        soup = BeautifulSoup(response.text, 'html.parser')
        book_elements = _select_list_entries(soup)

        if not book_elements:
            print("No books found on this page. Moving to the next page.")
            page += 1
            if page > 10:  # Limit to 10 pages if we're not finding books
                print("Reached 10 pages without finding books. Stopping.")
                break
            # Keep the polite delay even when skipping an empty page.
            time.sleep(random.uniform(delay_range[0], delay_range[1]))
            continue

        print(f"Found {len(book_elements)} books on this page.")

        for book_element in book_elements:
            if len(books) >= limit:
                break

            book_data = _parse_list_entry(book_element)
            # Only add if we have at least a title
            if 'title' in book_data:
                books.append(book_data)

        if len(books) >= limit:
            # Nothing more to fetch, so skip the next-page lookup and the sleep.
            break

        # Check if there's a next page
        next_link = soup.select_one('a.next_page') or soup.select_one('a[rel="next"]')
        if next_link is None:
            print("No more pages available.")
            break

        # Move to the next page
        page += 1

        # Respect rate limits with a random delay
        delay = random.uniform(delay_range[0], delay_range[1])
        print(f"Waiting {delay:.2f} seconds before next request...")
        time.sleep(delay)

    print(f"Total books collected: {len(books)}")
    return books[:limit]  # Ensure we don't exceed the limit


def get_books_from_search(query, limit=500, delay_range=(2, 5)):
    """
    Get books from a Goodreads search query

    Parameters:
    query (str): Search query (free text; it is URL-encoded here)
    limit (int): Maximum number of books to retrieve
    delay_range (tuple): Range of seconds to wait between pagination requests

    Returns:
    list: List of book details
    """
    from urllib.parse import quote_plus

    # quote_plus turns spaces into '+' (as before) and also percent-encodes
    # characters such as '&', '#', '+' and non-ASCII text, which would
    # otherwise truncate or corrupt the query string.
    search_url = f"https://www.goodreads.com/search?q={quote_plus(query)}&search_type=books"

    return get_books_from_list(search_url, limit, delay_range)

def _genre_absolute_url(href):
    """Prefix a site-relative link with the Goodreads origin."""
    if href.startswith('http'):
        return href
    return f"https://www.goodreads.com{href}"


def _genre_set_title(title_element, book_data):
    """Store the title and, when the anchor has an href, the absolute book URL."""
    book_data['title'] = title_element.text.strip()
    href = title_element.get('href')
    if href:
        book_data['url'] = _genre_absolute_url(href)


def _genre_set_id(book_data):
    """Store the numeric book ID parsed from book_data['url'], if any."""
    if 'url' in book_data:
        id_match = re.search(r'/show/(\d+)', book_data['url'])
        if id_match:
            book_data['id'] = id_match.group(1)


def _fetch_genre_soup(genre_url, delay_range, max_failures):
    """Fetch and parse the genre page, retrying up to max_failures times; None on failure."""
    for attempt in range(1, max_failures + 1):
        try:
            print(f"Fetching genre page: {genre_url}")
            response = requests.get(genre_url, headers=headers, timeout=30)
            response.raise_for_status()
            return BeautifulSoup(response.text, 'html.parser')
        except requests.RequestException as e:
            print(f"Error fetching genre page: {type(e).__name__}: {str(e)}")
            print(f"Attempt {attempt} of {max_failures}")

            if attempt < max_failures:
                delay = random.uniform(delay_range[0] * 2, delay_range[1] * 2)
                print(f"Waiting {delay:.2f} seconds before retrying...")
                time.sleep(delay)
    return None


def _parse_genre_book_box(book_element):
    """Build a book dict from one 'bookBox' element; has no 'title' key if none was found."""
    book_data = {}

    # Get title and URL
    title_element = book_element.select_one('a.bookTitle')
    if title_element is not None:
        _genre_set_title(title_element, book_data)

    # Get author, author URL and author ID
    author_element = book_element.select_one('a.authorName')
    if author_element is not None:
        book_data['author'] = author_element.text.strip()
        author_href = author_element.get('href')
        if author_href:
            book_data['author_url'] = _genre_absolute_url(author_href)
            author_id_match = re.search(r'/author/show/(\d+)', book_data['author_url'])
            if author_id_match:
                book_data['author_id'] = author_id_match.group(1)

    # Get average rating and number of ratings
    rating_element = book_element.select_one('span.minirating')
    if rating_element is not None:
        rating_text = rating_element.text.strip()
        rating_match = re.search(r'(\d+\.\d+)', rating_text)
        if rating_match:
            book_data['rating'] = float(rating_match.group(1))

        rating_count_match = re.search(r'(\d+(?:,\d+)*) ratings', rating_text)
        if rating_count_match:
            book_data['rating_count'] = int(rating_count_match.group(1).replace(',', ''))

    # Get book cover image
    cover_element = book_element.select_one('img.bookCover')
    if cover_element is not None:
        cover_url = cover_element.get('src', '')
        # Sometimes the image is lazy-loaded: prefer 'data-lazy' when 'src'
        # is not an absolute URL or is the 'nophoto' placeholder.
        if not cover_url.startswith('http') or cover_url.endswith('nophoto'):
            data_url = cover_element.get('data-lazy', '')
            if data_url and data_url.startswith('http'):
                cover_url = data_url
        if cover_url:
            book_data['cover_url'] = cover_url

    # Get description snippet if available
    description_element = book_element.select_one('div.description, span.smallText')
    if description_element is not None:
        book_data['description_snippet'] = description_element.text.strip()

    # Get publication year if available
    pub_year_element = book_element.select_one('div.uitext, div.smallText')
    if pub_year_element is not None:
        pub_year_match = re.search(r'published\s+(\d{4})', pub_year_element.text.strip(), re.IGNORECASE)
        if pub_year_match:
            book_data['publication_year'] = int(pub_year_match.group(1))

    if 'title' in book_data:
        _genre_set_id(book_data)

        # Extract featured shelves/genres if available
        shelves_div = book_element.select_one('div.elementList div.left')
        if shelves_div is not None:
            shelves_match = re.search(r'genre:(.*?)(?:$|shelved as)', shelves_div.text.strip(), re.IGNORECASE)
            if shelves_match:
                book_data['shelves'] = [s.strip() for s in shelves_match.group(1).split(',')]

    return book_data


def _parse_genre_extra_entry(book_element):
    """Build a book dict from a bare title link or a 'bookTitleContainer' element."""
    book_data = {}
    classes = book_element.get('class', [])

    if 'bookTitle' in classes:
        # Direct title link: author and rating are looked up in the parent div.
        title_element = book_element
        scope = book_element.find_parent('div')
    elif 'bookTitleContainer' in classes:
        title_element = book_element.select_one('a.bookTitle')
        scope = book_element
    else:
        return book_data

    if title_element is not None:
        _genre_set_title(title_element, book_data)

    if scope is not None:
        author_element = scope.select_one('a.authorName')
        if author_element is not None:
            book_data['author'] = author_element.text.strip()

        rating_element = scope.select_one('span.minirating, span.greyText')
        if rating_element is not None:
            rating_match = re.search(r'(\d+\.\d+)', rating_element.text.strip())
            if rating_match:
                book_data['rating'] = float(rating_match.group(1))

    _genre_set_id(book_data)
    return book_data


def _collect_genre_books(soup, limit, delay_range):
    """Gather up to `limit` books from a parsed genre page and, if needed, one related list."""
    books = []

    # Find popular books on the genre page
    book_elements = soup.select('div.leftContainer div.bookBox')
    print(f"Found {len(book_elements)} books on genre page.")

    for book_element in book_elements:
        if len(books) >= limit:
            break
        book_data = _parse_genre_book_box(book_element)
        # Only add if we have at least a title
        if 'title' in book_data:
            books.append(book_data)

    if len(books) >= limit:
        return books

    # Set-based duplicate tracking replaces a linear scan of `books` per candidate.
    seen_ids = {b['id'] for b in books if 'id' in b}

    # If we need more books, check for "most read this week" or "popular" sections
    more_book_elements = soup.select('div.readable div.left a.bookTitle, div.readable div.bookTitleContainer')
    if more_book_elements:
        print(f"Found {len(more_book_elements)} additional books in 'most read' section.")

        for book_element in more_book_elements:
            if len(books) >= limit:
                break
            book_data = _parse_genre_extra_entry(book_element)
            # Requires both a title and an ID; entries already collected are skipped.
            if 'title' in book_data and 'id' in book_data and book_data['id'] not in seen_ids:
                seen_ids.add(book_data['id'])
                books.append(book_data)

    if len(books) >= limit:
        return books

    # If we still need more books, look for related lists
    list_elements = soup.select('div.listItem a.listTitle, div.list a.listTitle')
    if not list_elements:
        return books

    print(f"Found {len(list_elements)} related lists. Fetching more books...")

    list_hrefs = [a.get('href') for a in list_elements if a.get('href')]
    if not list_hrefs:
        return books

    # Get a random list to fetch more books
    list_url = _genre_absolute_url(random.choice(list_hrefs))
    list_books = get_books_from_list(list_url, limit=limit - len(books), delay_range=delay_range)

    seen_title_author = {(b.get('title'), b.get('author')) for b in books}
    for list_book in list_books:
        if len(books) >= limit:
            break

        key = (list_book.get('title'), list_book.get('author'))
        if 'id' in list_book:
            # Check for duplicates by ID
            if list_book['id'] in seen_ids:
                continue
            seen_ids.add(list_book['id'])
        elif key in seen_title_author:
            # If no ID, check for duplicates by title and author
            continue

        seen_title_author.add(key)
        books.append(list_book)

    return books


def get_books_from_genre(genre, limit=500, delay_range=(2, 5)):
    """
    Get books from a Goodreads genre page

    Books are taken from the genre page's main book boxes first, then from
    its additional title links, and finally from one randomly chosen related
    list (via get_books_from_list) if the limit has not been reached.

    Parameters:
    genre (str): Genre name; lower-cased and spaces replaced by '-' to build the URL
    limit (int): Maximum number of books to retrieve
    delay_range (tuple): Range of seconds to wait between requests
                         (doubled between retries of the genre page)

    Returns:
    list: List of book dicts; empty if the genre page could not be fetched
          after 5 attempts
    """
    genre_url = f"https://www.goodreads.com/genres/{genre.lower().replace(' ', '-')}"
    max_failures = 5

    # The retry is a bounded loop around the fetch only. A recursive retry
    # would start each call with a fresh failure counter and never stop.
    soup = _fetch_genre_soup(genre_url, delay_range, max_failures)
    books = _collect_genre_books(soup, limit, delay_range) if soup is not None else []

    print(f"Total books collected from genre: {len(books)}")
    return books[:limit]  # Ensure we don't exceed the limit

def save_to_csv(books_data, filename="goodreads_books.csv"):
    """
    Write book records to a CSV file and hand back the DataFrame

    Parameters:
    books_data (list): Book dicts (anything pandas.DataFrame accepts)
    filename (str): Destination path of the CSV file

    Returns:
    pandas.DataFrame: The frame built from books_data
    """
    frame = pd.DataFrame(data=books_data)
    # index=False keeps the row index out of the file.
    frame.to_csv(path_or_buf=filename, index=False)
    print(f"Books saved to {filename}")
    return frame

# Driver: scrape up to 1000 entries from the "Best Books Ever" list, write
# them to goodreads_best_books_ever.csv, and show the first ten rows.
# NOTE: this performs live HTTP requests with random delays between pages.
bestsellers_url = "https://www.goodreads.com/list/show/1.Best_Books_Ever"
bestsellers = get_books_from_list(bestsellers_url, limit=1000)
bestsellers_df = save_to_csv(bestsellers, "goodreads_best_books_ever.csv")
display(bestsellers_df.head(10))



Retrieving books from: https://www.goodreads.com/list/show/1.Best_Books_Ever
Fetching page 1... (0 books collected so far)
List title: Best Books Ever
Found 100 books on this page.
Waiting 3.03 seconds before next request...
Fetching page 2... (100 books collected so far)
List title: Best Books Ever
Found 100 books on this page.
Waiting 3.69 seconds before next request...
Fetching page 3... (200 books collected so far)
List title: Best Books Ever
Found 100 books on this page.
Waiting 3.07 seconds before next request...
Fetching page 4... (300 books collected so far)
List title: Best Books Ever
Found 100 books on this page.
Waiting 2.34 seconds before next request...
Fetching page 5... (400 books collected so far)
List title: Best Books Ever
Found 100 books on this page.
Waiting 4.03 seconds before next request...
Fetching page 6... (500 books collected so far)
List title: Best Books Ever
Found 100 books on this page.
Waiting 4.38 seconds before next request...
Fetching page 7... (600 b

Unnamed: 0,list_genre,title,url,author,author_url,author_id,rating,rating_count,cover_url,id,url_genres
0,Best Books Ever,The Hunger Games (The Hunger Games #1),https://www.goodreads.com/book/show/2767052-th...,Suzanne Collins,https://www.goodreads.com/author/show/153394.S...,153394,4.34,9317126,https://i.gr-assets.com/images/S/compressed.ph...,2767052,[2767052 the hunger games]
1,Best Books Ever,Harry Potter and the Order of the Phoenix (Har...,https://www.goodreads.com/book/show/2.Harry_Po...,J.K. Rowling,https://www.goodreads.com/author/show/1077326....,1077326,4.5,3616355,https://i.gr-assets.com/images/S/compressed.ph...,2,
2,Best Books Ever,Pride and Prejudice,https://www.goodreads.com/book/show/1885.Pride...,Jane Austen,https://www.goodreads.com/author/show/1265.Jan...,1265,4.29,4517211,https://i.gr-assets.com/images/S/compressed.ph...,1885,"[1885, Pride_and_prejudice]"
3,Best Books Ever,To Kill a Mockingbird,https://www.goodreads.com/book/show/2657.To_Ki...,Harper Lee,https://www.goodreads.com/author/show/1825.Har...,1825,4.26,6563388,https://i.gr-assets.com/images/S/compressed.ph...,2657,"[2657, To_kill_a_mockingbird]"
4,Best Books Ever,The Book Thief,https://www.goodreads.com/book/show/19063.The_...,Markus Zusak,https://www.goodreads.com/author/show/11466.Ma...,11466,4.39,2745941,https://i.gr-assets.com/images/S/compressed.ph...,19063,"[19063, The_book_thief]"
5,Best Books Ever,"Twilight (The Twilight Saga, #1)",https://www.goodreads.com/book/show/41865.Twil...,Stephenie Meyer,https://www.goodreads.com/author/show/941441.S...,941441,3.66,7000710,https://i.gr-assets.com/images/S/compressed.ph...,41865,"[41865, Twilight]"
6,Best Books Ever,Animal Farm,https://www.goodreads.com/book/show/170448.Ani...,George Orwell,https://www.goodreads.com/author/show/3706.Geo...,3706,4.0,4227067,https://i.gr-assets.com/images/S/compressed.ph...,170448,"[170448, Animal_farm]"
7,Best Books Ever,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...,https://www.goodreads.com/book/show/30.J_R_R_T...,J.R.R. Tolkien,https://www.goodreads.com/author/show/656983.J...,656983,4.61,139663,https://i.gr-assets.com/images/S/compressed.ph...,30,
8,Best Books Ever,The Chronicles of Narnia (The Chronicles of Na...,https://www.goodreads.com/book/show/11127.The_...,C.S. Lewis,https://www.goodreads.com/author/show/1069006....,1069006,4.28,685618,https://i.gr-assets.com/images/S/compressed.ph...,11127,"[11127, The_chronicles_of_narnia]"
9,Best Books Ever,The Fault in Our Stars,https://www.goodreads.com/book/show/11870085-t...,John Green,https://www.goodreads.com/author/show/1406384....,1406384,4.13,5480099,https://i.gr-assets.com/images/S/compressed.ph...,11870085,[11870085 the fault in our stars]
