In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [48]:
from concurrent.futures import ThreadPoolExecutor
import httplib2

def scrape_page_and_get_links(url):
    http = httplib2.Http()
    response, content = http.request(url)
    page = content.decode('utf-8')

    data = BeautifulSoup(page, 'html.parser')
    book_link_tags = data.find_all('a', attrs={'class': 'bookTitle'})
    book_links = ['https://www.goodreads.com/' + link['href'] for link in book_link_tags]
    return book_links

if __name__ == '__main__':
    all_pages = []
    for serial in range(1, 100, 1):
        page = f"https://www.goodreads.com/list/show/19106.MUST_READS_?page={serial}"
        all_pages.append(page)

    # Use multithreading with ThreadPoolExecutor
    num_threads = 10  # You can adjust the number of threads as needed
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        results = list(executor.map(scrape_page_and_get_links, all_pages))

    # Flatten the results into a single list of book links
    book_links = [link for sublist in results for link in sublist]


In [49]:
len(book_links)

9699

In [50]:
from concurrent.futures import ThreadPoolExecutor
import time

def scrape_book_details(link):
    try:
        response = requests.get(link)
        response.raise_for_status()  # Raise an exception if the response status code is not 200

        book_data = BeautifulSoup(response.text, 'html.parser')

        title = book_data.find('h1', attrs={'class': 'Text Text__title1'})
        author = book_data.find('span', attrs={'class': 'ContributorLink__name'})
        rating = book_data.find('div', attrs={'class': 'RatingStatistics__rating'})
        no_of_ratings = book_data.find('div', attrs={'class': 'RatingStatistics__meta'})
        description = book_data.find('div', attrs={'class': 'DetailsLayoutRightParagraph__widthConstrained'})
        genres = book_data.find_all('span', attrs={'class': 'BookPageMetadataSection__genreButton'})

        # Extract the text from each element (if found) or set as None if not found
        title_text = title.text.strip() if title else None
        author_text = author.text.strip() if author else None
        rating_text = rating.text.strip() if rating else None
        no_of_ratings_text = no_of_ratings.text.strip() if no_of_ratings else None
        description_text = description.text.strip() if description else None
        genres_list = [genre.text.strip() for genre in genres] if genres else []

        # Create a dictionary for the current book
        book_dict = {
            'Title': title_text,
            'Author': author_text,
            'Rating': rating_text,
            'Number of Ratings': no_of_ratings_text,
            'Description': description_text,
            'Genres': genres_list
        }

        return book_dict
    except requests.exceptions.RequestException as e:
        print(f"Error while scraping {link}: {e}")
        return None

if __name__ == '__main__':
    # Your list of book links
    your_book_links = book_links

    # Use multithreading to scrape book details for each link
    num_threads = 300  # Adjust this number based on server capacity and your system capabilities

    # Introduce a delay of 2 seconds between each request
    time_delay = 2

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        book_details = list(executor.map(scrape_book_details, your_book_links))
        time.sleep(time_delay)  # Add a delay between requests to be considerate to the server

Error while scraping https://www.goodreads.com//book/show/41865.Twilight: 504 Server Error: Gateway Time-out for url: https://www.goodreads.com//book/show/41865.TwilightError while scraping https://www.goodreads.com//book/show/231804.The_Outsiders: 504 Server Error: Gateway Time-out for url: https://www.goodreads.com//book/show/231804.The_Outsiders

Error while scraping https://www.goodreads.com//book/show/6752378-city-of-fallen-angels: 504 Server Error: Gateway Time-out for url: https://www.goodreads.com//book/show/6752378-city-of-fallen-angels
Error while scraping https://www.goodreads.com//book/show/2800905-the-summoning: 504 Server Error: Gateway Time-out for url: https://www.goodreads.com//book/show/2800905-the-summoning
Error while scraping https://www.goodreads.com//book/show/32191737-grunt: HTTPSConnectionPool(host='www.goodreads.com', port=443): Max retries exceeded with url: //book/show/32191737-grunt (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object a

In [51]:
len(book_details)

9699

In [52]:
# Filter out the None values from book_details
book_details = [book for book in book_details if book is not None]

# Create the DataFrame
df = pd.DataFrame(book_details)

In [53]:
df = df.dropna(how='any')

In [54]:
df.shape

(2531, 6)

In [55]:
df_2 = pd.read_csv('book_details.csv')

In [56]:
final_df = pd.concat([df,df_2])

In [57]:
final_df = final_df.drop_duplicates(subset='Title')

In [58]:
final_df.to_csv('book_details.csv',index=False)

In [59]:
final_df.shape

(13324, 6)