In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [88]:
import threading

# Function to fetch book links from a given URL
def fetch_book_links(url, book_links):
    page = requests.get(url).text
    data = BeautifulSoup(page, 'html.parser')
    book_link = data.find_all('a', attrs={'class': 'bookTitle'})
    for link in book_link:
        final_book_link = 'https://www.goodreads.com/' + link['href']
        book_links.append(final_book_link)

# List to store book links
book_links = []

# List of URLs to fetch
all_pages = []
for serial in range(1,100,1):
    pages = f"https://www.goodreads.com/list/show/264.Books_That_Everyone_Should_Read_At_Least_Once?page={serial}"
    all_pages.append(pages)

# Create and start threads for each URL
threads = []
for url in all_pages:
    thread = threading.Thread(target=fetch_book_links, args=(url, book_links))
    threads.append(thread)
    thread.start()

# Wait for all threads to finish
for thread in threads:
    thread.join()

In [89]:
len(book_links)

9900

In [105]:
from concurrent.futures import ThreadPoolExecutor

def fetch_reviews_and_ratings(link):
    try:
        book_page = requests.get(link).text
        book_data = BeautifulSoup(book_page, 'html.parser')
        reviews = book_data.find_all('span', attrs={'class': 'Formatted'})
        stars = book_data.find_all('span', attrs={'class': 'RatingStars RatingStars__small'})

        ratings_list = [i['aria-label'] for i in stars]
        reviews_list = [j.text for j in reviews]

        book_info = {
            'ratings': ratings_list,
            'reviews': reviews_list
        }

        return book_info
    except Exception as e:
        print(f"An error occurred for link: {link}. Error: {e}")
        return None

if __name__ == '__main__':
    book_links = book_links  # Replace this with the list of book links

    total_reviews = []
    with ThreadPoolExecutor(max_workers=500) as executor:  # You can adjust the max_workers as per your requirements
        results = executor.map(fetch_reviews_and_ratings, book_links)

        for result in results:
            if result:
                total_reviews.append(result)

An error occurred for link: https://www.goodreads.com//book/show/50807.Mister_God_This_is_Anna. Error: HTTPSConnectionPool(host='www.goodreads.com', port=443): Max retries exceeded with url: //book/show/50807.Mister_God_This_is_Anna (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1002)')))
An error occurred for link: https://www.goodreads.com//book/show/210329.The_Dark_Is_Rising. Error: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
An error occurred for link: https://www.goodreads.com//book/show/50623864-the-invisible-life-of-addie-larue. Error: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
An error occurred for link: https://www.goodreads.com//book/show/20958539-unlimited-memory. Error: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
An error occurred for link: https://ww

In [106]:
df = pd.DataFrame(total_reviews)

In [107]:
df = df.apply(lambda col: col.apply(lambda x: x if len(x) > 0 else None))

In [108]:
df = df.dropna(how='any')

In [109]:
df

Unnamed: 0,ratings,reviews
1,"[Rating 2 out of 5, Rating 2 out of 5]","[After a betrayal and a stolen love, an averag..."
4,"[Rating 5 out of 5, Rating 4 out of 5, Rating ...","[""When I was little I would think of ways to k..."
8,"[Rating 5 out of 5, Rating 5 out of 5, Rating ...",[This Elibron Classics book is a facsimile rep...
43,"[Rating 4 out of 5, Rating 4 out of 5, Rating ...",[\nSgt. Don Malarkey takes us not only into th...
48,"[Rating 4 out of 5, Rating 1 out of 5, Rating ...",[An utterly absorbing tale of mystery and obse...
...,...,...
9713,"[Rating 4 out of 5, Rating 2 out of 5, Rating ...","[NEW EDITION, REVISED AND UPDATEDLike Leadersh..."
9718,"[Rating 4 out of 5, Rating 3 out of 5, Rating ...",[New York Times BestsellerThe project that cap...
9735,"[Rating 5 out of 5, Rating 5 out of 5, Rating ...",[A dragon's fire cannot be tamed.In 22nd centu...
9738,"[Rating 5 out of 5, Rating 5 out of 5, Rating ...",[The Well House: A man's incredible journey to...


In [110]:
# Function to extract ratings
def extract_ratings(rating_list):
    ratings = []
    for rating in rating_list:
        rating_match = re.search(r'Rating (\d) out of \d', rating)
        if rating_match:
            ratings.append(int(rating_match.group(1)))
    return ratings

# Extract ratings and assign to 'ratings' column
df['ratings'] = df['ratings'].apply(extract_ratings)

# Explode the 'ratings' and 'reviews' lists into separate rows
df = df.explode('ratings').explode('reviews').reset_index(drop=True)

print(df)

        ratings                                            reviews
0             2  After a betrayal and a stolen love, an average...
1             2                                                   
2             2  Born and raised in southern California, Cody W...
3             2  Well written and very well laid out story, but...
4             2  Gave this one two chances to win me over and i...
...         ...                                                ...
1687074       2   “What she had realized was that love was that...
1687075       2  When I watched the films I had no idea they we...
1687076       2  Claire Foy as Lisbeth SalanderIn this continua...
1687077       2  My all-time favorite badass heroine returns in...
1687078       2  2.5 StarsThis should never have happened. Desp...

[1687079 rows x 2 columns]


In [111]:
df = df.drop_duplicates(subset='reviews')

In [112]:
df

Unnamed: 0,ratings,reviews
0,2,"After a betrayal and a stolen love, an average..."
1,2,
2,2,"Born and raised in southern California, Cody W..."
3,2,"Well written and very well laid out story, but..."
4,2,Gave this one two chances to win me over and i...
...,...,...
1686150,4,“What she had realized was that love was that...
1686151,4,When I watched the films I had no idea they we...
1686152,4,Claire Foy as Lisbeth SalanderIn this continua...
1686153,4,My all-time favorite badass heroine returns in...


In [113]:
df.to_csv('final_df_1.csv',index=False)

In [114]:
final_df = pd.read_csv('final_df.csv')

In [115]:
final_df_1 = pd.read_csv('final_df_1.csv')

In [117]:
book_review = pd.concat([final_df,final_df_1])

In [120]:
book_review = book_review.drop_duplicates(subset='reviews')

In [123]:
book_review = book_review.dropna(how='any')

In [125]:
book_review['reviews'].nunique()

66592

In [126]:
book_review.to_csv('book_review.csv',index=False)