In [22]:
import requests
from bs4 import BeautifulSoup
import csv

def scrape_quotes(page):
    """Scrape one page of the Goodreads quotes listing.

    Args:
        page: Page number substituted into the ``?page=`` query parameter.

    Returns:
        A list of ``(text, author, book_title)`` tuples, one per
        ``div.quoteDetails`` element that contains both a quote text block
        and an author element. ``book_title`` is ``''`` when the quote has
        no book link.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status (via ``raise_for_status``).
    """
    url = f"https://www.goodreads.com/quotes?page={page}"
    # A timeout keeps one stalled connection from hanging the whole scrape.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page into zero quotes.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    quotes_data = []
    for quote in soup.find_all('div', class_='quoteDetails'):
        quote_text_div = quote.find('div', class_='quoteText')
        author_span = quote.find('span', class_='authorOrTitle')
        # Skip blocks missing the expected markup rather than crashing the
        # whole page with an AttributeError on None.
        if quote_text_div is None or author_span is None:
            continue

        # Replacing <br> tags within the quote with \n
        for br in quote_text_div.find_all("br"):
            br.replace_with("\n")

        # Extracting the quote text up to the "―" attribution marker
        text = quote_text_div.get_text().split("―")[0].strip()

        # Drop only a trailing comma (presumably the separator before the
        # book title — confirm against the page markup); commas inside the
        # name itself are preserved.
        author = author_span.get_text().strip().rstrip(',').strip()

        # Extracting book title if present
        book_link = quote.find('a', class_='authorOrTitle')
        book_title = book_link.get_text().strip() if book_link else ''

        quotes_data.append((text, author, book_title))

    return quotes_data


# Gather the rows from listing pages 1 through 100 into one flat list.
all_quotes = []
for page in range(1, 101):
    quotes = scrape_quotes(page)
    all_quotes += quotes

# Write the header followed by every scraped row to a UTF-8 CSV file.
# newline='' lets the csv module control line endings itself.
with open('goodreads_quotes.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerows([['Quote', 'Author', 'Book Title'], *all_quotes])


In [13]:
import pandas as pd
from langdetect import detect

# Load the scraped quotes from the CSV written by the scraping cell
# (header row: Quote, Author, Book Title).
df = pd.read_csv('goodreads_quotes.csv')

# Define a function for language detection with an additional check for text length
def is_english(text):
    """Return True when ``text`` is a string longer than 30 characters that
    ``detect`` classifies as English.

    Args:
        text: Candidate quote text. Non-string values (for example ``None``
            or a float NaN) are rejected.

    Returns:
        ``True`` if ``detect(text) == 'en'``; ``False`` for non-strings,
        strings of 30 characters or fewer, or when detection raises.
    """
    # Reject non-strings and short snippets explicitly, before calling the detector.
    if not isinstance(text, str) or len(text) <= 30:
        return False
    try:
        return detect(text) == 'en'
    except Exception:
        # Best-effort: a detection failure counts as "not English".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long-running apply can still be stopped.
        return False

# Build a boolean mask: missing quotes are rejected outright, everything else
# is passed to is_english (which also rejects quotes too short to analyze).
english_mask = df['Quote'].apply(
    lambda quote: is_english(quote) if pd.notnull(quote) else False
)

# Keep only the rows flagged as English
df = df[english_mask]

# Show the filtered dataframe
print(df)


                                                  Quote  \
0        “Be yourself; everyone else is already taken.”   
1     “I'm selfish, impatient and a little insecure....   
2                      “So many books, so little time.”   
3     “Two things are infinite: the universe and hum...   
4     “A room without books is like a body without a...   
...                                                 ...   
2995  “A man should hear a little music, read a litt...   
2996             “We are all born mad. Some remain so.”   
2997    “The past beats inside me like a second heart.”   
2998  “What do you fear, lady?" [Aragorn] asked. "A ...   
2999  “If the person you are talking to doesn't appe...   

                          Author              Book Title  
0                    Oscar Wilde                     NaN  
1                 Marilyn Monroe                     NaN  
2                    Frank Zappa                     NaN  
3                Albert Einstein                     Na

In [14]:
# Save the filtered DataFrame to a new CSV; index=False leaves out the row-index column.
df.to_csv('filtered_goodreads_quotes.csv', index=False)
