## Quote Crawler - goodreads.com

In [1]:
import requests
from bs4 import BeautifulSoup
import pickle
import os
from langdetect import detect_langs

import sqlite3

In [2]:
# Open the SQLite database file 'quotes.db' (relative to the working
# directory); sqlite3 creates the file if it does not exist yet.
conn = sqlite3.connect('quotes.db')
# Module-level cursor shared by create_table() and insert_row().
c = conn.cursor()

def create_table():
    """Create the ``quotes`` table unless it already exists.

    The statement runs on the module-level cursor ``c``. The ``quote``
    column is declared ``UNIQUE ON CONFLICT IGNORE``, so inserting a
    quote text that is already stored is silently skipped by SQLite.
    """
    schema = """
        CREATE TABLE IF NOT EXISTS quotes(
            quote TEXT UNIQUE ON CONFLICT IGNORE,
            author TEXT,
            book TEXT,
            likes INT,
            english TEXT
        )
        """
    c.execute(schema)

def insert_row(quote, author, book, likes, english):
    """Insert one record into the ``quotes`` table and commit it.

    The values are bound through ``?`` placeholders rather than being
    formatted into the SQL text. The statement runs on the module-level
    cursor ``c`` and the change is committed on ``conn`` right away.

    Args:
        quote: Text of the quote.
        author: Name of the quote's author.
        book: Title of the source book.
        likes: Number of likes recorded for the quote.
        english: Value stored in the ``english`` column.
    """
    statement = """
        INSERT INTO quotes (quote, author, book, likes, english)
        VALUES (?, ?, ?, ?, ?)
        """
    row = (quote, author, book, likes, english)
    c.execute(statement, row)
    conn.commit()

# Make sure the quotes table exists before any rows are inserted.
create_table()

In [3]:
# Goodreads tag whose quote listing is crawled; interpolated into the URL.
TAG = 'inspirational'
# Upper bound on the number of quotes processed in a single run.
MAX_QUOTES = 5000

# This character signifies the end of the quote: the crawler keeps only the
# text that precedes its first occurrence in a quote block.
h_bar = '―'

def clean_quote(text):
    """Normalise the raw text of a scraped quote.

    Three clean-ups are applied, in this order:

    1. Every occurrence of the characters ``()[]_~`` is removed.
    2. A newline directly after an opening curly quote (``“``) or directly
       before a closing curly quote (``”``) is dropped, so the quotation
       marks sit against the text.
    3. ``&`` is spelled out as ``and``.

    Args:
        text: The quote text to clean.

    Returns:
        The cleaned text as a new string.
    """
    # One pass over the string deletes all unwanted characters at once.
    text = text.translate(str.maketrans('', '', '()[]_~'))

    text = text.replace('“\n', '“').replace('\n”', '”')
    # str.replace returns a new string; the result has to be assigned back,
    # otherwise the ampersand substitution is silently lost.
    text = text.replace('&', 'and')

    return text

def likely_english(text):
    """Tell whether English is among the languages detected for *text*.

    Args:
        text: The text to classify.

    Returns:
        True when any candidate returned by ``detect_langs`` has the
        language code ``'en'``; False otherwise, including when langdetect
        cannot analyse the text at all.
    """
    # Imported here so the function carries its own dependency; the module
    # ships with the langdetect package that the file already imports.
    from langdetect.lang_detect_exception import LangDetectException

    try:
        candidates = detect_langs(text)
    except LangDetectException:
        # langdetect raises when the text has no usable features (for
        # example an empty string, or only digits and punctuation). Treat
        # that as "not English" instead of aborting the caller.
        return False

    return any(candidate.lang == 'en' for candidate in candidates)

In [4]:
# Crawl the Goodreads listing for TAG page by page, storing every quote found
# until MAX_QUOTES quotes have been processed or a page contains no quotes.

# Seconds to wait on Goodreads before giving up on a request. Without a
# timeout, requests.get can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

page = 1
exit_flag = False
processed_quotes = 0

while processed_quotes < MAX_QUOTES:

    print(f'Crawling page: {page}')

    response = requests.get(
        f'https://www.goodreads.com/quotes/tag/{TAG}?page={page}',
        timeout=REQUEST_TIMEOUT,
    )
    html = response.content
    soup = BeautifulSoup(html, 'html.parser')

    # Each quote on the page lives in its own <div class="quoteDetails">.
    raw_quotes = soup.find_all('div', {'class': 'quoteDetails'})

    # A page without any quote blocks ends the crawl.
    if not raw_quotes:
        break

    for q in raw_quotes:

        # Stop in the middle of a page once the quota is reached.
        if processed_quotes >= MAX_QUOTES:
            exit_flag = True
            break

        qt = q.find('div', {'class': 'quoteText'})
        # The quote is everything before the first h_bar in the block's text.
        quote = qt.get_text('\n', strip=True).split(h_bar)[0].strip(' ,\t\n')
        author = qt.find('span', {'class': 'authorOrTitle'}).get_text().strip(' ,\t\n')
        # The book title, when present, is an <a> with the same class.
        book = qt.find('a', {'class': 'authorOrTitle'})
        book = 'Unknown' if not book else book.get_text().strip(' ,\t\n')
        # The like count is the first space-separated token of the footer link.
        likes = int(
            q.find('div', {'class': 'quoteFooter'})
            .find('a', {'class': 'smallText'})
            .get_text()
            .split(' ')[0]
        )

        quote = clean_quote(quote)
        english = 'Yes' if likely_english(quote) else 'No'

        insert_row(quote, author, book, likes, english)

        # Counts every quote handed to insert_row.
        processed_quotes += 1

    print(f'Total quotes scraped: {processed_quotes}', end='\n\n')

    # Leave without advancing the page counter when the quota ended the page.
    if exit_flag:
        break

    page += 1

Crawling page: 1
Total quotes scraped: 30

Crawling page: 2
Total quotes scraped: 60

Crawling page: 3
Total quotes scraped: 90

Crawling page: 4
Total quotes scraped: 120

Crawling page: 5
Total quotes scraped: 150

Crawling page: 6
Total quotes scraped: 180

Crawling page: 7
Total quotes scraped: 210

Crawling page: 8
Total quotes scraped: 240

Crawling page: 9
Total quotes scraped: 270

Crawling page: 10
Total quotes scraped: 300

Crawling page: 11
Total quotes scraped: 330

Crawling page: 12
Total quotes scraped: 360

Crawling page: 13
Total quotes scraped: 390

Crawling page: 14
Total quotes scraped: 420

Crawling page: 15
Total quotes scraped: 450

Crawling page: 16
Total quotes scraped: 480

Crawling page: 17
Total quotes scraped: 510

Crawling page: 18
Total quotes scraped: 540

Crawling page: 19
Total quotes scraped: 570

Crawling page: 20
Total quotes scraped: 600

Crawling page: 21
Total quotes scraped: 630

Crawling page: 22
Total quotes scraped: 660

Crawling page: 23
Tota

In [5]:
# Crawling is finished: release the cursor first, then the connection.
c.close()
conn.close()