# Goodreads scraping

https://www.goodreads.com/book/most_read?category=all&country=all&duration=m

In [1]:
## GoodReads Scraping
import requests as rq
import pandas as pd
from bs4 import BeautifulSoup as bs
import lxml
import re

# Page to scrape: the Goodreads "most_read" listing, with the query string
# (category=all, country=all, duration=m) kept exactly as in the link above.
url = (
    'https://www.goodreads.com/book/most_read'
    '?category=all&country=all&duration=m'
)

## Make the request

In [2]:
# Download the listing page.
# - timeout: without one, requests waits indefinitely on a stalled connection,
#   which would hang the notebook on "Run All".
# - raise_for_status(): fail loudly on a 4xx/5xx response instead of parsing an
#   error page and producing empty columns further down.
resp = rq.get(url, timeout=30)
resp.raise_for_status()

# Parse the HTML with the lxml parser.
soup = bs(resp.text, "lxml")

# Sanity check: show the first book title found on the page.
soup.find("a", class_="bookTitle").text.strip()

'The Ballad of Songbirds and Snakes (The Hunger Games, #0)'

## Scrape ranks

In [3]:
# Rank column: the text of every <td class="number"> cell, in page order.
rank_cells = soup.find_all("td", class_="number")
ranks = [cell.text for cell in rank_cells]
ranks[:5]

['1', '2', '3', '4', '5']

## Scrape titles

In [4]:
# Titles: text of each <a class="bookTitle"> link, whitespace-trimmed and
# title-cased.
# NOTE: str.title() upper-cases the first letter after any non-letter, so an
# apostrophe starts a new "word" (e.g. "don't" -> "Don'T").
title_links = soup.find_all("a", class_="bookTitle")
titles = [link.text.strip().title() for link in title_links]
titles[:5]

['The Ballad Of Songbirds And Snakes (The Hunger Games, #0)',
 'Where The Crawdads Sing',
 'Normal People',
 'Little Fires Everywhere',
 'The Silent Patient']

## Scrape number of readers

In [5]:
# reads
# Reader count for each book, taken from the <span class="statistic"> elements
# and kept as a string with its thousands separators (e.g. '43,504').
#
# The pattern matches one whole integer with any number of ",ddd" groups. The
# previous r"(\d+,\d+)" required exactly one comma group, so a count below
# 1,000 produced '' and a count of 1,000,000+ was cut off after the second
# group.
# Assumes the reader count is the first number in the span text -- TODO confirm
# against the page markup.
pattern = r"\d+(?:,\d{3})*"
readers_el = soup.find_all("span", "statistic")
readers = []
for el in readers_el:
    found = re.search(pattern, el.text)
    # Keep '' for spans with no number so the list stays aligned with the
    # other scraped columns.
    readers.append(found.group(0) if found else "")
readers[:5]

['43,504', '37,373', '33,466', '27,724', '26,540']

## Scrape author names

In [6]:
# Author names: trimmed text of every <a class="authorName"> link, in page
# order.
author_links = soup.find_all('a', class_="authorName")
authors = list(map(lambda link: link.text.strip(), author_links))
authors[:5]

# Leftover notes from the original author (kept as TODOs):
# TODO: Remove 'Want to read', published, and spaces
# TODO: Remove out of stars

['Suzanne Collins',
 'Delia Owens',
 'Sally Rooney',
 'Celeste Ng',
 'Alex Michaelides']

## Scrape ratings

In [7]:
# Average rating (out of 5 stars): every "d.d" / "d.dd" match found in each
# <span class="minirating">, concatenated into one string per book.
pattern = r"(\d\.\d{1,2})"
rating_spans = soup.find_all("span", class_="minirating")
ratings = []
for span in rating_spans:
    ratings.append("".join(re.findall(pattern, span.text)))
ratings[:5]

['3.89', '4.48', '3.87', '4.10', '4.09']

## Scrape number of reviews

In [8]:
# Count shown next to the average rating in each <span class="minirating">,
# kept as a string with its thousands separators (e.g. '68,810').
#
# The previous pattern r"(\d+,\d+ | \d+,\d+,\d+)" contained literal spaces
# (re.VERBOSE was not set), so it only matched when the number was followed or
# preceded by a space, and it returned '' for any count below 1,000.
#
# New pattern: a whole integer with optional ",ddd" groups that is not part of
# a decimal number -- the lookbehind/lookahead skip the "3.89"-style rating
# that appears in the same span.
# Assumes the count is the first such integer in the span text -- TODO confirm
# against the page markup.
pattern = r"(?<![\d.])\d+(?:,\d{3})*(?!\.?\d)"
n_reviews = []
for x in soup.find_all("span", "minirating"):
    found = re.search(pattern, x.text)
    # Keep '' for spans with no count so the list stays aligned with the
    # other scraped columns.
    n_reviews.append(found.group(0) if found else "")
n_reviews[:5]

['68,810', '795,279', '306,659', '620,243', '394,512']

## Create DataFrame

In [9]:
# Combine the scraped lists into one table, one row per book. Column order
# follows the dict's insertion order.
columns = {
    "rank": ranks,
    "title": titles,
    "author": authors,
    "ratings": ratings,
    "reviews": n_reviews,
    "reads": readers,
}
data = pd.DataFrame(columns)

data.head()

Unnamed: 0,rank,title,author,ratings,reviews,reads
0,1,The Ballad Of Songbirds And Snakes (The Hunger...,Suzanne Collins,3.89,68810,43504
1,2,Where The Crawdads Sing,Delia Owens,4.48,795279,37373
2,3,Normal People,Sally Rooney,3.87,306659,33466
3,4,Little Fires Everywhere,Celeste Ng,4.1,620243,27724
4,5,The Silent Patient,Alex Michaelides,4.09,394512,26540


## Export csv

In [10]:
# Write the table to disk without the DataFrame index column.
data.to_csv(path_or_buf="../data/goodreads.csv", index=False)

## Import csv

In [11]:
# Read the exported file back and preview its first rows.
goodreads_csv = pd.read_csv("../data/goodreads.csv")
goodreads_csv.head()

Unnamed: 0,rank,title,author,ratings,reviews,reads
0,1,Where The Crawdads Sing,Delia Owens,4.48,783760,38943
1,2,The Ballad Of Songbirds And Snakes (The Hunger...,Suzanne Collins,3.91,59792,36867
2,3,Normal People,Sally Rooney,3.87,299006,36847
3,4,Little Fires Everywhere,Celeste Ng,4.1,613221,29086
4,5,The Silent Patient,Alex Michaelides,4.09,386979,27879
