In [None]:
# imports
from datetime import datetime
import os
import re

from bs4 import BeautifulSoup
from dotenv import load_dotenv
import pandas as pd
import requests

In [None]:
# read .env file
load_dotenv()

# Credentials are taken from the environment so they never have to be
# hardcoded in the notebook.
GOODREADS_EMAIL = os.getenv('GOODREADS_EMAIL')
GOODREADS_PASSWORD = os.getenv('GOODREADS_PASSWORD')

# Fail here, with a clear message, instead of later sending a login request
# that is missing its credentials.
if not GOODREADS_EMAIL or not GOODREADS_PASSWORD:
    raise RuntimeError(
        "GOODREADS_EMAIL and GOODREADS_PASSWORD must be set in the environment "
        "or in a .env file"
    )

In [None]:
# define key urls
# Each `{}` is a positional placeholder left for the caller to fill in
# (presumably an id / page number; the call sites are not in this section).
LOGIN_URL = "https://www.goodreads.com/user/sign_in"
REVIEW_URL = "https://www.goodreads.com/review/show/{}"
REVIEW_LIST_URL = (
    "https://www.goodreads.com/review/list/{}"
    "?view=reviews&shelf=read&page={}"
)

In [None]:
# login helpers
def get_authenticity_token(html):
    """Extract the value of the `authenticity_token` input from login-page HTML.

    Args:
        html: Markup of the sign-in page, as a string.

    Returns:
        The input's `value` attribute with surrounding whitespace removed.

    Raises:
        ValueError: If there is no input named `authenticity_token`, or it
            has no `value` attribute.
    """
    soup = BeautifulSoup(html, "html.parser")
    token = soup.find("input", attrs={"name": "authenticity_token"})
    # `find` returns None when nothing matches; compare against None explicitly
    # rather than relying on the truthiness of a tag object.
    value = token.get("value") if token is not None else None
    if value is None:
        # Raise a descriptive error instead of printing and then failing with
        # an unrelated AttributeError on None.
        raise ValueError("could not find `authenticity_token` on login form")
    return value.strip()


def get_login_n(html):
    """Extract the value of the hidden `n` input from login-page HTML.

    The sign-in form has a hidden input named `n` that also needs to be
    passed along with the login request.

    Args:
        html: Markup of the sign-in page, as a string.

    Returns:
        The input's `value` attribute with surrounding whitespace removed.

    Raises:
        ValueError: If there is no input named `n`, or it has no `value`
            attribute.
    """
    soup = BeautifulSoup(html, "html.parser")
    n = soup.find("input", attrs={"name": "n"})
    # `find` returns None when nothing matches; compare against None explicitly
    # rather than relying on the truthiness of a tag object.
    value = n.get("value") if n is not None else None
    if value is None:
        # Raise a descriptive error instead of printing and then failing with
        # an unrelated AttributeError on None.
        raise ValueError("could not find `n` on login form")
    return value.strip()

In [None]:
# login to goodreads
payload = {
    "user[email]": GOODREADS_EMAIL,
    "user[password]": GOODREADS_PASSWORD,
    "utf8": "&#x2713;",
}

session = requests.Session()
# Extend the session's default headers instead of replacing the whole mapping,
# so requests' own defaults (Accept, Accept-Encoding, ...) are kept.
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
})

# Fetch the sign-in form first: its hidden fields are echoed back in the POST.
response = session.get(LOGIN_URL)
# Stop on an HTTP error instead of trying to parse an error page.
response.raise_for_status()

token = get_authenticity_token(response.text)
n = get_login_n(response.text)
payload.update({"authenticity_token": token, "n": n})

print(f"Attempting to log in as {payload['user[email]']}")
p = session.post(LOGIN_URL, data=payload)
# NOTE: `ok` only reflects the HTTP status code of the response.
if p.ok:
    print(f"Logged in as {payload['user[email]']}")
else:
    print(f"Login request failed with HTTP status {p.status_code}")

In [None]:
# book page parsing functions
def parse_timeline_event(event_raw):
    """Split the raw text of one reading-timeline entry into named fields.

    The text is normalised into `|`-separated fields: runs of line breaks
    become a separator, runs of `|` / en dash collapse into one separator,
    and every `(`, `)` and `:` is removed. Each field is then stripped of
    surrounding whitespace.

    Field layout, by position:
      * ``<date> | <status> [| <edition>]``
      * ``<date> | Shelved as | <shelf> [| <edition>]``
      * ``<status> | Add a date [| <edition>]`` (no date recorded yet)

    Args:
        event_raw: Text content of a single timeline entry.

    Returns:
        A dict with keys ``date``, ``status``, ``edition`` and ``shelf``.
        A field that is not present in the text is ``None``.
    """
    event_string = re.sub(r"[\r\n]+", "|", event_raw)
    event_string = re.sub(r"[|–]+", "|", event_string)
    event_string = re.sub(r"[():]", "", event_string)

    # Split once and strip every field so shelf/edition are cleaned the same
    # way as date/status.
    fields = [field.strip() for field in event_string.split("|")]
    date = fields[0]
    # A single-field entry has no status; report None rather than raising.
    status = fields[1] if len(fields) > 1 else None
    extras = fields[2:]

    shelf = None
    if status == "Add a date":
        # No date yet: the first field is really the status.
        status, date = date, None
    elif status == "Shelved as":
        # The shelf name precedes the optional edition; tolerate its absence.
        shelf = extras.pop(0) if extras else None

    edition = extras[0] if extras else None

    return {"date": date, "status": status, "edition": edition, "shelf": shelf}

def parse_book(book_id, html):
    """Build one record per reading-timeline entry found in a book page.

    Args:
        book_id: Identifier stored on every record under ``book_id``.
        html: Markup of the book page, as a string.

    Returns:
        A list of dicts, in reverse document order of the
        ``div.readingTimeline__text`` elements. Each dict is the result of
        ``parse_timeline_event`` plus ``title`` and ``book_id``; when the
        event carries no edition, the page-level book format is used
        (``None`` if the page has none).

    Raises:
        ValueError: If the page has no ``h1`` element with id ``bookTitle``.
    """
    soup = BeautifulSoup(html, "html.parser")

    title_tag = soup.find("h1", id="bookTitle")
    if title_tag is None:
        # Report which book failed instead of an AttributeError on None.
        raise ValueError(f"could not find book title on page for book id {book_id}")
    title = title_tag.get_text().replace("\n", "").strip()

    # The format element is treated as optional: fall back to None.
    format_tag = soup.find("span", attrs={"itemprop": "bookFormat"})
    edition = format_tag.get_text().strip() if format_tag is not None else None

    timeline_html = soup.find_all("div", class_="readingTimeline__text")

    book = []
    for div in reversed(timeline_html):
        # Strip before the emptiness check so whitespace-only entries are
        # skipped rather than handed to the parser.
        text = div.get_text().strip()
        if not text:
            continue

        event = parse_timeline_event(text)
        event["title"] = title
        event["book_id"] = book_id
        if not event["edition"]:
            event["edition"] = edition

        book.append(event)
    print(f"Parsed '{title}' (id: {book_id})")
    return book

In [None]:
# parsing of books - example ids
book_ids = [257149, 6251565]
books = []

# `book_id` rather than `id`, so the builtin `id()` is not shadowed for the
# rest of the kernel session.
for book_id in book_ids:
    response = session.get(f"https://www.goodreads.com/book/show/{book_id}")
    # Stop on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    books += parse_book(book_id, response.text)

pd.DataFrame(books)