****Project Brief****
Problem: Choice Overload. Users waste more time searching than reading because of decision paralysis and a "winner-takes-all" market.
Solution: A context-aware engine that replaces popularity bias with situational matching (mood, time, environment e.g. bedtime, 10mins, long holiday/travelling).
Goal: Reduce decision fatigue and unlock the "Long Tail" of publishing/giving niche books the spotlight while helping readers find the perfect book for their current moment.

****Goodreads Webscraping****
Book data required 
- Genre 
- Title 
- Author
- Rating
- Rating counts 
- Description 
- Page numbers 
- ISBN
- Language 
- Published Year 
- Book Cover Image 
- Link to the book 

****Open Library API***
Identifiers: ISBN-13
Physical Specs: Number of pages, physical dimensions, weight, and binding type (Hardcover, mass-market paperback, etc.).

Publishing Info: Publisher name, specific publication date, and series name.

Table of Contents: Often includes a full list of chapters (a feature many other APIs lack).

3. The "Author" Layer
Open Library treats authors as distinct entities with their own metadata.

Biographical Data: Full name, birth/death dates, and a biography.

Identifiers: Links to external authority files like VIAF, Wikidata, and Library of Congress ID.

Photos: Portraits of the author when available.

4. Digital & Community Data
Because Open Library is part of the Internet Archive, it includes unique "living" data:

Availability: Data on whether an eBook version is available to borrow, read online, or download.

Community Activity: User-generated Reading Logs (Want to Read, Currently Reading, Have Read), public Book Lists, and user ratings.

Revision History: Every single change made to a record is stored, meaning you can access previous "versions" of a book's data.

In [5]:

import json
import re
import requests
from bs4 import BeautifulSoup

In [6]:
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0 Safari/537.36"
    )
}

def fetch_html(url: str) -> str:
    r = requests.get(url, headers=HEADERS, timeout=25)
    r.raise_for_status()
    return r.text

def extract_jsonld_blocks(html: str) -> list[dict]:
    """
    Goodreads 详情页通常会有 1~多个 JSON-LD script。
    我们把能解析成 dict 的都收集起来。
    """
    soup = BeautifulSoup(html, "lxml")
    blocks = []
    for tag in soup.select('script[type="application/ld+json"]'):
        raw = tag.get_text(strip=True)
        if not raw:
            continue
        try:
            data = json.loads(raw)
            # 有时是 list，有时是 dict
            if isinstance(data, list):
                for item in data:
                    if isinstance(item, dict):
                        blocks.append(item)
            elif isinstance(data, dict):
                blocks.append(data)
        except json.JSONDecodeError:
            # 极少数页面 JSON-LD 可能含奇怪字符，跳过
            continue
    return blocks

def pick_book_jsonld(blocks: list[dict]) -> dict | None:
    """
    在 JSON-LD 里挑出最像“Book”的那一块。
    常见：@type = "Book" 或 "Product"（里面也可能含 book 信息）
    """
    for b in blocks:
        t = b.get("@type")
        if isinstance(t, str) and t.lower() == "book":
            return b

    # 兜底：找包含 aggregateRating + author 的块
    for b in blocks:
        if "aggregateRating" in b and "author" in b and ("name" in b or "url" in b):
            return b

    return None

def parse_book_from_jsonld(book: dict) -> dict:
    """
    从 JSON-LD 提取你要的核心字段：
    Title / Author / Rating / RatingCount / Description / ISBN / Image / URL
    """
    title = book.get("name", "") or ""
    url = book.get("url", "") or ""

    # author 可能是 dict 或 list
    author = ""
    a = book.get("author")
    if isinstance(a, dict):
        author = a.get("name", "") or ""
    elif isinstance(a, list) and a:
        if isinstance(a[0], dict):
            author = a[0].get("name", "") or ""
        elif isinstance(a[0], str):
            author = a[0]

    desc = book.get("description", "") or ""
    # 简单清理 description 里的多余空白
    desc = re.sub(r"\s+", " ", desc).strip()

    isbn = book.get("isbn", "") or ""

    image = book.get("image", "") or ""
    # image 有时是 list
    if isinstance(image, list) and image:
        image = image[0]

    rating_value = ""
    rating_count = ""
    ar = book.get("aggregateRating")
    if isinstance(ar, dict):
        rating_value = str(ar.get("ratingValue", "") or "")
        rating_count = str(ar.get("ratingCount", "") or "")

    return {
        "title": title,
        "author": author,
        "rating": rating_value,
        "rating_count": rating_count,
        "description": desc,
        "isbn": isbn,
        "image": image,
        "url": url,
    }

def main():
    # 你可以先用 The Hunger Games 这本书的详情页 URL（从榜单点进去复制）
    test_url = "https://www.goodreads.com/book/show/2767052-the-hunger-games"
    html = fetch_html(test_url)

    blocks = extract_jsonld_blocks(html)
    if not blocks:
        raise RuntimeError("没有找到 JSON-LD script（application/ld+json）")

    book_block = pick_book_jsonld(blocks)
    if not book_block:
        raise RuntimeError("找到 JSON-LD，但没定位到 Book 的那一块")

    data = parse_book_from_jsonld(book_block)

    print("\n=== Parsed JSON-LD fields ===")
    for k, v in data.items():
        if k == "description":
            print(f"{k}: {v[:160]}{'...' if len(v) > 160 else ''}")
        else:
            print(f"{k}: {v}")

if __name__ == "__main__":
    main()


=== Parsed JSON-LD fields ===
title: The Hunger Games (The Hunger Games, #1)
author: Suzanne Collins
rating: 4.35
rating_count: 9943052
description: 
isbn: 9780439023481
image: https://m.media-amazon.com/images/S/compressed.photo.goodreads.com/books/1586722975i/2767052.jpg
url: 


In [8]:
import json
import re
import requests
from bs4 import BeautifulSoup

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0 Safari/537.36"
    )
}

YEAR_RE = re.compile(r"\b(18|19|20)\d{2}\b")
PAGES_RE = re.compile(r"(\d+)\s+pages", re.IGNORECASE)

def fetch_html(url: str) -> str:
    r = requests.get(url, headers=HEADERS, timeout=25)
    r.raise_for_status()
    return r.text

def extract_jsonld_blocks(soup: BeautifulSoup) -> list[dict]:
    blocks = []
    for tag in soup.select('script[type="application/ld+json"]'):
        raw = tag.get_text(strip=True)
        if not raw:
            continue
        try:
            data = json.loads(raw)
            if isinstance(data, list):
                blocks.extend([x for x in data if isinstance(x, dict)])
            elif isinstance(data, dict):
                blocks.append(data)
        except json.JSONDecodeError:
            continue
    return blocks

def pick_book_jsonld(blocks: list[dict]) -> dict | None:
    for b in blocks:
        t = b.get("@type")
        if isinstance(t, str) and t.lower() == "book":
            return b
    for b in blocks:
        if "aggregateRating" in b and "author" in b and ("name" in b or "@id" in b):
            return b
    return None

def parse_from_jsonld(book: dict) -> dict:
    title = book.get("name", "") or ""

    author = ""
    a = book.get("author")
    if isinstance(a, dict):
        author = a.get("name", "") or ""
    elif isinstance(a, list) and a:
        if isinstance(a[0], dict):
            author = a[0].get("name", "") or ""
        elif isinstance(a[0], str):
            author = a[0]

    desc = book.get("description", "") or ""
    desc = re.sub(r"\s+", " ", desc).strip()

    isbn = book.get("isbn", "") or ""

    image = book.get("image", "") or ""
    if isinstance(image, list) and image:
        image = image[0]

    rating_value = ""
    rating_count = ""
    ar = book.get("aggregateRating")
    if isinstance(ar, dict):
        rating_value = str(ar.get("ratingValue", "") or "")
        rating_count = str(ar.get("ratingCount", "") or "")

    # 有些 JSON-LD 没有 url，但可能有 @id
    url = book.get("url") or book.get("@id") or ""

    return {
        "title": title,
        "author": author,
        "rating": rating_value,
        "rating_count": rating_count,
        "description": desc,
        "isbn": isbn,
        "image": image,
        "url_from_jsonld": url,
    }

def extract_description_html(soup: BeautifulSoup) -> str:
    """
    JSON-LD 没有 description 时，从页面描述区块兜底。
    Goodreads 老页面常见 #description，新页面可能是 data-testid。
    """
    # 1) 老页面
    node = soup.select_one("#description")
    if node:
        text = node.get_text(" ", strip=True)
        text = re.sub(r"\s+", " ", text).strip()
        return text

    # 2) 新页面（兜底：找包含 “Description” 的区域可能不稳定，这里尽量温和）
    for cand in soup.select("[data-testid]"):
        if cand.get("data-testid", "").lower() in {"description"}:
            text = cand.get_text(" ", strip=True)
            text = re.sub(r"\s+", " ", text).strip()
            if text:
                return text

    return ""

def extract_pages(soup: BeautifulSoup) -> int | None:
    """
    从页面里找类似 “374 pages” 的文本并提取数字。
    """
    text = soup.get_text(" ", strip=True)
    m = PAGES_RE.search(text)
    if not m:
        return None
    try:
        return int(m.group(1))
    except ValueError:
        return None

def extract_published_year(soup: BeautifulSoup) -> int | None:
    """
    你截图里的 TruncatedContent_text--small 里有：
    'October 14, 2008 by Scholastic Press'
    我们提取其中的年份。
    """
    # 先精准抓 TruncatedContent 的小文本
    node = soup.select_one(".TruncatedContent_text--small")
    if node:
        s = node.get_text(" ", strip=True)
        m = YEAR_RE.search(s)
        if m:
            return int(m.group(0))

    # 兜底：整页搜年份，但可能误命中（所以优先用上面的精准方式）
    text = soup.get_text(" ", strip=True)
    m = YEAR_RE.search(text)
    if m:
        return int(m.group(0))
    return None

def extract_language(soup: BeautifulSoup) -> str:
    """
    Language 在 TruncatedContent_text--small 里，
    且通常是一个单词（English / Spanish / German）
    """
    for div in soup.select("div.TruncatedContent_text--small"):
        text = div.get_text(strip=True)
        # 语言通常是纯字母，且长度合理
        if text.isalpha() and 3 <= len(text) <= 15:
            return text
    return ""

def extract_genres(soup: BeautifulSoup, topk: int = 5) -> list[str]:
    """
    Goodreads 新页面：
    Genre 是 a.Button.Button--tag
    文本在 span.Button__labelItem
    """
    genres = []
    for span in soup.select("a.Button.Button--tag span.Button__labelItem"):
        g = span.get_text(strip=True)
        if g:
            genres.append(g)

    # 去重，保持顺序
    genres = list(dict.fromkeys(genres))
    return genres[:topk]

def parse_full_book(book_url: str) -> dict:
    html = fetch_html(book_url)
    soup = BeautifulSoup(html, "lxml")

    blocks = extract_jsonld_blocks(soup)
    book_json = pick_book_jsonld(blocks) if blocks else None
    base = parse_from_jsonld(book_json) if book_json else {}

    # 你的最终 Link to the book：直接用请求的 URL，最可靠
    base["book_url"] = book_url

    # Description 兜底
    if not base.get("description"):
        base["description"] = extract_description_html(soup)

    # Pages / Year / Language / Genres
    base["pages"] = extract_pages(soup)
    base["published_year"] = extract_published_year(soup)
    base["language"] = extract_language(soup)
    base["genres"] = extract_genres(soup, topk=5)

    return base

def main():
    test_url = "https://www.goodreads.com/book/show/2767052-the-hunger-games"
    data = parse_full_book(test_url)

    print("\n=== Full parsed fields ===")
    for k, v in data.items():
        if k == "description":
            print(f"{k}: {v[:200]}{'...' if v and len(v) > 200 else ''}")
        else:
            print(f"{k}: {v}")

if __name__ == "__main__":
    main()


=== Full parsed fields ===
title: The Hunger Games (The Hunger Games, #1)
author: Suzanne Collins
rating: 4.35
rating_count: 9943052
description: Winning means fame and fortune. Losing means certain death. The Hunger Games have begun. . . . In the ruins of a place once known as North America lies the nation of Panem, a shining Capitol surrounde...
isbn: 9780439023481
image: https://m.media-amazon.com/images/S/compressed.photo.goodreads.com/books/1586722975i/2767052.jpg
url_from_jsonld: 
book_url: https://www.goodreads.com/book/show/2767052-the-hunger-games
pages: 374
published_year: 2008
language: 
genres: ['Young Adult', 'Dystopia', 'Fiction', 'Fantasy', 'Science Fiction']
