In [None]:
# %% [markdown]
# # Web scraping demo: Books to Scrape
# https://books.toscrape.com/
# - Scrapes title, price, rating, availability, and product URL
# - Follows pagination (Next →)
# - Optional: scrape a specific category

# %%
import time, re
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
import pandas as pd
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Root URL of the demo site; scrape_books() resolves its default listing path against it.
BASE = "https://books.toscrape.com/"

def make_session():
    """Build a ``requests.Session`` with a browser-like User-Agent and GET retries.

    Returns:
        A session whose ``http://`` and ``https://`` adapters are configured
        with a ``Retry`` policy of 5 total attempts, backoff factor 0.4,
        covering GET requests that answer with 429/500/502/503/504.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.4,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
    )
    user_agent = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome Safari"
    )

    session = requests.Session()
    session.headers.update({"User-Agent": user_agent})
    # One adapter per scheme, both sharing the same retry policy object.
    for scheme in ("http://", "https://"):
        session.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return session

def clean_price(txt):
    """Extract the first number found in a price string as a float.

    Commas are stripped before matching, so ``'£1,234.50'`` yields ``1234.5``.

    Args:
        txt: Raw price text such as ``'£51.77'``; may be ``None`` or empty.

    Returns:
        The parsed price, or ``None`` when *txt* is falsy or contains no digits.
    """
    if not txt:
        return None

    without_commas = txt.replace(",", "")
    match = re.search(r"(\d+(?:\.\d+)?)", without_commas)
    if match is None:
        return None
    return float(match.group(1))

def get_soup(session, url, sleep=0.2, timeout=15):
    """Fetch *url* and return its HTML parsed with BeautifulSoup.

    Args:
        session: ``requests.Session`` (or compatible object) used for the GET.
        url: URL to download.
        sleep: Seconds to pause after a successful request (politeness delay).
        timeout: Timeout in seconds passed to ``session.get``.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the response status is 4xx/5xx
            (from ``raise_for_status``).
    """
    resp = session.get(url, timeout=timeout)
    resp.raise_for_status()
    time.sleep(sleep)  # be polite

    # When the Content-Type header carries no charset, requests falls back to
    # ISO-8859-1 for text/* responses, so ``resp.text`` turns UTF-8 pages into
    # mojibake (e.g. '£' becomes 'Â£'). In that case hand BeautifulSoup the
    # raw bytes and let it detect the encoding from the document itself.
    content_type = resp.headers.get("Content-Type", "")
    if "charset" in content_type.lower():
        markup = resp.text
    else:
        markup = resp.content
    return BeautifulSoup(markup, "html.parser")

def parse_book_card(card, page_url):
    """Turn one ``<article class="product_pod">`` element into a flat record.

    Args:
        card: Parsed element for a single book on a listing page.
        page_url: URL of the listing page; the relative product link is
            resolved against it.

    Returns:
        Dict with keys ``title``, ``price``, ``rating``, ``availability``,
        ``product_url`` and ``page_url``. A field is ``None`` when the
        element it is read from is missing from the card.
    """
    # Title and product link both come from the anchor inside the <h3>.
    link = card.select_one("h3 a")
    title = None
    rel_link = None
    if link:
        title = link.get("title")
        rel_link = link.get("href")
    product_url = urljoin(page_url, rel_link) if rel_link else None

    price_el = card.select_one(".price_color")
    price_text = price_el.get_text(strip=True) if price_el else None
    price = clean_price(price_text)

    # The rating is encoded as a CSS class, e.g. <p class="star-rating Three">.
    rating = None
    star_el = card.select_one(".star-rating")
    if star_el:
        rating_words = [
            name
            for name in star_el.get("class", [])
            if name in {"One", "Two", "Three", "Four", "Five"}
        ]
        if rating_words:
            # If several rating words are present, the last one wins.
            rating = rating_words[-1]

    avail_el = card.select_one(".availability")
    availability = avail_el.get_text(strip=True) if avail_el else None

    return {
        "title": title,
        "price": price,
        "rating": rating,
        "availability": availability,
        "product_url": product_url,
        "page_url": page_url,
    }

def iter_pages(start_url, session, limit_pages=None):
    """Yield ``(soup, url)`` for each listing page, following 'next' links.

    Args:
        start_url: URL of the first page to fetch.
        session: Session passed through to ``get_soup``.
        limit_pages: Maximum number of pages to yield; a falsy value
            (``None`` or 0) means no limit.

    Yields:
        Tuples of the parsed page and the URL it was fetched from.

    Pagination ends when the page has no ``li.next a`` element, when that
    anchor has no ``href``, when the next URL was already visited, or when
    ``limit_pages`` pages have been yielded.
    """
    url = start_url
    visited = set()
    pages = 0
    # The visited check stops a self-referential or cyclic 'next' link from
    # making the generator fetch the same pages forever.
    while url and url not in visited:
        visited.add(url)
        soup = get_soup(session, url)
        yield soup, url
        pages += 1
        if limit_pages and pages >= limit_pages:
            break
        next_link = soup.select_one("li.next a")
        # urljoin() returns the base URL unchanged for an empty/None href,
        # which would point back at the current page; treat it as "no next".
        href = next_link.get("href") if next_link else None
        url = urljoin(url, href) if href else None

def scrape_books(category_url=None, limit_pages=None):
    """Scrape all books from the main listing or a specific category.

    Args:
        category_url: Optional start URL of a category listing, e.g.
            'https://books.toscrape.com/catalogue/category/books/travel_2/index.html'.
            When omitted, scraping starts at the main listing's first page.
        limit_pages: Maximum number of listing pages to visit; ``None``
            follows pagination to the end.

    Returns:
        A ``pandas.DataFrame`` with one row per book card, built from the
        dicts returned by ``parse_book_card``.

    A progress line is printed for every page scraped.
    """
    start = category_url or urljoin(BASE, "catalogue/page-1.html")  # main listing
    rows = []

    # Use the session as a context manager so its pooled connections are
    # released even if a request raises part-way through the crawl.
    with make_session() as session:
        for soup, page_url in iter_pages(start, session, limit_pages=limit_pages):
            cards = soup.select("article.product_pod")
            rows.extend(parse_book_card(card, page_url) for card in cards)
            print(f"[info] {page_url} -> +{len(cards)} items (total {len(rows)})")

    return pd.DataFrame(rows)

# %% Run: scrape first 2 pages of all books (quick classroom run)
# Limit to two listing pages so the demo finishes quickly.
df_demo = scrape_books(limit_pages=2)
# Final expression of the cell: a (preview rows, shape) tuple. A notebook shows
# it as the cell output; run as a plain script it is evaluated and discarded.
df_demo.head(), df_demo.shape

# %% Optional: scrape full site (all pages). Uncomment the lines below if you want the full run.
# df_all = scrape_books(limit_pages=None)
# df_all.to_csv("books_all.csv", index=False)
# print("Saved books_all.csv with", len(df_all), "rows")

# %% Optional: pick a category
# How to find a category: browse homepage → click a category → copy its URL
# Example (Travel):
# cat_url = "https://books.toscrape.com/catalogue/category/books/travel_2/index.html"
# df_travel = scrape_books(category_url=cat_url, limit_pages=None)
# df_travel.to_csv("books_travel.csv", index=False)
