In [49]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

base_url = "https://books.toscrape.com/catalogue/"
home_url = "https://books.toscrape.com/"

books_data = []

def get_soup(url):
    response = requests.get(url)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")

def get_book_details(book_url, category):
    soup = get_soup(book_url)
    table = soup.find("table", class_="table table-striped")
    tds = table.find_all("td")

    # Ambil detail dari tabel produk
    code = tds[0].text.strip()
    price_excl_tax = tds[2].text.strip()
    price_incl_tax = tds[3].text.strip()
    tax = tds[4].text.strip()
    number_of_reviews = tds[6].text.strip()

    # Deskripsi
    desc_tag = soup.find("meta", {"name": "description"})
    description = desc_tag["content"].strip() if desc_tag else ""

    # Judul
    title = soup.find("div", class_="product_main").h1.text.strip()

    # Rating
    rating_tag = soup.find("p", class_="star-rating")
    rating = rating_tag["class"][1] if rating_tag else "None"

    # Harga, stok, cover
    stock_info = soup.find("p", class_="instock availability").text.strip()
    number_stock = re.search(r'\d+', stock_info)
    number_stock = int(number_stock.group()) if number_stock else 0
    stock_status = "In stock" if number_stock > 0 else "Out of stock"

    cover_url = soup.find("div", class_="item active").img["src"].replace("../../", home_url)

    books_data.append({
        "category": category,
        "code": code,
        "cover": cover_url,
        "title": title,
        "rating": rating,
        "price (excl. tax)": price_excl_tax,
        "price (incl. tax)": price_incl_tax,
        "tax": tax,
        "stock status": stock_status,
        "number of stock available": number_stock,
        "description": description,
        "number of reviews": number_of_reviews
    })

def scrape_books():
    category_soup = get_soup(home_url)
    categories = category_soup.select(".side_categories ul li ul li a")

    for cat in categories:
        category_name = cat.text.strip()
        category_link = home_url + cat["href"]

        while True:
            soup = get_soup(category_link)
            books = soup.select("article.product_pod h3 a")

            for book in books:
                book_link = base_url + book["href"].replace("../../../", "")
                try:
                    get_book_details(book_link, category_name)
                except Exception as e:
                    print(f"Error scraping {book_link}: {e}")
                time.sleep(0.1)

                # Batasi sampai 1000 buku
                if len(books_data) >= 1000:
                    return

            # Cek apakah ada halaman berikutnya
            next_page = soup.select_one("li.next a")
            if next_page:
                category_link = "/".join(category_link.split("/")[:-1]) + "/" + next_page["href"]
            else:
                break

scrape_books()

# Konversi ke DataFrame
df = pd.DataFrame(books_data)
print(df.head())
print(f"\nTotal buku terkumpul: {len(df)}")

# Simpan ke file CSV jika diinginkan
# df.to_csv("books_toscrape.csv", index=False)

  category              code  \
0   Travel  a22124811bfa8350   
1   Travel  ce60436f52c5ee68   
2   Travel  f9705c362f070608   
3   Travel  1809259a5a5f1d8d   
4   Travel  a94350ee74deaa07   

                                               cover  \
0  https://books.toscrape.com/media/cache/6d/41/6...   
1  https://books.toscrape.com/media/cache/fe/8a/f...   
2  https://books.toscrape.com/media/cache/c7/1a/c...   
3  https://books.toscrape.com/media/cache/ca/30/c...   
4  https://books.toscrape.com/media/cache/45/21/4...   

                                               title rating price (excl. tax)  \
0                            It's Only the Himalayas    Two           Â£45.17   
1  Full Moon over Noahâs Ark: An Odyssey to Mou...   Four           Â£49.43   
2  See America: A Celebration of Our National Par...  Three           Â£48.87   
3  Vagabonding: An Uncommon Guide to the Art of L...    Two           Â£36.94   
4                               Under the Tuscan Sun  Three      