In [None]:
import os
import re
import sqlite3
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
def _fetch_soup(session, url, timeout):
    """Download ``url`` and return it parsed, or ``None`` if the request fails.

    A failure is either a network-level error raised by requests or any
    HTTP status other than 200.
    """
    try:
        res = session.get(url, timeout=timeout)
    except requests.RequestException:
        return None
    if res.status_code != 200:
        return None
    # Hand BeautifulSoup the raw bytes so it detects the encoding from the
    # document itself; ``res.text`` depends on requests' header-based guess,
    # which can garble non-ASCII titles when the server sends no charset.
    return BeautifulSoup(res.content, 'html.parser')


def scrape_books(pages=50, output_dir="../data_base", timeout=30):
    """Scrape the books.toscrape.com catalogue and persist it as CSV and SQLite.

    For every listing page the title, price, availability and rating of each
    book are read from the listing itself; the category and cover image URL
    are read from the book's own detail page.

    Args:
        pages: Number of listing pages to visit, starting at page 1.
        output_dir: Directory that receives ``books.csv`` and ``books.db``.
            It is created when missing.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        None. Results are written to ``<output_dir>/books.csv`` and to the
        ``books`` table of ``<output_dir>/books.db`` (the table is replaced).

    Notes:
        Failures are best-effort: an unreachable listing page is skipped, and
        an unreachable detail page leaves ``category`` and ``image_url`` as
        ``None`` for that book. If nothing was scraped, no file is written so
        that previously saved data is not overwritten with an empty table.
    """
    base_url = "http://books.toscrape.com/catalogue/page-{}.html"
    books = []

    # One session for the whole crawl so connections are reused across the
    # listing-page and detail-page requests.
    with requests.Session() as session:
        for page in range(1, pages + 1):
            page_url = base_url.format(page)
            soup = _fetch_soup(session, page_url, timeout)
            if soup is None:
                print(f"Falha ao acessar a página {page}")
                continue

            for book in soup.find_all("article", class_="product_pod"):
                title = book.h3.a['title']
                price_text = book.find("p", class_="price_color").text
                # Keep only digits and the decimal point (drops the currency symbol).
                price = float(re.sub(r'[^\d.]', '', price_text))
                availability = book.find("p", class_="instock availability").text.strip()
                # The first <p> carries classes like ["star-rating", "Three"].
                rating = book.p['class'][1]

                # Detail links are relative to the listing page they appear on.
                book_url = urljoin(page_url, book.h3.a['href'])

                category = None
                image_url = None
                book_soup = _fetch_soup(session, book_url, timeout)
                if book_soup is None:
                    print(f"Falha ao acessar a página do livro: {title}")
                else:
                    # The last breadcrumb link is taken as the category.
                    crumbs = book_soup.select("ul.breadcrumb li a")
                    if crumbs:
                        category = crumbs[-1].text.strip()
                    image = book_soup.select_one("div.item.active img")
                    if image is not None and image.get('src'):
                        # Resolves the "../../media/..." path against the detail page URL.
                        image_url = urljoin(book_url, image['src'])

                books.append({
                    "id": len(books) + 1,
                    "title": title,
                    "Sigla_Moeda": "Libra",
                    "Simbolo_Moeda": "£",
                    "price": price,
                    "availability": availability,
                    "rating": rating,
                    "category": category,
                    "image_url": image_url
                })

    if not books:
        print("Nenhum livro coletado; nada foi salvo.")
        return

    df = pd.DataFrame(books)

    # Create the directory that is actually written to.
    os.makedirs(output_dir, exist_ok=True)
    csv_path = os.path.join(output_dir, "books.csv")
    db_path = os.path.join(output_dir, "books.db")

    df.to_csv(csv_path, index=False)
    print(f"Dados salvos em {csv_path}")

    conn = sqlite3.connect(db_path)
    try:
        df.to_sql("books", conn, if_exists="replace", index=False)
    finally:
        # Release the database file even when the write fails.
        conn.close()
    print(f"Dados salvos em {db_path} na tabela 'books'.")

# Run the full 50-page scrape when this cell/script is executed directly.
if __name__ == "__main__":
    scrape_books(pages=50)

Dados salvos em data/books.csv
Dados salvos em data/books.db na tabela 'books'.


In [3]:
import sqlite3

# Open the database file that scrape_books() writes ("../data_base/books.db");
# the previous "../data/books.db" path pointed at a different directory.
conn = sqlite3.connect("../data_base/books.db")
try:
    df = pd.read_sql("SELECT * FROM books", conn)
finally:
    # Close the connection even if the query fails (e.g. missing table).
    conn.close()
print(df.head())


   id                                  title Sigla_Moeda Simbolo_Moeda  price  \
0   1                   A Light in the Attic       Libra             £  51.77   
1   2                     Tipping the Velvet       Libra             £  53.74   
2   3                             Soumission       Libra             £  50.10   
3   4                          Sharp Objects       Libra             £  47.82   
4   5  Sapiens: A Brief History of Humankind       Libra             £  54.23   

  availability rating            category  \
0     In stock  Three              Poetry   
1     In stock    One  Historical Fiction   
2     In stock    One             Fiction   
3     In stock   Four             Mystery   
4     In stock   Five             History   

                                           image_url  
0  http://books.toscrape.com/media/cache/fe/72/fe...  
1  http://books.toscrape.com/media/cache/08/e9/08...  
2  http://books.toscrape.com/media/cache/ee/cf/ee...  
3  http://books.toscra