In [None]:
!pip install requests
!pip install beautifulsoup4
!pip install pandas


In [None]:
from pprint import pprint
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup


# Scrape Quotes

Scrape the quotes on https://quotes.toscrape.com/

In [None]:
# Function 1: Get page content
def get_page_content(url, timeout=10):
    """Fetches the HTML content of a URL.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may wait indefinitely on an
            unresponsive host.

    Returns:
        The response body as text when the server answers with status 200,
        otherwise ``None`` (a failure message is printed in that case).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Connection problems and timeouts are reported the same way as a
        # non-200 status so callers only ever have to handle ``None``.
        print(f"Failed to retrieve content from {url}: {error}")
        return None

    if response.status_code == 200:
        return response.text

    print(f"Failed to retrieve content from {url}")
    return None


In [None]:
# Test Function 1
test_url = "https://quotes.toscrape.com/page/1/"
content = get_page_content(test_url)
# get_page_content returns None on failure; stop here with a clear message
# instead of letting the slice below raise an opaque TypeError.
assert content is not None, f"Could not download {test_url}"
print(content[:500])  # Print the first 500 characters to check

In [None]:

# Function 2: Parse quotes from page content
def parse_quotes_from_page(content):
    """Extract every quote found in a page's HTML.

    Each ``<div class="quote">`` element yields one dictionary with the keys
    ``text``, ``author`` and ``tags`` (a list of tag strings). The
    dictionaries are returned in document order.
    """
    page = BeautifulSoup(content, "html.parser")

    def to_record(block):
        # One quote block -> one plain dictionary.
        return {
            "text": block.find("span", class_="text").get_text(),
            "author": block.find("small", class_="author").get_text(),
            "tags": [link.get_text() for link in block.find_all("a", class_="tag")],
        }

    return [to_record(block) for block in page.find_all("div", class_="quote")]

In [None]:
# Function 3: Scrape one page based on the page number
def scrape_one_page(page_num):
    """Download and parse the quotes page with the given number.

    Returns the list produced by ``parse_quotes_from_page``, or an empty list
    when the page could not be downloaded.
    """
    html = get_page_content(f"https://quotes.toscrape.com/page/{page_num}/")

    if html is not None:
        parsed = parse_quotes_from_page(html)
        print(f"Scraped page {page_num}")
        return parsed

    return []  # Nothing to parse when the download failed


In [None]:
# Test Function 2: parse the HTML held in `content` (set by the Function 1 test cell)
test_quotes = parse_quotes_from_page(content)
pprint(test_quotes[:2])  # Print the first 2 quotes to verify parsing


In [None]:

# Test Function 3: fetch and parse page 1 in a single call
page_num = 1
quotes_on_page_1 = scrape_one_page(page_num)
print(f"Total quotes on page {page_num}: {len(quotes_on_page_1)}")
pprint(quotes_on_page_1[:2])  # Display the first 2 quotes for verification

In [None]:
# Repeat the Function 3 check on page 2
quotes_on_page_2 = scrape_one_page(2)
print(f"Total quotes on page 2: {len(quotes_on_page_2)}")
pprint(quotes_on_page_2[:2])  # Display the first 2 quotes for verification

# Scrape books

Scrape books on https://books.toscrape.com/

In [None]:
# Fetch the books landing page with get_page_content (Function 1).
# NOTE: this rebinds `content`, replacing the quotes page HTML stored earlier.
books_url = "https://books.toscrape.com"
content = get_page_content(books_url)
# print(content[:500])  # Uncomment to print the first 500 characters as a check


In [None]:
# Function 2: Parse books from page content and get URLs
def parse_books_from_page(content, page_url="https://books.toscrape.com/"):
    """Parses book titles, prices, availability, ratings, and URLs from the page content.

    Args:
        content: HTML of a book listing page.
        page_url: URL the HTML was downloaded from. The book links in the
            listing are relative, so they have to be resolved against the
            page they came from. The default is the site root, which gives
            the same URLs as the previous hard-coded prefix for pages served
            from the root.

    Returns:
        A list with one dictionary per ``<article class="product_pod">``
        element, holding the keys ``title``, ``price``, ``availability``,
        ``rating`` and ``url``.
    """
    soup = BeautifulSoup(content, "html.parser")
    books_data = []

    for book in soup.find_all("article", class_="product_pod"):
        link = book.h3.a

        books_data.append({
            "title": link["title"],
            "price": book.find("p", class_="price_color").get_text().strip(),
            "availability": book.find("p", class_="instock availability").get_text().strip(),
            # The rating is taken from the second CSS class of the star-rating element.
            "rating": book.find("p", class_="star-rating")["class"][1],
            # urljoin resolves the relative href (including any "../"
            # segments) against the page URL instead of blindly
            # concatenating it onto the site root.
            "url": urljoin(page_url, link["href"]),
        })

    return books_data


In [None]:
# Test Function 2 (books): parse the landing page HTML held in `content`
test_books = parse_books_from_page(content)
pprint(test_books[:2])  # Print the first 2 books to verify parsing
print(f"=== Total books on page: {len(test_books)}")

In [None]:
# Prepare input for the Function 4 test: download the first book's detail page
book1_url = test_books[0]["url"]  # Take the URL of the first book from the previous test
book1_content = get_page_content(book1_url)  # None if the download failed


In [None]:
# Function 4: Scrape book details from the detail page
def scrape_book_details(content, include_description=False):
    """Scrapes detailed information from a book's detail page.

    Args:
        content: HTML of a book detail page, or ``None`` when the download
            failed.
        include_description: When true, also return the page's
            ``<meta name="description">`` text under the key
            ``"description"``. Off by default, so the returned keys match
            what this function has always produced.

    Returns:
        ``None`` when ``content`` is ``None``. Otherwise a dictionary with
        the keys ``upc``, ``product_type``, ``price_excl_tax``,
        ``price_incl_tax``, ``tax``, ``availability`` and ``num_reviews``
        (plus ``description`` if requested). A field whose table row is not
        present on the page is ``None`` rather than raising, so one
        incomplete page does not abort a scraping loop.
    """
    if content is None:
        return None

    soup = BeautifulSoup(content, "html.parser")

    def table_value(label):
        """Return the text of the <td> following the <th> whose text is ``label``."""
        header = soup.find("th", string=label)
        cell = header.find_next_sibling("td") if header is not None else None
        return cell.get_text() if cell is not None else None

    # Output key -> header text of the corresponding table row.
    field_labels = {
        "upc": "UPC",
        "product_type": "Product Type",
        "price_excl_tax": "Price (excl. tax)",
        "price_incl_tax": "Price (incl. tax)",
        "tax": "Tax",
        "availability": "Availability",
        "num_reviews": "Number of reviews",
    }
    book_details = {key: table_value(label) for key, label in field_labels.items()}

    if include_description:
        # Only look the description up when asked for: previously it was
        # always extracted (and could raise on a page without the meta tag)
        # even though the value was never returned.
        meta = soup.find("meta", {"name": "description"})
        text = meta.get("content") if meta is not None else None
        book_details["description"] = text.strip() if text is not None else None

    return book_details

In [None]:
# Test Function 4: parse the detail page downloaded for the first book
book1_detail = scrape_book_details(book1_content)
book1_detail  # Display the details of the book

In [None]:
# Collect the detail records of the first 5 books from the listing.
detailed_books = []
for book in test_books[:5]:
    print("scraping book", book["url"])
    # Use a dedicated name here: `content` is a notebook-level variable that
    # the earlier parsing test cells read, and overwriting it with a detail
    # page would make those cells parse the wrong HTML when re-run.
    book_content = get_page_content(book["url"])
    # scrape_book_details returns None for a failed download; the entry is
    # still appended so positions line up with test_books[:5].
    detail = scrape_book_details(book_content)
    detailed_books.append(detail)

In [None]:
# Inspect the records collected by the loop over test_books[:5]
detailed_books