In [None]:
!pip install requests beautifulsoup4 pandas




In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from pprint import pprint


# Scape Quotes

Scrape the quotes on https://quotes.toscrape.com/

In [11]:
# Function 1: Get page content
def get_page_content(url):
    """Fetches the HTML content of a URL."""
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Failed to retrieve content from {url}")
        return None


In [None]:
# Test Function 1
test_url = "https://quotes.toscrape.com/page/1/"
content1 = get_page_content(test_url)
print(content1[:500])  # Print the first 500 characters to check

<!DOCTYPE html>
<html lang="en">
<head>
	<meta charset="UTF-8">
	<title>Quotes to Scrape</title>
    <link rel="stylesheet" href="/static/bootstrap.min.css">
    <link rel="stylesheet" href="/static/main.css">
    
    
</head>
<body>
    <div class="container">
        <div class="row header-box">
            <div class="col-md-8">
                <h1>
                    <a href="/" style="text-decoration: none">Quotes to Scrape</a>
                </h1>
            </div>
            <div cla


In [13]:

# Function 2: Parse quotes from page content
def parse_quotes_from_page(content):
    """Parses quotes, authors, and tags from the page content."""
    soup = BeautifulSoup(content, "html.parser")
    quotes_data = []

    quotes = soup.find_all("div", class_="quote")
    for quote in quotes:
        text = quote.find("span", class_="text").get_text()
        author = quote.find("small", class_="author").get_text()
        tags = [tag.get_text() for tag in quote.find_all("a", class_="tag")]
        
        quotes_data.append({
            "text": text,
            "author": author,
            "tags": tags
        })

    return quotes_data

In [14]:
# Function 3: Scrape one page based on the page number
def scrape_one_page(page_num):
    """Scrapes quotes from a single page based on the page number."""
    page_url = f"https://quotes.toscrape.com/page/{page_num}/"
    content = get_page_content(page_url)
    
    if content is None:
        return []  # Return an empty list if the page couldn't be retrieved
    
    quotes = parse_quotes_from_page(content)
    print(f"Scraped page {page_num}")
    return quotes


In [None]:
# Test Function 2
test_quotes = parse_quotes_from_page(content1)
pprint(test_quotes[:2])  # Print the first 2 quotes to verify parsing


[{'author': 'Albert Einstein',
  'tags': ['change', 'deep-thoughts', 'thinking', 'world'],
  'text': '“The world as we have created it is a process of our thinking. It '
          'cannot be changed without changing our thinking.”'},
 {'author': 'J.K. Rowling',
  'tags': ['abilities', 'choices'],
  'text': '“It is our choices, Harry, that show what we truly are, far more '
          'than our abilities.”'}]


In [16]:

# Test Function 3
page_num = 1
quotes_on_page_1 = scrape_one_page(page_num)
print(f"Total quotes on page {page_num}: {len(quotes_on_page_1)}")
pprint(quotes_on_page_1[:2])  # Display the first 3 quotes for verification

Scraped page 1
Total quotes on page 1: 10
[{'author': 'Albert Einstein',
  'tags': ['change', 'deep-thoughts', 'thinking', 'world'],
  'text': '“The world as we have created it is a process of our thinking. It '
          'cannot be changed without changing our thinking.”'},
 {'author': 'J.K. Rowling',
  'tags': ['abilities', 'choices'],
  'text': '“It is our choices, Harry, that show what we truly are, far more '
          'than our abilities.”'}]


In [17]:
quotes_on_page_2 = scrape_one_page(2)
print(f"Total quotes on page 2: {len(quotes_on_page_2)}")
pprint(quotes_on_page_2[:2])  # Display the first 3 quotes for verification

Scraped page 2
Total quotes on page 2: 10
[{'author': 'Marilyn Monroe',
  'tags': ['friends', 'heartbreak', 'inspirational', 'life', 'love', 'sisters'],
  'text': "“This life is what you make it. No matter what, you're going to "
          "mess up sometimes, it's a universal truth. But the good part is you "
          "get to decide how you're going to mess it up. Girls will be your "
          "friends - they'll act like it anyway. But just remember, some come, "
          "some go. The ones that stay with you through everything - they're "
          "your true best friends. Don't let go of them. Also remember, "
          'sisters make the best friends in the world. As for lovers, well, '
          "they'll come and go too. And baby, I hate to say it, most of them - "
          'actually pretty much all of them are going to break your heart, but '
          "you can't give up because if you give up, you'll never find your "
          "soulmate. You'll never find that half who makes 

# Scrape books

Scrape books on https://books.toscrape.com/

In [None]:
# Test Function 1
books_url = "https://books.toscrape.com"
content2 = get_page_content(books_url)
# print(content[:500])  # Print the first 500 characters to check


In [19]:
# Function 2: Parse books from page content and get URLs
def parse_books_from_page(content):
    """Parses book titles, prices, availability, ratings, and URLs from the page content."""
    soup = BeautifulSoup(content, "html.parser")
    books_data = []

    books = soup.find_all("article", class_="product_pod")
    for book in books:
        title = book.h3.a["title"]
        price = book.find("p", class_="price_color").get_text().strip()
        availability = book.find("p", class_="instock availability").get_text().strip()
        rating = book.find("p", class_="star-rating")["class"][1]
        
        # Get book URL (relative URL)
        relative_url = book.h3.a["href"]
        book_url = f"https://books.toscrape.com/{relative_url}"
        
        books_data.append({
            "title": title,
            "price": price,
            "availability": availability,
            "rating": rating,
            "url": book_url
        })

    return books_data


In [None]:
# Test Function 2
test_books = parse_books_from_page(content2)
pprint(test_books[:2])  # Print the first 3 books to verify parsing
print(f"=== Total books on page: {len(test_books)}")

[{'availability': 'In stock',
  'price': 'Â£51.77',
  'rating': 'Three',
  'title': 'A Light in the Attic',
  'url': 'https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html'},
 {'availability': 'In stock',
  'price': 'Â£53.74',
  'rating': 'One',
  'title': 'Tipping the Velvet',
  'url': 'https://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html'}]
=== Total books on page: 20


In [21]:
# Test Function 4
book1_url = test_books[0]["url"]  # Take the URL of the first book from the previous test
book1_content = get_page_content(book1_url)


In [22]:
# Function 4: Scrape book details from the detail page
def scrape_book_details(content):
    """Scrapes detailed information from a book's detail page."""
    # content = get_page_content(book_url)
    
    if content is None:
        return None
    
    soup = BeautifulSoup(content, "html.parser")
    
    # Extract details
    upc = soup.find("th", string="UPC").find_next_sibling("td").get_text()
    product_type = soup.find("th", string="Product Type").find_next_sibling("td").get_text()
    price_excl_tax = soup.find("th", string="Price (excl. tax)").find_next_sibling("td").get_text()
    price_incl_tax = soup.find("th", string="Price (incl. tax)").find_next_sibling("td").get_text()
    tax = soup.find("th", string="Tax").find_next_sibling("td").get_text()
    availability = soup.find("th", string="Availability").find_next_sibling("td").get_text()
    num_reviews = soup.find("th", string="Number of reviews").find_next_sibling("td").get_text()
    description = soup.find("meta", {"name": "description"})["content"].strip()
    
    book_details = {
        "upc": upc,
        "product_type": product_type,
        "price_excl_tax": price_excl_tax,
        "price_incl_tax": price_incl_tax,
        "tax": tax,
        "availability": availability,
        "num_reviews": num_reviews,
        # "description": description
    }
    
    return book_details

In [23]:
book1_detail = scrape_book_details(book1_content)
book1_detail  # Display the details of the book

{'upc': 'a897fe39b1053632',
 'product_type': 'Books',
 'price_excl_tax': 'Â£51.77',
 'price_incl_tax': 'Â£51.77',
 'tax': 'Â£0.00',
 'availability': 'In stock (22 available)',
 'num_reviews': '0'}

In [24]:
detailed_books = []
for book in test_books[:5]:
    print("scraping book", book["url"])
    content = get_page_content(book["url"])
    detail = scrape_book_details(content)
    detailed_books.append(detail)

scraping book https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html
scraping book https://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html
scraping book https://books.toscrape.com/catalogue/soumission_998/index.html
scraping book https://books.toscrape.com/catalogue/sharp-objects_997/index.html
scraping book https://books.toscrape.com/catalogue/sapiens-a-brief-history-of-humankind_996/index.html


In [25]:
detailed_books

[{'upc': 'a897fe39b1053632',
  'product_type': 'Books',
  'price_excl_tax': 'Â£51.77',
  'price_incl_tax': 'Â£51.77',
  'tax': 'Â£0.00',
  'availability': 'In stock (22 available)',
  'num_reviews': '0'},
 {'upc': '90fa61229261140a',
  'product_type': 'Books',
  'price_excl_tax': 'Â£53.74',
  'price_incl_tax': 'Â£53.74',
  'tax': 'Â£0.00',
  'availability': 'In stock (20 available)',
  'num_reviews': '0'},
 {'upc': '6957f44c3847a760',
  'product_type': 'Books',
  'price_excl_tax': 'Â£50.10',
  'price_incl_tax': 'Â£50.10',
  'tax': 'Â£0.00',
  'availability': 'In stock (20 available)',
  'num_reviews': '0'},
 {'upc': 'e00eb4fd7b871a48',
  'product_type': 'Books',
  'price_excl_tax': 'Â£47.82',
  'price_incl_tax': 'Â£47.82',
  'tax': 'Â£0.00',
  'availability': 'In stock (20 available)',
  'num_reviews': '0'},
 {'upc': '4165285e1663650f',
  'product_type': 'Books',
  'price_excl_tax': 'Â£54.23',
  'price_incl_tax': 'Â£54.23',
  'tax': 'Â£0.00',
  'availability': 'In stock (20 available)'