In [3]:
import requests

In [4]:
# Download the sample page. The timeout (seconds) keeps a stalled connection
# from blocking the notebook indefinitely; requests has no default timeout.
page = requests.get("https://dataquestio.github.io/web-scraping-pages/simple.html", timeout=10)

In [5]:
# Display the Response object; its repr shows the HTTP status code.
page

<Response [200]>

In [6]:
from bs4 import BeautifulSoup
# Parse the response body with Python's built-in HTML parser.
soup = BeautifulSoup(page.content, 'html.parser')

In [7]:
# List the top-level nodes of the parsed document.
list(soup.children)

['html',
 '\n',
 <html>
 <head>
 <title>A simple example page</title>
 </head>
 <body>
 <p>Here is some simple content for this page.</p>
 </body>
 </html>]

In [8]:
# Select the <html> element by tag name. Indexing list(soup.children)[2] only
# works while exactly two nodes (the doctype and a newline) precede the tag.
html = soup.html

In [9]:
# Display the <html> element selected above.
html

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>

In [10]:
# Locate the first <p> element, then pull out the text it contains
first_paragraph = soup.find('p')
content = first_paragraph.get_text()
print("Extracted Content:", content)

Extracted Content: Here is some simple content for this page.


In [11]:
# Persist the extracted paragraph text. The explicit encoding makes the file
# identical on every platform instead of depending on the locale default.
with open("extracted_data1.txt", "w", encoding="utf-8") as file:
    file.write(content)

## Scraping Specific Content from a Single Page

In [15]:
import requests
from bs4 import BeautifulSoup
import csv

# Send a GET request to the website.
# The timeout (seconds) keeps a stalled connection from hanging the notebook.
page = requests.get('https://quotes.toscrape.com', timeout=10)
# Fail loudly on an HTTP error instead of parsing an error page into zero quotes.
page.raise_for_status()

# Parse the HTML content
soup = BeautifulSoup(page.text, 'html.parser')

# Create a list to store quotes
quotes = []

# Each quote on the page lives in its own <div class="quote">
quote_elements = soup.find_all('div', class_='quote')

# Extract information from each quote element
for quote_element in quote_elements:
    # extract the text of the quote
    text = quote_element.find('span', class_='text').text

    # extract the author of the quote
    author = quote_element.find('small', class_='author').text

    # extract the tag <a> HTML elements related to the quote
    tag_elements = quote_element.select('.tags .tag')

    # collect the tag strings in a list
    tags = [tag_element.text for tag_element in tag_elements]

    # one record per quote; must stay inside the loop so every quote is kept
    quotes.append(
        {
            'text': text,
            'author': author,
            'tags': ', '.join(tags)  # merge the tags into a "A, B, ..., Z" string
        }
    )

# Print the scraped quotes - optional
for quote in quotes:
    print("Quote: ", quote['text'])
    print("Author: ", quote['author'])
    print("Tags: ", quote['tags'])
    print()

# Save quotes to a CSV file.
# newline='' lets the csv module control line endings itself.
with open('quotes.csv', 'w', encoding='utf-8', newline='') as csvfile:
    fieldnames = ['text', 'author', 'tags']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    # Write headers
    writer.writeheader()

    # Write quotes
    for quote in quotes:
        writer.writerow(quote)

print("Quotes have been saved to quotes.csv")

Quote:  “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
Author:  Albert Einstein
Tags:  change, deep-thoughts, thinking, world

Quote:  “It is our choices, Harry, that show what we truly are, far more than our abilities.”
Author:  J.K. Rowling
Tags:  abilities, choices

Quote:  “There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
Author:  Albert Einstein
Tags:  inspirational, life, live, miracle, miracles

Quote:  “The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”
Author:  Jane Austen
Tags:  aliteracy, books, classic, humor

Quote:  “Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”
Author:  Marilyn Monroe
Tags:  be-yourself, inspirational

Quote:  “Try not to become a man of success. Rather become a man of value.”
Author:  Albe

## Scraping Specific Content from Multiple Pages

In [16]:
import requests
from bs4 import BeautifulSoup
import csv

# Collect every quote found on one parsed page
def scrape_page(soup, quotes):
    """Append one record per quote block in ``soup`` to ``quotes``.

    Args:
        soup: Parsed page; searched for ``<div class="quote">`` elements.
        quotes: List that receives a dict with 'Text', 'Author' and 'Tags'
            keys for each quote found (mutated in place).
    """
    quote_blocks = soup.find_all('div', class_='quote')
    for block in quote_blocks:
        quote_text = block.find('span', class_='text').text
        author_name = block.find('small', class_='author').text

        # Tags are stored as a single "A, B, C" string
        tag_names = [link.text for link in block.find_all('a', class_='tag')]

        record = {
            'Text': quote_text,
            'Author': author_name,
            'Tags': ', '.join(tag_names),
        }
        quotes.append(record)

# Site root; scrape_all_pages() prepends it to each relative "next page" link.
base_url = 'https://quotes.toscrape.com'
# Sent with every request made by scrape_all_pages().
headers = {'User-Agent': 'Mozilla/5.0'}

# Module-level accumulator: scrape_all_pages() fills it and the CSV export reads it.
quotes = []

# Function to scrape quotes from multiple pages
def scrape_all_pages(url):
    """Follow the site's pagination from ``url`` and collect every quote.

    Each page that returns HTTP 200 is parsed and passed to ``scrape_page``,
    which appends to the module-level ``quotes`` list. The loop ends when a
    page has no ``<li class="next">`` element or when a request returns a
    non-200 status.

    Args:
        url: Absolute URL of the first page to fetch.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within the
            request timeout.
    """
    while url:
        print(f"Scraping: {url}")  # Progress tracker
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(url, headers=headers, timeout=10)

        # Stop at the first unsuccessful request
        if response.status_code != 200:
            print("Failed to retrieve the page.")
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        scrape_page(soup, quotes)

        # Look for the 'Next' button; its absence marks the last page
        next_page = soup.find('li', class_='next')
        url = base_url + next_page.find('a')['href'] if next_page else None

# Walk every page, starting from the site root
scrape_all_pages(base_url)

# Write the collected quotes to disk as CSV, one row per quote
csv_columns = ['Text', 'Author', 'Tags']
with open('quotes2.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
    writer.writeheader()
    for row in quotes:
        writer.writerow(row)

total_saved = len(quotes)
print(f"Finished! Saved {total_saved} quotes to quotes2.csv")

Scraping: https://quotes.toscrape.com
Scraping: https://quotes.toscrape.com/page/2/
Scraping: https://quotes.toscrape.com/page/3/
Scraping: https://quotes.toscrape.com/page/4/
Scraping: https://quotes.toscrape.com/page/5/
Scraping: https://quotes.toscrape.com/page/6/
Scraping: https://quotes.toscrape.com/page/7/
Scraping: https://quotes.toscrape.com/page/8/
Scraping: https://quotes.toscrape.com/page/9/
Scraping: https://quotes.toscrape.com/page/10/
Finished! Saved 100 quotes to quotes2.csv


## Exercise

## 1. Collect every quote by a given author (Albert Einstein)

In [17]:
import requests
from bs4 import BeautifulSoup
import csv
import time

# Collect the quotes on one parsed page that were written by a given author
def scrape_page(soup, quotes, target_author):
    """Append the quotes in ``soup`` attributed to ``target_author`` to ``quotes``.

    Args:
        soup: Parsed page; searched for ``<div class="quote">`` elements.
        quotes: List that receives a dict with 'Text', 'Author' and 'Tags'
            keys for each matching quote (mutated in place).
        target_author: Author name to keep; compared for exact equality.
    """
    for block in soup.find_all('div', class_='quote'):
        author_name = block.find('small', class_='author').text

        # Skip quotes by anyone else before extracting the remaining fields
        if author_name != target_author:
            continue

        quote_text = block.find('span', class_='text').text
        tag_names = [link.text for link in block.find_all('a', class_='tag')]
        quotes.append({
            'Text': quote_text,
            'Author': author_name,
            'Tags': ', '.join(tag_names),
        })

# Configuration
# Site root; scrape_all_pages() prepends it to each relative "next page" link.
base_url = 'https://quotes.toscrape.com'
# Sent with every request made by scrape_all_pages().
headers = {'User-Agent': 'Mozilla/5.0'}
# Only quotes whose author equals this string exactly are kept.
target_author = "Albert Einstein"
# Module-level accumulator: scrape_all_pages() fills it and the CSV export reads it.
all_quotes = []

def scrape_all_pages(url):
    """Walk the paginated site from ``url``, keeping quotes by ``target_author``.

    Every page that returns HTTP 200 is parsed and handed to ``scrape_page``
    together with the module-level ``all_quotes`` list and ``target_author``.
    The walk stops when a page has no ``<li class="next">`` element or when a
    request returns a non-200 status.

    Args:
        url: Absolute URL of the first page to fetch.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within the
            request timeout.
    """
    while url:
        print(f"Checking page: {url}")
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(url, headers=headers, timeout=10)

        if response.status_code != 200:
            print("Error accessing the page.")
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        scrape_page(soup, all_quotes, target_author)

        # Find the 'Next' button to continue to the next page
        next_page = soup.find('li', class_='next')
        url = base_url + next_page.find('a')['href'] if next_page else None

        # Brief pause to be polite to the server; skipped after the last page,
        # where there is no follow-up request to space out.
        if url:
            time.sleep(0.5)

# Walk every page, starting from the site root
scrape_all_pages(base_url)

# Write the matching quotes to disk as CSV, one row per quote
csv_columns = ['Text', 'Author', 'Tags']
with open('quotes_Einstein.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
    writer.writeheader()
    for row in all_quotes:
        writer.writerow(row)

match_count = len(all_quotes)
print(f"\nSuccess! Found {match_count} quotes by {target_author}.")
print("Data saved to 'quotes_Einstein.csv'")

Checking page: https://quotes.toscrape.com
Checking page: https://quotes.toscrape.com/page/2/
Checking page: https://quotes.toscrape.com/page/3/
Checking page: https://quotes.toscrape.com/page/4/
Checking page: https://quotes.toscrape.com/page/5/
Checking page: https://quotes.toscrape.com/page/6/
Checking page: https://quotes.toscrape.com/page/7/
Checking page: https://quotes.toscrape.com/page/8/
Checking page: https://quotes.toscrape.com/page/9/
Checking page: https://quotes.toscrape.com/page/10/

Success! Found 10 quotes by Albert Einstein.
Data saved to 'quotes_Einstein.csv'


## 2. Collect every quote containing a keyword ("life")

In [18]:
import requests
from bs4 import BeautifulSoup
import csv
import time

# Collect the quotes on one parsed page whose text mentions a keyword
def scrape_page(soup, quotes, keyword):
    """Append the quotes in ``soup`` whose text contains ``keyword`` to ``quotes``.

    A preview line is printed for every match.

    Args:
        soup: Parsed page; searched for ``<div class="quote">`` elements.
        quotes: List that receives a dict with 'Text', 'Author' and 'Tags'
            keys for each matching quote (mutated in place).
        keyword: Substring to look for in the quote text, ignoring case.
    """
    for block in soup.find_all('div', class_='quote'):
        quote_text = block.find('span', class_='text').text

        # Case-insensitive substring test; skip quotes that do not match
        if keyword.lower() not in quote_text.lower():
            continue

        author_name = block.find('small', class_='author').text
        tag_names = [link.text for link in block.find_all('a', class_='tag')]
        record = {
            'Text': quote_text,
            'Author': author_name,
            'Tags': ', '.join(tag_names),
        }
        quotes.append(record)

        # Echo a preview (first 50 characters) of each match to the console
        print(f"Found: {quote_text[:50]}...")

# Configuration
# Site root; scrape_all_pages() prepends it to each relative "next page" link.
base_url = 'https://quotes.toscrape.com'
# Sent with every request made by scrape_all_pages().
headers = {'User-Agent': 'Mozilla/5.0'}
# Keyword passed to scrape_page(), which matches it case-insensitively as a substring.
search_word = "life"
# Module-level accumulator: scrape_all_pages() fills it and the CSV export reads it.
life_quotes = []

def scrape_all_pages(url):
    """Walk the paginated site from ``url``, keeping quotes that mention ``search_word``.

    Every page that returns HTTP 200 is parsed and handed to ``scrape_page``
    together with the module-level ``life_quotes`` list and ``search_word``.
    The walk stops when a page has no ``<li class="next">`` element or when a
    request returns a non-200 status, which is reported on the console.

    Args:
        url: Absolute URL of the first page to fetch.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within the
            request timeout.
    """
    while url:
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(url, headers=headers, timeout=10)

        if response.status_code != 200:
            # Report the failure; stopping silently would make a partial
            # result look like a complete one.
            print(f"Request failed with status {response.status_code}: {url}")
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        scrape_page(soup, life_quotes, search_word)

        # Pagination logic: no 'Next' button means this was the last page
        next_page = soup.find('li', class_='next')
        url = base_url + next_page.find('a')['href'] if next_page else None

        # Polite scraping delay; skipped after the last page, where there is
        # no follow-up request to space out.
        if url:
            time.sleep(0.5)

# Walk every page, starting from the site root
scrape_all_pages(base_url)

# Write the matching quotes to disk as CSV, one row per quote
csv_columns = ['Text', 'Author', 'Tags']
with open('life_quotes.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
    writer.writeheader()
    for row in life_quotes:
        writer.writerow(row)

match_count = len(life_quotes)
print(f"\nDone! Saved {match_count} quotes containing '{search_word}' to 'life_quotes.csv'.")

Found: “There are only two ways to live your life. One is...
Found: “This life is what you make it. No matter what, yo...
Found: “The opposite of love is not hate, it's indifferen...
Found: “Good friends, good books, and a sleepy conscience...
Found: “Life is what happens to us while we are making ot...
Found: “Life is like riding a bicycle. To keep your balan...
Found: “If I were not a physicist, I would probably be a ...
Found: “Life isn't about finding yourself. Life is about ...
Found: “The fear of death follows from the fear of life. ...
Found: “I'm the one that's got to die when it's time for ...

Done! Saved 10 quotes containing 'life' to 'life_quotes.csv'.
