In [None]:
pip install bs4

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


In [None]:
pip install requests



In [None]:
pip install texttable

Collecting texttable
  Downloading texttable-1.7.0-py2.py3-none-any.whl.metadata (9.8 kB)
Downloading texttable-1.7.0-py2.py3-none-any.whl (10 kB)
Installing collected packages: texttable
Successfully installed texttable-1.7.0


# **P1 - Scraping Quotes from website**

**URL:** [Quotes Website](http://www.values.com/inspirational-quotes)

**scrapes the website and saves quotes to a file inspirational_quotes.csv**

In [None]:
#Python program to scrape website and save quotes to a file inspirational_quotes.csv

import requests # To send an HTTP GET request to the website.
from bs4 import BeautifulSoup # To parse the HTML content of the page.
import csv # To write the scraped data into a CSV file.

URL = "https://www.passiton.com/inspirational-quotes"
r = requests.get(URL) # Sends a GET request to the specified URL and stores the response in variable "r".

soup = BeautifulSoup(r.content, 'html5lib') # Parses the HTML content using html5lib parser to create a navigable soup object.

quotes=[] # a list to store quotes and related dat in dictionaries.

table = soup.find('div', attrs = {'id':'all_quotes'}) # Locates the <div> tag with ID all_quotes, which contains all the quote elements.

# Now we iterates over each quote box, which is a <div> with a specific class structure.
for row in table.findAll('div',
						attrs = {'class':'col-6 col-lg-3 text-center margin-30px-bottom sm-margin-30px-top'}):
	quote = {}
	quote['theme'] = row.h5.text   # Extracts the theme of the quote
	quote['url'] = row.a['href']   # Extracts the relative URL of the quote page
	quote['img'] = row.img['src']  # Gets the image URL
	quote['lines'] = row.img['alt'].split(" #")[0]   # Extracts the quote text
	quote['author'] = row.img['alt'].split(" #")[1]  # Extracts the author
	quotes.append(quote)           # Appends the quote dict to the list

# Saving the scraped data in CSV file.

filename = 'inspirational_quotes.csv'
with open(filename, 'w', newline='') as f:
	w = csv.DictWriter(f,['theme','url','img','lines','author'])
	w.writeheader()
	for quote in quotes:
		w.writerow(quote)


  for row in table.findAll('div',


# **P2 - Scraping Covid-19 stats**

URL: [COVID-19 STATS COUNTRY WISE](https://www.worldometers.info/coronavirus/countries-where-coronavirus-has-spread/)

It extracts the number of confirmed cases and deaths for each country and displays the data in a formatted table using the texttable module.

In [None]:
# URl to Scrap: https://www.worldometers.info/coronavirus/countries-where-coronavirus-has-spread/

import requests
from bs4 import BeautifulSoup
import texttable as tt    # Used to create a neat table in the terminal output.
import json  # import json modul

# URL for scrapping data
url = 'https://www.worldometers.info/coronavirus/countries-where-coronavirus-has-spread/'

# Sends a request to the target URL and parses the HTML using html.parser.
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

data = []

# soup.find_all('td') will scrape every element in the url's table
data_iterator = iter(soup.find_all('td'))
# An iterator (data_iterator) is used to step through the table cells.

# This loop will keep repeating till there is data available in the iterator

'''
For each row in the HTML table (Refer the URl and table), the loop:

    --> Extracts country, number of cases, deaths, and continent.

    --> Removes commas from the numbers and converts them to integers.

    --> Appends the cleaned data as a tuple to the data list.

--> The loop stops when the iterator runs out of <td> tags (i.e., StopIteration is raised).

'''

while True:
	try:
		country = next(data_iterator).text
		confirmed = next(data_iterator).text
		deaths = next(data_iterator).text
		continent = next(data_iterator).text

		# For 'confirmed' and 'deaths', make sure to remove the commas and convert to int
		data.append((
			country,
			int(confirmed.replace(',', '')),
			int(deaths.replace(',', '')),
			continent
		))

	# StopIteration error is raised when there are no more elements left to iterate through
	except StopIteration:
		break

# Sorts the list in descending order based on the number of confirmed cases (row[1]).
data.sort(key = lambda row: row[1], reverse = True)


# create texttable object
table = tt.Texttable()
table.add_rows([(None, None, None, None)] + data)  # Add an empty row at the beginning for the headers
table.set_cols_align(('c', 'c', 'c', 'c'))  # 'l' denotes left, 'c' denotes center, and 'r' denotes right
table.header((' Country ', ' Number of cases ', ' Deaths ', ' Continent '))

print(table.draw())


# Save data to a JSON file
with open('covid_data.json', 'w') as json_file:
    json.dump(data, json_file, indent=4)

print("\nData successfully saved to 'covid_data.json'")

+---------------------------+-------------------+----------+-------------------+
|          Country          |  Number of cases  |  Deaths  |     Continent     |
|       United States       |     1.118e+08     | 1219487  |   North America   |
+---------------------------+-------------------+----------+-------------------+
|           India           |     45035393      |  533570  |       Asia        |
+---------------------------+-------------------+----------+-------------------+
|          France           |     40138560      |  167642  |      Europe       |
+---------------------------+-------------------+----------+-------------------+
|          Germany          |     38828995      |  183027  |      Europe       |
+---------------------------+-------------------+----------+-------------------+
|          Brazil           |     38743918      |  711380  |   South America   |
+---------------------------+-------------------+----------+-------------------+
|        South Korea        

# **P3 - Scraping GPU Card Product Information**

URL: [GPU Card Info](https://www.newegg.com/p/pl?d=graphics+card&nm_mc=KNC-GoogleKWLess-Search-Broad&cm_mmc=KNC-GoogleKWLess-Search-Broad-_-VGA-_-graphics-card-_-PLP-Feature&page=2)

In [None]:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as ureq

my_url = 'https://www.newegg.com/p/pl?d=graphics+card&nm_mc=KNC-GoogleKWLess-Search-Broad&cm_mmc=KNC-GoogleKWLess-Search-Broad-_-VGA-_-graphics-card-_-PLP-Feature&page=2'
uclient = ureq(my_url)
page_html = uclient.read()
uclient.close()

page_soup = soup(page_html, "html.parser")
print(page_soup)
#print(page_soup.body.id)
containers = page_soup.findAll("div",{"class":"item-container"})

filename = "products.csv"
f = open(filename, "w")
headers = "brand, productname, shipping\n"
f.write(headers)
print("before for")
for container in containers:
  print("after for")
  brand = container.div.div.a.img["title"]
  title_container = container.findAll("a",{"class":"item-title"})
  product_name = title_container[0].text
  shipping = container.findAll("li",{"class":"price-ship"})
  shipping_price = shipping[0].text.strip()
  print(brand)
  print(product_name)
  print(shipping_price)
  f.write(brand + "," + product_name.replace(",","|") + "," + shipping_price + "\n")
f.close()


HTTPError: HTTP Error 403: Forbidden

# **Why You're Getting the 403 Error**

* Missing User-Agent: The urllib.request library sends requests without a proper user-agent header by default, making it obvious to the server that the request is from a script rather than a browser.
* Anti-Scraping Protections: Newegg likely uses bot detection mechanisms (e.g., Cloudflare, Akamai) to block automated requests.
* Rate Limiting: Sending too many requests from the same IP in a short period can trigger a block.
* Authentication or Cookies: The page might require cookies, JavaScript execution, or other session data that your script isn’t providing.

In [None]:
# Installing dependencies
!apt-get update > /dev/null
!apt-get install -y firefox geckodriver > /dev/null
!pip install selenium > /dev/null

# Set up headless Firefox
import os
import csv
import json
import time
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup

# Set DISPLAY env for headless mode
os.environ['DISPLAY'] = ':0'

# Firefox options for headless mode
options = Options()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--window-size=1920,1080')

# Start Selenium WebDriver
driver = webdriver.Firefox(options=options)

# Go to Newegg page
url = "https://www.newegg.com/p/pl?d=graphics+card&page=1"
driver.get(url)
time.sleep(5)  # Wait for JS to load

# Parse with BeautifulSoup
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")
driver.quit()

# Scrape product containers
containers = soup.find_all("div", class_="item-container")

products = []
for container in containers:
    try:
        # Brand
        brand = container.find("a", class_="item-brand").img["title"]

        # Product Name
        product_name = container.find("a", class_="item-title").text.strip()

        # Price
        price_tag = container.find("li", class_="price-current")
        price = "$" + price_tag.find("strong").text + price_tag.find("sup").text

        # Shipping
        shipping_tag = container.find("div", class_="product-delivery-title")
        shipping = shipping_tag.find("span").text.strip() if shipping_tag else "N/A"

        # Rating
        rating_tag = container.find("a", class_="item-rating")
        rating = rating_tag["title"] if rating_tag else "No rating"

        # Link
        link = container.find("a", class_="item-title")["href"]

        # Store product
        products.append({
            "brand": brand,
            "product_name": product_name,
            "price": price,
            "shipping": shipping,
            "rating": rating,
            "link": link
        })
    except Exception as e:
        print("Skipping product due to error:", e)

# Save to CSV file
csv_path = "/content/products.csv"
with open(csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["brand", "product_name", "price", "shipping", "rating", "link"])
    writer.writeheader()
    writer.writerows(products)

# Save in JSON file
json_path = "/content/products.json"
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(products, f, indent=2)

# Final log
print(f"Scraped {len(products)} products")
print(f"CSV saved as: {csv_path}")
print(f"JSON saved as: {json_path}")


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
E: Unable to locate package geckodriver
Skipping product due to error: 'NoneType' object has no attribute 'img'
Skipping product due to error: 'NoneType' object has no attribute 'img'
Skipping product due to error: 'NoneType' object has no attribute 'img'
Skipping product due to error: 'NoneType' object has no attribute 'img'
Skipping product due to error: 'NoneType' object has no attribute 'img'
Skipping product due to error: 'NoneType' object has no attribute 'img'
Skipping product due to error: 'NoneType' object has no attribute 'img'
Skipping product due to error: 'NoneType' object has no attribute 'img'
Skipping product due to error: 'NoneType' object has no attribute 'img'
Skipping product due to error: 'NoneType' object has no attribute 'img'
Skipping product due to error: 'NoneType' object ha

#HTML Dump for a single Product

<div id="14-932-814" class="item-container position-relative"><a href="https://www.newegg.com/gigabyte-windforce-gv-n5060wf2max-oc-8gd-geforce-rtx-5060-8gb-graphics-card-double-fans/p/N82E16814932814" class="item-img"><img src="https://c1.neweggimages.com/productimage/nb300/14-932-814-03.jpg" title="GIGABYTE WINDFORCE GeForce RTX 5060 8GB GDDR7 PCI Express 5.0 x8 ATX Graphics Card GV-N5060WF2MAX OC-8GD" alt="GIGABYTE WINDFORCE GeForce RTX 5060 8GB GDDR7 PCI Express 5.0 x8 ATX Graphics Card GV-N5060WF2MAX OC-8GD" fetchpriority="high"><div class="btn btn-large btn-quickview">Quick View</div><div class="item-quick-action-container"><button class="quick-action" title="log In required"><i class="fa fa-heart-outline"></i></button></div></a><div class="item-info"><div class="item-branding has-brand-store"><a href="https://www.newegg.com/GIGABYTE/BrandStore/ID-1314" class="item-brand"><img src="https://c1.neweggimages.com/brandimage/Brand1314.gif" title="GIGABYTE" alt="GIGABYTE"><i class="ico ico-angle-right-right"></i></a><a href="https://www.newegg.com/gigabyte-windforce-gv-n5060wf2max-oc-8gd-geforce-rtx-5060-8gb-graphics-card-double-fans/p/N82E16814932814#IsFeedbackTab" class="item-rating" title="Rating + 4.1"><i class="rating rating-4" aria-label="rated 4.1 out of 5"></i><span class="item-rating-num">(22)</span></a></div><a href="https://www.newegg.com/gigabyte-windforce-gv-n5060wf2max-oc-8gd-geforce-rtx-5060-8gb-graphics-card-double-fans/p/N82E16814932814" class="item-title" title="View Details"><span class="item-open-box-italic"></span>GIGABYTE WINDFORCE GeForce RTX 5060 8GB GDDR7 PCI Express 5.0 x8 ATX Graphics Card GV-N5060WF2MAX OC-8GD</a><ul class="item-features"><li><strong>Model #: </strong>GV-N5060WF2MAX OC-8GD</li></ul></div><div class="item-action"><ul class="price"><li class="price-was"></li><li class="price-map">&nbsp;</li><li class="price-current"><span class="price-current-label"></span>$<strong>309</strong><sup>.99</sup>&nbsp;<span class="price-current-range"><abbr title="to">–</abbr></span></li><li class="price-save "></li></ul><div class="product-delivery-new">
                <div class="product-delivery-title">
                  <span><strong>$6.99</strong> shipping</span>
                  
                   <span>from United States</span>
                </div>
                
                
              </div><div class="item-operate"><div class="item-button-area"><button class="btn btn-primary btn-mini" title="Add GIGABYTE WINDFORCE GeForce RTX 5060 8GB GDDR7 PCI Express 5.0 x8 ATX Graphics Card GV-N5060WF2MAX OC-8GD to cart">Add to cart <i class="fas fa-caret-right"></i></button></div><div class="item-compare-box"><label class="form-checkbox"><input type="checkbox" autocomplete="off"><span class="form-checkbox-title">Compare</span></label></div></div></div><div class="item-stock" id="stock_14-932-814"></div></div>


In [None]:
pip install fake_useragent



# **Why the New Code Avoids the 403 Error**

Selenium Mimics a Real Browser:

Selenium uses a real browser (Firefox in this case) to load the page, which executes JavaScript and renders the page as a human user would. This makes it harder for Newegg’s server to detect your script as a bot.
The headless mode (--headless) ensures the browser runs without a visible window, but it still behaves like a full browser, including handling cookies, JavaScript, and dynamic content.


User-Agent and Browser Behavior:

Selenium automatically sets a realistic user-agent for Firefox, unlike urllib.request, which sends no user-agent by default or a generic Python one that servers often flag as suspicious.
Your code also sets browser-like options (e.g., --window-size=1920,1080), making the request appear more legitimate.


JavaScript Rendering:

Newegg’s product listing page likely relies on JavaScript to load content dynamically. The previous urllib.request approach only retrieved the initial HTML, which might not include the full product data or could trigger a 403 if the server requires JavaScript execution. Selenium, however, waits for the page to fully load (aided by time.sleep(5)), ensuring the complete HTML is available for parsing.


No Direct HTTP Request:

Unlike urllib.request, which sends a direct HTTP request that can be easily blocked by anti-scraping tools (e.g., Cloudflare), Selenium simulates a full browser session, including handling redirects, cookies, and other browser behaviors that align with human activity.


Error Handling and Robustness:

Your new code includes try-except blocks to handle parsing errors gracefully, which doesn’t directly prevent the 403 error but ensures the script doesn’t crash if some elements are missing. This robustness helps when dealing with dynamically loaded or inconsistent page structures.

# **P4 - Web Scraping Customer Reports**

URL: http://www.consumerreports.org/cro/a-to-z-index/products/index.htm


In [None]:
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup

# Get HTML content
url = 'http://www.consumerreports.org/cro/a-to-z-index/products/index.htm'
file_name = 'consumer_reports.txt'
user_agent = UserAgent()

page = requests.get(url, headers={'user-agent': user_agent.chrome})
with open(file_name, 'w') as file:
    file.write(page.content.decode('utf-8') if isinstance(page.content, bytes) else page.content)

# Parse HTML
def read_file():
    with open('consumer_reports.txt', 'r') as file:
        return file.read()

soup = BeautifulSoup(read_file(), 'lxml')
all_divs = soup.find_all('div', attrs={'class': 'crux-body-copy'})

# Extract product names safely
products = [div.a.string.strip() for div in all_divs if div.a and div.a.string]

# Print products
for product in products:
    print(product)


Air Conditioners
Air Filters
Air Fryers
Air Mattresses
Air Purifiers
Airline Travel
All-Purpose Cleaners
Antivirus Software
Appliance Stores
Athletic shoes
Baby Bathtubs
Baby Bottle Sterilizers
Baby formulas
Baby Monitors
Backpack Carriers
Backpacks
Bakeware
Banks & Credit Unions
Bassinets
Bath Towels
Bathroom Scales
Bathrooms
Batteries
Battery Platforms
Bike Helmets
Bike Locks
Bike Radars
Bike trailers
Bikes
Blenders
Blood Glucose Meters
Blood Pressure Monitors
Blu-Ray Players
Boilers
Bread
Breast pumps
Camcorders
Cameras
Car Batteries
Car Insurance
Car repair shops
Car Seats
Car Travels
Car Wax
Carpet Cleaners
Carpet stain removers
Cars
Ceiling Fans
Cell Phone Services
Cell Phones
Central Air Conditioning
Chainsaws
Clothes Dryers
Coffee
Coffee Grinders
Coffee Makers
Computer backup systems
Computer Monitors
Computers
Cooktops
Cookware
Cordless Drills & Impact Drivers
Cordless phones
Countertops
Credit cards
Crib bedding
Crib mattresses
Cribs
Curtains
Customer service
Dash Cams
Deckin

In [None]:
'''
Here we are using the same consumer_reports.txt and creating a dictionary with
product name as key and product link as value and trying to dispaly, save in csv and json format.
'''

from bs4 import BeautifulSoup

# Read file
def read_file():
    with open('/content/consumer_reports.txt', 'r', encoding='utf-8') as file:
        return file.read()

# Parse HTML
soup = BeautifulSoup(read_file(), 'lxml')

# Extract products safely
products = {}
for div in soup.find_all('div', class_='crux-body-copy'):
    a_tag = div.find('a')
    if a_tag and a_tag.string and a_tag.has_attr('href'):
        products[a_tag.string.strip()] = a_tag['href']

# Display products
for name, link in products.items():
    print(f"{name} --> {link}")


# Save to CSV
with open('/content/consumer_products.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Product Name', 'Link'])
    for name, link in products.items():
        writer.writerow([name, link])

# Save to JSON
with open('/content/consumer_products.json', 'w', encoding='utf-8') as jsonfile:
    json.dump(products, jsonfile, indent=4)

print("Saved to CSV and JSON!")


Air Conditioners --> /appliances/air-conditioners/
Air Filters --> /appliances/air-filters/
Air Fryers --> /appliances/air-fryers/
Air Mattresses --> /home-garden/air-mattresses/
Air Purifiers --> /appliances/air-purifiers/
Airline Travel --> /money/airline-travel/
All-Purpose Cleaners --> /appliances/all-purpose-cleaners/buying-guide/
Antivirus Software --> /electronics-computers/antivirus-software/
Appliance Stores --> /money/appliance-stores/
Athletic shoes --> /health/athletic-shoes/buying-guide/
Baby Bathtubs --> /babies-kids/baby-bathtubs/buying-guide/
Baby Bottle Sterilizers --> /babies-kids/baby-bottle-sterilizers/
Baby formulas --> /babies-kids/baby-formula/buying-guide/
Baby Monitors --> /babies-kids/baby-monitors/
Backpack Carriers --> /babies-kids/backpack-carriers/buying-guide/
Backpacks --> /babies-kids/backpacks/buying-guide/
Bakeware --> /home-garden/bakeware/
Banks & Credit Unions --> /money/banks-credit-unions/
Bassinets --> /babies-kids/bassinets/
Bath Towels --> /ho

NameError: name 'csv' is not defined

In [None]:
import requests
from bs4 import BeautifulSoup
import time

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Referer": "https://www.google.com/"
}

base_url = "https://www.consumerreports.org"
product_details = {}

session = requests.Session()

for name, relative_link in products.items():
    full_url = base_url + relative_link
    try:
        response = session.get(full_url, headers=headers, timeout=10)
        if response.status_code == 200:
            page_soup = BeautifulSoup(response.text, 'lxml')
            page_title = page_soup.title.string.strip() if page_soup.title else "No title"
            product_details[name] = {
                'url': full_url,
                'page_title': page_title
            }
            print(f"{name}: {page_title}")
        else:
            print(f"Failed to fetch {full_url} (status {response.status_code})")
    except Exception as e:
        print(f"Error fetching {full_url}: {e}")
    time.sleep(1)


✅ Air Conditioners: Best Air Conditioner Reviews – Consumer Reports Reviews – Consumer Reports
✅ Air Filters: Best Air Filter Reviews – Consumer Reports Reviews – Consumer Reports
✅ Air Fryers: Best Air Fryer Reviews – Consumer Reports Reviews – Consumer Reports
✅ Air Mattresses: Best Air Mattress Reviews – Consumer Reports Reviews – Consumer Reports
✅ Air Purifiers: Best Air Purifier Reviews – Consumer Reports Reviews – Consumer Reports
✅ Airline Travel: Best Airline Travel Reviews – Consumer Reports Reviews – Consumer Reports
✅ All-Purpose Cleaners: Best Multipurpose Cleaners  - Consumer Reports
✅ Antivirus Software: Best Antivirus Software Reviews – Consumer Reports Reviews – Consumer Reports
✅ Appliance Stores: Best Appliance Store Reviews – Consumer Reports Reviews – Consumer Reports
✅ Athletic shoes: Best Athletic Shoe Buying Guide - Consumer Reports
✅ Baby Bathtubs: Best Baby Bathtub Buying Guide - Consumer Reports
✅ Baby Bottle Sterilizers: Best Baby Bottle Sterilizer Reviews –

KeyboardInterrupt: 

In [None]:
import requests
from bs4 import BeautifulSoup
import time
import json
import csv

# Headers to avoid 403
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Referer": "https://www.google.com/"
}

base_url = "https://www.consumerreports.org"
product_details = {}
failed_urls = []

session = requests.Session()

# Ensure 'products' dict is already defined — run extraction from consumer_reports.txt beforehand
for name, relative_link in products.items():
    full_url = base_url + relative_link
    try:
        response = session.get(full_url, headers=headers, timeout=10)
        if response.status_code == 200:
            page_soup = BeautifulSoup(response.text, 'lxml')

            # Extract title
            page_title = page_soup.title.string.strip() if page_soup.title else "No title"

            # Extract first <h1> heading
            h1_tag = page_soup.find('h1')
            h1_text = h1_tag.text.strip() if h1_tag else "No H1 header"

            # Save details
            product_details[name] = {
                'url': full_url,
                'page_title': page_title,
                'h1': h1_text
            }
            print(f"{name}: {h1_text}")
        else:
            print(f"Failed to fetch {full_url} (status {response.status_code})")
            failed_urls.append(full_url)
    except Exception as e:
        print(f"Error fetching {full_url}: {e}")
        failed_urls.append(full_url)
    time.sleep(1)  # polite pause

with open('/content/consumer_product_pages.json', 'w', encoding='utf-8') as f:
    json.dump(product_details, f, indent=4)

print("Saved JSON to consumer_product_pages.json")


with open('/content/consumer_product_pages.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Product Name', 'URL', 'Page Title', 'H1 Header'])
    for name, details in product_details.items():
        writer.writerow([name, details['url'], details['page_title'], details['h1']])

print("Saved CSV to consumer_product_pages.csv")


with open('/content/failed_urls.txt', 'w', encoding='utf-8') as f:
    for url in failed_urls:
        f.write(url + '\n')

print(f"Logged {len(failed_urls)} failed URLs to failed_urls.txt")



Air Conditioners: Air Conditioners
Air Filters: Air Filters
Air Fryers: Air Fryers
Air Mattresses: Air Mattresses
Air Purifiers: Air Purifiers
Airline Travel: Airline Travel
All-Purpose Cleaners: Best Multipurpose Cleaners
Antivirus Software: Antivirus Software
Appliance Stores: Appliance Stores
Athletic shoes: Athletic Shoe Buying Guide
Baby Bathtubs: Baby Bathtub Buying Guide
Baby Bottle Sterilizers: Baby Bottle Sterilizers
Baby formulas: Baby Formula Buying Guide
Baby Monitors: Baby Monitors
Backpack Carriers: Backpack Carrier Buying Guide
Backpacks: Backpack Buying Guide
Bakeware: Bakeware
Banks & Credit Unions: Banks & Credit Unions
Bassinets: Bassinets
Bath Towels: Bath Towels
Bathroom Scales: Bathroom Scales
Bathrooms: Bathrooms
Batteries: Batteries
Battery Platforms: Battery Platforms
Bike Helmets: Bike Helmets
Bike Locks: Bike Locks
Bike Radars: Bike Radars
Bike trailers: Bike Trailer Buying Guide
Bikes: Bikes
Blenders: Blenders
Blood Glucose Meters: Blood Glucose Meters
Blood

# **P5 - Scraping Multiple web Pages**

Task is to Scrap java questions from codingbat website

URL: http://codingbat.com/java

I will divide the project into 3 parts:
1.   First script will describe you how to fetch the link of each section of Java questions.
2.   Secondly we will open each section(catagory)and we scrap link for each question.
3.   Thirdly we will open each question and get the problem statement, example associated with it.





In [None]:
#Part 1 - script will describe you how to fetch the link of each section of Java questions.

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent


user_agent = UserAgent()
main_url = 'http://codingbat.com/java'
page = requests.get(main_url,headers={'user-agent':user_agent.chrome})
soup = BeautifulSoup(page.content,'lxml')

base_url = 'http://codingbat.com'

'''
Here we are scraping the link to each section.
Observe in inspect element that link is a ralative link (Warm-up) not absolute link
thus we used base_url above
'''
all_divs = soup.find_all('div',class_='summ')

#prints all the relative link
for div in all_divs:
    print(div.a['href']) #Here 'a' is a child of 'div' tag


#prints all the absolute link
for div in all_divs:
    print(base_url + div.a['href'])  #Here 'a' is a child of 'div' tag

/java/Warmup-1
/java/Warmup-2
/java/String-1
/java/Array-1
/java/Logic-1
/java/Logic-2
/java/String-2
/java/String-3
/java/Array-2
/java/Array-3
/java/AP-1
/java/Recursion-1
/java/Recursion-2
/java/Map-1
/java/Map-2
/java/Functional-1
/java/Functional-2
http://codingbat.com/java/Warmup-1
http://codingbat.com/java/Warmup-2
http://codingbat.com/java/String-1
http://codingbat.com/java/Array-1
http://codingbat.com/java/Logic-1
http://codingbat.com/java/Logic-2
http://codingbat.com/java/String-2
http://codingbat.com/java/String-3
http://codingbat.com/java/Array-2
http://codingbat.com/java/Array-3
http://codingbat.com/java/AP-1
http://codingbat.com/java/Recursion-1
http://codingbat.com/java/Recursion-2
http://codingbat.com/java/Map-1
http://codingbat.com/java/Map-2
http://codingbat.com/java/Functional-1
http://codingbat.com/java/Functional-2


In [None]:
#Secondly we will open each section and we scrap link for each question.
#--------Start - Same as above Script ----------------------------
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent


user_agent = UserAgent()
main_url = 'http://codingbat.com/java'
page = requests.get(main_url,headers={'user-agent':user_agent.chrome})
soup = BeautifulSoup(page.content,'lxml')

base_url = 'http://codingbat.com'

all_divs = soup.find_all('div',class_='summ')


# all_links has link for each section (Page 1)
all_links = [base_url + div.a['href'] for div in all_divs] # This is list Comprahension

#--------End - Same as above Script ----------------------------

#Below code is to get link for each/all the section

for link in all_links:
    #link correspons to 2nd page ex:https://codingbat.com/java/Warmup-1
    inner_page = requests.get(link,headers={'user-agent':user_agent.chrome})
    inner_soup = BeautifulSoup(inner_page.content,'lxml')

    #Now we need to scrap the link from 2nd inner page. (Inspect the HTML Page)

    div = inner_soup.find('div',class_='tabc')
    question_links = [base_url + td.a['href'] for td in div.table.find_all('td')] # has link to all the questions (list Comprahension)
    print(question_links)

    break #on commenting it you will get complete links for all the sections



ModuleNotFoundError: No module named 'fake_useragent'

In [None]:
#Final Script

#part 1

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent


user_agent = UserAgent()
main_url = 'http://codingbat.com/java'
page = requests.get(main_url,headers={'user-agent':user_agent.chrome})
soup = BeautifulSoup(page.content,'lxml')

base_url = 'http://codingbat.com'

all_divs = soup.find_all('div',class_='summ')

all_links = [base_url + div.a['href'] for div in all_divs]


# part 2

for link in all_links:
    inner_page = requests.get(link,headers={'user-agent':user_agent.chrome})
    inner_soup = BeautifulSoup(inner_page.content,'lxml')
    div = inner_soup.find('div',class_='tabc')
    question_links = [base_url + td.a['href'] for td in div.table.find_all('td')]


# part 3

    for question_link in question_links:
        final_page = requests.get(question_link)
        final_soup = BeautifulSoup(final_page.content, 'lxml')
        indent_div = final_soup.find('div', attrs={'class':'indent'})

        problem_statement = indent_div.table.div.string

        siblings_of_statement = indent_div.table.div.next_siblings

        examples = [sibling for sibling in siblings_of_statement if sibling.string is not None]

        print(problem_statement)
        for example in examples:
            print(example)

        print('\n\n\n')

ModuleNotFoundError: No module named 'fake_useragent'

# **Python Job Search**

[Python Job Search](https://realpython.github.io/fake-jobs/)

---

* In the modern job market, there is an increasing need for automated systems to gather job postings from online platforms for purposes such as labor market analysis, career recommendations, and trend identification. However, scraping real-world job sites often faces legal and ethical restrictions.

* To enable safe and ethical skill development in web scraping, the website https://realpython.github.io/fake-jobs/ provides a static, publicly available dataset of fictional job listings. This site simulates a realistic job portal, containing elements such as job titles, company names, locations, posting dates, and detailed job descriptions, but without any real or sensitive data.

* The problem is to design and implement a Python-based web scraping solution that can:
  1. Extract structured data from the given URL, including:
    - Job title
    - Company name
    - Location
    - Job application link

In [15]:
import requests
from bs4 import BeautifulSoup

URL = "https://realpython.github.io/fake-jobs/"
page = requests.get(URL)

#Getting HTML Dump
soup = BeautifulSoup(page.content, "html.parser")

#Filtering Container having all job cards
results = soup.find(id="ResultsContainer") #element that you’re looking for is a <div> with an id attribute that has the value "ResultsContainer"

#print(results)

#print("--------------------------------")

#Reformatting the HTML content
#print(results.prettify())  # all .prettify() on the results variable that you assigned above, then you’ll see all the HTML contained within the <div> neatly structured

#print("--------------------------------")

#Filtering individual job card
job_cards = results.find_all("div", class_="card-content") # It returns an iterable containing all the HTML for all the job listings displayed on that page.

for job_card in job_cards:
  print(job_card, end="\n" * 2)
print("--------------------------------")

'''
You see that your page has descriptive class names for elements like title, company and location.
You can pick out those child elements from each job posting with .find()
'''

#Filtering jod information from job cards iteratively which returns html tag and strings of text information


for job_card in job_cards:
 title_element = job_card.find("h2", class_="title")
 company_element = job_card.find("h3", class_="company")
 location_element = job_card.find("p", class_="location")
 print(title_element)
 print(company_element)
 print(location_element)
 print("################################")

 #Extract Text From HTML Elements

'''
 We only want to see the title, company, and location of each job posting.
 You can add .text to a BeautifulSoup object to return only the text content of the HTML elements that the object contains
'''
'''
for job_card in job_cards:
  title_element = job_card.find("h2", class_="title")
  company_element = job_card.find("h3", class_="company")
  location_element = job_card.find("p", class_="location")
  print(title_element.text)
  print(company_element.text)
  print(location_element.text)
  print("################################")
'''
'''
you’ll also get some extra whitespace. But no worries, because you’re working with Python strings so you can .strip()
which superfluous whitespace. You can also apply any other familiar Python string methods to further clean up your text
'''
'''
for job_card in job_cards:
  title_element = job_card.find("h2", class_="title")
  company_element = job_card.find("h3", class_="company")
  location_element = job_card.find("p", class_="location")
  print(title_element.text.strip())
  print(company_element.text.strip())
  print(location_element.text.strip())
  print("################################")
  '''

<div class="card-content">
<div class="media">
<div class="media-left">
<figure class="image is-48x48">
<img alt="Real Python Logo" src="https://files.realpython.com/media/real-python-logo-thumbnail.7f0db70c2ed2.jpg?__no_cf_polish=1"/>
</figure>
</div>
<div class="media-content">
<h2 class="title is-5">Senior Python Developer</h2>
<h3 class="subtitle is-6 company">Payne, Roberts and Davis</h3>
</div>
</div>
<div class="content">
<p class="location">
        Stewartbury, AA
      </p>
<p class="is-small has-text-grey">
<time datetime="2021-04-08">2021-04-08</time>
</p>
</div>
<footer class="card-footer">
<a class="card-footer-item" href="https://www.realpython.com" target="_blank">Learn</a>
<a class="card-footer-item" href="https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html" target="_blank">Apply</a>
</footer>
</div>

<div class="card-content">
<div class="media">
<div class="media-left">
<figure class="image is-48x48">
<img alt="Real Python Logo" src="https://fi

'\nfor job_card in job_cards:\n  title_element = job_card.find("h2", class_="title")\n  company_element = job_card.find("h3", class_="company")\n  location_element = job_card.find("p", class_="location")\n  print(title_element.text.strip())\n  print(company_element.text.strip())\n  print(location_element.text.strip())\n  print("################################")\n  '

# **Filtering Technique**

* Not all of the job listings are developer jobs. Instead of printing out all the jobs listed on the website, we can first filter them using keywords.

* The job titles in the page are kept within **\<h2>** elements. To filter for only specific jobs, you can use the string argument:

In [None]:
python_jobs = results.find_all("h2", string="Python")
print(python_jobs)

[]


* This code finds all **\<h2>** elements where the contained string matches "Python" exactly.
* Note that you’re directly calling the method on your first results variable.
* the print() the output of the above code snippet to your console, then you might be disappointed because it’ll be empty.
* There was a Python job in the search results, so why isn’t it showing up?


* When you use string as you did above, your program looks for that string exactly. Any variations in the spelling, capitalization, or whitespace will prevent the element from matching.
* You should design your search criteria to make your search string more general.

# **Pass a Function to a Beautiful Soup Method**

* In addition to strings, you can sometimes pass functions as arguments to Beautiful Soup methods. You can change the previous line of code to use a function instead.

In [None]:
python_jobs = results.find_all("h2", string=lambda text: "python" in text.lower())
print(python_jobs)
print("################################")
print(len(python_jobs))

[<h2 class="title is-5">Senior Python Developer</h2>, <h2 class="title is-5">Software Engineer (Python)</h2>, <h2 class="title is-5">Python Programmer (Entry-Level)</h2>, <h2 class="title is-5">Python Programmer (Entry-Level)</h2>, <h2 class="title is-5">Software Developer (Python)</h2>, <h2 class="title is-5">Python Developer</h2>, <h2 class="title is-5">Back-End Web Developer (Python, Django)</h2>, <h2 class="title is-5">Back-End Web Developer (Python, Django)</h2>, <h2 class="title is-5">Python Programmer (Entry-Level)</h2>, <h2 class="title is-5">Software Developer (Python)</h2>]
################################
10


* Finding elements based on their text content is a powerful way to filter your HTML response for specific information.
* Beautiful Soup allows you to use exact strings or functions as arguments for filtering text in BeautifulSoup objects.
* However, when you try to print the information of the filtered Python jobs like you’ve done before, you run into an error:

In [None]:
for job_card in python_jobs:
  title_element = job_card.find("h2", class_="title")
  company_element = job_card.find("h3", class_="company")
  location_element = job_card.find("p", class_="location")
  print(title_element.text.strip())
  print(company_element.text.strip())
  print(location_element.text.strip())
  print("################################")

AttributeError: 'NoneType' object has no attribute 'text'

* This traceback message is a common error that you’ll run into a lot when you’re scraping information from the internet. Inspect the HTML of an element in your python_jobs list. What does it look like? Where do you think the error is coming from?

* When you look at a single element in python_jobs, you’ll see that it consists of only the \<h2> element that contains the job title

In [None]:
python_jobs[0]

<h2 class="title is-5">Senior Python Developer</h2>

* When you revisit the code you used to select the items, you’ll notice that’s what you targeted. You filtered for only the **\<h2>** title elements of the job postings that contain the word **"python"**. As you can see, these elements don’t include the rest of the information about the job like company name and location. Thus it is throwing error.

* The error message you received earlier was related to this:
  - AttributeError: 'NoneType' object has no attribute 'text'

* You tried to find the job title, the company name, and the job’s location in each element in python_jobs, but each element contains only the job title text.

* Your diligent parsing library still looks for the other ones, too, and returns None because it can’t find them. Then, print() fails with the shown error message when you try to extract the .text attribute from one of these None objects.

* The text you’re looking for is nested in sibling elements of the **\<h2>** elements that your filter returns. Beautiful Soup can help you select sibling, child, and parent elements of each BeautifulSoup object.



# Access Parent Elements

* One way to get access to all the information for a job is to step up in the hierarchy of the DOM starting from the **\<h2>** elements that you identified. **Take another look at the HTML of a single job posting, for example, using your developer tools**. Then, find the **\<h2>** element that contains the job title and its closest parent element that contains the information you’re interested in.

* The **\<div>** element with the ***card-content*** class contains all the information you want. It’s a third-level parent of the **\<h2>** title element that you found using your filter.

* With this information in mind, you can now use the elements in python_jobs and fetch their great-grandparent elements to get access to all the information you want.



In [None]:
python_jobs = results.find_all("h2", string=lambda text: "python" in text.lower())
print(python_jobs)
print("################################")

python_job_cards = [ h2_element.parent.parent.parent for h2_element in python_jobs]
print(python_job_cards)
print("################################")

[<h2 class="title is-5">Senior Python Developer</h2>, <h2 class="title is-5">Software Engineer (Python)</h2>, <h2 class="title is-5">Python Programmer (Entry-Level)</h2>, <h2 class="title is-5">Python Programmer (Entry-Level)</h2>, <h2 class="title is-5">Software Developer (Python)</h2>, <h2 class="title is-5">Python Developer</h2>, <h2 class="title is-5">Back-End Web Developer (Python, Django)</h2>, <h2 class="title is-5">Back-End Web Developer (Python, Django)</h2>, <h2 class="title is-5">Python Programmer (Entry-Level)</h2>, <h2 class="title is-5">Software Developer (Python)</h2>]
################################
[<div class="card-content">
<div class="media">
<div class="media-left">
<figure class="image is-48x48">
<img alt="Real Python Logo" src="https://files.realpython.com/media/real-python-logo-thumbnail.7f0db70c2ed2.jpg?__no_cf_polish=1"/>
</figure>
</div>
<div class="media-content">
<h2 class="title is-5">Senior Python Developer</h2>
<h3 class="subtitle is-6 company">Payne, R

* You added a list comprehension that operates on each of the **\<h2>** title elements in python_jobs that you got by filtering with the lambda expression. You’re selecting the parent element of the parent element of the parent element of each **\<h2>** title element. That’s three generations up!!!!

* A **list comprehension in Python** is a tool for creating lists by iterating over an iterable and optionally applying a condition.

* You should use list comprehensions instead of loops when you want concise, readable code that performs transformations or filtering.

In [None]:
for job_card in python_job_cards:
  title_element = job_card.find("h2", class_="title")
  company_element = job_card.find("h3", class_="company")
  location_element = job_card.find("p", class_="location")
  print(title_element.text.strip())
  print(company_element.text.strip())
  print(location_element.text.strip())
  print("################################")

Senior Python Developer
Payne, Roberts and Davis
Stewartbury, AA
################################
Software Engineer (Python)
Garcia PLC
Ericberg, AE
################################
Python Programmer (Entry-Level)
Moss, Duncan and Allen
Port Sara, AE
################################
Python Programmer (Entry-Level)
Cooper and Sons
West Victor, AE
################################
Software Developer (Python)
Adams-Brewer
Brockburgh, AE
################################
Python Developer
Rivera and Sons
East Michaelfort, AA
################################
Back-End Web Developer (Python, Django)
Stewart-Alexander
South Kimberly, AA
################################
Back-End Web Developer (Python, Django)
Jackson, Ali and Mckee
New Elizabethside, AA
################################
Python Programmer (Entry-Level)
Mathews Inc
Robertborough, AP
################################
Software Developer (Python)
Moreno-Rodriguez
Martinezburgh, AE
################################


* Using the *.parent* attribute that each BeautifulSoup object comes with gives you an intuitive way to step through your DOM structure and address the elements you need. You can also access child elements and sibling elements in a similar manner.

# **Extract Attributes From HTML Elements**

* At this point, you’ve already written code that scrapes the site and filters its HTML for relevant job postings. However, what’s still missing is fetching the link to apply for a job.

* While inspecting the page, you found two links at the bottom of each card. If you use .text on the link elements in the same way you did for the other elements, then you won’t get the URLs that you’re interested in.

In [None]:
for job_card in python_job_cards:
  links = job_card.find_all("a")
  for link in links:
    print(link.text.strip())


Learn
Apply
Learn
Apply
Learn
Apply
Learn
Apply
Learn
Apply
Learn
Apply
Learn
Apply
Learn
Apply
Learn
Apply
Learn
Apply


* If you execute the code shown above, then you’ll get the link text for Learn and Apply instead of the associated URLs.

* That’s because the *.text* attribute leaves only the visible content of an HTML element. It strips away all HTML tags, including the HTML attributes containing the URL, and leaves you with just the link text.

* To get the URL instead, you need to extract the value of one of the HTML attributes instead of discarding it.

* The URL of a link element is associated with the href HTML attribute. The specific URL that you’re looking for is the value of the href attribute of the second **\<a>** tag at the bottom of the HTML for a single job posting.

<!-- ... -->
    <footer class="card-footer">
        <a href="https://www.realpython.com" target="_blank"
           class="card-footer-item">Learn</a>
        <a href="https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html"
           target="_blank"
           class="card-footer-item">Apply</a>
    </footer>
  </div>
</div>

* Start by fetching all the **\<a>** elements in a job card. Then, extract the value of their href attributes using square-bracket notation.

In [None]:
for job_card in python_job_cards:
  links = job_card.find_all("a")
  for link in links:
    link_url = link["href"]
    print(f"Apply here: {link_url}\n")

Apply here: https://www.realpython.com

Apply here: https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html

Apply here: https://www.realpython.com

Apply here: https://realpython.github.io/fake-jobs/jobs/software-engineer-python-10.html

Apply here: https://www.realpython.com

Apply here: https://realpython.github.io/fake-jobs/jobs/python-programmer-entry-level-20.html

Apply here: https://www.realpython.com

Apply here: https://realpython.github.io/fake-jobs/jobs/python-programmer-entry-level-30.html

Apply here: https://www.realpython.com

Apply here: https://realpython.github.io/fake-jobs/jobs/software-developer-python-40.html

Apply here: https://www.realpython.com

Apply here: https://realpython.github.io/fake-jobs/jobs/python-developer-50.html

Apply here: https://www.realpython.com

Apply here: https://realpython.github.io/fake-jobs/jobs/back-end-web-developer-python-django-60.html

Apply here: https://www.realpython.com

Apply here: https://realpython.github

* In this code snippet, you first fetch all the links from each of the filtered job postings. Then, you extract the href attribute, which contains the URL, using ["href"] and print it to your console.

* Each job card has two links associated with it. However, you’re only looking for the second link, so you’ll apply a small edit to the code

In [None]:
for job_card in python_job_cards:
  link_url = job_card.find_all("a")[1]["href"]
  print(f"Apply here: {link_url}\n")

Apply here: https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html

Apply here: https://realpython.github.io/fake-jobs/jobs/software-engineer-python-10.html

Apply here: https://realpython.github.io/fake-jobs/jobs/python-programmer-entry-level-20.html

Apply here: https://realpython.github.io/fake-jobs/jobs/python-programmer-entry-level-30.html

Apply here: https://realpython.github.io/fake-jobs/jobs/software-developer-python-40.html

Apply here: https://realpython.github.io/fake-jobs/jobs/python-developer-50.html

Apply here: https://realpython.github.io/fake-jobs/jobs/back-end-web-developer-python-django-60.html

Apply here: https://realpython.github.io/fake-jobs/jobs/back-end-web-developer-python-django-70.html

Apply here: https://realpython.github.io/fake-jobs/jobs/python-programmer-entry-level-80.html

Apply here: https://realpython.github.io/fake-jobs/jobs/software-developer-python-90.html



* In the updated code snippet, you use indexing to pick the second link element from the results of .find_all() using its index ([1]). Then, you directly extract the URL using the square-bracket notation with the "href" key, thereby fetching the value of the href attribute.

* You can use the same square-bracket notation to extract other HTML attributes as well.

# **Assemble Your Code in a Single Script**

In [None]:
import requests
from bs4 import BeautifulSoup

URL = "https://realpython.github.io/fake-jobs/"
page = requests.get(URL)

soup = BeautifulSoup(page.content, "html.parser")
results = soup.find(id="ResultsContainer")

python_jobs = results.find_all(
    "h2", string=lambda text: "python" in text.lower()
)

python_job_cards = [
    h2_element.parent.parent.parent for h2_element in python_jobs
]

for job_card in python_job_cards:
    title_element = job_card.find("h2", class_="title")
    company_element = job_card.find("h3", class_="company")
    location_element = job_card.find("p", class_="location")
    print(title_element.text.strip())
    print(company_element.text.strip())
    print(location_element.text.strip())
    link_url = job_card.find_all("a")[1]["href"]
    print(f"Apply here: {link_url}\n")

Senior Python Developer
Payne, Roberts and Davis
Stewartbury, AA
Apply here: https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html

Software Engineer (Python)
Garcia PLC
Ericberg, AE
Apply here: https://realpython.github.io/fake-jobs/jobs/software-engineer-python-10.html

Python Programmer (Entry-Level)
Moss, Duncan and Allen
Port Sara, AE
Apply here: https://realpython.github.io/fake-jobs/jobs/python-programmer-entry-level-20.html

Python Programmer (Entry-Level)
Cooper and Sons
West Victor, AE
Apply here: https://realpython.github.io/fake-jobs/jobs/python-programmer-entry-level-30.html

Software Developer (Python)
Adams-Brewer
Brockburgh, AE
Apply here: https://realpython.github.io/fake-jobs/jobs/software-developer-python-40.html

Python Developer
Rivera and Sons
East Michaelfort, AA
Apply here: https://realpython.github.io/fake-jobs/jobs/python-developer-50.html

Back-End Web Developer (Python, Django)
Stewart-Alexander
South Kimberly, AA
Apply here: https://rea

# **Assignment:**

* Extract the job Requriement Information having following fields:


1.   List the job roles name
2.   Location
3.   Skill set required
4.   Date of job post
5.   Category
6.   Salary if any
7.   Demonstrate filtering the data on different criteria

* [Job1](https://www.python.org/jobs/)
* [Job2](https://remote.co/remote-jobs/developer)
* [Job3](https://pythonjobs.github.io/)





* Ensure scraping compliance by:
  - Checking the site’s robots.txt file before making requests.
  - Using polite scraping practices, such as setting a custom User-Agent and implementing delays between requests.

* Store the extracted data in a structured format (CSV/JSON) for further analysis or integration into job search applications.

* Enable scalability, allowing the scraper to be extended to multiple pages or similar websites with minimal changes.

# **Key Ethical Compliance Principles to be followed for Web Scraping**

  * Check the Website’s robots.txt
      * The robots.txt file specifies what parts of the site are allowed or disallowed for automated scraping.
      * If an area is disallowed, avoid scraping it unless you have explicit permission.

  * Read and Respect the Terms of Service (ToS)
      * Many sites have legal restrictions in their ToS against scraping certain content.

  * Avoid Overloading Servers
      * Use polite delays between requests (time.sleep()), avoid sending too many requests in parallel.

  * Don’t Collect Sensitive or Personal Data Without Consent
      * Scraping PII (personally identifiable information) without explicit permission is unethical and may be illegal.

  * Credit the Source if Data Is Used Publicly
      * If the scraped data is published, acknowledge the source.

  * Obtain Permission for Large or Repeated Scraping
      * Contact the site owner if your scraping might impact their service.

---

**Usecase Scenario**

* If you want to scrape product prices from an e-commerce site for academic research.
Ethical compliance steps:
  - Check robots.txt → see if /products/ path is allowed for scraping.
  - If allowed, add a request rate limit.
  - Don’t bypass login walls or scrape personal order histories.
  - If you publish your research, credit the site.
---

Example Script

In [9]:
import requests
from urllib.robotparser import RobotFileParser
import time

# Target site
site_url = "https://en.wikipedia.org"
robots_url = site_url + "/robots.txt"

# Step 1: Parse robots.txt
rp = RobotFileParser()
rp.set_url(robots_url)
rp.read()

# URL to scrape
target_path = "/wiki/List_of_countries_and_dependencies_by_area"

# Step 2: Check if scraping is allowed
if rp.can_fetch("*", site_url + target_path):
    print(f" Scraping allowed for: {target_path}")

    # Step 3: Polite scraping example
    headers = {"User-Agent": "Mozilla/5.0 (compatible; EthicalScraper/1.0)"}
    response = requests.get(site_url + target_path, headers=headers)
    if response.status_code == 200:
        print("Page fetched successfully.")
        # Process HTML here...
    else:
        print(f"Failed to fetch page. Status: {response.status_code}")

    # Step 4: Delay between requests
    time.sleep(2)

else:
    print(f" Scraping NOT allowed for: {target_path}. Check site's ToS.")


 Scraping allowed for: /wiki/List_of_countries_and_dependencies_by_area
Page fetched successfully.


Why this code is ethical:
  - Checks robots.txt before scraping.
  - Uses a polite User-Agent (not pretending to be a browser for deceit).
  - Uses delays to prevent server overload.
  - Does not attempt to bypass authentication or scrape disallowed pages.

# **Checking for robots.txt file for permission**

In [5]:
import time
import requests
from urllib.robotparser import RobotFileParser
from bs4 import BeautifulSoup

BASE_URL = "https://www.newegg.com"
TARGET_PATH = "/p/pl?d=graphics+card&page=2"  # stripped query params to focus on path


# Step 1: Parse robots.txt
rp = RobotFileParser()
rp.set_url(BASE_URL + "/robots.txt")
rp.read()

full_url = BASE_URL + TARGET_PATH

# Step 2: Check permission
if rp.can_fetch("*", full_url):
    print(f" Allowed to scrape: {full_url}")

    headers = {
        "User-Agent": "EthicalScraperBot/1.0 (+https://your.domain/bot-info)"
    }

    response = requests.get(full_url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")
        # Example scrape: collect product titles (assuming they have a known CSS class)
        titles = [tag.get_text(strip=True) for tag in soup.select(".item-title a")]

        print("Product Titles Found:")
        for idx, title in enumerate(titles, 1):
            print(f"{idx}. {title}")
    else:
        print(f"Failed to fetch: HTTP {response.status_code}")

    # Step 3: polite pause
    time.sleep(2)
else:
    print(f" Scraping disallowed by robots.txt: {TARGET_PATH}")


 Allowed to scrape: https://www.newegg.com/p/pl?d=graphics+card&page=2
Failed to fetch: HTTP 403


In [6]:
import requests
from urllib.robotparser import RobotFileParser
from bs4 import BeautifulSoup
import time

BASE_URL = "https://www.newegg.com"
ROBOTS_URL = BASE_URL + "/robots.txt"

# Initialize and read robots.txt
rp = RobotFileParser()
rp.set_url(ROBOTS_URL)
rp.read()

def scrape_path(path):
    full_url = BASE_URL + path
    if not rp.can_fetch("*", full_url):
        print(f" Not allowed to scrape: {path}")
        return

    print(f" Allowed to scrape: {path}")
    headers = {
        "User-Agent": "EthicalScraperBot/1.0 (+https://your.domain/bot-info)"
    }

    response = requests.get(full_url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to fetch {path}, status code: {response.status_code}")
        return

    soup = BeautifulSoup(response.content, "html.parser")
    title = soup.title.string.strip() if soup.title else "No title found"
    print(f"Title: {title}")
    # Additional scraping logic...

    time.sleep(2)  # Be courteous with rate limits

# Examples
scrape_path("/info/newsroomdetail.aspx")
scrape_path("/api/TrendingNow")


 Allowed to scrape: /info/newsroomdetail.aspx
Failed to fetch /info/newsroomdetail.aspx, status code: 403
 Allowed to scrape: /api/TrendingNow
Failed to fetch /api/TrendingNow, status code: 403


In [7]:
import requests
from urllib.robotparser import RobotFileParser
from bs4 import BeautifulSoup
import time

BASE_URL = "https://en.wikipedia.org"
PAGE_PATH = "/wiki/List_of_countries_and_dependencies_by_area"
ROBOTS_URL = BASE_URL + "/robots.txt"

# Step 1: Parse robots.txt
rp = RobotFileParser()
rp.set_url(ROBOTS_URL)
rp.read()

full_url = BASE_URL + PAGE_PATH
if not rp.can_fetch("*", full_url):
    print(" Scraping not allowed per robots.txt")
else:
    print(" Allowed—fetching the page...")
    headers = {"User-Agent": "EthicalScraperBot/1.0 (contact@example.com)"}
    response = requests.get(full_url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")
        # Example: extract all country names from the first column
        table = soup.find("table", {"class": "wikitable"})
        countries = [row.find("a").get_text() for row in table.find_all("tr")[1:] if row.find("a")]
        for country in countries[:10]:
            print(country)
    else:
        print(f"Failed to retrieve page: status {response.status_code}")
    time.sleep(1)  # Polite delay


 Allowed—fetching the page...
Earth
Russia
Antarctica
Canada
[e]
United States
Brazil
Australia
India
Argentina


# How to parse *robots.txt* for multiple allowed/disallowed paths and validate them before scraping, so that you can integrate compliance into a larger scraper

---

Approach to follow:
* Use Python’s ***urllib.robotparser*** for compliance checking.
* Parse the raw robots.txt ourselves to get the list of allowed and disallowed paths.
* Validate multiple URLs before scraping.
* Keep it modular so you can plug it into any scraper.

In [10]:
import requests
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

class RobotsComplianceChecker:
    def __init__(self, base_url, user_agent="*"):
        self.base_url = base_url.rstrip("/")
        self.user_agent = user_agent
        self.robots_url = f"{self.base_url}/robots.txt"
        self.raw_rules = {"Allow": [], "Disallow": []}
        self.rp = RobotFileParser()
        self._load_robots()

    def _load_robots(self):
        # Load into RobotFileParser for can_fetch checks
        self.rp.set_url(self.robots_url)
        self.rp.read()

        # Parse manually to store all allow/disallow rules
        resp = requests.get(self.robots_url, timeout=5)
        if resp.status_code == 200:
            lines = resp.text.splitlines()
            capture = False
            for line in lines:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                if line.lower().startswith("user-agent"):
                    ua = line.split(":")[1].strip()
                    capture = (ua == self.user_agent or ua == "*")
                elif capture and (line.lower().startswith("allow") or line.lower().startswith("disallow")):
                    key, path = line.split(":", 1)
                    self.raw_rules[key.strip().capitalize()].append(path.strip())

    def can_scrape(self, url):
        """Check if a specific URL can be fetched according to robots.txt."""
        return self.rp.can_fetch(self.user_agent, url)

    def show_rules(self):
        """Print all allow/disallow rules for the current user agent."""
        print(f"Robots.txt rules for '{self.user_agent}':")
        print("\nAllowed Paths:")
        for path in self.raw_rules["Allow"]:
            print(f"  {path}")
        print("\nDisallowed Paths:")
        for path in self.raw_rules["Disallow"]:
            print(f"  {path}")


# ----------------------
# Example Usage
# ----------------------

if __name__ == "__main__":
    base_url = "https://en.wikipedia.org"
    urls_to_check = [
        "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_area",
        "https://en.wikipedia.org/w/index.php?search=test"
    ]

    checker = RobotsComplianceChecker(base_url)

    # Show all allow/disallow paths
    checker.show_rules()

    print("\nValidation Results:")
    for url in urls_to_check:
        result = " Allowed" if checker.can_scrape(url) else " Disallowed"
        print(f"{result}: {url}")


Robots.txt rules for '*':

Allowed Paths:
  /w/api.php?action=mobileview&
  /w/load.php?
  /api/rest_v1/?doc
  /w/rest.php/site/v1/sitemap

Disallowed Paths:
  /w/
  /api/
  /trap/
  /wiki/Special:
  /wiki/Spezial:
  /wiki/Spesial:
  /wiki/Special%3A
  /wiki/Spezial%3A
  /wiki/Spesial%3A
  /wiki/%D8%AE%D8%A7%D8%B5:Search
  /wiki/%D8%AE%D8%A7%D8%B5%3ASearch
  /wiki/Wikipedia:L%C3%B6schkandidaten/
  /wiki/Wikipedia:Löschkandidaten/
  /wiki/Wikipedia:Vandalensperrung/
  /wiki/Wikipedia:Benutzersperrung/
  /wiki/Wikipedia:Vermittlungsausschuss/
  /wiki/Wikipedia:Administratoren/Probleme/
  /wiki/Wikipedia:Adminkandidaturen/
  /wiki/Wikipedia:Qualitätssicherung/
  /wiki/Wikipedia:Qualit%C3%A4tssicherung/
  /wiki/Wikipedia:Vandalismusmeldung/
  /wiki/Wikipedia:Gesperrte_Lemmata/
  /wiki/Wikipedia:Löschprüfung/
  /wiki/Wikipedia:L%C3%B6schprüfung/
  /wiki/Wikipedia:Administratoren/Notizen/
  /wiki/Wikipedia:Schiedsgericht/Anfragen/
  /wiki/Wikipedia:L%C3%B6schpr%C3%BCfung/
  /wiki/Wikipedia:C