# Web Scraper

## Get Started

In [None]:
# Mount Google Drive at /content/drive; later cells in this notebook write
# their output files under /content/drive/MyDrive/AF/.
from google.colab import drive
drive.mount('/content/drive')

## Web Scraper With Python Tutorial

Web Scraper with Python Tutorial

```
# Import the Requests library for making HTTP requests
import requests

# Install Beautiful Soup, a Python library for parsing structured data
!pip install beautifulsoup4

from bs4 import BeautifulSoup

# set the URL to parse
URL = "https://realpython.github.io/fake-jobs/"

# Printing the `.text` attribute of a page provides a holistic, quick look at the HTML.
page = requests.get(URL)
print(page.text)

# To save the page, use the binary `.content` attribute.
with open("/content/drive/MyDrive/AF/arcticfrenz.html", "wb") as f:
    f.write(page.content)

# Create a Beautiful Soup object that takes `page.content` as its input.
soup = BeautifulSoup(page.content, "html.parser")

# Find a specific HTML element by its ID
results = soup.find(id="ResultsContainer")

# For easier viewing, call `.prettify()` on any Beautiful Soup object
print(results.prettify())


# Find elements by HTML class name
job_elements = results.find_all("div", class_="card-content")

# Pick out child HTML elements with the `.find()` method.
# Each job_element is itself a Beautiful Soup `Tag` object, so it supports the same search methods.
# `.strip()` removes the superfluous whitespace around the extracted text.
for job_element in job_elements:
    title_element = job_element.find("h2", class_="title")
    company_element = job_element.find("h3", class_="company")
    location_element = job_element.find("p", class_="location")
    print(title_element.text.strip())
    print(company_element.text.strip())
    print(location_element.text.strip())
    print()

# To find elements by tag name and text content, use the `string` argument.
# Beautiful Soup methods can also accept functions as arguments, such as the lambda below.
python_jobs = results.find_all(
    "h2", string=lambda text: "python" in text.lower()
)

# Check how many matching elements were found with `len()`
print(len(python_jobs))

# You can fetch great-grandparent elements
python_job_elements = [
    h2_element.parent.parent.parent for h2_element in python_jobs
]

# With `python_job_elements` defined you can adapt the code in the 'for loop' to iterate over the parent elements instead
for job_element in python_job_elements:
    title_element = job_element.find("h2", class_="title")
    company_element = job_element.find("h3", class_="company")
    location_element = job_element.find("p", class_="location")
    print(title_element.text.strip())
    print(company_element.text.strip())
    print(location_element.text.strip())
    print()

# The `.text` attribute returns only the visible text content of an HTML element.
# For example, if you are looking for the URL in a link element then start by fetching all the <a> elements.
# Then, extract the value of the href attributes using square-bracket notation.

for job_element in python_job_elements:
    links = job_element.find_all("a")
    for link in links:
        link_url = link["href"]
        print(f"Apply here: {link_url}\n")

# To fetch the URL of just the second link for each job card
for job_element in python_job_elements:
    link_url = job_element.find_all("a")[1]["href"]
    print(f"Apply here: {link_url}\n")
```

## The web scraping script

In [None]:
import requests
from bs4 import BeautifulSoup
import json
import logging
from typing import Union, List, Dict

def scrape_site(url: str, timeout: float = 10.0) -> Union[str, None]:
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on an
            unresponsive host.

    Returns:
        The decoded response body, or ``None`` if the request failed or the
        server answered with a 4xx/5xx status. Failures are logged rather
        than raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx responses into HTTPError so they take the failure path.
        response.raise_for_status()
    except requests.RequestException as e:
        logging.error(f"Failed to scrape website: {url}, due to {str(e)}")
        return None
    return response.text

def _stripped_text(parent, tag_name: str, css_class: str) -> Union[str, None]:
    """Return the stripped text of the first matching child tag, or None if there is none."""
    element = parent.find(tag_name, class_=css_class)
    if element is None:
        return None
    return element.text.strip()


def parse_html(html: str) -> List[Dict[str, Union[str, None]]]:
    """Extract the job cards from the given HTML.

    Looks up the element with id ``ResultsContainer`` and reads every
    ``div.card-content`` inside it.

    Args:
        html: HTML document to parse.

    Returns:
        One dictionary per card with the keys ``"title"``, ``"company"`` and
        ``"location"``. A value is ``None`` when the corresponding tag is
        missing from the card. The list is empty when the container element
        is absent or holds no cards.
    """
    soup = BeautifulSoup(html, "html.parser")
    results_container = soup.find(id="ResultsContainer")
    if results_container is None:
        return []

    # Each field is looked up once per card via the helper instead of twice.
    return [
        {
            "title": _stripped_text(card, "h2", "title"),
            "company": _stripped_text(card, "h3", "company"),
            "location": _stripped_text(card, "p", "location"),
        }
        for card in results_container.find_all("div", class_="card-content")
    ]

def save_data(data: List[Dict[str, Union[str, None]]], filename: str) -> None:
    """Write *data* to *filename* as JSON indented by four spaces.

    Args:
        data: List of dictionaries to serialize.
        filename: Path of the JSON file to create or overwrite.

    Any failure is logged and swallowed, so this function never raises. On
    success a confirmation line is printed.
    """
    try:
        # Serialize before opening the file: opening in "w" mode truncates it,
        # so a serialization error would otherwise leave a partial file and
        # destroy whatever was saved there before.
        payload = json.dumps(data, indent=4)
        with open(filename, "w", encoding="utf-8") as file:
            file.write(payload)
    except Exception as e:
        # Deliberately broad: saving is best-effort and must not crash the caller.
        logging.error(f"Failed to save data to {filename}, due to {str(e)}")
    else:
        print(f"Data saved to '{filename}'.")

def main(
    url: str = "https://realpython.github.io/fake-jobs/",
    output_path: str = "/content/drive/MyDrive/AF/arcticfrenz.json",
) -> None:
    """Scrape *url*, parse the job cards, and save them as JSON.

    Args:
        url: Page to scrape. Defaults to the fake-jobs demo site.
        output_path: Destination JSON file for the parsed data.

    Each failure is logged and ends the run early; nothing is raised.
    """
    html = scrape_site(url)
    # Compare against None so an empty response body is reported as a parse
    # failure below rather than as a failed download.
    if html is None:
        logging.error(f"Failed to scrape website: {url}")
        return

    parsed_data = parse_html(html)
    if not parsed_data:
        logging.error(f"Failed to parse HTML from {url}")
        return

    save_data(parsed_data, output_path)

# Run the scraper only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

Data saved to '/content/drive/MyDrive/AF/arcticfrenz.json'.
