### Initial BeautifulSoup4 Demo:
- [Site being scraped](https://realpython.github.io/fake-jobs/)
- [Guide Site](https://realpython.com/beautiful-soup-web-scraper-python/)

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
def receive_structure(url="https://www.amazon.com/", output_path="amz-home-structure.html", timeout=10):
    """Download a page and save its prettified HTML for offline inspection.

    The response body is saved whatever the HTTP status code is, so the file
    may contain an error page if the server rejects the request.

    Args:
        url: Page to fetch. Defaults to the Amazon home page.
        output_path: File the prettified markup is written to (overwritten
            if it already exists).
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection error, timeout, ...).
    """
    # Without a timeout requests may wait on an unresponsive server forever.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
    # cannot represent every character that appears in real-world pages.
    with open(output_path, "w", encoding="utf-8") as file:
        file.write(soup.prettify())

In [4]:
def extract_structure_of_application_page(data):
    """Save the prettified HTML of one application page for inspection.

    Only the first available link is fetched; per the original author's note
    the application pages are all the same, so one sample is enough.

    Args:
        data: DataFrame with a "link" column holding application-page URLs.

    Raises:
        KeyError: If ``data`` has no "link" column.
        IndexError: If the "link" column contains no non-null value.
        requests.exceptions.RequestException: If the request cannot be
            completed (connection error, timeout, ...).
    """
    # .iloc[0] is positional, so this really is the first link regardless of
    # the frame's index labels (a plain [1] lookup returned the second row
    # and failed on single-row or re-indexed frames).
    link = data["link"].dropna().iloc[0]
    response = requests.get(link, timeout=10)
    soup = BeautifulSoup(response.text, "html.parser")

    # Explicit UTF-8 so the dump does not depend on the platform encoding.
    with open("application-structure.html", "w", encoding="utf-8") as file:
        file.write(soup.prettify())

In [5]:
def _parse_job_card(card):
    """Collect the fields present in one job card; absent fields are left out."""
    job_data = {}

    # (output key, tag name, CSS class or None when the tag name is enough).
    # A single class name matches any element carrying that class, whereas a
    # multi-class string only matches that exact attribute value.
    text_fields = (
        ("date", "time", None),
        ("title", "h2", "title"),
        ("location", "p", "location"),
        ("company", "h3", "company"),
    )
    for key, tag_name, css_class in text_fields:
        if css_class is None:
            tag = card.find(tag_name)
        else:
            tag = card.find(tag_name, class_=css_class)
        if tag is not None:
            job_data[key] = tag.text.strip()

    # The second anchor of the card is used as the job link; guard against
    # cards that have fewer than two anchors or an anchor without href.
    anchors = card.find_all("a")
    if len(anchors) > 1:
        href = anchors[1].get("href")
        if href:
            job_data["link"] = href

    return job_data


def load_data(url="https://realpython.github.io/fake-jobs/", timeout=10):
    """Scrape the job board and return one row per job card.

    Args:
        url: Listing page to scrape.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        pandas.DataFrame built from the cards' "date", "title", "location",
        "company" and "link" values. A field missing from a card is simply
        absent for that row (NaN in the frame).

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error
            status, instead of silently returning an empty frame.
        requests.exceptions.RequestException: For connection errors/timeouts.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    job_cards = soup.find_all("div", class_="card-content")
    collective_job_data = [_parse_job_card(card) for card in job_cards]

    return pd.DataFrame(collective_job_data)

In [6]:
def extract_description_from_application_link(data):
    """Fetch each job's application page and store its description text.

    Args:
        data: DataFrame with a "link" column of application-page URLs.
            It is modified in place: a "description" column is filled for
            every row whose page contains a ``<div class="content">``.

    Returns:
        The same DataFrame object, for convenient chaining.

    Raises:
        requests.exceptions.RequestException: If a request cannot be
            completed (connection error, timeout, ...).
    """
    for index, row in data.iterrows():
        link = row.get("link")
        # Rows scraped without a link hold NaN here; there is nothing to fetch.
        if pd.isna(link):
            continue

        response = requests.get(link, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")

        # Look the element up once and test the very element that is read
        # (the guard used to check a <p> while the text came from a <div>).
        content = soup.find("div", class_="content")
        if content is not None:
            data.loc[index, "description"] = content.text.strip()
    return data

In [7]:
def write_to_csv(data, path="jobs.csv"):
    """Write a DataFrame to a CSV file, leaving out the index column.

    Args:
        data: DataFrame to persist.
        path: Destination file; defaults to "jobs.csv" in the working
            directory. An existing file is overwritten.
    """
    data.to_csv(path, index=False)

In [8]:
# Scrape the fake-jobs listing once; the resulting DataFrame is reused below.
data = load_data()

# Dump page structures to disk for manual inspection:
# amz-home-structure.html and application-structure.html.
receive_structure()
extract_structure_of_application_page(data)

# Persist the scraped listings to jobs.csv.
write_to_csv(data)

In [9]:
# Preview the first rows of the scraped job listings.
data.head()

Unnamed: 0,date,title,location,company,link
0,2021-04-08,Senior Python Developer,"Stewartbury, AA","Payne, Roberts and Davis",https://realpython.github.io/fake-jobs/jobs/se...
1,2021-04-08,Energy engineer,"Christopherville, AA",Vasquez-Davidson,https://realpython.github.io/fake-jobs/jobs/en...
2,2021-04-08,Legal executive,"Port Ericaburgh, AA","Jackson, Chambers and Levy",https://realpython.github.io/fake-jobs/jobs/le...
3,2021-04-08,Fitness centre manager,"East Seanview, AP",Savage-Bradley,https://realpython.github.io/fake-jobs/jobs/fi...
4,2021-04-08,Product manager,"North Jamieview, AP",Ramirez Inc,https://realpython.github.io/fake-jobs/jobs/pr...


In [10]:
# Count listings per location. Counts are sorted in descending order and the
# largest is 1, so all 100 locations are unique.
data["location"].value_counts()

location
Stewartbury, AA         1
Christopherville, AA    1
Port Ericaburgh, AA     1
East Seanview, AP       1
North Jamieview, AP     1
                       ..
Lake Abigail, AE        1
Jacobshire, AP          1
Port Susan, AE          1
North Tiffany, AA       1
Michelleville, AP       1
Name: count, Length: 100, dtype: int64

In [11]:
# NOTE(review): receive_structure() was already called in the pipeline cell
# above; this repeat downloads the page again and overwrites
# amz-home-structure.html - confirm it is still needed.
receive_structure()

# Scraping Amazon
_____________________________________

In [12]:
# Browser-like request headers so Amazon treats the request as coming from a
# regular user rather than a script. Sending them is common practice when
# scraping, because some sites try to block automated clients.
headers = {
    "User-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
    # NOTE(review): "br" (Brotli) is advertised here, but requests only decodes
    # Brotli bodies when the optional brotli/brotlicffi package is installed -
    # confirm it is available, otherwise response.text may be unreadable.
    "Accept-encoding": "gzip, deflate, br",
    "Accept-language": "en-US,en;q=0.9",
    # NOTE(review): this value lists image types only although the requests in
    # this notebook fetch HTML pages - confirm it is intended.
    "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
    "Referer": "https://www.amazon.com/",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1"
}

In [29]:
# Product page used to explore Amazon's markup.
url = "https://www.amazon.com/Bose-QuietComfort-45-Bluetooth-Canceling-Headphones/dp/B098FKXT8L"

# `headers` is the browser-like header dict defined in an earlier cell.
# The timeout keeps the cell from hanging on an unresponsive server.
response = requests.get(url, headers=headers, timeout=10)
soup = BeautifulSoup(response.text, "html.parser")

# Save the structure of the product page for visualization.
# prettify() is called only here; its result used to be computed a second
# time mid-cell and thrown away.
with open("amz-product-structure.html", "w", encoding="utf-8") as file:
    file.write(soup.prettify())

In [21]:
# Testing scraping by class and component combination.
# A multi-class string matches only that exact class attribute value, so the
# lookup returns None as soon as the page orders or extends the classes
# differently (or when the request was blocked).
found = soup.find("span", class_="a-price a-text-price")

if found is None:
    # Check before opening the file so a miss does not leave an empty li.html.
    print('No <span class="a-price a-text-price"> found on the product page.')
else:
    # Explicit UTF-8, consistent with the other structure dumps.
    with open("li.html", "w", encoding="utf-8") as file:
        file.write(found.prettify())

### Taking a search query from the user, then scraping the results page

In [23]:
# Same browser-like headers as the dict defined in the earlier Amazon cell,
# repeated here.
headers = {
    "User-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
    "Accept-encoding": "gzip, deflate, br",
    "Accept-language": "en-US,en;q=0.9",
    "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
    "Referer": "https://www.amazon.com/",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1"
}

search_query = "bose headphones"
url = "https://www.amazon.com/s"

# Hand the query to requests via `params` so it is URL-encoded properly
# (spaces, "&", "#", ...) instead of being pasted raw into the URL string.
# The final URL that was requested is available as response.url.
response = requests.get(url, params={"k": search_query}, headers=headers, timeout=10)
soup = BeautifulSoup(response.text, "html.parser")

# Save the search page structure for inspection; prettify() is called once
# (its result used to be computed a second time mid-cell and thrown away).
with open("amz-search-structure.html", "w", encoding="utf-8") as file:
    file.write(soup.prettify())

In [28]:
# Container holding the search results. The multi-class string must equal the
# element's class attribute exactly, so the lookup can legitimately miss.
found = soup.find("div", class_="s-main-slot s-result-list s-search-results sg-row")

if found is None:
    # Check before opening the file so a miss does not truncate
    # search-results.html and then crash on None.prettify().
    print("Search results container not found on the search page.")
else:
    with open("search-results.html", "w", encoding="utf-8") as file:
        file.write(found.prettify())

In [31]:
# Collect every div whose class attribute is exactly this string.
found = soup.find_all(
    "div",
    class_="sg-col-20-of-24 s-result-item sg-col-0-of-12 sg-col-16-of-20 s-widget sg-col sg-col-12-of-16 s-widget-spacing-large",
)

# Write the prettified markup of all matches, back to back, into one file
# (this replaces whatever search-results.html contained before).
with open("search-results.html", "w", encoding="utf-8") as file:
    file.writelines(item.prettify() for item in found)
# NOTE(review): leftover class string, presumably the next selector to try -
# confirm or remove: a-section a-spacing-small a-spacing-top-small