In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
import logging

In [2]:
headers = {
    "authority": "www.amazon.com",
    "pragma": "no-cache",
    "cache-control": "no-cache",
    "dnt": "1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "sec-fetch-site": "none",
    "sec-fetch-mode": "navigate",
    "sec-fetch-dest": "document",
    "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
}

URLS = [
    "https://www.amazon.com/Dell-3910-Business-Computer-Processor/product-reviews/B0C8WYB2F8/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",
    "https://www.amazon.com/Dell-3888-Business-Processor-2-Monitor/product-reviews/B0CDJMPMRC/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",
    "https://www.amazon.com/Sceptre-E248W-19203R-Monitor-Speakers-Metallic/product-reviews/B0773ZY26F/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",
    "https://www.amazon.com/ELLAS-EARS-Wireless-Earbuds-Waterproof/product-reviews/B0BYCNW37S/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",
    "https://www.amazon.com/TakeCase-Subwoofer-Earphones-Earphone-Headphones/product-reviews/B0CLS4V5Q8/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",
    "https://www.amazon.com/Headphones-Warranty-Earphones-Microphone-Isolating/product-reviews/B074M7FJDW/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"
    "https://www.amazon.com/SanDisk-256GB-Extreme-UHS-I-Memory/product-reviews/B09X7CFXSX/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"
    "https://www.amazon.com/NIUTO-Portable-1920x1080-Ultra-Slim-Speakers/product-reviews/B0BJ2PLZKL/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"
    "https://www.amazon.com/G-Anica-Digital-Photography%EF%BC%8C48MP-Vlogging-Card-Wide-Angle/product-reviews/B0CG8ZB8GY/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"
    "https://www.amazon.com/Sonic-Generations-PlayStation-3/product-reviews/B004X56PWK/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"
    "https://www.amazon.com/CHENGDAO-Controller-Wireless-Playstation-Rechargeable-3/product-reviews/B0BZKYQVG7/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"
    "https://www.amazon.com/SB242Y-Zero-Frame-FreeSync-Technology-Ultra-Thin/product-reviews/B0BS9RGKZZ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"
]

In [3]:
def get_page_html(page_url: str) -> str:
    resp = requests.get(page_url, headers=headers)
    return resp.text


def get_reviews_from_html(page_html: str) -> BeautifulSoup:
    soup = BeautifulSoup(page_html, "lxml")
    reviews = soup.find_all("div", {"class": "a-section celwidget"})
    return reviews


def get_review_date(soup_object: BeautifulSoup):
    date_string = soup_object.find("span", {"class": "review-date"}).get_text()
    return date_string


def get_review_text(soup_object: BeautifulSoup) -> str:
    review_text = soup_object.find(
        "span", {"class": "a-size-base review-text review-text-content"}
    ).get_text()
    return review_text.strip()


def get_review_header(soup_object: BeautifulSoup) -> str:
    review_header = soup_object.find(
        "a",
        {
            "class": "a-size-base a-link-normal review-title a-color-base review-title-content a-text-bold"
        },
    ).get_text()
    return review_header.strip()


def get_number_stars(soup_object: BeautifulSoup) -> str:
    stars = soup_object.find("span", {"class": "a-icon-alt"}).get_text()
    return stars.strip()


def get_product_name(soup_object: BeautifulSoup) -> str:
    product = soup_object.find(
        "a", {"class": "a-size-mini a-link-normal a-color-secondary"}
    ).get_text()
    return product.strip()


def orchestrate_data_gathering(single_review: BeautifulSoup) -> dict:
    return {
        "review_text": get_review_text(single_review),
        "review_date": get_review_date(single_review),
        "review_title": get_review_header(single_review),
        "review_stars": get_number_stars(single_review),
        "review_flavor": get_product_link(single_review),
    }

In [4]:
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    all_results = []

    for u in URLS:
        logging.info(u)
        html = get_page_html(u)
        reviews = get_reviews_from_html(html)
        for rev in reviews:
            data = orchestrate_data_gathering(rev)
            all_results.append(data)

    out = pd.DataFrame.from_records(all_results)
    logging.info(f"{out.shape[0]} Is the shape of the dataframe")
    save_name = f"{datetime.now().strftime('reviews')}.csv"
    logging.info(f"saving to {save_name}")
    out.to_csv(save_name)
    logging.info('Done yayy')

INFO:root:https://www.amazon.com/Dell-3910-Business-Computer-Processor/product-reviews/B0C8WYB2F8/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
INFO:root:https://www.amazon.com/Dell-3888-Business-Processor-2-Monitor/product-reviews/B0CDJMPMRC/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
INFO:root:https://www.amazon.com/Sceptre-E248W-19203R-Monitor-Speakers-Metallic/product-reviews/B0773ZY26F/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
INFO:root:https://www.amazon.com/ELLAS-EARS-Wireless-Earbuds-Waterproof/product-reviews/B0BYCNW37S/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
INFO:root:https://www.amazon.com/TakeCase-Subwoofer-Earphones-Earphone-Headphones/product-reviews/B0CLS4V5Q8/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
INFO:root:https://www.amazon.com/Headphones-Warranty-Earphones-Microphone-Isolating/product-reviews/B074M7FJDW/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviewshttps://www.