## Amazon product reviews web-scraper - ~7.5 seconds for 7 pages

In [1]:
import grequests
from bs4 import BeautifulSoup
import pandas as pd
import time

  with loop.timer(seconds, ref=ref) as t:


In [2]:
# Reviews listing that the scraper starts from; "&pageNumber=N" is appended
# to this URL for every page that gets requested.
product_url = (
    "https://www.amazon.co.uk"
    "/Simply-Cheetos-Cheddar-Cheese-Flavored"
    "/product-reviews/B0015GTJCI"
    "/ref=cm_cr_dp_d_show_all_btm"
    "?ie=UTF8&reviewerType=all_reviews"
)

In [3]:
# Browser-like request headers sent with every page request.
custom_headers = {
    # Ask for English content (intended to leave out non-English reviews).
    "Accept-language": "en-US,en;q=0.9",
    # Advertise only encodings that requests can always decode. Brotli ("br")
    # is decoded only when an optional brotli package is installed; without it
    # a Brotli-compressed reply would reach the HTML parser still compressed.
    "Accept-Encoding": "gzip, deflate",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "User-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
}

In [4]:
def get_soup(response):
    """Parse an HTTP response into a BeautifulSoup document.

    Args:
        response: Object exposing ``status_code`` and ``text``, or ``None``
            when the request produced no response at all.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when there is no
        response or its status code is not 200.
    """
    # grequests leaves ``.response`` as None when the request itself fails,
    # so check for that before touching any attribute.
    if response is None or response.status_code != 200:
        print("Error in getting webpage")
        return None

    return BeautifulSoup(response.text, "html.parser")

def get_reviews(soup):
    """Collect the text of every review found in a parsed page.

    Args:
        soup: Parsed page; each ``div.review`` element counts as one review.

    Returns:
        A list with one ``{"content": ...}`` dict per review, in document
        order. ``content`` is the text of the review's ``span.review-text``
        element, or ``None`` when the review has no such element.
    """
    def _content_of(review_node):
        # The review body lives in span.review-text; it may be missing.
        text_node = review_node.select_one("span.review-text")
        if text_node:
            return text_node.text
        return None

    return [{"content": _content_of(node)} for node in soup.select("div.review")]

def scrape_reviews(base_url, max_pages=None):
    """Scrape reviews from consecutive pages of a reviews listing.

    Pages are requested one at a time as ``{base_url}&pageNumber=N``,
    starting at N = 1. Scraping stops as soon as one of these happens:

    * the request yields no response, or the page cannot be parsed,
    * a page contains no reviews,
    * the page shows the disabled "Next page" button,
    * ``max_pages`` pages have been fetched.

    Args:
        base_url: Reviews URL; it must already carry a query string because
            the page number is appended with ``&``.
        max_pages: Optional upper bound on the number of pages to fetch.
            ``None`` (the default) means no limit.

    Returns:
        The reviews returned by ``get_reviews`` for every scraped page,
        concatenated in page order.
    """
    all_reviews = []
    page_number = 1

    while max_pages is None or page_number <= max_pages:
        url = f"{base_url}&pageNumber={page_number}"
        response = grequests.get(url, headers=custom_headers).send().response
        if response is None:
            # grequests does not raise on a failed request; it just leaves
            # .response unset, so there is nothing to parse.
            print("Error in getting webpage")
            break

        soup = get_soup(response)
        if soup is None:
            break  # Exit loop if unable to parse page

        reviews = get_reviews(soup)
        if not reviews:
            # Any pageNumber returns a valid page, so a page without reviews
            # means we are past the end (or were served something that is not
            # a reviews page). Without this check the loop would never end
            # when the disabled "Next page" button is absent.
            break
        all_reviews.extend(reviews)

        # The last page renders "Next page" as a disabled list item. The CSS
        # selector matches regardless of the order of the two class names.
        if soup.select_one("li.a-disabled.a-last"):
            break

        page_number += 1

    return all_reviews

In [5]:
# Run the scraper over the product's review pages.
all_reviews = scrape_reviews(base_url=product_url)

In [6]:
# Tabulate the scraped reviews and save them as a UTF-8 CSV file without the
# index column.
df = pd.DataFrame(data=all_reviews)
df.to_csv(path_or_buf="amazon_reviews.csv", index=False, encoding="utf-8")

## Scraper with total scraping time reported

In [7]:
# Testing the time:
import time

def get_soup(response):
    """Turn an HTTP response into a parsed BeautifulSoup document.

    Args:
        response: Object exposing ``status_code`` and ``text``, or ``None``
            when the request produced no response at all.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when there is no
        response or its status code is not 200.
    """
    # A failed grequests request leaves ``.response`` as None; treat that the
    # same way as a non-200 reply instead of crashing on attribute access.
    if response is None or response.status_code != 200:
        print("Error in getting webpage")
        return None

    return BeautifulSoup(response.text, "html.parser")

def get_reviews(soup):
    """Extract the review texts from a parsed page.

    Args:
        soup: Parsed page; every ``div.review`` element is one review.

    Returns:
        A list of ``{"content": ...}`` dicts, one per review in document
        order. ``content`` holds the text of the review's
        ``span.review-text`` element, or ``None`` if that element is absent.
    """
    # Look up the (possibly missing) text element of each review first.
    text_nodes = [node.select_one("span.review-text") for node in soup.select("div.review")]

    return [
        {"content": text_node.text if text_node else None}
        for text_node in text_nodes
    ]

def scrape_reviews(base_url, max_pages=None):
    """Scrape reviews page by page and report how long the scraping took.

    Pages are requested one at a time as ``{base_url}&pageNumber=N``,
    starting at N = 1. Scraping stops as soon as one of these happens:

    * the request yields no response, or the page cannot be parsed,
    * a page contains no reviews,
    * the page shows the disabled "Next page" button,
    * ``max_pages`` pages have been fetched.

    The elapsed time is printed once the loop has finished.

    Args:
        base_url: Reviews URL; it must already carry a query string because
            the page number is appended with ``&``.
        max_pages: Optional upper bound on the number of pages to fetch.
            ``None`` (the default) means no limit.

    Returns:
        The reviews returned by ``get_reviews`` for every scraped page,
        concatenated in page order.
    """
    all_reviews = []
    page_number = 1

    # perf_counter is a monotonic clock meant for measuring durations, so the
    # result is not affected by system clock adjustments.
    start_time = time.perf_counter()

    while max_pages is None or page_number <= max_pages:
        url = f"{base_url}&pageNumber={page_number}"
        response = grequests.get(url, headers=custom_headers).send().response
        if response is None:
            # grequests does not raise on a failed request; it just leaves
            # .response unset, so there is nothing to parse.
            print("Error in getting webpage")
            break

        soup = get_soup(response)
        if soup is None:
            break  # Exit loop if unable to parse page

        reviews = get_reviews(soup)
        if not reviews:
            # Any pageNumber returns a valid page, so a page without reviews
            # means we are past the end (or were served something that is not
            # a reviews page). Without this check the loop would never end
            # when the disabled "Next page" button is absent.
            break
        all_reviews.extend(reviews)

        # The last page renders "Next page" as a disabled list item. The CSS
        # selector matches regardless of the order of the two class names.
        if soup.select_one("li.a-disabled.a-last"):
            break

        page_number += 1

    scraping_time = time.perf_counter() - start_time

    print(f"Scraping completed in {scraping_time:.2f} seconds")

    return all_reviews

In [8]:
# Re-run the scrape with the timed version of the scraper.
all_reviews = scrape_reviews(base_url=product_url)

Scraping completed in 6.91 seconds


## Overview of the scraped reviews data

In [9]:
# Number of reviews collected across all scraped pages.
len(all_reviews)

70

In [10]:
# Spot-check one scraped review. The hard-coded index 68 requires at least 69
# collected reviews and raises IndexError otherwise.
all_reviews[68]

{'content': '\nWe did not like these cheese puffs. They tasted stale.\n'}

In [11]:
# Print every scraped review dict, one per line.
for review in all_reviews:
    print(review)

{'content': '\nExcellent - the best thing that I’ve ever tasted in my entire life.The problem that it’s very hard to get it only through amazon and it’s taking too too much time to be received.\n'}
{'content': '\nWow, these are so addicting! I really don’t eat any other chips very often, but these are my go-to. For some reason they seem slightly healthier than regular Cheetos, and are made my Simply. The white cheddar taste is delicious and you don’t get bright orange fingers after eating them. They have the perfect crispy puff texture, and are larger in size. The bag is also nice and big.\n'}
{'content': "\nIt's hard to find quality snacks here. These were awesome, great taste ! Will def buy again\n"}
{'content': '\nWith healthy ingredients and strong flavor of white cheddar. These are one of their best products!\n'}
{'content': '\nWay more cheesy taste that the orange puffs!! So so good, can eat the entire bag!\n'}
{'content': '\nBeen eating these for years. All natural, I cannot eat