## Amazon product reviews web-scraper

In [1]:
import grequests
from bs4 import BeautifulSoup
import pandas as pd
import time

  with loop.timer(seconds, ref=ref) as t:


In [2]:
# Reviews listing page of the product to scrape. scrape_reviews() appends the
# star-rating filter and the page number to this URL as extra query parameters.
product_url = "https://www.amazon.co.uk/Smiths-Savoury-Snacks-Favourites-24/product-reviews/B07X2M1D16/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"

In [3]:
# HTTP headers sent with every page request (passed as `headers=` in scrape_reviews).
custom_headers = {
    # Request English content; intended to keep non-English reviews out of the results
    "Accept-language": "en;q=1.0",
    "Accept-Encoding": "gzip, deflate, br",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    # Browser-like User-Agent string (Safari on macOS); presumably so the request
    # looks like it comes from a regular browser — TODO confirm it is still required.
    "User-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
}

In [4]:
def get_soup(response):
    """Parse an HTTP response into a BeautifulSoup document.

    Args:
        response: The response object for a fetched page, or None when the
            request produced no response at all (grequests leaves
            ``.response`` as None if the request itself failed).

    Returns:
        A BeautifulSoup object built from ``response.text`` with the
        "html.parser" backend, or None (after printing an error message)
        when there is no response or its status code is not 200.
    """
    # Guard against a missing response first: reading .status_code on None
    # would raise AttributeError instead of reporting the failure.
    if response is None or response.status_code != 200:
        print("Error in getting webpage")
        return None

    return BeautifulSoup(response.text, "html.parser")

def get_reviews(soup):
    """Extract the review texts from a parsed reviews page.

    Args:
        soup: Parsed page exposing ``select``; every ``div.review`` element
            is treated as one review.

    Returns:
        A list of strings, one per review that has a ``span.review-text``
        element, with all newline characters removed. Reviews without that
        element are skipped.
    """
    scraped_reviews = []

    for review in soup.select("div.review"):
        text_element = review.select_one("span.review-text")
        if text_element is None:
            # No text span in this review container: skip it rather than
            # calling .replace on None, which would raise AttributeError.
            continue

        # NOTE(review): dropping "\n" outright joins the words on either side
        # of a line break (e.g. "vileGutted" in the recorded output); kept
        # as-is so existing results do not change.
        scraped_reviews.append(text_element.text.replace('\n', ''))

    return scraped_reviews

def scrape_reviews(base_url):
    """Collect the review texts for every star rating of a product.

    For each star filter ('one' .. 'five') the reviews listing is fetched
    page by page, starting at page 1, and the texts returned by
    ``get_reviews`` are accumulated.

    Paging for a star rating stops when:
      * the page cannot be fetched or parsed (no response, or ``get_soup``
        returns None) — the remaining pages of that rating are skipped;
      * the page contains the disabled "Next page" button
        (``li`` with class "a-disabled a-last"); or
      * the page yields no reviews at all.

    Args:
        base_url: Reviews listing URL; the star filter and page number are
            appended to it as query parameters.

    Returns:
        A flat list with the review texts of all star ratings.
    """
    all_reviews = []
    star_ratings = ['one', 'two', 'three', 'four', 'five']

    for star in star_ratings:
        page_number = 1

        while True:
            url = f"{base_url}&filterByStar={star}_star&pageNumber={page_number}"
            response = grequests.get(url, headers=custom_headers).send().response

            if response is None:
                # grequests leaves .response as None when the request itself
                # failed (e.g. a connection error), so there is nothing to parse.
                print("Error in getting webpage")
                soup = None
            else:
                soup = get_soup(response)

            if soup is None:
                # Move on to the next star rating. `continue` here would
                # request the same failing page again, forever.
                break

            reviews = get_reviews(soup)
            all_reviews.extend(reviews)

            # Note: there's a valid page for any pageNumber,
            # so we need to stop scraping based on the button of next page.
            # The "Next page" button is rendered disabled on the last page.
            next_page_element = soup.find("li", class_="a-disabled a-last")
            if next_page_element:
                break  # Last page reached

            if not reviews:
                # A page without reviews and without a disabled "Next page"
                # button would otherwise keep the loop running indefinitely.
                break

            page_number += 1

    return all_reviews

In [5]:
# Full scrape: fetches the listing pages over the network for each of the five star filters.
all_reviews = scrape_reviews(product_url)

In [6]:
# Total number of review texts collected across all star ratings.
len(all_reviews)

477

## Timing the scraper end to end

In [7]:
# Timing the scraper end to end (E2E):

def scrape_and_reviews_time(url):
    """Run a full scrape of ``url`` and print how long it took.

    The scraped reviews are discarded; only the elapsed time is reported.

    Args:
        url: Reviews listing URL, forwarded unchanged to ``scrape_reviews``.
    """
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() follows the wall clock and can jump if the system
    # time is adjusted while the scrape is running.
    start_time = time.perf_counter()

    scrape_reviews(url)

    time_taken = time.perf_counter() - start_time

    print(f"Time taken to scrape and export reviews: {time_taken:.2f} seconds")

# Runs the whole scrape a second time purely to measure its duration.
scrape_and_reviews_time(product_url)

Time taken to scrape and export reviews: 41.53 seconds


## Overview of the scraped reviews data (before and after light preprocessing)

In [8]:
# all_reviews is still the list assigned by the first scrape; the timed run does not reassign it.
len(all_reviews)

477

In [9]:
# First collected review, with newlines already removed by get_reviews.
all_reviews[0]

'Have ordered these before on amason , but this taste awful , not sure if it’s how they’ve been stored but just vileGutted as they are normally my favourite snack'

In [10]:
# NOTE(review): this displays every collected review; a slice such as
# all_reviews[:10] would keep the notebook output short.
all_reviews

['Have ordered these before on amason , but this taste awful , not sure if it’s how they’ve been stored but just vileGutted as they are normally my favourite snack',
 'Just realised that all of the packs went out of BB date in the 26th of August, and I ordered in November ..',
 'Out of date when I brought in November',
 'Will not be ordering these again as they were two days out of date when they arrived!',
 'I regularly order these without issue but this last order had 4 packs missing. I am trying to return but it say not eligible for return. Anyone able to help?',
 "I've always loved this but the quality has gone down.It tastes more of salt than actual scampi, disappointing, I won't buy again.",
 'Massively out of date. How can i sell this to my customers!??! BUYER BEWARE  even taste off.',
 'They are what they are, just not the quantity as 2 were missing from the board.',
 'Bought these as a gift. When they arrived they only had a few days left on the use by date. No mention of that