## Amazon product reviews web-scraper - ~6.5 seconds for 10 pages

In [1]:
import grequests
from bs4 import BeautifulSoup
import pandas as pd
import time
import csv

  with loop.timer(seconds, ref=ref) as t:


In [2]:
# Product-reviews listing URL on amazon.co.uk used as the scraping entry point.
# It already carries a query string, so scrape_reviews appends its star filter
# and page number to it with "&".
product_url = "https://www.amazon.co.uk/Smiths-Savoury-Snacks-Favourites-24/product-reviews/B07X2M1D16/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"

In [3]:
# HTTP headers sent with every page request (passed as `headers=` in scrape_reviews).
custom_headers = {
    # Ask for English content; intended to eliminate non-English reviews
    "Accept-language": "en;q=1.0",
    # NOTE(review): "br" advertises Brotli support — confirm the HTTP client in
    # this environment can decode Brotli, otherwise the response text may be unreadable.
    "Accept-Encoding": "gzip, deflate, br",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    # Browser-like User-Agent string (Safari 17.1 on macOS)
    "User-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
}

In [4]:
def get_soup(response):
    """Parse an HTTP response into a BeautifulSoup tree.

    Args:
        response: A response object exposing ``status_code`` and ``text``,
            or ``None`` when no response was obtained (e.g. the request
            could not be sent at all).

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend, or
        ``None`` when there is no response or its status code is not 200.
    """
    # A missing response is treated like a bad status instead of crashing on
    # ``None.status_code``.
    if response is None or response.status_code != 200:
        print("Error in getting webpage")
        return None

    return BeautifulSoup(response.text, "html.parser")

def get_reviews(soup):
    """Collect the text of every review block found in a parsed page.

    Args:
        soup: Parsed page supporting CSS selection via ``select``.

    Returns:
        list[dict]: One ``{"content": text}`` dict per ``div.review`` element,
        in document order. ``text`` is the raw text of the review's
        ``span.review-text`` element, or ``None`` when that element is absent.
    """
    def _body_text(block):
        # The review body lives in a dedicated span; some blocks may lack it.
        node = block.select_one("span.review-text")
        return None if not node else node.text

    return [{"content": _body_text(block)} for block in soup.select("div.review")]

def scrape_reviews(base_url):
    """Scrape the reviews of a product, one star rating at a time.

    For each star filter ('one' .. 'five') the review pages are requested in
    order starting at page 1, and the reviews found on each page are appended
    to one combined list. Paging for a star rating stops when any of these
    happens:

    * the page could not be fetched/parsed (the rest of that rating is skipped),
    * the page contains no reviews,
    * the page shows a disabled "Next page" button (last page reached).

    Args:
        base_url: Reviews URL that already contains a query string; the star
            filter and page number are appended to it with ``&``.

    Returns:
        list[dict]: The dicts produced by ``get_reviews`` for every page
        visited, in star order and then page order.
    """
    all_reviews = []
    star_ratings = ['one', 'two', 'three', 'four', 'five']

    for star in star_ratings:
        page_number = 1

        while True:
            url = f"{base_url}&filterByStar={star}_star&pageNumber={page_number}"
            response = grequests.get(url, headers=custom_headers).send().response
            soup = get_soup(response)

            if soup is None:
                # Give up on this star rating. A `continue` here would request
                # the very same page again and never terminate.
                break

            reviews = get_reviews(soup)
            if not reviews:
                # Any pageNumber yields a valid page, so an empty page (e.g. a
                # rating with no pagination bar at all) must end the loop too.
                break
            all_reviews.extend(reviews)

            # The last page renders the "Next page" button as disabled.
            # Matching the two classes individually does not depend on the
            # order in which they appear in the class attribute.
            if soup.select_one("li.a-disabled.a-last") is not None:
                break

            page_number += 1

    return all_reviews


def preprocess_and_export_reviews_to_csv(reviews, filename):
    """Strip newlines from each review and write the results to a one-column CSV.

    The file is created (or overwritten) with UTF-8 encoding and a single
    ``content`` header column, one row per review.

    Args:
        reviews: Iterable of dicts holding the review text under ``'content'``.
            A missing or ``None`` content is written as an empty field so the
            row count still matches the number of reviews.
        filename: Path of the CSV file to write.
    """
    fieldnames = ['content']

    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for review in reviews:
            # Reviews without a text element carry None; calling .replace on
            # that would raise, so fall back to an empty string.
            content = review.get('content') or ''
            writer.writerow({'content': content.replace('\n', '')})

In [5]:
# Scrape every star rating for the product, then write the newline-stripped
# review text to amazon_reviews.csv.
all_reviews = scrape_reviews(product_url)
preprocess_and_export_reviews_to_csv(all_reviews, 'amazon_reviews.csv')

In [6]:
# Total number of reviews collected across the five star filters
len(all_reviews)

477

## Scraper with total run time, including exporting to CSV

In [7]:
# Testing the time- E2E:

def scrape_and_export_reviews_time(url, filename):
    """Run the scrape-and-export pipeline end to end and print how long it took.

    Args:
        url: Reviews URL handed to ``scrape_reviews``.
        filename: Path of the CSV file handed to
            ``preprocess_and_export_reviews_to_csv``.

    Returns:
        None. The elapsed time is printed with two decimal places.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()

    all_reviews = scrape_reviews(url)
    preprocess_and_export_reviews_to_csv(all_reviews, filename)

    time_taken = time.perf_counter() - start_time

    print(f"Time taken to scrape and export reviews: {time_taken:.2f} seconds")

# Repeats the full scrape and rewrites amazon_reviews.csv, printing the elapsed time
scrape_and_export_reviews_time(product_url, 'amazon_reviews.csv')

Time taken to scrape and export reviews: 41.47 seconds


## Overview of the scraped reviews data (before and after light preprocessing)

In [8]:
# The timed run keeps its results in a local variable, so this is still the
# list produced by the earlier scrape cell.
len(all_reviews)

477

In [9]:
# Sample of the raw in-memory data: newlines are only stripped when exporting to CSV
all_reviews[68]

{'content': "\nDisappointed to receive out of date scampi fries! I didn't even check until I opened a pack and thought they tasted stale :(\n"}

In [10]:
# Load the exported CSV back to inspect the preprocessed (newline-stripped) text
df = pd.read_csv('amazon_reviews.csv')
df.head()

Unnamed: 0,content
0,Just realised that all of the packs went out o...
1,"Have ordered these before on amason , but this..."
2,Out of date when I brought in November
3,Will not be ordering these again as they were ...
4,I regularly order these without issue but this...


  with loop.timer(seconds, ref=ref) as t:


In [11]:
# Check the Python type of the first value in the 'content' column
type(df['content'][0])

str