
# Amazon Reviews Scraping

In [396]:
# Import packages
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

In [397]:
# Header to set the requests as a browser requests

headers = {
    'authority': 'www.amazon.com',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/102.0.0.0 Safari/537.36'
}

In [398]:
# url of The amazon Review page
reviews_url = 'https://www.amazon.com/Legendary-Whitetails-Journeyman-Jacket-Tarmac/product-reviews/B013KW38RQ/'

In [399]:
# number of pages to scrape.
len_page = 4

### Functions

In [400]:
# This function retrieves the HTML data from multiple pages of 
# Amazon reviews.

# Extra Data as Html object from amazon Review page
def reviewsHtml(url, len_page):
    
    # Empty List define to store all pages html data
    soups = []
    
    # Loop for gather all 3000 reviews from 300 pages via range
    for page_no in range(1, len_page + 1):
        
        # parameter set as page no to the requests body
        params = {
            'ie': 'UTF8',
            'reviewerType': 'all_reviews',
            'filterByStar': 'critical',
            'pageNumber': page_no,
        }
        
        # Request make for each page
        response = requests.get(url, headers=headers)
        
        # Save Html object by using BeautifulSoup4 and lxml parser
        soup = BeautifulSoup(response.text, 'lxml')
        
        # Add single Html page data in master soups list
        soups.append(soup)
        
    return soups

In [401]:
# This function extracts relevant information from the HTML data of each review.
# Grab Reviews name, description, date, stars, title from HTML
def getReviews(html_data):

    # Create Empty list to Hold all data
    data_dicts = []
    
    # Select all Reviews BOX html using css selector
    boxes = html_data.select('div[data-hook="review"]')
    
    # Iterate all Reviews BOX 
    for box in boxes:
        
        # Select Name using css selector and cleaning text using strip()
        # If Value is empty define value with 'N/A' for all.
        try:
            name = box.select_one('[class="a-profile-name"]').text.strip()
        except Exception as e:
            name = 'N/A'

        try:
            stars = box.select_one('[data-hook="review-star-rating"]').text.strip().split(' out')[0]
        except Exception as e:
            stars = 'N/A'   

        try:
            title = box.select_one('[data-hook="review-title"]').text.strip()
        except Exception as e:
            title = 'N/A'

        try:
            # Convert date str to dd/mm/yyy format
            datetime_str = box.select_one('[data-hook="review-date"]').text.strip().split(' on ')[-1]
            date = datetime.strptime(datetime_str, '%B %d, %Y').strftime("%d/%m/%Y")
        except Exception as e:
            date = 'N/A'

        try:
            description = box.select_one('[data-hook="review-body"]').text.strip()
        except Exception as e:
            description = 'N/A'

        # create Dictionary with al review data 
        data_dict = {
            'Name' : name,
            'Stars' : stars,
            'Title' : title,
            'Date' : date,
            'Description' : description
        }

        # Add Dictionary in master empty List
        data_dicts.append(data_dict)
    
    return data_dicts

### Data Process

In [408]:
# Grab all HTML
html_datas = reviewsHtml(reviews_url, len_page)

In [409]:
# Empty List to Hold all reviews data
reviews = []

In [410]:
# Iterate all Html page 
for html_data in html_datas:
    
    # Grab review data
    review = getReviews(html_data)
    
    # add review data in reviews empty list
    reviews += review

In [411]:
# Create a dataframe with reviews Data
df_reviews = pd.DataFrame(reviews)

In [412]:
df_reviews

Unnamed: 0,Name,Stars,Title,Date,Description
0,Nick,5.0,"5.0 out of 5 stars\nBought twice, would buy again",24/02/2024,I bought this jacket first five years ago and ...
1,Bartek,5.0,5.0 out of 5 stars\nRugged Comfort Meets Style...,19/03/2024,The Legendary Whitetails Journeyman Shirt Jack...
2,#1NDfan,5.0,5.0 out of 5 stars\nFunctional & durable,28/04/2024,Bought this for my farmer husband (Christmas 2...
3,Jeff,4.0,"4.0 out of 5 stars\nRugged, good look and nice...",02/10/2018,The short answer to if you should go down from...
4,David S.,5.0,5.0 out of 5 stars\nNice jacket - slim fit,07/04/2024,I have a slim build and the medium fit me perf...
5,ROB P.,5.0,5.0 out of 5 stars\nCool-lookin Shacket,30/03/2024,"I purchased the Tobacco color, and thrilled wi..."
6,M. Weber,3.0,"3.0 out of 5 stars\nNeck is massive, great fin...",09/03/2024,The fabric is really quite nice. Silky and dec...
7,Amazon Customer,5.0,5.0 out of 5 stars\nFits and looks great,13/04/2024,I really like the legendary style and quality ...
8,Matt,4.0,4.0 out of 5 stars\nNice jacket - could be better,23/03/2024,I like the jacket a lot and will probably wear...
9,Remycat,4.0,"4.0 out of 5 stars\nSolid value, good investme...",27/04/2024,I lost some weight and it was just too big. I ...


In [413]:
#dataframe is converted to csv file
df_reviews.to_csv(r'G:\reviews.csv',index=True)
