In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
import time
import random
import logging


In [2]:
# Timestamped INFO-level logging so scrape progress is visible in the cell output.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Browser-like request headers sent with every request made by reviewsHtml.
headers = {
    'authority': 'www.amazon.in',
    'accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.9'
    ),
    'accept-language': 'en-US,en;q=0.9,bn;q=0.8',
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/102.0.0.0 Safari/537.36'
    ),
}

In [3]:
def reviewsHtml(base_url, product_id, len_page, max_retries=3, timeout=30,
                page_delay=(5, 10), retry_delay=(10, 20)):
    """Download review pages 1..len_page and return them as parsed soups.

    Requests are sent with the module-level ``headers`` dict.

    Args:
        base_url: URL prefix that the product id is appended to.
        product_id: Product identifier appended to ``base_url``.
        len_page: Number of review pages to request, starting at page 1.
        max_retries: Maximum number of attempts per page.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.
        page_delay: ``(low, high)`` bounds in seconds for the random pause
            between two successfully fetched pages.
        retry_delay: ``(low, high)`` bounds in seconds for the random pause
            before retrying a failed request.

    Returns:
        A list of ``BeautifulSoup`` objects, one per successfully fetched
        page, in page order. Fetching stops at the first page that fails
        ``max_retries`` times; pages collected before that are still
        returned (an empty list if the first page failed).
    """
    soups = []
    for page_no in range(1, len_page + 1):
        params = {
            'ie': 'UTF8',
            'reviewerType': 'all_reviews',
            'pageNumber': page_no,
        }
        # The first page and the follow-up pages use different "ref" suffixes.
        if page_no == 1:
            full_url = f"{base_url}{product_id}/ref=cm_cr_dp_d_show_all_btm"
        else:
            full_url = f"{base_url}{product_id}/ref=cm_cr_getr_d_paging_btm_next_{page_no}"

        for attempt in range(1, max_retries + 1):
            try:
                logging.info(f"Attempting to scrape page {page_no}")
                response = requests.get(full_url, headers=headers, params=params, timeout=timeout)
                response.raise_for_status()
            except requests.RequestException as e:
                logging.warning(f"Error scraping page {page_no} (attempt {attempt}): {e}")
                # Back off only when another attempt will actually follow.
                if attempt < max_retries:
                    time.sleep(random.uniform(*retry_delay))
                continue
            soups.append(BeautifulSoup(response.text, 'lxml'))
            logging.info(f"Successfully scraped page {page_no}")
            break
        else:
            # Every attempt failed: give up and return what was collected.
            logging.error(f"Failed to scrape page {page_no} after {max_retries} attempts")
            if page_no == 1:
                logging.error("Failed to scrape the first page. Stopping the script.")
            return soups

        # Pause between pages, but do not waste time after the final one.
        if page_no < len_page:
            time.sleep(random.uniform(*page_delay))
    return soups

def _select_text(box, selector, field):
    """Return the stripped text of the first match for ``selector``, or None if absent."""
    try:
        element = box.select_one(selector)
        return element.text.strip() if element else None
    except Exception as e:
        # Best effort: one malformed field must not abort the whole page.
        logging.warning(f"Error extracting {field}: {e}")
        return None


def _format_strip_value(box, label, field):
    """Return the text following ``label`` in the review's format strip, or 'N/A'."""
    try:
        format_strip = box.select_one('.review-format-strip')
        if format_strip is None:
            return 'N/A'
        match = format_strip.find(string=lambda t: t is not None and label in t)
        if match is None:
            return 'N/A'
        # Split on the label itself so values containing ':' are kept whole.
        return match.partition(label)[2].strip()
    except Exception as e:
        logging.warning(f"Error extracting {field}: {e}")
        return 'N/A'


def getReviews(html_data):
    """Extract one dict per review from a parsed review page.

    Args:
        html_data: Parsed page exposing ``select``; each selected review box
            must expose ``select_one`` (e.g. a ``BeautifulSoup`` object).

    Returns:
        A list of dicts with the keys 'Title', 'Description', 'Rating',
        'Color', 'Storage Size' and 'Verified Purchase'. Fields that cannot
        be found are 'N/A', except 'Verified Purchase', which falls back to
        'Not Verified'.
    """
    boxes = html_data.select('div[data-hook="review"]')
    logging.info(f"Found {len(boxes)} review boxes on this page")

    data_dicts = []
    for box in boxes:
        raw_title = _select_text(box, '[data-hook="review-title"]', 'title')
        description = _select_text(box, '[data-hook="review-body"]', 'description')
        raw_rating = _select_text(box, 'i[data-hook="review-star-rating"]', 'rating')
        verified = _select_text(box, '[data-hook="avp-badge"]', 'verified badge')

        data_dicts.append({
            # Only the last line of the title element's text is the title.
            'Title': raw_title.split('\n')[-1] if raw_title is not None else 'N/A',
            'Description': description if description is not None else 'N/A',
            # Only the first space-separated token of the rating text is kept.
            'Rating': raw_rating.split(' ')[0] if raw_rating is not None else 'N/A',
            'Color': _format_strip_value(box, 'Colour:', 'color'),
            'Storage Size': _format_strip_value(box, 'Size:', 'size'),
            'Verified Purchase': verified if verified is not None else 'Not Verified',
        })

    return data_dicts

In [4]:
# Scrape target: product review listing URL prefix, product id, and page count.
base_url = 'https://www.amazon.in/New-Apple-iPhone-12-128GB/product-reviews/'
product_id = 'B08L5TNJHG'
len_page = 15

# Fetch the pages, then flatten the per-page review dicts into one list.
html_datas = reviewsHtml(base_url, product_id, len_page)
reviews = [
    review
    for html_data in html_datas
    for review in getReviews(html_data)
]

2024-09-29 18:42:38,625 - INFO - Attempting to scrape page 1
2024-09-29 18:42:39,659 - INFO - Successfully scraped page 1
2024-09-29 18:42:48,851 - INFO - Attempting to scrape page 2
2024-09-29 18:42:49,560 - INFO - Successfully scraped page 2
2024-09-29 18:42:58,223 - INFO - Attempting to scrape page 3
2024-09-29 18:42:59,160 - INFO - Successfully scraped page 3
2024-09-29 18:43:06,737 - INFO - Attempting to scrape page 4
2024-09-29 18:43:07,642 - INFO - Successfully scraped page 4
2024-09-29 18:43:12,661 - INFO - Attempting to scrape page 5
2024-09-29 18:43:13,658 - INFO - Successfully scraped page 5
2024-09-29 18:43:20,706 - INFO - Attempting to scrape page 6
2024-09-29 18:43:21,541 - INFO - Successfully scraped page 6
2024-09-29 18:43:31,505 - INFO - Attempting to scrape page 7
2024-09-29 18:43:32,350 - INFO - Successfully scraped page 7
2024-09-29 18:43:39,511 - INFO - Attempting to scrape page 8
2024-09-29 18:43:40,493 - INFO - Successfully scraped page 8
2024-09-29 18:43:50,352 

In [5]:
# Show only the first few reviews; dumping the whole list floods the output.
reviews[:3]

[{'Title': 'Confused? Read on! One stop review for your flagship purchase.',
  'Description': 'I went through a lot of reviews and articles before I decided on the iPhone 12. I’ve been an Android user all my life and frankly thought Apple phones were really overpriced(at least here in India). But in the present day, the flagship Android options are no less expensive. If you spend a lot of time on your mobile, using it as your primary camera, social media device, business emails/collaboration and also to stream videos then you definitely need to spend a little more and go with a flagship option for the best experience.Coming to the flagships today (Aug2021),  apart from the iPhone we have Samsung Galaxy S21/+/ultra, Oneplus 9 pro, Vivo x60 pro+, mi 11ultra.  If you read a few reviews you’ll understand that 888Snapdragon has a lot of heating issues  and 9pro cameras didn’t live up to the hype(also overheating when the camera is in use). Samsung in India ships the phone with exynos2100 wh

In [6]:
# Build a DataFrame with one row per scraped review dict.
df=pd.DataFrame(reviews)

In [7]:
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,Title,Description,Rating,Color,Storage Size,Verified Purchase
0,Confused? Read on! One stop review for your fl...,I went through a lot of reviews and articles b...,5.0,Black,128GB,Verified Purchase
1,Excellent product; please purchase if your dat...,The iPhone 12 is the latest iteration of Apple...,5.0,Black,128GB,Verified Purchase
2,An overall good premium experience,"SUMMERY: As a first time iPhone user, I would ...",4.0,,,Verified Purchase
3,Perfect product,It's my second iPhone ever and I bought it wit...,5.0,,,Verified Purchase
4,First step into the ecosystem!,"I have been a smartphone user since 2015, used...",5.0,Blue,128GB,Verified Purchase


In [8]:
# (rows, columns) of the scraped data.
df.shape

(100, 6)

In [9]:
# Count how many reviews fall under each rating value.
df['Rating'].value_counts()

Rating
5.0    53
4.0    41
3.0     5
1.0     1
Name: count, dtype: int64

In [10]:
# Drop rows that are exact duplicates, keeping the first occurrence of each.
# Note: this rebinds `df`, so the frame including duplicates is no longer available.
df = df.drop_duplicates(keep='first')


In [11]:
# Re-check (rows, columns) to see the current size of `df`.
df.shape

(99, 6)

In [12]:
# Count reviews per badge value ('Not Verified' is getReviews' fallback when no badge is found).
df['Verified Purchase'].value_counts()

Verified Purchase
Verified Purchase    99
Name: count, dtype: int64

In [13]:
# Count reviews per colour variant ('N/A' is getReviews' fallback when no 'Colour:' entry is found).
df['Color'].value_counts()

Color
N/A             61
Blue            20
Black           16
(PRODUCT)RED     2
Name: count, dtype: int64

In [14]:
# Count reviews per storage size ('N/A' is getReviews' fallback when no 'Size:' entry is found).
df['Storage Size'].value_counts()

Storage Size
N/A      61
128GB    36
64GB      2
Name: count, dtype: int64