In [1]:
import csv
import logging
import os
import pickle
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Configure the root logger: INFO level, "timestamp - level - message" line format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [2]:
def is_amazon_url(url):
    """Checks whether a URL points at the amazon.com host.

    The URL must start with an optional ``http://``/``https://`` scheme, an
    optional ``www.`` prefix, and then ``amazon.com/``. Only the host prefix
    is checked; the path is not validated as a product page.

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL starts with an amazon.com host, False otherwise
            (including when ``url`` is not a string).
    """
    if not isinstance(url, str):
        return False
    # The dot after "www" must be escaped; an unescaped "." matches any
    # character and would accept hosts such as "wwwXamazon.com".
    amazon_pattern = r"(https?://)?(www\.)?amazon\.com/"
    return re.match(amazon_pattern, url) is not None

def save_reviews_to_csv(reviews, filename='scraped_reviews.csv'):
    """Writes review dicts to a CSV file with "content" and "rating" columns.

    Nothing is written when ``reviews`` is empty; the outcome is logged
    either way.

    Args:
        reviews (list): Dicts holding "content" and "rating" keys.
        filename (str, optional): Destination path. Defaults to
            'scraped_reviews.csv'.
    """
    if reviews:
        columns = ["content", "rating"]
        with open(filename, 'w', newline='', encoding='utf-8') as csv_handle:
            dict_writer = csv.DictWriter(csv_handle, fieldnames=columns)
            dict_writer.writeheader()
            for row in reviews:
                dict_writer.writerow(row)
        total = len(reviews)
        logging.info(f"Saved {total} reviews to '{filename}'.")
    else:
        logging.info("No reviews to save.")

In [3]:
def _login_and_save_cookies(product_url, phone_number, password, cookies_file_path, driver_path):
    """Logs in through a visible Chrome window and pickles the session cookies.

    Returns the list of cookie dicts on success, or None if credentials are
    missing, the login flow raises, or a captcha is still present after the
    manual-solve window.
    """
    if not phone_number or not password:
        # Fail before launching a browser: send_keys(None) would raise later anyway.
        logging.error("Login failed: phone_number and password are required when no saved cookies are available.")
        return None

    # A non-headless browser is used so a human can solve a captcha if one appears.
    driver = webdriver.Chrome(service=Service(driver_path))
    try:
        logging.info("Attempting to log in.")
        driver.get(product_url)

        login_link = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#nav-link-accountList"))
        )
        login_link.click()
        time.sleep(2)

        phone_input = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#ap_email"))
        )
        phone_input.send_keys(phone_number)

        continue_button = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#continue"))
        )
        continue_button.click()
        time.sleep(2)

        password_input = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#ap_password"))
        )
        password_input.send_keys(password)

        sign_in_button = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#signInSubmit"))
        )
        sign_in_button.click()
        time.sleep(5)

        if driver.find_elements(By.CSS_SELECTOR, '#auth-captcha-guess-text'):
            logging.error("Captcha detected. Please solve it manually.")
            logging.info("Sleeping for 2 minutes to solve captcha.")
            time.sleep(120)  # Giving 2 minutes for captcha solving
            if driver.find_elements(By.CSS_SELECTOR, '#auth-captcha-guess-text'):
                logging.error("Captcha was not solved. Login failed")
                return None
        else:
            logging.info("Captcha was not detected.")
        logging.info("Login successful")

        logging.info("Saving cookies after successful login.")
        cookies = driver.get_cookies()
        with open(cookies_file_path, "wb") as cookies_file:
            pickle.dump(cookies, cookies_file)
        return cookies
    except Exception as login_e:
        logging.error(f"Login failed: {login_e}")
        return None
    finally:
        # Single cleanup point for every exit path (success, captcha failure, error).
        driver.quit()
        logging.info("Login driver quitted.")


def _apply_cookies(driver, cookies):
    """Adds each cookie dict to the driver's session, logging individual failures instead of raising."""
    for cookie in cookies:
        try:
            driver.add_cookie(cookie)
        except Exception as cookie_e:
            logging.warning(f"Could not add cookie {cookie.get('name')}: {cookie_e}")


def _parse_reviews(review_elements, limit):
    """Extracts at most `limit` {"content", "rating"} dicts from BeautifulSoup review nodes."""
    parsed = []
    for review in review_elements:
        if len(parsed) >= limit:
            break
        try:
            content_element = review.select_one('[data-hook="review-body"]')
            rating_element = review.select_one('[data-hook="review-star-rating"] > span.a-icon-alt')

            if not (content_element and rating_element):
                logging.warning("Skipping review due to missing content or rating")
                continue

            # Extract the numerical rating (e.g. "4.0") from the rating text.
            rating_match = re.search(r'(\d+(\.\d+)?)', rating_element.get_text(strip=True))
            parsed.append({
                "content": content_element.get_text(strip=True),
                "rating": rating_match.group(1) if rating_match else None,
            })
        except Exception as inner_e:
            logging.error(f"Error parsing review: {inner_e}")
    return parsed


def scrape_amazon_reviews(product_url, max_reviews=50, phone_number=None, password=None):
    """
    Scrapes reviews from an Amazon product page using a headless Chrome session.

    Saved cookies ("amazon_cookies.pkl") are loaded first to skip login. If that
    fails, a visible browser performs the login (with a 2-minute window to solve
    a captcha manually), the cookies are saved, and they are also applied to the
    headless session so the current run is authenticated. The function then
    follows the "see all reviews" link and walks the review pages until
    `max_reviews` reviews are collected or no next page is available.

    Args:
        product_url (str): The URL of the Amazon product page.
        max_reviews (int, optional): Maximum number of reviews to scrape. Defaults to 50.
        phone_number (str, optional): Amazon account phone number. Needed only
            when no usable cookie file exists.
        password (str, optional): Amazon account password. Needed only when no
            usable cookie file exists.

    Returns:
        list: A list of dictionaries, each containing review "content" and "rating".
            Returns an empty list if no reviews were scraped or login/captcha fails.

    Raises:
        ValueError: If `product_url` is not accepted by `is_amazon_url`.
    """

    if not is_amazon_url(product_url):
        raise ValueError("Invalid amazon.com product URL.")

    scraped_reviews = []
    page_number = 1
    cookies_file_path = "amazon_cookies.pkl"

    # Set up Chrome options for headless mode
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    # Resolve the chromedriver binary once and reuse it for both browsers.
    driver_path = ChromeDriverManager().install()
    driver_headless = webdriver.Chrome(service=Service(driver_path), options=chrome_options)

    try:
        logging.info("Navigating to product page in headless mode.")
        # Cookies can only be added for the domain currently loaded.
        driver_headless.get(product_url)

        try:
            logging.info("Loading cookies.")
            # SECURITY: pickle.load can execute arbitrary code from a tampered
            # file. Only load a cookie file written by this function; consider
            # switching the on-disk format to JSON.
            with open(cookies_file_path, "rb") as cookies_file:
                cookies = pickle.load(cookies_file)
            for cookie in cookies:
                driver_headless.add_cookie(cookie)
            driver_headless.get(product_url)
            logging.info("Cookies loaded successfully. Bypassing login.")
        except Exception as cookie_e:
            logging.warning(f"Error loading cookies, proceeding with login. {cookie_e}")

            cookies = _login_and_save_cookies(
                product_url, phone_number, password, cookies_file_path, driver_path
            )
            if cookies is None:
                return []

            # Transfer the fresh login to the headless session; without this
            # the current run would continue unauthenticated.
            _apply_cookies(driver_headless, cookies)
            driver_headless.get(product_url)

        try:
            logging.info("Attempting to find 'see all reviews' link")
            see_all_link = WebDriverWait(driver_headless, 20).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "a[data-hook='see-all-reviews-link-foot']"))
            )

            logging.info("Found 'see all reviews' link. Clicking it.")
            see_all_link.click()
            time.sleep(5)

        except Exception as e:
            logging.error(f"Could not find 'see all reviews' link {e}")
            return scraped_reviews

        logging.info("Navigated to review page")

        while len(scraped_reviews) < max_reviews:
            logging.info(f"Fetching reviews from page: {page_number}")

            try:
                WebDriverWait(driver_headless, 20).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '[data-hook="review"]'))
                )
            except Exception:
                # `Exception` (not a bare except) so KeyboardInterrupt still propagates.
                logging.info("Could not find any reviews in the page.")
                break

            soup = BeautifulSoup(driver_headless.page_source, 'html.parser')
            review_elements = soup.select('[data-hook="review"]')

            if not review_elements:
                logging.info(f"No reviews found on page {page_number}")
                break

            remaining = max_reviews - len(scraped_reviews)
            scraped_reviews.extend(_parse_reviews(review_elements, remaining))
            if len(scraped_reviews) >= max_reviews:
                break

            try:
                # A missing "next" link surfaces as a wait timeout, handled below.
                next_button = WebDriverWait(driver_headless, 20).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "li.a-last > a"))
                )
                next_button.click()
                page_number += 1
                time.sleep(5)
                logging.info("Moving to next page")
            except Exception as next_e:
                logging.info(f"Next page button not found: {next_e}. Stopping pagination.")
                break

    except Exception as e:
        logging.error(f"An error occurred during scraping: {e}")
    finally:
        # Single cleanup point: runs for every return path above.
        driver_headless.quit()
        logging.info("Driver quitted after reviews scraping.")

    return scraped_reviews

In [4]:
if __name__ == '__main__':
    try:
        product_url = "https://www.amazon.com/i7-13620H-Processor-GeForce-Display-ANV15-51-73B9/dp/B0CMRGBXM9/ref=nav_ya_signin?_encoding=UTF8&sr=8-4"
        # Credentials must never be committed in a notebook. They are read from
        # the environment and may be unset (None) when a saved cookie file lets
        # the scraper skip login.
        phone_number = os.environ.get("AMAZON_PHONE_NUMBER")
        password = os.environ.get("AMAZON_PASSWORD")
        reviews = scrape_amazon_reviews(product_url, max_reviews=100, phone_number=phone_number, password=password)
        save_reviews_to_csv(reviews)
        if reviews:
            logging.info("Scraping Complete.")
        else:
            logging.info("No reviews were scraped.")
    except ValueError as e:
        # Raised by scrape_amazon_reviews when the URL is rejected.
        logging.error(e)

2025-01-27 22:34:25,222 - INFO - Get LATEST chromedriver version for google-chrome
2025-01-27 22:34:25,331 - INFO - Get LATEST chromedriver version for google-chrome
2025-01-27 22:34:25,479 - INFO - Driver [C:\Users\ericp\.wdm\drivers\chromedriver\win64\131.0.6778.264\chromedriver-win32/chromedriver.exe] found in cache
2025-01-27 22:34:26,644 - INFO - Navigating to product page in headless mode.
2025-01-27 22:34:31,557 - INFO - Loading cookies.
2025-01-27 22:34:35,377 - INFO - Cookies loaded successfully. Bypassing login.
2025-01-27 22:34:35,378 - INFO - Attempting to find 'see all reviews' link
2025-01-27 22:34:35,435 - INFO - Found 'see all reviews' link. Clicking it.
2025-01-27 22:34:41,806 - INFO - Navigated to review page
2025-01-27 22:34:41,807 - INFO - Fetching reviews from page: 1
2025-01-27 22:34:47,119 - INFO - Moving to next page
2025-01-27 22:34:47,120 - INFO - Fetching reviews from page: 2
2025-01-27 22:34:52,490 - INFO - Moving to next page
2025-01-27 22:34:52,491 - INFO 