In [5]:
import time
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import json

class AmazonReviewsScraper:
    def __init__(self):
        self.setup_driver()
        
    def setup_driver(self):
        """Setup Chrome driver with options"""
        chrome_options = Options()
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument("--start-maximized")
        
        # You may need to specify the path to your chromedriver
        # service = Service('/path/to/chromedriver')
        # self.driver = webdriver.Chrome(service=service, options=chrome_options)
        
        self.driver = webdriver.Chrome(options=chrome_options)
        self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
        
    def wait_for_element(self, selector, timeout=30, by=By.CSS_SELECTOR):
        """Wait for element with custom timeout and retry logic"""
        max_retries = 3
        for attempt in range(max_retries):
            try:
                element = WebDriverWait(self.driver, timeout).until(
                    EC.presence_of_element_located((by, selector))
                )
                return element
            except TimeoutException:
                if attempt < max_retries - 1:
                    print(f"Timeout waiting for {selector}, retrying... (attempt {attempt + 1})")
                    time.sleep(2)
                else:
                    print(f"Final timeout waiting for {selector}")
                    return None
        return None
    
    def scrape_reviews(self, product_url, max_pages=None):
        """Scrape reviews from Amazon product page - if max_pages is None, scrape all pages"""
        reviews_data = []
        
        try:
            self.driver.get(product_url)
            time.sleep(3)
            
            # First, try to scrape reviews from the product page
            print("Attempting to scrape reviews from product page...")
            try:
                # Wait for the customerReviews section with longer timeout
                customer_reviews = self.wait_for_element("#customerReviews", timeout=15, by=By.ID)
                
                if customer_reviews:
                    # Try to find reviews on the main product page
                    review_elements = self.driver.find_elements(By.CSS_SELECTOR, "[data-hook='review']")
                    
                    if review_elements:
                        print(f"Found {len(review_elements)} reviews on product page")
                        for review_element in review_elements:
                            review_data = self.extract_review_data(review_element)
                            if review_data:
                                reviews_data.append(review_data)
                
            except Exception as e:
                print(f"Error scraping from product page: {str(e)}")
            
            # Now try to navigate to the dedicated reviews page
            print("Attempting to navigate to reviews page...")
            try:
                # Try multiple selectors for "See all reviews" link
                see_all_selectors = [
                    "[data-hook='see-all-reviews-link-foot']",
                    "a[href*='product-reviews']",
                    ".cr-widget-FocalReviews a[href*='product-reviews']",
                    ".reviewsSection a[href*='product-reviews']"
                ]
                
                clicked = False
                for selector in see_all_selectors:
                    try:
                        see_all_reviews = self.driver.find_element(By.CSS_SELECTOR, selector)
                        if see_all_reviews.is_displayed():
                            # Scroll to element before clicking
                            self.driver.execute_script("arguments[0].scrollIntoView(true);", see_all_reviews)
                            time.sleep(1)
                            
                            # Try JavaScript click first
                            try:
                                self.driver.execute_script("arguments[0].click();", see_all_reviews)
                                clicked = True
                                print("Successfully clicked 'See all reviews' link")
                                break
                            except:
                                # If JS click fails, try regular click
                                see_all_reviews.click()
                                clicked = True
                                print("Successfully clicked 'See all reviews' link")
                                break
                    except (NoSuchElementException, Exception):
                        continue
                
                if not clicked:
                    print("Could not find or click 'See all reviews' link")
                    # If we can't click the link, try to construct the reviews URL directly
                    if "/dp/" in product_url:
                        asin = product_url.split("/dp/")[1].split("/")[0].split("?")[0]
                        reviews_url = f"https://www.amazon.com/product-reviews/{asin}/"
                        print(f"Trying direct reviews URL: {reviews_url}")
                        self.driver.get(reviews_url)
                        clicked = True
                
                if clicked:
                    time.sleep(3)
                    
                    # Scrape reviews from multiple pages
                    page = 0
                    consecutive_failures = 0
                    max_consecutive_failures = 3
                    
                    while True:
                        page += 1
                        
                        # If max_pages is set and we've reached it, break
                        if max_pages is not None and page > max_pages:
                            print(f"Reached maximum pages limit ({max_pages})")
                            break
                            
                        print(f"Scraping reviews page {page}...")
                        
                        try:
                            # Wait for reviews to load with retry logic
                            reviews_found = False
                            for attempt in range(3):  # Try 3 times to find reviews
                                try:
                                    WebDriverWait(self.driver, 20).until(
                                        EC.presence_of_element_located((By.CSS_SELECTOR, "[data-hook='review']"))
                                    )
                                    reviews_found = True
                                    break
                                except TimeoutException:
                                    if attempt < 2:
                                        print(f"Timeout finding reviews on page {page}, retrying...")
                                        time.sleep(3)
                                        self.driver.refresh()
                                        time.sleep(3)
                                    else:
                                        print(f"Final timeout finding reviews on page {page}")
                            
                            if not reviews_found:
                                consecutive_failures += 1
                                if consecutive_failures >= max_consecutive_failures:
                                    print(f"Failed to find reviews on {consecutive_failures} consecutive pages. Stopping.")
                                    break
                                continue
                            
                            # Reset consecutive failures counter
                            consecutive_failures = 0
                            
                            # Get all review elements on current page
                            review_elements = self.driver.find_elements(By.CSS_SELECTOR, "[data-hook='review']")
                            print(f"Found {len(review_elements)} reviews on page {page}")
                            
                            if len(review_elements) == 0:
                                print("No reviews found on this page, ending scraping")
                                break
                            
                            # Extract review data
                            page_reviews_count = 0
                            for review_element in review_elements:
                                review_data = self.extract_review_data(review_element)
                                if review_data:
                                    reviews_data.append(review_data)
                                    page_reviews_count += 1
                            
                            print(f"Successfully extracted {page_reviews_count} reviews from page {page}")
                            
                            # Try to go to next page
                            try:
                                next_button = self.driver.find_element(By.CSS_SELECTOR, "li.a-last a")
                                if next_button and "a-disabled" not in next_button.get_attribute("class"):
                                    self.driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
                                    time.sleep(2)
                                    
                                    # Try clicking next button with retry
                                    clicked_next = False
                                    for attempt in range(3):
                                        try:
                                            self.driver.execute_script("arguments[0].click();", next_button)
                                            clicked_next = True
                                            break
                                        except Exception as e:
                                            if attempt < 2:
                                                print(f"Failed to click next button, retrying... (attempt {attempt + 1})")
                                                time.sleep(2)
                                            else:
                                                print(f"Failed to click next button after 3 attempts: {str(e)}")
                                    
                                    if not clicked_next:
                                        print("Could not proceed to next page")
                                        break
                                    
                                    time.sleep(5)  # Wait longer for page to load
                                else:
                                    print("No more pages available (next button disabled)")
                                    break
                            except NoSuchElementException:
                                print("Next button not found, ending pagination")
                                break
                                
                        except Exception as e:
                            print(f"Error on page {page}: {str(e)}")
                            consecutive_failures += 1
                            if consecutive_failures >= max_consecutive_failures:
                                print(f"Too many consecutive failures ({consecutive_failures}). Stopping.")
                                break
                            time.sleep(3)
                            continue
                            
            except Exception as e:
                print(f"Error navigating to reviews page: {str(e)}")
                # If all else fails, try to scrape from current page
                try:
                    review_elements = self.driver.find_elements(By.CSS_SELECTOR, "[data-hook='review']")
                    if review_elements:
                        print(f"Scraping {len(review_elements)} reviews from current page")
                        for review_element in review_elements:
                            review_data = self.extract_review_data(review_element)
                            if review_data:
                                reviews_data.append(review_data)
                except:
                    pass
                    
        except Exception as e:
            print(f"Error occurred: {str(e)}")
            
        # Remove duplicates based on review text
        seen_reviews = set()
        unique_reviews = []
        for review in reviews_data:
            review_key = review.get('text', '')[:100]  # Use first 100 chars as key
            if review_key not in seen_reviews:
                seen_reviews.add(review_key)
                unique_reviews.append(review)
        
        print(f"Total unique reviews found: {len(unique_reviews)}")
        return unique_reviews
    
    def extract_review_data(self, review_element):
        """Extract data from a single review element"""
        try:
            review_data = {}
            
            # Review title
            try:
                title_element = review_element.find_element(By.CSS_SELECTOR, "[data-hook='review-title']")
                review_data['title'] = title_element.text.strip()
            except NoSuchElementException:
                review_data['title'] = "No title"
            
            # Star rating
            try:
                rating_element = review_element.find_element(By.CSS_SELECTOR, "[data-hook='review-star-rating']")
                rating_text = rating_element.get_attribute("class")
                # Extract rating from class name (e.g., "a-star-5" -> "5")
                rating = rating_text.split()[-1].split('-')[-1] if 'star' in rating_text else "No rating"
                review_data['rating'] = rating
            except NoSuchElementException:
                review_data['rating'] = "No rating"
            
            # Review text
            try:
                text_element = review_element.find_element(By.CSS_SELECTOR, "[data-hook='review-body']")
                review_data['text'] = text_element.text.strip()
            except NoSuchElementException:
                review_data['text'] = "No review text"
            
            # Reviewer name
            try:
                name_element = review_element.find_element(By.CSS_SELECTOR, "[data-hook='genome-widget'] .a-profile-name")
                review_data['reviewer_name'] = name_element.text.strip()
            except NoSuchElementException:
                review_data['reviewer_name'] = "Anonymous"
            
            # Review date
            try:
                date_element = review_element.find_element(By.CSS_SELECTOR, "[data-hook='review-date']")
                review_data['date'] = date_element.text.strip()
            except NoSuchElementException:
                review_data['date'] = "No date"
            
            # Verified purchase
            try:
                verified_element = review_element.find_element(By.CSS_SELECTOR, "[data-hook='avp-badge']")
                review_data['verified_purchase'] = "Yes" if verified_element else "No"
            except NoSuchElementException:
                review_data['verified_purchase'] = "No"
            
            # Helpful votes
            try:
                helpful_element = review_element.find_element(By.CSS_SELECTOR, "[data-hook='helpful-vote-statement']")
                review_data['helpful_votes'] = helpful_element.text.strip()
            except NoSuchElementException:
                review_data['helpful_votes'] = "0"
            
            return review_data
            
        except Exception as e:
            print(f"Error extracting review data: {str(e)}")
            return None
    
    def save_to_csv(self, reviews_data, filename="amazon_reviews.csv"):
        """Save reviews data to CSV file"""
        if not reviews_data:
            print("No reviews data to save")
            return
            
        fieldnames = ['title', 'rating', 'text', 'reviewer_name', 'date', 'verified_purchase', 'helpful_votes']
        
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(reviews_data)
        
        print(f"Reviews saved to {filename}")
    
    def save_to_json(self, reviews_data, filename="amazon_reviews.json"):
        """Save reviews data to JSON file"""
        if not reviews_data:
            print("No reviews data to save")
            return
            
        with open(filename, 'w', encoding='utf-8') as jsonfile:
            json.dump(reviews_data, jsonfile, indent=2, ensure_ascii=False)
        
        print(f"Reviews saved to {filename}")
    
    def close(self):
        """Close the browser"""
        self.driver.quit()

# Usage example
if __name__ == "__main__":
    # Initialize scraper
    scraper = AmazonReviewsScraper()
    
    # Amazon product URL
    product_url = "https://www.amazon.com/Amazon-Basics-3-Button-Scrolling-Tracking/dp/B005EJH6RW"
    
    try:
        # Scrape ALL reviews (no page limit)
        print("Starting to scrape ALL reviews...")
        reviews = scraper.scrape_reviews(product_url, max_pages=None)  # Set to None for unlimited
        
        print(f"Scraped {len(reviews)} total reviews")
        
        # Save to CSV
        scraper.save_to_csv(reviews, "amazon_reviews_complete.csv")
        
        # Save to JSON
        scraper.save_to_json(reviews, "amazon_reviews_complete.json")
        
        # Print first few reviews as example
        for i, review in enumerate(reviews[:3]):
            print(f"\n--- Review {i+1} ---")
            print(f"Title: {review['title']}")
            print(f"Rating: {review['rating']}")
            print(f"Text: {review['text'][:200]}...")
            print(f"Reviewer: {review['reviewer_name']}")
            print(f"Date: {review['date']}")
            
    except Exception as e:
        print(f"Error: {str(e)}")
    finally:
        # Close browser
        scraper.close()

Starting to scrape ALL reviews...
Attempting to scrape reviews from product page...
Timeout waiting for #customerReviews, retrying... (attempt 1)
Timeout waiting for #customerReviews, retrying... (attempt 2)
Final timeout waiting for #customerReviews
Attempting to navigate to reviews page...
Successfully clicked 'See all reviews' link
Scraping reviews page 1...
Found 10 reviews on page 1
Successfully extracted 10 reviews from page 1
Scraping reviews page 2...
Found 10 reviews on page 2
Successfully extracted 10 reviews from page 2
Scraping reviews page 3...
Found 10 reviews on page 3
Successfully extracted 10 reviews from page 3
Scraping reviews page 4...
Found 10 reviews on page 4
Successfully extracted 10 reviews from page 4
Scraping reviews page 5...
Found 10 reviews on page 5
Successfully extracted 10 reviews from page 5
Scraping reviews page 6...
Found 10 reviews on page 6
Successfully extracted 10 reviews from page 6
Scraping reviews page 7...
Found 10 reviews on page 7
Successfu