In [7]:
import re
import requests
import csv
from bs4 import BeautifulSoup
import pandas as pd

In [8]:
def is_amazon_url(url):
    """Checks if a URL is a valid Amazon product URL."""
    amazon_pattern = r"(https?://)?(www.)?amazon.(com|in|co\.uk|de|ca|fr|co\.jp|it|es|nl|com\.mx|com\.au|com\.br|ae|sg|sa)/.*"
    return bool(re.match(amazon_pattern, url))

In [9]:
def scrape_amazon_reviews(product_url, max_reviews=50):
    """
    Scrapes reviews from an Amazon product page, filtering by star rating.

    Args:
        product_url (str): The URL of the Amazon product page.
        max_reviews (int, optional): Maximum number of reviews to scrape. Defaults to 50.

    Returns:
        list: A list of dictionaries, each containing review content and rating.
            Returns an empty list if no reviews were scraped.
    """

    if not is_amazon_url(product_url):
        raise ValueError("Invalid Amazon product URL.")

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    all_reviews = []  # Store all reviews here
    review_count = 0
    star_ratings = ['one_star', 'two_star', 'three_star', 'four_star', 'five_star']
    
    for rating_filter in star_ratings:
        page_number = 1
        
        while review_count < max_reviews:
          
            filtered_url = f"{product_url}?filterByStar={rating_filter}&pageNumber={page_number}"

            try:
                response = requests.get(filtered_url, headers=headers)
                response.raise_for_status()  # Raise an exception for bad status codes
            except requests.exceptions.RequestException as e:
                print(f"Error fetching page {page_number} with filter {rating_filter}: {e}")
                break
            
            soup = BeautifulSoup(response.content, 'html.parser')
            review_elements = soup.select('.review')
            
            if not review_elements:
                print(f"No more reviews found for {rating_filter}.")
                break
            
            for review in review_elements:
                content_element = review.select_one('.review-text')
                rating_element = review.select_one('.review-rating')

                content = content_element.text.strip() if content_element else None
                rating = rating_element.text.strip() if rating_element else None
                
                if content and rating:
                    all_reviews.append({
                        "content": content,
                        "rating": rating,
                        "filter": rating_filter
                    })
                    review_count += 1

                    if review_count >= max_reviews:
                      break
            page_number += 1
        
    return all_reviews

In [10]:
def save_reviews_to_csv(reviews, filename='All_Reviews.csv'):
  """Saves a list of reviews to a CSV file."""
  if not reviews:
    print("No reviews to save.")
    return
  
  with open(filename, 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=["content", "rating","filter"])
    writer.writeheader()
    writer.writerows(reviews)
  print(f"Saved {len(reviews)} reviews to '{filename}'.")

In [11]:
def load_csv_to_dataframe(filename='All_Reviews.csv'):
  """Loads the csv file into a dataframe. Returns dataframe"""
  try:
    df = pd.read_csv(filename)
    return df
  except FileNotFoundError:
    print(f"File '{filename}' not found.")
    return None

In [12]:
if __name__ == '__main__':
    try:
        product_url = input("Enter the Amazon product reviews URL: ")  
        reviews = scrape_amazon_reviews(product_url, max_reviews=100)  
        save_reviews_to_csv(reviews)
        dataframe=load_csv_to_dataframe()
        if dataframe is not None:
          print("\nDataFrame Loaded:")
          print(dataframe)
    except ValueError as e:
        print(e)

Saved 100 reviews to 'All_Reviews.csv'.

DataFrame Loaded:
                                              content              rating  \
0   I recently had the pleasure of riding the Velo...  5.0 out of 5 stars   
1   Overall, I'm thoroughly impressed with the Lif...  4.0 out of 5 stars   
2   This is one of the best bicycle in this price ...  5.0 out of 5 stars   
3   I recently purchased a Lifelong cycle from Ama...  1.0 out of 5 stars   
4   I recently had the pleasure of riding the Velo...  5.0 out of 5 stars   
..                                                ...                 ...   
95  I recently purchased a Lifelong cycle from Ama...  1.0 out of 5 stars   
96  I recently had the pleasure of riding the Velo...  5.0 out of 5 stars   
97  Overall, I'm thoroughly impressed with the Lif...  4.0 out of 5 stars   
98  This is one of the best bicycle in this price ...  5.0 out of 5 stars   
99  I recently purchased a Lifelong cycle from Ama...  1.0 out of 5 stars   

      filter  
0