# **Checkpoint 3: Web Scraping for Reviews**

In [67]:
import re
import requests
import csv
from bs4 import BeautifulSoup

In [68]:
def is_amazon_url(url):
    amazon_pattern = r"(https?://)?(www\.)?amazon\.(com|in|co\.uk|de|ca|fr|co\.jp|it|es|nl|com\.mx|com\.au|com\.br|ae|sg|sa)/.*"
    return bool(re.match(amazon_pattern, url))

In [69]:
def scrape_reviews_by_rating(product_url, max_reviews=50):
    if not is_amazon_url(product_url):
        raise ValueError("The provided URL is not an Amazon product URL.")

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    star_ratings = ['one_star', 'two_star', 'three_star', 'four_star', 'five_star']
    reviews = []

    for star in star_ratings:
        page = 1
        while len(reviews) < max_reviews:
            filtered_url = f"{product_url}?filterByStar={star}&pageNumber={page}"
            response = requests.get(filtered_url, headers=headers)

            if response.status_code != 200:
                print(f"Failed to fetch page {page} for {star}. Status Code: {response.status_code}")
                break

            soup = BeautifulSoup(response.text, 'html.parser')
            review_elements = soup.select('.review')

            if not review_elements:
                print(f"No more reviews found for {star}.")
                break

            for review in review_elements:
                content = review.select_one('.review-text').text.strip() if review.select_one('.review-text') else None
                rating = review.select_one('.review-rating').text.strip() if review.select_one('.review-rating') else None

                if content and rating:
                    reviews.append({
                        "content": content,
                        "rating": rating,
                    })

                if len(reviews) >= max_reviews:
                    break

            page += 1  # Move to the next page

    # Save to CSV
    with open('All_Reviews.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=["content", "rating"])
        writer.writeheader()
        writer.writerows(reviews)

    print(f"Scraped {len(reviews)} reviews across all ratings and saved to 'All_Reviews.csv'.")

# Example usage
try:
    product_url = input("Enter the Amazon product reviews URL: ")  # Replace with the actual product review URL
    scrape_reviews_by_rating(product_url, max_reviews=100)  # Change max_reviews as needed
except ValueError as e:
    print(e)


Enter the Amazon product reviews URL: https://www.amazon.com/TOZO-S6-Ultra-Clear-Bluetooth-Waterproof/dp/B0DBZ3WSLL/?_encoding=UTF8&pd_rd_w=XNUgw&content-id=amzn1.sym.0f9e35ef-aec2-4925-899e-45c0e396b456&pf_rd_p=0f9e35ef-aec2-4925-899e-45c0e396b456&pf_rd_r=N0XMQ922NN6K11YBT9MD&pd_rd_wg=5z13e&pd_rd_r=a6d7fc0c-dc64-4d55-abd0-a3442711cd1f&ref_=pd_hp_d_btf_gcx_gw_per_1
No more reviews found for one_star.
No more reviews found for two_star.
No more reviews found for three_star.
No more reviews found for four_star.
No more reviews found for five_star.
Scraped 77 reviews across all ratings and saved to 'All_Reviews.csv'.


In [70]:
import pandas as pd
d2=pd.read_csv('All_Reviews.csv')
d2

Unnamed: 0,content,rating
0,This watch is an exceptional value with severa...,5.0 out of 5 stars
1,The Tozo S6 is an impressive smartwatch that b...,5.0 out of 5 stars
2,This watch has a lot going for it. it nails th...,5.0 out of 5 stars
3,I was impressed with the quality level of this...,5.0 out of 5 stars
4,"I wear it all the time, good for day and night...",4.0 out of 5 stars
...,...,...
72,Ich habe sofort eine Haut Irritation vom Armba...,1.0 out of 5 stars
73,Una de las cosas que me gusta de este reloj es...,5.0 out of 5 stars
74,It does exactly what it's designed for! You'd ...,5.0 out of 5 stars
75,"Precioso reloj y muy comodo, se ve muy bien y ...",5.0 out of 5 stars
