In [1]:
import requests
from bs4 import BeautifulSoup
import json
import time

# Search keywords to scrape (hard-coded here; one results listing is fetched per keyword)
products = [
    "headphones",
    "smartphones",
    "laptops",
    "cameras",
    "watches",
    "tablets",
    "monitors",
    "printers",
    "speakers",
    "drones",
]

# Request headers sent with every page fetch so the request looks like it comes from a desktop browser
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/96.0.4664.110 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}

# Search URL template; {product} and {page} are filled in via str.format
base_url = "https://www.amazon.com/s?k={product}&page={page}"

# Function to scrape data from a single page
def scrape_page(product, page):
    """Fetch one search-results page and return the parsed product entries.

    Args:
        product: Search keyword substituted into ``base_url``.
        page: Page number substituted into ``base_url``.

    Returns:
        A list of dicts with the keys "Title", "Total Reviews", "Price" and
        "Image URL". The list is empty when the request raises a network
        error or the response status is not 200. Items that cannot be parsed
        are reported on stdout and skipped.
    """
    url = base_url.format(product=product, page=page)
    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as e:
        # Treat a network failure like a bad status: report it and move on,
        # so one failed page does not abort the run and lose collected data.
        print(f"Failed to fetch {url}: {e}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch {url}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    items = soup.find_all("div", {"data-component-type": "s-search-result"})

    results = []
    for item in items:
        try:
            # Raises AttributeError when the result has no <h2>; handled below.
            title = item.h2.text.strip()

            # NOTE(review): this takes the first "a-size-base" span in the
            # item; confirm it is really the review count on the live markup.
            reviews = item.find("span", {"class": "a-size-base"})
            total_reviews = reviews.text.strip() if reviews is not None else "No reviews"

            price_whole = item.find("span", {"class": "a-price-whole"})
            price_fraction = item.find("span", {"class": "a-price-fraction"})
            if price_whole is not None and price_fraction is not None:
                price = f"{price_whole.text.strip()}.{price_fraction.text.strip()}"
            else:
                price = "Price not available"

            # Look the image tag up once instead of searching the item twice.
            image_tag = item.find("img", {"class": "s-image"})
            image = image_tag["src"] if image_tag is not None else "Image not available"
        except (AttributeError, KeyError, TypeError) as e:
            # Missing <h2> or an <img> without "src": skip just this item.
            print(f"Error parsing item: {e}")
            continue

        results.append({
            "Title": title,
            "Total Reviews": total_reviews,
            "Price": price,
            "Image URL": image,
        })

    return results

# Main scraping loop: collect every result page for every keyword,
# keyed by keyword in all_data.
all_data = {}

for product in products:
    print(f"Scraping product: {product}")
    product_data = []

    # Result pages 1 through 20 for this keyword.
    for page in range(1, 21):
        print(f"Scraping page {page} for {product}")
        page_data = scrape_page(product, page)
        product_data += page_data
        # Pause between requests to be polite and avoid being blocked.
        time.sleep(2)

    all_data[product] = product_data

# Write everything out once, after all keywords have been scraped.
with open("amazon_products.json", "w") as f:
    f.write(json.dumps(all_data, indent=4))

print("Scraping completed. Data saved to 'amazon_products.json'")


Scraping product: headphones
Scraping page 1 for headphones
Scraping page 2 for headphones
Scraping page 3 for headphones
Scraping page 4 for headphones
Scraping page 5 for headphones
Scraping page 6 for headphones
Scraping page 7 for headphones
Scraping page 8 for headphones
Scraping page 9 for headphones
Scraping page 10 for headphones
Scraping page 11 for headphones
Scraping page 12 for headphones
Scraping page 13 for headphones
Scraping page 14 for headphones
Scraping page 15 for headphones
Scraping page 16 for headphones
Scraping page 17 for headphones
Scraping page 18 for headphones
Scraping page 19 for headphones
Scraping page 20 for headphones
Scraping product: smartphones
Scraping page 1 for smartphones
Scraping page 2 for smartphones
Scraping page 3 for smartphones
Scraping page 4 for smartphones
Scraping page 5 for smartphones
Scraping page 6 for smartphones
Scraping page 7 for smartphones
Scraping page 8 for smartphones
Scraping page 9 for smartphones
Scraping page 10 for s