In [1]:
import time
import json
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
import os

In [2]:
csv_file_path = 'Amazon Scraping-Sheet1.csv'
data = pd.read_csv(csv_file_path)

In [3]:
driver = webdriver.Chrome() 

In [4]:
# Function to scrape product details from a given URL
def scrape_product_details(url):
    product_details = {}
    try:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Check if the product title element exists
        title_element = soup.find('span', {'id': 'productTitle'})
        if title_element:
            product_details['URL'] = url
            product_details['Title'] = title_element.get_text().strip()
            
            # Check if the image element exists
            image_element = soup.find('img', {'id': 'landingImage'})
            if image_element:
                product_details['ImageURL'] = image_element['src']
                
            # Check if the price element exists
            price_element = soup.find('span', {'id': 'priceblock_ourprice'})
            if price_element:
                product_details['Price'] = price_element.get_text().strip()
            
            # Check if the details element exists
            details_element = soup.find('div', {'id': 'productDescription'})
            if details_element:
                product_details['Details'] = details_element.get_text().strip()

    except NoSuchElementException:
        print(f"404 Error: {url} not available")
        return None

    return product_details

In [5]:
scraped_data = []

# Loop through each row in the CSV and scrape product details
batch_size = 100
start_time = time.time()

for index, row in data.iterrows():
    if index > 0 and index % batch_size == 0:
        elapsed_time = time.time() - start_time
        print(f"Completed {index}/{len(data)} URLs in {elapsed_time:.2f} seconds")
        start_time = time.time()

    country = row['country']
    asin = row['Asin']
    url = f"https://www.amazon.{country}/dp/{asin}"
    product_data = scrape_product_details(url)
    if product_data:
        scraped_data.append(product_data)

# Close the web driver

driver.quit()

desktop_directory = os.path.expanduser("~/Desktop")
output_file_path = os.path.join(desktop_directory, 'scraped_data.json')

output_file = 'scraped_data.json'
with open(output_file, 'w') as json_file:
    json.dump(scraped_data, json_file, indent=4)

print(f"Scraping completed. Data saved to {output_file_path}")



Completed 100/1000 URLs in 277.74 seconds
Completed 200/1000 URLs in 279.12 seconds
Completed 300/1000 URLs in 273.94 seconds
Completed 400/1000 URLs in 128.79 seconds
Completed 500/1000 URLs in 128.12 seconds
Completed 600/1000 URLs in 224.38 seconds
Completed 700/1000 URLs in 256.13 seconds
Completed 800/1000 URLs in 91.71 seconds
Completed 900/1000 URLs in 103.86 seconds
Scraping completed. Data saved to C:\Users\Dheeraj/Desktop\scraped_data.json
