In [1]:
import os

from scrapfly import ScrapflyClient, ScrapeConfig, ScrapflyError

# Read the Scrapfly API key from the environment instead of embedding it in the
# notebook, so it is not leaked through version control or shared copies.
# A missing SCRAPFLY_KEY raises KeyError here, before any request is made.
# NOTE(review): the key that used to be hardcoded on this line should be treated
# as compromised and rotated.
scrapfly = ScrapflyClient(key=os.environ["SCRAPFLY_KEY"], max_concurrency=2)

# Product page to scrape; also stored later as the record's "buyUrl".
url = 'https://www.amazon.com/dp/B0D4RF55QK'

# Fetch the page with JavaScript rendering enabled and keep the returned HTML.
listing_result = scrapfly.scrape(
    ScrapeConfig(
        url=url,
        render_js=True,
        country="US",
        asp=True,
        retry=False,
        rendering_wait=10000,
    )
)
listing_html = listing_result.scrape_result['content']

In [14]:
import os
from bs4 import BeautifulSoup
import re
import json
# Directory that receives the HTML snapshot. It can be overridden with the
# AMAZON_LISTING_DIR environment variable; when unset, the original location is used.
input_dir = os.environ.get("AMAZON_LISTING_DIR", r"C:\Users\SudheerRChinthala\circana\Srikanth\Data")
input_file = os.path.join(input_dir, "AmazonListing.html")

# open(..., 'w') does not create missing parent directories, so create them first.
os.makedirs(input_dir, exist_ok=True)

# Persist the fetched page so it can be inspected or re-parsed without re-scraping.
with open(input_file, 'w', encoding='utf-8') as f:
    f.write(listing_html)

In [4]:
# Normalize scraped text by blanking out unwanted characters
def clean_text(text):
    """Return ``text`` with unwanted characters collapsed to spaces and the ends trimmed.

    Every run of newline, carriage return, U+200E, U+200F, U+00A0 (no-break
    space), U+2014 (em dash) or U+2026 (ellipsis) characters is replaced by a
    single space, after which leading and trailing whitespace is stripped.

    Args:
        text: The string to clean; may be ``None`` or empty.

    Returns:
        The cleaned string, or ``''`` when ``text`` is falsy.
    """
    if not text:
        return ''
    # Extend the character class below to drop further unwanted characters.
    spaced = re.sub(r'[\n\r\u200e\u200f\u00a0\u2014\u2026]+', ' ', text)
    return spaced.strip()

# Function to clean the key by removing a trailing colon and surrounding spaces
def clean_key(key):
    """Return ``key`` without a trailing colon and without surrounding whitespace.

    The trailing colon is removed whether or not whitespace precedes it, so
    both ``"Brand :"`` and ``"Brand:"`` become ``"Brand"``. Colons that are not
    at the end of the key are left alone.

    Args:
        key: The label string to normalize.

    Returns:
        The normalized label.
    """
    # `\s*` (not `\s+`) so a colon glued directly to the label is also dropped.
    return re.sub(r'\s*:\s*$', '', key).strip()

# Function to scrape images from the product page and return them as image_url1, image_url2, etc.
def scrape_product_images(page_soup):
    """Collect image URLs from the ``div#altImages`` section of the page.

    The ``src`` of every ``<img>`` inside the section is considered; images
    without a ``src`` and images whose ``src`` ends in ``.png`` (compared
    case-insensitively) are skipped.

    Args:
        page_soup: Parsed page exposing BeautifulSoup's ``find`` API.

    Returns:
        A dict mapping ``image_url1``, ``image_url2``, ... to the kept URLs in
        page order. The numbering counts only the kept images, so it always
        starts at 1 and has no gaps. Empty when the section is missing.
    """
    alt_images_section = page_soup.find('div', id='altImages')
    if not alt_images_section:
        return {}

    kept_sources = [
        src
        for src in (img_tag.get('src') for img_tag in alt_images_section.find_all('img'))
        if src and not src.lower().endswith('.png')
    ]
    # Number after filtering; numbering by raw <img> position left gaps for skipped images.
    return {f'image_url{number}': src for number, src in enumerate(kept_sources, start=1)}

def scrape_price_delivery_details(page_soup, product_details):
    """Add label/value pairs from the product-comparison table to ``product_details``.

    For each ``<tr>`` in the comparison section, the bold label span supplies
    the key and the ``<td>`` following the label's own ``<td>`` supplies the
    value. Rows without a label, without an enclosing ``<td>``, or without a
    non-empty value are skipped individually, so one malformed row no longer
    prevents the remaining rows from being read.

    Args:
        page_soup: Parsed page exposing BeautifulSoup's ``find`` API.
        product_details: Dict updated in place; existing keys are overwritten.

    Returns:
        None. Any unexpected error is reported through ``log_message``.
        NOTE(review): ``log_message`` is not defined in the visible cells -
        confirm it exists before this runs.
    """
    try:
        # NOTE(review): this class name looks build-generated and may change between page versions - confirm.
        comparison_section = page_soup.find('div', class_='_product-comparison-desktop_styles_psem-product-comparison__22yGa')
        if not comparison_section:
            return
        for row in comparison_section.find_all('tr'):
            key_cell = row.find('span', class_='a-size-base a-text-bold')
            if not key_cell:
                continue
            # The label is not guaranteed to sit inside a <td>; skip such rows
            # instead of raising and abandoning the rest of the table.
            parent_cell = key_cell.find_parent('td')
            if parent_cell is None:
                continue
            value_cell = parent_cell.find_next_sibling('td')
            key = clean_text(key_cell.text)
            value = clean_text(value_cell.text) if value_cell else None
            if key and value:
                product_details[clean_key(key)] = value
    except Exception as e:
        log_message(f"Error scraping price and delivery details: {e}")

def scrape_product_description(page_soup):
    """Return the product description text, falling back to the feature bullets.

    The first ``<p>`` inside ``div#productDescription`` is used when it yields
    non-empty cleaned text. Otherwise the ``span.a-list-item`` elements inside
    ``div#feature-bullets`` whose text is not blank are cleaned and joined
    with single spaces.

    Args:
        page_soup: Parsed page exposing BeautifulSoup's ``find`` API.

    Returns:
        The description string; ``None`` when no ``<p>`` was found in the
        description section and there is no bullets section; possibly ``''``
        when the text that was found is empty.
    """
    description = None

    section = page_soup.find('div', id='productDescription')
    paragraph = section.find('p') if section else None
    if paragraph:
        description = clean_text(paragraph.text)
    if description:
        return description

    # Nothing usable in the description block: try the bullet list instead.
    bullets_box = page_soup.find('div', id='feature-bullets')
    if not bullets_box:
        return description

    pieces = []
    for item in bullets_box.find_all('span', class_='a-list-item'):
        if item.text.strip():
            pieces.append(clean_text(item.text))
    return ' '.join(pieces)


def scrape_generic_product_details(page_soup, product_details):
    """Copy label/value rows of the ``a-normal a-spacing-micro`` table into ``product_details``.

    Each ``<tr>`` contributes one entry: the ``td.a-span3`` cell is the label
    and the ``td.a-span9`` cell is the value. Rows where either side is
    missing or empty after cleaning are ignored.

    Args:
        page_soup: Parsed page exposing BeautifulSoup's ``find`` API.
        product_details: Dict updated in place; existing keys are overwritten.

    Returns:
        None. Any exception is caught and reported through ``log_message``.
        NOTE(review): ``log_message`` is not defined in the visible cells -
        confirm it exists before this runs.
    """
    try:
        overview_table = page_soup.find('table', class_='a-normal a-spacing-micro')
        if not overview_table:
            return
        for tr in overview_table.find_all('tr'):
            label_td = tr.find('td', class_='a-span3')
            data_td = tr.find('td', class_='a-span9')
            label = clean_text(label_td.text) if label_td else None
            data = clean_text(data_td.text) if data_td else None
            if not (label and data):
                continue
            product_details[clean_key(label)] = data
    except Exception as e:
        log_message(f"Error scraping generic product details: {e}")

def scrape_product_information(page_soup, product_details):
    """Copy the key/value tables of ``div#productDetails_feature_div`` into ``product_details``.

    Every ``table.a-keyvalue`` inside the section is walked row by row. The
    ``<th>`` with the ``prodDetSectionEntry`` classes is the label and the
    ``<td>`` with the ``prodDetAttrValue`` classes is the value; a row is
    stored only when both are present and non-empty after cleaning.

    Args:
        page_soup: Parsed page exposing BeautifulSoup's ``find`` API.
        product_details: Dict updated in place; existing keys are overwritten.

    Returns:
        None. Any exception is caught and reported through ``log_message``.
        NOTE(review): ``log_message`` is not defined in the visible cells -
        confirm it exists before this runs.
    """
    try:
        info_block = page_soup.find('div', id='productDetails_feature_div')
        if not info_block:
            return
        for kv_table in info_block.find_all('table', class_='a-keyvalue'):
            for tr in kv_table.find_all('tr'):
                header = tr.find('th', class_='a-color-secondary a-size-base prodDetSectionEntry')
                cell = tr.find('td', class_='a-size-base prodDetAttrValue')
                label = clean_text(header.text) if header else None
                content = clean_text(cell.text) if cell else None
                if label and content:
                    product_details[clean_key(label)] = content
    except Exception as e:
        log_message(f"Error scraping product information: {e}")


def scrape_product_details(page_soup):
    """Build a dict of product attributes from several sections of the page.

    Sources are applied in this order, with later sources overwriting earlier
    keys of the same name:

    1. ``div#detailBullets_feature_div`` list items (bold span = key, next
       sibling span = value), plus "Best Sellers Rank", "Star Rating" and
       "Customer Reviews" - these three are only looked up when the detail
       bullets section exists.
    2. ``scrape_generic_product_details``
    3. ``scrape_price_delivery_details``
    4. ``scrape_product_information``
    5. ``scrape_product_description`` (stored as "Product Description").

    Args:
        page_soup: Parsed page exposing BeautifulSoup's ``find`` API.

    Returns:
        The collected dict. If an unexpected error occurs it is reported
        through ``log_message`` and whatever was collected so far is returned.
        NOTE(review): ``log_message`` is not defined in the visible cells -
        confirm it exists before this runs.
    """
    product_details = {}

    try:
        # Scrape from detailBullets_feature_div
        product_details_section = page_soup.find('div', id='detailBullets_feature_div')
        if product_details_section:
            for li in product_details_section.find_all('li'):
                key_span = li.find('span', class_='a-text-bold')
                if not key_span:
                    continue
                value_span = key_span.find_next_sibling('span')
                key = clean_text(key_span.text)
                value = clean_text(value_span.text) if value_span else None
                if key and value:
                    product_details[clean_key(key)] = value

            # Extract "Best Sellers Rank"
            # NOTE(review): with a tag name, `string=` is matched against the tag's
            # `.string`; an <li> with several children may never match - verify
            # against a real page.
            best_seller_rank = page_soup.find('li', string=re.compile(r'Best Sellers Rank'))
            # find_next() may return None; guard it so a missing span does not
            # raise and skip every scraper that runs after this section.
            rank_span = best_seller_rank.find_next('span') if best_seller_rank else None
            if rank_span is not None:
                rank_value = rank_span.text.strip()
                sub_rank = best_seller_rank.find('ul')
                if sub_rank:
                    rank_value += f" ({sub_rank.text.strip()})"
                product_details["Best Sellers Rank"] = clean_text(rank_value)

            # Extract customer reviews and ratings
            review_section = page_soup.find('div', id='detailBullets_averageCustomerReviews')
            if review_section:
                star_rating = review_section.find('span', class_='a-icon-alt')
                if star_rating:
                    product_details["Star Rating"] = clean_text(star_rating.text)
                review_count = review_section.find('span', id='acrCustomerReviewText')
                if review_count:
                    product_details["Customer Reviews"] = clean_text(review_count.text)

        # Additional scraping for the product table (Brand, Item Volume, etc.)
        scrape_generic_product_details(page_soup, product_details)

        # Rows of the product-comparison table
        scrape_price_delivery_details(page_soup, product_details)

        # Scrape the additional information in productDetails_feature_div (if present)
        scrape_product_information(page_soup, product_details)

        # Scrape the product description
        product_description = scrape_product_description(page_soup)
        if product_description:
            product_details["Product Description"] = product_description
    except Exception as e:
        log_message(f"Error scraping product details: {e}")

    return product_details


# Turn a description into a short, filename-friendly token
def clean_description_for_filename(description):
    """Return a filename-friendly, length-limited version of ``description``.

    Characters other than word characters, whitespace and ``-`` are removed,
    every space character is replaced by ``_`` (other whitespace is kept as
    is), and the result is cut to its first 50 characters.

    Args:
        description: The text to convert.

    Returns:
        The converted string, at most 50 characters long.
    """
    max_length = 50
    filtered = re.sub(r'[^\w\s-]', '', description)
    underscored = '_'.join(filtered.split(' '))
    return underscored[:max_length]


In [12]:
page_soup = BeautifulSoup(listing_html, 'html.parser')

# Look the title element up explicitly rather than wrapping the access in a bare
# `except:`, which would also swallow KeyboardInterrupt and hide real errors.
title_tag = page_soup.find('h1', class_='a-size-large a-spacing-none')
product_name = clean_text(title_tag.text) if title_tag is not None else None

product_details = scrape_product_details(page_soup)
image_urls = scrape_product_images(page_soup)

# Dict insertion order decides the JSON key order: product_name and buyUrl come
# first, then the image URLs, then the scraped details. A scraped detail whose
# key matches an earlier one overwrites its value.
keyval_entry = {
    "product_name": product_name,
    "buyUrl": url,
}
keyval_entry.update(image_urls)
keyval_entry.update(product_details)

# Construct the final JSON structure
product_data = {
    "itemid": None,
    "site": "amazon.com",
    "retailer_name": "amazon",
    "country_id": "us",
    "data_source": "cpg",
    "keyval": [keyval_entry]
}


In [15]:
# Serialize the assembled record to JSON text with a 4-space indent.
final_output = json.dumps(product_data, indent=4)

In [16]:
# Show the serialized JSON text.
print(final_output)

{
    "itemid": null,
    "site": "amazon.com",
    "retailer_name": "amazon",
    "country_id": "us",
    "data_source": "cpg",
    "keyval": [
        {
            "product_name": "Razer DeathAdder V3 HyperSpeed Wireless Gaming Mouse: 55g Lightweight - USB C Charging - Up to 100 Hr Battery - Advanced 26K Optical Sensor - Gen-3 Optical Switches - 8 Programmable Controls - Black",
            "buyUrl": "https://www.amazon.com/dp/B0D4RF55QK",
            "image_url1": "https://m.media-amazon.com/images/I/31D4dnvCXtL._AC_US40_.jpg",
            "image_url2": "https://m.media-amazon.com/images/I/41MOMEBr7CL._AC_US40_.jpg",
            "image_url3": "https://m.media-amazon.com/images/I/413p9BS76OL._AC_US40_.jpg",
            "image_url4": "https://m.media-amazon.com/images/I/413YP1e8o3L._AC_US40_.jpg",
            "image_url5": "https://m.media-amazon.com/images/I/414pmFuLOcL._AC_US40_.jpg",
            "image_url6": "https://m.media-amazon.com/images/I/41CMFYd1lmL._AC_US40_.jpg",
       