In [70]:
from dataclasses import dataclass, asdict
import json

@dataclass
class Troubleshooting:
    symptoms: list[str]
    replaces: list[str]
    products: list[str]

@dataclass
class ModelCrossReference:
    brand: str
    model_number: str
    description: str

@dataclass
class Dishwasher:
    name: str
    price: float
    partselect_number: str
    manufacturer_number: str
    manufactured_by: str
    image_url: str
    description: str
    part_video_url: str
    troubleshooting: Troubleshooting
    rating: float
    model_cross_reference: list[ModelCrossReference]

In [71]:
from selenium import webdriver
from selenium.webdriver.common.by import By


def scrape_data(url: str, output_file: str) -> None:
    product_data = Dishwasher(
        name="",
        price=0.0,
        partselect_number="",
        manufacturer_number="",
        manufactured_by="",
        image_url="",
        description="",
        part_video_url="",
        troubleshooting=Troubleshooting(
            symptoms=[],
            replaces=[],
            products=[]
        ),
        rating=0.0,
        model_cross_reference=[]
    )

    driver = webdriver.Chrome()
    driver.get(url)

    # title element
    # <h1 class="title-lg mt-1 mb-3" itemprop="name">Refrigerator Door Shelf Bin WPW10321304</h1>
    title_el = driver.find_element(By.CSS_SELECTOR, 'h1[itemprop="name"]')
    product_data.name = title_el.text
    print('title:', title_el.text)

    # price element
    # <span class="js-partPrice" data-core-charge="0.0000">44.95</span>
    price_el = driver.find_element(By.CSS_SELECTOR, 'span.js-partPrice')
    product_data.price = float(price_el.text)
    print('price:', price_el.text)

    # partselect number element
    # <span class="bold text-teal" itemprop="productID">PS11752778</span>
    partselect_number_el = driver.find_element(By.CSS_SELECTOR, 'span[ itemprop="productID"]')
    product_data.partselect_number = partselect_number_el.text
    print('partselect number:', partselect_number_el.text)

    # manufacturer part number element
    # <span class="bold text-teal" itemprop="mpn">WPW10321304</span>
    manufacturer_part_number_el = driver.find_element(By.CSS_SELECTOR, 'span[itemprop="mpn"]')
    product_data.manufacturer_number = manufacturer_part_number_el.text
    print('manufacturer part number:', manufacturer_part_number_el.text)

    # manufacturer element
    # <span class="bold text-teal" itemprop="brand" itemscope="" itemtype="http://schema.org/Brand"><span itemprop="name">Whirlpool</span></span>
    manufacturer_el = driver.find_element(By.CSS_SELECTOR, 'span[itemprop="brand"] > span[itemprop="name"]')
    product_data.manufactured_by = manufacturer_el.text
    print('manufacturer:', manufacturer_el.text)

    # description element
    # <div itemprop="description" class="mt-3">This refrigerator door bin is a genuine OEM replacement designed to fit many side-by-side refrigerator models. Compatible with brands like KitchenAid, Maytag, and Amana, it attaches to the interior door, providing storage for jars and bottles. Featuring a clear design with white trim, this bin is both durable and functional. Installation is tool-free—simply align and snap into place. Verify your refrigerator’s model number before ordering to ensure compatibility, as dimensions may vary slightly. Restore your refrigerator's storage capacity and organization with this high-quality replacement door bin.</div>
    description_el = driver.find_element(By.CSS_SELECTOR, 'div[itemprop="description"]')
    product_data.description = description_el.text
    print('description:', description_el.text)

    # common symptoms element
    # <ul class="list-disc">
    # <li class="mb-1">Door won’t open or close</li>
    # <li class="mb-1">Ice maker won’t dispense ice</li>
    # <li class="mb-1">Leaking</li>
    # </ul>
    common_symptoms = []
    uls = driver.find_elements(By.CSS_SELECTOR, 'ul.list-disc')
    common_symptoms_el = uls[0]
    for li in common_symptoms_el.find_elements(By.TAG_NAME, 'li'):
        common_symptoms.append(li.text)
    product_data.troubleshooting.symptoms = common_symptoms
    print('common symptoms:', common_symptoms)

    # replaces element
    # <div data-collapse-container="{&quot;targetClassToggle&quot;:&quot;d-none&quot;}">
    #     AP6019471,  2171046,  2171047,  2179574,  2179575,  2179607,  2179607K,  2198449,  2198449K,  2304235,  2304235K,  W10321302,  W10321303,  W10321304,  W10549739,  WPW10321304VP
    # </div>
    replaces_el = driver.find_element(By.CSS_SELECTOR, 'div[data-collapse-container*="targetClassToggle"]')
    replaces_text = replaces_el.text.strip()
    replaces_list = [part.strip() for part in replaces_text.split(',')]
    product_data.troubleshooting.replaces = replaces_list
    print('replaces:', replaces_list)

    # products element
    # <ul class="list-disc">
    #     <li class="mb-1">Refrigerator</li>
    # </ul>
    products = []
    uls = driver.find_elements(By.CSS_SELECTOR, 'ul.list-disc')
    products_el = uls[1]
    for li in products_el.find_elements(By.TAG_NAME, 'li'):
        products.append(li.text)
    product_data.troubleshooting.products = products
    print('products:', products)

    # rating element
    # <div class="pd__cust-review__header__rating__chart--border">4.9</div>
    rating_el = driver.find_element(By.CSS_SELECTOR, 'div.pd__cust-review__header__rating__chart--border')
    product_data.rating = float(rating_el.text)
    print('rating:', rating_el.text)

    # img url element
    # <img itemprop="image" src="https://partselectcom-gtcdcddbene3cpes.z01.azurefd.net/11752778-1-M-Whirlpool-WPW10321304-Refrigerator-Door-Shelf-Bin.jpg" alt="11752778-1-M-Whirlpool-WPW10321304-Refrigerator Door Shelf Bin" title="11752778-1-M-Whirlpool-WPW10321304-Refrigerator Door Shelf Bin" style="max-width: 600px; max-height: 480px;">
    img_url_el = driver.find_element(By.CSS_SELECTOR, 'img[itemprop="image"]')
    product_data.image_url = img_url_el.get_attribute('src')
    print('img url:', img_url_el.get_attribute('src'))

    # model cross reference element
    # <div class="pd__crossref__list js-dataContainer js-infiniteScroll">        
    #     <div class="row">
    #         <div class="col-6 col-md-3">Kenmore</div>
    #         <a class="col-6 col-md-3 col-lg-2" rel="nofollow" href="/Models/10640262010/">10640262010</a>
    #         <div class="col col-md-6 col-lg-7">
    #             Refrigerator
    #         </div>
    #     </div>
    #     <div class="row">
    #         <div class="col-6 col-md-3">Kenmore</div>
    #         <a class="col-6 col-md-3 col-lg-2" rel="nofollow" href="/Models/10640263010/">10640263010</a>
    #         <div class="col col-md-6 col-lg-7">
    #             Refrigerator
    #         </div>
    #     </div>
    #     <div class="row">
    #         <div class="col-6 col-md-3">Kenmore</div>
    #         <a class="col-6 col-md-3 col-lg-2" rel="nofollow" href="/Models/10640263011/">10640263011</a>
    #         <div class="col col-md-6 col-lg-7">
    #             Refrigerator
    #  - REFRIGERATOR        </div>
    #     </div>
    #     <div class="row">
    #         <div class="col-6 col-md-3">Kenmore</div>
    #         <a class="col-6 col-md-3 col-lg-2" rel="nofollow" href="/Models/10653642300/">10653642300</a>
    #         <div class="col col-md-6 col-lg-7">
    #             Refrigerator
    #         </div>
    #     </div>
    # <div class="js-loadNext d-flex justify-content-center">Load more...</div></div>
    model_cross_reference = []
    model_cross_reference_container_el = driver.find_element(By.CSS_SELECTOR, 'div.pd__crossref__list.js-dataContainer.js-infiniteScroll')
    for row_el in model_cross_reference_container_el.find_elements(By.CSS_SELECTOR, 'div.row'):
        brand_el = row_el.find_element(By.CSS_SELECTOR, 'div.col-6.col-md-3')
        model_el = row_el.find_element(By.CSS_SELECTOR, 'a.col-6.col-md-3.col-lg-2')
        description_el = row_el.find_element(By.CSS_SELECTOR, 'div.col.col-md-6.col-lg-7')
        model_cross_reference.append({
            'brand': brand_el.text,
            'model': model_el.text,
            'description': description_el.text.strip()
        })
    product_data.model_cross_reference = [
        ModelCrossReference(
            brand=item['brand'],
            model_number=item['model'],
            description=item['description']
        ) for item in model_cross_reference
    ]
    print('model cross reference:', model_cross_reference)

    print(product_data)

    driver.quit()

    seen = set()
    filtered_model_cross_reference = []
    for item in product_data.model_cross_reference:
        if item.model_number and item.model_number not in seen:
            seen.add(item.model_number)
            filtered_model_cross_reference.append(item)
    product_data.model_cross_reference = filtered_model_cross_reference

    json_data = asdict(product_data)

    # Save to file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(json_data, f, ensure_ascii=False, indent=2)


In [72]:
urls = [
    "https://www.partselect.com/PS11752778-Whirlpool-WPW10321304-Refrigerator-Door-Shelf-Bin.htm?SourceCode=3&SearchTerm=PS11752778",
    "https://www.partselect.com/PS3406971-Whirlpool-W10195416-Lower-Dishrack-Wheel.htm?SourceCode=3&SearchTerm=PS3406971",
    "https://www.partselect.com/PS10065979-Whirlpool-W10712395-Upper-Rack-Adjuster-Kit-White-Wheels-Left-and-Right-Sides.htm?SourceCode=3&SearchTerm=PS10065979",
    "https://www.partselect.com/PS11756150-Whirlpool-WPW10546503-Dishwasher-Upper-Rack-Adjuster.htm?SourceCode=3&SearchTerm=PS11756150",
    "https://www.partselect.com/PS11746591-Whirlpool-WP8565925-Dishwasher-Rack-Track-Stop.htm?SourceCode=3&SearchTerm=PS11746591",
    "https://www.partselect.com/PS11750057-Whirlpool-WPW10195417-Lower-Dishrack-Wheel-Assembly.htm?SourceCode=3&SearchTerm=PS11750057",
    "https://www.partselect.com/PS12585623-Frigidaire-5304517203-Lower-Spray-Arm.htm?SourceCode=3&SearchTerm=PS12585623",
    "https://www.partselect.com/PS17137081-GE-WD22X33499-LOWER-SPRAY-ARM.htm?SourceCode=3&SearchTerm=PS17137081",
    "https://www.partselect.com/PS11731570-Whirlpool-W10861000-Detergent-Dispenser.htm?SourceCode=3&SearchTerm=PS11731570",
    "https://www.partselect.com/PS11745488-Whirlpool-WP8268961-Dishwasher-Friction-Sleeve.htm?SourceCode=3&SearchTerm=PS11745488",
    "https://www.partselect.com/PS11755592-Whirlpool-WPW10491331-Dishwasher-Lower-Spray-Arm.htm?SourceCode=3&SearchTerm=PS11755592",
    "https://www.partselect.com/PS11750093-Whirlpool-WPW10195840-Dishwasher-Positioner.htm?SourceCode=3&SearchTerm=PS11750093",
    "https://www.partselect.com/PS18355438-Whirlpool-W11768590-GASKET.htm?SourceCode=3&SearchTerm=PS18355438",
    "https://www.partselect.com/PS18351367-GE-WD28X35779-UPPER-RACK.htm?SourceCode=18",
    "https://www.partselect.com/PS17873657-GE-WD28X34744-LOWER-RACK.htm?SourceCode=18"
]

# for output file counter, start at 1
for i, url in enumerate(urls, start=1):
    output_file = f'part_{i}.json'
    scrape_data(url, output_file)

title: Refrigerator Door Shelf Bin WPW10321304
price: 44.95
partselect number: PS11752778
manufacturer part number: WPW10321304
manufacturer: Whirlpool
description: This refrigerator door bin is a genuine OEM replacement designed to fit many side-by-side refrigerator models. Compatible with brands like KitchenAid, Maytag, and Amana, it attaches to the interior door, providing storage for jars and bottles. Featuring a clear design with white trim, this bin is both durable and functional. Installation is tool-free—simply align and snap into place. Verify your refrigerator’s model number before ordering to ensure compatibility, as dimensions may vary slightly. Restore your refrigerator's storage capacity and organization with this high-quality replacement door bin.
common symptoms: ['Door won’t open or close', 'Ice maker won’t dispense ice', 'Leaking']
replaces: ['AP6019471', '2171046', '2171047', '2179574', '2179575', '2179607', '2179607K', '2198449', '2198449K', '2304235', '2304235K', '

ValueError: could not convert string to float: ''