In [6]:
import json
import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup

# Base used to resolve the (usually site-relative) product hrefs.
_UNIQLO_BASE_URL = 'https://www.uniqlo.com'

# Matches a comma used as a thousands separator (followed by exactly three
# digits), so "1,299.00" can be parsed while "29,90" is still rejected.
_THOUSANDS_SEPARATOR = re.compile(r',(?=\d{3}(?!\d))')


def _parse_price(raw_text):
    """Return the lowest price contained in *raw_text* as a float.

    The 'S$' currency marker and thousands separators are removed first.
    A range such as 'S$29.90 - S$39.90' yields its minimum.

    Raises:
        ValueError: if any part of the text is not a valid number.
    """
    cleaned = _THOUSANDS_SEPARATOR.sub('', raw_text.replace('S$', '')).strip()
    try:
        # A single price is simply a one-element "range".
        return min(float(part.strip()) for part in cleaned.split('-'))
    except ValueError:
        # "from None": the original float() error adds nothing to the report.
        raise ValueError(f"Invalid price format: {raw_text}") from None


def _extract_product(card, product_name, category, default_gender):
    """Build the product dict for one product card.

    Raises:
        ValueError: if the price, image URL or product link is missing or
            the price text cannot be parsed.
    """
    product_data = {'name': product_name}

    # The original-price element takes precedence; the limited-price element
    # is only consulted when no original price is present.
    price_element = (
        card.find('span', class_='price-original-ER')
        or card.find('span', class_='price-limited-ER')
    )
    if price_element is None:
        raise ValueError("Missing both price-original-ER and price-limited-ER elements")
    product_data['price'] = _parse_price(price_element.text)

    img_tag = card.find('img', class_='thumb-img')
    src = img_tag.get('src') if img_tag is not None else None
    if src is None:
        raise ValueError("Missing image URL")
    # Drop the query string from the image URL.
    product_data['image_url'] = src.split('?')[0]

    link_element = card.find('a')
    href = link_element.get('href') if link_element is not None else None
    if href is None:
        raise ValueError("Missing product link")
    # urljoin keeps absolute hrefs intact and resolves relative ones against
    # the site root, instead of blindly prefixing the domain.
    product_data['product_link'] = urljoin(_UNIQLO_BASE_URL, href)

    gender_element = card.find('p', class_='product-tile-category-item-gender')
    is_unisex = gender_element is not None and 'UNISEX' in gender_element.text.upper()
    product_data['gender'] = 'U' if is_unisex else default_gender

    product_data.update({
        'retailer': 'UNIQLO',
        'category': category,
    })
    return product_data


def scrape_uniqlo(html_content, *, category='Bottoms', default_gender='M'):
    """Extract product records from a saved UNIQLO listing page.

    Args:
        html_content: HTML text of the listing page.
        category: value stored under 'category' for every product.
        default_gender: value stored under 'gender' when a card is not
            marked as unisex ('U' is used for unisex cards).

    Returns:
        A tuple ``(products, total_cards, errors)`` where ``products`` is a
        list of dicts with the keys 'name', 'price', 'image_url',
        'product_link', 'gender', 'retailer' and 'category';
        ``total_cards`` is the number of product cards found; and ``errors``
        is a list of dicts with the keys 'index' (0-based card position),
        'name' and 'error' describing every card that was skipped.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    products = []
    errors = []

    # NOTE(review): this looks like a build-generated CSS class name and will
    # presumably need updating when the site's markup changes - confirm.
    product_cards = soup.find_all('article', class_='_13lwRUB5FbvQg2uaUeYNJ9')

    for index, card in enumerate(product_cards):
        # Reset per card so an error record never shows a previous card's name.
        product_name = "Unknown Product"
        try:
            name_element = card.find('h3', class_='product-tile-product-description')
            if name_element is None:
                raise ValueError("Missing name element")
            product_name = name_element.text.strip()

            products.append(
                _extract_product(card, product_name, category, default_gender)
            )
        except Exception as e:
            # Deliberately broad: one malformed card is recorded and skipped
            # rather than aborting the whole scrape.
            errors.append({
                'index': index,
                'name': product_name,
                'error': str(e),
            })

    return products, len(product_cards), errors

def save_to_json(products, filename='products.json'):
    """Write *products* to *filename* as UTF-8 JSON.

    The output is indented by two spaces and non-ASCII characters are
    written as-is rather than escaped. An existing file is overwritten.
    """
    with open(filename, mode='w', encoding='utf-8') as out_file:
        serialized = json.dumps(products, ensure_ascii=False, indent=2)
        out_file.write(serialized)

if __name__ == '__main__':
    # Read the saved listing page, scrape it, and persist the valid products
    # to the default output file of save_to_json.
    with open('www.uniqlo.com_sg_en_men_bottoms.html', mode='r', encoding='utf-8') as f:
        html_content = f.read()

    products, total_count, errors = scrape_uniqlo(html_content)
    save_to_json(products)

    # A single print with a newline separator emits the four summary lines;
    # the trailing "\n" on the last one leaves a blank line after the summary.
    print(
        "\nProcessing Summary:",
        f"Total products found: {total_count}",
        f"Successfully processed: {len(products)}",
        f"Skipped products: {total_count - len(products)}\n",
        sep="\n",
    )

    # The report header is only shown when at least one card was skipped;
    # error indices are 0-based, so they are shifted by one for display.
    if errors:
        print("Detailed error report:")
    for error in errors:
        print(f"Product #{error['index']+1} ({error['name']}): {error['error']}")

    print("\nSaved valid products to products.json")


Processing Summary:
Total products found: 85
Successfully processed: 85
Skipped products: 0


Saved valid products to products.json
