In [3]:
import json
import os
from bs4 import BeautifulSoup

_SITE_ROOT = 'https://www.benjaminbarker.co'


def _parse_price(price_element):
    """Return the price in a price tag as a float, or raise ValueError."""
    try:
        # The tag may carry trailing text (e.g. "Unit price" information);
        # the amount is the first whitespace-separated token.
        price_text = price_element.get_text(separator=' ', strip=True)
        price_value = price_text.split()[0]
        return float(price_value.replace('$', '').replace(',', ''))
    except (IndexError, ValueError) as e:
        raise ValueError(f"Invalid price format: {price_element.text.strip()}") from e


def _select_image_source(img):
    """Return the best raw image URL on an <img> tag, or raise ValueError."""
    src = (img.get('src') or '').strip()
    # An inline "data:" URI is a lazy-loading placeholder, not the product image.
    if src and not src.startswith('data:'):
        return src

    candidates = (img.get('data-srcset') or '').split()
    if candidates:
        # First srcset entry; drop a trailing comma left by "url1, url2" lists.
        first = candidates[0].rstrip(',')
        if first:
            return first

    raise ValueError("No valid image source found")


def _is_size_token(token):
    """Return True if token looks like a '{width}x{height}' size suffix."""
    dims = token.split('@')[0]  # ignore a pixel-density marker such as "@2x"
    width, separator, height = dims.partition('x')
    if not separator or not (width or height):
        return False
    return (not width or width.isdigit()) and (not height or height.isdigit())


def _build_image_url(raw_url):
    """Return an absolute, query-free, highest-resolution image URL."""
    # Strip the query string first so that '.' or '_' inside it cannot be
    # mistaken for the file extension or the size suffix.
    url = raw_url.split('?')[0]
    if not url:
        raise ValueError("No valid image source found")

    # Only the file name is inspected, and only a genuine size suffix
    # (e.g. "_300x", "_600x600") is replaced. A name without one is left
    # untouched rather than having its last "_segment" cut off.
    directory, slash, filename = url.rpartition('/')
    stem, dot, extension = filename.rpartition('.')
    name, underscore, size = stem.rpartition('_')
    if dot and underscore and name and _is_size_token(size):
        url = f"{directory}{slash}{name}_9999x9999.{extension}"

    if url.startswith('//'):
        return f'https:{url}'
    if url.startswith(('http://', 'https://')):
        return url
    if url.startswith('/'):
        # Root-relative path: resolve against the shop's own host.
        return f'{_SITE_ROOT}{url}'
    return f'https://{url}'


def _build_product_link(href):
    """Return an absolute product URL for the href found on a product card."""
    href = href.strip()
    if href.startswith(('http://', 'https://')):
        return href
    if href.startswith('//'):
        return f'https:{href}'
    if not href.startswith('/'):
        href = f'/{href}'
    return f'{_SITE_ROOT}{href}'


def scrape_benjamin_barker(html_content, category):
    """Extract product records from one saved Benjamin Barker listing page.

    Args:
        html_content: HTML text of the listing page.
        category: Value stored under the 'category' key of every product.

    Returns:
        A tuple ``(products, card_count, errors)``:
        * products   - list of dicts with the keys name, price, image_url,
                       product_link, gender, retailer and category;
        * card_count - number of product cards found on the page;
        * errors     - one dict per skipped card with the keys index
                       (0-based card position), name and error.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    products = []
    errors = []

    # A CSS selector matches cards carrying both classes regardless of class
    # order or additional classes; find_all(class_='a b') only matches the
    # exact attribute string.
    product_cards = soup.select('div.product-item.product-item--portrait')

    for index, card in enumerate(product_cards):
        # Reset per card so an early failure never reports the previous
        # card's name.
        product_name = "Unknown"
        try:
            # 1. Name (captured first so later errors can mention it)
            name_element = card.select_one('.product-item__details a')
            if name_element is None:
                product_name = "Unknown Product"
                raise ValueError("Missing name element")
            product_name = name_element.text.strip()

            # 2. Price: the sale price wins over the regular price
            price_element = (
                card.select_one('.price--on-sale .price-item--sale')
                or card.select_one('.price__regular .price-item--regular')
            )
            if price_element is None:
                raise ValueError("Missing price element")
            price = _parse_price(price_element)

            # 3. Image
            img_container = card.select_one('.product-item__image img')
            if img_container is None:
                raise ValueError("Missing image container")
            image_url = _build_image_url(_select_image_source(img_container))

            # 4. Product link
            link_element = card.find('a', class_='product-item__image-wrapper')
            if link_element is None or not link_element.get('href'):
                raise ValueError("Missing product link")
            product_link = _build_product_link(link_element['href'])

            # 5. Gender is hard-coded (all products appear to be male)
            products.append({
                'name': product_name,
                'price': price,
                'image_url': image_url,
                'product_link': product_link,
                'gender': 'M',
                'retailer': 'Benjamin Barker',
                'category': category,
            })

        except Exception as e:
            # Per-card boundary: record the failure and keep scraping the
            # remaining cards instead of aborting the whole page.
            errors.append({
                'index': index,
                'name': product_name,
                'error': str(e)
            })

    return products, len(product_cards), errors

def save_to_json(products, filename='products.json'):
    """Write ``products`` to ``filename`` as UTF-8 JSON with 2-space indent.

    Non-ASCII characters are written as-is rather than as ``\\uXXXX`` escapes.
    An existing file is overwritten.
    """
    with open(filename, 'w', encoding='utf-8') as handle:
        serialized = json.dumps(products, ensure_ascii=False, indent=2)
        handle.write(serialized)

def determine_category(file_number):
    """Map a saved-page number to its product category.

    Pages 1 through 4 are 'Tops', page 5 is 'Bottoms', page 6 is 'Shoes';
    any other number yields 'Unknown'.
    """
    # Range check first, then a lookup table for the single-page categories.
    if 1 <= file_number <= 4:
        return 'Tops'
    single_page_categories = {5: 'Bottoms', 6: 'Shoes'}
    return single_page_categories.get(file_number, 'Unknown')

if __name__ == '__main__':
    # Running totals across every saved listing page.
    all_products, total_processed, total_errors = [], 0, 0

    # Saved pages are numbered 1-6; a page missing from the working
    # directory is skipped without comment.
    for file_number in range(1, 7):
        filename = f'www.benjaminbarker.co_collections_all-products ({file_number}).html'
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as page:
                html_content = page.read()

            # The page number decides the category applied to every product.
            products, total_count, errors = scrape_benjamin_barker(
                html_content, determine_category(file_number)
            )
            all_products += products
            total_processed += len(products)
            total_errors += len(errors)

            # Per-file report
            print(f"\nFile {filename} Processing Summary:")
            print(f"Total products found: {total_count}")
            print(f"Successfully processed: {len(products)}")
            print(f"Skipped products: {total_count - len(products)}")
            if errors:
                print("Errors in this file:")
                for error in errors:
                    # Card indices are 0-based; show them 1-based.
                    print(f"Product #{error['index']+1} ({error['name']}): {error['error']}")

    # Persist everything that parsed cleanly (default target: products.json).
    save_to_json(all_products)

    print("\nFinal Processing Summary:")
    print(f"Total products processed across all files: {total_processed}")
    print(f"Total errors across all files: {total_errors}")
    print("\nSaved all valid products to products.json")


File www.benjaminbarker.co_collections_all-products (1).html Processing Summary:
Total products found: 24
Successfully processed: 24
Skipped products: 0

File www.benjaminbarker.co_collections_all-products (2).html Processing Summary:
Total products found: 24
Successfully processed: 24
Skipped products: 0

File www.benjaminbarker.co_collections_all-products (3).html Processing Summary:
Total products found: 24
Successfully processed: 24
Skipped products: 0

File www.benjaminbarker.co_collections_all-products (4).html Processing Summary:
Total products found: 21
Successfully processed: 21
Skipped products: 0

File www.benjaminbarker.co_collections_all-products (5).html Processing Summary:
Total products found: 4
Successfully processed: 4
Skipped products: 0

File www.benjaminbarker.co_collections_all-products (6).html Processing Summary:
Total products found: 1
Successfully processed: 1
Skipped products: 0

Final Processing Summary:
Total products processed across all files: 98
Total e