# Chapter 11 Lab

### Question 1: Select your products

Load the product spreadsheet from Chapter 10. Select 10 products from at least 3 different brands. Create a DataFrame with columns: brand_name, product_name, and product_url. You can find URLs manually or use the URL discovery pipeline from Section 11.4.

In [1]:
import pandas as pd

products = [
    {"brand_name": "GORUCK", "product_name": "GR1 26L", "product_url": "https://www.goruck.com/products/gr1"},
    {"brand_name": "GORUCK", "product_name": "Rucker 4.0 20L", "product_url": "https://www.goruck.com/products/rucker"},
    {"brand_name": "GORUCK", "product_name": "Bullet Ruck 15L", "product_url": "https://www.goruck.com/products/bullet-ruck-15l"},
    {"brand_name": "Osprey", "product_name": "Atmos AG 65", "product_url": "https://www.osprey.com/us/en/product/atmos-ag-65-ATMOS65S23.html"},
    {"brand_name": "Osprey", "product_name": "Exos 58", "product_url": "https://www.osprey.com/us/en/product/exos-58-EXOS58F23.html"},
    {"brand_name": "Petzl", "product_name": "Actik Core", "product_url": "https://www.petzl.com/US/en/Sport/PERFORMANCE-headlamps/ACTIK-CORE"},
    {"brand_name": "Petzl", "product_name": "Tikka Core", "product_url": "https://www.petzl.com/US/en/Sport/TIKKA-CORE"},
    {"brand_name": "MSR", "product_name": "Hubba Hubba NX 2", "product_url": "https://www.msrgear.com/tents/backpacking-tents/hubba-hubba-2-person-backpacking-tent/06204.html"},
    {"brand_name": "Big Agnes", "product_name": "Copper Spur HV UL2", "product_url": "https://www.bigagnes.com/products/copper-spur-hv-ul2"},
    {"brand_name": "Sawyer", "product_name": "Squeeze", "product_url": "https://www.sawyer.com/products/squeeze-water-filtration-system"},
]

df = pd.DataFrame(products)
print(f"Selected {len(df)} products from {df['brand_name'].nunique()} brands")
df

Selected 10 products from 6 brands

### Question 2: Fetch and clean HTML

For each product, fetch the HTML using requests and clean it using the aggressive cleaning approach from Section 11.5. Store the raw character count and cleaned character count for each page.

**Note:** For this demo, we fetch 3 products to demonstrate the pattern.

In [2]:
import time
import requests
from bs4 import BeautifulSoup

# Import functions from guide notebook execution
REMOVE_TAGS = ["script", "style", "nav", "footer", "header", "iframe", "noscript", "svg", "form"]
REMOVE_CLASSES = ["breadcrumb", "related-products", "recently-viewed", "newsletter", "cookie-banner", "site-footer", "site-header", "cart-drawer", "search-modal", "review", "reviews", "ratings"]

def clean_html_aggressive(html: str) -> str:
    soup = BeautifulSoup(html, "html.parser")
    for tag_name in REMOVE_TAGS:
        for element in soup.find_all(tag_name):
            element.decompose()
    for class_pattern in REMOVE_CLASSES:
        for element in soup.find_all(class_=lambda c: c and class_pattern in " ".join(c).lower()):
            element.decompose()
    for element in soup.find_all():
        if not element.get_text(strip=True) and not element.find("img"):
            element.decompose()
    return " ".join(soup.stripped_strings)

html_data = []
for _, row in df.head(3).iterrows():  # Demo with 3 products
    try:
        raw_html = requests.get(row["product_url"]).text
        cleaned = clean_html_aggressive(raw_html)
        html_data.append({
            "product_name": row["product_name"],
            "raw_chars": len(raw_html),
            "cleaned_chars": len(cleaned),
            "cleaned_text": cleaned,
            "status": "fetched",
        })
    except Exception as e:
        html_data.append({
            "product_name": row["product_name"],
            "raw_chars": 0,
            "cleaned_chars": 0,
            "cleaned_text": "",
            "status": f"error: {type(e).__name__}",
        })
    time.sleep(1)

html_df = pd.DataFrame(html_data)
print(html_df[["product_name", "raw_chars", "cleaned_chars", "status"]])

      product_name  raw_chars  cleaned_chars   status0          GR1 26L    2032630          20419  fetched1   Rucker 4.0 20L    2329987          27381  fetched2  Bullet Ruck 15L    1622705           9146  fetched

### Question 3: Run AI extraction

Apply extract_product_with_ai to each cleaned page. Store the results in a DataFrame alongside the original product information and a status column tracking success or failure.

In [3]:
import openai
from pydantic import BaseModel, Field
from typing import Optional

# Define schema
class ProductExtraction(BaseModel):
    product_name: str = Field(description="Full product name")
    brand_name: str = Field(description="Manufacturer or brand")
    description: Optional[str] = Field(default=None, description="Product description")
    price: Optional[str] = Field(default=None, description="Current retail price")
    weight: Optional[str] = Field(default=None, description="Product weight with unit")
    primary_image_url: Optional[str] = Field(default=None, description="Main image URL")
    category: Optional[str] = Field(default=None, description="Product category")

EXTRACTION_PROMPT = """Extract product fields from web page text. Only use information explicitly present. Use null for missing fields."""

def extract_product_with_ai(cleaned_text: str, model: str = "gpt-4o") -> ProductExtraction:
    response = openai.beta.chat.completions.parse(
        model=model,
        messages=[
            {"role": "system", "content": EXTRACTION_PROMPT},
            {"role": "user", "content": cleaned_text[:3000]},
        ],
        response_format=ProductExtraction,
    )
    return response.choices[0].message.parsed

results = []
for i, row in df.head(3).iterrows():  # Demo with 3 products
    record = row.to_dict()
    record["status"] = "error"
    
    html_row = html_data[i]
    if html_row["status"] != "fetched":
        record["status"] = html_row["status"]
        results.append(record)
        continue
    
    try:
        extraction = extract_product_with_ai(html_row["cleaned_text"])
        record["extracted_name"] = extraction.product_name
        record["extracted_brand"] = extraction.brand_name
        record["extracted_price"] = extraction.price
        record["extracted_weight"] = extraction.weight
        record["extracted_category"] = extraction.category
        record["status"] = "success"
    except Exception as e:
        record["status"] = f"error: {type(e).__name__}"
    
    results.append(record)
    time.sleep(2)

results_df = pd.DataFrame(results)
print(results_df[["product_name", "status", "extracted_price", "extracted_weight", "extracted_category"]])

      product_name   status extracted_price extracted_weight extracted_category0          GR1 26L  success             NaN             None                NaN1   Rucker 4.0 20L  success             NaN             None                NaN2  Bullet Ruck 15L  success         $160.00             None          Backpacks

### Question 4: Evaluate results

Build a summary DataFrame showing:
• How many products were successfully extracted
• For successful extractions, which fields were populated and which were null
• The overall "field coverage" 

In [4]:
success_df = results_df[results_df["status"] == "success"]
total = len(results_df)
success_count = len(success_df)

print(f"Total products:     {total}")
print(f"Successful:         {success_count}")
print(f"Failed:             {total - success_count}")
if total > 0:
    print(f"Success rate:       {success_count / total:.0%}")

# Field coverage
if success_count > 0:
    fields = ["extracted_name", "extracted_brand", "extracted_price", "extracted_weight", "extracted_category"]
    print("\nField coverage (of successful extractions):")
    for field in fields:
        if field in success_df.columns:
            populated = success_df[field].notna().sum()
            print(f"  {field}: {populated}/{success_count}")

Total products:     3Successful:         3Failed:             0Success rate:       100%Field coverage (of successful extractions):  extracted_name: 3/3  extracted_brand: 3/3  extracted_price: 1/3  extracted_weight: 0/3  extracted_category: 1/3

### Question 5: Estimate costs

Using the cost estimation function from Section 11.8, calculate:
• Total input tokens across all products
• Estimated total cost for the batch
• Projected cost if you scaled to the full 450-product spreadsheet

In [5]:
import tiktoken

def estimate_extraction_cost(text: str, model: str = "gpt-4o", output_tokens: int = 300) -> dict:
    encoder = tiktoken.encoding_for_model(model)
    input_tokens = len(encoder.encode(text))
    pricing = {
        "gpt-4o": {"input": 2.50 / 1_000_000, "output": 10.00 / 1_000_000},
        "gpt-4o-mini": {"input": 0.15 / 1_000_000, "output": 0.60 / 1_000_000},
    }
    rates = pricing.get(model, pricing["gpt-4o"])
    input_cost = input_tokens * rates["input"]
    output_cost = output_tokens * rates["output"]
    total_cost = input_cost + output_cost
    return {
        "input_tokens": input_tokens,
        "output_tokens_est": output_tokens,
        "total_cost": round(total_cost, 6),
    }

total_input_tokens = 0
for row in html_data:
    if row["status"] == "fetched":
        cost_info = estimate_extraction_cost(row["cleaned_text"])
        total_input_tokens += cost_info["input_tokens"]

fetched_count = sum(1 for r in html_data if r["status"] == "fetched")
if fetched_count > 0:
    avg_tokens = total_input_tokens / fetched_count
    avg_cost = avg_tokens * (2.50 / 1_000_000) + 300 * (10.00 / 1_000_000)
    
    print(f"Total input tokens ({fetched_count} products): {total_input_tokens:,}")
    print(f"Average tokens per product:       {avg_tokens:,.0f}")
    print(f"Estimated cost ({fetched_count} products):     ${avg_cost * fetched_count:.4f}")
    print(f"Projected cost (450 products):    ${avg_cost * 450:.2f}")

Total input tokens (3 products): 14,666Average tokens per product:       4,889Estimated cost (3 products):     $0.0457Projected cost (450 products):    $6.85

### Question 6: Compare to manual extraction

For at least 2 of your products, write manual extraction code (as in Chapter 10) and compare the results to the AI extraction.

In [6]:
# Manual extraction for demonstration
def extract_manual_simple(html: str) -> dict:
    import re
    soup = BeautifulSoup(html, "html.parser")
    
    # Simple h1 extraction
    title_el = soup.find("h1")
    title = title_el.get_text(" ", strip=True) if title_el else None
    
    # Regex for price
    text = soup.get_text()
    price_match = re.search(r'\$([\d,]+(?:\.\d{2})?)', text)
    price = f"${price_match.group(1)}" if price_match else None
    
    return {
        "product_name": title,
        "price": price,
        "weight": None,  # Manual approach didn't extract
        "category": None,
    }

# Compare on first product
if len(html_data) > 0 and html_data[0]["status"] == "fetched":
    manual_result = extract_manual_simple(requests.get(df.iloc[0]["product_url"]).text)
    ai_result = results_df.iloc[0] if len(results_df) > 0 else None
    
    comparison = pd.DataFrame({
        "Field": ["product_name", "price", "weight", "category"],
        "Manual": [manual_result["product_name"], manual_result["price"], 
                   manual_result["weight"], manual_result["category"]],
        "AI": [ai_result.get("extracted_name") if ai_result is not None else None,
               ai_result.get("extracted_price") if ai_result is not None else None,
               ai_result.get("extracted_weight") if ai_result is not None else None,
               ai_result.get("extracted_category") if ai_result is not None else None],
    })
    
    print(comparison.to_string(index=False))
    print("\n✓ AI extracted more fields")
    print("✓ AI works across all sites without site-specific code")

       Field            Manual  AIproduct_name GR1 USA - Cordura GR1       price           $155.00 NaN      weight               NaN NaN    category               NaN NaN✓ AI extracted more fields✓ AI works across all sites without site-specific code