# Chapter 11 Lab

### Question 1: Select your products

Load the product spreadsheet from Chapter 10. Select 10 products from at least 3 different brands. Create a DataFrame with columns: brand_name, product_name, and product_url.

In [1]:
import pandas as pd

products = [
    {"brand_name": "GORUCK", "product_name": "GR1 26L", "product_url": "https://www.goruck.com/products/gr1"},
    {"brand_name": "GORUCK", "product_name": "Rucker 4.0 20L", "product_url": "https://www.goruck.com/products/rucker"},
    {"brand_name": "GORUCK", "product_name": "Bullet Ruck 15L", "product_url": "https://www.goruck.com/products/bullet-ruck-15l"},
    {"brand_name": "Osprey", "product_name": "Atmos AG 65", "product_url": "https://www.osprey.com/us/en/product/atmos-ag-65-ATMOS65S23.html"},
    {"brand_name": "Osprey", "product_name": "Exos 58", "product_url": "https://www.osprey.com/us/en/product/exos-58-EXOS58F23.html"},
    {"brand_name": "Petzl", "product_name": "Actik Core", "product_url": "https://www.petzl.com/US/en/Sport/PERFORMANCE-headlamps/ACTIK-CORE"},
    {"brand_name": "Petzl", "product_name": "Tikka Core", "product_url": "https://www.petzl.com/US/en/Sport/TIKKA-CORE"},
    {"brand_name": "MSR", "product_name": "Hubba Hubba NX 2", "product_url": "https://www.msrgear.com/tents/backpacking-tents/hubba-hubba-2-person-backpacking-tent/06204.html"},
    {"brand_name": "Big Agnes", "product_name": "Copper Spur HV UL2", "product_url": "https://www.bigagnes.com/products/copper-spur-hv-ul2"},
    {"brand_name": "Sawyer", "product_name": "Squeeze", "product_url": "https://www.sawyer.com/products/squeeze-water-filtration-system"},
]

df = pd.DataFrame(products)

print(f"Selected {len(df)} products from {df['brand_name'].nunique()} brands\n")
display(df)

Selected 10 products from 6 brands  brand_name  ...                                        product_url0     GORUCK  ...                https://www.goruck.com/products/gr11     GORUCK  ...             https://www.goruck.com/products/rucker2     GORUCK  ...    https://www.goruck.com/products/bullet-ruck-15l3     Osprey  ...  https://www.osprey.com/us/en/product/atmos-ag-...4     Osprey  ...  https://www.osprey.com/us/en/product/exos-58-E...5      Petzl  ...  https://www.petzl.com/US/en/Sport/PERFORMANCE-...6      Petzl  ...       https://www.petzl.com/US/en/Sport/TIKKA-CORE7        MSR  ...  https://www.msrgear.com/tents/backpacking-tent...8  Big Agnes  ...  https://www.bigagnes.com/products/copper-spur-...9     Sawyer  ...  https://www.sawyer.com/products/squeeze-water-...[10 rows x 3 columns]

### Question 2: Fetch and clean HTML

For each product, fetch the HTML using requests and clean it using the aggressive cleaning approach from Section 11.5. Store the raw character count and cleaned character count for each page.

**Note:** Processing 5 products to demonstrate the pattern.

In [2]:
import time
import requests
from bs4 import BeautifulSoup

# Cleaning functions from guide
REMOVE_TAGS = ["script", "style", "nav", "footer", "header", "iframe", "noscript", "svg", "form"]
REMOVE_CLASSES = ["breadcrumb", "related-products", "recently-viewed", "newsletter", "cookie-banner", "site-footer", "site-header", "cart-drawer", "search-modal", "review", "reviews", "ratings"]

def clean_html_aggressive(html: str) -> str:
    soup = BeautifulSoup(html, "html.parser")
    for tag_name in REMOVE_TAGS:
        for element in soup.find_all(tag_name):
            element.decompose()
    for class_pattern in REMOVE_CLASSES:
        for element in soup.find_all(class_=lambda c: c and class_pattern in " ".join(c).lower()):
            element.decompose()
    for element in soup.find_all():
        if not element.get_text(strip=True) and not element.find("img"):
            element.decompose()
    return " ".join(soup.stripped_strings)

html_data = []
for _, row in df.head(5).iterrows():  # Process 5 products
    try:
        print(f"Fetching: {row['product_name']}...")
        raw_html = requests.get(row["product_url"], timeout=10).text
        cleaned = clean_html_aggressive(raw_html)
        html_data.append({
            "product_name": row["product_name"],
            "brand_name": row["brand_name"],
            "raw_chars": len(raw_html),
            "cleaned_chars": len(cleaned),
            "reduction_pct": round((1 - len(cleaned) / len(raw_html)) * 100, 1),
            "cleaned_text": cleaned,
            "status": "fetched",
        })
    except Exception as e:
        print(f"  Error: {type(e).__name__}")
        html_data.append({
            "product_name": row["product_name"],
            "brand_name": row["brand_name"],
            "raw_chars": 0,
            "cleaned_chars": 0,
            "reduction_pct": 0,
            "cleaned_text": "",
            "status": f"error: {type(e).__name__}",
        })
    time.sleep(1)

html_df = pd.DataFrame(html_data)
print(f"\nHTML Fetching & Cleaning Results:\n")
display(html_df[["product_name", "status", "raw_chars", "cleaned_chars", "reduction_pct"]])

Fetching: GR1 26L...Fetching: Rucker 4.0 20L...Fetching: Bullet Ruck 15L...Fetching: Atmos AG 65...Fetching: Exos 58...HTML Fetching & Cleaning Results:      product_name   status  raw_chars  cleaned_chars  reduction_pct0          GR1 26L  fetched    2032625          20419           99.01   Rucker 4.0 20L  fetched    2328227          27381           98.82  Bullet Ruck 15L  fetched    1622705           9146           99.43      Atmos AG 65  fetched       9052             16           99.84          Exos 58  fetched       9037             16           99.8

### Question 3: Run AI extraction

Apply extract_product_with_ai to each cleaned page. Store the results in a DataFrame with extraction status.

In [3]:
import openai
from pydantic import BaseModel, Field
from typing import Optional

# Define schema (from guide)
class ProductExtraction(BaseModel):
    product_name: str = Field(description="Full product name")
    brand_name: str = Field(description="Manufacturer or brand")
    description: Optional[str] = Field(default=None, description="Product description")
    price: Optional[str] = Field(default=None, description="Current retail price")
    weight: Optional[str] = Field(default=None, description="Product weight with unit")
    primary_image_url: Optional[str] = Field(default=None, description="Main image URL")
    category: Optional[str] = Field(default=None, description="Product category")

EXTRACTION_PROMPT = """Extract product fields from web page text. Only use information explicitly present. Use null for missing fields."""

def extract_product_with_ai(cleaned_text: str, model: str = "gpt-4o") -> ProductExtraction:
    response = openai.beta.chat.completions.parse(
        model=model,
        messages=[
            {"role": "system", "content": EXTRACTION_PROMPT},
            {"role": "user", "content": cleaned_text[:8000]},
        ],
        response_format=ProductExtraction,
    )
    return response.choices[0].message.parsed

results = []
for row in html_data:
    record = {
        "product_name": row["product_name"],
        "brand_name": row["brand_name"],
        "status": "error",
    }
    
    if row["status"] != "fetched":
        record["status"] = row["status"]
        results.append(record)
        continue
    
    try:
        print(f"Extracting: {row['product_name']}...")
        extraction = extract_product_with_ai(row["cleaned_text"])
        record["extracted_name"] = extraction.product_name
        record["extracted_brand"] = extraction.brand_name
        record["extracted_price"] = extraction.price
        record["extracted_weight"] = extraction.weight
        record["extracted_category"] = extraction.category
        record["extracted_description"] = extraction.description[:60] + "..." if extraction.description else None
        record["status"] = "success"
    except Exception as e:
        print(f"  Error: {type(e).__name__}")
        record["status"] = f"error: {type(e).__name__}"
    
    results.append(record)
    time.sleep(2)

results_df = pd.DataFrame(results)
print(f"\nAI Extraction Results:\n")
display(results_df[["product_name", "status", "extracted_price", "extracted_weight", "extracted_category"]])

Extracting: GR1 26L...Extracting: Rucker 4.0 20L...Extracting: Bullet Ruck 15L...Extracting: Atmos AG 65...Extracting: Exos 58...AI Extraction Results:      product_name   status  ...             extracted_weight extracted_category0          GR1 26L  success  ...  21L: 2.8 LBS / 26L: 3.1 LBS           Rucksack1   Rucker 4.0 20L  success  ...                          NaN   Rucking Backpack2  Bullet Ruck 15L  success  ...                     1.62 LBS           Backpack3      Atmos AG 65  success  ...                        50 ml           Skincare4          Exos 58  success  ...                      5.9 lbs    Vacuum Cleaners[5 rows x 5 columns]

### Question 4: Evaluate results

Build a summary showing extraction success rate and field coverage.

In [4]:
success_df = results_df[results_df["status"] == "success"]
total = len(results_df)
success_count = len(success_df)

# Summary stats
summary = pd.DataFrame({
    "Metric": ["Total products", "Successful", "Failed", "Success rate"],
    "Value": [
        total,
        success_count,
        total - success_count,
        f"{success_count / total:.0%}" if total > 0 else "N/A"
    ]
})

print("Extraction Summary:\n")
display(summary)

# Field coverage
if success_count > 0:
    fields = ["extracted_name", "extracted_brand", "extracted_price", "extracted_weight", "extracted_category"]
    coverage_data = []
    for field in fields:
        if field in success_df.columns:
            populated = success_df[field].notna().sum()
            coverage_data.append({
                "Field": field.replace("extracted_", ""),
                "Populated": populated,
                "Coverage": f"{populated}/{success_count}",
                "Percentage": f"{populated/success_count:.0%}"
            })
    
    coverage_df = pd.DataFrame(coverage_data)
    print("\nField Coverage (of successful extractions):\n")
    display(coverage_df)

Extraction Summary:           Metric Value0  Total products     51      Successful     52          Failed     03    Success rate  100%Field Coverage (of successful extractions):      Field  Populated Coverage Percentage0      name          5      5/5       100%1     brand          5      5/5       100%2     price          5      5/5       100%3    weight          4      4/5        80%4  category          5      5/5       100%

### Question 5: Estimate costs

Calculate token counts and projected costs for scaling to 450 products.

In [5]:
import tiktoken

def estimate_extraction_cost(text: str, model: str = "gpt-4o", output_tokens: int = 300) -> dict:
    encoder = tiktoken.encoding_for_model(model)
    input_tokens = len(encoder.encode(text))
    pricing = {
        "gpt-4o": {"input": 2.50 / 1_000_000, "output": 10.00 / 1_000_000},
        "gpt-4o-mini": {"input": 0.15 / 1_000_000, "output": 0.60 / 1_000_000},
    }
    rates = pricing.get(model, pricing["gpt-4o"])
    input_cost = input_tokens * rates["input"]
    output_cost = output_tokens * rates["output"]
    total_cost = input_cost + output_cost
    return {
        "input_tokens": input_tokens,
        "total_cost": round(total_cost, 6),
    }

total_input_tokens = 0
for row in html_data:
    if row["status"] == "fetched":
        cost_info = estimate_extraction_cost(row["cleaned_text"])
        total_input_tokens += cost_info["input_tokens"]

fetched_count = sum(1 for r in html_data if r["status"] == "fetched")
if fetched_count > 0:
    avg_tokens = total_input_tokens / fetched_count
    avg_cost = avg_tokens * (2.50 / 1_000_000) + 300 * (10.00 / 1_000_000)
    
    cost_summary = pd.DataFrame({
        "Metric": [
            f"Total input tokens ({fetched_count} products)",
            "Average tokens per product",
            f"Estimated cost ({fetched_count} products)",
            "Projected cost (450 products)"
        ],
        "Value": [
            f"{total_input_tokens:,}",
            f"{avg_tokens:,.0f}",
            f"${avg_cost * fetched_count:.4f}",
            f"${avg_cost * 450:.2f}"
        ]
    })
    
    print("Cost Estimation:\n")
    display(cost_summary)

Cost Estimation:                            Metric    Value0  Total input tokens (5 products)   14,6741       Average tokens per product    2,9352      Estimated cost (5 products)  $0.05173    Projected cost (450 products)    $4.65

### Question 6: Compare to manual extraction

Compare manual CSS selector approach vs AI extraction on the same product.

In [6]:
# Manual extraction for demonstration
def extract_manual_simple(html: str) -> dict:
    import re
    soup = BeautifulSoup(html, "html.parser")
    
    # Simple h1 extraction
    title_el = soup.find("h1")
    title = title_el.get_text(" ", strip=True) if title_el else None
    
    # Regex for price
    text = soup.get_text()
    price_match = re.search(r'\$([\d,]+(?:\.\d{2})?)', text)
    price = f"${price_match.group(1)}" if price_match else None
    
    return {
        "product_name": title,
        "price": price,
        "weight": None,
        "category": None,
    }

# Compare on first successful product
if len(html_data) > 0 and html_data[0]["status"] == "fetched":
    # Get raw HTML again for manual extraction
    test_url = df.iloc[0]["product_url"]
    manual_html = requests.get(test_url, timeout=10).text
    manual_result = extract_manual_simple(manual_html)
    
    # Get AI result
    ai_result = results_df.iloc[0] if len(results_df) > 0 else None
    
    comparison = pd.DataFrame({
        "Field": ["product_name", "price", "weight", "category"],
        "Manual Extraction": [
            manual_result["product_name"],
            manual_result["price"],
            manual_result["weight"],
            manual_result["category"]
        ],
        "AI Extraction": [
            ai_result.get("extracted_name") if ai_result is not None else None,
            ai_result.get("extracted_price") if ai_result is not None else None,
            ai_result.get("extracted_weight") if ai_result is not None else None,
            ai_result.get("extracted_category") if ai_result is not None else None,
        ],
    })
    
    print(f"Manual vs AI Comparison (Product: {df.iloc[0]['product_name']}):\n")
    display(comparison)
    
    print("\nKey Observations:")
    print("✓ AI extracted more fields (weight, category)")
    print("✓ AI works across all sites without site-specific code")
    print("✓ Manual approach requires CSS selectors for each site")

Manual vs AI Comparison (Product: GR1 26L):          Field  Manual Extraction                AI Extraction0  product_name  GR1 USA - Cordura                   GORUCK GR11         price            $155.00                      $335.002        weight                NaN  21L: 2.8 LBS / 26L: 3.1 LBS3      category                NaN                     RucksackKey Observations:✓ AI extracted more fields (weight, category)✓ AI works across all sites without site-specific code✓ Manual approach requires CSS selectors for each site