# Chapter 11 Lab

## Setup

In [1]:
# Load API keys from .env file
from dotenv import load_dotenv
load_dotenv()

print("✓ Environment variables loaded")
print("Ready to run lab exercises")

ModuleNotFoundError: No module named 'dotenv'

### Question 1: Select your products

Create a DataFrame with 10 products from at least 3 different brands.

In [2]:
import pandas as pd

print("="*60)
print("QUESTION 1: Select Products")
print("="*60)

products = [
    {"brand_name": "GORUCK", "product_name": "GR1 26L", "product_url": "https://www.goruck.com/products/gr1"},
    {"brand_name": "GORUCK", "product_name": "Rucker 4.0 20L", "product_url": "https://www.goruck.com/products/rucker"},
    {"brand_name": "GORUCK", "product_name": "Bullet Ruck 15L", "product_url": "https://www.goruck.com/products/bullet-ruck-15l"},
    {"brand_name": "Osprey", "product_name": "Atmos AG 65", "product_url": "https://www.osprey.com/us/en/product/atmos-ag-65-ATMOS65S23.html"},
    {"brand_name": "Osprey", "product_name": "Exos 58", "product_url": "https://www.osprey.com/us/en/product/exos-58-EXOS58F23.html"},
    {"brand_name": "Petzl", "product_name": "Actik Core", "product_url": "https://www.petzl.com/US/en/Sport/PERFORMANCE-headlamps/ACTIK-CORE"},
    {"brand_name": "Petzl", "product_name": "Tikka Core", "product_url": "https://www.petzl.com/US/en/Sport/TIKKA-CORE"},
    {"brand_name": "MSR", "product_name": "Hubba Hubba NX 2", "product_url": "https://www.msrgear.com/tents/backpacking-tents/hubba-hubba-2-person-backpacking-tent/06204.html"},
    {"brand_name": "Big Agnes", "product_name": "Copper Spur HV UL2", "product_url": "https://www.bigagnes.com/products/copper-spur-hv-ul2"},
    {"brand_name": "Sawyer", "product_name": "Squeeze", "product_url": "https://www.sawyer.com/products/squeeze-water-filtration-system"},
]

df = pd.DataFrame(products)

print(f"\nSelected {len(df)} products from {df['brand_name'].nunique()} brands\n")
display(df)
print(f"\n✓ Product selection complete")



### Question 2: Fetch and clean HTML

Fetch HTML for each product and apply aggressive cleaning. Demonstrate with 5 products.

In [3]:
import os
import time
import requests
from bs4 import BeautifulSoup

print("="*60)
print("QUESTION 2: Fetch & Clean HTML")
print("="*60)

# Cleaning functions from guide
REMOVE_TAGS = ["script", "style", "nav", "footer", "header", "iframe", "noscript", "svg", "form"]
REMOVE_CLASSES = ["breadcrumb", "related-products", "recently-viewed", "newsletter", "cookie-banner", "site-footer", "site-header", "cart-drawer", "search-modal", "review", "reviews", "ratings"]

def clean_html_aggressive(html: str) -> str:
    soup = BeautifulSoup(html, "html.parser")
    for tag_name in REMOVE_TAGS:
        for element in soup.find_all(tag_name):
            element.decompose()
    for class_pattern in REMOVE_CLASSES:
        for element in soup.find_all(class_=lambda c: c and class_pattern in " ".join(c).lower()):
            element.decompose()
    for element in soup.find_all():
        if not element.get_text(strip=True) and not element.find("img"):
            element.decompose()
    return " ".join(soup.stripped_strings)

SERPAPI_KEY = os.getenv("SERPAPI_KEY")

def search_fallback_url(brand: str, product: str, avoid_domain: str = None) -> str:
    """Search for a working product URL when direct fetch fails."""
    # Search with retailer preference for JS-heavy brand sites
    search_key = f"{brand} {product} buy"
    params = {"q": search_key, "api_key": SERPAPI_KEY, "num": 10, "engine": "google"}
    resp = requests.get("https://serpapi.com/search", params=params)
    results = resp.json().get("organic_results", [])
    
    # Prefer retailers with static HTML (REI, Backcountry, etc.)
    preferred_retailers = ["rei.com", "backcountry.com", "moosejaw.com", "ems.com"]
    
    for r in results:
        url = r.get("link", "")
        # Skip the domain that didn't work
        if avoid_domain and avoid_domain in url:
            continue
        # Prefer known good retailers
        if any(retailer in url for retailer in preferred_retailers):
            return url
    
    # Fallback: any result not from the avoided domain
    for r in results:
        url = r.get("link", "")
        if avoid_domain and avoid_domain not in url:
            return url
    return None

print(f"\nFetching & cleaning HTML for 5 products...\n")

html_data = []
for _, row in df.head(5).iterrows():
    try:
        print(f"  • {row['product_name']}...")
        url = row["product_url"]
        raw_html = requests.get(url, timeout=10).text
        cleaned = clean_html_aggressive(raw_html)
        
        # Fallback: if cleaned content is too short, search for a better URL
        if len(cleaned) < 100:
            from urllib.parse import urlparse
            original_domain = urlparse(url).netloc
            print(f"    → Content too short ({len(cleaned)} chars), searching for better URL...")
            print(f"    → Avoiding {original_domain} (JS-heavy)")
            fallback_url = search_fallback_url(row["brand_name"], row["product_name"], avoid_domain=original_domain)
            if fallback_url and fallback_url != url:
                print(f"    → Found: {fallback_url[:60]}...")
                raw_html = requests.get(fallback_url, timeout=10).text
                cleaned = clean_html_aggressive(raw_html)
                url = fallback_url
        
        html_data.append({
            "product_name": row["product_name"],
            "brand_name": row["brand_name"],
            "raw_chars": len(raw_html),
            "cleaned_chars": len(cleaned),
            "reduction_pct": round((1 - len(cleaned) / len(raw_html)) * 100, 1),
            "cleaned_text": cleaned,
            "status": "fetched",
            "url_used": url,
        })
    except Exception as e:
        print(f"    Error: {type(e).__name__}")
        html_data.append({
            "product_name": row["product_name"],
            "brand_name": row["brand_name"],
            "raw_chars": 0,
            "cleaned_chars": 0,
            "reduction_pct": 0,
            "cleaned_text": "",
            "status": f"error: {type(e).__name__}",
            "url_used": row["product_url"],
        })
    time.sleep(1)

html_df = pd.DataFrame(html_data)
print(f"\nHTML Fetching & Cleaning Results:\n")
display(html_df[["product_name", "status", "raw_chars", "cleaned_chars", "reduction_pct"]])

avg_reduction = html_df[html_df["status"] == "fetched"]["reduction_pct"].mean()
print(f"\n✓ Average HTML reduction: {avg_reduction:.1f}%")



### Question 3: Run AI extraction

Extract product data using AI for all successfully fetched pages.

In [4]:
import openai
from pydantic import BaseModel, Field
from typing import Optional

print("="*60)
print("QUESTION 3: AI Product Extraction")
print("="*60)

# Define schema
class ProductExtraction(BaseModel):
    product_name: str = Field(description="Full product name")
    brand_name: str = Field(description="Manufacturer or brand")
    description: Optional[str] = Field(default=None, description="Product description")
    price: Optional[str] = Field(default=None, description="Current retail price")
    weight: Optional[str] = Field(default=None, description="Product weight with unit")
    primary_image_url: Optional[str] = Field(default=None, description="Main image URL")
    category: Optional[str] = Field(default=None, description="Product category")

EXTRACTION_PROMPT = """Extract product fields from web page text. Only use information explicitly present. Use null for missing fields."""

def extract_product_with_ai(cleaned_text: str, model: str = "gpt-4o") -> ProductExtraction:
    response = openai.beta.chat.completions.parse(
        model=model,
        messages=[
            {"role": "system", "content": EXTRACTION_PROMPT},
            {"role": "user", "content": cleaned_text[:8000]},
        ],
        response_format=ProductExtraction,
    )
    return response.choices[0].message.parsed

print(f"\nRunning AI extraction on {len(html_data)} products...\n")

results = []
for row in html_data:
    record = {
        "product_name": row["product_name"],
        "brand_name": row["brand_name"],
        "status": "error",
    }
    
    if row["status"] != "fetched":
        record["status"] = row["status"]
        results.append(record)
        continue
    
    try:
        print(f"  • {row['product_name']}...")
        extraction = extract_product_with_ai(row["cleaned_text"])
        record["extracted_name"] = extraction.product_name
        record["extracted_brand"] = extraction.brand_name
        record["extracted_price"] = extraction.price
        record["extracted_weight"] = extraction.weight
        record["extracted_category"] = extraction.category
        record["extracted_description"] = extraction.description[:60] + "..." if extraction.description else None
        record["status"] = "success"
    except Exception as e:
        print(f"    Error: {type(e).__name__}")
        record["status"] = f"error: {type(e).__name__}"
    
    results.append(record)
    time.sleep(2)

results_df = pd.DataFrame(results)
print(f"\nAI Extraction Results:\n")
display(results_df[["product_name", "status", "extracted_price", "extracted_weight", "extracted_category"]])

success_count = (results_df["status"] == "success").sum()
print(f"\n✓ {success_count}/{len(results_df)} extractions successful")



### Question 4: Evaluate results

Analyze extraction success rate and field coverage.

In [5]:
print("="*60)
print("QUESTION 4: Evaluate Results")
print("="*60)

success_df = results_df[results_df["status"] == "success"]
total = len(results_df)
success_count = len(success_df)

# Summary stats
print(f"\nExtraction Summary:\n")
summary = pd.DataFrame({
    "Metric": ["Total products", "Successful", "Failed", "Success rate"],
    "Value": [
        total,
        success_count,
        total - success_count,
        f"{success_count / total:.0%}" if total > 0 else "N/A"
    ]
})
display(summary)

# Field coverage
if success_count > 0:
    fields = ["extracted_name", "extracted_brand", "extracted_price", "extracted_weight", "extracted_category"]
    coverage_data = []
    for field in fields:
        if field in success_df.columns:
            populated = success_df[field].notna().sum()
            coverage_data.append({
                "Field": field.replace("extracted_", ""),
                "Populated": populated,
                "Coverage": f"{populated}/{success_count}",
                "Percentage": f"{populated/success_count:.0%}"
            })
    
    coverage_df = pd.DataFrame(coverage_data)
    print(f"\nField Coverage (of successful extractions):\n")
    display(coverage_df)
    
    avg_coverage = sum(row["Populated"] for _, row in coverage_df.iterrows()) / (len(coverage_df) * success_count) * 100
    print(f"\n✓ Average field coverage: {avg_coverage:.0f}%")



### Question 5: Estimate costs

Calculate token usage and project costs for scaling.

In [6]:
import tiktoken

print("="*60)
print("QUESTION 5: Cost Estimation")
print("="*60)

def estimate_extraction_cost(text: str, model: str = "gpt-4o", output_tokens: int = 300) -> dict:
    encoder = tiktoken.encoding_for_model(model)
    input_tokens = len(encoder.encode(text))
    pricing = {
        "gpt-4o": {"input": 2.50 / 1_000_000, "output": 10.00 / 1_000_000},
        "gpt-4o-mini": {"input": 0.15 / 1_000_000, "output": 0.60 / 1_000_000},
    }
    rates = pricing.get(model, pricing["gpt-4o"])
    input_cost = input_tokens * rates["input"]
    output_cost = output_tokens * rates["output"]
    total_cost = input_cost + output_cost
    return {
        "input_tokens": input_tokens,
        "total_cost": round(total_cost, 6),
    }

print(f"\nCalculating costs for {len(html_data)} products...\n")

total_input_tokens = 0
for row in html_data:
    if row["status"] == "fetched":
        cost_info = estimate_extraction_cost(row["cleaned_text"])
        total_input_tokens += cost_info["input_tokens"]

fetched_count = sum(1 for r in html_data if r["status"] == "fetched")
if fetched_count > 0:
    avg_tokens = total_input_tokens / fetched_count
    avg_cost = avg_tokens * (2.50 / 1_000_000) + 300 * (10.00 / 1_000_000)
    
    cost_summary = pd.DataFrame({
        "Metric": [
            f"Total input tokens ({fetched_count} products)",
            "Average tokens per product",
            f"Estimated cost ({fetched_count} products)",
            "Projected cost (450 products)"
        ],
        "Value": [
            f"{total_input_tokens:,}",
            f"{avg_tokens:,.0f}",
            f"${avg_cost * fetched_count:.4f}",
            f"${avg_cost * 450:.2f}"
        ]
    })
    
    print("Cost Estimation:\n")
    display(cost_summary)
    print(f"\n✓ Projected cost for full pipeline: ${avg_cost * 450:.2f}")



### Question 6: Compare manual vs AI extraction

Compare manual CSS selector approach to AI extraction.

In [7]:
print("="*60)
print("QUESTION 6: Manual vs AI Comparison")
print("="*60)

# Manual extraction function
def extract_manual_simple(html: str) -> dict:
    import re
    soup = BeautifulSoup(html, "html.parser")
    
    title_el = soup.find("h1")
    title = title_el.get_text(" ", strip=True) if title_el else None
    
    text = soup.get_text()
    price_match = re.search(r'\$([\d,]+(?:\.\d{2})?)', text)
    price = f"${price_match.group(1)}" if price_match else None
    
    return {
        "product_name": title,
        "price": price,
        "weight": None,
        "category": None,
    }

# Compare on first successful product
if len(html_data) > 0 and html_data[0]["status"] == "fetched":
    test_url = df.iloc[0]["product_url"]
    print(f"\nComparing extractions for: {df.iloc[0]['product_name']}\n")
    
    manual_html = requests.get(test_url, timeout=10).text
    manual_result = extract_manual_simple(manual_html)
    
    ai_result = results_df.iloc[0].to_dict() if len(results_df) > 0 else None
    
    comparison = pd.DataFrame({
        "Field": ["product_name", "price", "weight", "category"],
        "Manual (CSS selectors)": [
            manual_result["product_name"],
            manual_result["price"],
            manual_result["weight"],
            manual_result["category"]
        ],
        "AI (GPT-4o)": [
            ai_result.get("extracted_name") if ai_result else None,
            ai_result.get("extracted_price") if ai_result else None,
            ai_result.get("extracted_weight") if ai_result else None,
            ai_result.get("extracted_category") if ai_result else None,
        ],
    })
    
    display(comparison)
    
    manual_populated = sum(1 for v in manual_result.values() if v is not None)
    ai_populated = sum(1 for k in ["extracted_name", "extracted_price", "extracted_weight", "extracted_category"] 
                      if ai_result and ai_result.get(k) is not None)
    
    print(f"\nComparison Summary:")
    print(f"  Manual approach:  {manual_populated}/4 fields populated")
    print(f"  AI approach:      {ai_populated}/4 fields populated")
    print(f"\nKey Observations:")
    print(f"  ✓ AI extracted {ai_populated - manual_populated} additional field(s)")
    print(f"  ✓ AI works across all sites without site-specific code")
    print(f"  ✓ Manual approach requires CSS selectors for each site")

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().