# Chapter 11 Guide

## Setup

In [1]:
# Load API keys from .env file
from dotenv import load_dotenv
load_dotenv()

print("✓ Environment variables loaded")
print("Ready to run listings 11.1-11.3, 11.5-11.9")

✓ Environment variables loaded
Ready to run listings 11.1-11.3, 11.5-11.9


## 11.1

In [2]:
import os  #A
import requests  #B
import pandas as pd

print("="*60)
print("LISTING 11.1: Search Product URLs with SerpAPI")
print("="*60)

SERPAPI_KEY = os.getenv("SERPAPI_KEY")  #C

def search_product_urls(search_key: str, num_results: int = 5) -> list[dict]:  #D
    """Search for product page candidates using SerpAPI."""
    params = {  #E
        "q": search_key,
        "api_key": SERPAPI_KEY,
        "num": num_results,
        "engine": "google",
    }
    resp = requests.get("https://serpapi.com/search", params=params)  #F
    resp.raise_for_status()
    data = resp.json()
    
    candidates = []  #G
    for result in data.get("organic_results", []):
        candidates.append({
            "title": result.get("title", ""),
            "url": result.get("link", ""),
            "snippet": result.get("snippet", ""),
            "position": result.get("position", 0),
        })
    return candidates  #H

# Example usage
search_query = "GORUCK GR1 26L"
print(f"\nSearching for: '{search_query}'\n")

candidates = search_product_urls(search_query)  #I

# Display as DataFrame
df_candidates = pd.DataFrame(candidates)
print(f"Found {len(candidates)} candidate URLs:\n")
display(df_candidates[['position', 'title', 'url']])

print(f"\n✓ {len(candidates)} URLs ready for AI ranking")

  from pandas.core import (


LISTING 11.1: Search Product URLs with SerpAPI

Searching for: 'GORUCK GR1 26L'

Found 4 candidate URLs:



Unnamed: 0,position,title,url
0,1,GR1,https://www.goruck.com/collections/gr1?srsltid...
1,2,GR1,https://www.goruck.com/products/gr1-usa?srslti...
2,3,Goruck Gr1 26 liter worth it? : r/onebag,https://www.reddit.com/r/onebag/comments/1fydq...
3,4,GR1 USA - Heritage Waxed Canvas,https://www.goruck.com/products/gr1-usa-herita...



✓ 4 URLs ready for AI ranking


## 11.2

In [3]:
import openai  #A
from pydantic import BaseModel  #B

print("="*60)
print("LISTING 11.2: Rank URLs with AI")
print("="*60)

class URLRanking(BaseModel):  #C
    best_url: str
    confidence: str  # "high", "medium", "low"
    reasoning: str

def rank_urls_with_ai(  #D
    search_key: str,
    candidates: list[dict],
    model: str = "gpt-4o",
) -> URLRanking:
    """Use an LLM to pick the best product page from search results."""
    
    candidate_text = ""  #E
    for c in candidates:
        candidate_text += (
            f"Position {c['position']}:\n"
            f"  Title: {c['title']}\n"
            f"  URL: {c['url']}\n"
            f"  Snippet: {c['snippet']}\n\n"
        )
    
    system_prompt = """You are a data engineering assistant helping build a product database.
Given a product search key and a list of candidate URLs from search results,
pick the single best URL for extracting structured product data.

Prefer:
1. Manufacturer or official brand pages
2. Pages likely to contain: product name, price, description, weight, images
3. Individual product pages over category or listing pages
4. Stable URLs over session-specific or filtered URLs

Avoid:
- Review sites, forums, Reddit threads
- Retailer pages when a manufacturer page is available
- Category pages that list multiple products

Return the best URL, your confidence level, and a brief explanation."""  #F
    
    user_prompt = (
        f"Product: {search_key}\n\nCandidate URLs:\n{candidate_text}"
    )  #G
    
    response = openai.beta.chat.completions.parse(  #H
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        response_format=URLRanking,  #I
    )
    return response.choices[0].message.parsed  #J

# Example usage
print(f"\nAsking AI to rank 4{len(candidates)} URLs...\n")
ranking = rank_urls_with_ai(search_query, candidates)  #K

# Display results
print(f"Best URL (Confidence: {ranking.confidence}):")
print(f"{ranking.best_url}\n")
print(f"Reasoning:")
print(f"{ranking.reasoning}\n")
print(f"✓ Best URL selected for extraction")

LISTING 11.2: Rank URLs with AI

Asking AI to rank 4 URLs...

Best URL (Confidence: High):
https://www.goruck.com/products/gr1-usa?srsltid=AfmBOoqQDJXhcczl4aydkWLHDQnrTe4j4uzDdYWugoVm03qEMbgqbz5S

Reasoning:
The candidate URL from Position 2 (https://www.goruck.com/products/gr1-usa) is the best choice because it appears to be a direct product page for the GORUCK GR1 26L backpack. This URL is likely to contain comprehensive structured product data such as detailed product name, price, description, and possibly images, as it is designated as a product page rather than a collection or category page. Additionally, it is an official brand page directly from GORUCK, ensuring authenticity and accuracy.

✓ Best URL selected for extraction


## 11.3

In [4]:
from bs4 import BeautifulSoup  #A

print("="*60)
print("LISTING 11.3: Aggressive HTML Cleaning")
print("="*60)

REMOVE_TAGS = [  #B
    "script", "style", "nav", "footer", "header",
    "iframe", "noscript", "svg", "form",
]

REMOVE_CLASSES = [  #C
    "breadcrumb", "related-products", "recently-viewed",
    "newsletter", "cookie-banner", "site-footer",
    "site-header", "cart-drawer", "search-modal",
    "review", "reviews", "ratings",
]

def clean_html_aggressive(html: str) -> str:  #D
    """Remove non-product HTML elements to reduce noise and token count."""
    soup = BeautifulSoup(html, "html.parser")
    
    # Remove unwanted tags entirely
    for tag_name in REMOVE_TAGS:  #E
        for element in soup.find_all(tag_name):
            element.decompose()
    
    # Remove elements by class name patterns
    for class_pattern in REMOVE_CLASSES:  #F
        for element in soup.find_all(
            class_=lambda c: c and class_pattern in " ".join(c).lower()
        ):
            element.decompose()
    
    # Remove empty elements
    for element in soup.find_all():  #G
        if not element.get_text(strip=True) and not element.find("img"):
            element.decompose()
    
    clean_text = " ".join(soup.stripped_strings)  #H
    return clean_text

# Example usage - fetch the best URL from previous step
print(f"\nFetching HTML from: {ranking.best_url}\n")
raw_html = requests.get(ranking.best_url).text  #I
clean = clean_html_aggressive(raw_html)

# Display cleaning stats
reduction_pct = (1 - len(clean) / len(raw_html)) * 100
print(f"Cleaning Results:")
print(f"  Raw HTML:    {len(raw_html):,} characters")
print(f"  Cleaned:     {len(clean):,} characters")
print(f"  Reduction:   {reduction_pct:.1f}%\n")
print(f"First 200 characters of cleaned text:")
print(f"{clean[:200]}...\n")
print(f"✓ HTML cleaned and ready for extraction")

LISTING 11.3: Aggressive HTML Cleaning

Fetching HTML from: https://www.goruck.com/products/gr1-usa?srsltid=AfmBOoqQDJXhcczl4aydkWLHDQnrTe4j4uzDdYWugoVm03qEMbgqbz5S

Cleaning Results:
  Raw HTML:    2,032,625 characters
  Cleaned:     20,419 characters
  Reduction:   99.0%

First 200 characters of cleaned text:
GR1 | GORUCK Skip to content Presidents Day Sale | Steals, Deals & Bundles FREE RUCK PLATES WITH BASIC RUCKER FREE CURVED PLATES WITH RUCKING WEIGHT VEST FREE USA Shipping for GORUCK Tribe Members Pro...

✓ HTML cleaned and ready for extraction


## 11.4

In [5]:
from pydantic import BaseModel, Field  #A
from typing import Optional  #B

print("="*60)
print("LISTING 11.4: Define Extraction Schema")
print("="*60)

class ProductExtraction(BaseModel):  #C
    """Schema for extracting product data from web page content."""
    product_name: str = Field(  #D
        description="Full product name as shown on the page"
    )
    brand_name: str = Field(  #E
        description="Manufacturer or brand name"
    )
    description: Optional[str] = Field(  #F
        default=None,
        description="Product description, typically 1-3 sentences"
    )
    price: Optional[str] = Field(  #G
        default=None,
        description="Current retail price including currency symbol"
    )
    weight: Optional[str] = Field(  #H
        default=None,
        description="Product weight with unit (e.g., '2.5 lbs', '1.1 kg')"
    )
    primary_image_url: Optional[str] = Field(  #I
        default=None,
        description="URL of the main product image"
    )
    category: Optional[str] = Field(  #J
        default=None,
        description="Product category (e.g., backpack, tent, sleeping bag)"
    )

# Display schema
print("\nProductExtraction Schema:")
for field_name, field_info in ProductExtraction.model_fields.items():
    required = "required" if field_info.is_required() else "optional"
    print(f"  • {field_name:20s} ({required})")

print(f"\n✓ Schema defined with {len(ProductExtraction.model_fields)} fields")

LISTING 11.5: Define Extraction Schema

ProductExtraction Schema:
  • product_name         (required)
  • brand_name           (required)
  • description          (optional)
  • price                (optional)
  • weight               (optional)
  • primary_image_url    (optional)
  • category             (optional)

✓ Schema defined with 7 fields


## 11.5

In [6]:
import openai  #A

print("="*60)
print("LISTING 11.5: AI Product Extraction")
print("="*60)

EXTRACTION_PROMPT = """You are a product data extraction assistant for a data engineering pipeline.

Given the text content of a product web page, extract the following fields accurately:
- product_name: The full product name as displayed on the page
- brand_name: The manufacturer or brand
- description: A concise product description (1-3 sentences)
- price: The current retail price with currency symbol
- weight: The product weight with unit if available
- primary_image_url: The URL of the main product image if found in the text
- category: The product category (backpack, tent, sleeping bag, headlamp, etc.)

Rules:
- Only extract information that is explicitly present in the text
- Use null for any field you cannot find or confidently determine
- Do not guess or fabricate values
- For price, use the current or sale price, not the original price if both are shown
- For weight, include the unit (lbs, oz, kg, g)
- For category, use a simple label based on what the product is"""  #B

def extract_product_with_ai(  #C
    cleaned_text: str,
    model: str = "gpt-4o",
) -> ProductExtraction:
    """Extract product fields from cleaned page text using an LLM."""
    response = openai.beta.chat.completions.parse(  #D
        model=model,
        messages=[
            {"role": "system", "content": EXTRACTION_PROMPT},
            {"role": "user", "content": cleaned_text[:8000]},  #E
        ],
        response_format=ProductExtraction,  #F
    )
    return response.choices[0].message.parsed  #G

# Example usage
print(f"\nExtracting product data from cleaned HTML...\n")
product = extract_product_with_ai(clean)  #H

# Display extraction results
print(f"Extracted Product Data:")
print(f"  Name:        {product.product_name}")
print(f"  Brand:       {product.brand_name}")
print(f"  Price:       {product.price}")
print(f"  Weight:      {product.weight}")
print(f"  Category:    {product.category}")
if product.description:
    desc_preview = product.description[:100] + "..." if len(product.description) > 100 else product.description
    print(f"  Description: {desc_preview}")

print(f"\n✓ Product data extracted successfully")

LISTING 11.6: AI Product Extraction

Extracting product data from cleaned HTML...

Extracted Product Data:
  Name:        GR1 USA - Cordura
  Brand:       GORUCK
  Price:       $335.00
  Weight:      2.8 LBS
  Category:    backpack
  Description: The GR1 is a rugged rucksack built to thrive in both urban and extreme conditions. Tested by Green B...

✓ Product data extracted successfully


## 11.6

In [7]:
import pandas as pd  #A
from bs4 import BeautifulSoup  #B

print("="*60)
print("LISTING 11.6: Manual vs AI Extraction Comparison")
print("="*60)

def extract_manual_goruck(html: str) -> dict:  #C
    """Manual extraction using Chapter 10's CSS selector approach."""
    soup = BeautifulSoup(html, "html.parser")
    
    title_el = soup.find("h1")
    title = title_el.get_text(" ", strip=True) if title_el else None
    
    price_el = (
        soup.select_one(
            "div.product-block__price span.price-item--sale.price-item--last"
        )
        or soup.select_one("div.product-block__price span.price-item--regular")
    )
    price = price_el.get_text(" ", strip=True) if price_el else None
    
    return {
        "product_name": title,
        "brand_name": "GORUCK",  # hardcoded for this site
        "price": price,
        "weight": None,  # manual approach did not extract this
        "category": None,  # manual approach did not extract this
        "description": None,  # manual approach did not extract this
    }

# Run both approaches on the same page
print("\nComparing manual (CSS selectors) vs AI extraction...\n")
manual_result = extract_manual_goruck(raw_html)  #E
ai_result = product  #F (from previous cell)

# Build comparison table
comparison = pd.DataFrame({  #G
    "Field": ["product_name", "brand_name", "price", "weight",
              "category", "description"],
    "Manual (Ch 10)": [
        manual_result["product_name"],
        manual_result["brand_name"],
        manual_result["price"],
        manual_result["weight"],
        manual_result["category"],
        manual_result["description"],
    ],
    "AI (Ch 11)": [
        ai_result.product_name,
        ai_result.brand_name,
        ai_result.price,
        ai_result.weight,
        ai_result.category,
        ai_result.description[:50] + "..." if ai_result.description else None,
    ],
})

display(comparison)

# Count populated fields
manual_populated = sum(1 for v in manual_result.values() if v is not None)
ai_populated = sum(1 for v in [ai_result.product_name, ai_result.brand_name, ai_result.price, 
                                 ai_result.weight, ai_result.category, ai_result.description] if v is not None)

print(f"\nComparison Summary:")
print(f"  Manual approach: {manual_populated}/6 fields populated")
print(f"  AI approach:     {ai_populated}/6 fields populated")
print(f"\n✓ AI extracted {ai_populated - manual_populated} additional fields")

LISTING 11.7: Manual vs AI Extraction Comparison

Comparing manual (CSS selectors) vs AI extraction...



Unnamed: 0,Field,Manual (Ch 10),AI (Ch 11)
0,product_name,GR1 USA - Cordura,GR1 USA - Cordura
1,brand_name,GORUCK,GORUCK
2,price,$335.00,$335.00
3,weight,,2.8 LBS
4,category,,backpack
5,description,,The GR1 is a rugged rucksack built to thrive i...



Comparison Summary:
  Manual approach: 3/6 fields populated
  AI approach:     6/6 fields populated

✓ AI extracted 3 additional fields


## 11.7

In [8]:
import time  #A
import pandas as pd  #B

print("="*60)
print("LISTING 11.7: Batch AI Extraction Across Multiple Product Pages")
print("="*60)

# Product names from our curated list (selected for reliable extraction)  #C
test_products = [
    "GORUCK GR1 26L",
    "GORUCK GR2 34L", 
    "5.11 Tactical Rush72 2.0",
    "5.11 Tactical Rush24 2.0",
]

print(f"\nProcessing {len(test_products)} products...\n")

results = []  #D
for search_key in test_products:
    record = {
        "search_key": search_key,
        "status": "error",
    }
    try:
        print(f"  • {search_key}...")
        
        # Step 1: Search for product URLs  #E
        candidates = search_product_urls(search_key)
        if not candidates:
            record["status"] = "no_results"
            results.append(record)
            continue
            
        # Step 2: Rank URLs with AI  #F
        ranking = rank_urls_with_ai(search_key, candidates)
        record["url"] = ranking.best_url
        
        # Step 3: Fetch and clean HTML  #G
        raw_html = requests.get(ranking.best_url, timeout=10).text
        cleaned = clean_html_aggressive(raw_html)
        
        # Step 4: Extract product data with AI  #H
        extraction = extract_product_with_ai(cleaned)
        
        record["product_name"] = extraction.product_name
        record["brand_name"] = extraction.brand_name
        record["price"] = extraction.price
        record["weight"] = extraction.weight
        record["category"] = extraction.category
        record["description"] = (
            extraction.description[:60] + "..."
            if extraction.description
            else None
        )
        record["status"] = "success"  #I
    except Exception as e:
        record["status"] = f"error: {type(e).__name__}"  #J
        print(f"    Error: {type(e).__name__}")
    
    results.append(record)
    time.sleep(2)  #K

out = pd.DataFrame(results)  #L
print(f"\nBatch Extraction Results:\n")
display(out[["search_key", "status", "product_name", "price", "weight", "category"]])

success_count = (out["status"] == "success").sum()
print(f"\n✓ {success_count}/{len(test_products)} products extracted successfully")

LISTING 11.8: Batch AI Extraction Across Multiple Product Pages

Processing 4 products...

  • GORUCK GR1 26L...
  • GORUCK GR2 34L...
  • 5.11 Tactical Rush72 2.0...
  • 5.11 Tactical Rush24 2.0...

Batch Extraction Results:



Unnamed: 0,search_key,status,product_name,price,weight,category
0,GORUCK GR1 26L,success,GR1 USA - Cordura,$335.00,2.8 LBS,backpack
1,GORUCK GR2 34L,success,GR2 - Cordura,$385.00,4.1 lbs,backpack
2,5.11 Tactical Rush72 2.0,success,RUSH® 72 2.0 Backpack 55L,$152.00,2.4 kgs,backpack
3,5.11 Tactical Rush24 2.0,success,RUSH® 24 2.0 Backpack 37L,$112.00,3.85 lb,backpack



✓ 4/4 products extracted successfully


## 11.8 - Listing 11.8: Token & Cost Estimation

In [9]:
import tiktoken  #A

print("="*60)
print("LISTING 11.8: Token & Cost Estimation")
print("="*60)

def estimate_extraction_cost(  #B
    text: str,
    model: str = "gpt-4o",
    output_tokens: int = 300,
) -> dict:
    """Estimate the token count and cost for an extraction call."""
    encoder = tiktoken.encoding_for_model(model)  #C
    input_tokens = len(encoder.encode(text))  #D
    
    # Pricing as of mid-2025 (check docs for current rates)
    pricing = {  #E
        "gpt-4o": {"input": 2.50 / 1_000_000, "output": 10.00 / 1_000_000},
        "gpt-4o-mini": {"input": 0.15 / 1_000_000, "output": 0.60 / 1_000_000},
    }
    
    rates = pricing.get(model, pricing["gpt-4o"])  #F
    input_cost = input_tokens * rates["input"]
    output_cost = output_tokens * rates["output"]
    total_cost = input_cost + output_cost
    
    return {  #G
        "input_tokens": input_tokens,
        "output_tokens_est": output_tokens,
        "input_cost": round(input_cost, 6),
        "output_cost": round(output_cost, 6),
        "total_cost": round(total_cost, 6),
    }

# Estimate for a single product page
print(f"\nEstimating costs for cleaned HTML (model: gpt-4o)...\n")
cost = estimate_extraction_cost(clean)  #H

print(f"Token & Cost Analysis:")
print(f"  Input tokens:          {cost['input_tokens']:,}")
print(f"  Output tokens (est):   {cost['output_tokens_est']:,}")
print(f"  Cost per page:         ${cost['total_cost']:.4f}")
print(f"  Cost for 450 products: ${cost['total_cost'] * 450:.2f}\n")
print(f"✓ Cost estimate: ~${cost['total_cost'] * 450:.2f} for full pipeline")

LISTING 11.9: Token & Cost Estimation

Estimating costs for cleaned HTML (model: gpt-4o)...

Token & Cost Analysis:
  Input tokens:          5,121
  Output tokens (est):   300
  Cost per page:         $0.0158
  Cost for 450 products: $7.11

✓ Cost estimate: ~$7.11 for full pipeline
