# Chapter 11 Guide

## 11.4.1

In [1]:
import os  #A
import requests  #B

SERPAPI_KEY = os.getenv("SERPAPI_KEY")  #C

def search_product_urls(search_key: str, num_results: int = 5) -> list[dict]:  #D
    """Search for product page candidates using SerpAPI."""
    params = {  #E
        "q": search_key,
        "api_key": SERPAPI_KEY,
        "num": num_results,
        "engine": "google",
    }
    resp = requests.get("https://serpapi.com/search", params=params)  #F
    resp.raise_for_status()
    data = resp.json()
    
    candidates = []  #G
    for result in data.get("organic_results", []):
        candidates.append({
            "title": result.get("title", ""),
            "url": result.get("link", ""),
            "snippet": result.get("snippet", ""),
            "position": result.get("position", 0),
        })
    return candidates  #H

# Example usage
candidates = search_product_urls("GORUCK GR1 26L")  #I
for c in candidates[:3]:  # Show first 3 to save space
    print(f"  [{c['position']}] {c['title']}")
    print(f"       {c['url']}")

  [1] GR1       https://www.goruck.com/collections/gr1?srsltid=AfmBOoqgAX5xMipioTMJ2Q2tShBKUmht1LajhCfX0UiEiZNgGS2n6Jkt  [2] GR1       https://www.goruck.com/products/gr1-usa?srsltid=AfmBOoowHdfC_HUysWWLQaYEPMbwPPSxpf-VC1fdajBevE14xn5Y61xK  [3] Goruck Gr1 26 liter worth it? : r/onebag       https://www.reddit.com/r/onebag/comments/1fydqja/goruck_gr1_26_liter_worth_it/

## 11.4.2

In [2]:
import openai  #A
from pydantic import BaseModel  #B

class URLRanking(BaseModel):  #C
    best_url: str
    confidence: str  # "high", "medium", "low"
    reasoning: str

def rank_urls_with_ai(  #D
    search_key: str,
    candidates: list[dict],
    model: str = "gpt-4o",
) -> URLRanking:
    """Use an LLM to pick the best product page from search results."""
    
    candidate_text = ""  #E
    for c in candidates[:3]:  # Limit to save tokens
        candidate_text += (
            f"Position {c['position']}:\n"
            f"  Title: {c['title']}\n"
            f"  URL: {c['url']}\n"
            f"  Snippet: {c['snippet']}\n\n"
        )
    
    system_prompt = """You are a data engineering assistant helping build a product database.
Given a product search key and a list of candidate URLs from search results,
pick the single best URL for extracting structured product data.

Prefer:
1. Manufacturer or official brand pages
2. Pages likely to contain: product name, price, description, weight, images
3. Individual product pages over category or listing pages
4. Stable URLs over session-specific or filtered URLs

Avoid:
- Review sites, forums, Reddit threads
- Retailer pages when a manufacturer page is available
- Category pages that list multiple products

Return the best URL, your confidence level, and a brief explanation."""  #F
    
    user_prompt = (
        f"Product: {search_key}\n\nCandidate URLs:\n{candidate_text}"
    )  #G
    
    response = openai.beta.chat.completions.parse(  #H
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        response_format=URLRanking,  #I
    )
    return response.choices[0].message.parsed  #J

# Example usage
ranking = rank_urls_with_ai("GORUCK GR1 26L", candidates)  #K
print(f"Best URL: {ranking.best_url}")
print(f"Confidence: {ranking.confidence}")
print(f"Reasoning: {ranking.reasoning}")

Best URL: https://www.goruck.com/products/gr1-usa?srsltid=AfmBOoowHdfC_HUysWWLQaYEPMbwPPSxpf-VC1fdajBevE14xn5Y61xKConfidence: HighReasoning: The second URL (https://www.goruck.com/products/gr1-usa) is the best choice because it appears to be a direct link to the individual product page on the official GORUCK website. This page likely contains detailed product information, including the name, description, price, images, and specifications like weight, making it ideal for extracting structured product data. The first URL seems more like a category or collection page, which might list multiple products. The third URL is a Reddit discussion, which is unsuitable for structured product data extraction.

## 11.5.1

In [3]:
from bs4 import BeautifulSoup  #A

REMOVE_TAGS = [  #B
    "script", "style", "nav", "footer", "header",
    "iframe", "noscript", "svg", "form",
]

REMOVE_CLASSES = [  #C
    "breadcrumb", "related-products", "recently-viewed",
    "newsletter", "cookie-banner", "site-footer",
    "site-header", "cart-drawer", "search-modal",
    "review", "reviews", "ratings",
]

def clean_html_aggressive(html: str) -> str:  #D
    """Remove non-product HTML elements to reduce noise and token count."""
    soup = BeautifulSoup(html, "html.parser")
    
    # Remove unwanted tags entirely
    for tag_name in REMOVE_TAGS:  #E
        for element in soup.find_all(tag_name):
            element.decompose()
    
    # Remove elements by class name patterns
    for class_pattern in REMOVE_CLASSES:  #F
        for element in soup.find_all(
            class_=lambda c: c and class_pattern in " ".join(c).lower()
        ):
            element.decompose()
    
    # Remove empty elements
    for element in soup.find_all():  #G
        if not element.get_text(strip=True) and not element.find("img"):
            element.decompose()
    
    clean_text = " ".join(soup.stripped_strings)  #H
    return clean_text

# Example usage
raw_html = requests.get("https://www.goruck.com/products/gr1").text  #I
clean = clean_html_aggressive(raw_html)
print(f"Raw HTML: {len(raw_html):,} characters")
print(f"Cleaned:  {len(clean):,} characters")
print(f"Reduction: {(1 - len(clean) / len(raw_html)) * 100:.0f}%")

Raw HTML: 2,032,625 charactersCleaned:  20,419 charactersReduction: 99%

## 11.5.2

In [4]:
from pydantic import BaseModel  #A

class ContentTriage(BaseModel):  #B
    product_sections: list[int]
    non_product_sections: list[int]

def triage_content(text_blocks: list[str], model: str = "gpt-4o-mini") -> ContentTriage:  #C
    """Use a lightweight LLM to classify text blocks as product or non-product."""
    blocks_text = ""  #D
    for i, block in enumerate(text_blocks[:5]):  # Limit for demo
        blocks_text += f"[Block {i}]: {block[:100]}...\n\n"
    
    system_prompt = """You are a data engineering assistant. Given a list of text blocks
from a product web page, classify each block as either product-relevant or not.

Product-relevant blocks contain: product name, price, description, specifications,
weight, dimensions, materials, features, or sizing information.

Non-product blocks contain: navigation, shipping info, return policies, reviews,
promotional banners, newsletter signups, or generic site content.

Return two lists: product_sections (the block numbers that contain product data)
and non_product_sections (everything else)."""  #E
    
    response = openai.beta.chat.completions.parse(  #F
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": blocks_text},
        ],
        response_format=ContentTriage,
    )
    return response.choices[0].message.parsed  #G

# Example - split cleaned text into blocks
blocks = clean.split('.')[: 10]  # First 10 sentences
triage = triage_content(blocks)
print(f"Product sections: {triage.product_sections}")
print(f"Non-product sections: {triage.non_product_sections}")

Product sections: []Non-product sections: [0, 1, 2, 3, 4]

## 11.6.1

In [5]:
from pydantic import BaseModel, Field  #A
from typing import Optional  #B

class ProductExtraction(BaseModel):  #C
    """Schema for extracting product data from web page content."""
    product_name: str = Field(  #D
        description="Full product name as shown on the page"
    )
    brand_name: str = Field(  #E
        description="Manufacturer or brand name"
    )
    description: Optional[str] = Field(  #F
        default=None,
        description="Product description, typically 1-3 sentences"
    )
    price: Optional[str] = Field(  #G
        default=None,
        description="Current retail price including currency symbol"
    )
    weight: Optional[str] = Field(  #H
        default=None,
        description="Product weight with unit (e.g., '2.5 lbs', '1.1 kg')"
    )
    primary_image_url: Optional[str] = Field(  #I
        default=None,
        description="URL of the main product image"
    )
    category: Optional[str] = Field(  #J
        default=None,
        description="Product category (e.g., backpack, tent, sleeping bag)"
    )

# Schema is now defined and ready to use
print("ProductExtraction schema defined")

ProductExtraction schema defined

## 11.6.2

In [6]:
import openai  #A

EXTRACTION_PROMPT = """You are a product data extraction assistant for a data engineering pipeline.

Given the text content of a product web page, extract the following fields accurately:
- product_name: The full product name as displayed on the page
- brand_name: The manufacturer or brand
- description: A concise product description (1-3 sentences)
- price: The current retail price with currency symbol
- weight: The product weight with unit if available
- primary_image_url: The URL of the main product image if found in the text
- category: The product category (backpack, tent, sleeping bag, headlamp, etc.)

Rules:
- Only extract information that is explicitly present in the text
- Use null for any field you cannot find or confidently determine
- Do not guess or fabricate values
- For price, use the current or sale price, not the original price if both are shown
- For weight, include the unit (lbs, oz, kg, g)
- For category, use a simple label based on what the product is"""  #B

def extract_product_with_ai(  #C
    cleaned_text: str,
    model: str = "gpt-4o",
) -> ProductExtraction:
    """Extract product fields from cleaned page text using an LLM."""
    response = openai.beta.chat.completions.parse(  #D
        model=model,
        messages=[
            {"role": "system", "content": EXTRACTION_PROMPT},
            {"role": "user", "content": cleaned_text[:3000]},  #E - Limit for demo
        ],
        response_format=ProductExtraction,  #F
    )
    return response.choices[0].message.parsed  #G

# Example usage
product = extract_product_with_ai(clean)  #H

print(f"Name:        {product.product_name}")
print(f"Brand:       {product.brand_name}")
print(f"Price:       {product.price}")
print(f"Weight:      {product.weight}")
print(f"Category:    {product.category}")
if product.description:
    print(f"Description: {product.description[:80]}...")

Name:        GR1Brand:       GORUCKPrice:       NoneWeight:      NoneCategory:    backpack

## 11.6.3

In [7]:
import pandas as pd  #A
from bs4 import BeautifulSoup  #B

def extract_manual_goruck(html: str) -> dict:  #C
    """Manual extraction using Chapter 10's CSS selector approach."""
    soup = BeautifulSoup(html, "html.parser")
    
    title_el = soup.find("h1")
    title = title_el.get_text(" ", strip=True) if title_el else None
    
    price_el = (
        soup.select_one(
            "div.product-block__price span.price-item--sale.price-item--last"
        )
        or soup.select_one("div.product-block__price span.price-item--regular")
    )
    price = price_el.get_text(" ", strip=True) if price_el else None
    
    return {
        "product_name": title,
        "brand_name": "GORUCK",  # hardcoded for this site
        "price": price,
        "weight": None,  # manual approach did not extract this
        "category": None,  # manual approach did not extract this
        "description": None,  # manual approach did not extract this
    }

# Run both approaches on the same page
url = "https://www.goruck.com/products/gr1"  #D
raw_html = requests.get(url).text
manual_result = extract_manual_goruck(raw_html)  #E

cleaned = clean_html_aggressive(raw_html)
ai_result = extract_product_with_ai(cleaned)  #F

# Build comparison table
comparison = pd.DataFrame({  #G
    "Field": ["product_name", "brand_name", "price", "weight",
              "category", "description"],
    "Manual": [
        manual_result["product_name"],
        manual_result["brand_name"],
        manual_result["price"],
        manual_result["weight"],
        manual_result["category"],
        manual_result["description"],
    ],
    "AI": [
        ai_result.product_name,
        ai_result.brand_name,
        ai_result.price,
        ai_result.weight,
        ai_result.category,
        ai_result.description[:50] if ai_result.description else None,
    ],
})
print(comparison.to_string(index=False))

       Field            Manual     AIproduct_name GR1 USA - Cordura    GR1  brand_name            GORUCK GORUCK       price           $335.00    NaN      weight               NaN    NaN    category               NaN    NaN description               NaN    NaN

## 11.7.1

In [8]:
import time  #A
import pandas as pd  #B

test_products = [  #D
    {"name": "GORUCK GR1 26L",
     "url": "https://www.goruck.com/products/gr1"},
]

results = []  #E
for product in test_products:  # Demo with 1 product to save time/cost
    record = {  #F
        "search_key": product["name"],
        "url": product["url"],
        "status": "error",
    }
    try:
        raw_html = requests.get(product["url"]).text  #G
        cleaned = clean_html_aggressive(raw_html)
        extraction = extract_product_with_ai(cleaned)  #H
        
        record["product_name"] = extraction.product_name
        record["brand_name"] = extraction.brand_name
        record["price"] = extraction.price
        record["weight"] = extraction.weight
        record["category"] = extraction.category
        record["description"] = (
            extraction.description[:60] + "..."
            if extraction.description
            else None
        )
        record["status"] = "success"  #I
    except Exception as e:
        record["status"] = f"error: {type(e).__name__}"  #J
    
    results.append(record)
    time.sleep(1)  #K

out = pd.DataFrame(results)  #L
print(out[["search_key", "status", "product_name", "price", "weight", "category"]]
      .to_string(index=False))

    search_key  status product_name price weight categoryGORUCK GR1 26L success          GR1  None   None     None

## 11.8

In [9]:
import tiktoken  #A

def estimate_extraction_cost(  #B
    text: str,
    model: str = "gpt-4o",
    output_tokens: int = 300,
) -> dict:
    """Estimate the token count and cost for an extraction call."""
    encoder = tiktoken.encoding_for_model(model)  #C
    input_tokens = len(encoder.encode(text))  #D
    
    # Pricing as of mid-2025 (check docs for current rates)
    pricing = {  #E
        "gpt-4o": {"input": 2.50 / 1_000_000, "output": 10.00 / 1_000_000},
        "gpt-4o-mini": {"input": 0.15 / 1_000_000, "output": 0.60 / 1_000_000},
    }
    
    rates = pricing.get(model, pricing["gpt-4o"])  #F
    input_cost = input_tokens * rates["input"]
    output_cost = output_tokens * rates["output"]
    total_cost = input_cost + output_cost
    
    return {  #G
        "input_tokens": input_tokens,
        "output_tokens_est": output_tokens,
        "input_cost": round(input_cost, 6),
        "output_cost": round(output_cost, 6),
        "total_cost": round(total_cost, 6),
    }

# Estimate for a single product page
cost = estimate_extraction_cost(clean)  #H
print(f"Input tokens:  {cost['input_tokens']:,}")
print(f"Output tokens: {cost['output_tokens_est']:,} (estimated)")
print(f"Cost per page: ${cost['total_cost']:.4f}")
print(f"Cost for 450 products: ${cost['total_cost'] * 450:.2f}")

Input tokens:  5,121Output tokens: 300 (estimated)Cost per page: $0.0158Cost for 450 products: $7.11