# Chapter 10 Guide

## 10.1

In [1]:
import pandas as pd  #A

# Read the product catalog from the Excel workbook into a DataFrame
df = pd.read_excel('rucking_camping_products.xlsx')  #B

# Report the row count and the column names as loaded (before any columns are added)
print(f"Loaded {len(df)} products", f"Columns: {list(df.columns)}", sep="\n")

# Build one "<brand> <product name>" search key per row
df['search_key'] = df['Brand Name'] + ' ' + df['Product Name']  #C

# Show the first five search keys as a quick sanity check
print("\nSample products:")
for search_key in df['search_key'].head(5):  #D
    print(f"  {search_key}")

  from pandas.core import (


Loaded 445 products
Columns: ['Brand Name', 'Product Name']

Sample products:
  GORUCK GR1 26L
  GORUCK GR2 34L
  GORUCK GR2 40L
  GORUCK Rucker 4.0 20L
  GORUCK Rucker 4.0 25L


## 10.2

In [2]:
import requests  #A

url = "https://www.goruck.com/products/gr1"  #B
# Always pass a timeout: without one, requests waits indefinitely on an
# unresponsive server and the cell never finishes. 20 seconds matches the
# timeout used by get_html() later in this notebook.
response = requests.get(url, timeout=20)  #C

# Only keep the body when the server answered 200 OK; otherwise report the status code
if response.status_code == 200:  #D
    html_content = response.text
    print(f"Retrieved {len(html_content)} characters")
else:
    print(f"Failed: {response.status_code}")  #E


Retrieved 2032429 characters


## 10.3

In [3]:
from bs4 import BeautifulSoup  #A

# Parse the downloaded page with Python's built-in HTML parser
soup = BeautifulSoup(html_content, 'html.parser')  #B

# Drop script, style, nav, and footer elements (with everything inside them)
for element in soup.find_all(['script', 'style', 'nav', 'footer']):  #C
    element.decompose()

# Join the remaining whitespace-stripped text fragments with single spaces
clean_text = ' '.join(soup.stripped_strings)  #D

# Report the cleaned length, then preview the first 500 characters
print(f"Cleaned text: {len(clean_text)} characters")
print(f"\nFirst 500 characters:\n{clean_text[:500]}")


Cleaned text: 20589 characters

First 500 characters:
GR1 | GORUCK Skip to content FREE RUCK PLATES WITH BASIC RUCKER FREE CURVED PLATES WITH RUCKING WEIGHT VEST FREE USA Shipping for GORUCK Tribe Members Menu Search Search Account View my cart (0) View my cart (0) 0 Search Search Cart Your cart is empty Loading... Product image 1, can be opened in a modal. Product image 2, can be opened in a modal. Product image 3, can be opened in a modal. Product image 4, can be opened in a modal. Product image 5, can be opened in a modal. Product image 6, can b


## 10.4

In [4]:
from bs4 import BeautifulSoup

# Re-parse the raw HTML so this cell works on a fresh, unmodified tree
soup = BeautifulSoup(html_content, "html.parser")

# The product title is taken from the first <h1> on the page, if there is one
title_el = soup.find("h1")  #A
title = None
if title_el is not None:
    title = title_el.get_text(strip=True)

# Locate the price container used on this product page
price_block = soup.select_one("div.product-block__price")  #B

price = None
if price_block is not None:
    # Try the sale price first, then the regular price; the first selector that matches wins
    for selector in ("span.price-item--sale", "span.price-item--regular"):  #C
        price_el = price_block.select_one(selector)
        if price_el is not None:
            price = price_el.get_text(" ", strip=True)
            break

print(f"Title: {title}")
print(f"Price: {price}")

Title: GR1 USA - Cordura
Price: $335.00


In [6]:
import json
import re
from pathlib import Path

import requests
from bs4 import BeautifulSoup

# Browser-like session headers to reduce 403 blocks
session = requests.Session()  #A
# NOTE: "Accept-Encoding" is deliberately NOT overridden here. requests already
# sends an Accept-Encoding header listing only the encodings it can decode.
# Hard-coding "gzip, deflate, br" advertises Brotli even when no Brotli decoder
# is installed, in which case the server may reply with a br-compressed body
# that requests cannot decompress and response.text comes back as garbage.
session.headers.update({      #B
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/121.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Referer": "https://www.google.com/",
})

# Each entry is (display name, product page URL, local fixture path used as an offline fallback)
products = [
    ("GORUCK GR1 26L", "https://www.goruck.com/products/gr1", Path("fixtures/gr1.html")),
    ("Osprey Atmos AG 65", "https://www.osprey.com/atmos-ag-65-atmos65s22-415?size=S%2FM&color=Mythical+Green", Path("fixtures/osprey_atmos.html")),
]

def get_html(url: str, fixture: Path) -> str:
    """Download a page, caching it to disk and falling back to the cache on failure.

    The page is requested through the module-level ``session`` with a
    20-second timeout, following redirects. A 403 response triggers exactly
    one retry that adds ``Sec-Fetch-*`` headers on top of the session headers.

    Args:
        url: Address of the page to download.
        fixture: File the HTML is saved to after a successful download, and
            read back from when the download fails.

    Returns:
        The page HTML, either freshly downloaded or loaded from ``fixture``.

    Raises:
        Exception: Whatever error interrupted the download or the save is
            re-raised unchanged when ``fixture`` does not exist.
    """
    request_options = {"timeout": 20, "allow_redirects": True}
    sec_fetch_headers = {
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Dest": "document",
    }

    try:
        response = session.get(url, **request_options)

        # Blocked: try once more looking slightly more like a browser navigation
        if response.status_code == 403:
            response = session.get(url, headers=sec_fetch_headers, **request_options)

        # Any remaining 4xx/5xx status becomes an exception handled below
        response.raise_for_status()

        # Save the page so later runs can work offline
        fixture.parent.mkdir(parents=True, exist_ok=True)
        page_html = response.text
        fixture.write_text(page_html, encoding="utf-8")
        return page_html

    except Exception:
        # No saved copy to fall back on: surface the original error
        if not fixture.exists():
            raise
        print(f"  Loading fixture: {fixture}")
        return fixture.read_text(encoding="utf-8")

def _jsonld_product_price(data):
    """Return the first offer price of a Product node in parsed JSON-LD, or None.

    Accepts a single node or a list of nodes. A node counts as a product when
    its ``@type`` is "Product" (case-insensitive), given either as a string or
    as one entry of a list. ``offers`` may be a single offer dict or a list of
    offer dicts; the first one carrying a non-None ``price`` wins. Nodes nested
    under an ``@graph`` list are searched as well.
    """
    nodes = data if isinstance(data, list) else [data]
    for node in nodes:
        if not isinstance(node, dict):
            continue

        types = node.get("@type", "")
        if not isinstance(types, list):
            types = [types]
        if any(str(t).lower() == "product" for t in types):
            offers = node.get("offers", {})
            offer_list = offers if isinstance(offers, list) else [offers]
            for offer in offer_list:
                if isinstance(offer, dict) and offer.get("price") is not None:
                    return offer["price"]

        # Some pages wrap all of their nodes in a top-level "@graph" array
        graph = node.get("@graph")
        if isinstance(graph, list):
            nested = _jsonld_product_price(graph)
            if nested is not None:
                return nested
    return None


def extract_title_price(name: str, html: str) -> dict:
    """Extract the product title and price from a product page.

    The title is the text of the first ``<h1>``. The price strategy depends on
    ``name``: names containing "GORUCK" use a site-specific CSS selector; all
    other pages use the JSON-LD product data when present, then fall back to
    the first dollar amount found in the visible text.

    Args:
        name: Product display name; only checked for the substring "GORUCK".
        html: Raw HTML of the product page.

    Returns:
        A dict with keys ``"title"`` and ``"price"``; either value is None
        when it could not be found.
    """
    soup = BeautifulSoup(html, "html.parser")

    title_el = soup.find("h1")
    title = title_el.get_text(" ", strip=True) if title_el else None

    # GORUCK selector
    if "GORUCK" in name:
        price_el = soup.select_one("div.product-block__price span.price-item--regular")
        price = price_el.get_text(" ", strip=True) if price_el else None
        return {"title": title, "price": price}

    # Osprey: prefer JSON-LD product schema when available
    price = None
    for s in soup.find_all("script", attrs={"type": "application/ld+json"}):
        try:
            data = json.loads(s.get_text(strip=True))
        except ValueError:
            # Malformed JSON-LD block (json.JSONDecodeError is a ValueError): skip it
            continue

        raw_price = _jsonld_product_price(data)
        if raw_price is not None:
            price = f"${raw_price}"
            break

    # Fallback: regex the visible text for a dollar price
    if price is None:
        text = soup.get_text(" ", strip=True)
        m = re.search(r"\$([\d,]+(?:\.\d{2})?)", text)
        price = f"${m.group(1)}" if m else None

    return {"title": title, "price": price}

# Fetch each product page (or its saved fixture) and print the extracted title and price
for name, url, fixture in products:
    print(f"\nFetching: {name}")
    html = get_html(url, fixture)
    data = extract_title_price(name, html)
    print(f"  Title: {data['title']}", f"  Price: {data['price']}", sep="\n")


Fetching: GORUCK GR1 26L
  Title: GR1 USA - Cordura
  Price: $335.00

Fetching: Osprey Atmos AG 65
  Title: Atmos AG 65
  Price: $370
