In [None]:
# (Optional) If running locally, install deps here
import sys, subprocess
def pip_install(pkgs):
    """Install the given requirement specifiers with pip.

    pip is run as ``<current interpreter> -m pip install ...`` so packages land
    in the environment of the interpreter executing this code.

    Args:
        pkgs: Iterable of requirement strings (e.g. ``"pandas>=2.2"``).

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    command = [sys.executable, "-m", "pip", "install"]
    command.extend(pkgs)
    subprocess.check_call(command)

# Requirement specifiers (with minimum versions) for the third-party packages
# this notebook imports.
# NOTE(review): a later cell calls `client.responses.create`; confirm that the
# openai floor below is recent enough to provide that API.
pkgs = [
    "aiohttp>=3.10",
    "beautifulsoup4>=4.12",
    "lxml>=5.2",
    "pandas>=2.2",
    "python-dotenv>=1.0",
    "openai>=1.51",
]
# Probe the imports first so pip only runs when something fails to import.
try:
    import aiohttp, bs4, lxml, pandas, dotenv, openai
except Exception:
    # Any failure while importing triggers an install of the whole list.
    pip_install(pkgs)
print("Ready ✔")

In [None]:
import os, asyncio, aiohttp, pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from dotenv import load_dotenv
from openai import OpenAI

# Merge values from a local .env file (when one exists) into the environment.
load_dotenv()

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
MODEL = os.environ.get("OPENAI_DEFAULT_MODEL", "gpt-4.1-mini")

# Tell the user up front when no API key is configured.
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
    print("Set OPENAI_API_KEY in .env or environment.")

client = OpenAI(api_key=OPENAI_API_KEY)

# Request headers identifying this client to the sites it fetches. The
# "(+URL)" contact suffix needs its closing parenthesis to be well-formed.
DEFAULT_HEADERS = {"User-Agent": "FirstPageSummarizer/1.0 (+https://edwarddonner.com)"}

In [None]:
# URLs to process, listed directly in the notebook.
URLS_INLINE = ["https://edwarddonner.com"]

# Optional path to a file with more URLs, e.g., "urls.txt" or "urls.csv".
URLS_FILE = None

In [None]:
import csv
def load_urls(urls_inline, urls_file):
    """Collect URLs from an optional file plus an inline list, without duplicates.

    Args:
        urls_inline: Iterable of URL strings; falsy entries are skipped and
            ``None`` is treated as an empty list.
        urls_file: Optional path to a URL file. A ``.csv`` file (extension
            matched case-insensitively) is read with pandas and must contain a
            ``url`` column; any other file is read as one URL per line.

    Returns:
        A list of whitespace-stripped, unique URLs in first-seen order, with
        entries from the file ahead of the inline ones.
    """
    candidates = []
    if urls_file:
        if not os.path.exists(urls_file):
            # Say so instead of silently returning fewer URLs than expected.
            print(f"URL file not found, skipping: {urls_file}")
        elif str(urls_file).lower().endswith(".csv"):
            df = pd.read_csv(urls_file)
            if "url" in df.columns:
                candidates.extend(df["url"].dropna().tolist())
            else:
                print(f"No 'url' column in {urls_file}; skipping file.")
        else:
            with open(urls_file, "r", encoding="utf-8") as f:
                candidates.extend(f)
    candidates.extend(urls_inline or [])

    # Strip before de-duplicating so " https://x " and "https://x" collapse.
    stripped = (str(u).strip() for u in candidates if u)
    # dict.fromkeys keeps the first occurrence of each key in insertion order.
    return list(dict.fromkeys(u for u in stripped if u))

# Build the working URL list from the inline list and the optional file.
URLS = load_urls(urls_inline=URLS_INLINE, urls_file=URLS_FILE)
print(f"Loaded {len(URLS)} URLs")

In [None]:
from aiohttp import ClientTimeout
from bs4 import BeautifulSoup
try:
    from bs4 import FeatureNotFound
except Exception:
    # Stand-in so `except FeatureNotFound` clauses still resolve when the
    # name cannot be imported from bs4.
    class FeatureNotFound(Exception):
        pass

# Headers attached to every outgoing request; the User-Agent names this client.
DEFAULT_HEADERS = {
    "User-Agent": "FirstPageSummarizer/1.0 (+https://edwarddonner.com)",
}

async def fetch_one(session, url):
    """Fetch a single URL and return a record describing the response.

    The body is read only when the Content-Type header mentions "html";
    otherwise ``html`` is left empty. Any exception is turned into a record
    with ``status`` set to None and the message under ``error`` rather than
    being raised.

    Args:
        session: Object exposing an aiohttp-style ``get`` (used with
            ``async with``).
        url: Address to request.

    Returns:
        dict with ``url`` (final URL after redirects), ``status``,
        ``content_type`` and ``html``; plus ``error`` on failure.
    """
    try:
        request = session.get(
            url,
            timeout=ClientTimeout(total=20),
            headers=DEFAULT_HEADERS,
            allow_redirects=True,
        )
        async with request as response:
            content_type = response.headers.get("Content-Type", "") or ""
            if "html" in content_type.lower():
                # Drop undecodable bytes instead of failing on a bad charset.
                body = await response.text(errors="ignore")
            else:
                body = ""
            return {
                "url": str(response.url),
                "status": response.status,
                "content_type": content_type,
                "html": body,
            }
    except Exception as exc:
        failure = {"url": url, "status": None, "content_type": "", "html": ""}
        failure["error"] = str(exc)
        return failure

def make_soup(html: str) -> BeautifulSoup:
    """Parse ``html`` with the lxml parser, or with ``html.parser`` if lxml is unavailable."""
    try:
        soup = BeautifulSoup(html, "lxml")
    except FeatureNotFound:
        # bs4 could not find the lxml tree builder; use Python's bundled parser.
        soup = BeautifulSoup(html, "html.parser")
    return soup

def extract_fields(url, html, max_chars=8000):
    """Extract the title, meta description, and visible text from HTML.

    Args:
        url: Address the HTML came from. Not used by the extraction itself;
            kept so existing callers keep working.
        html: Raw HTML markup.
        max_chars: Maximum length of the returned text, to limit token usage
            downstream. ``None`` returns the full text.

    Returns:
        Tuple ``(title, meta_description, text)``. Each element is a string and
        is empty when the corresponding content is absent.
    """
    soup = make_soup(html)

    title_tag = soup.title
    title = title_tag.string.strip() if title_tag and title_tag.string else ""

    def _is_description(value):
        # Attribute values are matched exactly by default; compare
        # case-insensitively so <meta name="Description"> is found as well.
        return isinstance(value, str) and value.strip().lower() == "description"

    meta_desc = ""
    meta_tag = soup.find("meta", attrs={"name": _is_description})
    if meta_tag and meta_tag.get("content"):
        meta_desc = meta_tag["content"].strip()

    # Remove non-visible content before collecting the page text.
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()

    text = soup.get_text(" ", strip=True)
    if max_chars is not None:
        text = text[:max_chars]
    return title, meta_desc, text

async def fetch_all(urls, concurrency=5):
    """Fetch and extract fields for a list of URLs (first page only).

    Requests run concurrently (at most ``concurrency`` in flight) instead of
    one after another, so total time is no longer the sum of every page's
    latency. Results are returned in the same order as ``urls``.

    Args:
        urls: Iterable of URLs to fetch.
        concurrency: Maximum number of simultaneous requests (minimum 1).

    Returns:
        List of records from ``fetch_one``; records that contain HTML also get
        ``title``, ``meta_description`` and ``text`` keys.
    """
    import asyncio
    import aiohttp

    limiter = asyncio.Semaphore(max(1, int(concurrency)))

    async def _fetch_and_extract(session, url):
        # Only the network call is throttled; parsing happens afterwards.
        async with limiter:
            resp = await fetch_one(session, url)
        if resp.get("html"):
            title, meta_desc, text = extract_fields(resp["url"], resp["html"])
            resp.update({"title": title, "meta_description": meta_desc, "text": text})
        return resp

    async with aiohttp.ClientSession() as session:
        # gather() returns results in the order the awaitables were passed in.
        gathered = await asyncio.gather(
            *(_fetch_and_extract(session, u) for u in urls)
        )
    return list(gathered)

# Example usage in notebook (if URLS is defined):
# results = await fetch_all(URLS)
# len(results), results[:1]


In [None]:
# Replace the URL list with two sample sites for this run.
URLS = ["https://edwarddonner.com", "https://www.wikipedia.org/"]
# Top-level `await` is valid in a notebook cell; a plain script would need asyncio.run().
results = await fetch_all(URLS)
# Display the number of records and the first record.
len(results), results[:1]


In [None]:
import os, json
from dotenv import load_dotenv
from openai import OpenAI

# Reload .env, then set the model name and create the API client.
load_dotenv()
MODEL = os.environ.get("OPENAI_DEFAULT_MODEL", "gpt-4.1-mini")
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# System message for the summarizer; lists the JSON fields the model is asked to return.
SYSTEM_PROMPT = """
You summarize a web page for migration planning. 
Return JSON with:
- title: short page title
- meta_description: concise (<= 160 chars)
- summary: 3-5 bullet points as a single string
- category: one of [blog, docs, product, pricing, careers, marketing, legal, support, account, other]
- key_entities: array of 3-8 important entities/keywords
"""

def summarize_page(row):
    """Ask the model for a structured JSON summary of one fetched page.

    Args:
        row: Mapping with a ``url`` key and optional ``title``,
            ``meta_description`` and ``text`` keys. Missing or ``None`` values
            are sent as empty strings; ``text`` is cut to 6000 characters.

    Returns:
        The model's reply parsed from JSON.

    Raises:
        KeyError: if ``row`` has no ``url`` key.
        json.JSONDecodeError: if the reply is not valid JSON.
    """
    title = row.get("title") or ""
    meta_description = row.get("meta_description") or ""
    text = (row.get("text") or "")[:6000]
    user = (
        f"URL: {row['url']}\n"
        f"<title>{title}</title>\n"
        f"<meta_description>{meta_description}</meta_description>\n"
        f"<text>\n{text}\n</text>"
    )
    resp = client.responses.create(
        model=MODEL,
        input=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user},
        ],
        # The Responses API takes the output format under `text.format`;
        # `response_format` is a Chat Completions parameter and is rejected here.
        text={"format": {"type": "json_object"}},
    )
    # `output_text` aggregates the text of the reply, so this does not depend
    # on which position the message item has in `resp.output`.
    return json.loads(resp.output_text)


In [None]:
# Summarize every page that was fetched successfully as HTML; all other rows
# are kept and annotated with the reason they were skipped.
enriched = []
for r in results:
    status = r.get("status")
    is_html = "html" in (r.get("content_type") or "").lower()
    if status and 200 <= status < 400 and is_html:
        try:
            data = summarize_page(r)
            enriched.append({**r, **data})
        except Exception as e:
            # Keep the row so one failed summary does not abort the whole run.
            enriched.append({**r, "error": str(e)})
    else:
        # Keep the specific fetch error (e.g. a timeout) when one was recorded
        # instead of overwriting it with the generic message.
        enriched.append({**r, "error": r.get("error") or "Non-HTML or bad status"})


In [None]:
import pandas as pd

# Tabulate the enriched rows, write them to firstpage_summary.csv (without the
# index column), and display the first rows.
df = pd.DataFrame(enriched)
df.to_csv("firstpage_summary.csv", index=False)
df.head()
