# NYTBee Scraper

This notebook fetches an NYTBee page and extracts the visible text from its `<main>` element, falling back to `<body>` when `<main>` yields no text.


In [None]:
from __future__ import annotations

import argparse
from html.parser import HTMLParser
from typing import Optional
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

# Page that is scraped when `url` is left at its default value.
DEFAULT_URL = "https://nytbee.com/Bee_20260130.html"
# Value sent as the User-Agent request header by fetch_html.
USER_AGENT = "Mozilla/5.0 (compatible; NYTBeeScraper/1.0)"


In [None]:
class TagTextExtractor(HTMLParser):
    """Collect the visible text found inside every occurrence of one tag.

    Text is recorded only while the parser is inside at least one open
    ``tag`` element and outside any ``<script>`` or ``<style>`` element.

    Args:
        tag: Name of the element to extract text from. Matching is
            case-insensitive.
    """

    # Elements whose contents are code or styling rather than visible text.
    _SKIPPED_TAGS = frozenset({"script", "style"})

    def __init__(self, tag: str) -> None:
        super().__init__(convert_charrefs=True)
        # HTMLParser lower-cases tag names before calling the handlers, so a
        # mixed-case name such as "MAIN" would otherwise never match.
        self.tag = tag.lower()
        # Number of target elements currently open (handles nesting).
        self.depth = 0
        self._texts: list[str] = []
        # Number of script/style elements currently open.
        self._skip_depth = 0

    @property
    def text(self) -> str:
        """Return the collected text with blank lines and padding removed."""
        stripped = (line.strip() for line in "\n".join(self._texts).splitlines())
        return "\n".join(line for line in stripped if line)

    def handle_starttag(self, tag: str, attrs: list[tuple[str, Optional[str]]]) -> None:
        """Track entry into a skipped element or into the target element."""
        if tag in self._SKIPPED_TAGS:
            self._skip_depth += 1
            return
        if tag == self.tag:
            self.depth += 1

    def handle_endtag(self, tag: str) -> None:
        """Track exit from a skipped element or from the target element."""
        # The truthiness guards keep the counters from going negative when
        # the document contains a stray closing tag.
        if tag in self._SKIPPED_TAGS and self._skip_depth:
            self._skip_depth -= 1
            return
        if tag == self.tag and self.depth:
            self.depth -= 1

    def handle_data(self, data: str) -> None:
        """Record text that is inside the target and not inside script/style."""
        if self.depth and not self._skip_depth:
            self._texts.append(data)


In [None]:
def fetch_html(url: str, timeout: int = 20) -> str:
    """Download ``url`` and return the response body decoded to text.

    Args:
        url: Address of the page to fetch.
        timeout: Timeout, in seconds, passed to ``urlopen``.

    Returns:
        The response body as a string. Bytes that cannot be decoded are
        replaced rather than raising.

    Raises:
        Errors raised by ``urlopen`` (for example ``HTTPError`` or
        ``URLError``) propagate to the caller.
    """
    request = Request(url, headers={"User-Agent": USER_AGENT})
    with urlopen(request, timeout=timeout) as response:
        # Prefer the charset declared in Content-Type; default to UTF-8.
        charset = response.headers.get_content_charset() or "utf-8"
        body = response.read()
    try:
        return body.decode(charset, errors="replace")
    except LookupError:
        # The server declared a charset Python has no codec for. Keep the
        # best-effort behaviour instead of failing on a bad header.
        return body.decode("utf-8", errors="replace")


def extract_content(html: str) -> str:
    """Return the visible text of the page's main content.

    The ``<main>`` element is tried first; if it yields no text, the whole
    ``<body>`` is used instead.

    Args:
        html: The page markup.

    Returns:
        The extracted text, or an empty string when neither element
        produced any.
    """
    for tag in ("main", "body"):
        extractor = TagTextExtractor(tag)
        extractor.feed(html)
        # feed() can hold back trailing text (for example text ending in a
        # bare "&") while it waits for more input; close() flushes it.
        extractor.close()
        content = extractor.text
        if content:
            return content
    return ""


## Fetch and extract content

Set `url` to the NYTBee page you want to scrape, then run the cell.


In [None]:
# Page to scrape; change this before running the cell to target another page.
url = DEFAULT_URL

try:
    html = fetch_html(url, timeout=20)
except URLError as exc:
    # HTTPError is a subclass of URLError, so a single handler sees both;
    # the isinstance check keeps the two messages distinct.
    if isinstance(exc, HTTPError):
        raise SystemExit(f"HTTP error fetching {url}: {exc}")
    raise SystemExit(f"URL error fetching {url}: {exc}")

content = extract_content(html)
if content:
    print(content)
else:
    raise SystemExit("No content extracted from the page.")
