# NYTBee Scraper

This notebook fetches the NYTBee pages for the last 30 days, extracts the list items from each page's main answer list, and collects the distinct words found.


In [None]:
from __future__ import annotations

from datetime import date, timedelta

import argparse
from html.parser import HTMLParser
from typing import Optional
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

# Address of a single NYTBee daily page.
# NOTE(review): not referenced by any code visible in this notebook — confirm whether it is still needed.
DEFAULT_URL = "https://nytbee.com/Bee_20260130.html"
# Value sent as the User-Agent request header by fetch_html.
USER_AGENT = "Mozilla/5.0 (compatible; NYTBeeScraper/1.0)"


In [None]:
class MainAnswerListParser(HTMLParser):
    """Collect the text of list items inside ``<div id="main-answer-list">``.

    Feed markup with ``feed()``; the stripped, non-empty text of every ``<li>``
    that appears within a ``<ul>`` of the target ``<div>`` is then available
    from ``items``, in the order the items were closed. Text inside
    ``<script>`` and ``<style>`` elements is ignored.

    An item whose ``</li>`` end tag is omitted is closed implicitly by the next
    sibling ``<li>``, by the end of its enclosing list, or by the end of the
    target ``<div>``. All list state is reset when the target ``<div>`` closes,
    so text following the container is never attributed to an item that was
    left open inside it.
    """

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        # Number of <div> elements currently open anywhere in the document.
        self._div_depth = 0
        # Value of _div_depth at which the target <div> opened; None while outside it.
        self._target_div_depth: Optional[int] = None
        # Number of <ul> / <ol> elements currently open inside the target <div>.
        self._ul_depth = 0
        self._ol_depth = 0
        # One entry per open <li>: (list nesting depth when it opened, text fragments so far).
        self._li_stack: list[tuple[int, list[str]]] = []
        self._items: list[str] = []
        # Number of <script>/<style> elements currently open; their text is dropped.
        self._skip_depth = 0

    @property
    def items(self) -> list[str]:
        """Text of the list items collected so far."""
        return self._items

    def _list_depth(self) -> int:
        """Return how many lists (``<ul>`` or ``<ol>``) are open inside the target."""
        return self._ul_depth + self._ol_depth

    def _emit_innermost_item(self) -> None:
        """Close the most recently opened item and record its text if non-empty."""
        _, fragments = self._li_stack.pop()
        text = "".join(fragments).strip()
        if text:
            self._items.append(text)

    def _close_items_from(self, depth: int) -> None:
        """Close every open item that was opened at list nesting ``depth`` or deeper."""
        while self._li_stack and self._li_stack[-1][0] >= depth:
            self._emit_innermost_item()

    def handle_starttag(self, tag: str, attrs: list[tuple[str, Optional[str]]]) -> None:
        """Track element nesting and open a new item for each ``<li>`` in a target ``<ul>``."""
        if tag in {"script", "style"}:
            self._skip_depth += 1
            return
        if tag == "div":
            self._div_depth += 1
            if self._target_div_depth is None and dict(attrs).get("id") == "main-answer-list":
                self._target_div_depth = self._div_depth
            return
        # Everything below only applies while inside the target <div>.
        if self._target_div_depth is None or self._div_depth < self._target_div_depth:
            return
        if tag == "ul":
            self._ul_depth += 1
            return
        if tag == "ol":
            # Counted only so that items of a nested <ol> are recognised as
            # children, not siblings, of the enclosing item.
            self._ol_depth += 1
            return
        if tag == "li" and self._ul_depth:
            depth = self._list_depth()
            # A sibling <li> implicitly ends an item whose </li> was omitted.
            self._close_items_from(depth)
            self._li_stack.append((depth, []))

    def handle_endtag(self, tag: str) -> None:
        """Close items, lists and the target ``<div>`` as their end tags arrive."""
        if tag in {"script", "style"} and self._skip_depth:
            self._skip_depth -= 1
            return
        if tag == "li" and self._li_stack:
            self._emit_innermost_item()
            return
        if tag == "ul" and self._ul_depth:
            # The end of a list closes any of its items that are still open.
            self._close_items_from(self._list_depth())
            self._ul_depth -= 1
            return
        if tag == "ol" and self._ol_depth:
            self._close_items_from(self._list_depth())
            self._ol_depth -= 1
            return
        if tag == "div":
            if self._target_div_depth is not None and self._div_depth == self._target_div_depth:
                # Leaving the container: flush open items and reset list state so
                # later markup cannot leak into, or be triggered by, stale state.
                self._close_items_from(0)
                self._ul_depth = 0
                self._ol_depth = 0
                self._target_div_depth = None
            if self._div_depth:
                self._div_depth -= 1

    def handle_data(self, data: str) -> None:
        """Append text to the innermost open item, unless inside ``<script>``/``<style>``."""
        if self._skip_depth or not self._li_stack:
            return
        self._li_stack[-1][1].append(data)


In [None]:
def fetch_html(url: str, timeout: int = 20) -> str:
    """Download ``url`` and return the response body decoded to text.

    The request is sent with the module-level ``USER_AGENT`` as its User-Agent
    header. The body is decoded with the charset declared in the response
    headers, or UTF-8 when none is declared; undecodable bytes are replaced
    instead of raising.

    Args:
        url: Address of the page to download.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The decoded response body.

    Raises:
        Exceptions raised by ``urlopen`` or by reading the response are
        propagated unchanged.
    """
    request = Request(url, headers={"User-Agent": USER_AGENT})
    with urlopen(request, timeout=timeout) as response:
        charset = response.headers.get_content_charset() or "utf-8"
        body = response.read()
    try:
        return body.decode(charset, errors="replace")
    except LookupError:
        # The declared charset names a codec Python does not know; fall back to
        # UTF-8 rather than failing the whole fetch over a bad header.
        return body.decode("utf-8", errors="replace")


def extract_answer_list(html: str) -> list[str]:
    """Run ``html`` through a fresh ``MainAnswerListParser`` and return its items."""
    answer_parser = MainAnswerListParser()
    answer_parser.feed(html)
    answers = answer_parser.items
    return answers


## Fetch and extract answers

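Run the cell to fetch the NYTBee page for each of the 30 days ending on `starting_date` (today by default) and print the distinct answers found. Page URLs are built from `base_url`; pages that cannot be fetched or yield no answers are listed at the end.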


In [None]:
# URL template for a daily page; {date} is filled with the day formatted as YYYYMMDD.
base_url = "https://nytbee.com/Bee_{date}.html"

# Number of consecutive days to scrape, counting backwards from starting_date.
days_to_fetch = 30

starting_date = date.today()
distinct_words: set[str] = set()
# (url, reason) for every day that was skipped; reason is the exception or a message.
failed_urls: list[tuple[str, object]] = []

for offset in range(days_to_fetch):
    target_date = starting_date - timedelta(days=offset)
    url = base_url.format(date=target_date.strftime("%Y%m%d"))

    try:
        html = fetch_html(url, timeout=20)
    except OSError as exc:
        # HTTPError and URLError are OSError subclasses. Catching OSError also
        # covers socket timeouts and connection resets raised while the body is
        # being read, so one bad day does not abort the whole run.
        failed_urls.append((url, exc))
        continue

    items = extract_answer_list(html)
    if not items:
        failed_urls.append((url, "No answers extracted"))
        continue

    for item in items:
        word = item.strip()
        if word:
            distinct_words.add(word)

# Each attempted day either succeeded or added exactly one entry to failed_urls.
print(f"Collected {len(distinct_words)} distinct words from {days_to_fetch - len(failed_urls)} days.")
for word in sorted(distinct_words):
    print(word)

if failed_urls:
    print("\nSkipped the following URLs:")
    for url, reason in failed_urls:
        print(f"- {url} ({reason})")
