# NYTBee Scraper

This notebook fetches one or more daily NYTBee pages and collects the words from each page's main answer list into a single word list.


In [None]:
%pip install git+https://github.com/fptprdqs66-dot/nytbee_scrapper.git


In [None]:
from datetime import date, timedelta

from tqdm import tqdm
from urllib.error import HTTPError, URLError

from nytbee_scrapper import BASE_URL, extract_answer_list, fetch_html, normalize_answer


## Fetch and extract answers

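Run the cell below and answer the two prompts: a start date (leave blank for today) and the number of days to collect. The notebook then walks backwards one day at a time from the start date, skips any URL already listed in `nytbee_scrape.log`, and writes the collected words to `nytbee_dict.txt`.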


In [None]:
# --- Configuration ---------------------------------------------------------
# URL template for one day's page; it is formatted with a ``date`` field later.
base_url = BASE_URL
# Append-only log of successfully scraped URLs, one per line.
log_path = "nytbee_scrape.log"
# Output word list, one word per line.
dict_path = "nytbee_dict.txt"

# --- Start date --------------------------------------------------------------
# The scrape walks backwards from this date. Re-prompt on malformed input
# instead of aborting the cell with a traceback, mirroring the day-count prompt.
while True:
    start_input = input("Enter start date (YYYY-MM-DD) [default: today]: ").strip()
    if not start_input:
        starting_date = date.today()
        break
    try:
        starting_date = date.fromisoformat(start_input)
    except ValueError:
        print("Please enter the date as YYYY-MM-DD, or leave it blank for today.")
        continue
    break

# --- Number of days ----------------------------------------------------------
# Zero is accepted (nothing is fetched); negative or non-numeric input re-prompts.
while True:
    days_input = input("Enter number of days to collect: ").strip()
    try:
        days_to_collect = int(days_input)
        if days_to_collect < 0:
            raise ValueError("Number of days must be non-negative")
    except ValueError:
        print("Please enter a non-negative integer for the number of days.")
        continue
    break

# Maps each word to a count; seeded from an existing dictionary file if present.
word_counts: dict[str, int] = {}
# (url, reason) pairs for pages that could not be scraped during this run.
failed_urls: list[tuple[str, object]] = []

# Load a previously saved dictionary. Two on-disk formats are accepted:
#   * a Python dict literal ``{"word": count, ...}``
#   * a plain word list, one word per line (every word gets a count of 1)
try:
    with open(dict_path, "r", encoding="utf-8") as file_handle:
        contents = file_handle.read().strip()
    if contents:
        if contents.startswith("{"):
            from ast import literal_eval

            loaded = literal_eval(contents)
            if isinstance(loaded, dict):
                word_counts = {str(key): int(value) for key, value in loaded.items()}
                print(f"Loaded {len(word_counts)} words from {dict_path}.")
            else:
                # e.g. a set literal such as {"a", "b"}: say so rather than
                # silently starting from an empty dictionary.
                print(
                    f"Warning: Could not parse {dict_path}: "
                    f"expected a dict literal, got {type(loaded).__name__}"
                )
        else:
            words = [line.strip().lower() for line in contents.splitlines()]
            word_counts = {word: 1 for word in words if word}
            if word_counts:
                print(f"Loaded {len(word_counts)} words from {dict_path}.")
except FileNotFoundError:
    # First run: there is no dictionary yet.
    pass
except (ValueError, SyntaxError, TypeError) as exc:
    # TypeError covers unhashable keys inside literal_eval and non-numeric
    # counts such as None passed to int().
    print(f"Warning: Could not parse {dict_path}: {exc}")

# URLs recorded by earlier runs; the collection loop skips anything in this set.
scraped_urls: set[str] = set()
try:
    with open(log_path, "r", encoding="utf-8") as log_file:
        previously_logged: set[str] = set()
        for raw_line in log_file:
            logged_url = raw_line.strip()
            if logged_url:  # ignore blank lines
                previously_logged.add(logged_url)
        scraped_urls = previously_logged
except FileNotFoundError:
    # No log yet: nothing has been scraped before.
    pass
else:
    if scraped_urls:
        print(f"Loaded {len(scraped_urls)} scraped URLs from {log_path}.")

# Pages actually scraped in this run. Skipped (already logged) and failed days
# are excluded, so the summary below reports real work done.
collected_days = 0

try:
    with open(log_path, "a", encoding="utf-8") as log_handle:
        with tqdm(range(days_to_collect), desc="Collecting", unit="day") as progress:
            for offset in progress:
                # Walk backwards in time, one day per iteration.
                target_date = starting_date - timedelta(days=offset)
                url = base_url.format(date=target_date.strftime("%Y%m%d"))

                if url in scraped_urls:
                    progress.set_postfix({"status": "skipped", "url": url})
                    continue

                progress.set_postfix({"status": "collecting", "url": url})
                try:
                    html = fetch_html(url, timeout=20)
                except (HTTPError, URLError, TimeoutError) as exc:
                    # NOTE(review): assumes fetch_html is urllib-based, where a
                    # read timeout can surface as a bare TimeoutError rather
                    # than URLError - confirm against the library.
                    failed_urls.append((url, exc))
                    progress.set_postfix({"status": f"failed: {exc}", "url": url})
                    continue

                items = extract_answer_list(html)
                if not items:
                    failed_urls.append((url, "No answers extracted"))
                    progress.set_postfix({"status": "failed: no answers", "url": url})
                    continue

                for item in items:
                    word = normalize_answer(item)
                    if word:
                        word_counts[word] = word_counts.get(word, 0) + 1

                # Record the URL immediately so an interrupted run does not
                # re-fetch pages that were already processed.
                scraped_urls.add(url)
                log_handle.write(f"{url}\n")
                log_handle.flush()
                collected_days += 1
                progress.set_postfix({"status": f"collected {len(items)}", "url": url})
finally:
    # Always persist the dictionary, even on Ctrl-C or an unexpected error.
    # URLs are flushed to the log per page, so skipping this write would lose
    # the words of pages that the next run will refuse to fetch again.
    with open(dict_path, "w", encoding="utf-8") as file_handle:
        for word in sorted(word_counts):
            file_handle.write(f"{word}\n")

print(f"Collected {len(word_counts)} distinct words from {collected_days} days.")

# Preview the alphabetically first entries of the dictionary.
preview_words = sorted(word_counts)[:100]
print("First 100 words in the dictionary:")
if preview_words:
    print("\n".join(preview_words))

# List every page that failed this run together with the recorded reason.
if failed_urls:
    failure_lines = [f"- {failed_url} ({why})" for failed_url, why in failed_urls]
    print("\nSkipped the following URLs:")
    print("\n".join(failure_lines))

