In [1]:
import logging
from datetime import timedelta
from typing import Optional, Tuple
import os
import json

import requests
from bytewax import operators as op
from bytewax.dataflow import Dataflow
from bytewax.inputs import SimplePollingSource

# from proton import ProtonSink

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [6]:
("GLOBAL_ID", requests.get("https://hacker-news.firebaseio.com/v0/maxitem.json").json(),)

('GLOBAL_ID', 42154914)

In [10]:
requests.get(
        f"https://hacker-news.firebaseio.com/v0/item/42154910.json"
    ).json()

{'by': 'titzer',
 'id': 42154910,
 'parent': 42150550,
 'text': '&gt; Weâ€™ve begun by enabling hardened libc++, which adds bounds checking to standard C++ data structures, eliminating a significant class of spatial safety bugs.<p>Well, it&#x27;s 2024 and remember arguing this 20+ years ago. Programs have bugs that bounds checking catches. And making it a language built-in exposes it to compiler optimizations specifically targeting bounds checks, eliminating many and bringing the dynamic cost down immensely. Just turning them on in libraries doesn&#x27;t necessarily expose all the compiler optimizations, but it&#x27;s a start. Safety checks should really be built into the language.',
 'time': 1731739897,
 'type': 'comment'}

In [None]:
requests.get(
        f"https://hacker-news.firebaseio.com/v0/item/42150550.json"
    ).json()

{'by': 'jandeboevrie',
 'descendants': 16,
 'id': 42150550,
 'kids': [42154910,
  42154518,
  42154598,
  42153813,
  42153898,
  42154658,
  42151587,
  42153459],
 'score': 50,
 'time': 1731702319,
 'title': 'Retrofitting spatial safety to lines of C++',
 'type': 'story',
 'url': 'https://security.googleblog.com/2024/11/retrofitting-spatial-safety-to-hundreds.html'}

In [2]:
class HNSource(SimplePollingSource):
    def next_item(self):
        return (
            "GLOBAL_ID",
            requests.get("https://hacker-news.firebaseio.com/v0/maxitem.json").json(),
        )


def get_id_stream(old_max_id, new_max_id) -> Tuple[str,list]:
    if old_max_id is None:
        # Get the last 150 items on the first run.
        old_max_id = new_max_id - 150
    return (new_max_id, range(old_max_id, new_max_id))


def download_metadata(hn_id) -> Optional[Tuple[str, dict]]:
    # Given an hacker news id returned from the api, fetch metadata
    # Try 3 times, waiting more and more, or give up
    data = requests.get(
        f"https://hacker-news.firebaseio.com/v0/item/{hn_id}.json"
    ).json()

    if data is None:
        logger.warning(f"Couldn't fetch item {hn_id}, skipping")
        return None
    return (str(hn_id), data)


def recurse_tree(metadata, og_metadata=None) -> any:
    if not og_metadata:
        og_metadata = metadata
    try:
        parent_id = metadata["parent"]
        parent_metadata = download_metadata(parent_id)
        return recurse_tree(parent_metadata[1], og_metadata)
    except KeyError:
        return (metadata["id"], 
                {
                    **og_metadata, 
                    "root_id":metadata["id"]
                }
                )


def key_on_parent(key__metadata) -> tuple:
    key, metadata = recurse_tree(key__metadata[1])
    return (str(key), metadata)


def format(id__metadata):
    id, metadata = id__metadata
    return json.dumps(metadata)

In [4]:

flow = Dataflow("hn_scraper")

max_id = op.input("in", flow, HNSource(timedelta(seconds=15)))

id_stream = op.stateful_map("range", max_id, lambda: None, get_id_stream).then(
    op.flat_map, "strip_key_flatten", lambda key_ids: key_ids[1]).then(
    op.redistribute, "redist")

id_stream = op.filter_map("meta_download", id_stream, download_metadata)

split_stream = op.branch("split_comments", id_stream, lambda item: item[1]["type"] == "story")

story_stream = split_stream.trues

story_stream = op.map("format_stories", story_stream, format)

comment_stream = split_stream.falses

comment_stream = op.map("key_on_parent", comment_stream, key_on_parent)

comment_stream = op.map("format_comments", comment_stream, format)

op.inspect("stories", story_stream)
op.inspect("comments", comment_stream)

TypeError: operator 'stateful_map' called incorrectly; see cause above