In [2]:
import requests, time, json
from datetime import datetime, timezone
from pathlib import Path
import pandas as pd

BASE = "https://blockstream.info/api"               # Esplora
OUT_DIR = Path("../data/blockstream/")
OUT_DIR.mkdir(parents=True, exist_ok=True)

START_DATE = datetime(2020, 1, 1, tzinfo=timezone.utc)   # stop when we reach here
CHECKPOINT = OUT_DIR / "blockstream_checkpoint.json"
CSV_PATH   = OUT_DIR / "btc_blocks_2020plus_summary.csv"

# ---- helpers ----
def get_tip_height():
    return int(requests.get(f"{BASE}/blocks/tip/height", timeout=15).text)

def get_blocks_page(start_height):
    """Return up to 10 blocks (descending) starting at start_height."""
    url = f"{BASE}/blocks/{start_height}"
    r = requests.get(url, timeout=30)
    if r.status_code == 429:
        raise requests.HTTPError("429 Too Many Requests")
    r.raise_for_status()
    return r.json()  # list of dicts

def load_checkpoint(default_height):
    if CHECKPOINT.exists():
        try:
            return json.loads(CHECKPOINT.read_text())["next_start_height"]
        except Exception:
            pass
    return default_height

def save_checkpoint(h):
    CHECKPOINT.write_text(json.dumps({"next_start_height": h}))

def to_utc(ts):
    return datetime.fromtimestamp(int(ts), tz=timezone.utc)

# ---- init ----
tip = get_tip_height()
start_h = load_checkpoint(tip)
print(f"Starting crawl at height {start_h} (tip={tip}), stopping at {START_DATE.date()}")

rows = []
calls = 0
backoff = 0.5  # seconds, will grow on 429

while True:
    try:
        page = get_blocks_page(start_h)
        calls += 1
        if not page:
            break

        # keep only blocks on/after our cutoff
        for b in page:
            ts = to_utc(b["timestamp"])
            if ts >= START_DATE:
                rows.append({
                    "height": b["height"],
                    "hash": b["id"],
                    "timestamp": ts,
                    "tx_count": b.get("tx_count"),
                    "size_bytes": b.get("size"),
                })

        # find the next page’s start height
        min_h = min(b["height"] for b in page)
        next_h = min_h - 1

        # stop if the *oldest* block in this page is already older than START_DATE
        if to_utc(min(b["timestamp"] for b in page)) < START_DATE:
            # we crossed the boundary; we’re done collecting 2020
            save_checkpoint(next_h)
            break

        # progress + periodic flush
        if calls % 50 == 0:
            print(f"…fetched {len(rows)} rows, next height {next_h}")
            # incremental save
            pd.DataFrame(rows).sort_values("height").to_csv(CSV_PATH, index=False)

        # be polite
        time.sleep(0.4)
        backoff = 0.5  # reset backoff after a good call
        start_h = next_h
        save_checkpoint(start_h)

    except requests.HTTPError as e:
        # exponential backoff on 429 or transient errors
        print(f"[warn] height {start_h}: {e} — sleeping {backoff:.1f}s")
        time.sleep(backoff)
        backoff = min(backoff + 0.5, 5.0)  # cap at 5s
        continue
    except Exception as e:
        print(f"[skip] unexpected error at {start_h}: {e}")
        time.sleep(1.0)
        continue

# final save / sort
df = pd.DataFrame(rows).drop_duplicates("height").sort_values("height").reset_index(drop=True)
df.to_csv(CSV_PATH, index=False)
print(f"Saved {len(df):,} blocks to {CSV_PATH}")

# quick peek
df.tail()

ValueError: invalid literal for int() with base 10: '{"error":"Too Many Requests","message":"Blockstream Explorer API NOTICE: Your request rate exceeds the current limit. Starting July 15 2025, monthly unauthenticated usage will be capped at 500,000 re