In [1]:
# ==================  High-throughput Elexon bulk crawler  ==================
import asyncio, aiohttp, gzip, json, datetime as dt
from pathlib import Path
import nest_asyncio, aiolimiter
nest_asyncio.apply()           # lets us await main() directly in Jupyter

In [None]:
# Root of the Elexon BMRS REST API; every template below is appended to it.
BASE_URL = "https://data.elexon.co.uk/bmrs/api/v1"

# Endpoint code -> path + query template.
# The code doubles as the output sub-directory name.  Templates are filled via
# str.format() with the keys produced by _ph(): {date}, {from_ts}, {to_ts}.
# Insertion order is the order in which endpoints are visited for each day.
ENDPOINTS = {
    # ---- Generation ------------------------------------------------------
    "GEN_PER_TYPE": "/datasets/AGPT?publishDateTimeFrom={from_ts}&publishDateTimeTo={to_ts}",
    "INTER": "/generation/outturn/interconnectors?settlementDateFrom={date}&settlementDateTo={date}",
    "DAYAHEAD_GEN_WIND_SOLAR": "/forecast/generation/wind-and-solar/day-ahead?from={from_ts}&to={to_ts}&processType=all",
    "ACTUAL_GEN_WIND_SOLAR": "/datasets/AGWS?publishDateTimeFrom={from_ts}&publishDateTimeTo={to_ts}",

    # ---- Demand ----------------------------------------------------------
    "DAYAHEAD_DEMAND": "/forecast/demand/day-ahead/history?publishTime={date}",
    "INDICATED_DAYAHEAD_DEMAND": "/forecast/indicated/day-ahead/history?publishTime={date}",
    "ACTUAL_DEMAND": "/demand/outturn?settlementDateFrom={date}&settlementDateTo={date}",

    # ---- Balancing -------------------------------------------------------
    "SYSTEM_PRICES": "/balancing/settlement/system-prices/{date}",
    "BSAD": "/datasets/netbsad?from={from_ts}&to={to_ts}",
    "MID": "/datasets/mid?from={from_ts}&to={to_ts}",
    "NONBM": "/datasets/NONBM?from={from_ts}&to={to_ts}",

    # ---- Transmission ----------------------------------------------------
    "LOLPDRM": "/forecast/system/loss-of-load?from={from_ts}&to={to_ts}",
}
# --------------------------------------------------------------------------

# Crawl window: the first and last calendar day requested (both inclusive).
START_DATE = dt.date(year=2019, month=1, day=1)
END_DATE = dt.date(year=2025, month=5, day=31)

# Root folder for everything that gets downloaded; created if it is missing.
BASE_DIR = Path("bmrs_raw")
BASE_DIR.mkdir(exist_ok=True)

def _ph(day: dt.date):
    """Return placeholders dict for a given date."""
    iso = day.isoformat()
    return {
        "date"   : iso,
        "from_ts": f"{iso}T00:00:00Z",
        "to_ts"  : f"{iso}T23:59:59Z",
    }

# --------------------------------------------------------------------------
# Throttling / networking knobs shared by main() and _fetch().
CONCURRENCY = 256                 # connection-pool size: passed as TCPConnector(limit=...) in main()
PER_MINUTE  = aiolimiter.AsyncLimiter(4500, 60)   # 4 500 requests / 60 s
PER_SECOND  = aiolimiter.AsyncLimiter(75,   1)    # 75 requests / 1 s; _fetch() acquires this together with PER_MINUTE
TIMEOUT     = aiohttp.ClientTimeout(total=40)     # session-wide timeout handed to ClientSession in main() (total=40)
# --------------------------------------------------------------------------

async def _fetch(sess: aiohttp.ClientSession, url: str, dest: Path, log: Path):
    """Download one URL and store its JSON payload, gzip-compressed, at ``dest``.

    Outcomes:
      * ``dest`` already exists  -> nothing is requested (this is what makes the
        crawl resumable).
      * HTTP 404                 -> return; nothing is written.
      * HTTP 429                 -> wait 5 s and retry exactly once; if the retry
        is throttled as well, return without writing anything, so the URL is
        requested again on the next run.
      * payload is a dict whose ``"data"`` is an empty list -> append
        ``dest.stem`` to ``log``; no data file is created.
      * anything else            -> the payload is written to ``dest``.

    Every request - including the retry - first takes a slot from both
    PER_SECOND and PER_MINUTE.

    Args:
        sess: open aiohttp session used for the GET.
        url:  fully formatted request URL.
        dest: target ``*.json.gz`` file.
        log:  text file collecting the stems of days that returned no data.

    Raises:
        Whatever ``raise_for_status()`` raises for other non-2xx statuses, plus
        any error from the request itself or from JSON decoding.
    """
    if dest.exists():
        return

    for attempt in (1, 2):            # first try + a single retry after HTTP 429
        # acquire from **both** buckets on every attempt, so the retry is
        # rate-limited exactly like a first request
        async with PER_SECOND, PER_MINUTE:
            async with sess.get(url) as r:
                if r.status == 404:
                    return
                if r.status != 429:
                    r.raise_for_status()
                    payload = await r.json()
                    break
        # Only reached on HTTP 429.  The throttled response has been released
        # above, so no pooled connection is held while we wait.
        if attempt == 1:
            await asyncio.sleep(5)    # fixed pause before the one retry
    else:
        # Throttled twice: give up for this run without writing anything.
        return

    if isinstance(payload, dict) and payload.get("data") == []:
        # Append rather than re-reading and rewriting the whole log each time.
        # NOTE: for a "<day>.json.gz" target, dest.stem is "<day>.json".
        with log.open("a") as fh:
            fh.write(dest.stem + "\n")
        return

    # Write to a sibling temp file and rename it into place: an interrupted
    # write must never leave a truncated ``dest`` behind, because the
    # ``dest.exists()`` guard above would then skip that day on every later run.
    # A stale ".part" file is harmless - the next attempt overwrites it.
    tmp = dest.with_name(dest.name + ".part")
    with gzip.open(tmp, "wt") as fh:
        json.dump(payload, fh)
    tmp.replace(dest)


async def _all_tasks():
    """Yield one ``(url, dest, log)`` job per (day, endpoint) pair.

    Days run from START_DATE to END_DATE inclusive; within a day, endpoints are
    visited in ENDPOINTS insertion order.

    Yields:
        url:  BASE_URL plus the endpoint template filled with ``_ph(day)``.
        dest: ``BASE_DIR/<code>/<YYYY-MM-DD>.json.gz``.
        log:  ``BASE_DIR/<code>/empty.log``.

    Side effect: each endpoint's output directory is created (if missing) once,
    before the first job is yielded.
    """
    # The directory and the empty-day log path depend only on the endpoint, so
    # prepare them once here instead of once per (day, endpoint) pair.
    targets = []
    for code, tpl in ENDPOINTS.items():
        sub = BASE_DIR / code
        sub.mkdir(parents=True, exist_ok=True)
        targets.append((tpl, sub, sub / "empty.log"))

    one_day = dt.timedelta(days=1)
    day = START_DATE
    while day <= END_DATE:
        ph = _ph(day)
        for tpl, sub, log in targets:
            url  = f"{BASE_URL}{tpl.format(**ph)}"
            dest = sub / f"{ph['date']}.json.gz"
            yield url, dest, log
        day += one_day


async def main():
    """Run the whole crawl: schedule one ``_fetch`` task per job and await them all.

    All tasks are created up front and share one ClientSession; they are then
    awaited in slices of CHUNK purely to print progress.

    A failing download no longer aborts the crawl: exceptions are collected
    while the remaining tasks keep running.  After every task has finished, a
    summary line is printed and the first collected exception is re-raised, so
    the caller still sees the failure.  Re-running skips files that already
    exist and retries the rest.

    Raises:
        The first exception (in task order) raised by any ``_fetch`` call.
    """
    connector = aiohttp.TCPConnector(limit=CONCURRENCY, ttl_dns_cache=300)
    failures = []
    async with aiohttp.ClientSession(connector=connector,
                                     timeout=TIMEOUT) as sess:
        tasks = [asyncio.create_task(_fetch(sess, url, dest, log))
                 async for url, dest, log in _all_tasks()]
        total = len(tasks)
        CHUNK = 10_000
        for i in range(0, total, CHUNK):
            # return_exceptions=True: without it the first error would propagate
            # out of the ``async with`` and close the session underneath every
            # task that is still in flight.
            results = await asyncio.gather(*tasks[i:i+CHUNK],
                                           return_exceptions=True)
            # _fetch returns None on success, so any exception object in the
            # results is a failed task.
            failures.extend(res for res in results
                            if isinstance(res, BaseException))
            print(f"✓ {min(i+CHUNK, total):,} / {total:,} done")

    if failures:
        print(f"✗ {len(failures):,} of {total:,} downloads failed – "
              f"re-run to retry them")
        raise failures[0]


In [3]:
# This cell actually runs the download – it can take a while: one request per
# (day, endpoint) pair, i.e. 2,343 days × 12 endpoints ≈ 28 k requests with the
# START_DATE / END_DATE / ENDPOINTS configured above. Files already on disk are
# skipped, so the cell can be re-run to resume an interrupted crawl.
await main()
print("✅  completed")


✓ 2,343 / 2,343 done
✅  completed
