# High-throughput Elexon bulk crawler

Usage:
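* Specify `START_DATE` and `END_DATE` in the Setup cell and run it
* Run the cell that defines the crawler you need (each crawler cell defines its own `main`)
* Run the final cell (`await main()`) to start that crawler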

## Setup

In [18]:
import asyncio
import aiohttp
import gzip
import json
import time
import datetime as dt
from aiolimiter import AsyncLimiter
from pathlib import Path
from statistics import mean
import nest_asyncio

# Patch asyncio so an event loop can be re-entered; later cells call
# asyncio.run() / `await main()` from inside the notebook's running loop.
nest_asyncio.apply()

# Root of the Elexon BMRS REST API; every endpoint path is appended to it.
BASE_URL = "https://data.elexon.co.uk/bmrs/api/v1"

# Crawl window. Both ends are inclusive (the crawl loops test `day <= END_DATE`).
START_DATE = dt.date(2021, 6, 30)
END_DATE   = dt.date(2025, 6, 30)

# Root output directory; each crawler writes into its own sub-directory of it.
BASE_DIR = Path("bmrs_json_raw")
BASE_DIR.mkdir(exist_ok=True)

# Client-side throttling, AsyncLimiter(max_rate, time_period_in_seconds).
# Every request acquires BOTH limiters before it is sent.
PER_MINUTE   = AsyncLimiter(4920, 60)
PER_SECOND   = AsyncLimiter(82, 1)
# Per-request socket timeouts in seconds (connect / read).
TIMEOUT      = aiohttp.ClientTimeout(sock_connect=3, sock_read=5)
# Passed as TCPConnector(limit=...) by every `main`.
CONCURRENCY  = 128


# Crawler Definitions

## Historical Features

In [None]:
# Per-day endpoint templates. The key doubles as the name of the output
# sub-directory under BASE_DIR. Placeholders are filled by `_ph`:
#   {date}    -> "YYYY-MM-DD"
#   {from_ts} -> "YYYY-MM-DDT00:00:00Z"
#   {to_ts}   -> "YYYY-MM-DDT23:59:59Z"
# Exception: for TEMPERATURE, `_all_tasks` substitutes the plain date for both
# {from_ts} and {to_ts}.
ENDPOINTS = {
    # ------------------ Generation ----------------------------------------
    "GEN_PER_TYPE":
        "/datasets/AGPT?publishDateTimeFrom={from_ts}&publishDateTimeTo={to_ts}",
    "INTER":
        "/generation/outturn/interconnectors?settlementDateFrom={date}&settlementDateTo={date}",
    "DAYAHEAD_GEN_WIND_SOLAR":
        "/forecast/generation/wind-and-solar/day-ahead?from={from_ts}&to={to_ts}&processType=day%20ahead",
    "INTRADAYPROCESS_GEN_WIND_SOLAR":
        "/forecast/generation/wind-and-solar/day-ahead?from={from_ts}&to={to_ts}&processType=intraday%20process",
    "INTRADAYTOTAL_GEN_WIND_SOLAR":
        "/forecast/generation/wind-and-solar/day-ahead?from={from_ts}&to={to_ts}&processType=intraday%20total",
    "ACTUAL_GEN_WIND_SOLAR":
        "/datasets/AGWS?publishDateTimeFrom={from_ts}&publishDateTimeTo={to_ts}",

    # ------------------ Demand -------------------------------------------
    "DAYAHEAD_DEMAND":
        "/forecast/demand/day-ahead/history?publishTime={date}",
    "INDICATED_DAYAHEAD_DEMAND":
        "/forecast/indicated/day-ahead/history?publishTime={date}",
    "ACTUAL_DEMAND":
        "/demand/outturn?settlementDateFrom={date}&settlementDateTo={date}",

    # ------------------ Balancing ----------------------------------------
    "SYSTEM_PRICES":
        "/balancing/settlement/system-prices/{date}",
    "BSAD":
        "/datasets/netbsad?from={from_ts}&to={to_ts}",
    "MID":
        "/datasets/mid?from={from_ts}&to={to_ts}",
    "NONBM":
        "/datasets/NONBM?from={from_ts}&to={to_ts}",

    # ------------------ Transmission -------------------------------------
    "LOLPDRM":
        "/forecast/system/loss-of-load?from={from_ts}&to={to_ts}",

    # ------------------ Temperature --------------------------------
    "TEMPERATURE":
        "/temperature?from={from_ts}&to={to_ts}",
}
# --------------------------------------------------------------------------

def _ph(day: dt.date):
    """Return placeholders dict for a given date."""
    iso = day.isoformat()
    return {
        "date"   : iso,
        "from_ts": f"{iso}T00:00:00Z",
        "to_ts"  : f"{iso}T23:59:59Z",
    }

# --------------------------------------------------------------------------

# Run statistics shared by `_fetch`, `progress_report` and `main`:
#   latencies         - seconds taken by each completed request
#   total_requests    - number of completed requests
#   start_time_global - wall-clock start of the run (set by `main`)
latencies = []
total_requests = 0
start_time_global = None

async def _fetch(sess: aiohttp.ClientSession, url: str, dest: Path, log: Path):
    """Download ``url`` and store its JSON payload gzip-compressed at ``dest``.

    Args:
        sess: Open aiohttp session used for the request.
        url: Fully formatted endpoint URL.
        dest: Target ``.json.gz`` file; if it already exists nothing is fetched.
        log: Text file collecting the stems of files whose payload had an
            empty ``data`` list (no ``dest`` is written for those).

    HTTP handling: a 404 is skipped silently; a 429 triggers exactly one retry
    after a 5 s pause (a second 404/429 is skipped); any other error status
    raises through ``raise_for_status``. Only successful downloads are added
    to the module-level ``latencies`` / ``total_requests`` statistics.
    """
    global total_requests
    if dest.exists():
        return

    async with PER_SECOND, PER_MINUTE:
        start_time = time.perf_counter()
        for retries_left in (1, 0):
            async with sess.get(url) as resp:
                if resp.status == 404:
                    return
                if resp.status != 429:
                    resp.raise_for_status()
                    payload = await resp.json()
                    break
                if not retries_left:
                    return
            # Throttled: the response has been released at this point, so the
            # connection is not held open while we back off before the retry.
            await asyncio.sleep(5)
        elapsed = time.perf_counter() - start_time
        latencies.append(elapsed)
        total_requests += 1

    if isinstance(payload, dict) and payload.get("data") == []:
        # Append instead of re-reading and rewriting the whole log each time.
        with log.open("a", encoding="utf-8") as fh:
            fh.write(dest.stem + "\n")
        return

    # Write to a sibling temp file and rename it into place, so an interrupted
    # run never leaves a truncated ``dest`` that the exists() check above would
    # treat as already downloaded.
    tmp = dest.with_name(dest.name + ".part")
    with gzip.open(tmp, "wt", encoding="utf-8") as fh:
        json.dump(payload, fh)
    tmp.replace(dest)

async def _all_tasks():
    """Yield ``(url, dest, log)`` for every endpoint on every day of the range.

    Iterates day by day from START_DATE to END_DATE (inclusive) and, within a
    day, over ENDPOINTS in definition order. ``dest`` is
    ``BASE_DIR/<code>/<YYYY-MM-DD>.json.gz`` and ``log`` is
    ``BASE_DIR/<code>/empty.log``.
    """
    # Create each endpoint's output directory once, not once per day.
    subdirs = {}
    for code in ENDPOINTS:
        subdirs[code] = BASE_DIR / code
        subdirs[code].mkdir(exist_ok=True)

    day = START_DATE
    while day <= END_DATE:
        ph = _ph(day)
        # TEMPERATURE is queried with the plain date in both range parameters
        # rather than with full timestamps.
        ph_temp = {**ph, "from_ts": ph["date"], "to_ts": ph["date"]}
        for code, tpl in ENDPOINTS.items():
            sub = subdirs[code]
            values = ph_temp if code == "TEMPERATURE" else ph
            url = f"{BASE_URL}{tpl.format(**values)}"
            dest = sub / f"{ph['date']}.json.gz"
            log = sub / "empty.log"
            yield url, dest, log
        day += dt.timedelta(days=1)

async def progress_report():
    """Print a statistics table, adding one row every 15 seconds.

    Reads the module-level ``latencies``, ``total_requests`` and
    ``start_time_global``. Loops forever; the caller is expected to cancel
    the task once the crawl is done.
    """
    print(f"{'Time':>8} | {'Reqs':>8} | {'Avg Lat':>8} | {'Max Lat':>8} | {'Thr/sec':>8} | {'Thr/min':>8}")
    print("-" * 62)
    while True:
        await asyncio.sleep(15)

        # Latency figures stay at 0 until the first request has completed.
        if latencies:
            avg_lat = mean(latencies)
            max_lat = max(latencies)
        else:
            avg_lat = 0
            max_lat = 0

        elapsed = (dt.datetime.now() - start_time_global).total_seconds()
        tput_sec = 0
        if elapsed > 0:
            tput_sec = total_requests / elapsed
        tput_min = tput_sec * 60

        clock = dt.datetime.now().strftime('%H:%M:%S')
        row = (f"{clock:>8} | "
               f"{total_requests:8d} | "
               f"{avg_lat:8.3f} | "
               f"{max_lat:8.3f} | "
               f"{tput_sec:8.2f} | "
               f"{tput_min:8.0f}")
        print(row)

async def main():
    """Crawl every ENDPOINTS URL for each day in the range, then print a summary.

    One task per (day, endpoint) is scheduled up front; `progress_report`
    runs alongside and is always cancelled when the crawl ends, whether it
    finished or failed. If a fetch raises, the exception propagates and the
    remaining fetch tasks are cancelled before the session is closed.
    """
    global start_time_global, total_requests
    # Reset the counters so a re-run in the same kernel reports only this run;
    # otherwise throughput would divide stale totals by the new elapsed time.
    latencies.clear()
    total_requests = 0
    start_time_global = dt.datetime.now()
    connector = aiohttp.TCPConnector(limit=CONCURRENCY, ttl_dns_cache=300)
    async with aiohttp.ClientSession(connector=connector, timeout=TIMEOUT) as sess:
        tasks = [
            asyncio.create_task(_fetch(sess, url, dest, log))
            async for url, dest, log in _all_tasks()
        ]
        reporter = asyncio.create_task(progress_report())
        try:
            await asyncio.gather(*tasks)
        finally:
            # Without this the reporter would keep printing forever after a
            # failed fetch, because the notebook's event loop stays alive.
            reporter.cancel()
            # No-op for finished tasks; stops the rest from running against a
            # session that is about to be closed.
            for task in tasks:
                task.cancel()

    elapsed_run = (dt.datetime.now() - start_time_global).total_seconds()
    avg_lat = mean(latencies) if latencies else 0
    max_lat = max(latencies) if latencies else 0
    tput_sec = total_requests / elapsed_run if elapsed_run > 0 else 0
    tput_min = tput_sec * 60
    print("\nFinal Summary")
    print("-" * 62)
    print(f"{'Total reqs:':<15}{total_requests}")
    print(f"{'Avg latency:':<15}{avg_lat:.3f} s")
    print(f"{'Max latency:':<15}{max_lat:.3f} s")
    print(f"{'Thr/sec:':<15}{tput_sec:.2f} req/sec")
    print(f"{'Thr/min:':<15}{tput_min:.0f} req/min")

## History of the wind generation forecast (WINDFOR)

In [None]:
import asyncio
import aiohttp
import gzip
import json
import pandas as pd
import time
from datetime import datetime, timedelta, date
from aiolimiter import AsyncLimiter
from pathlib import Path
from statistics import mean

# Config
# NOTE: ENDPOINT, FINAL_DIR and the stats globals are re-bound by every crawler
# cell, so the functions defined here use whatever the last executed cell set.
ENDPOINT     = "/forecast/generation/wind/history"

# Explicit mapping of local clock to settlementPeriod
# Keys ("HH:MM") are sent as the time part of `publishTime`; values are only
# used in the output file name (`<date>_<sp>.json.gz`).
# NOTE(review): `fetch_and_save` appends "Z" to the key, i.e. sends it as UTC,
# although the comment above says "local clock" - confirm which is intended.
PUBLISH_MAP    = {
    "03:30": 6,
    "05:30": 10,
    "08:30": 16,
    "10:30": 20,
    "12:30": 24,
    "16:30": 32,
    "19:30": 38,
    "23:30": 46,
}

# Output directory for this crawler.
FINAL_DIR   = BASE_DIR / "DETAILED_WINDFOR"
FINAL_DIR.mkdir(exist_ok=True, parents=True)

# Stats (per-request latencies in seconds, completed request count, run start)
latencies = []
total_requests = 0
start_time_global = None

async def fetch_and_save(sess: aiohttp.ClientSession, date: date, pt: str, sp: int):
    """Fetch one wind-forecast publication and save its next-24h window.

    Args:
        sess: Open aiohttp session.
        date: Calendar day of the publication.
        pt: Publish time of day as ``"HH:MM"``; sent as ``<date>T<pt>Z``.
        sp: Settlement-period number, used only in the output file name.

    The request is skipped when ``FINAL_DIR/<date>_<sp>.json.gz`` already
    exists, and nothing is written when the response has no ``data``. Rows
    are kept when ``startTime`` lies between the first row's ``publishTime``
    and 24 hours later (both inclusive); the file is written even if that
    window turns out to be empty. Raises on any HTTP error status.
    """
    global total_requests
    publish_str = f"{date.isoformat()}T{pt}Z"
    dest = FINAL_DIR / f"{date}_{sp}.json.gz"
    if dest.exists():
        return

    async with PER_SECOND, PER_MINUTE:
        start_time = time.perf_counter()
        # Context manager guarantees the response is released on every path.
        async with sess.get(f"{BASE_URL}{ENDPOINT}", params={"publishTime": publish_str}) as resp:
            resp.raise_for_status()
            payload = await resp.json()
        elapsed = time.perf_counter() - start_time
        latencies.append(elapsed)
        total_requests += 1

    data = payload.get("data", [])
    if not data:
        return

    df = pd.DataFrame(data)
    df["publishTime"] = pd.to_datetime(df["publishTime"], utc=True)
    df["startTime"]   = pd.to_datetime(df["startTime"],   utc=True)

    pub_dt = df["publishTime"].iloc[0]
    window_start = pub_dt
    window_end   = pub_dt + timedelta(hours=24)
    filtered = df[(df["startTime"] >= window_start) & (df["startTime"] <= window_end)].copy()

    # Timestamps go back to ISO strings so the frame is JSON-serialisable.
    filtered["publishTime"] = filtered["publishTime"].dt.strftime("%Y-%m-%dT%H:%M:%SZ")
    filtered["startTime"]   = filtered["startTime"].dt.strftime("%Y-%m-%dT%H:%M:%SZ")

    out = {"metadata": payload.get("metadata", {}), "data": filtered.to_dict(orient="records")}
    # Write to a temp file and rename, so an interrupted run never leaves a
    # truncated ``dest`` that the exists() check would skip on the next run.
    tmp = dest.with_name(dest.name + ".part")
    with gzip.open(tmp, "wt", encoding="utf-8") as fh:
        json.dump(out, fh)
    tmp.replace(dest)

async def progress_report():
    """Emit a header and then one statistics row every 15 seconds.

    Uses the module-level ``latencies``, ``total_requests`` and
    ``start_time_global``; never returns on its own, so the task has to be
    cancelled by the caller.
    """
    print(f"{'Time':>8} | {'Reqs':>8} | {'Avg Lat':>8} | {'Max Lat':>8} | {'Thr/sec':>8} | {'Thr/min':>8}")
    print("-" * 62)
    while True:
        await asyncio.sleep(15)

        avg_lat = 0
        max_lat = 0
        if latencies:
            # Only meaningful once at least one request has finished.
            avg_lat = mean(latencies)
            max_lat = max(latencies)

        elapsed = (datetime.now() - start_time_global).total_seconds()
        if elapsed > 0:
            tput_sec = total_requests / elapsed
        else:
            tput_sec = 0
        tput_min = tput_sec * 60

        clock = datetime.now().strftime('%H:%M:%S')
        row = (f"{clock:>8} | "
               f"{total_requests:8d} | "
               f"{avg_lat:8.3f} | "
               f"{max_lat:8.3f} | "
               f"{tput_sec:8.2f} | "
               f"{tput_min:8.0f}")
        print(row)

async def main():
    """Crawl the wind-forecast history day by day and print a summary.

    For each day in [START_DATE, END_DATE] all PUBLISH_MAP entries are fetched
    concurrently; the next day starts once they have finished. The progress
    reporter is always cancelled, including when a fetch raises.
    """
    global start_time_global, total_requests
    # Reset the counters so a re-run in the same kernel reports only this run.
    latencies.clear()
    total_requests = 0
    start_time_global = datetime.now()
    connector = aiohttp.TCPConnector(limit=CONCURRENCY, ttl_dns_cache=300)
    async with aiohttp.ClientSession(connector=connector, timeout=TIMEOUT) as sess:
        reporter = asyncio.create_task(progress_report())
        try:
            day = START_DATE
            while day <= END_DATE:
                tasks = [fetch_and_save(sess, day, pt, sp) for pt, sp in PUBLISH_MAP.items()]
                await asyncio.gather(*tasks)
                day += timedelta(days=1)
        finally:
            # Without this the reporter would keep printing forever after a
            # failed fetch, because the notebook's event loop stays alive.
            reporter.cancel()

    elapsed_run = (datetime.now() - start_time_global).total_seconds()
    avg_lat = mean(latencies) if latencies else 0
    max_lat = max(latencies) if latencies else 0
    tput_sec = total_requests / elapsed_run if elapsed_run > 0 else 0
    tput_min = tput_sec * 60
    print("\nFinal Summary")
    print("-" * 62)
    print(f"{'Total reqs:':<15}{total_requests}")
    print(f"{'Avg latency:':<15}{avg_lat:.3f} s")
    print(f"{'Max latency:':<15}{max_lat:.3f} s")
    print(f"{'Thr/sec:':<15}{tput_sec:.2f} req/sec")
    print(f"{'Thr/min:':<15}{tput_min:.0f} req/min")


## Evolution of the Wind Generation Forecast Over Time (WINDFOR)

In [None]:
import asyncio
import aiohttp
import gzip
import json
import pandas as pd
from datetime import datetime, date, time, timedelta, timezone
from aiolimiter import AsyncLimiter
from pathlib import Path

# Configuration
# Path appended to BASE_URL by `fetch_and_save_evolution`.
ENDPOINT = "/forecast/generation/wind/evolution"

# Output directory for this crawler.
FINAL_DIR     = BASE_DIR / "EVOLUTION_WINDFOR"
FINAL_DIR.mkdir(exist_ok=True, parents=True)

# Rate limits
# Same values as in the Setup cell; re-running this cell replaces the shared
# limiter objects with fresh ones.
PER_MINUTE   = AsyncLimiter(4920, 60)
PER_SECOND   = AsyncLimiter(82, 1)
TIMEOUT      = aiohttp.ClientTimeout(sock_connect=3, sock_read=5)
CONCURRENCY  = 128

async def fetch_and_save_evolution(sess: aiohttp.ClientSession, start_dt: datetime):
    """
    Fetch the evolution series for a given forecast startTime,
    keep the 8 latest publishes at least 1h before start_dt, and save to gzipped JSON.

    Args:
        sess: Open aiohttp session.
        start_dt: Forecast start time; also determines the output file name
            ``FINAL_DIR/<YYYY-MM-DD_HHMM>.json.gz``.

    Nothing is fetched when the output file already exists, and nothing is
    written when the response has no ``data`` or no publish passes the
    one-hour cutoff. Raises on any HTTP error status.
    """
    iso_start = start_dt.strftime("%Y-%m-%dT%H:%M:%SZ")
    stamp     = start_dt.strftime("%Y-%m-%d_%H%M")
    dest      = FINAL_DIR / f"{stamp}.json.gz"
    if dest.exists():
        return

    params = {"startTime": iso_start, "format": "json"}
    async with PER_SECOND, PER_MINUTE:
        # Context manager guarantees the response is released on every path.
        async with sess.get(f"{BASE_URL}{ENDPOINT}", params=params) as resp:
            resp.raise_for_status()
            payload = await resp.json()

    data = payload.get("data", [])
    if not data:
        return

    df = pd.DataFrame(data)
    df["publishTime"] = pd.to_datetime(df["publishTime"], utc=True)

    # filter to publishes at least 1h before the startTime
    cutoff = start_dt - timedelta(hours=1)
    df = df[df["publishTime"] <= cutoff]
    if df.empty:
        return

    # take the 8 most recent publishes
    df = df.sort_values("publishTime", ascending=False).head(8).copy()
    df["publishTime"] = df["publishTime"].dt.strftime("%Y-%m-%dT%H:%M:%SZ")

    out = {
        "metadata": payload.get("metadata", {}),
        "data": df.to_dict(orient="records")
    }

    # Write to a temp file and rename, so an interrupted run never leaves a
    # truncated ``dest`` that the exists() check would skip on the next run.
    tmp = dest.with_name(dest.name + ".part")
    with gzip.open(tmp, "wt", encoding="utf-8") as fh:
        json.dump(out, fh)
    tmp.replace(dest)

async def main():
    """Schedule one evolution fetch per half-hour slot of every day and await them all.

    Slots run from 00:00 UTC to 23:30 UTC for each day in
    [START_DATE, END_DATE]; a completion message is printed at the end.
    """
    step = timedelta(minutes=30)
    slots_per_day = 48  # 24 h in 30-minute increments
    connector = aiohttp.TCPConnector(limit=CONCURRENCY, ttl_dns_cache=300)
    async with aiohttp.ClientSession(connector=connector, timeout=TIMEOUT) as sess:
        tasks = []
        day = START_DATE
        while day <= END_DATE:
            # 00:00 UTC of the current day
            midnight = datetime.combine(day, time(0, 0), tzinfo=timezone.utc)
            for slot in range(slots_per_day):
                coro = fetch_and_save_evolution(sess, midnight + slot * step)
                tasks.append(asyncio.create_task(coro))
            day += timedelta(days=1)

        await asyncio.gather(*tasks)
        print(f"✅ Completed all {len(tasks)} evolution fetches")


## Day-Ahead Demand Forecast History (NDF, TSDF)

In [None]:
import asyncio
import aiohttp
import gzip
import json
import pandas as pd
import time
from datetime import datetime, timedelta, timezone, date
from aiolimiter import AsyncLimiter
from pathlib import Path
from statistics import mean

# Config
# Path appended to BASE_URL by this cell's `fetch_and_save`.
ENDPOINT   = "/forecast/demand/day-ahead/history"

# Output directory for this crawler.
FINAL_DIR = BASE_DIR / "DEMAND_FORECASTS"
FINAL_DIR.mkdir(exist_ok=True, parents=True)

# Stats (per-request latencies in seconds, completed request count, run start)
latencies = []
total_requests = 0
start_time_global = None

def _round_up_to_next_30(dt: datetime) -> datetime:
    dt0 = dt.replace(second=0, microsecond=0)
    extra = dt0.minute % 30
    if extra == 0 and dt.second == 0 and dt.microsecond == 0:
        return dt0
    return dt0 + timedelta(minutes=(30 - extra))

async def fetch_and_save(sess: aiohttp.ClientSession, query_dt: datetime):
    """Fetch the day-ahead demand forecast published at ``query_dt`` and save it.

    Args:
        sess: Open aiohttp session.
        query_dt: Publish time to query; rounded up to the next half hour to
            build the output name ``FINAL_DIR/<YYYY-MM-DD>_<HHMM>.json.gz``.

    Nothing is fetched when the output file already exists. Rows are kept
    when ``startTime`` lies between 30 minutes and 24 hours (both inclusive)
    after the first row's ``publishTime``; nothing is written when the
    response or that window is empty. Raises on any HTTP error status.
    """
    global total_requests
    publish_str = query_dt.strftime("%Y-%m-%dT%H:%M:%SZ")
    rounded = _round_up_to_next_30(query_dt)
    fname = f"{rounded.date().isoformat()}_{rounded.strftime('%H%M')}.json.gz"
    dest = FINAL_DIR / fname
    if dest.exists():
        return

    async with PER_SECOND, PER_MINUTE:
        start_time = time.perf_counter()
        # Context manager guarantees the response is released on every path.
        async with sess.get(f"{BASE_URL}{ENDPOINT}",
                            params={"publishTime": publish_str}) as resp:
            resp.raise_for_status()
            payload = await resp.json()
        elapsed = time.perf_counter() - start_time
        latencies.append(elapsed)
        total_requests += 1

    data = payload.get("data", [])
    if not data:
        return

    df = pd.DataFrame(data)
    df["publishTime"] = pd.to_datetime(df["publishTime"], utc=True)
    df["startTime"] = pd.to_datetime(df["startTime"], utc=True)

    pub_dt = df["publishTime"].iloc[0]
    window_start = pub_dt + timedelta(minutes=30)
    window_end = pub_dt + timedelta(hours=24)
    df = df[(df["startTime"] >= window_start) & (df["startTime"] <= window_end)].copy()
    if df.empty:
        return

    # Timestamps go back to ISO strings so the frame is JSON-serialisable.
    df["publishTime"] = df["publishTime"].dt.strftime("%Y-%m-%dT%H:%M:%SZ")
    df["startTime"] = df["startTime"].dt.strftime("%Y-%m-%dT%H:%M:%SZ")

    out = {"metadata": payload.get("metadata", {}), "data": df.to_dict(orient="records")}
    # Write to a temp file and rename, so an interrupted run never leaves a
    # truncated ``dest`` that the exists() check would skip on the next run.
    tmp = dest.with_name(dest.name + ".part")
    with gzip.open(tmp, "wt", encoding="utf-8") as fh:
        json.dump(out, fh)
    tmp.replace(dest)

async def progress_report():
    """Print the statistics header, then a fresh row every 15 seconds.

    Values come from the module-level ``latencies``, ``total_requests`` and
    ``start_time_global``. The loop has no exit; cancel the task to stop it.
    """
    print(f"{'Time':>8} | {'Reqs':>8} | {'Avg Lat':>8} | {'Max Lat':>8} | {'Thr/sec':>8} | {'Thr/min':>8}")
    print("-" * 62)
    while True:
        await asyncio.sleep(15)

        # Latency figures stay at 0 until the first request has completed.
        if not latencies:
            avg_lat = 0
            max_lat = 0
        else:
            avg_lat = mean(latencies)
            max_lat = max(latencies)

        elapsed = (datetime.now() - start_time_global).total_seconds()
        tput_sec = 0
        if elapsed > 0:
            tput_sec = total_requests / elapsed
        tput_min = tput_sec * 60

        clock = datetime.now().strftime('%H:%M:%S')
        row = (f"{clock:>8} | "
               f"{total_requests:8d} | "
               f"{avg_lat:8.3f} | "
               f"{max_lat:8.3f} | "
               f"{tput_sec:8.2f} | "
               f"{tput_min:8.0f}")
        print(row)

async def main():
    """Crawl demand-forecast history in half-hour steps and print a summary.

    For each day in [START_DATE, END_DATE] the 48 half-hourly publish times
    (from 00:00 UTC) are fetched concurrently; the next day starts once they
    have finished. The progress reporter is always cancelled, including when
    a fetch raises.
    """
    global start_time_global, total_requests
    # Reset the counters so a re-run in the same kernel reports only this run.
    latencies.clear()
    total_requests = 0
    start_time_global = datetime.now()
    connector = aiohttp.TCPConnector(limit=CONCURRENCY, ttl_dns_cache=300)
    async with aiohttp.ClientSession(connector=connector, timeout=TIMEOUT) as sess:
        reporter = asyncio.create_task(progress_report())
        try:
            day = START_DATE
            while day <= END_DATE:
                cursor = datetime.combine(day, datetime.min.time(), tzinfo=timezone.utc)
                end_dt = cursor + timedelta(days=1)
                tasks = []
                while cursor < end_dt:
                    tasks.append(fetch_and_save(sess, cursor))
                    cursor += timedelta(minutes=30)
                await asyncio.gather(*tasks)
                day += timedelta(days=1)
        finally:
            # Without this the reporter would keep printing forever after a
            # failed fetch, because the notebook's event loop stays alive.
            reporter.cancel()

    elapsed_run = (datetime.now() - start_time_global).total_seconds()
    avg_lat = mean(latencies) if latencies else 0
    max_lat = max(latencies) if latencies else 0
    tput_sec = total_requests / elapsed_run if elapsed_run > 0 else 0
    tput_min = tput_sec * 60
    print("\nFinal Summary")
    print("-" * 62)
    print(f"{'Total reqs:':<15}{total_requests}")
    print(f"{'Avg latency:':<15}{avg_lat:.3f} s")
    print(f"{'Max latency:':<15}{max_lat:.3f} s")
    print(f"{'Thr/sec:':<15}{tput_sec:.2f} req/sec")
    print(f"{'Thr/min:':<15}{tput_min:.0f} req/min")

# Script-style entry point: runs the `main` defined in this cell.
# NOTE(review): in a notebook kernel __name__ is "__main__", so executing this
# cell starts the crawl immediately instead of only defining `main` - confirm
# that this is intended.
if __name__ == "__main__":
    asyncio.run(main())


## Market-Wide Bid-Offer Acceptances (BOALF)

In [None]:
import asyncio
import aiohttp
import gzip
import json
import pandas as pd
import time
from datetime import datetime, timedelta, date
from aiolimiter import AsyncLimiter
from pathlib import Path
from statistics import mean

# Config
# Path appended to BASE_URL by this cell's `fetch_and_save`.
ENDPOINT     = "/balancing/acceptances/all"

# Output directory for this crawler.
FINAL_DIR = BASE_DIR / "BIDOFFER_ACCEPTANCES"
FINAL_DIR.mkdir(exist_ok=True, parents=True)

# Stats (per-request latencies in seconds, completed request count, run start)
latencies = []
total_requests = 0
start_time_global = None

async def fetch_and_save(sess: aiohttp.ClientSession, query_dt: date, query_sp: int):
    """Fetch all bid-offer acceptances for one settlement period and save them.

    Args:
        sess: Open aiohttp session.
        query_dt: Settlement date.
        query_sp: Settlement period number.

    Nothing is fetched when ``FINAL_DIR/<date>_<sp>.json.gz`` already exists.
    Only rows whose ``settlementPeriodFrom`` equals ``query_sp`` are kept;
    nothing is written when the response or that selection is empty. Raises
    on any HTTP error status.
    """
    global total_requests
    settlement_date = query_dt.strftime("%Y-%m-%d")
    settlement_period = str(query_sp)
    fname = f"{settlement_date}_{settlement_period}.json.gz"
    dest = FINAL_DIR / fname
    if dest.exists():
        return

    async with PER_SECOND, PER_MINUTE:
        start_time = time.perf_counter()
        # Context manager guarantees the response is released on every path.
        async with sess.get(f"{BASE_URL}{ENDPOINT}",
                            params={"settlementDate": settlement_date, "settlementPeriod": settlement_period}) as resp:
            resp.raise_for_status()
            payload = await resp.json()
        elapsed = time.perf_counter() - start_time
        latencies.append(elapsed)
        total_requests += 1

    data = payload.get("data", [])
    if not data:
        return

    df = pd.DataFrame(data)
    df = df[df["settlementPeriodFrom"] == query_sp]
    if df.empty:
        return

    out = {"metadata": payload.get("metadata", {}), "data": df.to_dict(orient="records")}
    # Write to a temp file and rename, so an interrupted run never leaves a
    # truncated ``dest`` that the exists() check would skip on the next run.
    tmp = dest.with_name(dest.name + ".part")
    with gzip.open(tmp, "wt", encoding="utf-8") as fh:
        json.dump(out, fh)
    tmp.replace(dest)

async def progress_report():
    """Print a table header once, then one statistics row every 15 seconds.

    Reads the module-level ``latencies``, ``total_requests`` and
    ``start_time_global``; runs until the task is cancelled.
    """
    # Table header
    print(f"{'Time':>8} | {'Reqs':>8} | {'Avg Lat':>8} | {'Max Lat':>8} | {'Thr/sec':>8} | {'Thr/min':>8}")
    print("-" * 62)
    while True:
        await asyncio.sleep(15)

        avg_lat = 0
        max_lat = 0
        if latencies:
            # Only meaningful once at least one request has finished.
            avg_lat = mean(latencies)
            max_lat = max(latencies)

        elapsed = (datetime.now() - start_time_global).total_seconds()
        if elapsed > 0:
            tput_sec = total_requests / elapsed
        else:
            tput_sec = 0
        tput_min = tput_sec * 60

        clock = datetime.now().strftime('%H:%M:%S')
        row = (f"{clock:>8} | "
               f"{total_requests:8d} | "
               f"{avg_lat:8.3f} | "
               f"{max_lat:8.3f} | "
               f"{tput_sec:8.2f} | "
               f"{tput_min:8.0f}")
        print(row)

async def main():
    """Crawl bid-offer acceptances for settlement periods 1-50 of every day.

    Days in [START_DATE, END_DATE] are processed one after another; the 50
    requests of a day run concurrently. The progress reporter is always
    cancelled, including when a fetch raises. Prints a summary at the end.
    """
    global start_time_global, total_requests
    # Reset the counters so a re-run in the same kernel reports only this run.
    latencies.clear()
    total_requests = 0
    start_time_global = datetime.now()
    connector = aiohttp.TCPConnector(limit=CONCURRENCY, ttl_dns_cache=300)
    async with aiohttp.ClientSession(connector=connector, timeout=TIMEOUT) as sess:
        reporter = asyncio.create_task(progress_report())
        try:
            day = START_DATE
            while day <= END_DATE:
                tasks = [fetch_and_save(sess, day, sp) for sp in range(1, 51)]
                await asyncio.gather(*tasks)
                day += timedelta(days=1)
        finally:
            # Without this the reporter would keep printing forever after a
            # failed fetch, because the notebook's event loop stays alive.
            reporter.cancel()

    elapsed_run = (datetime.now() - start_time_global).total_seconds()
    avg_lat = mean(latencies) if latencies else 0
    max_lat = max(latencies) if latencies else 0
    tput_sec = total_requests / elapsed_run if elapsed_run > 0 else 0
    tput_min = tput_sec * 60
    print("\nFinal Summary")
    print("-" * 62)
    print(f"{'Total reqs:':<15}{total_requests}")
    print(f"{'Avg latency:':<15}{avg_lat:.3f} s")
    print(f"{'Max latency:':<15}{max_lat:.3f} s")
    print(f"{'Thr/sec:':<15}{tput_sec:.2f} req/sec")
    print(f"{'Thr/min:':<15}{tput_min:.0f} req/min")

# Script-style entry point: runs the `main` defined in this cell.
# NOTE(review): in a notebook kernel __name__ is "__main__", so executing this
# cell starts the crawl immediately instead of only defining `main` - confirm
# that this is intended.
if __name__ == "__main__":
    asyncio.run(main())


## Acceptances by Settlement Period (ISPSTACK, BOALF, BOD)

In [None]:
import asyncio
import aiohttp
import gzip
import json
import pandas as pd
from datetime import datetime, timedelta, date
from aiolimiter import AsyncLimiter
from pathlib import Path

# Configuration
# Output directory for this crawler; files are named `<date>_<sp>.json.gz`.
FINAL_DIR = BASE_DIR / "BIDOFFER_PRICES"
FINAL_DIR.mkdir(exist_ok=True, parents=True)

async def fetch_and_save(sess: aiohttp.ClientSession, query_dt: date, query_sp: int):
    """Fetch settlement acceptances for one settlement period and save them.

    Args:
        sess: Open aiohttp session.
        query_dt: Settlement date.
        query_sp: Settlement period number.

    Nothing is fetched when ``FINAL_DIR/<date>_<sp>.json.gz`` already exists.
    If the payload has a ``settlementPeriodFrom`` column, only rows equal to
    ``query_sp`` are kept; nothing is written when the response or that
    selection is empty. Raises on any HTTP error status.
    """
    settlement_date = query_dt.strftime("%Y-%m-%d")
    settlement_period = str(query_sp)

    # Resume support: check for an existing file BEFORE spending a
    # rate-limited request on data that would be thrown away.
    fname = f"{settlement_date}_{settlement_period}.json.gz"
    dest = FINAL_DIR / fname
    if dest.exists():
        return

    async with PER_SECOND, PER_MINUTE:
        url = f"{BASE_URL}/balancing/settlement/acceptances/all/{settlement_date}/{settlement_period}"
        # Context manager guarantees the response is released on every path.
        async with sess.get(url) as resp:
            resp.raise_for_status()
            payload = await resp.json()

    data = payload.get("data", [])
    if not data:
        return

    df = pd.DataFrame(data)
    # keep exact SP
    if "settlementPeriodFrom" in df.columns:
        df = df[df["settlementPeriodFrom"] == query_sp]
        if df.empty:
            return

    out = {
        "metadata": payload.get("metadata", {}),
        "data": df.to_dict(orient="records")
    }
    # Write to a temp file and rename, so an interrupted run never leaves a
    # truncated ``dest`` that the exists() check would skip on the next run.
    tmp = dest.with_name(dest.name + ".part")
    with gzip.open(tmp, "wt", encoding="utf-8") as fh:
        json.dump(out, fh)
    tmp.replace(dest)

async def main():
    """Fetch settlement periods 1-50 for every day in the range, all at once.

    Builds one `fetch_and_save` coroutine per (day, settlement period) for
    [START_DATE, END_DATE], awaits them together and prints how many ran.
    """
    day_count = (END_DATE - START_DATE).days + 1  # inclusive range
    connector = aiohttp.TCPConnector(limit=CONCURRENCY, ttl_dns_cache=300)
    async with aiohttp.ClientSession(connector=connector, timeout=TIMEOUT) as sess:
        tasks = [
            fetch_and_save(sess, START_DATE + timedelta(days=offset), sp)
            for offset in range(day_count)
            for sp in range(1, 51)  # SP 1–50
        ]

        await asyncio.gather(*tasks)
        print(f"✅ Completed all {len(tasks)} fetches")


## Settlement Bid-Offer Stacks by Settlement Period (ISPSTACK)

In [None]:
import asyncio
import aiohttp
import gzip
import json
import time
from datetime import datetime, timedelta, date
from aiolimiter import AsyncLimiter
from pathlib import Path
from statistics import mean

# Configuration
# Output directory for this crawler; files are named
# `<bid|offer>_<date>_<sp>.json.gz`.
FINAL_DIR    = BASE_DIR / "ISPSTACK"
FINAL_DIR.mkdir(exist_ok=True, parents=True)

# Limits
# Same values as in the Setup cell; re-running this cell replaces the shared
# limiter objects with fresh ones.
PER_MINUTE   = AsyncLimiter(4920, 60)
PER_SECOND   = AsyncLimiter(82, 1)
TIMEOUT      = aiohttp.ClientTimeout(sock_connect=3, sock_read=5)
CONCURRENCY  = 128

# Stats (per-request latencies in seconds, completed request count, run start)
latencies = []
total_requests = 0
start_time_global = None

async def fetch_stack(sess: aiohttp.ClientSession, bid_offer_type: str, day: date, sp: int):
    """Fetch one settlement stack and save it as gzipped JSON.

    Args:
        sess: Open aiohttp session.
        bid_offer_type: Path segment selecting the stack (``fetch_both``
            passes ``"bid"`` and ``"offer"``).
        day: Settlement date.
        sp: Settlement period number.

    Nothing is fetched when ``FINAL_DIR/<type>_<date>_<sp>.json.gz`` already
    exists, and nothing is written when the response has no ``data``. Raises
    on any HTTP error status.
    """
    global total_requests
    settlement_date = day.strftime("%Y-%m-%d")
    settlement_period = str(sp)
    fname = f"{bid_offer_type}_{settlement_date}_{settlement_period}.json.gz"
    dest = FINAL_DIR / fname
    if dest.exists():
        return

    async with PER_SECOND, PER_MINUTE:
        start_time = time.perf_counter()
        url = f"{BASE_URL}/balancing/settlement/stack/all/{bid_offer_type}/{settlement_date}/{settlement_period}"
        async with sess.get(url) as resp:
            resp.raise_for_status()
            payload = await resp.json()
        elapsed = time.perf_counter() - start_time
        latencies.append(elapsed)
        total_requests += 1

    data = payload.get("data", [])
    if not data:
        return
    out = {"metadata": payload.get("metadata", {}), "data": data}
    # Write to a temp file and rename, so an interrupted run never leaves a
    # truncated ``dest`` that the exists() check would skip on the next run.
    tmp = dest.with_name(dest.name + ".part")
    with gzip.open(tmp, "wt", encoding="utf-8") as fh:
        json.dump(out, fh)
    tmp.replace(dest)

async def fetch_both(sess: aiohttp.ClientSession, day: date, sp: int):
    """Fetch the bid stack and the offer stack of one settlement period concurrently."""
    pending = [fetch_stack(sess, side, day, sp) for side in ("bid", "offer")]
    await asyncio.gather(*pending)

async def progress_report():
    """Print a statistics header once and a new row every 15 seconds.

    Figures come from the module-level ``latencies``, ``total_requests`` and
    ``start_time_global``. The coroutine never finishes by itself; the caller
    cancels it.
    """
    # Print header once
    print(f"{'Time':>8} | {'Reqs':>8} | {'Avg Lat':>8} | {'Max Lat':>8} | {'Thr/sec':>8} | {'Thr/min':>8}")
    print("-" * 62)
    while True:
        await asyncio.sleep(15)

        # Latency figures stay at 0 until the first request has completed.
        if latencies:
            avg_lat = mean(latencies)
            max_lat = max(latencies)
        else:
            avg_lat = 0
            max_lat = 0

        elapsed = (datetime.now() - start_time_global).total_seconds()
        tput_sec = 0
        if elapsed > 0:
            tput_sec = total_requests / elapsed
        tput_min = tput_sec * 60

        clock = datetime.now().strftime('%H:%M:%S')
        row = (f"{clock:>8} | "
               f"{total_requests:8d} | "
               f"{avg_lat:8.3f} | "
               f"{max_lat:8.3f} | "
               f"{tput_sec:8.2f} | "
               f"{tput_min:8.0f}")
        print(row)

async def main():
    """Crawl bid and offer stacks for settlement periods 1-50 of every day.

    Days in [START_DATE, END_DATE] are processed one after another; all
    requests of a day run concurrently. The progress reporter is always
    cancelled, including when a fetch raises. Prints a summary at the end.
    """
    global start_time_global, total_requests
    # Reset the counters so a re-run in the same kernel reports only this run.
    latencies.clear()
    total_requests = 0
    start_time_global = datetime.now()
    connector = aiohttp.TCPConnector(limit=CONCURRENCY, ttl_dns_cache=300)
    async with aiohttp.ClientSession(connector=connector, timeout=TIMEOUT) as sess:
        reporter = asyncio.create_task(progress_report())
        try:
            day = START_DATE
            while day <= END_DATE:
                tasks = [fetch_both(sess, day, sp) for sp in range(1, 51)]
                await asyncio.gather(*tasks)
                day += timedelta(days=1)
        finally:
            # Without this the reporter would keep printing forever after a
            # failed fetch, because the notebook's event loop stays alive.
            reporter.cancel()

    elapsed_run = (datetime.now() - start_time_global).total_seconds()
    avg_lat = mean(latencies) if latencies else 0
    max_lat = max(latencies) if latencies else 0
    tput_sec = total_requests / elapsed_run if elapsed_run > 0 else 0
    tput_min = tput_sec * 60

    print("\nFinal Summary")
    print("-" * 62)
    print(f"{'Total reqs:':<15}{total_requests}")
    print(f"{'Avg latency:':<15}{avg_lat:.3f} s")
    print(f"{'Max latency:':<15}{max_lat:.3f} s")
    print(f"{'Thr/sec:':<15}{tput_sec:.2f} req/sec")
    print(f"{'Thr/min:':<15}{tput_min:.0f} req/min")

# RUN

In [14]:
# Start the crawl: awaits whichever `main` coroutine was defined by the most
# recently executed cell (top-level `await` relies on the notebook kernel).
await main()

    Time |     Reqs |  Avg Lat |  Max Lat |  Thr/sec |  Thr/min
--------------------------------------------------------------
04:28:36 |     1292 |    0.031 |    0.149 |    86.12 |     5167

Final Summary
--------------------------------------------------------------
Total reqs:    1652
Avg latency:   0.029 s
Max latency:   0.149 s
Thr/sec:       85.36 req/sec
Thr/min:       5122 req/min
