In [1]:
from pathlib import Path
import datetime
import json
import gzip
import gc
from itertools import chain
from zoneinfo import ZoneInfo
import pandas as pd

# DATA PRE-PROCESSING

## BUILDING THE FINAL .csv FROM .json

* **Imports & configuration** – import `json`, `gzip`, `pathlib`, `itertools`, `pandas`, `tqdm`, `numpy`; set `RAW_DIR`, `CSV_DIR`, and the global `start_date`/`end_date`.

* **Raw JSON loading** – `_load_json_z` reads a single `.json.gz`; `_folder_to_df` concatenates all files in a folder into one DataFrame.

* **Timestamp construction** – `build_start_time` returns a UTC‐like `startTime` column, either from an existing field or by combining `settlementDate`+`settlementPeriod`.

* **Type coercion** – `_coerce_type` downcasts non-key columns to `float32` and normalises the date/time columns to naïve `datetime64[ns]`.

* **Small-gap interpolation** – `fill_small_gaps` linearly fills runs of up to two missing half-hours in numeric columns, leaving longer gaps untouched.

* **UK half-hour calendar** – `build_uk_halfhour_calendar` generates a full DST-aware sequence of half-hour intervals between any two dates.

* **Padding missing intervals** – `_pad_missing` merges data onto the full calendar (trimming to the global date range) so every expected interval appears.

* **Finalising pipeline** – `_finish` selects the requested columns, coerces types, pads missing, drops duplicates, interpolates small gaps, then sorts and resets the index.

* **Dataset builders** – each `b_<dataset>` function (e.g. `b_actual_demand`, `b_gen_per_type`, `b_system_prices`, etc.) computes `startTime`, selects its own `want` column list, and hands off to `_finish` to produce the final CSV.


In [27]:
# Set directories
RAW_DIR     = Path("bmrs_json_raw")
CSV_DIR     = Path("bmrs_csv_raw")
LOG_DIR     = Path("logs")

CSV_DIR.mkdir(exist_ok=True)
LOG_DIR.mkdir(exist_ok=True)


# Date range
start_date = "2017-01-01"
end_date   = "2025-06-30"



def _load_json_z(path: Path) -> list[dict]:
    with gzip.open(path, "rt") as fh:
        return json.load(fh)["data"]


def _folder_to_df(folder: Path) -> pd.DataFrame:
    files = sorted(folder.glob("*.json.gz"))
    rows  = chain.from_iterable((_load_json_z(f) for f in files))
    return pd.DataFrame.from_records(rows)


def build_start_time(df: pd.DataFrame) -> pd.Series:
    """Return UTC-like timestamp (start of settlement period)."""
    if "startTime" in df.columns:
        return pd.to_datetime(df["startTime"], errors="coerce")
    # otherwise compose from date + SP (SP1 = 00:00 UTC *winter*)
    base = pd.to_datetime(df["settlementDate"])
    off  = pd.to_timedelta(df["settlementPeriod"].astype(int).sub(1) * 30,
                           unit="m")
    return base + off




# ──────────────────────────────────────────────────────────────────────────
# ──────────────────────────────── finisher ────────────────────────────────
# ──────────────────────────────────────────────────────────────────────────

KEY_COLS = {"startTime", "settlementDate", "settlementPeriod"}

def _coerce_type(df: pd.DataFrame) -> pd.DataFrame:
    """
    • convert every non-key column to float32
    • normalise datetime columns
    """
    # 1) numeric columns  → float32
    num_cols = [c for c in df.columns if c not in KEY_COLS]
    df[num_cols] = df[num_cols].apply(
        pd.to_numeric, errors="coerce", downcast="float"
    )

    # 2) settlementDate  → 00:00 of that day, no timezone
    df["settlementDate"] = (
        pd.to_datetime(df["settlementDate"], utc=True)   # ensure tz-aware
          .dt.normalize()                                # strip hh:mm:ss
          .dt.tz_localize(None)                          # drop timezone
    )

    # 3) startTime  → no timezone (but keep hh:mm)
    if "startTime" in df.columns:
        df["startTime"] = (
            pd.to_datetime(df["startTime"], utc=True)
              .dt.tz_localize(None)
        )

    # settlementPeriod stays int32
    if "settlementPeriod" in df.columns:
        df["settlementPeriod"] = df["settlementPeriod"].astype("int32")

    return df

# ──────────────────────────────────────────────────────────────────────────

def build_uk_halfhour_calendar(start_date, end_date):
    """
    Build UK half-hour calendar with correct DST handling:
      • Spring-forward days: 46 periods (including the skipped 01:00/01:30)
      • Normal days: 48 periods 00:00-23:30
      • BST days: 48 periods 23:00(prev day)-22:30
      • Autumn-back days: 50 periods 23:00(prev day)-22:30
    """

    def _to_date(x):
        if isinstance(x, str):
            
            if x.count("-") == 2 and x[4] == "-": # ISO format
                return datetime.date.fromisoformat(x)
            return datetime.datetime.strptime(x, "%d/%m/%Y").date() # UK format
        if isinstance(x, pd.Timestamp):
            return x.date()
        return x

    start = _to_date(start_date)
    end   = _to_date(end_date)

    london = ZoneInfo("Europe/London")
    utc    = ZoneInfo("UTC")
    rows   = []

    for single in pd.date_range(start, end, freq="D"):
        D   = single.date()
        # local midnights in London
        dt0 = datetime.datetime(D.year, D.month, D.day, tzinfo=london)
        dt1 = dt0 + datetime.timedelta(days=1)

        # number of half-hours that actually occur
        total_secs = (dt1.astimezone(utc) - dt0.astimezone(utc)).total_seconds()
        n_periods = int(total_secs // 1800)

        # align to UTC-naive base for SP1
        offset_h = dt0.utcoffset().total_seconds() / 3600
        if offset_h > 0:
            base = datetime.datetime(D.year, D.month, D.day) - datetime.timedelta(hours=int(offset_h))
        else:
            base = datetime.datetime(D.year, D.month, D.day)

        for i in range(n_periods):
            rows.append({
                "startTime":        base + datetime.timedelta(minutes=30 * i),
                "settlementDate":   D,
                "settlementPeriod": i + 1
            })

    df = pd.DataFrame(rows)

    # ─── coerce to pandas time types ───
    df["startTime"]      = pd.to_datetime(df["startTime"])
    df["settlementDate"] = pd.to_datetime(df["settlementDate"]).dt.normalize()
    df["settlementPeriod"] = df["settlementPeriod"].astype("int32")
    # ───────────────────────────────────

    return df


# ──────────────────────────────────────────────────────────────────────────

def _pad_missing(df: pd.DataFrame) -> pd.DataFrame:
    """
    For each settlementDate in df, build exactly the UK
    SP‐calendar via build_uk_halfhour_calendar(min,max), then
    left‐merge your data on (Date,Period,startTime).
    Also trims the input df to only include rows between start_date and end_date (inclusive).
    """
    if df.empty:
        return df

    df = df.copy()
    df=df.drop_duplicates(subset=["startTime"])
    # ensure proper types
    df["settlementDate"]   = pd.to_datetime(df["settlementDate"]).dt.normalize()
    df["settlementPeriod"] = df["settlementPeriod"].astype(int)
    df["startTime"]        = pd.to_datetime(df["startTime"])

    lo = start_date
    hi = end_date

    # trim input df to only include rows between start_date and end_date (inclusive)
    mask = (
        (df["settlementDate"] >= pd.to_datetime(start_date)) &
        (df["settlementDate"] <= pd.to_datetime(end_date))
    )
    df = df.loc[mask]

    # build the master calendar
    cal = build_uk_halfhour_calendar(lo, hi)
    cal["settlementDate"]   = pd.to_datetime(cal["settlementDate"])
    cal["settlementPeriod"] = cal["settlementPeriod"].astype(int)
    cal["startTime"]        = pd.to_datetime(cal["startTime"], dayfirst=True)

    # left‐join your actual data onto the calendar
    out = (
        cal
        .merge(df,
               on=["settlementDate","settlementPeriod","startTime"],
               how="left",
               sort=False)
    )

    return out

# ──────────────────────────────────────────────────────────────────────────

def _finish(out: pd.DataFrame, want: list[str]) -> pd.DataFrame:
    # 1) keep only requested columns
    out = out[want]
    # 2) downcast
    out = _coerce_type(out)
    # 3) pad missing with DST‐aware UK calendar
    out = _pad_missing(out)
    # 4) drop duplicate rows
    out = out.drop_duplicates()
    
    return out.sort_values("startTime").reset_index(drop=True)




# ──────────────────────────────────────────────────────────────────────────
# ──────────────────────────────── builders ────────────────────────────────
# ──────────────────────────────────────────────────────────────────────────

def b_actual_demand(df):
    df["startTime"] = build_start_time(df)
    want = ["startTime","settlementDate","settlementPeriod",
            "initialDemandOutturn",
            "initialTransmissionSystemDemandOutturn"]
    out = df[want]
    return _finish(out, want)

# ──────────────────────────────────────────────────────────────────────────
def _pivot_wind_solar(df, value_name):
    map_ = {
        "Wind Onshore" : "windOnshoreGeneration",
        "Wind Offshore": "windOffshoreGeneration",
        "Solar"        : "solarGeneration",
    }
    df  = df.replace({"psrType": map_})
    out = (df.pivot_table(index=["settlementDate","settlementPeriod","startTime"],
                          columns="psrType",
                          values=value_name, aggfunc="first")
             .reset_index())
    out["startTime"] = build_start_time(out)
    for c in map_.values():
        if c not in out.columns:
            out[c] = pd.NA
    return out

def b_actual_gen_ws(df):
    tidy = _pivot_wind_solar(df.rename(columns={"quantity":"gen"}), "gen")
    want = ["startTime","settlementDate","settlementPeriod",
            "windOnshoreGeneration","windOffshoreGeneration",
            "solarGeneration"]
    out = tidy[want]
    return _finish(out, want)

# ──────────────────────────────────────────────────────────────────────────

def _pivot_wind_solar_forecast(df, process_type, value_name):
    map_ = {
        "Wind Onshore":  "windOnshoreGeneration",
        "Wind Offshore": "windOffshoreGeneration",
        "Solar":         "solarGeneration",
    }
    df = (
        df.loc[df["processType"] == process_type]
          .rename(columns={"quantity": value_name})
          .replace({"psrType": map_})
    )
    out = (
        df.pivot_table(
            index=["settlementDate", "settlementPeriod", "startTime", "publishTime"],
            columns="psrType",
            values=value_name,
            aggfunc="first"
        )
        .reset_index()
    )
    out["startTime"] = build_start_time(out)
    for col in map_.values():
        if col not in out.columns:
            out[col] = pd.NA
    return out

def b_dayahead_gen_ws(df):
    tidy = _pivot_wind_solar_forecast(df, "Day ahead", "forecast")
    want = [
        "startTime", "settlementDate", "settlementPeriod", "publishTime",
        "windOnshoreGeneration", "windOffshoreGeneration", "solarGeneration"
    ]
    return _finish(tidy[want], want)

def b_intradayprocess_gen_ws(df):
    tidy = _pivot_wind_solar_forecast(df, "Intraday process", "forecast")
    want = [
        "startTime", "settlementDate", "settlementPeriod", "publishTime",
        "windOnshoreGeneration", "windOffshoreGeneration", "solarGeneration"
    ]
    return _finish(tidy[want], want)

def b_intradaytotal_gen_ws(df):
    tidy = _pivot_wind_solar_forecast(df, "Intraday total", "forecast")
    want = [
        "startTime", "settlementDate", "settlementPeriod", "publishTime",
        "windOnshoreGeneration", "windOffshoreGeneration", "solarGeneration"
    ]
    return _finish(tidy[want], want)

# ──────────────────────────────────────────────────────────────────────────

def _folder_to_df_detailed(folder: Path) -> pd.DataFrame:
    """
    Like _folder_to_df, but also injects `publishSP` from each filename:
      bmrs_json_raw/DETAILED_WINDFOR/YYYY-MM-DD_<SP>.json.gz
    """
    rows = []
    for f in sorted(folder.glob("*.json.gz")):
        name = f.name  # e.g. "2017-01-01_10.json.gz"
        # strip off the ".json.gz" bit, then split on "_"
        base = name[:-len(".json.gz")]            # → "2017-01-01_10"
        sp   = int(base.split("_", 1)[1])         # → 10
        # … load JSON …
        payload = json.load(gzip.open(f, "rt"))
        for rec in payload.get("data", []):
           rec["publishSP"] = sp
           rows.append(rec)
    return pd.DataFrame.from_records(rows)

def b_windfor(df):
    """
    Intraday wind forecasts with 8 columns f_1…f_8,
    where f_1 is the latest run ≥1h before delivery,
    f_2 the next‐latest, …, f_8 the 8th‐latest.
    """
    want = ["startTime","settlementDate","settlementPeriod"] + [f"f_{i}" for i in range(1,9)]

    if df.empty:
        return pd.DataFrame(columns=want)

    # 1) parse
    df["publishTimeUTC"] = pd.to_datetime(df["publishTime"], utc=True)
    df["startTime"]      = pd.to_datetime(df["startTime"],   utc=True)

    # 2) gate‐closure: only keep runs published ≥1h before delivery
    df = df[df["publishTimeUTC"] <= df["startTime"] - pd.Timedelta(hours=1)]

    # 3) sort by slot & publishTime descending
    df = df.sort_values(
        ["settlementDate","settlementPeriod","startTime","publishTimeUTC"],
        ascending=[True, True, True, False]
    )

    # 4) rank each slot’s forecasts 1…n by recency
    df["rank"] = (
        df
        .groupby(["settlementDate","settlementPeriod","startTime"])
        .cumcount()
        + 1
    )

    # 5) keep only the top 8 for each slot
    df8 = df[df["rank"] <= 8]

    # 6) pivot so rank → f_<rank>
    wide = (
        df8.pivot_table(
            index=["settlementDate","settlementPeriod","startTime"],
            columns="rank",
            values="generation",
            aggfunc="first"
        )
        .reset_index()
    )

    # 7) rename and ensure f_1…f_8 exist
    for i in range(1,9):
        col = f"f_{i}"
        if i in wide.columns:
            wide = wide.rename(columns={i: col})
        else:
            wide[col] = pd.NA

    # 8) rebuild startTime (UTC→naive half‐hour)
    wide["startTime"] = build_start_time(wide)

    # 9) finish (dtype coercion, calendar padding, dedupe, sort)
    return _finish(wide[want], want)





# ──────────────────────────────────────────────────────────────────────────

GEN_MAP = {
    "Hydro Pumped Storage":"hydroPumpedStorage",
    "Fossil Hard coal"    :"fossilCoal",
    "Fossil Gas"          :"fossilGas",
    "Fossil Oil"          :"fossilOil",
    "Nuclear"             :"nuclear",
    "Other"               :"other",
    "Wind Onshore"        :"windOnshore",
    "Wind Offshore"       :"windOffshore",
    "Solar"               :"solar",
}
def b_gen_per_type(df):
    df = (df.replace({"psrType": GEN_MAP})
            .pivot_table(index=["settlementDate","settlementPeriod","startTime"],
                         columns="psrType",
                         values="quantity", aggfunc="first")
            .reset_index())
    df["startTime"] = build_start_time(df)
    for col in GEN_MAP.values():
        if col not in df.columns:
            df[col] = pd.NA
    want = ["startTime","settlementDate","settlementPeriod"]+list(GEN_MAP.values())
    out = df[want]
    return _finish(out, want)

# ──────────────────────────────────────────────────────────────────────────
def b_dayahead_demand(df):
    df["startTime"] = build_start_time(df)
    want = ["startTime", "settlementDate", "settlementPeriod",
            "transmissionSystemDemand", "nationalDemand"]
    out = df[want]
    return _finish(out, want)


# ──────────────────────────────────────────────────────────────────────────
def b_indicated(df):
    df["startTime"] = build_start_time(df)
    want = ["startTime", "settlementDate", "settlementPeriod",
            "indicatedGeneration", "indicatedDemand",
            "indicatedMargin", "indicatedImbalance"]
    out = df[want]
    return _finish(out, want)


# ──────────────────────────────────────────────────────────────────────────
IC_NAME_TO_COL = {
    "Eleclink (INTELEC)"      : "INTELEC",
    "Ireland(East-West)"      : "INTEW",
    "France(IFA)"             : "INTFR",
    "Ireland (Greenlink)"     : "INTGRNL",
    "IFA2 (INTIFA2)"          : "INTIFA2",
    "Northern Ireland(Moyle)" : "INTIRL",
    "Netherlands(BritNed)"    : "INTNED",
    "Belgium (Nemolink)"      : "INTNEM",
    "North Sea Link (INTNSL)" : "INTNSL",
    "Denmark (Viking link)"   : "INTVKL",
}

def b_inter(df):
    df = df.replace({"interconnectorName": IC_NAME_TO_COL})

    df = (df.pivot_table(index=["settlementDate", "settlementPeriod", "startTime"],
                         columns="interconnectorName",
                         values="generation",
                         aggfunc="first")
            .reset_index())

    df["startTime"] = build_start_time(df)

    for col in IC_NAME_TO_COL.values():
        if col not in df.columns:
            df[col] = pd.NA
    

    want = ["startTime", "settlementDate", "settlementPeriod"] + list(IC_NAME_TO_COL.values())
    out = df[want]
    return _finish(out, want)


# ──────────────────────────────────────────────────────────────────────────
def b_mid(df):
    df = df.loc[df["dataProvider"] == "APXMIDP"].copy()
    df["startTime"] = build_start_time(df)
    want = ["startTime", "settlementDate", "settlementPeriod",
            "price", "volume"]
    return _finish(df[want], want)

# ──────────────────────────────────────────────────────────────────────────
def b_nonbm(df):
    df["startTime"] = build_start_time(df)
    want = ["startTime", "settlementDate", "settlementPeriod",
            "generation"]
    return _finish(df[want], want)

# ──────────────────────────────────────────────────────────────────────────
HORIZONS = [1, 2, 4, 8, 12]

def b_lolpdrm(df):
    df["startTime"] = build_start_time(df)

    # keep only horizons we care about
    df = df.loc[df["forecastHorizon"].isin(HORIZONS),
                ["startTime", "settlementDate", "settlementPeriod",
                 "forecastHorizon", "lossOfLoadProbability",
                 "deratedMargin"]]

    # ----------  LOLP (horizon 1)  ----------
    lolp = (df[df["forecastHorizon"] == 1]
              .rename(columns={"lossOfLoadProbability": "1hLOLP"})
              .loc[:, ["startTime", "settlementDate",
                       "settlementPeriod", "1hLOLP"]])

    # ----------  DRM (pivot all horizons)  ----------
    drm = (df.pivot_table(index=["startTime", "settlementDate",
                                 "settlementPeriod"],
                          columns="forecastHorizon",
                          values="deratedMargin")
             .rename(columns={h: f"{h}hDRM" for h in HORIZONS})
             .reset_index())

    # ----------  merge & order columns  ----------
    out = lolp.merge(drm, on=["startTime", "settlementDate",
                              "settlementPeriod"])

    want = ["startTime", "settlementDate", "settlementPeriod",
            "1hLOLP", "1hDRM"]
    out = out[want]
    return _finish(out, want)


# ──────────────────────────────────────────────────────────────────────────
def b_system_prices(df):
    df["startTime"] = build_start_time(df)

    df = df.rename(columns={
        "systemSellPrice"    : "systemPrice",      # SSP / SBP
        "netImbalanceVolume" : "netImbalanceVolume"
    })

    want = ["startTime", "settlementDate", "settlementPeriod",
            "systemPrice", "netImbalanceVolume",
            "sellPriceAdjustment", "buyPriceAdjustment",
            "replacementPrice", "replacementPriceReferenceVolume",
            "totalAcceptedOfferVolume", "totalAcceptedBidVolume",
            "totalAdjustmentSellVolume", "totalAdjustmentBuyVolume",
            "totalSystemTaggedAcceptedOfferVolume",
            "totalSystemTaggedAcceptedBidVolume",
            "totalSystemTaggedAdjustmentSellVolume",
            "totalSystemTaggedAdjustmentBuyVolume"]

    # create any missing columns so _finish keeps dtype order
    for col in want:
        if col not in df.columns:
            df[col] = pd.NA

    out = df[want]
    return _finish(out, want)


In [28]:
# BUILDERS = {
#     "ACTUAL_DEMAND"                  : b_actual_demand,
#     "ACTUAL_GEN_WIND_SOLAR"          : b_actual_gen_ws,
#     "DAYAHEAD_DEMAND"                : b_dayahead_demand,
#     "DETAILED_WINDFOR"               : b_windfor,
#     "DAYAHEAD_GEN_WIND_SOLAR"        : b_dayahead_gen_ws,
#     "INTRADAYPROCESS_GEN_WIND_SOLAR" : b_intradayprocess_gen_ws,
#     "INTRADAYTOTAL_GEN_WIND_SOLAR"   : b_intradaytotal_gen_ws,
#     "GEN_PER_TYPE"                   : b_gen_per_type,
#     "INDICATED_DAYAHEAD_DEMAND"      : b_indicated,
#     "INTER"                          : b_inter,
#     "LOLPDRM"                        : b_lolpdrm,
#     "NONBM"                          : b_nonbm,
#     "MID"                            : b_mid,
#     "SYSTEM_PRICES"                  : b_system_prices,
# }

BUILDERS = {
    "DAYAHEAD_GEN_WIND_SOLAR"        : b_dayahead_gen_ws,
    "INTRADAYPROCESS_GEN_WIND_SOLAR" : b_intradayprocess_gen_ws,
    "INTRADAYTOTAL_GEN_WIND_SOLAR"   : b_intradaytotal_gen_ws,
}


def process_one(code: str, builder):
    folder = RAW_DIR / code
    if code == "DETAILED_WINDFOR":
        df_raw = _folder_to_df_detailed(folder)
    else:
        df_raw = _folder_to_df(folder)
    if df_raw.empty:
        print(f"⚠ {code}: empty → skipped")
        return None
    df_tidy = builder(df_raw)
    out = CSV_DIR / f"{code}.csv"
    df_tidy.to_csv(out, index=False)
    print(f"✓ {code}: {len(df_tidy):,} rows → {out}")
    return df_tidy


def main():
    # # placeholders to capture the two dataframes
    # actual_ws   = None
    # gen_per_type = None

    # 1) run all builders and write CSVs
    for code, builder in BUILDERS.items():
        result = process_one(code, builder)
        # if code == "ACTUAL_GEN_WIND_SOLAR":
        #     actual_ws = result.copy() if result is not None else None
        # elif code == "GEN_PER_TYPE":
        #     gen_per_type = result.copy() if result is not None else None

    # # 2) mutual fill between those two
    # if actual_ws is not None and gen_per_type is not None:
    #     for gen_col, act_col in [
    #         ("windOffshore",           "windOffshoreGeneration"),
    #         ("windOnshore",            "windOnshoreGeneration"),
    #         ("solar",                  "solarGeneration")
    #     ]:
    #         # fill actual from gen_per_type
    #         actual_ws[act_col] = actual_ws[act_col].combine_first(
    #                                  gen_per_type[gen_col]
    #                              )
    #         # fill gen_per_type from actual
    #         gen_per_type[gen_col] = gen_per_type[gen_col].combine_first(
    #                                     actual_ws[act_col]
    #                                 )

    #     # 3) overwrite the two CSVs
    #     actual_ws.to_csv(CSV_DIR/"ACTUAL_GEN_WIND_SOLAR.csv", index=False)
    #     gen_per_type.to_csv(CSV_DIR/"GEN_PER_TYPE.csv",       index=False)
    #     print("↺ Mutual fill applied to ACTUAL_GEN_WIND_SOLAR and GEN_PER_TYPE")

if __name__ == "__main__":
    pd.set_option("future.no_silent_downcasting", True)
    main()


✓ DAYAHEAD_GEN_WIND_SOLAR: 148,942 rows → bmrs_csv_raw/DAYAHEAD_GEN_WIND_SOLAR.csv
✓ INTRADAYPROCESS_GEN_WIND_SOLAR: 148,942 rows → bmrs_csv_raw/INTRADAYPROCESS_GEN_WIND_SOLAR.csv
✓ INTRADAYTOTAL_GEN_WIND_SOLAR: 148,942 rows → bmrs_csv_raw/INTRADAYTOTAL_GEN_WIND_SOLAR.csv


In [26]:
def _pivot_wind_solar_forecast_debug(df, process_type, value_name):
    # 1) Before filtering
    print(f"\n>>> raw df shape: {df.shape}")
    print("   processType values:", df["processType"].unique())
    print("   psrType values:    ", df["psrType"].unique())
    
    # 2) Filter by process_type
    mask = df["processType"] == process_type
    print(f"   – selecting processType == {process_type!r}: {mask.sum()} rows")
    df2 = df.loc[mask].rename(columns={"quantity": value_name})
    if df2.empty:
        print(f"   !!! no rows match process_type={process_type!r}")
        return pd.DataFrame(columns=["settlementDate","settlementPeriod","startTime","publishTime",
                                      "windOnshoreGeneration","windOffshoreGeneration","solarGeneration"])
    
    # 3) Show how many of each PSR you have
    print("   psrType counts in filtered:", df2["psrType"].value_counts().to_dict())
    
    # 4) Pivot
    out = (df2
           .pivot_table(
               index=["settlementDate","settlementPeriod","startTime","publishTime"],
               columns="psrType",
               values=value_name,
               aggfunc="first")
           .reset_index())
    print(f"   after pivot: {out.shape}")
    print("   pivot columns:", out.columns.tolist())
    
    # 5) Rebuild startTime, fill missing cols
    out["startTime"] = build_start_time(out)
    for psr, col in {
        "Wind Onshore": "windOnshoreGeneration",
        "Wind Offshore": "windOffshoreGeneration",
        "Solar":         "solarGeneration"
    }.items():
        if col not in out.columns:
            print(f"   – missing column {col}, inserting NaNs")
            out[col] = pd.NA

    return out

# then, for each builder, do:

raw = _folder_to_df(RAW_DIR/"DAYAHEAD_GEN_WIND_SOLAR")
da_debug = _pivot_wind_solar_forecast_debug(raw, "Day ahead", "forecast")
print("→ debug day-ahead:\n", da_debug.head(), da_debug.shape)

raw_int_pr = _folder_to_df(RAW_DIR/"INTRADAY_PROCESS_GEN_WIND_SOLAR")
ip_debug = _pivot_wind_solar_forecast_debug(raw_int_pr, "Intraday process", "forecast")
print("→ debug intraday-process:\n", ip_debug.head(), ip_debug.shape)

raw_int_total = _folder_to_df(RAW_DIR/"INTRADAY_TOTAL_GEN_WIND_SOLAR")
it_debug = _pivot_wind_solar_forecast_debug(raw_int_total, "Intraday total", "forecast")
print("→ debug intraday-total:\n", it_debug.head(), it_debug.shape)



>>> raw df shape: (562254, 8)
   processType values: ['Day ahead' 'Intraday process' 'Intraday total']
   psrType values:     ['Wind Offshore' 'Solar' 'Wind Onshore']
   – selecting processType == 'Day ahead': 431424 rows
   psrType counts in filtered: {'Wind Offshore': 143808, 'Solar': 143808, 'Wind Onshore': 143808}
   after pivot: (143808, 7)
   pivot columns: ['settlementDate', 'settlementPeriod', 'startTime', 'publishTime', 'Solar', 'Wind Offshore', 'Wind Onshore']
   – missing column windOnshoreGeneration, inserting NaNs
   – missing column windOffshoreGeneration, inserting NaNs
   – missing column solarGeneration, inserting NaNs
→ debug day-ahead:
 psrType settlementDate  settlementPeriod                 startTime  \
0           2017-01-01                 1 2017-01-01 00:00:00+00:00   
1           2017-01-01                 2 2017-01-01 00:30:00+00:00   
2           2017-01-01                 3 2017-01-01 01:00:00+00:00   
3           2017-01-01                 4 2017-01-01 01:

KeyError: 'processType'