In [1]:
from pathlib import Path
from datetime import datetime, timedelta, date
import gzip
import json
import pandas as pd
import re
from tqdm import tqdm

In [None]:
RAW_DIR = Path("bmrs_json_raw")
INPUT_DIR = RAW_DIR / "BIDOFFER_ACCEPTANCES"

PROCESSED_DIR = Path("bmrs_csv_filled")
OUTPUT_DIR = PROCESSED_DIR / "BIDOFFER_ACCEPTANCES.csv"

def load_boalf(input_dir: Path) -> pd.DataFrame:
    """
    Loads and flattens all .json.gz files in the input_dir.
    Returns a DataFrame with one row per BOALF entry.
    """
    all_rows = []
    files = sorted(input_dir.glob("*.json.gz"))
    
    # Sort by acceptanceTime after loading all data
    temp_rows = []
    
    for file in tqdm(files, desc="Loading BOALF files"):
        try:
            with gzip.open(file, 'rt', encoding='utf-8') as f:
                data = json.load(f)
            if isinstance(data, dict) and 'data' in data:
                entries = data['data']
            elif isinstance(data, list):
                entries = data
            else:
                print(f"Unexpected format in file: {file}")
                continue

            for entry in entries:
                row = {
                    'settlementPeriodFrom': entry.get('settlementPeriodFrom'),
                    'settlementPeriodTo': entry.get('settlementPeriodTo'),
                    'levelFrom': float(entry.get('levelFrom', 0)),
                    'levelTo': float(entry.get('levelTo', 0)),
                    'timeFrom': entry.get('timeFrom'),
                    'timeTo': entry.get('timeTo'),
                    'elexonBmUnit': entry.get('bmUnit'),
                    'nationalGridBmUnit': entry.get('nationalGridBmUnit'),
                    'acceptanceNumber': entry.get('acceptanceNumber'),
                    'acceptanceTime': entry.get('acceptanceTime'),
                    'deemedBoFlag': entry.get('deemedBoFlag', False),
                    'soFlag': entry.get('soFlag', False),
                    'storFlag': entry.get('storFlag', False),
                    'rrFlag': entry.get('rrFlag', False),
                }
                temp_rows.append(row)
        
        except Exception as e:
            print(f"Error processing file {file}: {e}")
            continue

    # Convert to DataFrame and sort by acceptanceTime
    df = pd.DataFrame(temp_rows)
    
    # Convert time columns to datetime and make timezone naive
    df['timeFrom'] = pd.to_datetime(df['timeFrom']).dt.tz_localize("None")
    df['timeTo'] = pd.to_datetime(df['timeTo']).dt.tz_localize(None)
    df['acceptanceTime'] = pd.to_datetime(df['acceptanceTime']).dt.tz_localize(None)
    
    # Sort by acceptanceTime ascending
    df = df.sort_values('acceptanceTime', ascending=True).reset_index(drop=True)

    return df

In [7]:
df_boalf = load_boalf(INPUT_DIR)
df_boalf.head()

Loading BOALF files: 100%|██████████| 70094/70094 [00:26<00:00, 2687.68it/s]


Unnamed: 0,settlementPeriodFrom,settlementPeriodTo,levelFrom,levelTo,timeFrom,timeTo,elexonBmUnit,nationalGridBmUnit,acceptanceNumber,acceptanceTime,deemedBoFlag,soFlag,storFlag,rrFlag
0,1,1,0.0,0.0,2021-06-29 23:01:00,2021-06-29 23:02:00,T_DINO-6,DINO-6,131996,2021-06-29 22:00:00,False,False,False,False
1,1,1,176.0,130.0,2021-06-29 23:19:00,2021-06-29 23:21:00,T_CARR-2,CARR-2,67550,2021-06-29 22:01:00,False,True,False,False
2,1,1,130.0,100.0,2021-06-29 23:21:00,2021-06-29 23:23:00,T_CARR-2,CARR-2,67550,2021-06-29 22:01:00,False,True,False,False
3,1,1,100.0,0.0,2021-06-29 23:23:00,2021-06-29 23:29:00,T_CARR-2,CARR-2,67550,2021-06-29 22:01:00,False,True,False,False
4,1,1,190.0,0.0,2021-06-29 23:05:00,2021-06-29 23:29:00,T_CDCL-1,CDCL-1,118989,2021-06-29 22:02:00,False,False,False,False


In [8]:
# Load BMU data
BMU_DIR = Path("other")
ORIGINAL_BMU_PATH = BMU_DIR / "BMU_Dataset.csv"
OUTPUT_BMU_PATH = BMU_DIR / "BMU_Dataset_Grouped.csv"
df_bmu = pd.read_csv(ORIGINAL_BMU_PATH)

# Define modular grouping dictionary
FUEL_GROUP_MAPPING = {
    'THERMAL_GAS': ['GAS', 'CCGT', 'OCGT'],
    'THERMAL_OTHER': ['COAL', 'DIESEL', 'BIOMASS'],
    'STORAGE_BATTERY': ['BATTERY'],
    'STORAGE_PSH': ['PS', 'PS '],
    'RENEWABLE_WIND': ['WIND'],
    'RENEWABLE_SOLAR': ['SOLAR'],
    'RENEWABLE_HYDRO': ['NPSHYD', 'TIDAL'],
    'INTERCONNECTOR': ['INTBN', 'INTIRL', 'INTEW', 'INTFR', 'INTNEM', 'INTGRNL', 'INTGBR'],
    'LOAD_RESPONSE': ['LOAD RESPONSE'],
    'NUCLEAR': ['NUCLEAR']
    # 'OTHER' will be handled as fallback
}

# Flatten the dictionary into a reverse lookup
fuel_to_group = {
    fuel_type: group
    for group, fuel_list in FUEL_GROUP_MAPPING.items()
    for fuel_type in fuel_list
}

# Apply grouping
df_bmu["FUEL_TYPE_CLEAN"] = df_bmu["REG_FUEL_TYPE"].map(fuel_to_group)
df_bmu["FUEL_TYPE_CLEAN"] = df_bmu["FUEL_TYPE_CLEAN"].fillna("OTHER")

# Save updated dataframe
df_bmu.to_csv(OUTPUT_BMU_PATH, index=False)
print("✓ FUEL_TYPE_CLEAN column added and saved.")


✓ FUEL_TYPE_CLEAN column added and saved.


In [9]:
df_boalf = pd.read_parquet("df_boalf.parquet")

# Load cleaned BMU fuel type mapping
df_bmu = pd.read_csv(BMU_DIR / "BMU_Dataset_Grouped.csv")

# Build a mapping from nationalGridBmUnit to FUEL_TYPE_CLEAN
bmu_fuel_map = df_bmu.set_index("nationalGridBmUnit")["FUEL_TYPE_CLEAN"].to_dict()

# Map onto df_boalf
df_boalf["fuelType"] = df_boalf["nationalGridBmUnit"].map(bmu_fuel_map)

missing = df_boalf["fuelType"].isna().sum()
print(f"✓ fuelType mapping complete. {missing} unmatched BMUs.")

# Drop rows where fuelType is missing
df_boalf = df_boalf.dropna(subset=['fuelType'])
print(f"✓ Dropped rows with missing fuelType. Remaining rows: {len(df_boalf)}")

# Save the processed DataFrame
df_boalf.to_parquet("df_boalf.parquet", index=False)
print("✓ Processed DataFrame saved to df_boalf.parquet.")

✓ fuelType mapping complete. 0 unmatched BMUs.
✓ Dropped rows with missing fuelType. Remaining rows: 9720481
✓ Processed DataFrame saved to df_boalf.parquet.


In [16]:
# Convert timeFrom and timeTo to UTC (they should already be UTC but this ensures consistency)
df_boalf['timeFrom'] = pd.to_datetime(df_boalf['timeFrom']).dt.tz_localize(None)
df_boalf['timeTo'] = pd.to_datetime(df_boalf['timeTo']).dt.tz_localize(None)

print("✓ timeFrom and timeTo columns converted")

✓ timeFrom and timeTo columns converted


In [19]:
from datetime import datetime, timedelta, date
from zoneinfo import ZoneInfo
import pandas as pd
from tqdm import tqdm
import numpy as np

def build_uk_halfhour_calendar(start_date, end_date):
    """
    Build UK half-hour calendar with correct DST handling:
      • Spring-forward days: 46 periods (including the skipped 01:00/01:30)
      • Normal days: 48 periods 00:00-23:30
      • BST days: 48 periods 23:00(prev day)-22:30
      • Autumn-back days: 50 periods 23:00(prev day)-22:30
    """

    def _to_date(x):
        if isinstance(x, str):
            
            if x.count("-") == 2 and x[4] == "-": # ISO format
                return date.fromisoformat(x)
            return datetime.strptime(x, "%d/%m/%Y").date() # UK format
        if isinstance(x, pd.Timestamp):
            return x.date()
        return x

    start = _to_date(start_date)
    end   = _to_date(end_date)

    london = ZoneInfo("Europe/London")
    utc    = ZoneInfo("UTC")
    rows   = []

    for single in pd.date_range(start, end, freq="D"):
        D   = single.date()
        # local midnights in London
        dt0 = datetime(D.year, D.month, D.day, tzinfo=london)
        dt1 = dt0 + timedelta(days=1)

        # number of half-hours that actually occur
        total_secs = (dt1.astimezone(utc) - dt0.astimezone(utc)).total_seconds()
        n_periods = int(total_secs // 1800)

        # align to UTC-naive base for SP1
        offset_h = dt0.utcoffset().total_seconds() / 3600
        if offset_h > 0:
            base = datetime(D.year, D.month, D.day) - timedelta(hours=int(offset_h))
        else:
            base = datetime(D.year, D.month, D.day)

        for i in range(n_periods):
            rows.append({
                "startTime":        base + timedelta(minutes=30 * i),
                "settlementDate":   D,
                "settlementPeriod": i + 1
            })

    df = pd.DataFrame(rows)

    # ─── coerce to pandas time types ───
    df["startTime"]      = pd.to_datetime(df["startTime"])
    df["settlementDate"] = pd.to_datetime(df["settlementDate"]).dt.normalize()
    df["settlementPeriod"] = df["settlementPeriod"].astype("int32")
    # ───────────────────────────────────

    return df

calendar = build_uk_halfhour_calendar(df_boalf["timeFrom"].min(), df_boalf["timeTo"].max())

def process_batches(df_boalf, calendar, days_per_batch=1):
    all_results = []

    start = calendar["settlementDate"].min().date()
    end = calendar["settlementDate"].max().date()
    batch_dates = pd.date_range(start, end, freq=f"{days_per_batch}D")

    for i in tqdm(range(len(batch_dates)), desc="Processing batches"):
        batch_start = batch_dates[i].date()
        batch_end = (batch_start + timedelta(days=days_per_batch - 1))
        batch_mask = (calendar["settlementDate"].dt.date >= batch_start) & (calendar["settlementDate"].dt.date <= batch_end)
        calendar_batch = calendar.loc[batch_mask].copy()

        if calendar_batch.empty:
            continue

        cal_start = calendar_batch["startTime"].min()
        cal_end = calendar_batch["startTime"].max() + timedelta(minutes=30)
        
        # Convert to UTC timezone-aware for comparison with df_boalf timestamps
        cal_start_utc = pd.to_datetime(cal_start).tz_localize(None)
        cal_end_utc = pd.to_datetime(cal_end).tz_localize(None)

        df_subset = df_boalf[(df_boalf["timeTo"] > cal_start_utc) & (df_boalf["timeFrom"] < cal_end_utc)].copy()
        if df_subset.empty:
            continue

        df_subset["duration"] = (df_subset["timeTo"] - df_subset["timeFrom"]).dt.total_seconds()
        df_subset["powerAvg"] = (df_subset["levelFrom"] + df_subset["levelTo"]) / 2

        rows = []
        for _, sp in calendar_batch.iterrows():
            sp_start = pd.to_datetime(sp["startTime"]).tz_localize(None)
            sp_end = sp_start + timedelta(minutes=30)

            mask = (df_subset["timeTo"] > sp_start) & (df_subset["timeFrom"] < sp_end)
            overlapping = df_subset.loc[mask].copy()

            if overlapping.empty:
                continue

            overlap_start = np.maximum(overlapping["timeFrom"].values.astype("datetime64[ns]"), np.datetime64(sp_start))
            overlap_end = np.minimum(overlapping["timeTo"].values.astype("datetime64[ns]"), np.datetime64(sp_end))
            overlap_secs = (overlap_end - overlap_start) / np.timedelta64(1, "s")

            mwh = (overlap_secs * overlapping["powerAvg"].values) / 3600  # MWh

            result = pd.DataFrame({
                "startTime": sp["startTime"],
                "settlementDate": sp["settlementDate"],
                "settlementPeriod": sp["settlementPeriod"],
                "fuelType": overlapping["fuelType"].values,
                "MWh": mwh
            })

            rows.append(result)

        if rows:
            all_results.append(pd.concat(rows))

    df_energy = pd.concat(all_results)
    return df_energy

df_energy = process_batches(df_boalf, calendar, days_per_batch=1)

Processing batches: 100%|██████████| 1464/1464 [01:12<00:00, 20.28it/s]


In [23]:
# Filter from settlementDate 2021-07-01 onwards
df_wide_filtered = df_wide[df_wide['settlementDate'] >= '2021-07-01'].copy()

# Group by SP and fuelType, then pivot to wide format
df_wide = (
    df_energy
    .groupby(["startTime", "settlementDate", "settlementPeriod", "fuelType"], as_index=False)
    .agg({"MWh": "sum"})
    .pivot(index=["startTime", "settlementDate", "settlementPeriod"], columns="fuelType", values="MWh")
    .fillna(0)
    .reset_index()
)

# Set startTime as index
df_wide.set_index("startTime", inplace=True)

# Normalize settlementDate to remove time component
df_wide["settlementDate"] = df_wide["settlementDate"].dt.normalize()

# Filter from settlementDate 2021-07-01 onwards
df_wide = df_wide[df_wide['settlementDate'] >= '2021-07-01']

# Round all float columns to 3 decimal places and convert to float32
float_cols = df_wide.select_dtypes(include=['float64']).columns
df_wide[float_cols] = df_wide[float_cols].round(3).astype('float32')

# Optional: sort the result
df_wide.sort_index(inplace=True)

display(df_wide)

# Save the wide format DataFrame
FINAL_OUTPUT_DIR = Path("bmrs_csv_filled")
df_wide.to_csv(FINAL_OUTPUT_DIR / "BIDOFFER_ACCEPTANCES.csv")

fuelType,settlementDate,settlementPeriod,LOAD_RESPONSE,OTHER,RENEWABLE_HYDRO,RENEWABLE_SOLAR,RENEWABLE_WIND,STORAGE_BATTERY,STORAGE_PSH,THERMAL_GAS,THERMAL_OTHER
startTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-06-30 23:00:00,2021-07-01,1,0.0,0.0,0.000,0.0,0.0,0.000000,152.500000,3439.225098,0.000000
2021-06-30 23:30:00,2021-07-01,2,0.0,0.0,0.000,0.0,0.0,0.000000,82.250000,4679.742188,0.000000
2021-07-01 00:00:00,2021-07-01,3,0.0,0.0,0.000,0.0,0.0,0.000000,61.367001,4549.083008,0.000000
2021-07-01 00:30:00,2021-07-01,4,0.0,0.0,0.000,0.0,0.0,0.000000,57.450001,4128.350098,0.000000
2021-07-01 01:00:00,2021-07-01,5,0.0,0.0,0.000,0.0,0.0,0.000000,0.000000,3284.791992,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
2025-06-30 21:30:00,2025-06-30,46,0.0,0.0,22.333,0.0,0.0,-77.958000,0.000000,4081.816895,1446.625000
2025-06-30 22:00:00,2025-06-30,47,0.0,0.0,8.467,0.0,0.0,-11.525000,2.700000,2365.616943,1272.958008
2025-06-30 22:30:00,2025-06-30,48,0.0,0.0,3.975,0.0,0.0,-17.799999,1.350000,2533.392090,613.375000
2025-06-30 23:00:00,2025-07-01,1,0.0,0.0,0.000,0.0,0.0,0.000000,0.000000,1261.625000,0.000000
