In [1]:
from google.colab import drive
from pathlib import Path
import os

drive.mount("/content/drive")

PROJECT = Path("/content/drive/MyDrive/gb-flex-desk")  # change name if you want
PROJECT.mkdir(parents=True, exist_ok=True)
os.chdir(PROJECT)

Path("data/raw").mkdir(parents=True, exist_ok=True)
Path("data/processed").mkdir(parents=True, exist_ok=True)

print("cwd:", Path.cwd())
print("prices exists?", Path("data/processed/prices.parquet").exists())

Mounted at /content/drive
cwd: /content/drive/MyDrive/gb-flex-desk
prices exists? True


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import HistGradientBoostingRegressor

# Load the combined history built in Notebook 2
prices = pd.read_parquet("data/processed/prices.parquet").sort_values("ts").reset_index(drop=True)

# SBP and SSP are the same under single imbalance pricing, so use one series
prices["price"] = prices["sbp"]

# Target: next settlement period price
prices["y"] = prices["price"].shift(-1)

# Lag features (simple memory of recent prices)
prices["lag1"] = prices["price"].shift(1)
prices["lag2"] = prices["price"].shift(2)
prices["lag48"] = prices["price"].shift(48)  # same time yesterday

# Time features
prices["hour"] = prices["ts"].dt.hour
prices["dow"] = prices["ts"].dt.dayofweek

# Drop rows that don't have enough history for the lags/target
df = prices.dropna().copy()

# Baseline: yesterday same time
baseline_mae = mean_absolute_error(df["y"], df["lag48"])
print("Baseline MAE (£/MWh):", baseline_mae)

# Model inputs
features = ["lag1", "lag2", "lag48", "hour", "dow"]
X = df[features]
y = df["y"]

# Time-aware cross-validation (no future leakage)
tscv = TimeSeriesSplit(n_splits=5)
fold_mae = []

for fold, (tr, te) in enumerate(tscv.split(X), start=1):
    m = HistGradientBoostingRegressor(max_depth=4, learning_rate=0.05, random_state=0)
    m.fit(X.iloc[tr], y.iloc[tr])
    pred = m.predict(X.iloc[te])
    mae = mean_absolute_error(y.iloc[te], pred)
    fold_mae.append(mae)
    print(f"Fold {fold} MAE: {mae:.2f} £/MWh")

print("Average model MAE (£/MWh):", float(np.mean(fold_mae)))
print("Improvement vs baseline (£/MWh):", baseline_mae - float(np.mean(fold_mae)))


Baseline MAE (£/MWh): 40.563735306462185
Fold 1 MAE: 25.02 £/MWh
Fold 2 MAE: 22.71 £/MWh
Fold 3 MAE: 26.61 £/MWh
Fold 4 MAE: 25.67 £/MWh
Fold 5 MAE: 24.60 £/MWh
Average model MAE (£/MWh): 24.92071947806254
Improvement vs baseline (£/MWh): 15.643015828399644


In [6]:
from sklearn.ensemble import HistGradientBoostingRegressor
import pandas as pd

# Fit one final model on all available data
m = HistGradientBoostingRegressor(max_depth=4, learning_rate=0.05, random_state=0)
m.fit(X, y)

# Predict next-period price for every row in df
df["pred"] = m.predict(X)

# Build a clean file for Notebook 5
# Use the actual current-period price as both sbp and ssp (single price)
preds = pd.DataFrame({
    "ts": df["ts"],
    "sbp": df["price"],   # actual price at time ts
    "ssp": df["price"],   # same as sbp here
    "pred": df["pred"]    # model's forecast for next period
})

preds.to_parquet("data/processed/preds.parquet", index=False)
print("saved:", "data/processed/preds.parquet")
preds.head()

saved: data/processed/preds.parquet


Unnamed: 0,ts,sbp,ssp,pred
48,2024-12-07 00:00:00,24.089521,24.089521,35.679037
49,2024-12-07 00:30:00,30.896732,30.896732,33.603436
50,2024-12-07 01:00:00,30.41,30.41,35.905037
51,2024-12-07 01:30:00,33.769878,33.769878,37.630586
52,2024-12-07 02:00:00,44.91,44.91,36.382431
