In [0]:

%pip install xgboost==1.7.6

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql import Row

import pandas as pd
import numpy as np

from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

import pickle
import base64

print("Imports loaded.")

In [0]:
# Load the lag-feature training table and keep only rows whose country is "DE".
df = spark.table("workspace.default.train_set_timebins_lags")
df_de = df.where(F.col("country") == "DE")

# Sanity check: total row count plus a five-row preview.
print("Total DE rows:", df_de.count())
display(df_de.limit(5))

In [0]:
# Forecast horizons, applied below as forward row offsets.
horizons = [1, 2, 3, 4, 5, 6]

# Order rows by timestamp within each country so that offsets move forward in time.
w = Window.partitionBy("country").orderBy("timestamp")

# One target column per horizon: the grid_stress_score found h rows ahead.
# F.lead expresses the forward look directly (equivalent to F.lag with a negative offset).
# NOTE(review): a row offset of h only means "h hours ahead" if the series is
# hourly with no missing timestamps — confirm against the source table.
for h in horizons:
    df_de = df_de.withColumn(
        f"stress_plus_{h}h",
        F.lead("grid_stress_score", h).over(w)
    )

target_cols = [f"stress_plus_{h}h" for h in horizons]

# Keep only rows that have a value for every horizon (e.g. the last rows of the
# series have no future observation and are removed here).
# NOTE: df_de is reassigned, so re-running this cell on its own trims further
# trailing rows; re-run the load cell first when re-executing.
df_de = df_de.dropna(subset=target_cols)

print("DE rows after targets:", df_de.count())
display(df_de.select("timestamp", "grid_stress_score", *target_cols).limit(5))


In [0]:
# Collect the Spark frame to the driver as pandas, parse the timestamp column,
# and re-establish chronological order with a fresh 0..n-1 index.
pdf = (
    df_de.orderBy("timestamp")
    .toPandas()
    .assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"]))
    .sort_values("timestamp")
    .reset_index(drop=True)
)

print("Pandas shape:", pdf.shape)
pdf.head()

In [0]:
# Fraction of missing values per column; columns that are more than half empty are dropped.
null_frac = pdf.isnull().mean()
high_null_cols = [col for col, frac in null_frac.items() if frac > 0.5]

pdf_clean = pdf.drop(columns=high_null_cols)

# Encode the daytime label as an ordinal code (night=0 ... evening=4).
# Labels that are not in the mapping become NaN after .map().
if "daytime_bin" in pdf_clean.columns:
    daytime_codes = {
        label: code
        for code, label in enumerate(
            ["night", "early_morning", "morning", "afternoon", "evening"]
        )
    }
    pdf_clean["daytime_bin"] = pdf_clean["daytime_bin"].map(daytime_codes)

# Every remaining column is a feature except identifiers, the current score and the targets.
exclude_cols = ["index", "country", "timestamp", "grid_stress_score", *target_cols]
feature_cols = [col for col in pdf_clean.columns if col not in exclude_cols]

print("Feature count:", len(feature_cols))


In [0]:
# Work on a copy so pdf_clean stays untouched and this cell can be re-run safely.
pdf_model = pdf_clean.copy()

# Fill gaps in the feature columns: carry the last observation forward, then
# back-fill whatever is still missing at the start of the series.
# .ffill()/.bfill() replace fillna(method=...), which is deprecated in current pandas.
# NOTE(review): bfill copies later observations backwards, and this fill runs
# before the train/validation split — confirm that look-ahead is acceptable.
pdf_model[feature_cols] = pdf_model[feature_cols].ffill().bfill()

# No per-column median fallback is needed: a column that still has NaN after
# ffill + bfill is entirely NaN, so its median would be NaN as well.
print("Nulls remaining:", pdf_model[feature_cols].isna().any().any())


In [0]:
# Chronological split: the first 80% of the (timestamp-ordered) rows train the
# models, the remaining rows are held out for validation.
n = len(pdf_model)
split_idx = int(n * 0.8)

# Features as a plain NumPy matrix; targets stay a DataFrame so each horizon
# can be selected by column name later.
X = pdf_model[feature_cols].to_numpy()
y = pdf_model[target_cols]

X_train = X[:split_idx]
X_val = X[split_idx:]
y_train = y.iloc[:split_idx]
y_val = y.iloc[split_idx:]

print("Train size:", X_train.shape, y_train.shape)
print("Val size:", X_val.shape, y_val.shape)

In [0]:
from xgboost import XGBRegressor

# Fitted regressors and their validation scores, keyed by target column name.
models = {}
metrics = {}

# Hyperparameters shared by every horizon-specific regressor.
xgb_params = dict(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42,
    n_jobs=-1,
)

# Fit one independent model per horizon on the same feature matrix.
for h in horizons:
    target_name = f"stress_plus_{h}h"
    print(f"\n=== Training XGBoost for {target_name} ===")

    # Target vectors for this horizon.
    y_tr = y_train[target_name].to_numpy()
    y_va = y_val[target_name].to_numpy()

    # Train on the first part of the series, predict the held-out tail.
    model = XGBRegressor(**xgb_params)
    model.fit(X_train, y_tr)
    y_pred = model.predict(X_val)

    # Validation error: MAE and RMSE.
    mae = mean_absolute_error(y_va, y_pred)
    rmse = np.sqrt(mean_squared_error(y_va, y_pred))

    models[target_name] = model
    metrics[target_name] = {"MAE": mae, "RMSE": rmse}

    print(f"{target_name}: MAE={mae:.3f}, RMSE={rmse:.3f}")



In [0]:
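# Optional cleanup: uncomment to drop the stored models table (the save step below already overwrites it).
# spark.sql("DROP TABLE IF EXISTS workspace.default.DE_XGBoost_grid_stress_models")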

In [0]:
import pickle
import base64
from pyspark.sql import Row

# Serialize every trained model into one table row per horizon.
# Each model is pickled and base64-encoded so it can be stored as a string column.
# NOTE(review): unpickling executes arbitrary code, so these blobs must only be
# loaded from a trusted table; pickles are also tied to the library versions
# that produced them — consider XGBoost's own save_model format for long-term storage.
rows = []

for h in horizons:
    name = f"stress_plus_{h}h"
    model_bytes = pickle.dumps(models[name])
    model_b64 = base64.b64encode(model_bytes).decode("utf-8")

    rows.append(Row(
        horizon=name,
        model_name=f"XGBoost_{name}",
        model_binary=model_b64
    ))

spark_df = spark.createDataFrame(rows)

# Replace the table contents (and schema, if it changed) on every run.
spark_df.write \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable("workspace.default.DE_XGBoost_grid_stress_models")

# Report the real number of saved models; the check mark was previously mis-encoded.
print(f"✔ All {len(rows)} XGBoost models saved to Delta!")


In [0]:
# Read the saved models table back and render it to confirm the write.
saved_models = spark.table("workspace.default.DE_XGBoost_grid_stress_models")
display(saved_models)


In [0]:
# Reload the models table, then show its schema followed by its rows.
models_table = "workspace.default.DE_XGBoost_grid_stress_models"
df_check = spark.table(models_table)
df_check.printSchema()
display(df_check)


In [0]:
# Cross-check that every horizon produced a trained model and a serialized row.
print("Horizons:", horizons)
print("Models trained:", list(models))
print("Rows created:", len(rows))

# List the horizon label stored in each row.
for r in rows:
    print(r["horizon"])


In [0]:
# Same verification through Spark SQL: select every column of the saved models table.
check_query = """
    SELECT * 
    FROM workspace.default.DE_XGBoost_grid_stress_models
"""
df_check = spark.sql(check_query)
display(df_check)