In [0]:
%pip install xgboost

In [0]:
# ============================================
# 1. Imports
# ============================================
from pyspark.sql import functions as F
from pyspark.sql.window import Window

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [0]:
# ============================================
# 2. Load base table and filter to Austria (AT)
# ============================================
# Base table with all countries.
# NOTE(review): an earlier revision of this cell mentioned
# workspace.default.electricity_and_weather_europe_with_target — confirm which
# table is the intended input.
df = spark.table("workspace.default.train_set_timebins_lags")

print("Total rows (all countries):", df.count())
print("Total columns:", len(df.columns))

# Keep only the Austrian rows. The result is still named `df_de` (presumably a
# leftover from a Germany version of this notebook) because later cells read
# that name.
is_austria = F.col("country") == "AT"
df_de = df.where(is_austria)
print("Rows for AT:", df_de.count())

display(df_de.limit(5))

In [0]:
# ============================================
# 3. Create future target columns (1–6 hours ahead)
# ============================================

# Order rows by time within each country. df_de was filtered to a single
# country (AT) above, so the partition is only a safeguard against mixing series.
w = Window.partitionBy("country").orderBy("timestamp")

horizons = [1, 2, 3, 4, 5, 6]
target_cols = [f"stress_plus_{h}h" for h in horizons]

# stress_plus_{h}h = grid_stress_score of the row h positions later in time.
# F.lead(col, h) is the direct way to express what F.lag(col, -h) computed.
# NOTE(review): the shift is by row count, so it equals "h hours ahead" only if
# the series has exactly one row per hour with no gaps — TODO confirm.
for h, target_col in zip(horizons, target_cols):
    df_de = df_de.withColumn(target_col, F.lead("grid_stress_score", h).over(w))

# The last rows of the series have no future value for some horizons; drop them.
df_de = df_de.dropna(subset=target_cols)

print("Rows for AT after adding targets and dropping tail:", df_de.count())
display(df_de.select("timestamp", "grid_stress_score", *target_cols).orderBy("timestamp").limit(10))


In [0]:
# ============================================
# 4. Convert to Pandas and basic cleaning
# ============================================

# Collect the Spark frame to pandas, make sure `timestamp` is a datetime column,
# and keep the rows in chronological order with a fresh 0..n-1 index.
# (timestamp stays a regular column; it is excluded from the features later.)
pdf = (
    df_de.orderBy("timestamp")
    .toPandas()
    .assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"]))
    .sort_values("timestamp")
    .reset_index(drop=True)
)

print("Pandas shape:", pdf.shape)
pdf.head()


In [0]:
# ============================================
# 5. Drop high-null columns & define features/targets
# ============================================

# 5.1 Drop every column in which more than half of the values are missing
null_frac = pdf.isna().mean()
high_null_cols = [col for col, frac in null_frac.items() if frac > 0.5]
print("Columns with >50% nulls (will drop):")
print(high_null_cols)

pdf_clean = pdf.drop(columns=high_null_cols)

# Encode daytime_bin as an integer code (night=0 ... evening=4).
# Labels that are not in the mapping become NaN.
daytime_codes = {
    "night": 0,
    "early_morning": 1,
    "morning": 2,
    "afternoon": 3,
    "evening": 4,
}
if "daytime_bin" in pdf_clean.columns:
    pdf_clean["daytime_bin"] = pdf_clean["daytime_bin"].map(daytime_codes)

# 5.2 Targets: one column per forecast horizon
target_cols = [f"stress_plus_{h}h" for h in horizons]

# 5.3 Columns that must not be used as model inputs: identifiers, the time
#     axis, the current score and the targets themselves
exclude_cols = ["index", "country", "timestamp", "grid_stress_score", *target_cols]

# 5.4 Everything else is a feature (original column order is kept)
feature_cols = [col for col in pdf_clean.columns if col not in exclude_cols]

print("Number of feature columns:", len(feature_cols))
print("Example features:", feature_cols[:20])


In [0]:
# ============================================
# 6. Simple imputation for remaining nulls
# ============================================

# Work on a copy so pdf_clean stays untouched.
pdf_model = pdf_clean.copy()

# Forward-fill along time, then back-fill whatever is still missing at the
# start of the series.
# NOTE(review): this runs before the train/validation split, so the back-fill
# and the median fallback below can use values from the validation period —
# acceptable for a first model, revisit for a leakage-free evaluation.
filled_features = pdf_model[feature_cols].ffill().bfill()
pdf_model[feature_cols] = filled_features

# Fallback for columns that still contain nulls after both fills: column median.
still_null = [c for c in feature_cols if pdf_model[c].isna().any()]
for c in still_null:
    pdf_model[c] = pdf_model[c].fillna(pdf_model[c].median())

# Sanity check
print("Any nulls left in features?", pdf_model[feature_cols].isna().any().any())


In [0]:
# ============================================
# 7. Time-based train/validation split
# ============================================

# Rows are in chronological order, so the first 80% form the training set and
# the most recent 20% the validation set (no shuffling).
n = pdf_model.shape[0]
split_idx = int(n * 0.8)

X = pdf_model[feature_cols].to_numpy()
y = pdf_model[target_cols]   # DataFrame with one column per horizon

X_train = X[:split_idx]
X_val = X[split_idx:]
y_train = y.iloc[:split_idx]
y_val = y.iloc[split_idx:]

print("Train size:", X_train.shape, y_train.shape)
print("Val size  :", X_val.shape, y_val.shape)


In [0]:
# Sanity check: number of rows currently in the AT Spark DataFrame.
df_de.count()


In [0]:
# Sanity check: number of model input columns.
len(feature_cols)


In [0]:
# ============================================
# 8. Train one XGBoost model per horizon
# ============================================

horizons = [1, 2, 3, 4, 5, 6]  # ensure defined

# Hyperparameters shared by all horizons (untuned starting point).
xgb_params = dict(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42,
    n_jobs=-1,
    tree_method="hist",   # fast & works well on CPUs
)

models = {}    # target column name -> fitted XGBRegressor
metrics = {}   # target column name -> {"MAE": ..., "RMSE": ...} on validation

for h in horizons:
    target_name = f"stress_plus_{h}h"
    print(f"\n=== Training XGBoost model for {target_name} ===")

    y_tr = y_train[target_name].values
    y_va = y_val[target_name].values

    # Fit on the training rows, then score on the held-out validation rows.
    xgb = XGBRegressor(**xgb_params)
    xgb.fit(X_train, y_tr)
    y_pred = xgb.predict(X_val)

    mae = mean_absolute_error(y_va, y_pred)
    rmse = np.sqrt(mean_squared_error(y_va, y_pred))

    models[target_name] = xgb
    metrics[target_name] = dict(MAE=mae, RMSE=rmse)

    print(f"{target_name} -> MAE: {mae:.3f}, RMSE: {rmse:.3f}")


In [0]:
# ============================================
# 9. Show metrics nicely
# ============================================

# One row per horizon target, one column per metric (MAE, RMSE).
# pandas is already imported as `pd` in the imports cell at the top of the
# notebook, so it is not re-imported here.
metrics_df = pd.DataFrame(metrics).T
metrics_df


In [0]:
# ============================================
# 10. Plot actual vs predicted for 1h horizon
# ============================================
h = 1
target_name = f"stress_plus_{h}h"

# Validation truth and the +1h model's predictions for the same rows.
y_va = y_val[target_name].values
y_pred = models[target_name].predict(X_val)

fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(y_va, label="Actual future stress (+1h)", alpha=0.8)
ax.plot(y_pred, label="Predicted future stress (+1h)", alpha=0.8)
ax.set_title("AT - Grid Stress Prediction (+1h)")
ax.set_xlabel("Time steps (validation)")
ax.set_ylabel("Stress Score")
ax.legend()
plt.show()


In [0]:
# ============================================
# Better Plot: Zoom into the first N validation points
# ============================================
h = 1
target_name = f"stress_plus_{h}h"

# Validation truth and the +1h model's predictions for the same rows.
y_va = y_val[target_name].values
y_pred = models[target_name].predict(X_val)

N = 300   # zoom window: number of leading validation points to show

plt.figure(figsize=(14,5))
plt.plot(y_va[:N], label="Actual (+1h)", alpha=0.9)
plt.plot(y_pred[:N], label="Predicted (+1h)", alpha=0.9)
# The title is built from N so it cannot drift from the window actually plotted.
plt.title(f"AT — Grid Stress Prediction (+1h) – Zoomed (first {N} samples)")
plt.xlabel("Time steps (validation)")
plt.ylabel("Stress Score")
plt.legend()
plt.show()

# 📌 Now the plot will be readable and show if the model tracks the signal.


In [0]:
# ============================================
# Scatter Plot: Actual vs Predicted
# ============================================

# Recompute the +1h arrays here instead of relying on whatever `y_va` / `y_pred`
# the previously executed cell left behind; otherwise the plot could show a
# different horizon than its "+1h" labels claim.
h = 1
target_name = f"stress_plus_{h}h"
y_va = y_val[target_name].values
y_pred = models[target_name].predict(X_val)

plt.figure(figsize=(6,6))
plt.scatter(y_va, y_pred, alpha=0.3, s=10)
# Reference diagonal over the 0–100 range, matching the 6-panel scatter dashboard.
plt.plot([0,100], [0,100], "r--", label="Perfect prediction")
plt.xlabel("Actual stress (+1h)")
plt.ylabel("Predicted stress (+1h)")
plt.title("Actual vs Predicted Stress (+1h)")
plt.legend()
plt.show()

# 📌 If points are close to the red line → strong model.


In [0]:
# ============================================
# Residual Plot
# ============================================

# Recompute the +1h arrays here so the residuals do not depend on variables
# left over from whichever cell ran last.
h = 1
target_name = f"stress_plus_{h}h"
y_va = y_val[target_name].values
y_pred = models[target_name].predict(X_val)

# Positive residual = the model under-predicted the actual value.
residuals = y_va - y_pred

plt.figure(figsize=(14,4))
plt.plot(residuals, alpha=0.7)
plt.axhline(0, color='red', linestyle='--')
plt.title("Residuals — Stress Prediction (+1h)")
plt.xlabel("Time steps (validation)")
plt.ylabel("Error (Actual - Predicted)")
plt.show()

# 📌 Good models → residuals centered around zero, no obvious structure.


In [0]:
# ============================================
# Histogram of Residuals
# ============================================

# Recompute the +1h residuals here; reusing a leftover `residuals` variable
# would plot a different horizon if a multi-horizon cell had run in between.
h = 1
target_name = f"stress_plus_{h}h"
residuals = y_val[target_name].values - models[target_name].predict(X_val)

plt.figure(figsize=(6,4))
plt.hist(residuals, bins=40, alpha=0.7)
plt.title("Residual Distribution (+1h)")
plt.xlabel("Residual")
plt.ylabel("Count")
plt.show()

# 📌 Ideal → symmetric around zero.


In [0]:
# ============================================
# 6-Panel Comparison: Actual vs Predicted for 1–6 hours
# ============================================

# Zoom window for readability: only the first N validation points are drawn.
# Defined once so the axis label always matches the slice.
N = 300

plt.figure(figsize=(18, 14))

for i, h in enumerate(horizons):
    target = f"stress_plus_{h}h"

    # actual & predicted on the validation rows for this horizon
    y_va = y_val[target].values
    y_pred = models[target].predict(X_val)

    # 3 x 2 grid, one panel per horizon (subplot positions are 1-based)
    plt.subplot(3, 2, i+1)
    plt.plot(y_va[:N], label=f"Actual (+{h}h)", alpha=0.9)
    plt.plot(y_pred[:N], label=f"Predicted (+{h}h)", alpha=0.9)
    plt.title(f"Grid Stress Prediction (+{h}h)")
    plt.xlabel(f"Validation time steps (zoom 0–{N})")
    plt.ylabel("Stress Score")
    plt.legend()

plt.tight_layout()
plt.show()


In [0]:
# ============================================
# 6-Panel Scatter Dashboard: Actual vs Predicted
# ============================================

plt.figure(figsize=(18, 14))

for panel, h in enumerate(horizons, start=1):
    target = f"stress_plus_{h}h"

    # Validation truth and this horizon's predictions
    y_va = y_val[target].values
    y_pred = models[target].predict(X_val)

    # One panel per horizon in a 3 x 2 grid
    ax = plt.subplot(3, 2, panel)
    ax.scatter(y_va, y_pred, s=8, alpha=0.5)
    ax.plot([0, 100], [0, 100], 'r--', linewidth=1)  # perfect-prediction diagonal

    ax.set_title(f"Actual vs Predicted (+{h}h)")
    ax.set_xlabel("Actual Stress Score")
    ax.set_ylabel("Predicted Stress Score")
    ax.set_xlim(0, 100)
    ax.set_ylim(0, 100)

plt.tight_layout()
plt.show()

# How to read this dashboard:
#   - each dot is one validation row (x = actual, y = predicted)
#   - red dashed line = perfect predictions
#   - identical 0–100 axis limits on every panel for consistency
# Helps detect:
#   over-prediction
#   under-prediction
#   heteroskedasticity
#   horizon degradation


In [0]:
# =====================================================
# 1. Residual Distribution Dashboard (6 Horizons)
# =====================================================

plt.figure(figsize=(18, 14))

for i, h in enumerate(horizons):
    target = f"stress_plus_{h}h"
    y_va = y_val[target].values
    y_pred = models[target].predict(X_val)

    # Positive residual = the model under-predicted the actual value.
    residuals = y_va - y_pred

    # 3 x 2 grid, one histogram per horizon (subplot positions are 1-based)
    plt.subplot(3, 2, i+1)
    plt.hist(residuals, bins=40, color="steelblue", alpha=0.75)
    plt.axvline(0, color="red", linestyle="--")
    plt.title(f"Residual Distribution (+{h}h)")
    plt.xlabel("Residual (Actual – Predicted)")
    plt.ylabel("Frequency")

plt.tight_layout()
plt.show()


In [0]:
# =====================================================
# 2. Residual vs Predicted (6 Horizons)
# =====================================================

plt.figure(figsize=(18, 14))

for i, h in enumerate(horizons):
    target = f"stress_plus_{h}h"
    y_va = y_val[target].values
    y_pred = models[target].predict(X_val)

    # Positive residual = the model under-predicted the actual value.
    residuals = y_va - y_pred

    # 3 x 2 grid, one panel per horizon (subplot positions are 1-based)
    plt.subplot(3, 2, i+1)
    plt.scatter(y_pred, residuals, s=8, alpha=0.5)
    plt.axhline(0, color="red", linestyle="--")

    plt.title(f"Residual vs Predicted (+{h}h)")
    plt.xlabel("Predicted Stress Score")
    plt.ylabel("Residual (Actual – Predicted)")
    plt.ylim(-40, 40)  # fixed range; residuals outside it are clipped from view — adjust if needed

plt.tight_layout()
plt.show()


#### ✅ Why the diagonal clusters + stripes are OK

Your target variable grid_stress_score has discrete plateaus (e.g., 0, 12.5, 25, 37.5, 50, 62.5, 75, 87.5).  

This means:  
Models predict continuous values →  
when plotted against discrete actual values →  
you naturally see vertical bands (the actual value is fixed within each band).  

And in residual plots, you see diagonal stripes.  
👉 This is NOT a modeling error.  
👉 Both RF and XGB behave the same because the target itself is discrete.  

This pattern will always appear unless the scoring system changes.  

In [0]:
# =====================================================
# 3. RMSE Trend Plot
# =====================================================

def _validation_rmse(horizon):
    """Return the validation RMSE of the model trained for `horizon` hours ahead."""
    name = f"stress_plus_{horizon}h"
    actual = y_val[name].values
    predicted = models[name].predict(X_val)
    return np.sqrt(mean_squared_error(actual, predicted))

# One RMSE per horizon, in the same order as `horizons`
rmses = [_validation_rmse(h) for h in horizons]

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(horizons, rmses, marker="o", linewidth=2)
ax.set_title("RMSE Trend Across Prediction Horizons")
ax.set_xlabel("Prediction Horizon (hours ahead)")
ax.set_ylabel("RMSE")
ax.grid(True)
plt.show()


In [0]:
# =====================================================
# Bar Chart: Top 10 Features per Horizon
# =====================================================

for h in horizons:
    target = f"stress_plus_{h}h"
    model = models[target]

    # Pair each feature name with its importance and keep the 10 largest.
    all_importances = pd.DataFrame({
        "feature": feature_cols,
        "importance": model.feature_importances_,
    })
    imp = all_importances.sort_values("importance", ascending=False).head(10)

    fig, ax = plt.subplots(figsize=(8, 5))

    # hue="feature" is passed so that `palette` is valid; the resulting legend
    # would only repeat the y-axis labels, so it is hidden.
    # (Older seaborn versions used: sns.barplot(data=imp, x="importance", y="feature", palette="magma"))
    sns.barplot(
        data=imp,
        x="importance",
        y="feature",
        hue="feature",
        palette="magma",
        dodge=False,
        legend=False,
        ax=ax,
    )

    ax.set_title(f"Top 10 Features for +{h}h Prediction")
    ax.set_xlabel("Importance Score")
    ax.set_ylabel("Feature")
    plt.tight_layout()
    plt.show()


In [0]:
# ============================================
# 11. Feature importance for +1h model
# ============================================

h = 1
target_name = f"stress_plus_{h}h"
# The stored model is an XGBRegressor (see the training cell), so it is no
# longer bound to the misleading name `rf`.
model_1h = models[target_name]

# Importance per feature, labelled with the feature names and sorted descending.
importances = model_1h.feature_importances_
feat_imp = pd.Series(importances, index=feature_cols).sort_values(ascending=False)

plt.figure(figsize=(8,10))
feat_imp.head(20).plot(kind="barh")
plt.gca().invert_yaxis()  # most important feature at the top
plt.title("Top 20 Feature Importances – Stress +1h")
plt.xlabel("Importance")
plt.show()

feat_imp.head(30)


In [0]:
# =====================================================
# Save trained XGBoost models
# =====================================================
# Builds `spark_df` with one row per horizon:
#   horizon      - target column name, e.g. "stress_plus_1h"
#   model_name   - "XGBoost_<target>"
#   model_binary - the pickled model, base64-encoded as UTF-8 text
#
# SECURITY NOTE: restoring these models means calling pickle.loads on the table
# contents, which executes arbitrary code. Only load from a table you trust.

import pickle
import base64
from pyspark.sql import Row

rows = []

for h in horizons:
    target = f"stress_plus_{h}h"

    if target not in models:
        print("❌ Model missing:", target)
        continue

    model = models[target]

    # pickle -> bytes -> base64 text
    model_bytes = pickle.dumps(model)
    model_b64 = base64.b64encode(model_bytes).decode("utf-8")

    rows.append(Row(
        horizon=target,
        model_name=f"XGBoost_{target}",
        model_binary=model_b64
    ))

# createDataFrame cannot infer a schema from an empty list; fail with a message
# that names the real problem instead.
if not rows:
    raise ValueError(
        f"No trained models found for horizons {horizons}; nothing to save. "
        "Run the training cell first."
    )

spark_df = spark.createDataFrame(rows)


In [0]:
# Persist the serialized models as a table, replacing any existing table of the
# same name together with its schema.
(
    spark_df.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("workspace.default.AT_XGBoost_grid_stress_models")
)


#### ✅ What you have now

- A **time-based modeling dataset** with:  
  - lag features  
  - rolling features  
  - time bins  
  - weather  
  - imports/exports  
- Targets: stress_plus_1h … stress_plus_6h  
- An **XGBoost model per horizon**  
- MAE/RMSE for each horizon  
- Example prediction plot  
- Feature importance for interpretability

In [0]:
# Sanity check: which horizon targets have a trained model.
print(models.keys())

In [0]:
# Sanity check: row count of the unfiltered base table (all countries).
print(df.count())

In [0]:
# Read the saved table back to verify the write.
df_saved = spark.table("workspace.default.AT_XGBoost_grid_stress_models")
df_saved.printSchema()
display(df_saved.limit(5))

# ✔ Each row = one XGBoost model
# ✔ model_binary = pickled model, base64-encoded and stored as text
#   (restore with pickle.loads(base64.b64decode(model_binary)); trusted data only)
# ✔ You do NOT need .pkl files anymore


In [0]:
# Sanity check: forecast horizons currently defined.
print(horizons)

In [0]:
# Sanity check: number of serialized model rows prepared for saving.
print(len(rows))

In [0]:
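# Optional cleanup (disabled). NOTE: this statement targets the DE models table,
# not the AT table written by this notebook.
# spark.sql("DROP TABLE IF EXISTS workspace.default.DE_XGBoost_grid_stress_models")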

