In [0]:
# =========================
# 0. Imports
# =========================
from pyspark.sql import functions as F
from pyspark.sql.window import Window

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf


In [0]:
# =========================
# 1. Load table & keep only the DE rows
# =========================

# Full training table (per its name it carries the time-bin and lag features)
# for every country.
df = spark.table("workspace.default.train_set_timebins_lags")
total_row_count = df.count()
print("Total rows (all countries):", total_row_count)

# Keep only the rows whose `country` column equals "DE".
df_de = df.where(F.col("country") == "DE")
de_row_count = df_de.count()
print("Rows for DE:", de_row_count)

# Preview the first few DE rows.
display(df_de.limit(5))


In [0]:
# =========================
# 2. Convert to Pandas & set time index
# =========================

# Columns needed for the time-series analysis in the cells below.
ts_columns = [
    "timestamp",
    "Actual_Load",
    "grid_stress_score",
    "net_imports",
    "mean_ssrd",
    "mean_wind_speed",
    "mean_temperature_c",
    "reserve_margin_ml",
    "forecast_load_error",
    "load_rel_error",
    "score_T8",
    "score_reserve_margin",
]

# No Spark-side orderBy here: the frame is sorted by its timestamp index in
# pandas right after collection, so a distributed sort would only add a
# redundant shuffle before toPandas().
df_de_small = df_de.select(*ts_columns)

pdf = df_de_small.toPandas()

# Ensure timestamp is datetime, use it as the index, and sort chronologically.
pdf["timestamp"] = pd.to_datetime(pdf["timestamp"])
pdf = pdf.set_index("timestamp").sort_index()

pdf.head()


In [0]:
# =========================
# 3. Stationarity - Rolling Mean & Std
# =========================

target = "grid_stress_score"

# Build the rolling window once and reuse it for both statistics.
# NOTE(review): window=24 counts rows, so it equals 24 hours only if the index
# is hourly without gaps - confirm against the data.
rolling_24 = pdf[target].rolling(window=24)

plt.figure(figsize=(14,6))
pdf[target].plot(label="Grid Stress Score", alpha=0.7)
rolling_24.mean().plot(label="Rolling Mean (24h)")
rolling_24.std().plot(label="Rolling Std (24h)")

# Plain ASCII hyphen, matching the "DE - ..." titles used by the other cells
# (the previous title contained a mis-encoded dash).
plt.title("DE - Grid Stress Score with 24h Rolling Mean & Std")
plt.legend()
plt.show()



In [0]:
# =========================
# 4. ADF Test (stationarity test)
# =========================

def adf_report(series, name="series"):
    """Print a short summary of the `adfuller` test for one series.

    Missing values are dropped before the test is run. The test statistic,
    the p-value and every critical value are printed with four decimals,
    followed by an empty line.

    Parameters
    ----------
    series : object exposing ``dropna()`` (e.g. a pandas Series)
        Observations to test.
    name : str, default "series"
        Label used in the printed header.

    Returns
    -------
    None
    """
    adf_out = adfuller(series.dropna())
    statistic, p_value = adf_out[0], adf_out[1]
    critical_values = adf_out[4]  # mapping: level label -> critical value

    report_lines = [
        f"ADF Test for {name}",
        f"  Test Statistic : {statistic:.4f}",
        f"  p-value        : {p_value:.4f}",
    ]
    report_lines += [
        f"  Critical Value {level}: {threshold:.4f}"
        for level, threshold in critical_values.items()
    ]
    for line in report_lines:
        print(line)
    print()

# Report on the stress score first, then on the load.
for adf_column in ("grid_stress_score", "Actual_Load"):
    adf_report(pdf[adf_column], adf_column)


In [0]:
# =========================
# 5. Seasonal Decomposition (24h - daily pattern)
# =========================

# Put the load on a regular hourly grid; hours absent from the data come back
# as NaN. ("h" is the current pandas alias - the upper-case "H" is deprecated.)
series_load = pdf["Actual_Load"].asfreq("h")

# seasonal_decompose raises on missing values, so bridge the gaps with
# time-weighted interpolation (and extend to the edges) before decomposing.
# This is a no-op when the series has no NaNs.
series_load = series_load.interpolate(method="time", limit_direction="both")

decomp = seasonal_decompose(series_load, model="additive", period=24)

# NOTE: this changes the default figure size for every figure created later
# in the session, not only for this one.
plt.rcParams["figure.figsize"] = (14,8)
decomp.plot()
plt.suptitle("DE - Actual Load: Trend / Seasonality / Residual (Daily)", y=1.02)
plt.show()


In [0]:
# Optional: decomposition for grid_stress_score

# Regular hourly grid ("h" is the current pandas alias; "H" is deprecated).
# Missing hours become NaN, which seasonal_decompose rejects, so they are
# filled by time-weighted interpolation (no-op when nothing is missing).
series_stress = (
    pdf["grid_stress_score"]
    .asfreq("h")
    .interpolate(method="time", limit_direction="both")
)

decomp_stress = seasonal_decompose(series_stress, model="additive", period=24)

# DecomposeResult.plot() always opens its own figure, so calling plt.figure()
# first only leaves an empty extra figure in the output. Size the figure it
# creates through a scoped rc override and title that figure directly.
with plt.rc_context({"figure.figsize": (14, 8)}):
    stress_fig = decomp_stress.plot()
stress_fig.suptitle("DE - Grid Stress Score: Trend / Seasonality / Residual (Daily)", y=1.02)
plt.show()


In [0]:
# =========================
# 6. ACF / PACF for Actual Load
# =========================

load_obs = pdf["Actual_Load"].dropna()

# plot_acf / plot_pacf open a new default-sized figure unless an Axes is
# passed in, which left the plt.figure(figsize=...) figure blank. Create the
# sized Axes explicitly and draw onto it.
acf_fig, acf_ax = plt.subplots(figsize=(12,4))
plot_acf(load_obs, lags=50, ax=acf_ax)
acf_ax.set_title("DE - ACF of Actual Load")
plt.show()

pacf_fig, pacf_ax = plt.subplots(figsize=(12,4))
plot_pacf(load_obs, lags=50, method="ywm", ax=pacf_ax)
pacf_ax.set_title("DE - PACF of Actual Load")
plt.show()


In [0]:
# =========================
# 7. ACF / PACF for Grid Stress Score
# =========================

stress_obs = pdf["grid_stress_score"].dropna()

# plot_acf / plot_pacf open a new default-sized figure unless an Axes is
# passed in, which left the plt.figure(figsize=...) figure blank. Create the
# sized Axes explicitly and draw onto it.
stress_acf_fig, stress_acf_ax = plt.subplots(figsize=(12,4))
plot_acf(stress_obs, lags=50, ax=stress_acf_ax)
stress_acf_ax.set_title("DE - ACF of Grid Stress Score")
plt.show()

stress_pacf_fig, stress_pacf_ax = plt.subplots(figsize=(12,4))
plot_pacf(stress_obs, lags=50, method="ywm", ax=stress_pacf_ax)
stress_pacf_ax.set_title("DE - PACF of Grid Stress Score")
plt.show()


In [0]:
# =========================
# 8. Hour-of-day patterns (average)
# =========================

# Hour taken from pdf's index; both series are averaged per hour.
pdf["hour"] = pdf.index.hour
hourly_stats = pdf.groupby("hour")[["Actual_Load", "grid_stress_score"]].mean()

# One line chart per series: (column, title, y-axis label).
hourly_panels = [
    ("Actual_Load", "DE - Average Actual Load by Hour of Day", "Load (MW)"),
    ("grid_stress_score", "DE - Average Grid Stress Score by Hour of Day", "Stress Score"),
]
for hourly_col, hourly_title, hourly_ylabel in hourly_panels:
    plt.figure(figsize=(10,4))
    plt.plot(hourly_stats.index, hourly_stats[hourly_col], marker="o")
    plt.title(hourly_title)
    plt.xlabel("Hour of Day")
    plt.ylabel(hourly_ylabel)
    plt.xticks(range(0,24))
    plt.show()


In [0]:
# =========================
# 9. Weekday patterns (Mon-Sun)
# =========================

# Weekday number from pdf's index: Monday=0 ... Sunday=6.
pdf["weekday"] = pdf.index.dayofweek

weekday_stats = pdf.groupby("weekday")[["Actual_Load", "grid_stress_score"]].mean()

# One line chart per series: (column, title, y-axis label).
weekday_panels = [
    ("Actual_Load", "DE - Average Actual Load by Weekday", "Load (MW)"),
    ("grid_stress_score", "DE - Average Grid Stress Score by Weekday", "Stress Score"),
]
for weekday_col, weekday_title, weekday_ylabel in weekday_panels:
    plt.figure(figsize=(8,4))
    plt.plot(weekday_stats.index, weekday_stats[weekday_col], marker="o")
    plt.title(weekday_title)
    plt.xlabel("Weekday (0=Mon, 6=Sun)")
    plt.ylabel(weekday_ylabel)
    plt.xticks(range(0,7))
    plt.show()


In [0]:
# =========================
# 10. Volatility / Ramping (ΔLoad, ΔStress, Rolling Std)
# =========================

# Row-to-row change of each series.
# NOTE(review): this is a 1-hour change only if consecutive rows are one hour
# apart - confirm the index has no gaps.
pdf["load_diff_1h"] = pdf["Actual_Load"].diff(1)
pdf["stress_diff_1h"] = pdf["grid_stress_score"].diff(1)

# Standard deviation over a trailing window of 24 rows.
# (load_rolling_std_24h is stored on pdf but not plotted in this cell.)
pdf["load_rolling_std_24h"] = pdf["Actual_Load"].rolling(window=24).std()
pdf["stress_rolling_std_24h"] = pdf["grid_stress_score"].rolling(window=24).std()

plt.figure(figsize=(14,5))
pdf["load_diff_1h"].plot(alpha=0.7)
plt.title("DE - 1h Load Change (Ramping)")
# Axis labels previously carried a mis-encoded delta; use the real character.
plt.ylabel("Δ Load (MW)")
plt.show()

plt.figure(figsize=(14,5))
pdf["stress_diff_1h"].plot(alpha=0.7, color="tab:red")
plt.title("DE - 1h Change in Grid Stress Score")
plt.ylabel("Δ Stress")
plt.show()

plt.figure(figsize=(14,5))
pdf["stress_rolling_std_24h"].plot(alpha=0.7)
plt.title("DE - 24h Rolling Std of Grid Stress (Volatility)")
plt.ylabel("Rolling Std (24h)")
plt.show()


In [0]:
# =========================
# 11. Correlation matrix between key features
# =========================

# Columns included in the pairwise correlation.
cols_corr = [
    "Actual_Load",
    "grid_stress_score",
    "net_imports",
    "mean_ssrd",
    "mean_wind_speed",
    "mean_temperature_c",
    "reserve_margin_ml",
    "forecast_load_error",
    "load_rel_error",
    "score_T8",
    "score_reserve_margin",
]

# DataFrame.corr() with its defaults.
corr = pdf[cols_corr].corr()

plt.figure(figsize=(8,6))
# Fix the colour scale to [-1, 1] so the diverging map is centred on zero.
plt.imshow(corr, cmap="coolwarm", vmin=-1, vmax=1)
plt.colorbar()
plt.xticks(range(len(cols_corr)), cols_corr, rotation=90)
plt.yticks(range(len(cols_corr)), cols_corr)
# Plain ASCII hyphen, matching the "DE - ..." titles used by the other cells
# (the previous title contained a mis-encoded dash).
plt.title("DE - Correlation Heatmap (Key Features)")
plt.tight_layout()
plt.show()

# Show the numeric matrix as the cell output.
corr


In [0]:
# =========================
# 12. Stress anomalies via z-score
# =========================

from scipy.stats import zscore

# Carry the last observed score across gaps. Series.ffill() replaces the
# deprecated fillna(method="ffill") spelling.
stress_filled = pdf["grid_stress_score"].ffill()

# NaNs at the very start cannot be forward-filled. With the default
# nan_policy="propagate" a single such NaN turns every z-score into NaN and
# the cell silently reports zero anomalies; "omit" leaves those rows NaN and
# computes mean/std from the remaining values.
pdf["stress_z"] = zscore(stress_filled.to_numpy(), nan_policy="omit")

# Mark anomalies, e.g. |z| > 2.5 (rows with a NaN z-score are never flagged).
anomalies = pdf[np.abs(pdf["stress_z"]) > 2.5]

print("Number of stress anomalies (|z| > 2.5):", len(anomalies))
anomalies[["Actual_Load", "grid_stress_score", "stress_z"]].head(10)


In [0]:
# Overlay the flagged rows on the stress-score line.
plt.figure(figsize=(14,5))
pdf["grid_stress_score"].plot(label="Grid Stress Score", alpha=0.7)

# zorder=5 is passed so the markers are drawn with a higher z-order than the line.
anomaly_marker_style = dict(color="red", label="Anomalies", zorder=5)
plt.scatter(anomalies.index, anomalies["grid_stress_score"], **anomaly_marker_style)

plt.title("DE - Grid Stress Score with Detected Anomalies")
plt.legend()
plt.show()


#### 🚫 Why we cannot analyze all countries together

Electricity systems differ a lot between countries, so mixing them breaks the time-series logic.

❌ 1. **Each country has different grid behavior**  
**DE**: High renewables, big grid, strong interconnections  
**FR**: Nuclear-heavy, stable base load  
**AT**: Hydro-dominant, lots of variation  
**DK**: Wind-driven  
**GR**: Island-like behavior, high imports  
**PL**: Coal-heavy, different seasonality  
Their daily and seasonal patterns are not comparable, so mixing them destroys the signal.  

❌ 2. **Lag features become incorrect**  
Lag-1 must follow the previous hour of the same country.  
If data is mixed, Lag-1 might jump from AT → GR → PL, which ruins the time ordering.  

❌ 3. **Stress score scales differ**  
Some countries have frequent stress events; others almost none.  
Mixing them makes the model biased and unstable.  

✅ **Therefore:**  
Time-series analysis must be done country by country.  
Each country gets its own patterns, correlations, lags, and model.

In [0]:
# Rebuild `pdf` from the complete DE table (all columns, not just the subset
# selected earlier), indexed and sorted by timestamp. This replaces the
# previous `pdf`, including any columns that earlier cells added to it.
pdf = (
    df_de.toPandas()
    .assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"]))
    .set_index("timestamp")
    .sort_index()
)

# Selecting these columns raises a KeyError if any of them is absent,
# so this preview doubles as an existence check.
calendar_cols = ['weekday', 'hour', 'daytime_bin']
print(pdf[calendar_cols].head())


In [0]:
# Mean load and stress score for each value of the `weekday` column.
weekday_stats = pdf.groupby("weekday")[["Actual_Load", "grid_stress_score"]].mean()

# One line chart per series: (column, plot keyword arguments, title, y-axis label).
weekday_charts = [
    ("Actual_Load", {"marker": "o"},
     "DE - Average Load by Weekday (1=Mon, 7=Sun)", "Load"),
    ("grid_stress_score", {"marker": "o", "color": "red"},
     "DE - Average Grid Stress Score by Weekday", "Stress Score"),
]
for chart_col, chart_kwargs, chart_title, chart_ylabel in weekday_charts:
    plt.figure(figsize=(10,4))
    weekday_stats[chart_col].plot(**chart_kwargs)
    plt.title(chart_title)
    plt.xlabel("Weekday")
    plt.ylabel(chart_ylabel)
    plt.grid(True)
    plt.show()


In [0]:
# Mean load and stress score for each value of the `daytime_bin` column.
daytime_stats = pdf.groupby("daytime_bin")[["Actual_Load", "grid_stress_score"]].mean()

# One bar chart per series: (column, bar colour, title, y-axis label).
daytime_charts = [
    ("Actual_Load", "skyblue", "DE - Average Load by Daytime Bin", "Load (MW)"),
    ("grid_stress_score", "salmon", "DE - Average Grid Stress Score by Daytime Bin", "Stress Score"),
]
for bar_col, bar_color, bar_title, bar_ylabel in daytime_charts:
    plt.figure(figsize=(8,4))
    daytime_stats[bar_col].plot(kind="bar", color=bar_color)
    plt.title(bar_title)
    plt.ylabel(bar_ylabel)
    plt.show()


In [0]:
# With `by=`, DataFrame.boxplot opens its own figure unless an Axes is
# supplied, which left the plt.figure(figsize=...) figure blank. Create the
# sized Axes explicitly and draw the grouped boxplot onto it.
box_fig, box_ax = plt.subplots(figsize=(8,5))
pdf.boxplot(column="grid_stress_score", by="daytime_bin", ax=box_ax)

# Plain ASCII hyphen, matching the "DE - ..." titles used by the other cells
# (the previous title contained a mis-encoded dash).
box_ax.set_title("DE - Stress Score Distribution by Daytime")
# Clear the automatic "Boxplot grouped by ..." super-title pandas adds.
box_fig.suptitle("")
box_ax.set_xlabel("Daytime Bin")
box_ax.set_ylabel("Stress Score")
plt.show()
