In [0]:
# ---------------------------------------------------------
# 1. Imports
# ---------------------------------------------------------
from pyspark.sql import functions as F
from pyspark.sql.window import Window

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use("seaborn-v0_8-darkgrid")

from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf


In [0]:
# ---------------------------------------------------------
# 2. Load the train dataset
# ---------------------------------------------------------
# Fully qualified name of the table read through the notebook's
# ambient `spark` session.
train_table_name = "workspace.default.train_set_timebins_lags"
df = spark.table(train_table_name)

In [0]:
# ---------------------------------------------------------
# 3. Filter for Germany (DE)
# ---------------------------------------------------------
# Keep only the rows whose `country` column equals "DE".
is_germany = F.col("country") == "DE"
df_de = df.filter(is_germany)

# Report the size of the subset, then preview its first five rows.
germany_row_count = df_de.count()
print("Rows for Germany:", germany_row_count)
display(df_de.limit(5))


In [0]:
# ---------------------------------------------------------
# 4. Convert to Pandas (DE only → SAFE)
# ---------------------------------------------------------
# Sort in Spark, keep only the columns analysed below, collect them to
# the driver, then parse `timestamp` as a datetime and promote it to
# the index — all in one chain, so no frame is mutated in place.
pdf = (
    df_de
    .orderBy("timestamp")
    .select(
        "timestamp",
        "Actual_Load",
        "grid_stress_score",
        "mean_ssrd",
        "mean_wind_speed",
        "mean_temperature_c",
    )
    .toPandas()
    .assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"]))
    .set_index("timestamp")
)

pdf.head()


In [0]:
# ---------------------------------------------------------
# 5. Plot Load & Stress Over Time
# ---------------------------------------------------------
# The load is labelled in MW while the stress score is a separate
# quantity, so each series gets its own y-axis; on a single shared
# axis the series with the smaller numeric range would be flattened.
load_stress_fig, load_ax = plt.subplots(figsize=(16, 6))
stress_ax = load_ax.twinx()

# A twin axis restarts the colour cycle, so the colours are set
# explicitly ("C0"/"C1" = first two colours of the active style).
load_line, = load_ax.plot(
    pdf.index, pdf["Actual_Load"],
    color="C0", alpha=0.8, label="Actual Load (MW)",
)
stress_line, = stress_ax.plot(
    pdf.index, pdf["grid_stress_score"],
    color="C1", alpha=0.6, label="Grid Stress Score",
)

load_ax.set_xlabel("Timestamp")
load_ax.set_ylabel("Actual Load (MW)")
stress_ax.set_ylabel("Grid Stress Score")
# Only draw one grid; two overlapping grids with different tick
# positions make the figure hard to read.
stress_ax.grid(False)

load_ax.set_title("Germany — Load vs Grid Stress Over Time")
# One legend covering the lines from both axes.
load_ax.legend(handles=[load_line, stress_line], loc="upper left")
plt.show()


In [0]:
# ---------------------------------------------------------
# 6. Seasonal Decomposition (Daily Pattern)
# ---------------------------------------------------------
# `daily_series` is an HOURLY series; "daily" refers to the 24-step
# seasonal period that is extracted from it.

# Re-index onto a regular hourly grid. The lowercase "h" alias is used
# because the uppercase "H" alias is deprecated in recent pandas.
hourly_load = pdf["Actual_Load"].asfreq("h")

# Trim NaNs at either end so nothing has to be extrapolated.
# (If the series is entirely NaN both bounds are None and the slice is
# a no-op; seasonal_decompose will then raise, as it did before.)
hourly_load = hourly_load.loc[
    hourly_load.first_valid_index():hourly_load.last_valid_index()
]

# seasonal_decompose rejects missing values, and asfreq inserts NaN for
# every hour absent from the data, so interior gaps are filled by
# time-weighted linear interpolation. The gap count is printed so the
# reader can judge how much of the series is synthetic.
n_missing_hours = int(hourly_load.isna().sum())
if n_missing_hours:
    print(
        f"Interpolating {n_missing_hours} missing hourly values "
        f"out of {len(hourly_load)} before decomposition"
    )
daily_series = hourly_load.interpolate(method="time")

# period=24 -> one seasonal cycle per day of hourly observations.
result = seasonal_decompose(daily_series, model="additive", period=24)

# NOTE: this changes the default figure size globally, i.e. for every
# later figure that does not pass its own figsize.
plt.rcParams["figure.figsize"] = (14,8)
result.plot()
plt.show()


In [0]:
# ---------------------------------------------------------
# 7. ACF / PACF (Autocorrelation Analysis)
# ---------------------------------------------------------
# Both plots use the load column with its missing values dropped.
load_without_nans = pdf["Actual_Load"].dropna()

# Two stacked panels: autocorrelation on top, partial below.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(12, 10))
acf_axis, pacf_axis = ax

plot_acf(load_without_nans, lags=50, ax=acf_axis)
acf_axis.set_title("ACF — Actual Load (DE)")

plot_pacf(load_without_nans, lags=50, ax=pacf_axis)
pacf_axis.set_title("PACF — Actual Load (DE)")

plt.show()


In [0]:
# ---------------------------------------------------------
# 8. Correlation Matrix (Load, Stress, Weather)
# ---------------------------------------------------------
# Columns whose pairwise correlations are computed and drawn.
heatmap_columns = [
    "Actual_Load",
    "grid_stress_score",
    "mean_ssrd",
    "mean_wind_speed",
    "mean_temperature_c",
]
corr = pdf[heatmap_columns].corr()

# One tick per column, on both axes of the square matrix.
tick_positions = range(len(corr.columns))

plt.figure(figsize=(6, 4))
# Colour scale pinned to [-1, 1].
plt.imshow(corr, cmap="coolwarm", vmin=-1, vmax=1)
plt.colorbar()
plt.xticks(tick_positions, corr.columns, rotation=45)
plt.yticks(tick_positions, corr.columns)
plt.title("Correlation Heatmap (DE)")
plt.show()

# Last expression: display the numeric matrix below the figure.
corr
