In [0]:
# Fully qualified name of the input table read by this notebook.
SOURCE_TABLE = "workspace.default.electricity_and_weather_europe"
df = spark.table(SOURCE_TABLE)

In [0]:
# Country codes removed from the analysis (named "missing" by the author).
missing_countries = ["DK", "FI", "LV", "SE", "EE", "GR", "RO", "SI"]
is_missing_country = df["country"].isin(missing_countries)
df = df.where(~is_missing_country)

In [0]:
# List the column names of the country-filtered DataFrame.
df.columns

In [0]:
# Bare per-source columns that are removed here; the "__Actual_Aggregated" and
# "__Actual_Consumption" variants referenced by the later cells are left in place.
columns_to_drop = [
    "Hydro_Pumped_Storage",
    "Hydro_Run_of_river_and_poundage",
    "Hydro_Water_Reservoir",
    "Nuclear",
    "Solar",
    "Wind_Onshore",
    "Biomass",
    "Fossil_Brown_coal_Lignite",
    "Fossil_Coal_derived_gas",
    "Fossil_Gas",
    "Fossil_Hard_coal",
    "Fossil_Oil",
    "Waste",
    "Wind_Offshore",
    "Other",
    "Other_renewable",
    "Fossil_Peat",
    "Energy_storage",
    "Fossil_Oil_shale",
]
# DataFrame.drop takes the names as separate positional arguments.
df = df.drop(*columns_to_drop)

In [0]:
# Render the DataFrame after the bare per-source columns were dropped.
display(df)

In [0]:
from pyspark.sql.functions import col
# Per-source "__Actual_Aggregated" series that are added up into total generation.
generation_columns_to_sum = [
    "Biomass__Actual_Aggregated",
    "Energy_storage__Actual_Aggregated",
    "Fossil_Brown_coal_Lignite__Actual_Aggregated",
    "Fossil_Coal_derived_gas__Actual_Aggregated",
    "Fossil_Gas__Actual_Aggregated",
    "Fossil_Hard_coal__Actual_Aggregated",
    "Fossil_Oil__Actual_Aggregated",
    "Fossil_Oil_shale__Actual_Aggregated",
    "Fossil_Peat__Actual_Aggregated",
    "Geothermal__Actual_Aggregated",
    "Hydro_Pumped_Storage__Actual_Aggregated",
    "Hydro_Run_of_river_and_poundage__Actual_Aggregated",
    "Hydro_Water_Reservoir__Actual_Aggregated",
    "Marine__Actual_Aggregated",
    "Nuclear__Actual_Aggregated",
    "Other__Actual_Aggregated",
    "Other_renewable__Actual_Aggregated",
    "Solar__Actual_Aggregated",
    "Waste__Actual_Aggregated",
    "Wind_Offshore__Actual_Aggregated",
    "Wind_Onshore__Actual_Aggregated",
]

# Same accumulation the built-in sum() performs: start at 0 and add each column
# expression in list order.
# NOTE(review): Spark's `+` returns NULL when any operand is NULL, so a row with a
# single missing source gets a NULL total - confirm the inputs are null-free.
generation_total = 0
for source_column in generation_columns_to_sum:
    generation_total = generation_total + col(source_column)
df = df.withColumn("total_generation", generation_total)


In [0]:
# Every column whose name ends in "__Actual_Consumption" counts towards the load.
consumption_cols = [name for name in df.columns if name.endswith("__Actual_Consumption")]

# Accumulate from 0 exactly like the built-in sum(); with no matching columns the
# total load is simply Actual_load + 0.
consumption_total = 0
for consumption_column in consumption_cols:
    consumption_total = consumption_total + col(consumption_column)
df = df.withColumn("total_load", col("Actual_load") + consumption_total)

In [0]:
from pyspark.sql.functions import col, try_divide

# Supply (generation plus net imports) in excess of load, as a fraction of load.
# try_divide is used for the division instead of the `/` operator.
supply_surplus = col("total_generation") + col("net_imports") - col("total_load")
df = df.withColumn("reserve_margin", try_divide(supply_surplus, col("total_load")))

In [0]:
# Render the DataFrame with total_generation, total_load and reserve_margin added.
display(df)

In [0]:
from pyspark.sql import functions as F
# Sanity check: print the largest reserve margin over the whole DataFrame.
max_reserve_margin = df.agg(F.max("reserve_margin"))
max_reserve_margin.show()

In [0]:
# Absolute imbalance: supply (generation + net imports) minus load, in the same
# units as the input series (not normalised by load).
imbalance_expr = col("total_generation") + col("net_imports") - col("total_load")
df = df.withColumn("system_imbalance", imbalance_expr)

In [0]:
# Signed forecast errors, defined as forecast minus actual.
df = df.withColumn("forecast_load_error", col("Forecasted_Load") - col("total_load"))
df = df.withColumn("forecast_solar_generation_error", col("solar_forecast") - col("Solar__Actual_Aggregated"))
# A generation forecast must be compared with measured generation
# ("__Actual_Aggregated"), not with the "__Actual_Consumption" series.
# NOTE(review): if wind_forecast also covers offshore wind, add
# Wind_Offshore__Actual_Aggregated to the actual side - confirm against the data source.
df = df.withColumn("forecast_wind_generation_error", col("wind_forecast") - col("Wind_Onshore__Actual_Aggregated"))

In [0]:
# Compute relative forecast errors: |forecast error| divided by the actual value.
# The 1e-6 added to each denominator avoids an exact division by zero; note that a
# non-zero error over a zero actual (e.g. solar at night) then produces a very
# large ratio rather than NULL.
df = df.withColumn("load_rel_error",
                       F.abs(F.col("forecast_load_error")) / (F.col("total_load") + F.lit(1e-6)))

df = df.withColumn("solar_rel_error",
                       F.abs(F.col("forecast_solar_generation_error")) / (F.col("Solar__Actual_Aggregated") + F.lit(1e-6)))

# Normalise the wind error by measured onshore generation ("__Actual_Aggregated");
# the "__Actual_Consumption" series is not the quantity being forecast.
df = df.withColumn("wind_rel_error",
                       F.abs(F.col("forecast_wind_generation_error")) / (F.col("Wind_Onshore__Actual_Aggregated") + F.lit(1e-6)))

In [0]:
# Render the DataFrame with the absolute and relative forecast-error columns added.
display(df)

In [0]:
# Sources counted as renewable for the penetration ratio, in summation order.
renewable_sources = [
    "Solar__Actual_Aggregated",
    "Wind_Onshore__Actual_Aggregated",
    "Wind_Offshore__Actual_Aggregated",
    "Hydro_Run_of_river_and_poundage__Actual_Aggregated",
    "Hydro_Water_Reservoir__Actual_Aggregated",
    "Geothermal__Actual_Aggregated",
    "Hydro_Pumped_Storage__Actual_Aggregated",
    "Biomass__Actual_Aggregated",
    "Other_renewable__Actual_Aggregated",
    "Marine__Actual_Aggregated",
]

# Left-to-right sum of the renewable series.
renewable_generation = col(renewable_sources[0])
for source_name in renewable_sources[1:]:
    renewable_generation = renewable_generation + col(source_name)

# Share of renewable generation in total supply (generation plus net imports).
total_supply = col("total_generation") + col("net_imports")
df = df.withColumn("renewable_penetration", try_divide(renewable_generation, total_supply))

In [0]:
# Render the DataFrame with renewable_penetration added.
display(df)

In [0]:
# Net Load = demand that must be met by conventional/non-renewable generation
# Net Load Ramp = How fast the controllable generation needs to change
# Renewable sources like solar and wind fluctuate and often have priority dispatch.
# The system operator needs to know how much conventional generation must cover the remaining load.

from pyspark.sql.window import Window
from pyspark.sql.functions import lag
from pyspark.sql.functions import abs
from functools import reduce

# Series subtracted from total load to obtain the net load.
renewable_cols = [
    "Solar__Actual_Aggregated", "Wind_Onshore__Actual_Aggregated",
    "Wind_Offshore__Actual_Aggregated",
    "Hydro_Run_of_river_and_poundage__Actual_Aggregated",
    "Hydro_Water_Reservoir__Actual_Aggregated",
    "Geothermal__Actual_Aggregated",
    "Hydro_Pumped_Storage__Actual_Aggregated",
    "Biomass__Actual_Aggregated",
    "Other_renewable__Actual_Aggregated",
    "Marine__Actual_Aggregated",
]

# Left-to-right sum of the renewable columns (same fold as reduce with `a + b`).
renewable_sum = col(renewable_cols[0])
for renewable_col in renewable_cols[1:]:
    renewable_sum = renewable_sum + col(renewable_col)

df = df.withColumn("net_load", col("total_load") - renewable_sum)

# One window per country, ordered by the "index" column.
# NOTE(review): presumably "index" is the timestamp and rows are hourly without
# gaps, otherwise lag(1) is not a 1-hour step - confirm.
w = Window.partitionBy("country").orderBy("index")

# Absolute change of net load versus the previous row of the same country.
# `abs` here is pyspark.sql.functions.abs (imported above), not the Python built-in.
previous_net_load = lag("net_load", 1).over(w)
df = df.withColumn("net_load_ramp_1h", abs(col("net_load") - previous_net_load))

# Keep only rows with a defined ramp. This removes the first row of each country
# (no previous row) and also any row where net_load or the previous net_load is NULL.
df = df.where(col("net_load_ramp_1h").isNotNull())


In [0]:
# Render the DataFrame with net_load and net_load_ramp_1h added.
display(df)

In [0]:
# 0 (stable) or 1 (unstable) based on multiple criteria evaluated for every timestamp

# Based on ENTSO-E: 
# Solar forecast error > 30% → high
# Wind forecast error > 30% → high

def calculate_grid_stress(
    df,
    max_renewable_penetration=0.7,
    max_net_load_ramp=3000,
    min_reserve_margin=0.1,
    max_load_rel_error=0.03,
    max_solar_rel_error=0.30,
    max_wind_rel_error=0.30,
    max_rel_imbalance=0.05,
):
    """
    Add a binary ``grid_stress`` column that is 1 when any stress indicator fires.

    Args:
        df: Spark DataFrame containing the columns ``renewable_penetration``,
            ``net_load_ramp_1h``, ``reserve_margin``, ``load_rel_error``,
            ``solar_rel_error``, ``wind_rel_error``, ``system_imbalance`` and
            ``total_load``.
        max_renewable_penetration: renewable share above which the row is stressed.
        max_net_load_ramp: 1-hour net-load ramp above which the row is stressed
            (3000 is read as 3 GW, i.e. the series are presumably in MW - confirm).
        min_reserve_margin: reserve margin below which the row is stressed.
        max_load_rel_error: relative load forecast error threshold (0.03 = 3%).
        max_solar_rel_error: relative solar forecast error threshold.
        max_wind_rel_error: relative wind forecast error threshold.
        max_rel_imbalance: threshold on |system_imbalance| / total_load (0.05 = 5%).

    Returns:
        A new DataFrame with the extra integer column ``grid_stress``. The input
        DataFrame is not modified, so the caller must keep the return value.
        With Spark's three-valued OR, a row is NULL when no criterion is true and
        at least one criterion is NULL.

    NOTE(review): ``reserve_margin`` and the relative imbalance are both built in
    this notebook from (total_generation + net_imports - total_load) / total_load.
    With the default thresholds, ``reserve_margin < 0.1`` OR ``|ratio| > 0.05`` is
    true for every non-null value, so the label is 1 on all such rows. The
    thresholds (or the definition of one of the two indicators) need review before
    this column is used as a training target.
    """
    # system_imbalance is an absolute quantity; express it relative to load so the
    # 5% threshold is meaningful. try_divide yields NULL for a zero load.
    relative_imbalance = F.try_divide(
        F.abs(F.col("system_imbalance")), F.col("total_load")
    )

    stress_conditions = (
        (F.col("renewable_penetration") > max_renewable_penetration)  # very high renewable share
        | (F.col("net_load_ramp_1h") > max_net_load_ramp)             # rapid net-load ramp
        | (F.col("reserve_margin") < min_reserve_margin)              # reserve margin too low
        | (F.col("load_rel_error") > max_load_rel_error)              # load forecast error >3%
        | (F.col("solar_rel_error") > max_solar_rel_error)            # solar forecast error >30%
        | (F.col("wind_rel_error") > max_wind_rel_error)              # wind forecast error >30%
        | (relative_imbalance > max_rel_imbalance)                    # system imbalance >5% of load
    )

    return df.withColumn("grid_stress", stress_conditions.cast("int"))

In [0]:
# Spark DataFrames are immutable: keep the returned DataFrame, otherwise the
# grid_stress target column is lost and never reaches the table written below.
df = calculate_grid_stress(df)

In [0]:
# Persist the result, replacing both the data and the schema of an existing table.
# The table name is unqualified, so it resolves against the session's current
# catalog and schema.
target_writer = (
    df.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
target_writer.saveAsTable("electricity_and_weather_europe_with_target")