In [0]:
# Read the prepared electricity/weather table that every target below is derived from.
SOURCE_TABLE = "workspace.default.electricity_weather_blackout_ready"
df = spark.table(SOURCE_TABLE)


In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import (
    col, lag, abs, when, greatest, sum as spark_sum,
    try_divide, coalesce, lit
)

In [0]:
# ============================================================
# Target 1 — Renewable Instability Risk (Wind/Solar Variability)
#
# Author's rationale: blackouts often coincide with a sudden solar
# drop (evening), a sudden wind collapse, or unstable weather.
#
# Method: within each country, rows are ordered by "index" and each
# row is compared with the one immediately before it.
#   wind_drop             = |Wind_Onshore - previous Wind_Onshore|
#   solar_drop            = |Solar        - previous Solar|
#   renewable_variability = wind_drop + solar_drop
# The flag is 1 when renewable_variability exceeds 40% of
# total_generation, otherwise 0.
#
# Notes:
# - abs() is applied, so increases count as much as decreases even
#   though the columns are named "*_drop".
# - The first row of each country has no previous row; lag() yields
#   NULL there, so the derived columns and the flag are NULL.
# - NOTE(review): "hour-to-hour" assumes consecutive "index" values
#   are one hour apart — confirm against the source table.
# ============================================================
window_spec = Window.partitionBy("country").orderBy("index")

# Previous-row values of each renewable series, per country.
previous_wind = lag("Wind_Onshore").over(window_spec)
previous_solar = lag("Solar").over(window_spec)

# A change is treated as extreme once it exceeds 40% of total generation.
instability_threshold = 0.40 * col("total_generation")

df = (
    df
    .withColumn("wind_drop", abs(col("Wind_Onshore") - previous_wind))
    .withColumn("solar_drop", abs(col("Solar") - previous_solar))
    .withColumn("renewable_variability", col("wind_drop") + col("solar_drop"))
    .withColumn(
        "target1_renewable_instability_risk",
        (col("renewable_variability") > instability_threshold).cast("int"),
    )
)


In [0]:
# ============================================================
# Target 2 — Weather-Based Blackout Risk
#
# Author's rationale for weather-driven stress:
#   - low wind + no sun  -> renewable stress
#   - very cold temps    -> heating load spikes
#   - extreme heat       -> AC load spikes
#
# Rule implemented here (weather columns only): the flag is 1 only
# when ALL of the following hold in the same row, otherwise 0:
#   mean_wind_speed    < 1
#   mean_ssrd          < 20
#   mean_temperature_c < -5
#
# NOTE(review): the extreme-heat case listed above is not part of
# the rule, and the three conditions are combined with AND rather
# than evaluated separately — confirm this is intended.
# ============================================================

calm_wind = col("mean_wind_speed") < 1
little_sun = col("mean_ssrd") < 20
severe_cold = col("mean_temperature_c") < -5

df = df.withColumn(
    "target2_weather_blackout_risk",
    (calm_wind & little_sun & severe_cold).cast("int"),
)

In [0]:
# ============================================================
# Target 3 — Forecasted Load Exceeds Capacity
#
# The flag is 1 when Forecasted_Load is strictly greater than
# total_generation + net_imports, otherwise 0.
# The original author describes this as a future-oriented risk
# that suits 6h-ahead prediction.
# ============================================================

# Supply side of the comparison: own generation plus net imports.
available_supply = col("total_generation") + col("net_imports")
forecast_exceeds_supply = col("Forecasted_Load") > available_supply

df = df.withColumn(
    "target3_forecast_overload_risk",
    forecast_exceeds_supply.cast("int"),
)

In [0]:
# ============================================================
# Target 4 — Supply Diversity Risk (Concentration of Generation)
#
# Author's rationale: countries relying on only 1–2 sources have a
# higher blackout risk; high concentration -> fragile system.
#
# max_share = (largest value among the gen_cols columns)
#             / total_generation
# try_divide returns NULL when the divisor is zero; any NULL share
# is replaced with 0. The flag is 1 when max_share > 0.70.
#
# Only the six sources listed in gen_cols are compared.
# ============================================================

gen_cols = [
    "Nuclear__Actual_Aggregated",
    "Wind_Onshore__Actual_Aggregated",
    "Wind_Offshore__Actual_Aggregated",
    "Solar__Actual_Aggregated",
    "Hydro_Water_Reservoir__Actual_Aggregated",
    "Fossil_Gas__Actual_Aggregated",
]

# Output of whichever listed source is largest in the row.
largest_source_output = greatest(*gen_cols)

df = df.withColumn(
    "max_share",
    coalesce(try_divide(largest_source_output, col("total_generation")), lit(0)),
).withColumn(
    "target4_supply_concentration_risk",
    (col("max_share") > 0.70).cast("int"),
)

In [0]:
# ============================================================
# Target 5 — Combined Multi-Feature Blackout Score
#
# Weighted sum of the individual signals:
# score = 0.4 * imbalance_pct
#       + 0.2 * (renewable_variability / total_generation)
#       + 0.2 * target3_forecast_overload_risk
#       + 0.1 * target4_supply_concentration_risk
#       + 0.1 * target2_weather_blackout_risk
#
# blackout_multi_label is 1 when the score exceeds 0.5, otherwise 0.
# Intended as a more robust label than any single target.
#
# Why the renewable term is divided by total_generation:
# renewable_variability is an absolute change on the same scale as
# generation (Target 1 compares it with 0.40 * total_generation),
# whereas the other weighted terms are 0/1 flags. Adding it raw
# lets it swamp the 0.5 cut-off, so it is expressed as a share of
# total generation instead.
#
# The share is coalesced to 0 when it cannot be computed: on the
# first row of each country (lag has no predecessor, so
# renewable_variability is NULL) and when total_generation is zero
# (try_divide returns NULL). This mirrors how max_share is built.
#
# NOTE(review): imbalance_pct is used exactly as stored in the
# source table; confirm it is a fraction (0–1) and not a
# percentage (0–100), otherwise it will dominate the score.
# ============================================================
renewable_variability_share = coalesce(
    try_divide(col("renewable_variability"), col("total_generation")),
    lit(0.0),
)

df = df.withColumn(
    "target5_blackout_score_multi",
    0.4 * col("imbalance_pct")
    + 0.2 * renewable_variability_share
    + 0.2 * col("target3_forecast_overload_risk")
    + 0.1 * col("target4_supply_concentration_risk")
    + 0.1 * col("target2_weather_blackout_risk"),
)

df = df.withColumn(
    "blackout_multi_label",
    (col("target5_blackout_score_multi") > 0.5).cast("int"),
)

In [0]:
# ============================================================
# Target 6 — Imbalance-based blackout risk (already in the table; nothing is computed in this cell)
# Based on: imbalance_pct
#           relationship between load and total_generation 
#           adjusted for each country
# ============================================================

In [0]:
# Persist the frame with all target columns as a Delta table.
# mode("overwrite") replaces any existing table contents, and
# overwriteSchema=true lets the stored schema be replaced as well.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("workspace.default.blackout_targets")
)
