In [0]:
# Load the imputed validation set that all feature engineering below builds on.
SOURCE_TABLE = "workspace.default.validation_set_imputed"
df = spark.table(SOURCE_TABLE)

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window


In [0]:
# =====================================
# Calendar features derived from the `index` column
# weekday = 1 (Monday), ..., 7 (Sunday)
# daytime bins = night / morning / afternoon / evening
# =====================================

# Parse the raw `index` column into a timestamp (NULL when it cannot be parsed).
df = df.withColumn("timestamp", F.to_timestamp("index"))

# Weekday (1–7, Monday = 1).
# `dayofweek` returns 1 = Sunday ... 7 = Saturday, so it is rotated to start on
# Monday. This replaces date_format(..., "u"): the "u" pattern is only accepted
# under the LEGACY time parser policy, which made this column depend on a
# session setting being applied before the first action.
df = df.withColumn(
    "weekday",
    ((F.dayofweek("timestamp") + 5) % 7 + 1).cast("int"),
)

# Hour of day (0–23)
df = df.withColumn("hour", F.hour("timestamp"))

# Daytime bins: [0, 6) night, [6, 12) morning, [12, 18) afternoon, [18, 24) evening.
# The last branch checks for a non-NULL hour explicitly so that rows without a
# valid timestamp stay NULL instead of being silently labelled "evening".
hour_col = F.col("hour")
df = df.withColumn(
    "daytime_bin",
    F.when(hour_col < 6, "night")
     .when(hour_col < 12, "morning")
     .when(hour_col < 18, "afternoon")
     .when(hour_col.isNotNull(), "evening")
)


In [0]:
# Window shared by the lag and rolling features: rows are grouped per country
# and ordered chronologically by `timestamp`.
w = (
    Window
    .partitionBy("country")
    .orderBy("timestamp")
)


In [0]:
# Create lag features (1h, 2h, 3h, 6h, 24h)
# Lag only important columns to avoid too many features.
#
# NOTE(review): F.lag shifts by *rows* within the window `w`, so the "h" suffix
# equals hours only if every country has exactly one row per hour with no gaps
# or duplicate timestamps — TODO confirm against the source table.

lag_cols = [
    "Actual_Load",
    "Forecasted_Load",
    "load_rel_error",
    "grid_stress_score",
    "score_T8",
    "score_reserve_margin",
    "net_imports",
    "T7_high_exports",
    "T8_high_imports",
    "mean_ssrd",
    "mean_temperature_c",
    "mean_wind_speed",
    "P10_net",
    "P90_net"
]

lags = [1, 2, 3, 6, 24]

# Build all lag expressions up front and add them in a single projection.
# Calling withColumn once per feature (14 columns x 5 lags = 70 calls) stacks
# one projection per call and produces a needlessly large query plan.
# Column names and their order are the same as with the nested loop.
lag_features = {
    f"{col_name}_lag_{h}h": F.lag(col_name, h).over(w)
    for col_name in lag_cols
    for h in lags
}

df = df.withColumns(lag_features)


#### 🎯 Recommended Variables for Lag Features

Based on correlation strength, signal relevance, and time-series behavior, only the following variables should receive lag features.

1. **Load Lags**
Load dynamics strongly influence grid stress.  
  ✔ Actual_Load  
  ✔ Forecasted_Load  
  ✔ load_rel_error  

2. **Stress-Related Lags**
Stress tends to persist or build over time.  
  ✔ grid_stress_score  
  ✔ score_T8  
  ✔ score_reserve_margin  

3. **Import/Export Lags**
Cross-border flows are a key driver of grid stability.  
  ✔ net_imports  
  ✔ T7_high_exports  
  ✔ T8_high_imports  

4. **Weather Lags**
Only these weather features show meaningful correlation with stress.  
  ✔ mean_ssrd  
  ✔ mean_temperature_c  
  ✔ mean_wind_speed  

5. **Risk or Threshold Signals**
These reflect system imbalance and volatility.  
  ✔ P10_net  
  ✔ P90_net  

In [0]:
# Create rolling mean features (3h, 6h, 24h)
#
# Each mean covers the `size` rows immediately *before* the current row
# (rowsBetween(-size, -1)), so the current observation is never included.
# Near the start of a country's series fewer rows are available, and the first
# row has none (NULL mean).
# NOTE(review): the frame is row-based, so "3h" means 3 rows; this matches
# hours only for gap-free hourly data — TODO confirm.

rolling_windows = {
    "3h": 3,
    "6h": 6,
    "24h": 24
}

roll_cols = ["Actual_Load", "grid_stress_score", "net_imports"]

# Build every rolling-mean expression first and add them in one projection
# instead of one withColumn call per feature, which keeps the query plan small.
# Column names and their order are the same as with the nested loop.
rolling_features = {
    f"{col_name}_rolling_mean_{label}": F.avg(col_name).over(w.rowsBetween(-size, -1))
    for label, size in rolling_windows.items()
    for col_name in roll_cols
}

df = df.withColumns(rolling_features)


In [0]:
# Switch the session to the legacy datetime parser before the write below
# triggers execution.
# NOTE(review): presumably required by legacy datetime patterns used upstream
# in this plan — confirm before removing.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

# Persist the engineered features as a Delta table, replacing both the data
# and the schema of any previous version.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("workspace.default.validation_imputed_timebins_lags")
)


In [0]:
# Preview a handful of rows of the engineered feature frame.
preview_rows = df.limit(5)
display(preview_rows)

In [0]:
# Print the column names and data types of the engineered DataFrame.
df.printSchema()