In [0]:
# This notebook:
# 1. Loads the raw generation data from the curlybyte_solutions_rawdata_europe_grid_load database
# 2. Computes hourly averages of the numeric columns for each country and hour
# 3. Saves the cleaned generation table as "workspace.schema_capstone.generation_clean"


In [0]:
# Fully qualified name of the raw generation table that this notebook reads.
raw_generation_table = "curlybyte_solutions_rawdata_europe_grid_load.european_grid_raw__v2.generation"
generation = spark.table(raw_generation_table)

In [0]:
# Names of the columns whose Spark dtype string is one of the numeric types
# listed below. Only these columns are averaged in the hourly aggregation.
numeric_dtypes = ("double", "float", "int", "bigint")
numeric_cols = [name for name, dtype in generation.dtypes if dtype in numeric_dtypes]

In [0]:
from pyspark.sql import functions as F

# Truncate each row's "index" value down to the start of its hour and store
# the result in a new "hour" column used as the grouping key.
# NOTE(review): assumes "index" is a timestamp-like column — confirm against the raw table.
hour_bucket = F.date_trunc("hour", F.col("index"))
generation = generation.withColumn("hour", hour_bucket)

In [0]:
# One mean aggregate per numeric column; each aggregate is aliased back to
# its source column name so the output keeps the input's column names.
agg_exprs = [F.mean(col_name).alias(col_name) for col_name in numeric_cols]

# Average within every (country, hour) group and sort the result by hour.
hourly_means = generation.groupBy("country", "hour").agg(*agg_exprs)
hourly_sorted = hourly_means.orderBy("hour")

# Expose the hour bucket under the original "index" column name.
generation_hourly = hourly_sorted.withColumnRenamed("hour", "index")


In [0]:
# Make sure the target schema exists under the workspace catalog;
# IF NOT EXISTS makes the statement a no-op when the schema is already there.
create_schema_sql = "CREATE SCHEMA IF NOT EXISTS workspace.schema_capstone"
spark.sql(create_schema_sql)

In [0]:
# Save the cleaned hourly generation table under the workspace catalog.
# The target is the schema created by the CREATE SCHEMA statement in this
# notebook (workspace.schema_capstone), so the table lands at
# workspace.schema_capstone.generation_clean, as the notebook header states.
# Previously this pointed at "live_data", a schema this notebook never creates.
# mode("overwrite") replaces the existing table contents on every run.
schema_name = "workspace.schema_capstone"
generation_hourly.write.format("delta").mode("overwrite").saveAsTable(f"{schema_name}.generation_clean")
