Sample BI queries
- Trend of daily public net electricity production in Germany for each production type. 
- Prediction of underperformance of public net electricity production at 30-minute intervals.
- Analysis of daily price against the net power for offshore and onshore wind (= production_type)

In [38]:
import os
from dotenv import load_dotenv
from pyspark.sql import SparkSession
from pyspark.sql.functions import trunc,avg

In [39]:
# Resolve the Delta Lake jar files that the Spark session is configured with.
# The jars are expected in a local "jars" folder under the project root.
cwd = os.getcwd()
# If the working directory ends with "notebooks", the project root is taken to
# be its parent; otherwise the working directory itself is used as the root.
proj_dir = os.path.abspath("..") if cwd.endswith("notebooks") else cwd
jar_dir = os.path.join(proj_dir, "jars")
jar1, jar2 = (
    os.path.join(jar_dir, jar_name)
    for jar_name in ("delta-spark_2.13-4.0.0.jar", "delta-storage-4.0.0.jar")
)

In [40]:
# Build the Spark session with Delta Lake support: the two local Delta jars,
# the Delta SQL extension and catalog, and a project-local warehouse directory.
# getOrCreate() returns an already-running session if there is one; in that
# case only runtime SQL configurations take effect (see the warning printed
# below this cell), so restart the kernel if the jar/extension settings change.
spark = (
    SparkSession.builder.appName("EnergyBI_Insights")
    .config("spark.jars", f"{jar1},{jar2}")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.sql.warehouse.dir", f"{proj_dir}/data-warehouse")
    .getOrCreate()
)


25/07/01 21:50:19 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [41]:
# Check which warehouse directory the active session is actually using.
spark.conf.get("spark.sql.warehouse.dir")

'file:/Users/zodenath/Desktop/LOCAL/energy-proj/data-warehouse'

## Loading data to SQL warehouse

In [42]:
# Create the `energy` schema that holds the warehouse tables, unless it already exists.
spark.sql(""" CREATE SCHEMA IF NOT EXISTS energy """)

DataFrame[]

In [43]:
# Register the silver-layer Delta directories as tables in the `energy` schema.
# Every table is created with an explicit LOCATION, so it points at the Delta
# files that already exist under data/silver instead of copying them.

# Public net power production.
data_lake_path = f"{proj_dir}/data/silver/public_power_data"

spark.sql(f"""
            CREATE TABLE IF NOT EXISTS energy.public_power_data
            USING DELTA
            LOCATION '{data_lake_path}'
""")

# Electricity prices.
price_lake_path = f"{proj_dir}/data/silver/price_data"

spark.sql(f"""
            CREATE TABLE IF NOT EXISTS energy.price
            USING DELTA
            LOCATION '{price_lake_path}'
""")

# Installed power (capacity).
# Fix: this path previously repeated the public_power_data directory, which made
# energy.installed_power_data a second view of the production data.
# NOTE: CREATE TABLE IF NOT EXISTS leaves an already-registered table untouched.
# If energy.installed_power_data was registered earlier with the old location,
# run `DROP TABLE energy.installed_power_data` once before re-running this cell.
installed_lake_path = f"{proj_dir}/data/silver/installed_power_data"

spark.sql(f"""
            CREATE TABLE IF NOT EXISTS energy.installed_power_data
            USING DELTA
            LOCATION '{installed_lake_path}'
""")



DataFrame[]

In [44]:
# Inspect the registered table (columns, partition columns and table details)
# without truncating long values in the printed output.
spark.sql("DESCRIBE FORMATTED energy.public_power_data").show(truncate=False)

+----------------------------+----------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                   |comment|
+----------------------------+----------------------------------------------------------------------------+-------+
|production_type             |string                                                                      |NULL   |
|net_power_produced          |float                                                                       |NULL   |
|timestamp                   |timestamp                                                                   |NULL   |
|# Partition Information     |                                                                            |       |
|# col_name                  |data_type                                                                   |comment|
|production_type             |string                                    

In [45]:
# Trend of daily public net electricity production in Germany for each production type.
# Sums net_power_produced per calendar day (DATE(timestamp)) and production_type,
# ordered by date and then by production type.
# NOTE(review): every production_type in the table is aggregated. Output of a later
# cell shows series such as 'Load' and 'Residual load' in the same table — confirm
# that summing those alongside the generation types is intended.

daily_trend = spark.sql("""
    SELECT
        DATE(timestamp) as date,
        production_type,
        SUM(net_power_produced) AS daily_net_production
    FROM energy.public_power_data
    GROUP BY DATE(timestamp), production_type
    ORDER BY date, production_type
""")
# Preview the first 10 rows only.
daily_trend.show(10)

+----------+--------------------+--------------------+
|      date|     production_type|daily_net_production|
+----------+--------------------+--------------------+
|2025-06-30|             Biomass|   76155.49926757812|
|2025-06-30|Cross border elec...|  145359.10009765625|
|2025-06-30|Fossil brown coal...|            185681.0|
|2025-06-30|Fossil coal-deriv...|             10088.0|
|2025-06-30|          Fossil gas|            183015.0|
|2025-06-30|    Fossil hard coal|             81534.0|
|2025-06-30|          Fossil oil|              5394.0|
|2025-06-30|          Geothermal|   286.9000072479248|
|2025-06-30|  Hydro Run-of-River|   37398.59997558594|
|2025-06-30|Hydro pumped storage|             79601.0|
+----------+--------------------+--------------------+
only showing top 10 rows


In [46]:
# Prediction of underperformance of public net electricity on 30min intervals.
# The inner query assigns each row to a 30-minute window via window(timestamp, '30 minutes');
# the outer query sums net_power_produced per production_type and window and
# exposes the window bounds as interval_start / interval_end.
# NOTE(review): this cell only aggregates per interval; no threshold or forecast is
# applied here — presumably the underperformance check happens downstream. TODO confirm.
unperform_prediction = spark.sql("""
    SELECT
        production_type,
        window.start AS interval_start,
        window.end AS interval_end,
        SUM(net_power_produced) AS total_power
    FROM (
        SELECT *, window(timestamp, '30 minutes') AS window
        FROM energy.public_power_data
    )
    GROUP BY production_type, window
    ORDER BY interval_start
""")
# Preview the first 10 rows only.
unperform_prediction.show(10)


+--------------------+-------------------+-------------------+-----------------+
|     production_type|     interval_start|       interval_end|      total_power|
+--------------------+-------------------+-------------------+-----------------+
|       Residual load|2025-06-30 19:00:00|2025-06-30 19:30:00|          45144.0|
|               Waste|2025-06-30 19:00:00|2025-06-30 19:30:00|727.4000244140625|
|          Fossil gas|2025-06-30 19:00:00|2025-06-30 19:30:00|           9320.0|
|Fossil coal-deriv...|2025-06-30 19:00:00|2025-06-30 19:30:00|            441.0|
|                Load|2025-06-30 19:00:00|2025-06-30 19:30:00|          57476.0|
|             Biomass|2025-06-30 19:00:00|2025-06-30 19:30:00|3972.699951171875|
|    Fossil hard coal|2025-06-30 19:00:00|2025-06-30 19:30:00|           4204.0|
|        Wind onshore|2025-06-30 19:00:00|2025-06-30 19:30:00|           2062.0|
|Hydro pumped storage|2025-06-30 19:00:00|2025-06-30 19:30:00|           4033.0|
|Renewable share o...|2025-0

                                                                                

In [47]:
# Analysis of daily price against the net power for offshore and onshore wind (= production_type)
# Joins production rows to price rows on exact timestamp equality (inner join, so
# production rows without a price row at the same timestamp are dropped), keeps only
# the two wind production types, and then per day and production type sums the
# net power and averages the price.

price_vs_power = spark.sql("""
    SELECT
        DATE(p.timestamp) AS date,
        p.production_type,
        SUM(p.net_power_produced) AS total_power,
        AVG(pr.price) AS avg_price
    FROM energy.public_power_data p
    JOIN energy.price pr
        ON p.timestamp = pr.timestamp
    WHERE p.production_type IN ('Wind offshore', 'Wind onshore')
    GROUP BY DATE(p.timestamp), p.production_type
    ORDER BY date
""")
# Preview the first 10 rows only.
price_vs_power.show(10)

+----------+---------------+-----------+------------------+
|      date|production_type|total_power|         avg_price|
+----------+---------------+-----------+------------------+
|2025-06-30|   Wind onshore|    22370.0| 200.4925022125244|
|2025-06-30|  Wind offshore|     5542.0| 200.4925022125244|
|2025-07-01|   Wind onshore|    96261.0|102.00449981689454|
|2025-07-01|  Wind offshore|    28365.0|102.00449981689454|
+----------+---------------+-----------+------------------+



## Ad-hoc analytics by reading data from silver layer and processing further
- Read data from the silver layer; modify according to need
- Load to warehouse in a table (managed)

In [48]:
# Example business query: monthly average price alongside installed capacity.
# The silver-layer Delta files are read directly here instead of going through
# the warehouse tables.
price = spark.read.format("delta").load(f"{proj_dir}/data/silver/price_data")
installed_power = spark.read.format("delta").load(f"{proj_dir}/data/silver/installed_power_data")

# Average price per month; trunc(..., "month") maps each timestamp to the first
# day of its month.
price_monthly = (
    price.withColumn("month", trunc("timestamp", "month"))
    .groupBy("month")
    .agg(avg("price").alias("avg_price"))
)

# Average installed power per (date, production_type).
installed_monthly = (
    installed_power.groupBy("date", "production_type")
    .agg(avg("installed_power").alias("avg_capacity"))
)

# Pair each month with the installed-power rows whose date equals that month
# start, then drop the now-redundant date column.
joined_df = price_monthly.join(
    installed_monthly,
    on=price_monthly["month"] == installed_monthly["date"],
).drop("date")

In [49]:
# Print the joined monthly price / installed-capacity result.
joined_df.show()

+----------+-----------------+--------------------+------------------+
|     month|        avg_price|     production_type|      avg_capacity|
+----------+-----------------+--------------------+------------------+
|2025-06-01|200.4925022125244|Battery Storage (...|20.910999298095703|
|2025-06-01|200.4925022125244|         Solar gross|107.09200286865234|
|2025-06-01|200.4925022125244|        Wind onshore|              NULL|
|2025-06-01|200.4925022125244|Battery Storage (...|14.178000450134277|
|2025-06-01|200.4925022125244|             Biomass|              NULL|
|2025-06-01|200.4925022125244|           Solar net| 95.50800323486328|
|2025-06-01|200.4925022125244|       Wind offshore|              NULL|
+----------+-----------------+--------------------+------------------+



In [None]:
# Show all tables registered in the data warehouse's `energy` schema
spark.sql("SHOW TABLES IN energy").show()

In [None]:
# Stop the Spark session now that the analysis is complete.
spark.stop()