Sample BI queries
- Trend of daily public net electricity production in Germany for each production type. 
- Prediction of underperformance of public net electricity on 30min intervals.
- Analysis of daily price against the net power for offshore and onshore wind (= production_type)

In [1]:
import os
from dotenv import load_dotenv
from pyspark.sql import SparkSession
from pyspark.sql.functions import trunc,avg

In [2]:
# Get Jar path needed for spark session
# For simplicity using locally downloaded jars for delta format
cwd = os.getcwd()
if cwd.endswith("notebooks"):
    proj_dir = os.path.abspath("..")
else:
    proj_dir = cwd
jar_dir = os.path.join(proj_dir, "jars")
jar1 = os.path.join(jar_dir, "delta-spark_2.13-4.0.0.jar")
jar2 = os.path.join(jar_dir, "delta-storage-4.0.0.jar")

In [3]:
spark = SparkSession.builder.appName("EnergyBI_Insights") \
            .config("spark.jars", f"{jar1},{jar2}") \
            .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
            .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
            .config("spark.sql.warehouse.dir", f"{proj_dir}/data-warehouse") \
            .getOrCreate()




25/06/29 19:22:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
spark.conf.get("spark.sql.warehouse.dir")

'file:/Users/zodenath/Desktop/projects/energy-proj/data-warehouse'

## Loading data to SQL warehouse

In [5]:
spark.sql(""" CREATE SCHEMA IF NOT EXISTS energy """)

DataFrame[]

In [6]:
data_lake_path = f"{proj_dir}/data/silver/public_power_data"

spark.sql(f"""
            CREATE TABLE IF NOT EXISTS energy.public_power_data
            USING DELTA
            LOCATION '{data_lake_path}'
""")

price_lake_path = f"{proj_dir}/data/silver/price_data"

spark.sql(f"""
            CREATE TABLE IF NOT EXISTS energy.price
            USING DELTA
            LOCATION '{price_lake_path}'
""")

installed_lake_path = f"{proj_dir}/data/silver/public_power_data"

spark.sql(f"""
            CREATE TABLE IF NOT EXISTS energy.installed_power_data
            USING DELTA
            LOCATION '{installed_lake_path}'
""")



DataFrame[]

In [7]:
# Trend of daily public net electricity production in Germany for each production type.

daily_trend = spark.sql("""
    SELECT
        DATE(timestamp) as date,
        production_type,
        SUM(net_power_produced) AS daily_net_production
    FROM energy.public_power_data
    GROUP BY DATE(timestamp), production_type
    ORDER BY date, production_type
""")
daily_trend.show(10)

25/06/29 19:22:16 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.




[Stage 2:=====>                                                   (5 + 11) / 50]



                                                                                





+----------+--------------------+--------------------+
|      date|     production_type|daily_net_production|
+----------+--------------------+--------------------+
|2025-04-01|             Biomass|   395143.2001953125|
|2025-04-01|Cross border elec...|  218034.29940795898|
|2025-04-01|Fossil brown coal...|   934869.3037109375|
|2025-04-01|Fossil coal-deriv...|   59497.80001831055|
|2025-04-01|          Fossil gas|   576568.5981445312|
|2025-04-01|    Fossil hard coal|   400229.3992919922|
|2025-04-01|          Fossil oil|    32854.1008605957|
|2025-04-01|          Geothermal|  2021.1000022888184|
|2025-04-01|  Hydro Run-of-River|  163559.99951171875|
|2025-04-01|Hydro pumped storage|  112119.10014736652|
+----------+--------------------+--------------------+
only showing top 10 rows


In [8]:
# Prediction of underperformance of public net electricity on 30min intervals.
unperform_prediction = spark.sql("""
    SELECT
        production_type,
        window.start AS interval_start,
        window.end AS interval_end,
        SUM(net_power_produced) AS total_power
    FROM (
        SELECT *, window(timestamp, '30 minutes') AS window
        FROM energy.public_power_data
    )
    GROUP BY production_type, window
    ORDER BY interval_start
""")
unperform_prediction.show(10)






+--------------------+-------------------+-------------------+------------------+
|     production_type|     interval_start|       interval_end|       total_power|
+--------------------+-------------------+-------------------+------------------+
|       Residual load|2025-04-01 00:00:00|2025-04-01 00:30:00|     79468.8984375|
|          Fossil oil|2025-04-01 00:00:00|2025-04-01 00:30:00|             684.5|
|Hydro pumped storage|2025-04-01 00:00:00|2025-04-01 00:30:00| 659.3000183105469|
|    Fossil hard coal|2025-04-01 00:00:00|2025-04-01 00:30:00|  11579.2998046875|
|               Waste|2025-04-01 00:00:00|2025-04-01 00:30:00|            1781.5|
|        Wind onshore|2025-04-01 00:00:00|2025-04-01 00:30:00|  13683.7001953125|
|Fossil coal-deriv...|2025-04-01 00:00:00|2025-04-01 00:30:00|1286.3999633789062|
|Fossil brown coal...|2025-04-01 00:00:00|2025-04-01 00:30:00|  23208.2998046875|
|             Biomass|2025-04-01 00:00:00|2025-04-01 00:30:00|            8042.0|
|       Wind off

                                                                                

In [9]:
# Analysis of daily price against the net power for offshore and onshore wind (= production_type)

price_vs_power = spark.sql("""
    SELECT
        DATE(p.timestamp) AS date,
        p.production_type,
        SUM(p.net_power_produced) AS total_power,
        AVG(pr.price) AS avg_price
    FROM energy.public_power_data p
    JOIN energy.price pr
        ON p.timestamp = pr.timestamp
    WHERE p.production_type IN ('Wind offshore', 'Wind onshore')
    GROUP BY DATE(p.timestamp), p.production_type
    ORDER BY date
""")
price_vs_power.show(10)

+----------+---------------+-----------------+------------------+
|      date|production_type|      total_power|         avg_price|
+----------+---------------+-----------------+------------------+
|2025-06-28|  Wind offshore|467266.4045410156| 94.47846048306197|
|2025-06-28|   Wind onshore|  944764.98828125| 94.47846048306197|
|2025-06-29|  Wind offshore|601184.8062133789|39.690943403415524|
|2025-06-29|   Wind onshore|    2057580.40625|39.690943403415524|
+----------+---------------+-----------------+------------------+



## Ad-hoc analytics without creating tables

In [10]:
#public_power = spark.read.format("delta").load(f"{proj_dir}/data/silver/public_power_data")  
#price = spark.read.format("delta").load(f"{proj_dir}/data/silver/price_data")   
#installed_power = spark.read.format("delta").load(f"{proj_dir}/data/silver/installed_power_data")  

In [11]:
#public_power.createOrReplaceTempView("net_power")
#price.createOrReplaceTempView("price")
#installed_power.createOrReplaceTempView("installed_power")
