# Energy Data Query Notebook

This notebook queries the Delta tables created by the data ingestion script.

In [1]:
# Import required libraries
from pyspark.sql import SparkSession

# Initialize Spark session with Delta support
spark = SparkSession.builder \
    .appName("Energy Data Query") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0") \
    .getOrCreate()

:: loading settings :: url = jar:file:/usr/local/lib/python3.10/dist-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c1d3aa63-22b9-4406-85ee-62af4bb244c4;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 367ms :: artifacts dl 18ms
	:: modules in use:
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |   0   |   0 

In [2]:
# Define the paths to the Delta tables (adjust these paths as per your config)
public_power_path = "/workspaces/baywa-data-pipeline/data/public_power"  # Replace with your actual path
price_path = "/workspaces/baywa-data-pipeline/data/price"                  # Replace with your actual path
installed_power_path = "/workspaces/baywa-data-pipeline/data/installed_power"  # Replace with your actual path

In [12]:
# Querying public power data
public_power_df = spark.read.format("delta").load(public_power_path)
public_power_df.createOrReplaceTempView("public_power")

# Example SQL query to get average production values
average_public_power = spark.sql("""
    SELECT *
    FROM public_power
    LIMIT 100
""")
average_public_power.show()

+---------------+-------------------+-------+
|production_type|          timestamp|  value|
+---------------+-------------------+-------+
|   Wind onshore|2023-12-31 18:45:00|29554.0|
|   Wind onshore|2023-12-31 18:45:00|29062.9|
|   Wind onshore|2023-12-31 18:45:00|29185.5|
|   Wind onshore|2023-12-31 18:45:00|28968.5|
|   Wind onshore|2023-12-31 18:45:00|28369.2|
|   Wind onshore|2023-12-31 18:45:00|28610.7|
|   Wind onshore|2023-12-31 18:45:00|29241.6|
|   Wind onshore|2023-12-31 18:45:00|29230.1|
|   Wind onshore|2023-12-31 18:45:00|29299.7|
|   Wind onshore|2023-12-31 18:45:00|29385.3|
|   Wind onshore|2023-12-31 18:45:00|29085.6|
|   Wind onshore|2023-12-31 18:45:00|28724.5|
|   Wind onshore|2023-12-31 18:45:00|28758.0|
|   Wind onshore|2023-12-31 18:45:00|28762.7|
|   Wind onshore|2023-12-31 18:45:00|29124.9|
|   Wind onshore|2023-12-31 18:45:00|29548.1|
|   Wind onshore|2023-12-31 18:45:00|30055.3|
|   Wind onshore|2023-12-31 18:45:00|30108.9|
|   Wind onshore|2023-12-31 18:45:

In [3]:
# Querying public power data
public_power_df = spark.read.format("delta").load(public_power_path)
public_power_df.createOrReplaceTempView("public_power")

# Example SQL query to get average production values
average_public_power = spark.sql("""
    SELECT production_type, AVG(value) AS average_value
    FROM public_power
    GROUP BY production_type
""")
average_public_power.show()

24/10/30 10:00:17 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 9:>                                                        (0 + 12) / 12]

+--------------------+-------------------+
|     production_type|      average_value|
+--------------------+-------------------+
|               Waste| 1115.4239583333328|
|          Fossil gas| 2713.9281249999976|
|          Fossil oil| 392.81145833333363|
|Hydro water reser...|  75.12604166666678|
|Renewable share o...|  92.49791666666657|
|               Solar| 1135.5500000000018|
|Renewable share o...|  80.11875000000002|
|       Wind offshore|  5707.162499999985|
|             Nuclear|               null|
|Cross border elec...| -6802.878124999999|
|Hydro pumped storage| 1050.1208333333345|
|          Geothermal|  20.22395833333327|
|  Hydro Run-of-River| 2080.4947916666647|
|        Wind onshore| 26499.397916666672|
|       Residual load|  10486.62499999999|
|Fossil brown coal...| 3250.2406250000004|
|              Others|  211.0635416666664|
|    Fossil hard coal| 1713.7614583333345|
|             Biomass|  4531.489583333334|
|Hydro pumped stor...|-2010.4166666666622|
+----------

                                                                                

In [14]:
# Querying price data
price_df = spark.read.format("delta").load(price_path)
price_df.createOrReplaceTempView("price")

# Example SQL query to get maximum price
max_price = spark.sql("""
    SELECT *
    FROM price
""")
max_price.show()

                                                                                

+------+-------------------+-------------------+
| price|          timestamp|               unit|
+------+-------------------+-------------------+
|115.34|2024-10-30 21:00:00|EUR / megawatt_hour|
|112.37|2024-10-30 22:00:00|EUR / megawatt_hour|
|102.53|2024-10-30 11:00:00|EUR / megawatt_hour|
|106.95|2024-10-30 12:00:00|EUR / megawatt_hour|
| 160.0|2024-10-30 15:00:00|EUR / megawatt_hour|
|193.88|2024-10-30 16:00:00|EUR / megawatt_hour|
| 115.0|2024-10-30 09:00:00|EUR / megawatt_hour|
|109.68|2024-10-30 10:00:00|EUR / megawatt_hour|
|123.54|2024-10-30 05:00:00|EUR / megawatt_hour|
|139.68|2024-10-30 06:00:00|EUR / megawatt_hour|
| 98.16|2024-10-30 01:00:00|EUR / megawatt_hour|
| 98.05|2024-10-30 02:00:00|EUR / megawatt_hour|
|111.31|2024-10-30 13:00:00|EUR / megawatt_hour|
|126.16|2024-10-30 14:00:00|EUR / megawatt_hour|
|192.89|2024-10-30 17:00:00|EUR / megawatt_hour|
|159.44|2024-10-30 18:00:00|EUR / megawatt_hour|
|100.71|2024-10-29 23:00:00|EUR / megawatt_hour|
|100.52|2024-10-30 0

In [13]:
# Querying price data
price_df = spark.read.format("delta").load(price_path)
price_df.createOrReplaceTempView("price")

# Example SQL query to get maximum price
max_price = spark.sql("""
    SELECT MAX(price) AS max_price
    FROM price
""")
max_price.show()

                                                                                

+---------+
|max_price|
+---------+
|   193.88|
+---------+



In [16]:
# Querying installed power data
installed_power_df = spark.read.format("delta").load(installed_power_path)
installed_power_df.createOrReplaceTempView("installed_power")

# Example SQL query to get total installed power by year
total_installed_power = spark.sql("""
    SELECT year, SUM(installed_power) AS total_power
    FROM installed_power
    GROUP BY year
    ORDER BY year
""")
total_installed_power.show()


+----+------------------+
|year|       total_power|
+----+------------------+
|2002| 4270.264999999997|
|2003| 4270.264999999997|
|2004| 4270.264999999997|
|2005| 4270.264999999998|
|2006| 4270.264999999997|
|2007|4270.2649999999985|
|2008| 4270.264999999997|
|2009|          4270.265|
|2010| 4270.264999999997|
|2011| 4270.264999999999|
|2012| 4270.264999999997|
|2013|          4270.265|
|2014| 4270.264999999997|
|2015| 4270.264999999999|
|2016| 4270.264999999997|
|2017| 4270.264999999999|
|2018| 4270.264999999997|
|2019| 4270.264999999999|
|2020| 4270.264999999997|
|2021| 4270.264999999999|
+----+------------------+
only showing top 20 rows

