# Energy Data Analytics - Delta Lake Queries

This notebook demonstrates reading Delta format data and executing three core energy analytics queries using Spark SQL:

1. **Daily Production Trends** - Daily electricity production by production type
2. **Underperformance Prediction Features** - ML features for energy production forecasting
3. **Wind Price Analysis** - Wind power production vs electricity prices

## Data Sources
- Delta tables in the Gold layer: `gold_fact_power`, `gold_dim_production_type`, `gold_fact_power_30min_agg`

## 1. Setup Spark Session and Delta Lake

In [None]:
# Import required libraries
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import BooleanType, IntegerType, StringType, StructField, StructType

# Build (or reuse) the Spark session for this notebook with Delta Lake enabled.
# Settings are applied in the order listed:
#   - the first two register Delta's SQL extension and Delta's catalog
#   - the last two switch on adaptive execution and partition coalescing
session_settings = [
    ("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"),
    ("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"),
    ("spark.sql.adaptive.enabled", "true"),
    ("spark.sql.adaptive.coalescePartitions.enabled", "true"),
]

session_builder = SparkSession.builder.appName("EnergyAnalyticsDelta")
for setting_name, setting_value in session_settings:
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()

# Report the session details so the reader can see which Spark build is in use.
print("Spark Session created with Delta Lake support")
print(f"Spark Version: {spark.version}")
print(f"Application Name: {spark.sparkContext.appName}")

## 2. Read Delta Lake Tables

In [None]:
# Root of the Delta Lake. Set the DELTA_BASE_PATH environment variable to point
# at your environment; when it is unset or empty the original development path
# is used.
delta_base_path = (
    os.environ.get("DELTA_BASE_PATH")
    or "/Users/srsu/Downloads/spark_datapipeline/delta_lake"
)


def load_gold_table(table_name):
    """Load one Gold-layer Delta table and register it as a temp view.

    Args:
        table_name: Directory name under ``{delta_base_path}/gold``; the same
            name is used for the temp view.

    Returns:
        The loaded DataFrame, or ``None`` when the table could not be read.
        The error is printed rather than raised so the notebook keeps running.
    """
    table_path = f"{delta_base_path}/gold/{table_name}"
    try:
        table_df = spark.read.format("delta").load(table_path)
        record_count = table_df.count()
    except Exception as e:
        print(f"Could not read Delta table {table_name}: {e}")
        return None
    # Register the view only once the table has proven readable.
    table_df.createOrReplaceTempView(table_name)
    print(f"Loaded {table_name}: {record_count} records")
    return table_df


# Read Delta tables from the Gold layer
print("📊 Reading Delta tables...")

# Each table is loaded independently so that one missing table does not
# discard (or overwrite with sample data) the tables that are available.
gold_dim_production_type = load_gold_table("gold_dim_production_type")
gold_fact_power = load_gold_table("gold_fact_power")
gold_fact_power_30min_agg = load_gold_table("gold_fact_power_30min_agg")

dimension_is_sample = gold_dim_production_type is None
if dimension_is_sample:
    # Fall back to a small hand-written dimension table for demonstration.
    # Only the dimension has sample data; no sample fact tables are created.
    print("Creating sample data for POC demonstration...")

    dim_data = [
        (1, "Solar", "Solar", "Renewable", "Uncontrollable", True),
        (2, "Wind_Onshore", "Wind_Onshore", "Renewable", "Uncontrollable", True),
        (3, "Wind_Offshore", "Wind_Offshore", "Renewable", "Uncontrollable", True),
        (4, "Nuclear", "Nuclear", "Nuclear", "Controllable", True)
    ]

    dim_schema = StructType([
        StructField("production_type_id", IntegerType(), True),
        StructField("production_type", StringType(), True),
        StructField("production_plant_name", StringType(), True),
        StructField("energy_category", StringType(), True),
        StructField("controllability_type", StringType(), True),
        StructField("active_flag", BooleanType(), True)
    ])

    spark.createDataFrame(dim_data, dim_schema).createOrReplaceTempView("gold_dim_production_type")
    print("Created sample gold_dim_production_type")

# Report what is actually available instead of always claiming success.
missing_fact_tables = [
    fact_name
    for fact_name, fact_df in (
        ("gold_fact_power", gold_fact_power),
        ("gold_fact_power_30min_agg", gold_fact_power_30min_agg),
    )
    if fact_df is None
]
if missing_fact_tables:
    print(f"\nWarning: fact tables not available: {', '.join(missing_fact_tables)}. Queries that read them will fail.")
elif dimension_is_sample:
    print("\nFact tables loaded; gold_dim_production_type is sample data.")
else:
    print("\n🎯 Delta tables loaded and ready for queries!")

## 3. Query 1: Daily Production Trends

This query analyzes daily electricity production trends by production type.

In [None]:
# Query 1: Daily Production Trends
# Sums electricity_produced per (year, month, day, production plant name) for
# rows whose country is 'de', ordered by date and then plant name.
daily_production_query = """
SELECT
  f.year,
  f.month,
  f.day,
  d.production_plant_name AS production_type,
  SUM(f.electricity_produced) AS total_daily_production
FROM gold_fact_power f
JOIN gold_dim_production_type d ON f.production_type_id = d.production_type_id
WHERE f.country = 'de'
GROUP BY f.year, f.month, f.day, d.production_plant_name
ORDER BY f.year, f.month, f.day, d.production_plant_name
"""

section_rule = "=" * 50
print("🔍 Executing Query 1: Daily Production Trends", section_rule, sep="\n")

try:
    daily_trends_df = spark.sql(daily_production_query)
    print("Query executed successfully!")

    daily_record_total = daily_trends_df.count()
    print(f"Results: {daily_record_total} records found")

    # Preview the first rows without truncating column values.
    print("\nSample Results:")
    daily_trends_df.show(10, truncate=False)

    # Column names and types of the result set.
    print("\nSchema:")
    daily_trends_df.printSchema()

except Exception as query_error:
    # Best-effort demo: report the failure and let the notebook continue.
    print(f"Query failed: {query_error}")
    print("This is expected if Delta tables don't exist - using sample data for POC")

## 4. Query 2: Underperformance Prediction Features

This query generates ML features for predicting energy production underperformance: 1-day and 1-week lag features and a rolling 7-day average.

In [None]:
# Query 2: Underperformance Prediction Features
# Builds per-row features from the 30-minute aggregate table for country 'de'
# and active production types:
#   - lag_1d / lag_1w: the value 48 and 336 (= 48 * 7) rows earlier for the same
#     production type. This equals "one day / one week ago" only if there is
#     exactly one row per half hour per production type -- TODO confirm.
#   - rolling_7d_avg: mean of the previous 7 rows within the same
#     (production type, hour, half-hour slot) partition, i.e. the same
#     time-of-day slot on the 7 preceding days under the same assumption.
# The rolling window uses a ROWS frame. The previous RANGE frame with a numeric
# offset of 336 was measured in units of the ORDER BY column, not in rows, and
# Spark rejects a numeric RANGE offset when that column is a timestamp.
underperformance_query = """
SELECT
    f.timestamp_30min,
    f.production_type_id,
    d.production_plant_name,
    d.energy_category,
    d.controllability_type,
    f.total_electricity_produced,
    f.year, f.month, f.day, f.hour, f.minute_interval_30,
    LAG(f.total_electricity_produced, 48) OVER (PARTITION BY f.production_type_id ORDER BY f.timestamp_30min) AS lag_1d,
    LAG(f.total_electricity_produced, 336) OVER (PARTITION BY f.production_type_id ORDER BY f.timestamp_30min) AS lag_1w,
    AVG(f.total_electricity_produced) OVER (
        PARTITION BY f.production_type_id, f.hour, f.minute_interval_30
        ORDER BY f.timestamp_30min
        ROWS BETWEEN 7 PRECEDING AND 1 PRECEDING
    ) AS rolling_7d_avg
FROM gold_fact_power_30min_agg f
JOIN gold_dim_production_type d ON f.production_type_id = d.production_type_id
WHERE f.country = 'de' AND d.active_flag = TRUE
"""

print("🔍 Executing Query 2: ML Features for Underperformance Prediction")
print("=" * 65)

try:
    underperformance_query_df = spark.sql(underperformance_query)

    print("Query executed successfully!")
    print(f"Results: {underperformance_query_df.count()} records found")

    print("\nSample ML Features:")
    underperformance_query_df.show(5, truncate=False)

    # Summary statistics for the target column and the three derived features.
    print("\nFeature Statistics:")
    underperformance_query_df.select("total_electricity_produced", "lag_1d", "lag_1w", "rolling_7d_avg").describe().show()

except Exception as e:
    # Best-effort demo: report the failure and let the notebook continue.
    print(f" Query failed: {e}")
    print("This is expected if 30-minute Delta table doesn't exist - using sample data for POC")

## 5. Query 3: Wind Price Analysis

This query analyzes the relationship between wind power production (offshore and onshore) and electricity prices.

In [None]:
# Query 3: Wind Price Analysis
# Per day and wind plant name (Wind_Offshore / Wind_Onshore, active types only,
# country 'de'): summed electricity_produced and mean electricity_price.
wind_price_query = """
SELECT
  f.year, f.month, f.day,
  d.production_plant_name AS production_type,
  SUM(f.electricity_produced) AS total_daily_production_mw,
  AVG(f.electricity_price) AS avg_daily_price_eur_per_mwh
FROM gold_fact_power f
JOIN gold_dim_production_type d ON f.production_type_id = d.production_type_id
WHERE f.country = 'de'
  AND d.production_plant_name IN ('Wind_Offshore', 'Wind_Onshore') 
  AND d.active_flag = TRUE 
GROUP BY f.year, f.month, f.day, d.production_plant_name
ORDER BY f.year, f.month, f.day, d.production_plant_name
"""

print("🔍 Executing Query 3: Wind Power vs Price Analysis")
print("=" * 50)

try:
    wind_analysis_df = spark.sql(wind_price_query)

    print("Query executed successfully!")
    # Count once and reuse the value: every .count() on this lazy DataFrame
    # re-runs the join and aggregation.
    wind_record_total = wind_analysis_df.count()
    print(f"Results: {wind_record_total} records found")

    print("\nWind Power vs Price Results:")
    wind_analysis_df.show(10, truncate=False)

    # Summary statistics by wind type (skipped when the query returned no rows).
    if wind_record_total > 0:
        print("\nSummary by Wind Type:")
        wind_summary = wind_analysis_df.groupBy("production_type").agg(
            avg("total_daily_production_mw").alias("avg_production"),
            avg("avg_daily_price_eur_per_mwh").alias("avg_price"),
            count("*").alias("total_days")
        )
        wind_summary.show(truncate=False)

except Exception as e:
    # Best-effort demo: report the failure and let the notebook continue.
    print(f"Query failed: {e}")
    print("This is expected if Delta tables don't exist - using sample data for POC")

In [None]:
# Stop the Spark session created in the setup cell. Re-run the setup cell
# before executing any of the query cells again.
spark.stop()