In [0]:
"""
04_gold_analytics.py

Purpose:
- Aggregate Silver sensor measurements
- Produce daily, location-level and latest sensor snapshot air quality metrics
- Create an analytics-ready Gold Delta table of daily, location-level aggregates for Python-based analysis
- Create an analytics-ready Gold Delta table of the latest per-sensor snapshot for dashboard analytics

Input:
- air_quality_silver.sensor_measurements

Output:
- air_quality_gold.daily_air_quality
- air_quality_gold.latest_sensor_snapshot
"""
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
from pyspark.sql.functions import (
    col,
    to_date,
    avg,
    count,
    countDistinct,
    when,
    max as spark_max
)

# The Gold database must exist before any Gold table is written
spark.sql("CREATE DATABASE IF NOT EXISTS air_quality_gold")

# Load the Silver sensor measurements
silver_df = spark.read.table("air_quality_silver.sensor_measurements")

# Keep only the P1 / P2 measurement types and derive a calendar date from
# the measurement timestamp (used as the daily grouping key further down)
pm_df = (
    silver_df
    .where(col("measurement_type").isin("P1", "P2"))
    .withColumn("date", to_date(col("measurement_ts")))
)

# Daily, location-level roll-up of the PM measurements.
# when(...) without otherwise() evaluates to NULL for non-matching rows and
# avg()/count() ignore NULLs, so each metric only sees its own measurement type.
_pm10_value = when(col("measurement_type") == "P1", col("measurement_value"))
_pm25_value = when(col("measurement_type") == "P2", col("measurement_value"))

_daily_location_keys = ["date", "location_id", "country", "latitude", "longitude"]

gold_df = pm_df.groupBy(*_daily_location_keys).agg(
    avg(_pm10_value).alias("pm10_avg"),
    avg(_pm25_value).alias("pm25_avg"),
    count(_pm10_value).alias("pm10_count"),
    count(_pm25_value).alias("pm25_count"),
    # Number of distinct sensors contributing to the group
    countDistinct("sensor_id").alias("sensors"),
    # Total number of P1 + P2 rows in the group
    count("*").alias("measurements"),
)

# Dashboard shape: one row per grouped sensor reading, with the P1 / P2
# values turned into pm10 / pm25 columns.
_reading_keys = [
    "sensor_id",
    "sensor_type",
    "location_id",
    "country",
    "latitude",
    "longitude",
    "measurement_ts",
    "ingested_at",
]

_pivoted_readings = (
    pm_df.groupBy(*_reading_keys)
    # Pivot values are listed explicitly, so the result always carries
    # exactly a P1 and a P2 column
    .pivot("measurement_type", ["P1", "P2"])
    .agg(spark_max("measurement_value"))
)

pm_pivot_df = (
    _pivoted_readings
    .withColumnRenamed("P1", "pm10")
    .withColumnRenamed("P2", "pm25")
    # Discard rows in which neither PM value is present
    .where(~(col("pm10").isNull() & col("pm25").isNull()))
)

# Pick the latest row per sensor (snapshot step).
# pm_pivot_df is grouped by ingested_at as well as measurement_ts, so a sensor
# can have several rows that share the same measurement_ts. Ordering by
# measurement_ts alone would let row_number() choose arbitrarily among those
# ties, so the snapshot could differ from run to run. ingested_at is used as a
# tie-breaker so the most recently ingested row wins.
window_spec = Window.partitionBy("sensor_id").orderBy(
    col("measurement_ts").desc(),
    col("ingested_at").desc(),
)

latest_df = (
    pm_pivot_df
    # rn == 1 marks the newest row within each sensor_id partition
    .withColumn("rn", row_number().over(window_spec))
    .filter(col("rn") == 1)
    # rn is only a helper column and must not end up in the Gold table
    .drop("rn")
)

# Write the daily Gold table (used for the Python sanity check below).
# The table name is defined once and reused for the write, the row count and
# the preview query, so the three can never drift apart.
GOLD_TABLE_DAILY = "air_quality_gold.daily_air_quality"

(
    gold_df.write
    .format("delta")
    .mode("overwrite")
    # Let the overwrite replace the table schema as well as the data
    .option("overwriteSchema", "true")
    .saveAsTable(GOLD_TABLE_DAILY)
)

# Row count read back from the persisted table (reported at notebook exit)
gold_rows_python = spark.read.table(GOLD_TABLE_DAILY).count()

# Sanity check python: show up to 100 rows, newest dates first
display(
    spark.sql(
        f"""
        SELECT *
        FROM {GOLD_TABLE_DAILY}
        ORDER BY date DESC
        LIMIT 100
        """
    )
)

# Write the Gold latest-snapshot table for dashboard analytics.
# The table name is defined once and reused for the write, the row count and
# the sanity-check query, so the three can never drift apart.
GOLD_TABLE_SNAPSHOT = "air_quality_gold.latest_sensor_snapshot"

(
    latest_df.write
    .format("delta")
    .mode("overwrite")
    # Let the overwrite replace the table schema as well as the data
    .option("overwriteSchema", "true")
    .saveAsTable(GOLD_TABLE_SNAPSHOT)
)

# Row count read back from the persisted table (reported at notebook exit)
gold_rows_dashboard = spark.read.table(GOLD_TABLE_SNAPSHOT).count()

# Sanity check dashboard: row count, mean pm25 and newest ingestion timestamp
display(
    spark.sql(
        f"""
        SELECT
            COUNT(*) AS sensors,
            AVG(pm25) AS avg_pm25,
            MAX(ingested_at) AS last_ingest
        FROM {GOLD_TABLE_SNAPSHOT}
        """
    )
)

# End the notebook run and hand a summary of both Gold row counts back as the
# exit value, so the orchestration can show what each step produced.
_exit_message = (
    f"Gold aggregation python sanity check completed: {gold_rows_python} records for python produced \n"
    f"Gold aggregation for the dashboard completed: {gold_rows_dashboard} records for the dashboard produced"
)
dbutils.notebook.exit(_exit_message)

