# In [0]:
"""
03_silver_cleaning.py

Purpose:
- Parse raw JSON from Bronze layer
- Enforce schema and data types
- Clean and deduplicate sensor measurements
- Write structured Silver Delta table

Input:
- air_quality_bronze.live_sensor_raw

Output:
- air_quality_silver.sensor_measurements

Visual tree view of the sensor_schema:
sensor_schema
├─ id : Long
├─ timestamp : String
├─ location : Struct
│    ├─ id : Long
│    ├─ latitude : String
│    ├─ longitude : String
│    └─ country : String
├─ sensor : Struct
│    ├─ id : Long
│    └─ sensor_type : Struct
│          └─ name : String
└─ sensordatavalues : Array of Struct
     ├─ value_type : String
     └─ value : String

"""

# Make sure the target database of the Silver layer exists before anything is written to it
create_silver_db_sql = "CREATE DATABASE IF NOT EXISTS air_quality_silver"
spark.sql(create_silver_db_sql)

# Imports (the bronze table itself is read right after them)
from pyspark.sql.functions import (
    col, 
    from_json, 
    to_timestamp, 
    explode, 
    to_timestamp, 
    first, 
    when)
from pyspark.sql.types import (
    StructType, 
    StructField, 
    StringType, 
    DoubleType, 
    LongType, 
    ArrayType)
from pyspark.sql.window import Window

# Load the raw Bronze table into a DataFrame
BRONZE_TABLE = "air_quality_bronze.live_sensor_raw"
bronze_df = spark.read.table(BRONZE_TABLE)

# Define JSON schema
# The nested structs are built first and then assembled into the top-level
# schema. Every field is nullable; coordinates and measurement values are read
# as strings here and cast to numbers in later steps.
_location_struct = StructType([
    StructField("id", LongType(), True),
    StructField("latitude", StringType(), True),
    StructField("longitude", StringType(), True),
    StructField("country", StringType(), True),
])

_sensor_type_struct = StructType([
    StructField("name", StringType(), True),
])

_sensor_struct = StructType([
    StructField("id", LongType(), True),
    StructField("sensor_type", _sensor_type_struct, True),
])

# One entry of the "sensordatavalues" array: a (value_type, value) pair
_data_value_struct = StructType([
    StructField("value_type", StringType(), True),
    StructField("value", StringType(), True),
])

sensor_schema = StructType([
    StructField("id", LongType(), True),
    StructField("timestamp", StringType(), True),
    StructField("location", _location_struct, True),
    StructField("sensor", _sensor_struct, True),
    StructField("sensordatavalues", ArrayType(_data_value_struct), True),
])

# Parse the raw JSON string into a typed struct column using sensor_schema
parsed_json_col = from_json(col("raw_json"), sensor_schema)
parsed_df = bronze_df.withColumn("parsed_json", parsed_json_col)


def _payload(path):
    """Return a column reference to ``path`` inside the ``parsed_json`` struct."""
    return col(f"parsed_json.{path}")


# Flatten the parsed struct into top-level columns. The "sensordatavalues"
# array is exploded, so each array entry becomes its own row carrying a
# "measurement" struct; the ingestion metadata columns are passed through.
exploded_columns = [
    _payload("sensor.id").alias("sensor_id"),
    _payload("sensor.sensor_type.name").alias("sensor_type"),
    to_timestamp(_payload("timestamp")).alias("measurement_ts"),
    _payload("location.id").alias("location_id"),
    _payload("location.latitude").cast("double").alias("latitude"),
    _payload("location.longitude").cast("double").alias("longitude"),
    _payload("location.country").alias("country"),
    explode(_payload("sensordatavalues")).alias("measurement"),
    col("ingested_at"),
    col("batch_id"),
]
exploded_df = parsed_df.select(*exploded_columns)

# Silver projection: unpack the exploded "measurement" struct into a type
# column and a numeric value column, keeping all other columns as they are.
projected_df = exploded_df.select(
    "sensor_id",
    "sensor_type",
    "measurement_ts",
    "location_id",
    "latitude",
    "longitude",
    "country",
    col("measurement.value_type").alias("measurement_type"),
    col("measurement.value").cast("double").alias("measurement_value"),
    "ingested_at",
    "batch_id",
)

# Reusable row predicates
has_timestamp = col("measurement_ts").isNotNull()
has_value = col("measurement_value").isNotNull()
has_sensor = col("sensor_id").isNotNull()

# Keep only rows that have both a timestamp and a numeric value
silver_df = projected_df.where(has_timestamp & has_value)

# Filter invalid records (light cleaning): a sensor id is required as well
silver_df_clean = silver_df.where(has_sensor & has_timestamp)

# Deduplicate: one row per sensor, timestamp and measurement type
dedup_keys = ["sensor_id", "measurement_ts", "measurement_type"]
silver_df_dedup = silver_df_clean.dropDuplicates(dedup_keys)

# Add quality checks / plausibility filtering
# Physical plausibility: a measurement value must not be negative.
# NOTE(review): this rule is applied to every measurement_type. If the feed
# carries quantities that can legitimately be negative (e.g. temperature),
# those rows end up flagged "BAD" — confirm whether it should be per type.
silver_df_filtered = silver_df_dedup.withColumn(
    "is_plausible",
    (col("measurement_value") >= 0)
)

# Create separate PM10 (P1) / PM2.5 (P2) columns to check their ratio.
# Only P1/P2 rows take part; they are pivoted to one row per location and
# timestamp with a "P1" and a "P2" column.
pm_df = silver_df_filtered.filter(col("measurement_type").isin(["P1", "P2"]))

pm_pivot_df = pm_df.groupBy(
    "location_id",
    "measurement_ts",
).pivot("measurement_type", ["P1", "P2"]).agg(first("measurement_value"))

# Add plausibility flag for ratio: PM2.5 <= 1.2 * PM10.
# The comparison yields NULL when either P1 or P2 is missing for the group.
pm_pivot_df = pm_pivot_df.withColumn(
    "ratio_plausible",
    (col("P2") <= col("P1")*1.2)
)

# ratio_plausible is NULL whenever the ratio check could not be evaluated:
# the left join below finds no PM row for that location/timestamp, or only
# one of P1/P2 was present. A NULL must not fail the row; without this,
# `is_plausible & NULL` evaluates to NULL and every such row would fall
# through to "BAD". Only an explicit False counts as a violated ratio check.
ratio_not_violated = col("ratio_plausible").isNull() | col("ratio_plausible")

# Merge back ratio flag with main silver_df and derive the final quality flag
silver_df_final = silver_df_filtered.join(
    pm_pivot_df.select("location_id", "measurement_ts", "ratio_plausible"),
    on=["location_id", "measurement_ts"],
    how="left"
).withColumn(
    "quality_flag",
    when(col("is_plausible") & ratio_not_violated, "OK").otherwise("BAD")
)

# Write Silver Delta table: the existing table content and its schema are
# both replaced on every run.
SILVER_TABLE = "air_quality_silver.sensor_measurements"

silver_writer = (
    silver_df_final.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
silver_writer.saveAsTable(SILVER_TABLE)

# Silver layer schema check: show the column definitions of the written table
schema_check_sql = f"""
        DESCRIBE {SILVER_TABLE}"""
display(spark.sql(schema_check_sql))

# Silver layer cardinality check: total rows plus distinct sensors and locations
cardinality_check_sql = f"""
        SELECT
            COUNT(*) AS rows,
            COUNT(DISTINCT sensor_id) AS sensors,
            COUNT(DISTINCT location_id) AS locations
        FROM {SILVER_TABLE}"""
display(spark.sql(cardinality_check_sql))

# Silver layer sanity check 1: record count per measurement type, largest
# first, to see which measurement types exist and whether PM data is present
measurement_type_counts_sql = f"""
        SELECT
            measurement_type,
            COUNT(*) AS records
        FROM {SILVER_TABLE}
        GROUP BY measurement_type
        ORDER BY records DESC
        """
display(spark.sql(measurement_type_counts_sql))

# Silver layer sanity check 2: PM-only aggregation — record count and average
# value per sensor type and measurement type, restricted to P1 and P2
pm_aggregation_sql = f"""
        SELECT
            sensor_type,
            measurement_type,
            COUNT(*) AS records,
            AVG(measurement_value) AS avg_value
        FROM {SILVER_TABLE}
        WHERE measurement_type IN ('P1', 'P2')
        GROUP BY sensor_type, measurement_type
        ORDER BY records DESC
        """
display(spark.sql(pm_aggregation_sql))

# Count the rows of the written Silver table and hand the total back as the
# notebook's exit value. The table is addressed through SILVER_TABLE rather
# than a second copy of its name, so the count always refers to the same
# table the script wrote and queried.
silver_rows = spark.read.table(SILVER_TABLE).count()

dbutils.notebook.exit(
    f"Silver cleaning completed: {silver_rows} records in silver table"
)