In [0]:
"""
02_bronze_ingest_live.py

Purpose:
- Ingest raw live sensor data from sensor.community
- Store unmodified JSON payloads in Bronze Delta table
- Append-only ELT ingestion

Bronze table:
- air_quality_bronze.live_sensor_raw
"""

import json
import uuid
from datetime import datetime, timezone
from typing import Dict, List

import requests

# Endpoint passed to fetch_live_sensor_data() to download the live sensor payload.
SENSOR_URL: str = "https://data.sensor.community/static/v2/data.json"

def fetch_live_sensor_data(url: str) -> List[Dict]:
    """Fetch raw sensor data from sensor.community API.

    Args:
        url: Endpoint to query with an HTTP GET (20 second timeout).

    Returns:
        The decoded JSON payload, guaranteed to be a list of records.

    Raises:
        requests.RequestException: If the request fails or the server answers
            with an HTTP error status. The error is printed, then re-raised.
        ValueError: If the decoded payload is not a JSON array.
    """
    try:
        response = requests.get(url, timeout=20)
        response.raise_for_status()
        payload = response.json()
    except requests.RequestException as e:
        print(f"Error fetching data from {url}: {e}")
        raise

    # A JSON object (e.g. an error envelope) would otherwise be iterated key by
    # key downstream and appended to the Bronze table as meaningless rows.
    if not isinstance(payload, list):
        raise ValueError(
            f"Expected a JSON array from {url}, got {type(payload).__name__}"
        )
    return payload

def prepare_bronze_rows(
    records: List[Dict],
    source: str
) -> List[tuple]:
    """
    Convert raw records into Bronze-compatible rows.

    Args:
        records: Decoded JSON records; each one is re-serialized with json.dumps.
        source: Label identifying where the records came from.

    Returns:
        One tuple per record: (raw JSON string, ingestion timestamp, source,
        batch ID). All rows from a single call share the same timestamp and
        batch ID. An empty input yields an empty list.
    """
    batch_id: str = str(uuid.uuid4())
    # Timezone-aware UTC instant. A naive datetime (as returned by the
    # deprecated datetime.utcnow()) carries no offset, so a consumer that
    # treats naive values as local time would shift the timestamp whenever
    # the process is not running in UTC.
    ingested_at: datetime = datetime.now(timezone.utc)

    return [
        (json.dumps(record), ingested_at, source, batch_id)
        for record in records
    ]

# Ensure Bronze schema exists (idempotent)
spark.sql("CREATE DATABASE IF NOT EXISTS air_quality_bronze")

BRONZE_TABLE = "air_quality_bronze.live_sensor_raw"

# Column names AND types are declared up front (DDL string) so that nothing is
# inferred from the data: an empty batch still produces a valid DataFrame and
# the column types cannot drift between runs.
BRONZE_SCHEMA = (
    "raw_json STRING, ingested_at TIMESTAMP, source STRING, batch_id STRING"
)

# Fetch live sensor data
records = fetch_live_sensor_data(SENSOR_URL)

# Prepare rows for Bronze table
bronze_rows = prepare_bronze_rows(records, "sensor.community")

# Create Spark DataFrame with explicit schema
bronze_df = spark.createDataFrame(bronze_rows, schema=BRONZE_SCHEMA)

# Append new data to Bronze Delta table
bronze_df.write \
    .format("delta") \
    .mode("append") \
    .saveAsTable(BRONZE_TABLE)

# Bronze layer sanity check: verify batch-level ingestion and append behavior
display(
    spark.sql(
        f"""
        SELECT
            source,
            batch_id,
            COUNT(*) AS records,
            MAX(ingested_at) AS latest_ingested_at
        FROM {BRONZE_TABLE}
        GROUP BY source, batch_id
        ORDER BY latest_ingested_at DESC
        """
    )
)

# Print the first row of the DataFrame as JSON.
# first() fetches a single row instead of collecting the whole batch to the
# driver, and returns None (rather than raising) when the batch is empty.
sample_row = bronze_df.first()
if sample_row is not None:
    first_row = sample_row.asDict()
    # datetime is not JSON-serializable; emit it as an ISO-8601 string.
    first_row["ingested_at"] = first_row["ingested_at"].isoformat()
    print(json.dumps(first_row, indent=2))

# bronze_df was built from bronze_rows one-to-one, so the row count is already
# known locally and no extra Spark job is needed to obtain it.
rows_ingested = len(bronze_rows)

dbutils.notebook.exit(
    f"Bronze ingestion completed: {rows_ingested} records ingested"
)
