In [0]:
"""
02_bronze_ingest_live.py

Purpose:
- Ingest raw live sensor data from sensor.community
- Store unmodified JSON payloads in Bronze Delta table
- Append-only ELT ingestion
"""

# Databricks notebooks do not put the project root on PYTHONPATH automatically.
# Appending the parent directory to sys.path at runtime makes the src/ package
# importable from this notebook.
import os
import sys

project_root = os.path.abspath("..")
sys.path.append(project_root)

import json
from src.utils.http import fetch_json
from src.utils.bronze import prepare_bronze_rows

# Source endpoint and the source label passed along with the fetched records
SENSOR_URL: str = "https://data.sensor.community/static/v2/data.json"
SOURCE_NAME: str = "sensor.community"

# Target Bronze Delta table
BRONZE_TABLE = "air_quality_bronze.live_sensor_raw"

# Column names for the Bronze rows. Only names are supplied here (no types),
# so Spark infers each column's type from the data.
BRONZE_COLUMNS = ["raw_json", "ingested_at", "source", "batch_id"]

# Create the Bronze database on first run; IF NOT EXISTS makes reruns a no-op
spark.sql("CREATE DATABASE IF NOT EXISTS air_quality_bronze")

# Pull the current live payload and shape it into Bronze rows
records = fetch_json(SENSOR_URL)
bronze_rows = prepare_bronze_rows(records, SOURCE_NAME)

# Build the Spark DataFrame for this batch
bronze_df = spark.createDataFrame(bronze_rows, schema=BRONZE_COLUMNS)

# Append this batch to the Bronze Delta table (existing rows are left in place)
bronze_writer = bronze_df.write.format("delta").mode("append")
bronze_writer.saveAsTable(BRONZE_TABLE)

# Bronze layer sanity check: one summary row per (source, batch_id) with its
# record count and latest ingestion timestamp, newest batch first. Earlier
# batches remaining in the output confirms the write appended.
batch_summary_df = spark.sql(
    f"""
        SELECT
            source,
            batch_id,
            COUNT(*) AS records,
            MAX(ingested_at) AS latest_ingested_at
        FROM {BRONZE_TABLE}
        GROUP BY source, batch_id
        ORDER BY latest_ingested_at DESC 
        """
)
display(batch_summary_df)

# Print one ingested row as JSON for a quick visual check.
# first() fetches a single row instead of collecting the whole batch to the
# driver, and returns None for an empty DataFrame rather than raising IndexError.
sample_row = bronze_df.first()
if sample_row is not None:
    first_row = sample_row.asDict()
    # Convert ingested_at to an ISO-8601 string so json.dumps can serialize it
    first_row["ingested_at"] = first_row["ingested_at"].isoformat()
    print(json.dumps(first_row, indent=2))
else:
    print("No rows in this batch to preview")

# Number of rows in this batch's DataFrame, reported in the exit message
rows_ingested = bronze_df.count()

# End the notebook run and hand the summary string back to the caller
dbutils.notebook.exit(
    f"Bronze ingestion completed: {rows_ingested} records ingested"
)
