In [None]:
# All imports for the notebook, grouped ahead of any executable statement.
from databricks.connect import DatabricksSession
from pyspark.sql.functions import col, current_timestamp, to_date

# Obtain the Spark session through Databricks Connect (getOrCreate returns the
# already-active session if one exists instead of building a second one).
spark = DatabricksSession.builder.getOrCreate()

In [None]:
# Run-time parameters are supplied through Databricks widgets.
# Each lookup goes through the same accessor, so bind it once for readability.
_read_widget = dbutils.widgets.get

# Unity Catalog coordinates: the catalog plus one schema per pipeline layer.
catalog = _read_widget("catalog")
schema_landing = _read_widget("schema_landing")
schema_bronze = _read_widget("schema_bronze")
schema_silver = _read_widget("schema_silver")
schema_gold = _read_widget("schema_gold")

# Volume name (used below to build /Volumes/... paths) and the source bucket name.
volume = _read_widget("volume")
BUCKET_NAME = _read_widget("bucket")

In [None]:
# --- DESTINATION ---
# Bronze Delta table, addressed by its three-level name: catalog.schema.table
bronze_table = f"{catalog}.{schema_bronze}.iot_bronze_weather"

# --- STREAM STATE (both live under /Volumes/<catalog>/<schema>/<volume>/) ---
# Where Auto Loader persists the inferred schema between runs.
schema_path = f"/Volumes/{catalog}/{schema_landing}/{volume}/schemas/iot_bronze/"

# Where the streaming query stores its checkpoint (progress / offsets).
checkpoint_path = f"/Volumes/{catalog}/{schema_landing}/{volume}/checkpoints/iot_bronze/"

# --- SOURCE ---
# Raw weather files are read straight from the bucket via an s3a:// URI.
cloud_path = f"s3a://{BUCKET_NAME}/raw_weather/"

# Debug aid, disabled by default:
#print(f"✅ Cloud Path: {cloud_path}")

✅ Cloud Path: s3a://my-databrick-iot-deen-001/raw_weather/


In [None]:
# --- INGESTION CONFIGURATION ---
# Auto Loader (cloudFiles) incrementally ingests the raw IoT JSON files found
# under `cloud_path` and tracks the inferred schema across runs.
# NOTE: `cloudFiles.useNotifications` is not set, so file discovery uses Auto
# Loader's default directory-listing mode. File-notification mode has to be
# enabled explicitly (and needs the matching cloud permissions).
df = (
    spark.readStream.format("cloudFiles")
        # 1. Format & Inference: ingest raw JSON and infer real column types
        #    instead of reading every field as STRING.
        .option("cloudFiles.format", "json")
        .option("cloudFiles.inferColumnTypes", "true")

        # 2. Schema Management: persist the inferred schema so a restarted
        #    stream remembers what it has already seen.
        .option("cloudFiles.schemaLocation", schema_path)

        # 3. Schema Evolution: this is an Auto Loader *source* option, so it must
        #    be set on the reader to take effect. "addNewColumns" is already the
        #    default when the schema is inferred; stating it makes the intent
        #    explicit. Per the Auto Loader docs, the stream stops when it meets a
        #    new column and picks the column up on the next run.
        .option("cloudFiles.schemaEvolutionMode", "addNewColumns")

        # 4. Precision Control: pin the payload's `ingest_timestamp` field to
        #    TIMESTAMP so inference cannot fall back to STRING.
        .option("cloudFiles.schemaHints", "ingest_timestamp TIMESTAMP")

        # 5. Processing Strategy: "true" also processes files that were already
        #    in the source path when the stream first started; "false" skips them.
        .option("cloudFiles.includeExistingFiles", "true")

        # 6. Source Path: the raw-weather prefix in the bucket (s3a:// URI).
        .load(cloud_path)

        # --- AUDIT & LINEAGE METADATA ---
        # Record when each row was processed and which file it came from.
        .withColumn("_ingest_timestamp", current_timestamp())
        .withColumn("_ingest_file_name", col('_metadata.file_name'))
        .withColumn("_ingest_file_path", col('_metadata.file_path'))

        # --- PARTITIONING & OPTIMIZATION ---
        # Derive a DATE from the processing timestamp; the writer cell
        # partitions the Bronze table by this column.
        .withColumn("ingestion_date", to_date("_ingest_timestamp"))
)

In [None]:
# --- BRONZE WRITE ---
# Append the ingested rows to the Bronze Delta table and wait for the run to
# finish, so that a failed stream fails this cell instead of going unnoticed.
bronze_query = (
    df.writeStream
        .format("delta")
        .outputMode("append")  # Only ever add new rows to Bronze

        # 1. Reliability: the checkpoint records which input has been processed,
        #    so a restarted run resumes instead of re-ingesting.
        .option("checkpointLocation", checkpoint_path)

        # 2. Evolution: let the Delta table accept new columns when the incoming
        #    schema grows. (`cloudFiles.*` options belong on the reader; they
        #    have no effect on a Delta sink, so none are set here.)
        .option("mergeSchema", "true")

        # 3. Layout: physically organize data by ingestion date.
        .partitionBy("ingestion_date")

        # 4. Cost Control: process everything currently pending, then stop the
        #    query (the query stops; cluster shutdown is handled by the job /
        #    cluster configuration, not by this trigger).
        .trigger(availableNow=True)

        # 5. Destination: Unity Catalog three-level name (catalog.schema.table)
        .toTable(bronze_table)
)

# toTable() starts the query asynchronously. Block until the availableNow run
# has drained its backlog; any stream failure is raised here.
bronze_query.awaitTermination()

<pyspark.sql.connect.streaming.query.StreamingQuery at 0x28f93c5f350>