In [None]:
from databricks.connect import DatabricksSession

# Databricks Connect session: reuse the active session if one exists, otherwise create it.
# (Original note: this notebook is meant to run against Databricks.)
session_builder = DatabricksSession.builder
spark = session_builder.getOrCreate()


In [None]:
import os

from pyspark.sql.functions import current_timestamp, col, to_date

In [None]:
# Create the catalog that holds every schema used by this pipeline.
# IF NOT EXISTS makes the statement safe to re-run.
create_catalog_sql = """
CREATE CATALOG IF NOT EXISTS iot_catalog"""
spark.sql(create_catalog_sql)

In [None]:
# Create the landing schema (iot_catalog.00_landing); safe to re-run.
create_landing_schema_sql = """CREATE SCHEMA IF NOT EXISTS iot_catalog.00_landing
COMMENT 'Landing zone for raw IoT sensor JSON files'"""
spark.sql(create_landing_schema_sql)

In [None]:
# Create the volume inside the landing schema. Later cells place the streaming
# checkpoints and the Auto Loader schema-tracking files under this volume.
create_volume_sql = """
CREATE VOLUME IF NOT EXISTS iot_catalog.00_landing.source_iot_data
COMMENT 'Landing zone for raw IoT sensor JSON files'
"""
spark.sql(create_volume_sql)


In [None]:
# Create the bronze schema (iot_catalog.01_bronze) that receives the ingested table; safe to re-run.
create_bronze_schema_sql = """ CREATE SCHEMA IF NOT EXISTS iot_catalog.01_bronze
COMMENT 'Bronze zone for raw IoT sensor JSON files' """
spark.sql(create_bronze_schema_sql)

In [None]:
# --- 1. GET PARAMETERS ---
# Name of the source bucket that holds the raw files.
# It can be overridden per environment through the IOT_BUCKET_NAME environment
# variable; when that variable is unset or blank, the original bucket name is
# used, so existing runs behave exactly as before.
DEFAULT_BUCKET_NAME = "my-databrick-iot-deen-001"
BUCKET_NAME = os.environ.get("IOT_BUCKET_NAME", "").strip() or DEFAULT_BUCKET_NAME


In [None]:
# Source location: the raw_weather/ prefix of the bucket, read through the s3a scheme.
cloud_path = f"s3a://{BUCKET_NAME}/raw_weather/"

# UC Volume path used for streaming checkpoints (stream state).
# Layout: /Volumes/<catalog>/<schema>/<volume>/...
checkpoint_path = "/Volumes/iot_catalog/00_landing/source_iot_data/checkpoints/iot_bronze/"

# UC Volume path used for Auto Loader schema tracking (required for schema drift).
schema_path = "/Volumes/iot_catalog/00_landing/source_iot_data/schemas/iot_bronze/"

# Bronze target table, addressed as catalog.schema.table.
bronze_table = "iot_catalog.01_bronze.iot_bronze_weather"

# Echo the resolved source path so the run log shows which bucket is being read.
print(f"✅ Cloud Path: {cloud_path}")

In [None]:
# --- INGESTION CONFIGURATION ---
# Auto Loader (the "cloudFiles" source) reads the raw JSON files as a stream.
# NOTE(review): no notification-mode option is set in this cell, so Auto Loader's
# default file-discovery behaviour applies.
autoloader_options = {
    # 1. Format & inference: the files are JSON and column types are inferred.
    "cloudFiles.format": "json",
    "cloudFiles.inferColumnTypes": "true",
    # 2. Schema management: persist the tracked schema so a restarted stream
    #    picks up where the previous run left off.
    "cloudFiles.schemaLocation": schema_path,
    # 3. Precision control: pin the source field `ingest_timestamp` to TIMESTAMP
    #    instead of leaving its type to inference. This is a different column
    #    from the `_ingest_timestamp` audit column added further down.
    "cloudFiles.schemaHints": "ingest_timestamp TIMESTAMP",
    # 4. Processing strategy: "false" skips files already present in the source
    #    and ingests only new arrivals; set to "true" to process the backlog too.
    "cloudFiles.includeExistingFiles": "false",
}

# 5. Source path: the s3a:// location built in the configuration cell.
raw_stream = (
    spark.readStream
        .format("cloudFiles")
        .options(**autoloader_options)
        .load(cloud_path)
)

df = (
    raw_stream
        # --- AUDIT & LINEAGE METADATA ---
        # Processing time plus the source file name/path, for debugging and lineage.
        .withColumn("_ingest_timestamp", current_timestamp())
        .withColumn("_ingest_file_name", col("_metadata.file_name"))
        .withColumn("_ingest_file_path", col("_metadata.file_path"))
        # --- PARTITIONING ---
        # Calendar date of the processing timestamp; the write step partitions on it.
        .withColumn("ingestion_date", to_date(col("_ingest_timestamp")))
)

In [None]:
# Write the enriched stream to the Bronze Delta table, then wait for the run to finish.
bronze_query = (
    df.writeStream
        .format("delta")
        .outputMode("append")  # Only add new rows; never rewrite existing ones

        # 1. Reliability: the checkpoint records progress so a re-run resumes
        #    from where the previous run stopped.
        .option("checkpointLocation", checkpoint_path)

        # 2. Evolution: let the Delta table pick up new columns when the source changes
        .option("mergeSchema", "true")

        # 3. Layout: physically organize data on disk by date
        .partitionBy("ingestion_date")

        # 4. Cost control: process everything currently available, then stop the query
        .trigger(availableNow=True)

        # 5. Destination: three-level namespace (catalog.schema.table)
        .toTable(bronze_table)
)

# toTable() starts the query and returns immediately. Block here so that later
# cells (and the end of the script) are reached only once the available files
# have been processed, and so that a failed query raises instead of passing silently.
bronze_query.awaitTermination()

In [None]:
# Final status line for the run. It only reports that the preceding cells
# returned; it does not itself inspect the state of the streaming query.
completion_message = "Ingestion Job Completed!"
print(completion_message)