In [None]:
import dlt
from pyspark.sql.functions import current_timestamp, col, to_date

In [None]:
# Pipeline parameters, all read from the Spark configuration.
# spark.conf.get is called without a default for every key, so each one is
# expected to be set in the pipeline settings.
(
    catalog,
    schema_landing,
    schema_bronze,
    schema_silver,
    schema_gold,
    volume,
    BUCKET_NAME,
) = (
    spark.conf.get(conf_key)
    for conf_key in (
        "catalog",
        "schema_landing",
        "schema_bronze",
        "schema_silver",
        "schema_gold",
        "volume",
        "bucket",
    )
)

In [None]:
# S3 location (s3a:// URI built from BUCKET_NAME) that holds the raw weather files.
cloud_path = f"s3a://{BUCKET_NAME}/raw_weather/"

# UC Volume directory used for Auto Loader schema tracking (required for schema drift).
schema_path = f"/Volumes/{catalog}/{schema_landing}/{volume}/schemas/iot_bronze/"

In [None]:
@dlt.table(comment="Bronze layer: raw IoT weather data")
def bronze_table():
    """Stream raw IoT weather JSON files into the bronze layer with Auto Loader.

    Reads from ``cloud_path`` through the ``cloudFiles`` source, tracks the
    schema under ``schema_path``, and appends ingestion-audit columns.

    Returns:
        A streaming DataFrame holding the source columns plus
        ``_ingest_timestamp``, ``_ingest_file_name``, ``_ingest_file_path``
        and ``ingestion_date``.
    """
    autoloader_options = {
        # Source file format and column-type inference.
        "cloudFiles.format": "json",
        "cloudFiles.inferColumnTypes": "true",
        # Where Auto Loader persists the tracked schema (UC Volume path).
        "cloudFiles.schemaLocation": schema_path,
        # Pin ingest_timestamp to TIMESTAMP instead of relying on inference.
        "cloudFiles.schemaHints": "ingest_timestamp TIMESTAMP",
        # Include files that already exist at the source path.
        "cloudFiles.includeExistingFiles": "true",
    }

    # cloud_path is the s3a:// bucket location of the raw files.
    raw_stream = (
        spark.readStream
        .format("cloudFiles")
        .options(**autoloader_options)
        .load(cloud_path)
    )

    # Audit columns: load time, originating file (from the _metadata column),
    # and the load date derived from the load time.
    bronze_stream = (
        raw_stream
        .withColumn("_ingest_timestamp", current_timestamp())
        .withColumn("_ingest_file_name", col("_metadata.file_name"))
        .withColumn("_ingest_file_path", col("_metadata.file_path"))
        .withColumn("ingestion_date", to_date("_ingest_timestamp"))
    )
    return bronze_stream