In [0]:
# Bronze ingestion: incrementally load JSON claim files from a Unity Catalog
# volume into a Delta table using Auto Loader, running as a one-shot
# (AvailableNow) streaming job.
from pyspark.sql.functions import col, current_timestamp

# Source path: landing directory that Auto Loader scans for new files
source_autoloader_path = "/Volumes/medisure_jen/bronze/autoloader_landing/"

# Checkpoint location (also reused below as the Auto Loader schema location)
checkpoint_path = "/Volumes/medisure_jen/bronze/checkpoints/claims_bronze_stream/"

# Target Bronze Delta table
target_bronze_table = "medisure_jen.bronze.claims_stream"

# Step 2: Read the stream using the Auto Loader pattern
streaming_df = (
    spark.readStream
    .format("cloudFiles")  # Use Auto Loader
    .option("cloudFiles.format", "json")  # Format of the incoming files
    .option("cloudFiles.schemaLocation", checkpoint_path)  # Where the inferred schema is tracked
    .option("multiLine", "true")  # Each JSON record may span several lines
    # NOTE(review): "escape" looks like a CSV reader option and is probably
    # ignored for JSON sources - confirm and drop it if it has no effect.
    .option("escape", '"')
    .load(source_autoloader_path)
)

# Add metadata columns. The source file path is taken from the `_metadata`
# column, which is the Unity Catalog-compatible way to obtain it.
streaming_df_with_metadata = (
    streaming_df
    .withColumn("_source_file", col("_metadata.file_path"))
    .withColumn("_ingestion_timestamp", current_timestamp())
)

# Step 3: Start the stream with the AvailableNow trigger
streaming_query = (
    streaming_df_with_metadata.writeStream
    .format("delta")
    .outputMode("append")  # Bronze layer is append-only
    .option("checkpointLocation", checkpoint_path)  # Progress tracking for exactly-once writes
    .option("mergeSchema", "true")  # Let new source columns be added to the target table
    .queryName("claims_stream_ingestion_query")
    # AvailableNow processes everything currently in the landing path (in one
    # or more micro-batches) and then stops on its own. It is used because
    # this notebook runs on serverless compute, where continuously running
    # streams are not supported. It replaces the deprecated `once=True`.
    .trigger(availableNow=True)
    .toTable(target_bronze_table)
)

# NOTE(review): display() is normally used with DataFrames; confirm that
# passing a StreamingQuery handle renders anything useful here.
display(streaming_query)

# Block the notebook cell until the query finishes. With the AvailableNow
# trigger the query terminates by itself once all files that were available
# at start-up have been processed; a failure in the stream is raised here.
streaming_query.awaitTermination()

# To abort a run that is still in progress, execute this in a separate cell:
# streaming_query.stop()