In [0]:
# Notebook Name: 02_streaming_data
# Purpose: Start a long-running streaming job to ingest data from the landing zone using Auto Loader.
#          This notebook should be run once and left running continuously.

# --- Stream configuration ---------------------------------------------------

# Landing-zone location in DBFS that the ingestion notebook writes
# claims_stream.json to; this is what the stream reads from.
# NOTE(review): this names a single file, while Auto Loader is normally pointed
# at a directory or glob so newly arriving files are picked up — confirm.
source_autoloader_path = (
    "dbfs:/tmp/medisure_jen/autoloader_landing/claims_stream.json"
)

# Directory used for the stream's checkpoint (and, below, for Auto Loader's
# schema tracking). Needed so a restarted query can resume its progress.
checkpoint_path = (
    "dbfs:/tmp/medisure_poc/checkpoints/claims_bronze_stream/"
)

# Fully qualified name of the Bronze Delta table the stream appends to.
# NOTE(review): this is a four-part name; Unity Catalog identifiers are
# three-level (catalog.schema.table) — confirm the intended table name.
target_bronze_table = (
    "catalog_jen.medisure_jen.bronze.claims_streaming"
)

# Step 2: Read the landing zone as a stream using the Auto Loader pattern
streaming_df = (spark.readStream
    .format("cloudFiles")  # Auto Loader source
    .option("cloudFiles.format", "json")  # Incoming files are JSON
    .option("cloudFiles.schemaLocation", checkpoint_path)  # Where the inferred schema is tracked
    .option("multiLine", "true")  # Let a single JSON record span several lines
    # NOTE: the previous .option("escape", '"') was removed. `escape` is a CSV
    # reader option and is not one of the JSON reader options, so it had no
    # effect on this stream and only suggested behavior that was not there.
    .load(source_autoloader_path)
)

# Add metadata columns:
#   _source_file         - path of the file each record was read from
#   _ingestion_timestamp - time at which the record was processed by the stream
#
# The source path is taken from the hidden `_metadata.file_path` column rather
# than input_file_name(): input_file_name() is deprecated on recent Databricks
# runtimes and is not available on Unity Catalog shared-access compute.
# input_file_name stays in the import so any other cell that relies on this
# import statement keeps working.
from pyspark.sql.functions import col, current_timestamp, input_file_name
streaming_df_with_metadata = (
    streaming_df
    .withColumn("_source_file", col("_metadata.file_path"))
    .withColumn("_ingestion_timestamp", current_timestamp())
)
# Step 3: Configure the Delta sink, start the query, and keep it running

# Bronze layer is append-only, written in Delta format.
_bronze_writer = streaming_df_with_metadata.writeStream.format("delta").outputMode("append")

# checkpointLocation: where query progress is persisted so a restart can resume.
# mergeSchema: allow new columns from the source to be added to the table.
_bronze_writer = _bronze_writer.options(
    checkpointLocation=checkpoint_path,
    mergeSchema="true",
)

# Name the query, then start it; toTable() returns the running StreamingQuery.
_bronze_writer = _bronze_writer.queryName("claims_stream_ingestion_query")
streaming_query = _bronze_writer.toTable(target_bronze_table)

display(streaming_query)

# Block this cell until the query terminates, which keeps the stream alive.
# It keeps running until it is stopped manually or fails with an error.
streaming_query.awaitTermination()

# To stop the stream manually, run the following from a separate cell:
# streaming_query.stop()