# In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import os

# Spark session for the bronze-layer job, configured for Delta Lake:
# the Delta package is pulled in via spark.jars.packages, and the SQL
# extension plus catalog implementation are registered on the session.
_delta_settings = {
    "spark.jars.packages": "io.delta:delta-spark_2.12:3.1.0",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

_builder = SparkSession.builder.appName("BronzeLayer")
for _setting_name, _setting_value in _delta_settings.items():
    _builder = _builder.config(_setting_name, _setting_value)

# Reuses an already-running session if one exists, otherwise creates it.
spark = _builder.getOrCreate()

# Read from landing zone with multiline option
landing_path = "/opt/bitnami/spark/data/landing"

# File-based streaming sources refuse to start without a schema unless
# inference is explicitly enabled ("Schema must be specified when creating
# a streaming source DataFrame"). The field layout of the incoming JSON is
# not declared anywhere in this script, so turn inference on for the session.
# NOTE(review): inference samples the files present in the landing directory
# when the stream is defined, so at least one file must already exist there;
# replace with an explicit .schema(...) once the payload layout is pinned down.
spark.conf.set("spark.sql.streaming.schemaInference", "true")

df = (
    spark.readStream
    .format("json")
    # Each JSON document may span several lines rather than one per line.
    .option("multiLine", "true")
    .load(landing_path)
)

# Bronze projection: flatten the nested `data` struct into top-level
# columns and keep the record's own `timestamp` alongside them.
df_bronze = df.select(col("data.*"), col("timestamp"))

# Ingestion metadata: when the row was processed by this job ...
df_bronze = df_bronze.withColumn("ingestion_timestamp", current_timestamp())
# ... and which landing-zone file it was read from.
df_bronze = df_bronze.withColumn("source_file", input_file_name())

# Bronze layer sink: Delta table location and the streaming checkpoint
# directory used to track progress across restarts.
bronze_path = "/opt/bitnami/spark/data/bronze"
checkpoint_path = "/opt/bitnami/spark/data/bronze_checkpoint"

# Start the streaming write; new rows are appended to the Delta table.
query = df_bronze.writeStream.start(
    path=bronze_path,
    format="delta",
    outputMode="append",
    checkpointLocation=checkpoint_path,
)

# Block the driver until the streaming query stops or fails.
query.awaitTermination()