In [0]:
# =========================================
# 01 Bronze ingest – Auto Loader (CI/CD friendly, catalog-based)
# Works in Databricks Free (availableNow trigger)
# =========================================

from pyspark.sql.functions import col, current_timestamp, regexp_extract, to_date, when

# -----------------------------
# Parameters (set by runner/job)
# -----------------------------
# Widget name -> default value. Registration order follows this mapping.
_WIDGET_DEFAULTS = {
    "catalog": "etl_demo",                    # e.g. etl_dev / etl_qa / etl_prod / etl_demo
    "volume_name": "customer_volume",
    "input_dir": "in",                        # folder inside the volume
    "checkpoint_dir": "_checkpoints/bronze",
    "schema_dir": "_schema",
}

for _widget_name, _widget_default in _WIDGET_DEFAULTS.items():
    dbutils.widgets.text(_widget_name, _widget_default)


def _widget_value(name):
    """Return the current value of widget *name* with surrounding whitespace removed."""
    return dbutils.widgets.get(name).strip()


catalog = _widget_value("catalog")
volume_name = _widget_value("volume_name")
# Directory parameters additionally lose leading/trailing slashes so they can
# be joined into a path with single "/" separators.
input_dir = _widget_value("input_dir").strip("/")
checkpoint_dir = _widget_value("checkpoint_dir").strip("/")
schema_dir = _widget_value("schema_dir").strip("/")

# -----------------------------
# Set environment context
# -----------------------------
# The catalog name comes from a widget / job parameter. Fail fast on an empty
# value instead of sending a malformed "USE CATALOG " statement to the parser.
if not catalog:
    raise ValueError("Widget 'catalog' must not be empty")

# Backtick-quote the identifier (doubling any embedded backtick) so that names
# containing characters such as "-" are accepted and the value cannot be
# interpreted as anything other than a single identifier.
quoted_catalog = "`" + catalog.replace("`", "``") + "`"

spark.sql(f"USE CATALOG {quoted_catalog}")
# The bronze schema is created on demand and made current, so the
# schema-qualified table name used later resolves inside this catalog.
spark.sql("CREATE SCHEMA IF NOT EXISTS bronze")
spark.sql("USE SCHEMA bronze")

# -----------------------------
# UC Volume paths
# Pattern: dbfs:/Volumes/<catalog>/<schema>/<volume>/<path>
# The <schema> segment is fixed to "bronze"; everything else comes from the
# widget parameters.
# -----------------------------
input_path      = f"dbfs:/Volumes/{catalog}/bronze/{volume_name}/{input_dir}"
checkpoint_path = f"dbfs:/Volumes/{catalog}/bronze/{volume_name}/{checkpoint_dir}"
schema_location = f"dbfs:/Volumes/{catalog}/bronze/{volume_name}/{schema_dir}"

# Schema-qualified only: the catalog part is supplied by the USE CATALOG context.
bronze_table = "bronze.customer_bronze"

# Echo the resolved configuration so it shows up in the run output.
for _label, _resolved in (
    ("CATALOG:", catalog),
    ("INPUT:", input_path),
    ("CHECKPOINT:", checkpoint_path),
    ("SCHEMA LOCATION:", schema_location),
    ("TARGET TABLE:", bronze_table),
):
    print(_label, _resolved)

# -----------------------------
# Auto Loader stream
# -----------------------------
# Incrementally reads CSV files (first row is the header) from input_path.
# The inferred schema is tracked under schema_location.
#
# Option notes:
# - "cloudFiles.inferColumnTypes" is the Auto Loader switch for inferring
#   column types; the plain CSV reader option "inferSchema" is documented as
#   not being the one to use with Auto Loader.
#   NOTE(review): presumably a schema already persisted under schema_location
#   keeps being used, so this only changes first-time inference — confirm
#   before pointing an existing target table at a fresh schema location.
# - "rescue" evolution mode keeps the schema fixed; data that does not fit is
#   captured in the rescued data column instead of failing or evolving.
# - "rescuedDataColumn" (no "cloudFiles." prefix) is the documented option
#   name for naming that column.
df_stream = (
    spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", "csv")
        .option("cloudFiles.schemaLocation", schema_location)
        .option("cloudFiles.schemaEvolutionMode", "rescue")
        .option("cloudFiles.inferColumnTypes", "true")
        .option("rescuedDataColumn", "_rescued_data")
        .option("header", "true")
        .load(input_path)
)

# Add metadata columns
#
# _ingest_time  : timestamp taken when the row is processed
# _file_path    : source file path from the _metadata column
# _file_name    : source file name from the _metadata column
# snapshot_date : date parsed from the file name when it contains a
#                 yyyy_MM_dd token (e.g. snapshot_2025_12_01.csv), else NULL

# regexp_extract yields an empty string when the file name has no date token.
snapshot_token = regexp_extract(col("_metadata.file_name"), r"(\d{4}_\d{2}_\d{2})", 1)

df_out = (
    df_stream
        .withColumn("_ingest_time", current_timestamp())
        .withColumn("_file_path", col("_metadata.file_path"))
        .withColumn("_file_name", col("_metadata.file_name"))
        # Only parse when a token was actually found. With ANSI mode enabled
        # (presumably the default on serverless compute — confirm for your
        # runtime), to_date raises on an unparsable string such as "" instead
        # of returning NULL, which would fail the whole stream for files
        # without a date in their name. when() without otherwise() gives NULL.
        # NOTE(review): a token that matches the pattern but is not a real
        # calendar date (e.g. 2025_13_45) is still handed to to_date.
        .withColumn(
            "snapshot_date",
            when(snapshot_token != "", to_date(snapshot_token, "yyyy_MM_dd")),
        )
)

# -----------------------------
# Write to Delta (Free-friendly)
# availableNow processes available files and stops.
# -----------------------------
# Describe the sink first: append-only Delta output.
bronze_writer = df_out.writeStream.format("delta").outputMode("append")

# Stream progress is tracked at checkpoint_path; the availableNow trigger
# makes this a run-to-completion job rather than a continuously running stream.
bronze_writer = bronze_writer.option("checkpointLocation", checkpoint_path)
bronze_writer = bronze_writer.trigger(availableNow=True)

# Start the query against the target table and block until it terminates.
query = bronze_writer.toTable(bronze_table)
query.awaitTermination()

print("✅ Bronze ingest finished")
