In [0]:
import dlt
from pyspark.sql.functions import col, to_date, regexp_replace, year

In [0]:
# 1. Bronze - Streaming ingestion using Auto Loader
@dlt.table(
    name="batch19v1_bronze",
    comment="Raw data ingested from Volume batch19dlt"
)
def bronze():
    """Stream raw CSV files from the landing volume into the bronze table.

    Files are read incrementally with Auto Loader (``cloudFiles``) using the
    first line of each file as the header. No type casting is done here; the
    only transformation is renaming the two headers that contain spaces.

    Returns:
        A streaming DataFrame with ``Sales Person`` renamed to
        ``sales_person`` and ``Boxes Shipped`` renamed to ``boxes_shipped``.
    """
    raw_stream = (spark.readStream
                       .format("cloudFiles")
                       .option("cloudFiles.format", "csv")
                       .option("header", "true")
                       .load("/Volumes/sivaadbuc/default/batch19dlt/"))

    # NOTE(review): presumably renamed because Delta rejects spaces in column
    # names unless column mapping is enabled - confirm against table settings.
    return (raw_stream
              .withColumnRenamed("Sales Person", "sales_person")
              .withColumnRenamed("Boxes Shipped", "boxes_shipped"))

In [0]:





# 2. Silver - Cleaned and typed data
@dlt.table(
    name="batch19v1_silver",
    comment="Cleansed data with proper types and filters"
)
def silver():
    """Clean and type the bronze stream.

    Transformations applied to the ``batch19v1_bronze`` stream:

    * ``order_date``: ``Date`` parsed with the pattern ``dd-MMM-yy``.
    * ``amount``: ``Amount`` with currency formatting removed, cast to double.
    * ``boxes_shipped``: cast to int.

    Rows whose ``amount`` is null after the cast are dropped.

    Returns:
        A streaming DataFrame with the typed columns described above.
    """
    bronze_df = dlt.read_stream("batch19v1_bronze")

    # Strip the dollar sign, thousands separators and any whitespace before
    # casting. Removing only "$" leaves values such as "5,320" behind; those
    # do not cast to double, so the dropna() below would silently discard
    # every amount that carries a thousands separator.
    cleaned_amount = regexp_replace(col("Amount"), r"[$,\s]", "").cast("double")

    return (bronze_df
              .withColumn("order_date", to_date(col("Date"), "dd-MMM-yy"))
              .withColumn("amount", cleaned_amount)
              .withColumn("boxes_shipped", col("boxes_shipped").cast("int"))
              .dropna(subset=["amount"]))

# 3. Gold - Aggregated business KPIs
@dlt.table(
    name="batch19v1_gold",
    comment="Aggregated sales and shipments by country and year"
)
def gold():
    """Aggregate silver data into yearly totals per country.

    Groups ``batch19v1_silver`` by ``Country`` and the year of ``order_date``
    and sums ``amount`` and ``boxes_shipped``.

    Returns:
        A DataFrame with columns ``Country``, ``year``, ``total_sales`` and
        ``total_boxes``.
    """
    # Function-scope import, aliased so the Python builtin ``sum`` is not
    # shadowed for the rest of the notebook.
    from pyspark.sql.functions import sum as spark_sum

    # Note: batch (not stream) read since the result is a full aggregation.
    silver_df = dlt.read("batch19v1_silver")

    # Alias the aggregates directly instead of renaming the auto-generated
    # "sum(amount)" / "sum(boxes_shipped)" columns: withColumnRenamed is a
    # silent no-op when the generated name does not match, and explicit
    # expressions keep the output column order fixed.
    return (silver_df
              .groupBy("Country", year(col("order_date")).alias("year"))
              .agg(
                  spark_sum("amount").alias("total_sales"),
                  spark_sum("boxes_shipped").alias("total_boxes"),
              ))
