In [0]:
# import dlt
# from pyspark.sql.functions import col, to_date, regexp_replace, year

# # Bronze: Ingest raw CSV with Auto Loader
# @dlt.table(
#     name="australia_bronze",
#     comment="Raw data ingested from Australia_data1.csv"
# )
# def bronze():
#     return (spark.readStream
#                   .format("cloudFiles")
#                   .option("cloudFiles.format", "csv")
#                   .option("header", "true")
#                   .load("/Volumes/sivaadbuc/default/batch16v1/")
#                   .withColumnRenamed("Sales Person", "sales_person")
#                   .withColumnRenamed("Boxes Shipped", "boxes_shipped"))

# # Silver: Clean & cast types
# @dlt.table(
#     name="australia_silver",
#     comment="Cleaned data with proper types"
# )
# def silver():
#     bronze_df = dlt.read_stream("australia_bronze")
#     return (bronze_df
#               .withColumn("order_date", to_date(col("Date"), "dd-MMM-yy"))
#               .withColumn("amount", regexp_replace(col("Amount"), "[$,]", "").cast("double"))
#               .withColumn("boxes_shipped", col("boxes_shipped").cast("int"))
#               .dropna(subset=["amount"]))

# # Gold: Aggregate
# @dlt.table(
#     name="australia_gold",
#     comment="Aggregated sales and shipments by country and year"
# )
# def gold():
#     silver_df = dlt.read("australia_silver")
#     return (silver_df.groupBy("Country", year(col("order_date")).alias("year"))
#                      .agg(
#                          {"amount": "sum", "boxes_shipped": "sum"}
#                      )
#                      .withColumnRenamed("sum(amount)", "total_sales")
#                      .withColumnRenamed("sum(boxes_shipped)", "total_boxes"))


In [0]:
import dlt
from pyspark.sql.functions import col, to_date, regexp_replace, year

# 1. Gold: Aggregated view (depends on Silver)
@dlt.table(
    name="australia_gold",
    comment="Aggregated sales and shipments by country and year"
)
def gold():
    """Aggregate the silver table into per-country, per-year totals.

    Reads ``australia_silver`` via ``dlt.read``, groups rows by ``Country``
    and the calendar year extracted from ``order_date``, and sums ``amount``
    and ``boxes_shipped`` within each group.

    Returns:
        A DataFrame with the columns ``Country``, ``year``, ``total_sales``
        and ``total_boxes``, in that order.
    """
    # Imported locally under an alias so the builtin ``sum`` is not shadowed
    # at module scope.
    from pyspark.sql.functions import sum as spark_sum

    silver_df = dlt.read("australia_silver")
    order_year = year(col("order_date")).alias("year")

    # Alias the aggregates directly rather than renaming Spark's generated
    # "sum(<col>)" names afterwards: withColumnRenamed is a silent no-op when
    # the source name does not match, and the dict form of agg() does not
    # promise a stable column order.
    return (
        silver_df
        .groupBy("Country", order_year)
        .agg(
            spark_sum("amount").alias("total_sales"),
            spark_sum("boxes_shipped").alias("total_boxes"),
        )
    )

# 2. Silver: Cleansed data (depends on Bronze)
@dlt.table(
    name="australia_silver",
    comment="Cleaned data with proper types"
)
def silver():
    """Produce typed, cleaned rows from the streaming bronze table.

    Steps, applied in this order:
      * ``order_date``: ``Date`` parsed with the pattern ``dd-MMM-yy``.
      * ``amount``: ``Amount`` with every ``$`` and ``,`` removed, cast to
        double.
      * ``boxes_shipped``: cast to int.
      * rows whose resulting ``amount`` is null are dropped.

    Returns:
        The streaming DataFrame read from ``australia_bronze`` with the
        transformations above applied.
    """
    raw_stream = dlt.read_stream("australia_bronze")

    # Column expressions are built up front; they are applied below in the
    # same order as before so the resulting column layout is unchanged.
    parsed_date = to_date(col("Date"), "dd-MMM-yy")
    numeric_amount = regexp_replace(col("Amount"), "[$,]", "").cast("double")
    boxes_as_int = col("boxes_shipped").cast("int")

    typed = raw_stream.withColumn("order_date", parsed_date)
    typed = typed.withColumn("amount", numeric_amount)
    typed = typed.withColumn("boxes_shipped", boxes_as_int)

    # A null amount means the cleaned text could not be cast to double (or
    # the source value was missing); such rows are excluded.
    return typed.dropna(subset=["amount"])

# 3. Bronze: Raw ingestion (source of truth)
@dlt.table(
    name="australia_bronze",
    comment="Raw data ingested from Australia_data1.csv"
)
def bronze():
    """Stream raw CSV files from the volume directory into the bronze table.

    Uses the ``cloudFiles`` stream source configured for CSV input with a
    header row, then renames the two space-containing columns
    (``Sales Person``, ``Boxes Shipped``) to snake_case names.

    Returns:
        A streaming DataFrame over the files under
        ``/Volumes/sivaadbuc/default/batch16v1/`` with the renamed columns.
    """
    # `spark` is not defined in this file; it is expected to be provided by
    # the runtime that executes the pipeline.
    reader = spark.readStream.format("cloudFiles")
    reader = reader.option("cloudFiles.format", "csv")
    reader = reader.option("header", "true")

    raw_files = reader.load("/Volumes/sivaadbuc/default/batch16v1/")

    renamed = raw_files.withColumnRenamed("Sales Person", "sales_person")
    renamed = renamed.withColumnRenamed("Boxes Shipped", "boxes_shipped")
    return renamed
