In [0]:
from pyspark.sql.functions import col, regexp_replace, sum as spark_sum, to_date, year

# -------------------------------------------------
# 1. Bronze: Raw ingestion with Auto Loader
# -------------------------------------------------
# Auto Loader ("cloudFiles") incrementally reads CSV files from the volume and
# tracks the inferred schema under cloudFiles.schemaLocation.
# NOTE(review): the schema and checkpoint directories live inside the directory
# being ingested; presumably they are skipped because of the leading underscore,
# but a sibling path would be safer — TODO confirm.
bronze_df = (spark.readStream
                  .format("cloudFiles")
                  .option("cloudFiles.format", "csv")
                  .option("header", "true")
                  .option("cloudFiles.schemaLocation", "/Volumes/sivaadbuc/default/batch16v1/_schema")
                  .load("/Volumes/sivaadbuc/default/batch16v1/")
                  # Replace the headers that contain spaces with snake_case names
                  # before the data is written to the Delta table.
                  .withColumnRenamed("Sales Person", "sales_person")
                  .withColumnRenamed("Boxes Shipped", "boxes_shipped"))

# toTable() is the documented PySpark API for starting a stream that writes to a
# table; it returns the running StreamingQuery handle.
bronze_query = (bronze_df.writeStream
                         .format("delta")
                         .option("checkpointLocation", "/Volumes/sivaadbuc/default/batch16v1/_checkpoints/bronze")
                         .outputMode("append")
                         .toTable("bronze_sales"))


# -------------------------------------------------
# 2. Silver: Cleaned data with expectations (manual)
# -------------------------------------------------
# Reads the bronze table as a stream, parses/casts the raw columns, then applies
# the data-quality rules as plain filters (the manual stand-in for expectations).
silver_df = (spark.readStream.table("bronze_sales")
                # Dates are parsed with the day-month-year pattern "dd-MMM-yy".
                .withColumn("order_date", to_date(col("Date"), "dd-MMM-yy"))
                # Strip "$" and "," before casting. Assuming the default
                # case-insensitive column resolution, this replaces the raw
                # `Amount` column with a double column named `amount`.
                .withColumn("amount", regexp_replace(col("Amount"), "[$,]", "").cast("double"))
                .withColumn("boxes_shipped", col("boxes_shipped").cast("int"))
                # Rows whose amount could not be cast end up null and are dropped.
                .dropna(subset=["amount"])
                # Expectations in PySpark must be handled manually:
                # drop rows with a non-positive amount or a missing country.
                .filter("amount > 0 AND Country IS NOT NULL"))

# toTable() is the documented PySpark API for starting a stream that writes to a
# table; it returns the running StreamingQuery handle.
silver_query = (silver_df.writeStream
                           .format("delta")
                           .option("checkpointLocation", "/Volumes/sivaadbuc/default/batch16v1/_checkpoints/silver")
                           .outputMode("append")
                           .toTable("silver_sales"))


# -------------------------------------------------
# 3. Gold: Aggregations (batch, not streaming)
# -------------------------------------------------
# The bronze and silver streams are started asynchronously, so a batch read that
# runs immediately afterwards can see an empty or half-populated silver table.
# Drain them in dependency order (bronze first, then silver) before aggregating.
# processAllAvailable() returns once the data available at call time has been
# committed; it may block for a long time if new files keep arriving.
for upstream_query in (bronze_query, silver_query):
    upstream_query.processAllAvailable()

# Total sales amount and boxes shipped per country and order year. Explicit
# aliases avoid depending on generated column names such as "sum(amount)".
gold_df = (spark.read.table("silver_sales")
                .groupBy("Country", year(col("order_date")).alias("year"))
                .agg(
                    spark_sum("amount").alias("total_sales"),
                    spark_sum("boxes_shipped").alias("total_boxes"),
                ))

# Full refresh: the gold table is replaced on every run.
gold_df.write.mode("overwrite").saveAsTable("gold_sales")


# -------------------------------------------------
# 4. View: Logical only (ad-hoc query)
# -------------------------------------------------
# A view stores no data: it is a saved query over gold_sales that keeps only the
# rows whose total_sales exceeds 10000.
gold_sales_view_ddl = """
CREATE OR REPLACE VIEW gold_sales_view AS
SELECT * FROM gold_sales WHERE total_sales > 10000
"""
spark.sql(gold_sales_view_ddl)


# -------------------------------------------------
# 5. Streaming Materialized View (continuous agg)
# -------------------------------------------------
# Running total of sales per country, recomputed as new silver rows arrive.
# The aggregate is aliased because the default generated name "sum(amount)"
# contains parentheses, which Delta rejects as a column name unless column
# mapping is enabled. `total_sales` matches the name used in gold_sales.
gold_mv_df = (spark.readStream.table("silver_sales")
                   .groupBy("Country")
                   .agg(spark_sum("amount").alias("total_sales")))

# "complete" output mode rewrites the full aggregate result on every trigger.
# toTable() is the documented PySpark API for starting a stream that writes to a
# table; it returns the running StreamingQuery handle.
gold_mv_query = (gold_mv_df.writeStream
                              .format("delta")
                              .option("checkpointLocation", "/Volumes/sivaadbuc/default/batch16v1/_checkpoints/gold_mv")
                              .outputMode("complete")
                              .toTable("gold_sales_streaming_mv"))


In [0]:
| Concept        | DLT                                            | Pure PySpark                                      |
| -------------- | ---------------------------------------------- | ------------------------------------------------- |
| Table creation | `@dlt.table`                                   | `df.writeStream.toTable(...)` (streaming) or `df.write.saveAsTable(...)` (batch) |
| Dependencies   | `dlt.read` / `dlt.read_stream` (automatic DAG) | Must manage manually with `spark.read.table(...)` |
| Data quality   | `@dlt.expect`, `@dlt.expect_or_drop`           | Write filters manually (`df.filter(...)`)         |
| Views          | `@dlt.view`                                    | `CREATE OR REPLACE VIEW ...`                      |
| Streaming MV   | `@dlt.table` (with `dlt.read_stream`)          | Write your own `.writeStream` with checkpoint     |
