In [0]:
# Delta Live Tables (DLT) works with three types of datasets:
# 1. Streaming tables (permanent/temporary) - used for append-only data sources or incremental data
# 2. Materialized views - used for transformations, aggregations, or computations
# 3. Views - used for intermediate transformations; not stored in the target schema

import dlt

In [0]:
# Data-quality expectations for the orders datasets.
# Each entry pairs an expectation name with the SQL predicate a row must satisfy.
__order_rules = dict(
    [
        ("Valid Order Status", "o_orderstatus in ('O', 'F', 'P')"),
        ("Valid Order Price", "o_totalprice > 0.0"),
    ]
)

In [0]:
# Data-quality expectations for the customers dataset.
# Each entry pairs an expectation name with the SQL predicate a row must satisfy.
__customer_rules = dict(
    [
        ("Valid Customer Segment", "c_mktsegment is not null"),
    ]
)

In [0]:
# Bronze streaming table "orders", read incrementally from the raw orders table
@dlt.table(
    name="orders",
    comment="Raw orders data",
    table_properties={"quality": "bronze"},
)
@dlt.expect_all(__order_rules)  # expect_all uses the default action (warn)
def orders_bronze():
    """Return a streaming DataFrame over ``dev.bronze.orders_raw``.

    DLT dataset functions must return a DataFrame; the decorator above
    registers the result under the table name ``orders``.
    """
    # readStream (rather than read) makes this a streaming source
    return spark.readStream.table("dev.bronze.orders_raw")

In [0]:
# Bronze streaming table "orders_autoloader", ingested from landed CSV files with Auto Loader
@dlt.table(
    name="orders_autoloader",
    comment="Raw orders data autoloaded",
    table_properties={"quality": "bronze"},
)
@dlt.expect_all(__order_rules)
def orders_autoloader_bronze():
    """Return a streaming DataFrame that loads ``*.csv`` files from the landing volume.

    DLT dataset functions must return a DataFrame; the decorator above
    registers the result under the table name ``orders_autoloader``.
    """
    # All reader options are collected in one place; they are passed unchanged.
    # NOTE(review): no CSV `header` option is set, so the reader default applies —
    # confirm the landed files line up with the hinted column names.
    autoloader_options = {
        # Column names/types supplied as hints for the ingested schema
        "cloudFiles.schemaHints": "o_orderkey long, o_custkey long, o_orderstatus string, o_totalprice decimal(18,2), o_orderdate date, o_orderpriority string, o_clerk string, o_shippriority integer, o_comment string",
        # Where Auto Loader keeps its schema tracking files
        "cloudFiles.schemaLocation": "/Volumes/dev/etl/landing/autoloader/schema/1",
        "cloudFiles.format": "CSV",
        # Only pick up files matching this glob
        "pathGlobFilter": "*.csv",
        # Schema evolution is switched off
        "cloudFiles.schemaEvolutionMode": "none",
    }

    # "cloudFiles" is the Auto Loader source format for streaming reads
    stream_reader = spark.readStream.format("cloudFiles").options(**autoloader_options)
    return stream_reader.load("/Volumes/dev/etl/landing/files/")

In [0]:
# Append flow: several streaming sources are appended into one target table.

# The target streaming table must exist before any flow can write to it.
dlt.create_streaming_table(name="orders_union_bronze")

@dlt.append_flow(
    target="orders_union_bronze",
    # The flow name is given explicitly so it stays "orders_bronze" (the name that
    # was previously derived from the function name) while the Python function
    # itself no longer re-uses the name of the `orders_bronze` table function
    # defined earlier in this notebook.
    name="orders_bronze",
)
def orders_to_union_flow():
    """Append the pipeline's ``orders`` streaming table into ``orders_union_bronze``.

    Returns:
        A streaming DataFrame read from ``LIVE.orders``.
    """
    # The LIVE prefix refers to a dataset defined in this same pipeline
    return spark.readStream.table("LIVE.orders")

@dlt.append_flow(target="orders_union_bronze")
def orders_union():
    """Append the pipeline's ``orders_autoloader`` streaming table into ``orders_union_bronze``.

    Returns:
        A streaming DataFrame read from ``LIVE.orders_autoloader``.
    """
    # The LIVE prefix refers to a dataset defined in this same pipeline
    return spark.readStream.table("LIVE.orders_autoloader")

In [0]:
# Bronze materialized view "customers", built from a batch read of the raw customers table
@dlt.table(
    name="customers",
    comment="Raw customers data",
    table_properties={"quality": "bronze"},
)
# Rows that violate any customer rule are dropped.
# (Alternative: dlt.expect_all_or_fail(__customer_rules) to fail instead of dropping.)
@dlt.expect_all_or_drop(__customer_rules)
def customers_bronze():
    """Return a batch DataFrame over ``dev.bronze.customers_raw``.

    DLT dataset functions must return a DataFrame; the decorator above
    registers the result under the table name ``customers``.
    """
    # Batch read (spark.read); switching to spark.readStream would make this a
    # streaming read instead.
    # NOTE(review): presumably the batch source is re-read in full on each update — confirm.
    return spark.read.table("dev.bronze.customers_raw")

In [0]:
# Intermediate view: the unioned orders left-joined to customers
@dlt.view()  # dlt.view registers a view rather than a table
def joined_view():
    """Return orders from ``orders_union_bronze`` left-outer-joined to ``customers``.

    Every order row is kept; customer columns are null where no customer has a
    ``c_custkey`` equal to the order's ``o_custkey``.
    """
    # LIVE.<name> reads a dataset produced in this same pipeline.
    # The union table is used here; LIVE.orders would read only the single-source table.
    orders = spark.read.table("LIVE.orders_union_bronze")
    customers = spark.read.table("LIVE.customers")

    same_customer = orders["o_custkey"] == customers["c_custkey"]
    return orders.join(customers, same_customer, "left_outer")

In [0]:
# Materialized view with new column

from pyspark.sql.functions import current_timestamp

@dlt.table(
    name="orders_customers_joined_silver",
    comment="Orders joined with customers",
    table_properties={"quality": "silver"},
)
def orders_customers_joined_silver():
    """Return the rows of ``joined_view`` with an added ``__insert_date`` column.

    ``__insert_date`` is set from ``current_timestamp()``.
    """
    # Batch read of the intermediate view defined in this pipeline
    orders_with_customers = spark.read.table("LIVE.joined_view")
    return orders_with_customers.withColumn("__insert_date", current_timestamp())

In [0]:
# Aggregate by c_mktsegment and count the orders in each segment

from pyspark.sql.functions import count

@dlt.table(
    name="orders_agg_mktsegment",
    comment="Orders aggregated based on market segment",
    table_properties={"quality": "gold"},
)
def orders_agg_mktsegment():
    """Return one row per ``c_mktsegment`` with its order count.

    Output columns: ``c_mktsegment``, ``num_orders`` (count of ``o_orderkey``),
    and ``__insert_date`` set from ``current_timestamp()``.
    """
    silver_rows = spark.read.table("LIVE.orders_customers_joined_silver")

    # count() on a column counts only non-null values of that column
    orders_per_segment = silver_rows.groupBy("c_mktsegment").agg(
        count("o_orderkey").alias("num_orders")
    )
    return orders_per_segment.withColumn("__insert_date", current_timestamp())