# Chapter 9: Streaming in and out of your Delta Lake

first draft

In [None]:
# Get dataset(s)
# Define spark kernel

In [None]:
# Delta readStream example

# Streaming DataFrame over the Delta table at /files/delta/user_events,
# read with the ignoreDeletes option set to "true".
streamingDeltaDf = (
    spark.readStream
    .format("delta")
    .option("ignoreDeletes", "true")
    .load("/files/delta/user_events")
)

In [None]:
# Delta writeStream example

# Append the rows of streamingDeltaDf to a Delta table as a streaming sink.
# "/<delta_path>/" is a placeholder: substitute the real table location.
# NOTE(review): a Delta sink normally also needs a checkpointLocation option
# unless a default is configured on the session -- confirm for your setup.
(
    streamingDeltaDf.writeStream
    .format("delta")
    .outputMode("append")
    .start("/<delta_path>/")
)


In [None]:
# Delta chained readStream to writeStream

# Read one Delta table as a stream and write the result straight into another
# Delta table in a single chained expression. "/<delta_path>/" is a
# placeholder for the target table location.
(
    spark.readStream
    .format("delta")
    .option("ignoreChanges", "true")
    .load("/files/delta/user_events")
    # ...
    # <other transformation logic>
    # ...
    .writeStream
    .format("delta")
    .outputMode("append")
    .start("/<delta_path>/")
)


In [None]:
# Setting ignoreDeletes

# Same streaming read as before, shown here to highlight the ignoreDeletes
# reader option.
streamingDeltaDf = (
    spark.readStream
    .format("delta")
    .option("ignoreDeletes", "true")
    .load("/files/delta/user_events")
)

In [None]:
# Setting ignoreChanges

# Streaming read of the same table, this time with the ignoreChanges reader
# option set to "true".
streamingDeltaDf = (
    spark.readStream
    .format("delta")
    .option("ignoreChanges", "true")
    .load("/files/delta/user_events")
)

In [None]:
# Specify the starting version

# Streaming read with the startingVersion option set to "5".
(
    spark.readStream
    .format("delta")
    .option("startingVersion", "5")
    .load("/files/delta/user_events")
)

In [None]:
# Specify the starting timestamp

# Streaming read with the startingTimestamp option set to "2023-04-18".
(
    spark.readStream
    .format("delta")
    .option("startingTimestamp", "2023-04-18")
    .load("/files/delta/user_events")
)

In [None]:
# Setting withEventTimeOrder with a watermark

# Streaming read with withEventTimeOrder enabled, followed by a 10-second
# watermark on the event_time column.
(
    spark.readStream
    .format("delta")
    .option("withEventTimeOrder", "true")
    .load("/files/delta/user_events")
    .withWatermark("event_time", "10 seconds")
)

In [None]:
# Example of a foreachBatch function using txnVersion options for idempotency

app_id = ...  # A unique string used as an application ID.


def writeToDeltaLakeTableIdempotent(batch_df, batch_id):
    """Write one micro-batch to two Delta locations using idempotent-write options.

    Each write carries ``txnAppId`` (the module-level ``app_id``) and
    ``txnVersion`` (the micro-batch id), the pair of options Delta Lake uses
    for idempotent writes, so a retried micro-batch is not written twice.

    Args:
        batch_df: The micro-batch DataFrame passed in by ``foreachBatch``.
        batch_id: The micro-batch id passed in by ``foreachBatch``; used as
            the ``txnVersion`` of both writes.
    """
    # Both paths below are placeholders: replace each with its own table path.

    # location 1
    (
        batch_df.write
        .format("delta")
        # The default save mode raises once the target exists, which is the
        # case from the second micro-batch on, so append explicitly.
        .mode("append")
        .option("txnVersion", batch_id)
        .option("txnAppId", app_id)
        .save("/<delta_path>/")
    )
    # location 2
    (
        batch_df.write
        .format("delta")
        .mode("append")
        .option("txnVersion", batch_id)
        .option("txnAppId", app_id)
        .save("/<delta_path>/")
    )

In [None]:
# Using the Delta Lake mergeBuilder to create an upsert foreachBatch function and apply it to a writeStream

from delta.tables import *

changesStream = ...  # Streaming dataframe with CDC records


def upsertToDelta(microBatchDf, batchId):
    """Merge one micro-batch of CDC records into the target Delta table.

    Source rows are matched to target rows on ``t_id``. Matched rows whose
    source ``operation`` is 'DELETE' are deleted, the remaining matched rows
    are updated, and unmatched source rows are inserted.
    """
    # Target table of the merge
    target = DeltaTable.forName(spark, "retail_db.transactions_silver")

    # Target column -> source expression, shared by the update and insert clauses
    column_map = {
        "t_id": "sdf.t_id",
        "transaction_date": "sdf.transaction_date",
        "item_count": "sdf.item_count",
        "amount": "sdf.amount",
    }

    (
        target.alias("dt")
        .merge(source=microBatchDf.alias("sdf"), condition="sdf.t_id = dt.t_id")
        .whenMatchedDelete(condition="sdf.operation = 'DELETE'")
        .whenMatchedUpdate(set=column_map)
        .whenNotMatchedInsert(values=column_map)
        .execute()
    )

# Run upsertToDelta on every micro-batch of the CDC changes stream
(
    changesStream.writeStream
    .format("delta")
    .queryName("Summaries Silver Pipeline")
    .foreachBatch(upsertToDelta)
    .outputMode("update")
    .start()
)

In [None]:
# Delta Live Tables (DLT) syntactical example

import dlt

@dlt.table
def autoloader_dlt_bronze():
    # Bronze table: stream JSON files from "<data path>" (a placeholder)
    # through the cloudFiles source.
    json_reader = (
        spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", "json")
    )
    return json_reader.load("<data path>")

@dlt.table
def delta_dlt_silver():
    # Silver table: streaming read of the bronze table defined in this
    # pipeline. The commented lines mark where transformation steps would be
    # chained onto the stream.
    return (
        dlt
        .read_stream("autoloader_dlt_bronze")
        # ...
        # <transformation logic>
        # ...
    )

@dlt.table
def live_delta_gold():
    # Gold table: non-streaming read (dlt.read) of the silver table defined in
    # this pipeline. The commented lines mark where aggregation steps would be
    # chained on.
    return (
        dlt
        .read("delta_dlt_silver")
        # ...
        # <aggregation logic>
        # ...
    )

In [None]:
# Defining Change Data Feed read boundaries in a batch process

# Bound the read by table versions (given as int or long)
(
    spark.read
    .format("delta")
    .option("readChangeFeed", "true")
    .option("startingVersion", 0)
    .option("endingVersion", 10)
    .table("myDeltaTable")
)

# Bound the read by formatted timestamps
(
    spark.read
    .format("delta")
    .option("readChangeFeed", "true")
    .option("startingTimestamp", "2023-04-01 05:45:46")
    .option("endingTimestamp", "2023-04-21 12:00:00")
    .table("myDeltaTable")
)

# Give only a starting version/timestamp and leave the end open
(
    spark.read
    .format("delta")
    .option("readChangeFeed", "true")
    .option("startingTimestamp", "2023-04-21 12:00:00.001")
    .table("myDeltaTable")
)


# The same kind of read addressed by file location instead of table name
(
    spark.read
    .format("delta")
    .option("readChangeFeed", "true")
    .option("startingTimestamp", "2021-04-21 05:45:46")
    .load("/pathToMyDeltaTable")
)

In [None]:
# Defining Change Data Feed read boundaries in a streaming process

# Start the stream from a given table version
(
    spark.readStream
    .format("delta")
    .option("readChangeFeed", "true")
    .option("startingVersion", 0)
    .load("/pathToMyDeltaTable")
)

# Start the stream from a given timestamp
(
    spark.readStream
    .format("delta")
    .option("readChangeFeed", "true")
    .option("startingTimestamp", "2021-04-21 05:35:43")
    .load("/pathToMyDeltaTable")
)

# Give neither option, i.e., process from the beginning
(
    spark.readStream
    .format("delta")
    .option("readChangeFeed", "true")
    .load("/pathToMyDeltaTable")
)


In [None]:
# Example for viewing changes (needs updating)

from pyspark.sql.functions import col

# Modify one record so that there is a commit whose row-level changes can be
# inspected through the change data feed.
# NOTE(review): the UPDATE targets `people10m` while the read below uses
# `tristen.people10m` -- confirm both names resolve to the same table.
spark.sql("""
    UPDATE people10m
        SET
            gender = 'F',
            firstName='Leah'
        WHERE
            firstName='Leo'
            and lastName='Conkay';
    """)

# Read the change feed for a single version (start and end are both 5) and
# show the changed columns alongside the change-feed metadata columns.
# NOTE(review): assumes the UPDATE above was committed as version 5 -- TODO
# confirm against the table history.
(
    spark.read
    .format("delta")
    .option("readChangeFeed", "true")
    .option("startingVersion", 5)
    .option("endingVersion", 5)
    .table("tristen.people10m")
    .select(
        col("firstName"),
        col("lastName"),
        col("gender"),
        col("_change_type"),
        col("_commit_version"),
    )
    .show()
)
