# Chapter 9: Streaming in and out of your Delta Lake

draft: 3

# Notebook Setup

In [11]:
# Create some resources
import os
os.makedirs("./delta/ch9/")  # create working directory for this notebook
os.makedirs("./delta/ch9/ckpt/")  # create a common checkpointing directory 

# set COVID dataset path
covid_data_path = '/opt/spark/work-dir/rs/data/COVID-19_NYT'

# Delta `readStream` example 

In [12]:
streamingDeltaDf = (
    spark
    .readStream
    .format("delta")
    .option("ignoreDeletes", "true")
    .load(covid_data_path)
    )

# Delta `writeStream` example

In [14]:
(streamingDeltaDf
.writeStream
.format("delta")
.outputMode("append")
.option("checkpointLocation", ".ckpt/ws1/")
.start("./covid/")
);

23/05/19 12:33:26 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
23/05/19 12:33:26 WARN StreamingQueryManager: Stopping existing streaming query [id=304120fb-6798-450f-a033-a123559ec17b, runId=a5476195-bddf-4060-bad6-999f43cfab6e], as a new run is being started.


# Delta chained `readStream` to `writeStream`

In [17]:
from pyspark.sql.functions import col

(spark
.readStream
.format("delta")
.load("./covid/")
# in the book this says '<other transformation logic>'
.filter(col("deaths") > 0)
.writeStream
.format("delta")
.outputMode("append")
.option("checkpointLocation", ".ckpt/ws2/")
.start("./covid_deaths/")
);

23/05/19 12:41:04 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.StreamingQuery at 0x7ff9ddc14040>

23/05/19 12:41:05 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers


# Setting `ignoreDeletes`

In [18]:
streamingDeltaDf = (
    spark
    .readStream
    .format("delta")
    .option("ignoreDeletes", "true")
    .load("./covid_deaths/")
    );

# Setting `ignoreChanges`

In [24]:
streamingDeltaDf = (
    spark
    .readStream
    .format("delta")
    .option("ignoreChanges", "true")
    .load("./covid_deaths/")
    );

# Specify the `startingVersion`

In [22]:
streamingDeltaDf = (spark
.readStream
.format("delta")
.option("startingVersion", "5")
.load("./covid_deaths/")
);

# Specify the `startingTimestamp`

In [25]:
streamingDeltaDf = (spark
.readStream
.format("delta")
.option("startingTimestamp", "2023-04-18")
.load("./covid_deaths/")
);

# Setting `eventTimeOrder` with a watermark

In [26]:
streamingDeltaDf = (spark
.readStream
.format("delta")
.option("withEventTimeOrder", "true")
.load("./covid_deaths/")
.withWatermark("event_time", "10 seconds")
)

AnalysisException: Column 'event_time' does not exist. Did you mean one of the following? [state, county, date, deaths, fips, cases];
'EventTimeWatermark 'event_time, 10 seconds
+- StreamingRelation DataSource(org.apache.spark.sql.SparkSession@3228627a,delta,List(),None,List(),None,Map(withEventTimeOrder -> true, path -> ./covid_deaths/),None), delta, [date#2554, county#2555, state#2556, fips#2557, cases#2558, deaths#2559]


# Idempotent Fanout

Example of a `forEachBatch` function using `txnVersion` options for idempotency

In [None]:
app_id = ... # A unique string used as an application ID.

def writeToDeltaLakeTableIdempotent(batch_df, batch_id):
    # location 1
    (batch_df
    .write
    .format(“delta”)
    .option("txnVersion", batch_id)
    .option("txnAppId", app_id)
    .save("/<delta_path>/")
    )
    # location 2
    (batch_df
    .write
    .format(“delta”)
    .option("txnVersion", batch_id)
    .option("txnAppId", app_id)
    .save("/<delta_path>/")
    )

# Streaming Upsert

Using the Delta Lake `mergeBuilder` to create an upsert `forEachBatch` function and apply to a `writeStream`

In [None]:
from delta.tables import *

changesStream = ... # Streaming dataframe with CDC records

# Function to upsert microBatchDf into Delta table using merge
def upsertToDelta(microBatchDf, batchId):
    """Use Delta APIs to handle merge logic into table"""
    deltaTable = DeltaTable.forName(spark, "retail_db.transactions_silver") # Target table

    deltaTable.alias("dt") \
    .merge(
        source = microBatchDf.alias("sdf"),
        condition = "sdf.t_id = dt.t_id"
        ) \
    .whenMatchedDelete(condition = "sdf.operation = 'DELETE'") \
    .whenMatchedUpdate(set = {
        "t_id": "sdf.t_id",
        "transaction_date": "sdf.transaction_date",
        "item_count": "sdf.item_count",
        "amount": "sdf.amount"
        }) \
    .whenNotMatchedInsert(values = {
        "t_id": "sdf.t_id",
        "transaction_date": "sdf.transaction_date",
        "item_count": "sdf.item_count",
        "amount": "sdf.amount"
        }) \
    .execute()

# Write the output of a streaming aggregation query into Delta table
(changesStream
.writeStream
.format("delta")
.queryName("Summaries Silver Pipeline")
.foreachBatch(upsertToDelta)
.outputMode("update")
.start()
)

# Delta Live Tables (DLT)

A syntactical example, you'll have to go try out something like this on Databricks

In [None]:
import dlt

@dlt.table
def autoloader_dlt_bronze():
    return (
        spark
        .readStream
        .format("cloudFiles")
        .option("cloudFiles.format", "json")
        .load("<data path>")
    )

@dlt.table
def delta_dlt_silver():
    return (
        dlt
        .read_stream("autoloader_dlt_bronze")
        …
        <transformation logic>
        …
    )

@dlt.table
def live_delta_gold():
    return (
        dlt
        .read("delta_dlt_silver")
        …
        <aggregation logic>
        …
    )

# Defining Change Data Feed read boundaries in a batch process

In [None]:
# Specify the version as int or long
spark.read.format("delta") \
  .option("readChangeFeed", "true") \
  .option("startingVersion", 0) \
  .option("endingVersion", 10) \
  .table("myDeltaTable")

# Specify timestamps as formatted timestamp
spark.read.format("delta") \
  .option("readChangeFeed", "true") \
  .option("startingTimestamp", '2023-04-01 05:45:46') \
  .option("endingTimestamp", '2023-04-21 12:00:00') \
  .table("myDeltaTable")

# Providing only the startingVersion/timestamp
spark.read.format("delta") \
  .option("readChangeFeed", "true") \
  .option("startingTimestamp", '2023-04-21 12:00:00.001') \
  .table("myDeltaTable")

# Specifying similarly with a file location
spark.read.format("delta") \
  .option("readChangeFeed", "true") \
  .option("startingTimestamp", '2021-04-21 05:45:46') \
  .load("/pathToMyDeltaTable")

# Defining Change Data Feed read boundaries in a streaming process

In [None]:
# Specifying a starting version
spark.readStream.format("delta") \
  .option("readChangeFeed", "true") \
  .option("startingVersion", 0) \
  .load("/pathToMyDeltaTable")

# Specifying a starting timestamp
spark.readStream.format("delta") \
  .option("readChangeFeed", "true") \
  .option("startingTimestamp", "2021-04-21 05:35:43") \
  .load("/pathToMyDeltaTable")

# Not providing either option, i.e., process from beginning
spark.readStream.format("delta") \
  .option("readChangeFeed", "true") \
  .load("/pathToMyDeltaTable")

# Example for viewing changes (needs updating)

In [None]:
from pyspark.sql.functions import col

spark.sql("""
    people10m
        SET
            gender = 'F',
            firstName='Leah'
        WHERE
            firstName='Leo'
            and lastName='Conkay';
    """)

(spark
.read
.format("delta")
.option("readChangeFeed", "true")
.option("startingVersion", 5)
.option("endingVersion", 5)
.table("tristen.people10m")
.select(
    col("firstName"),
    col("lastName"),
    col("gender"),
    col("_change_type"),
    col("_commit_version"))
).show()
