In [1]:
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

Moving to an Incremental Pipeline in Delta Lake: Change Tracking
================================

This post shows how to use Change Tracking (todo link) in Delta Lake 2.0 to convert a batch pipeline to an incremental update pipeline. We'll cover two parts:

1. Capturing change tracking in a Delta Lake Merge job.
1. Converting a series of `join` operations to `merge` operations to produce a cheaper pipeline using incremental operations.

Setting Up a Scenario: 3 Tables
--------------------------------------

I've set up three tables:

1. Invoice
2. InvoiceItem
3. Product

The ground truth for these tables lives in a production system and is dumped to the data lake and merged into a delta lake table. The logic for this merge is given below.

In [2]:
import pyspark
from delta import *

builder = pyspark.sql.SparkSession.builder.appName("DeltaChangeFeedExample") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")  \
    .config("spark.databricks.delta.properties.defaults.enableChangeDataFeed", "true")
    

sc = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
# Day 0: Read the data and merge. This is just to get our tables set up. See Day 1 for a "normal" day.
products = sc.read.format("csv") \
                .option("header","true") \
                .load("./data/products/updates/day=0/") \
                .drop('_c0')

products.write.format("delta").save("./outputs/products")

invoices = sc.read.format("csv") \
                .option("header","true") \
                .load("./data/invoice/updates/day=0/") \
                .drop('_c0')

invoices.write.format("delta").save("./outputs/invoices")

invoiceitems = sc.read.format("csv") \
                .option("header","true") \
                .load("./data/invoiceitems/updates/day=0/") \
                .drop('_c0')

invoiceitems.write.format("delta").save("./outputs/invoiceitems")

New, assume there is a job that produces a normalized copy of the data that merges all three tables together. This data has one row per invoice item. We can perform normalization using a couple of joins. Occasionally we see "hiccups" where an invoice and invoice item exist in our data lake but the product has not yet been downloaded. This kind of delay can happen when tables are joined that come from different production systems. So, we'll left join products because they will occasionally be null. Bad things happen in complicated systems.

In [32]:
# Build normalized view on day 0
# build normalized join
product_base = DeltaTable.forPath(sc, "./outputs/products").toDF()
invoice_base = DeltaTable.forPath(sc, "./outputs/invoices").toDF()
invoiceitem_base = DeltaTable.forPath(sc, "./outputs/invoiceitems").toDF()

# Left join with invoice item as the root. This isn't important for invoices and invoice items, but is
# critical for products in this example since products may be pulled at different time cadence and, thus,
# not exist yet.
normalized_view = invoiceitem_base.join(invoice_base, invoiceitem_base.invoice == invoice_base.invoice_id, how="left")
normalized_view = normalized_view.join(product_base, normalized_view.product == product_base.product_id, how="left")

normalized_view.write.format("delta").save("./outputs/normalized")

normalized_view.write.format("delta").save("./outputs/normalized_copy")  # we'll use this later.

In [5]:
# Day 1: process both updates and deletes, which come in separate files
def read_data(table_location, day, has_deletes):
    updates = sc.read.format("csv") \
                .option("header","true") \
                .load(f"./data/{table_location}/updates/day={day}/") \
                .drop('_c0')
        
    if has_deletes:
        deletes = sc.read.format("csv") \
                .option("header", "true") \
                .load(f"./data/{table_location}/deletes/day={day}/") \
                .drop("_c0")
    else:
        deletes = None

    return updates, deletes

product_updates, _ = read_data("products", day=1, has_deletes=False)
product_base = DeltaTable.forPath(sc, "./outputs/products")
print(f"Updating {product_updates.count()} products and deleting 0 products.")

invoice_updates, invoice_deletes = read_data("invoice", day=1, has_deletes=True)
invoice_base = DeltaTable.forPath(sc, "./outputs/invoices")
print(f"Updating {invoice_updates.count()} invoices and deleting {invoice_deletes.count()} invoices.")

invoiceitem_updates, invoiceitem_deletes = read_data("invoiceitems", day=1, has_deletes=True)
invoiceitem_base = DeltaTable.forPath(sc, "./outputs/invoiceitems")
print(f"Updating {invoiceitem_updates.count()} invoiceitems and deleting {invoiceitem_deletes.count()} invoiceitems.")

Updating 50 products and deleting 0 products.
Updating 448 invoices and deleting 5 invoices.
Updating 1475 invoiceitems and deleting 18 invoiceitems.


In [6]:
# Day 1 continued: merge tables
product_base.alias("oldData") \
  .merge(
    product_updates.alias("newData"),
    "oldData.product_id = newData.product_id") \
  .whenMatchedUpdateAll() \
  .whenNotMatchedInsertAll() \
  .execute()

invoice_base.alias("oldData") \
  .merge(
    product_updates.alias("newData"),
    "oldData.invoice_id = newData.invoice_id") \
  .whenMatchedUpdateAll() \
  .whenNotMatchedInsertAll() \

invoice_base.alias("oldData") \
    .merge(invoice_deletes.alias("newData"), "oldData.invoice_id = newData.invoice_id") \
    .whenMatchedDelete() \
    .execute()

invoiceitem_base.alias("oldData") \
    .merge(
        invoiceitem_updates.alias("newData"),
        "oldData.invoice_item_id = newData.invoice_item_id"
    ) \
    .whenMatchedUpdateAll() \
    .whenNotMatchedInsertAll()
invoiceitem_base.alias("oldData") \
    .merge(
        invoiceitem_deletes.alias("newData"),         
        "oldData.invoice_item_id = newData.invoice_item_id"
    ) \
    .whenMatchedDelete() \
    .execute()

Basically, every day we merge in a new set of data from a production system. This could Create, Update, or Delete rows in any table. (An example where deletes as opposed to soft deletes might happen is GDPR compliance.) So, every day we get updated Delta Lake tables representing each table. These are normally created with merge commands to take advantage of partitions.

Every day we also need to rebuild our normalized table. This code is a copy of Day 0:

In [7]:
# build normalized join
product_base = DeltaTable.forPath(sc, "./outputs/products").toDF()
invoice_base = DeltaTable.forPath(sc, "./outputs/invoices").toDF()
invoiceitem_base = DeltaTable.forPath(sc, "./outputs/invoiceitems").toDF()

# Left join with invoice item as the root. This isn't important for invoices and invoice items, but is
# critical for products in this example since products may be pulled at different time cadence and, thus,
# not exist yet.
normalized_view = invoiceitem_base.join(invoice_base, invoiceitem_base.invoice == invoice_base.invoice_id, how="left")
normalized_view = normalized_view.join(product_base, normalized_view.product == product_base.product_id, how="left")

normalized_view.write.format("delta").mode("overwrite").save("./outputs/normalized")

There are two things I hate about this join. First, we have to load the entire table every day to produce our join. If we tried to load, say, only data changed on `day=1` then we would risk join failures because of products that were not changed on day 1. For instance, say that a product is created on day 0 and used in an invoice on day 0. Mistimed data copies from the Products service and the Invoices service could result in the invoice copying over but the product not copying over. On day 1, the new product will be copied to the lake. If we rerun all data, the invoice from day 0 will be updated, but it means we have to load invoices from day 0 even if they didn't change!

Second, the normalized data pulls the most recent value for any product not the value that was active when an invoice item was created. If we change the price in our product table, for instance, then the next day's normalized data will set that new price for all previous invoice items. This can be misleading!


In [8]:
from pyspark.sql.functions import col, row_number

sample_product = 'p_0'
print(f"Product {sample_product} on day=0 had price {products.filter(col('product_id') == sample_product).collect()[0].asDict()['price']}")
print(f"Product {sample_product} on day=1 had price {product_updates.filter(col('product_id') == sample_product).collect()[0].asDict()['price']}")

print("Prices in the combined field are all:")
normalized_view.filter(col('product') == sample_product).select('invoice_item_id', 'invoice_modified', 'price').limit(10).toPandas()


Product p_0 on day=0 had price 25.36
Product p_0 on day=1 had price 54.95
Prices in the combined field are all:


Unnamed: 0,invoice_item_id,invoice_modified,price
0,bcf91a16-abf2-4504-a7c8-9e91342c543f,day0,54.95
1,19974cd5-9cef-4562-bd58-535e7f80db49,day0,54.95
2,2e04c9a3-a2e9-4c06-a872-52199eff51f1,day0,54.95
3,668e762d-a4f8-4d6d-9cfc-d381744d1d4d,day0,54.95
4,cb5f59db-c24a-4d0b-81c2-de7f50646985,day0,54.95
5,21d11808-bd5b-4145-84c5-1da0bade6988,day0,54.95
6,29e9a1c8-a27d-45c9-9431-4fb18e084bfa,day0,54.95
7,387c97c7-f69f-4d3b-b34c-6de473649d33,day0,54.95
8,9997d809-8eee-49eb-bc61-d74ff0ff8abf,day0,54.95
9,f2ba58a7-afb1-461e-b1e7-3fb0f504b964,day0,54.95


Even though the invoice was created and only modified on day 0 when the price was 43.66, the price in our normalized view on day=1 was updated to read 54.95. This can be fixed, but requires more work.

Enabling Change Tracking and Converting to Incremental Jobs
---------------------------------------------------

What we really want is to be able to track the changes that we introduce when day1 data merges into each of our three base tables `product`, `invoice`, and `invoice_item`. It turns out Delta Lake supports change tracking as of V2.0.0. They call this feature the [change data feed](https://docs.delta.io/2.0.0/delta-change-data-feed.html). We enabled it in the top cell of this notebook when we added this setting to our spark context:

`.config("spark.databricks.delta.properties.defaults.enableChangeDataFeed", "true")`

When you write data - Create, Update, or Delete - to a Delta table with Change Data Feed enabled, Delta lake writes additional parquet files that track which rows were inserted, created, or deleted in each transaction. You can read the change records by enabling option `.option('readChangeFeed', 'true')` during reads. Below we look at a few change records for the `invoice` and `product` tables.


In [14]:
invoice_change_data = sc.read.format("delta") \
  .option("readChangeFeed", "true") \
  .option("startingVersion", '0') \
  .load("./outputs/invoices")

product_change_data = sc.read.format("delta") \
  .option("readChangeFeed", "true") \
  .option("startingVersion", '0') \
  .load("./outputs/products")

The change feed is a list of updates in the table between startingVersion and most recent version.

In [10]:
from pyspark.sql.window import Window

sample_window = Window.partitionBy("_change_type").orderBy('invoice_id')
invoice_change_data.withColumn('sample', row_number().over(sample_window)).filter(col('sample') < 3).toPandas()

Unnamed: 0,invoice_id,customer,status,invoice_modified,invoice_created,_change_type,_commit_version,_commit_timestamp,sample
0,inv_313,8a924220-24d8-4fc5-9d43-a1c20a769ea9,sent,day0,day0,delete,1,2022-11-12 14:30:12.378,1
1,inv_334,f9e4bd75-5cfe-48de-a6b2-b25b66eef8b0,sent,day0,day0,delete,1,2022-11-12 14:30:12.378,2
2,inv_0,70a17298-62dd-4eb8-aeb0-5ac32e2add65,sent,day0,day0,insert,0,2022-11-12 14:29:55.968,1
3,inv_1,e84f85dc-68c4-42e2-84d5-74288f60a2a0,sent,day0,day0,insert,0,2022-11-12 14:29:55.968,2


In [11]:
sample_window = Window.partitionBy("_change_type").orderBy('product_id')
product_change_data.withColumn('sample', row_number().over(sample_window)).filter(col('sample') < 3).orderBy('product_id').toPandas()

Unnamed: 0,product_id,product_name,price,product_modified,product_created,_change_type,_commit_version,_commit_timestamp,sample
0,p_0,widget0,25.36,day0,day0,insert,0,2022-11-12 14:29:49.338,1
1,p_0,widget0,54.95,day1,day0,update_postimage,1,2022-11-12 14:30:09.294,1
2,p_0,widget0,25.36,day0,day0,update_preimage,1,2022-11-12 14:30:09.294,1
3,p_1,sproket1,65.8,day0,day0,insert,0,2022-11-12 14:29:49.338,2
4,p_10,sproket10,28.37,day1,day0,update_postimage,1,2022-11-12 14:30:09.294,2
5,p_10,sproket10,57.55,day0,day0,update_preimage,1,2022-11-12 14:30:09.294,2


The `_change_type` shows 4 different types:

* insert is a create
* delete is a delete
* update_preimage is the before side of an update.
* update_postimage is the after side of an update.

Both update_preimage and update_postimage happen on day=1 (`_commit_version = 1`) and you can see the pre and post prices.

The really cool thing about Change Data tracking is that we can create our normalized_view of data on day=1 without reading the entire existing normalized_view table. 
We accomplish this with a `merge` that uses Change Data input. To see this, I'm going to use time travel to get a copy of the `normalized_view` as it existed at day=0.

In [12]:
normalized_view_day_0 = sc.read.format("delta") \
  .option("versionAsOf", '0') \
  .load("./outputs/normalized")

In [13]:
# prove we have no day 1 data in our time travel:
import pyspark.sql.functions as F

normalized_view_day_0.agg(
    F.max(col('invoice_modified')), 
    F.max(col('invoice_item_modified')), 
    F.max(col('product_modified'))).toPandas()

Unnamed: 0,max(invoice_modified),max(invoice_item_modified),max(product_modified)
0,day0,day0,day0


In [59]:
invoice_change_data = sc.read.format("delta") \
  .option("readChangeFeed", "true") \
  .option("startingVersion", '1') \
  .load("./outputs/invoices")

invoice_change_data = invoice_change_data.filter(col('_change_type') != 'update_preimage')

invoice_items_change_data = sc.read.format("delta") \
  .option("readChangeFeed", "true") \
  .option("startingVersion", '1') \
  .load("./outputs/invoiceitems")

invoice_items_change_data = invoice_items_change_data.filter(col('_change_type') != 'update_preimage')

products_change_data = sc.read.format("delta") \
  .option("readChangeFeed", "true") \
  .option("startingVersion", '1') \
  .load("./outputs/products")

products_change_data = products_change_data.filter(col('_change_type') != 'update_preimage')

In [61]:
normalized_view_day_0_copy = DeltaTable.forPath(sc, "./outputs/normalized_copy")

normalized_view_day_0_copy.alias('left').merge(
        invoice_items_change_data.alias('right'),
        'left.invoice_item_id = right.invoice_item_id'
    ).whenMatchedUpdate(set={
        'invoice': 'right.invoice',
        'count': 'right.count',
        'invoice_item_modified': 'right.invoice_item_modified',
        'invoice_item_created': 'right.invoice_item_created',
        'product': 'right.product'
    }, condition = 'right._change_type != "delete"') \
    .whenMatchedDelete(condition = 'right._change_type = "delete"') \
    .whenNotMatchedInsert(values={
        'invoice_item_id': 'right.invoice_item_id',
        'invoice': 'right.invoice',
        'count': 'right.count',
        'invoice_item_modified': 'right.invoice_item_modified',
        'invoice_item_created': 'right.invoice_item_created',
        'product': 'right.product'
    })

normalized_view_day_0_copy.alias('left').merge(
        invoice_change_data.alias('right'),
        'left.invoice = right.invoice_id'           # NOTE THIS! We'll discuss below.
    ).whenMatchedUpdate(set={
        'invoice_id': 'right.invoice_id',
        'customer': 'right.customer',
        'invoice_modified': 'right.invoice_modified',
        'invoice_created': 'right.invoice_created',
        'status': 'right.status',
    }, condition = 'right._change_type != "delete"') \
    .whenMatchedUpdate(                             # NOTE THIS! We'll discuss below.
        condition ='right._change_type = "delete"',
        set={
            'invoice_id': 'NULL',
            'invoice': 'NULL',
            'customer': 'NULL',
            'invoice_modified': 'NULL',
            'invoice_created': 'NULL',
            'status': 'NULL'
        }) \
    .whenNotMatchedInsert(values={                     # Note THIS! We'll discuss below.
        'invoice_id': 'right.invoice_id',
        'customer': 'right.customer',
        'invoice_modified': 'right.invoice_modified',
        'invoice_created': 'right.invoice_created',
        'status': 'right.status'
    })
    
    
final_merge = normalized_view_day_0_copy.alias('left').merge(
        products_change_data.alias('right'),
        'left.product = right.product_id'
    ).whenMatchedUpdate(set={
        'product_id': 'right.product_id',
        'product_name': 'right.product_name',
        'price': 'right.price',
        'product_modified': 'right.product_modified',
        'product_created': 'right.product_created'
    }, condition ='right._change_type != "delete"') \
    .whenMatchedUpdate(
        condition ='right._change_type = "delete"',
        set={
            'invoice_id': 'NULL',
            'invoice': 'NULL',
            'customer': 'NULL',
            'invoice_modified': 'NULL',
            'invoice_created': 'NULL',
            'status': 'NULL'
        }) \
    .whenNotMatchedInsert(values={
        'product_id': 'right.product_id',
        'product_name': 'right.product_name',
        'price': 'right.price',
        'product_modified': 'right.product_modified',
        'product_created': 'right.product_created'
    })
    
final_merge.execute()

In [None]:
# todo
# talk about each of your issues.
# check the outputs.