# Setup

In [1]:
import pyspark
from delta import configure_spark_with_delta_pip
from delta.tables import DeltaTable
from pyspark.sql import functions as F

# Build a Spark session with the Delta Lake SQL extension and catalog enabled.
builder = (
    pyspark.sql.SparkSession.builder.appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

# configure_spark_with_delta_pip registers the Delta Lake package on the builder
# before the session is created.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-330-delta-220/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-2510fa97-bf89-45db-941b-172bd811321f;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.2.0 in central
	found io.delta#delta-storage;2.2.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
:: resolution report :: resolve 111ms :: artifacts dl 5ms
	:: modules in use:
	io.delta#delta-core_2.12;2.2.0 from central in [default]
	io.delta#delta-storage;2.2.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |  

23/03/29 10:13:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# Delta Lake merge with whenNotMatchedInsert

In [22]:
# Seed rows for the people table: (id, name, age).
data = [(0, "Bob", 23), (1, "Sue", 25), (2, "Jim", 27)]

In [23]:
# Turn the tuples into a DataFrame and name its columns id, name and age.
df = spark.createDataFrame(data).toDF("id", "name", "age")

In [24]:
# Preview the rows that will seed the Delta table.
df.show()

+---+----+---+
| id|name|age|
+---+----+---+
|  0| Bob| 23|
|  1| Sue| 25|
|  2| Jim| 27|
+---+----+---+



In [25]:
# Write the rows as a Delta table backed by a single data file. No save mode is set,
# so the write raises if /tmp/people already exists (the last cell removes it).
df.repartition(1).write.format("delta").save("/tmp/people")

In [26]:
# Source rows for the insert-only merge, keyed on id.
new_data = [
    (0, "Bob", 23),  # id 0 already exists in the original dataset above
    (3, "Sally", 30),  # id 3 is new
    (7, "Bob", 49),  # id 7 is new
]

In [27]:
# Source DataFrame for the merge, with the same column names as the target table.
new_df = spark.createDataFrame(new_data).toDF("id", "name", "age")

In [28]:
# Preview the source rows before merging them.
new_df.show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  0|  Bob| 23|
|  3|Sally| 30|
|  7|  Bob| 49|
+---+-----+---+



In [29]:
from delta.tables import DeltaTable

In [30]:
# Open the Delta table written above as a DeltaTable handle; the merges below run on it.
people_table = DeltaTable.forPath(spark, "/tmp/people")

In [31]:
# Confirm the table holds the three original rows before merging.
people_table.toDF().show()

+---+----+---+
| id|name|age|
+---+----+---+
|  0| Bob| 23|
|  1| Sue| 25|
|  2| Jim| 27|
+---+----+---+



In [32]:
# Insert-only merge: source rows whose id has no match in the target are appended;
# rows that do match are left untouched because no whenMatched clause is given.
insert_only_merge = people_table.alias("target").merge(
    new_df.alias("source"),
    "target.id = source.id",
)
insert_only_merge.whenNotMatchedInsert(
    values={"id": "source.id", "name": "source.name", "age": "source.age"}
).execute()

23/03/29 10:30:08 WARN MergeIntoCommand: Merge source has SQLMetric(id: 1620, name: Some(number of source rows), value: 3) rows in initial scan but SQLMetric(id: 1621, name: Some(number of source rows (during repeated scan)), value: 0) rows in second scan


In [33]:
# Ids 3 and 7 were inserted; id 0 already existed and is unchanged.
people_table.toDF().show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  0|  Bob| 23|
|  1|  Sue| 25|
|  2|  Jim| 27|
|  3|Sally| 30|
|  7|  Bob| 49|
+---+-----+---+



In [30]:
# List the table directory: the _delta_log folder plus the Parquet data files.
!ls /tmp/people

[1m[36m_delta_log[m[m
part-00000-37993f6a-e72c-47d1-8023-7c056a5e89d2-c000.snappy.parquet
part-00000-d542c8b2-1a61-4976-823a-0bd84536449d-c000.snappy.parquet


In [31]:
# Commit 0 of the transaction log: the initial WRITE that created the table.
!cat /tmp/people/_delta_log/00000000000000000000.json

{"commitInfo":{"timestamp":1676395230116,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"3","numOutputBytes":"980"},"engineInfo":"Apache-Spark/3.3.0 Delta-Lake/2.2.0","txnId":"e175778a-7fc0-4df7-9957-6ee4f0d3401f"}}
{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}
{"metaData":{"id":"c7747e8d-239e-4697-82bc-333445459721","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"age\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1676395229956}}
{"add":{"path":"part-00000-d542c8b2-1a61-4976-823a-0bd84536449d-c000.snappy.parquet","partitionValues":{},"size":980,"modificationTime":16763952

In [32]:
# Commit 1 of the transaction log: the MERGE, which added one new data file.
!cat /tmp/people/_delta_log/00000000000000000001.json

{"commitInfo":{"timestamp":1676395236476,"operation":"MERGE","operationParameters":{"predicate":"(target.id = source.id)","matchedPredicates":"[]","notMatchedPredicates":"[{\"actionType\":\"insert\"}]"},"readVersion":0,"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numTargetRowsCopied":"0","numTargetRowsDeleted":"0","numTargetFilesAdded":"1","executionTimeMs":"261","numTargetRowsInserted":"2","scanTimeMs":"0","numTargetRowsUpdated":"0","numOutputRows":"2","numTargetChangeFilesAdded":"0","numSourceRows":"3","numTargetFilesRemoved":"0","rewriteTimeMs":"259"},"engineInfo":"Apache-Spark/3.3.0 Delta-Lake/2.2.0","txnId":"b50edde7-39c9-4ecf-adab-3eee28958eab"}}
{"add":{"path":"part-00000-37993f6a-e72c-47d1-8023-7c056a5e89d2-c000.snappy.parquet","partitionValues":{},"size":975,"modificationTime":1676395236473,"dataChange":true,"stats":"{\"numRecords\":2,\"minValues\":{\"id\":3,\"name\":\"Henry\",\"age\":30},\"maxValues\":{\"id\":4,\"name\":\"Sally\",\"age\":33},\"nu

# Delta Lake merge with whenMatchedUpdate

In [33]:
# Source rows for the update-or-insert merge in the next cell.
# NOTE(review): the insert-only merge earlier in this notebook adds ids 3 and 7, so as
# written id 4 has no match in the table and is inserted rather than updated — confirm
# the intended input data.
new_data = [
    (4, "Henry", 34),
    (5, "Allie", 22),
]

new_df = spark.createDataFrame(new_data).toDF("id", "name", "age").repartition(1)

In [34]:
# Upsert: matched ids only have their age overwritten from the source; source rows
# without a match are inserted with all of their columns.
upsert_merge = people_table.alias("target").merge(
    new_df.alias("source"),
    "target.id = source.id",
)
upsert_merge = upsert_merge.whenMatchedUpdate(set={"age": "source.age"})
upsert_merge.whenNotMatchedInsertAll().execute()

In [35]:
# Show the table after the upsert.
people_table.toDF().show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  3|Sally| 30|
|  4|Henry| 34|
|  5|Allie| 22|
|  0|  Bob| 23|
|  1|  Sue| 25|
|  2|  Jim| 27|
+---+-----+---+



# Delta Lake merge vs Parquet table merge

In [62]:
# Build a plain Parquet copy of the starting table to compare against Delta Lake.
data = [(0, "Bob", 23), (1, "Sue", 25), (2, "Jim", 27)]

df = spark.createDataFrame(data).toDF("id", "name", "age")
# Overwrite so this cell can be re-run: without an explicit mode the write raises
# when the path already exists.
df.repartition(1).write.mode("overwrite").format("parquet").save("/tmp/parquet/people")

target = spark.read.format("parquet").load("/tmp/parquet/people")
target.cache()

# Rows to fold into the Parquet table: id 0 is already present, ids 3 and 4 are not.
new_data = [
    (0, "Bob", 23),
    (3, "Sally", 30),
    (4, "Henry", 33),
]

source = spark.createDataFrame(new_data).toDF("id", "name", "age").repartition(1)

23/02/14 09:36:07 WARN CacheManager: Asked to cache already cached data.


In [63]:
# Prefix every column with its side so the two inputs stay distinguishable after the join.
source_prefix = source.select([F.col(c).alias("source_" + c) for c in source.columns])
target_prefix = target.select([F.col(c).alias("target_" + c) for c in target.columns])

# A full outer join keeps target-only rows, source-only rows and matched rows.
joined_df = source_prefix.join(
    target_prefix, target_prefix.target_id == source_prefix.source_id, "full_outer"
)

# For each column keep the target value and fall back to the source value only when
# the target side is NULL. coalesce(target, source) is equivalent to
# CASE WHEN target IS NULL THEN source ELSE target END.
# Assumes source carries every column that target has (both are id, name, age here).
final_df = joined_df.select(
    [
        F.coalesce(F.col("target_" + c), F.col("source_" + c)).alias(c)
        for c in target.columns
    ]
)

In [64]:
# Persist the merged result to a separate directory, replacing any previous output there.
final_df.write.mode("overwrite").format("parquet").save("/tmp/parquet/people_new")

                                                                                

# Apply change data with merge

In [67]:
# Show the table before applying the change records.
people_table.toDF().show()

# Change records: the fourth field (_op) names the operation to apply for that id.
new_data = [
    (9, "Richard", 75, "INSERT"),
    (3, "Sally", 31, "UPDATE"),
    (0, "Bob", 23, "DELETE"),
]


new_df = spark.createDataFrame(new_data).toDF("id", "name", "age", "_op").repartition(1)

+---+-----+---+
| id| name|age|
+---+-----+---+
|  3|Sally| 30|
|  4|Henry| 34|
|  5|Allie| 22|
|  0|  Bob| 23|
|  1|  Sue| 25|
|  2|  Jim| 27|
+---+-----+---+



In [68]:
# Apply the change records in one merge, routing each source row by its _op flag:
#   INSERT -> add the row when its id is not in the target
#   UPDATE -> overwrite the matched row with the source values
#   DELETE -> remove the matched row
row_from_source = {"id": "source.id", "name": "source.name", "age": "source.age"}

cdc_merge = people_table.alias("target").merge(
    new_df.alias("source"),
    "target.id = source.id",
)
cdc_merge = cdc_merge.whenNotMatchedInsert(
    condition='source._op = "INSERT"',
    values=row_from_source,
)
cdc_merge = cdc_merge.whenMatchedUpdate(
    condition='source._op = "UPDATE"',
    set=row_from_source,
)
cdc_merge = cdc_merge.whenMatchedDelete(condition='source._op = "DELETE"')
cdc_merge.execute()

# Delta Lake merge for partial Change Data

In [71]:
# Show the table before applying the partial change records.
people_table.toDF().show()

# Partial change records: None marks a column that should keep its current value
# (the merge below falls back to the target value when the source is NULL).
new_data = [
    (1, "SueNew", None, "UPDATE"),
    (3, None, 32, "UPDATE"),
]

new_df = spark.createDataFrame(new_data).toDF("id", "name", "age", "_op").repartition(1)

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|    Sue| 25|
|  2|    Jim| 27|
|  3|  Sally| 31|
|  4|  Henry| 34|
|  5|  Allie| 22|
|  9|Richard| 75|
+---+-------+---+



In [72]:
# Partial update: a NULL in a source column means "keep the value the target already
# has", so each assignment falls back to the target column when the source is NULL.
partial_update = {
    "id": "source.id",
    "name": "CASE WHEN source.name IS NOT NULL THEN source.name ELSE target.name END",
    "age": "CASE WHEN source.age IS NOT NULL THEN source.age ELSE target.age END",
}

partial_merge = people_table.alias("target").merge(
    new_df.alias("source"),
    "target.id = source.id",
)
partial_merge.whenMatchedUpdate(
    condition='source._op = "UPDATE"',
    set=partial_update,
).execute()

In [73]:
# Verify: id 1 took the new name and kept its age; id 3 took the new age and kept its name.
people_table.toDF().show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  9|Richard| 75|
|  5|  Allie| 22|
|  1| SueNew| 25|
|  3|  Sally| 32|
|  2|    Jim| 27|
|  4|  Henry| 34|
+---+-------+---+



# Cleanup

In [21]:
!rm -rf /tmp/people