# Delta Lake Change Data Feed

In [1]:
import pyspark
from delta import DeltaTable, configure_spark_with_delta_pip
from pyspark.sql import functions as F

# Build a session with the Delta Lake SQL extension and the Delta catalog
# registered as the default `spark_catalog`.
builder = (
    pyspark.sql.SparkSession.builder.appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

# Let the pip-installed delta package add its delta-core dependency to the
# builder, then start (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-332-delta-230/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-cf749fff-aae0-4b64-b77c-c9c566e41cb9;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.3.0 in central
	found io.delta#delta-storage;2.3.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
:: resolution report :: resolve 100ms :: artifacts dl 4ms
	:: modules in use:
	io.delta#delta-core_2.12;2.3.0 from central in [default]
	io.delta#delta-storage;2.3.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |  

23/04/19 12:05:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## Create Delta table with change data feed enabled

In [2]:
# Set the legacy "create Hive table by default" option to false for this session.
legacy_hive_default_key = "spark.sql.legacy.createHiveTableByDefault"
spark.conf.set(legacy_hive_default_key, "false")

In [3]:
# Create the `students` Delta table with the change data feed table property
# switched on. The statement is split by clause for readability; the adjacent
# literals concatenate into a single SQL string.
create_students_sql = (
    "CREATE TABLE students (id LONG, name STRING, age LONG) "
    "USING delta "
    "TBLPROPERTIES (delta.enableChangeDataFeed = true)"
)
spark.sql(create_students_sql)

23/04/19 12:05:49 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

DataFrame[]

### Append data

In [4]:
# First batch of students: three (id, name, age) rows.
student_rows = [(0, "Bob", 23), (1, "Sue", 25), (2, "Jim", 27)]
student_columns = ["id", "name", "age"]
df = spark.createDataFrame(student_rows).toDF(*student_columns)

In [5]:
# Append the batch to the `students` Delta table as a single partition.
(
    df.repartition(1)
    .write.format("delta")
    .mode("append")
    .saveAsTable("students")
)

                                                                                

In [6]:
# Display the current contents of the students table.
students_snapshot = spark.sql("SELECT * FROM students")
students_snapshot.show()

+---+----+---+
| id|name|age|
+---+----+---+
|  0| Bob| 23|
|  1| Sue| 25|
|  2| Jim| 27|
+---+----+---+



In [7]:
# Query the change data feed with a starting version of 0 and print the rows
# without truncating column values.
changes_since_v0 = spark.sql("SELECT * FROM table_changes('students', 0)")
changes_since_v0.show(truncate=False)

+---+----+---+------------+---------------+----------------------+
|id |name|age|_change_type|_commit_version|_commit_timestamp     |
+---+----+---+------------+---------------+----------------------+
|0  |Bob |23 |insert      |1              |2023-04-19 12:05:57.84|
|1  |Sue |25 |insert      |1              |2023-04-19 12:05:57.84|
|2  |Jim |27 |insert      |1              |2023-04-19 12:05:57.84|
+---+----+---+------------+---------------+----------------------+



### Append more data

In [8]:
# Second batch of students; `df` is rebound to this new batch.
more_student_rows = [(5, "Jack", 18), (6, "Nora", 19), (7, "Clare", 20)]
df = spark.createDataFrame(more_student_rows).toDF("id", "name", "age")

In [9]:
# Append the second batch to `students`, again written as one partition.
single_partition_writer = df.repartition(1).write
single_partition_writer.format("delta").mode("append").saveAsTable("students")

In [10]:
# Display the table again now that both batches have been appended.
students_after_appends = spark.sql("SELECT * FROM students")
students_after_appends.show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  5| Jack| 18|
|  6| Nora| 19|
|  7|Clare| 20|
|  0|  Bob| 23|
|  1|  Sue| 25|
|  2|  Jim| 27|
+---+-----+---+



In [11]:
# Re-read the change data feed from starting version 0; the second append
# appears as its own commit version.
changes_after_appends = spark.sql("SELECT * FROM table_changes('students', 0)")
changes_after_appends.show(truncate=False)

+---+-----+---+------------+---------------+----------------------+
|id |name |age|_change_type|_commit_version|_commit_timestamp     |
+---+-----+---+------------+---------------+----------------------+
|5  |Jack |18 |insert      |2              |2023-04-19 12:06:06.24|
|6  |Nora |19 |insert      |2              |2023-04-19 12:06:06.24|
|7  |Clare|20 |insert      |2              |2023-04-19 12:06:06.24|
|0  |Bob  |23 |insert      |1              |2023-04-19 12:05:57.84|
|1  |Sue  |25 |insert      |1              |2023-04-19 12:05:57.84|
|2  |Jim  |27 |insert      |1              |2023-04-19 12:05:57.84|
+---+-----+---+------------+---------------+----------------------+



### Delete rows of data

In [12]:
# Look up the `students` table by name as a DeltaTable; the delete in the
# next cell is issued through this handle.
delta_table = DeltaTable.forName(spark, "students")

In [13]:
# Delete every row whose age is greater than 20.
older_than_20 = F.col("age") > 20
delta_table.delete(older_than_20)

In [14]:
# Display the rows that remain after the delete.
students_after_delete = spark.sql("SELECT * FROM students")
students_after_delete.show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  5| Jack| 18|
|  6| Nora| 19|
|  7|Clare| 20|
+---+-----+---+



In [15]:
# The change data feed now also reports the deleted rows, tagged with a
# `delete` change type.
changes_after_delete = spark.sql("SELECT * FROM table_changes('students', 0)")
changes_after_delete.show(truncate=False)

+---+-----+---+------------+---------------+----------------------+
|id |name |age|_change_type|_commit_version|_commit_timestamp     |
+---+-----+---+------------+---------------+----------------------+
|0  |Bob  |23 |delete      |3              |2023-04-19 12:06:11.5 |
|1  |Sue  |25 |delete      |3              |2023-04-19 12:06:11.5 |
|2  |Jim  |27 |delete      |3              |2023-04-19 12:06:11.5 |
|5  |Jack |18 |insert      |2              |2023-04-19 12:06:06.24|
|6  |Nora |19 |insert      |2              |2023-04-19 12:06:06.24|
|7  |Clare|20 |insert      |2              |2023-04-19 12:06:06.24|
|0  |Bob  |23 |insert      |1              |2023-04-19 12:05:57.84|
|1  |Sue  |25 |insert      |1              |2023-04-19 12:05:57.84|
|2  |Jim  |27 |insert      |1              |2023-04-19 12:05:57.84|
+---+-----+---+------------+---------------+----------------------+



## Query change data feed

In [16]:
# Query the full change data feed by passing only a starting version (0).
all_changes = spark.sql("SELECT * FROM table_changes('students', 0)")
all_changes.show(truncate=False)

+---+-----+---+------------+---------------+----------------------+
|id |name |age|_change_type|_commit_version|_commit_timestamp     |
+---+-----+---+------------+---------------+----------------------+
|0  |Bob  |23 |delete      |3              |2023-04-19 12:06:11.5 |
|1  |Sue  |25 |delete      |3              |2023-04-19 12:06:11.5 |
|2  |Jim  |27 |delete      |3              |2023-04-19 12:06:11.5 |
|5  |Jack |18 |insert      |2              |2023-04-19 12:06:06.24|
|6  |Nora |19 |insert      |2              |2023-04-19 12:06:06.24|
|7  |Clare|20 |insert      |2              |2023-04-19 12:06:06.24|
|0  |Bob  |23 |insert      |1              |2023-04-19 12:05:57.84|
|1  |Sue  |25 |insert      |1              |2023-04-19 12:05:57.84|
|2  |Jim  |27 |insert      |1              |2023-04-19 12:05:57.84|
+---+-----+---+------------+---------------+----------------------+



In [18]:
# Restrict the change data feed to commit versions 2 through 3 by passing
# both a starting and an ending version.
changes_v2_to_v3 = spark.sql("SELECT * FROM table_changes('students', 2, 3)")
changes_v2_to_v3.show(truncate=False)

+---+-----+---+------------+---------------+----------------------+
|id |name |age|_change_type|_commit_version|_commit_timestamp     |
+---+-----+---+------------+---------------+----------------------+
|0  |Bob  |23 |delete      |3              |2023-04-19 12:06:11.5 |
|1  |Sue  |25 |delete      |3              |2023-04-19 12:06:11.5 |
|2  |Jim  |27 |delete      |3              |2023-04-19 12:06:11.5 |
|5  |Jack |18 |insert      |2              |2023-04-19 12:06:06.24|
|6  |Nora |19 |insert      |2              |2023-04-19 12:06:06.24|
|7  |Clare|20 |insert      |2              |2023-04-19 12:06:06.24|
+---+-----+---+------------+---------------+----------------------+



## Cleanup

In [19]:
# Drop the demo table; IF EXISTS keeps this cell from failing when the table
# is already gone.
drop_students_sql = "DROP TABLE IF EXISTS students"
spark.sql(drop_students_sql)

DataFrame[]