In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from delta.tables import DeltaTable
import os

* recreate the Delta table in the metastore from the data already stored at its location
* enable the Change Data Feed (CDF) feature on a Delta table
* delete a record and query the CDF
* append the record back using the CDF

In [5]:
# Session settings needed for Delta Lake: the delta-spark package itself,
# the Delta SQL extension, and the Delta-aware session catalog.
delta_session_conf = {
    "spark.jars.packages": "io.delta:delta-spark_2.12:3.2.1",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

session_builder = SparkSession.builder.appName('delta-II')
for conf_key, conf_value in delta_session_conf.items():
    # Apply the settings in the order they are declared above.
    session_builder = session_builder.config(conf_key, conf_value)

spark = session_builder.getOrCreate()

In [8]:
# Directory the notebook kernel is running in.
base_path = os.getcwd()

# The project root is three directory levels above the working directory.
# os.path is used instead of splitting on '/' so that the platform's own
# separator is honoured and the result stays absolute even when the working
# directory is shallow (the manual split/join collapsed to '' in that case,
# which silently turned every path below into a relative one).
project_path = os.path.normpath(
    os.path.join(base_path, os.pardir, os.pardir, os.pardir)
)

# Input datasets and the Delta table location, all relative to the project root.
users_base_path = os.path.join(project_path, 'data', 'users_base')
users_increment_path = os.path.join(project_path, 'data', 'users_increment')
accounts_output_path = os.path.join(project_path, 'output', 'delta', 'accounts')

In [40]:
# Remove any existing `accounts` entry from the metastore before the table is
# (re)created below; IF EXISTS makes this a no-op when the table is absent.
drop_accounts_sql = 'drop table if exists accounts'
spark.sql(drop_accounts_sql)

DataFrame[]

In [6]:
# List the tables currently registered in the metastore.
registered_tables = spark.sql('show tables')
registered_tables.show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [10]:
# Load the base users dataset (parquet) ...
users_base_df = spark.read.parquet(users_base_path)

# ... and write it out as the Delta table `accounts`, stored at
# accounts_output_path and registered in the metastore under that name.
accounts_writer = (
    users_base_df.write
    .mode('overwrite')
    .format('delta')
    .option('path', accounts_output_path)
)
accounts_writer.saveAsTable('accounts')

24/10/10 13:16:58 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [11]:
# Turn on the Change Data Feed for `accounts` via a table property.
enable_cdf_sql = 'ALTER TABLE accounts SET TBLPROPERTIES (delta.enableChangeDataFeed = true)'
spark.sql(enable_cdf_sql)

DataFrame[]

In [17]:
# Inspect the table's commit history, keeping only the columns of interest.
accounts_history = spark.sql('describe history accounts')
accounts_history.select('version', 'timestamp', 'operation').show()

+-------+--------------------+--------------------+
|version|           timestamp|           operation|
+-------+--------------------+--------------------+
|      8|2024-10-10 13:26:...|   SET TBLPROPERTIES|
|      7|2024-10-10 13:17:...|CREATE OR REPLACE...|
|      6|2024-10-06 14:49:...|          VACUUM END|
|      5|2024-10-06 14:49:...|        VACUUM START|
|      4|2024-10-06 14:48:...|             RESTORE|
|      3|2024-10-06 14:47:...|              DELETE|
|      2|2024-10-06 14:46:...|            OPTIMIZE|
|      1|2024-10-06 14:45:...|               MERGE|
|      0|2024-10-06 14:41:...|CREATE TABLE AS S...|
+-------+--------------------+--------------------+



In [19]:
# Read the change feed starting at version 8, the SET TBLPROPERTIES commit
# that enabled CDF (see the history output above).
cdf_reader = (
    spark.read
    .format('delta')
    .option('readChangeFeed', 'true')
    .option('startingVersion', 8)
)
changes_since_cdf_enabled = cdf_reader.table('accounts')
changes_since_cdf_enabled.show()

+-------+------------+-----+--------+---------+-------+----------+-----+------------+---------------+-----------------+
|user_id|display_name|about|location|downvotes|upvotes|reputation|views|_change_type|_commit_version|_commit_timestamp|
+-------+------------+-----+--------+---------+-------+----------+-----+------------+---------------+-----------------+
+-------+------------+-----+--------+---------+-------+----------+-----+------------+---------------+-----------------+



In [20]:
# Delete the record with user_id 79 from the `accounts` table; this is the
# change that the CDF queries below pick up.
accounts_table = DeltaTable.forName(spark, 'accounts')
accounts_table.delete(col('user_id') == 79)

                                                                                

In [21]:
# Query the change feed again from version 8 to see the row-level change
# recorded for the delete.
cdf_reader = (
    spark.read
    .format('delta')
    .option('readChangeFeed', 'true')
    .option('startingVersion', 8)
)
changes_after_delete = cdf_reader.table('accounts')
changes_after_delete.show()

+-------+------------+--------------------+--------+---------+-------+----------+-----+------------+---------------+--------------------+
|user_id|display_name|               about|location|downvotes|upvotes|reputation|views|_change_type|_commit_version|   _commit_timestamp|
+-------+------------+--------------------+--------+---------+-------+----------+-----+------------+---------------+--------------------+
|     79|  Adam Ernst|<p>I'm an iOS dev...|    NULL|       38|    411|     41056| 1439|      delete|              9|2024-10-10 15:07:...|
+-------+------------+--------------------+--------+---------+-------+----------+-----+------------+---------------+--------------------+



In [23]:
# Append deleted record(s) back into `accounts` using the change feed.
#
# The feed (from version 8 onwards) is restricted to rows whose change type is
# 'delete': those carry the full content of the removed records. Without this
# filter every change row would be appended, so as soon as the feed also holds
# 'insert' rows (e.g. after this cell has run once) a re-run would append
# those as well.
#
# NOTE: this is still an append, not an upsert - running the cell again appends
# the same deleted row(s) a second time.
(
    spark.read
    .format('delta')
    .option('readChangeFeed', 'true')
    .option('startingVersion', 8)
    .table('accounts')
    # Keep only the rows that were removed from the table.
    .filter("_change_type = 'delete'")
    # Strip the CDF metadata columns so the schema matches the target table.
    .drop('_change_type', '_commit_version', '_commit_timestamp')
    .write
    .mode('append')
    .format('delta')
    .option('path', accounts_output_path)
    .saveAsTable('accounts')
)

                                                                                

In [24]:
# Show the commit history again to see the commits added by the delete and
# the append.
accounts_history = spark.sql('describe history accounts')
accounts_history.select('version', 'timestamp', 'operation').show()

+-------+--------------------+--------------------+
|version|           timestamp|           operation|
+-------+--------------------+--------------------+
|     10|2024-10-10 15:13:...|               WRITE|
|      9|2024-10-10 15:07:...|              DELETE|
|      8|2024-10-10 13:26:...|   SET TBLPROPERTIES|
|      7|2024-10-10 13:17:...|CREATE OR REPLACE...|
|      6|2024-10-06 14:49:...|          VACUUM END|
|      5|2024-10-06 14:49:...|        VACUUM START|
|      4|2024-10-06 14:48:...|             RESTORE|
|      3|2024-10-06 14:47:...|              DELETE|
|      2|2024-10-06 14:46:...|            OPTIMIZE|
|      1|2024-10-06 14:45:...|               MERGE|
|      0|2024-10-06 14:41:...|CREATE TABLE AS S...|
+-------+--------------------+--------------------+



In [25]:
(
    spark.read
    .format('delta')
    .option('readChangeFeed', 'true')
    .option('startingVersion', 8)
    .table('accounts')
).show()

+-------+------------+--------------------+--------+---------+-------+----------+-----+------------+---------------+--------------------+
|user_id|display_name|               about|location|downvotes|upvotes|reputation|views|_change_type|_commit_version|   _commit_timestamp|
+-------+------------+--------------------+--------+---------+-------+----------+-----+------------+---------------+--------------------+
|     79|  Adam Ernst|<p>I'm an iOS dev...|    NULL|       38|    411|     41056| 1439|      delete|              9|2024-10-10 15:07:...|
|     79|  Adam Ernst|<p>I'm an iOS dev...|    NULL|       38|    411|     41056| 1439|      insert|             10|2024-10-10 15:13:...|
+-------+------------+--------------------+--------+---------+-------+----------+-----+------------+---------------+--------------------+



In [26]:
# Drop the `accounts` entry from the metastore. The following cells read the
# Delta data directly from accounts_output_path, then register it again.
drop_accounts_sql = 'drop table if exists accounts'
spark.sql(drop_accounts_sql)

DataFrame[]

In [29]:
# Open the Delta table by its storage path rather than by its metastore name
# and display its current contents.
accounts_at_path = DeltaTable.forPath(spark, accounts_output_path)
accounts_at_path.toDF().show()

                                                                                

+--------+-----------------+--------------------+--------+---------+-------+----------+-----+
| user_id|     display_name|               about|location|downvotes|upvotes|reputation|views|
+--------+-----------------+--------------------+--------+---------+-------+----------+-----+
| 2005342|George Gorczynski|                    |    NULL|        0|      0|        31|    5|
| 5239596|         wp-kings|                NULL|    NULL|        0|      0|         1|    0|
| 7729673|              Zro|<p>I write code t...|    NULL|        0|      0|        11|    1|
|10291510|           Sergei|                NULL|    NULL|        0|      0|         1|    0|
| 8481958|             Zico|                NULL|    NULL|        0|      0|         1|    0|
|12472375|Raj Kumar Meshram|                NULL|    NULL|        0|      0|         1|    0|
| 2826358|  Justin Thompson|                NULL|    NULL|        0|      0|         1|    3|
| 8047640|           jora91|                NULL|    NULL|  

                                                                                

In [31]:
# Fetch the commit history through the path-based handle; truncate=30 widens
# the columns enough to show full timestamps and operation names.
accounts_at_path = DeltaTable.forPath(spark, accounts_output_path)
history_by_path = accounts_at_path.history()
history_by_path.select('version', 'timestamp', 'operation').show(truncate=30)

+-------+-----------------------+------------------------------+
|version|              timestamp|                     operation|
+-------+-----------------------+------------------------------+
|     10|2024-10-10 15:13:50.444|                         WRITE|
|      9|2024-10-10 15:07:01.347|                        DELETE|
|      8|2024-10-10 13:26:55.637|             SET TBLPROPERTIES|
|      7|2024-10-10 13:17:05.864|CREATE OR REPLACE TABLE AS ...|
|      6|2024-10-06 14:49:37.913|                    VACUUM END|
|      5|2024-10-06 14:49:36.037|                  VACUUM START|
|      4|2024-10-06 14:48:25.086|                       RESTORE|
|      3|2024-10-06 14:47:20.673|                        DELETE|
|      2|2024-10-06 14:46:46.875|                      OPTIMIZE|
|      1|2024-10-06 14:45:40.178|                         MERGE|
|      0|2024-10-06 14:41:24.372|        CREATE TABLE AS SELECT|
+-------+-----------------------+------------------------------+



In [37]:
# Register `accounts` in the metastore again, pointing it at the existing
# Delta data under accounts_output_path. No schema or data is supplied here.
create_accounts_sql = f"""
    CREATE TABLE accounts
    USING DELTA
    LOCATION '{accounts_output_path}'
"""
spark.sql(create_accounts_sql)

DataFrame[]

In [38]:
# Confirm that `accounts` is listed in the metastore again.
registered_tables = spark.sql('show tables')
registered_tables.show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|  default| accounts|      false|
+---------+---------+-----------+



In [39]:
# Query the change feed through the re-registered table name, again starting
# at version 8, to check that the earlier changes are still readable.
cdf_reader = (
    spark.read
    .format('delta')
    .option('readChangeFeed', 'true')
    .option('startingVersion', 8)
)
changes_after_recreate = cdf_reader.table('accounts')
changes_after_recreate.show()

+-------+------------+--------------------+--------+---------+-------+----------+-----+------------+---------------+--------------------+
|user_id|display_name|               about|location|downvotes|upvotes|reputation|views|_change_type|_commit_version|   _commit_timestamp|
+-------+------------+--------------------+--------+---------+-------+----------+-----+------------+---------------+--------------------+
|     79|  Adam Ernst|<p>I'm an iOS dev...|    NULL|       38|    411|     41056| 1439|      delete|              9|2024-10-10 15:07:...|
|     79|  Adam Ernst|<p>I'm an iOS dev...|    NULL|       38|    411|     41056| 1439|      insert|             10|2024-10-10 15:13:...|
+-------+------------+--------------------+--------+---------+-------+----------+-----+------------+---------------+--------------------+

