In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from delta.tables import DeltaTable
import os

* recreate the Delta table in the metastore from the data already stored at the table's location
* enable the Change Data Feed (CDF) feature on a Delta table
* delete a record and query the CDF
* append the record back using the CDF

In [None]:
# Settings that make Delta Lake usable from this session: the Delta package,
# its SQL extension and the Delta-aware session catalog.
delta_settings = {
    "spark.jars.packages": "io.delta:delta-spark_2.12:3.2.1",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

session_builder = SparkSession.builder.appName('delta-II')
for setting_name, setting_value in delta_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Reuse an already running session if there is one, otherwise start a new one.
spark = session_builder.getOrCreate()

In [None]:
# The notebook is expected to run from a directory three levels below the
# project root; every data/output location is resolved relative to that root.
base_path = os.getcwd()

# Walk up three directory levels with os.path instead of splitting on '/',
# so the result is also correct on platforms whose separator is not '/'.
project_path = os.path.normpath(
    os.path.join(base_path, os.pardir, os.pardir, os.pardir)
)

# Input datasets.
users_base_path = os.path.join(project_path, 'data', 'users_base')
users_increment_path = os.path.join(project_path, 'data', 'users_increment')

# Storage location of the `accounts` Delta table.
accounts_output_path = os.path.join(project_path, 'output', 'delta', 'accounts')

In [None]:
# Drop any `accounts` entry left in the metastore by a previous run so the
# table can be registered again below.
spark.sql('drop table if exists accounts')

In [None]:
# List the tables currently registered in the catalog.
spark.sql('show tables').show()

In [None]:
# Load the base users dataset (parquet) ...
users_base_df = spark.read.parquet(users_base_path)

# ... and store it as a Delta table at `accounts_output_path`, replacing the
# current contents and registering the table as `accounts`.
accounts_writer = (
    users_base_df.write
    .format('delta')
    .mode('overwrite')
    .option('path', accounts_output_path)
)
accounts_writer.saveAsTable('accounts')

In [None]:
spark.sql('ALTER TABLE accounts SET TBLPROPERTIES (delta.enableChangeDataFeed = true)')

In [None]:
spark.sql('describe history accounts').select('version', 'timestamp', 'operation').show()

In [None]:
(
    spark.read
    .format('delta')
    .option('readChangeFeed', 'true')
    .option('startingVersion', 8)
    .table('accounts')
).show()

In [None]:
DeltaTable.forName(spark, 'accounts').delete(col('user_id') == 79)

In [None]:
(
    spark.read
    .format('delta')
    .option('readChangeFeed', 'true')
    .option('startingVersion', 8)
    .table('accounts')
).show()

In [None]:
(
    spark.read
    .format('delta')
    .option('readChangeFeed', 'true')
    .option('startingVersion', 8)
    .table('accounts')
    .drop('_change_type', '_commit_version', '_commit_timestamp')
    .write
    .mode('append')
    .format('delta')
    .option('path', accounts_output_path)
    .saveAsTable('accounts')
)

In [None]:
spark.sql('describe history accounts').select('version', 'timestamp', 'operation').show()

In [None]:
(
    spark.read
    .format('delta')
    .option('readChangeFeed', 'true')
    .option('startingVersion', 8)
    .table('accounts')
).show()

In [None]:
spark.sql('drop table if exists accounts')

In [None]:
DeltaTable.forPath(spark, accounts_output_path).toDF().show()

In [None]:
DeltaTable.forPath(spark, accounts_output_path).history().select('version', 'timestamp', 'operation').show(truncate=30)

In [None]:
spark.sql(f"""
    CREATE TABLE accounts
    USING DELTA
    LOCATION '{accounts_output_path}'
""")

In [None]:
spark.sql('show tables').show()

In [None]:
(
    spark.read
    .format('delta')
    .option('readChangeFeed', 'true')
    .option('startingVersion', 8)
    .table('accounts')
).show()