In [9]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip
from delta.tables import DeltaTable
import os 

In [3]:
# Extra Maven coordinates needed on top of Delta itself.
# They have to go through `extra_packages`: repeated
# .config("spark.jars.packages", ...) calls overwrite one another, and
# configure_spark_with_delta_pip() sets that same key again with the Delta
# artifact, so packages configured directly on the builder are dropped.
extra_packages = [
    "uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.5",
    "org.apache.spark:spark-avro_2.12:3.5.0",
]

# Parenthesised chain instead of backslash continuations, so a stray blank
# line or trailing space cannot silently break the expression.
builder = (
    SparkSession.builder
    .appName("Data with Nikk the Greek Spark Session")
    .master("local[4]")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

spark = configure_spark_with_delta_pip(builder, extra_packages=extra_packages).getOrCreate()




In [3]:
# Handle to the SparkContext behind the session.
# NOTE(review): `sc` is not referenced by any cell visible here — confirm it is still needed.
sc = spark.sparkContext

Spark DataFrameWriter: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameWriter.mode.html#pyspark.sql.DataFrameWriter.mode

Spark to Delta mapping: https://docs.delta.io/latest/releases.html
Delta table protocol (reader and writer) versions: https://docs.delta.io/latest/versioning.html

Get Started: https://delta.io/learn/getting-started/

In [73]:
# Location of the demo Delta table. This is a hard-coded local Windows path;
# adjust it for your machine before running.
path_delta = "D:/Data/delta_test"
# Alternative location (the one assigned in the "Checkpoint V2" section).
# Uncomment to point the inspection cells below at that table instead.
#path_delta = "D:/Data/delta_test_v2"

# Write table data

In [60]:
# Initial write: a single-partition DataFrame with ids 1..24, saved as a Delta
# table at path_delta (replacing whatever table data is already there).
sdf = spark.range(1, 25, numPartitions=1)
(
    sdf.write
    .format("delta")
    .mode("overwrite")
    .save(path_delta)
)


In [61]:
# Overwrite the table repeatedly so that the transaction log accumulates
# one commit per iteration.
# NOTE(review): presumably 20 was chosen to cross the checkpoint interval — confirm.
N_OVERWRITES = 20

# The DataFrame definition (ids 1..19, one partition) does not change between
# iterations and is lazily evaluated, so it is built once outside the loop.
sdf = spark.range(start=1, end=20, numPartitions=1)
for _commit in range(N_OVERWRITES):
    sdf.write.format("delta").mode("overwrite").save(path_delta)

# Get log files

In [75]:
# Collect every file in the table's _delta_log directory whose last extension
# is "json". os.listdir() returns entries in arbitrary order, and a later cell
# indexes this list positionally, so the names are sorted first; the
# zero-padded version prefix makes lexical order match commit order.
files_json = [
    f"{path_delta}/_delta_log/{f}"
    for f in sorted(os.listdir(f"{path_delta}/_delta_log"))
    if f.split(".")[-1] == "json"
]
files_json

['D:/Data/delta_test_v2/_delta_log/00000000000000000000.json',
 'D:/Data/delta_test_v2/_delta_log/00000000000000000001.json',
 'D:/Data/delta_test_v2/_delta_log/00000000000000000002.json',
 'D:/Data/delta_test_v2/_delta_log/00000000000000000003.json',
 'D:/Data/delta_test_v2/_delta_log/00000000000000000004.json',
 'D:/Data/delta_test_v2/_delta_log/00000000000000000005.json',
 'D:/Data/delta_test_v2/_delta_log/00000000000000000006.json',
 'D:/Data/delta_test_v2/_delta_log/00000000000000000007.json',
 'D:/Data/delta_test_v2/_delta_log/00000000000000000008.json',
 'D:/Data/delta_test_v2/_delta_log/00000000000000000009.json',
 'D:/Data/delta_test_v2/_delta_log/00000000000000000010.checkpoint.10885be8-9a9a-4216-ac61-0de993a23b39.json',
 'D:/Data/delta_test_v2/_delta_log/00000000000000000010.json',
 'D:/Data/delta_test_v2/_delta_log/00000000000000000011.json',
 'D:/Data/delta_test_v2/_delta_log/00000000000000000012.json',
 'D:/Data/delta_test_v2/_delta_log/00000000000000000013.json',
 'D:/Da

In [78]:
# Load a single log file as JSON and print it untruncated.
# files_json[-2] is the second-to-last entry of the listing above; which log
# file that is depends on the order of that list.
sdf_json = spark.read.json(files_json[-2])
sdf_json.show(n=20, truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------+
|add                                                                                                                                                                               |commitInfo                                                                                                                                               |remove                                                                                               |
+-----------------------------------------------------------------------------------------------------------------------------

In [76]:
# Collect every file in the table's _delta_log directory whose last extension
# is "parquet". os.listdir() returns entries in arbitrary order, and a later
# cell takes element [0], so the names are sorted to make that choice
# deterministic.
files_parquet = [
    f"{path_delta}/_delta_log/{f}"
    for f in sorted(os.listdir(f"{path_delta}/_delta_log"))
    if f.split(".")[-1] == "parquet"
]
files_parquet

[]

In [69]:
# Show the first parquet file found in _delta_log, if there is one.
# The listing can legitimately be empty (no parquet file directly inside
# _delta_log), in which case indexing [0] would raise IndexError; report that
# instead of failing the cell.
if files_parquet:
    sdf_parquet = spark.read.format("parquet").load(files_parquet[0])
    sdf_parquet.show(20, False)
else:
    print(f"No parquet files found in {path_delta}/_delta_log")

+----+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+--------------+
|txn |add                                                                                                                                                                                                                  |remove                                                                                                                      |metaData                                                                                          

# Delta details

In [56]:
# Open the Delta table stored at path_delta and print its detail() DataFrame
# without truncating column values.
deltaTable = DeltaTable.forPath(spark, path_delta)
detail_sdf = deltaTable.detail()
detail_sdf.show(n=20, truncate=False)


+------+------------------------------------+----+-----------+------------------------+-----------------------+-----------------------+----------------+--------+-----------+------------------------------+----------------+----------------+--------------------------------------+
|format|id                                  |name|description|location                |createdAt              |lastModified           |partitionColumns|numFiles|sizeInBytes|properties                    |minReaderVersion|minWriterVersion|tableFeatures                         |
+------+------------------------------------+----+-----------+------------------------+-----------------------+-----------------------+----------------+--------+-----------+------------------------------+----------------+----------------+--------------------------------------+
|delta |19e69754-ffba-48ca-969f-d72c2df1efb4|NULL|NULL       |file:/D:/Data/delta_test|2024-04-16 22:04:43.767|2024-04-16 22:29:55.742|[]              |1       |609  

# Delta history

In [55]:
# Re-open the table (keeps this cell runnable on its own) and print up to 20
# rows of its history() DataFrame without truncating column values.
deltaTable = DeltaTable.forPath(spark, path_delta)
history_sdf = deltaTable.history()
history_sdf.show(n=20, truncate=False)

+-------+-----------------------+------+--------+---------+--------------------------------------+----+--------+---------+-----------+--------------+-------------+-----------------------------------------------------------+------------+-----------------------------------+
|version|timestamp              |userId|userName|operation|operationParameters                   |job |notebook|clusterId|readVersion|isolationLevel|isBlindAppend|operationMetrics                                           |userMetadata|engineInfo                         |
+-------+-----------------------+------+--------+---------+--------------------------------------+----+--------+---------+-----------+--------------+-------------+-----------------------------------------------------------+------------+-----------------------------------+
|31     |2024-04-16 22:29:55.742|NULL  |NULL    |WRITE    |{mode -> Overwrite, partitionBy -> []}|NULL|NULL    |NULL     |30         |Serializable  |false        |{numFiles -> 1, nu

# Checkpoint V2
- Using sidecars

In [70]:
# Separate table location for the V2-checkpoint experiment (hard-coded local path).
# This rebinds path_delta, so any cell executed after this one that reads
# path_delta targets this table.
path_delta = "D:/Data/delta_test_v2"

In [71]:
# Initial write for the V2 experiment: a single-partition DataFrame with ids
# 1..24, saved as a Delta table at path_delta (replacing existing table data).
sdf = spark.range(1, 25, numPartitions=1)
(
    sdf.write
    .format("delta")
    .mode("overwrite")
    .save(path_delta)
)

In [72]:
# Set the table property delta.checkpointPolicy to 'v2' on the path-based
# table. The statement is built first so it can be inspected or printed.
set_checkpoint_policy_sql = f"ALTER TABLE delta.`{path_delta}` SET TBLPROPERTIES ('delta.checkpointPolicy'='v2')"
spark.sql(set_checkpoint_policy_sql)

DataFrame[]

In [74]:
# Overwrite the V2 table repeatedly so that the transaction log accumulates
# one commit per iteration.
# NOTE(review): presumably 20 was chosen to cross the checkpoint interval — confirm.
N_OVERWRITES = 20

# The DataFrame definition (ids 1..19, one partition) does not change between
# iterations and is lazily evaluated, so it is built once outside the loop.
sdf = spark.range(start=1, end=20, numPartitions=1)
for _commit in range(N_OVERWRITES):
    sdf.write.format("delta").mode("overwrite").save(path_delta)