In [0]:
from pyspark.sql import SparkSession
from delta.tables import *

# Build (or reuse) a local Spark session with the Delta Lake SQL extension
# and the Delta catalog plugged in as the default `spark_catalog`.
spark = (
    SparkSession.builder
    .appName("DeltaTableDemo")
    .master("local")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Filesystem location that backs the demo Delta table; later cells reuse it.
delta_table_path = "/tmp/delta_table_demo"

# Three sample rows; `columns` supplies the column names and is reused by the
# merge cell further down.
columns = ["id", "name", "age"]
data = [(1, "Alice", 30), (2, "Bob", 28), (3, "Cathy", 25)]
df = spark.createDataFrame(data, schema=columns)

# Write the rows in Delta format, replacing whatever is already at the path.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save(delta_table_path)
)

# Load the table back from storage and display what was just written.
print("Initial Data:")
df = (
    spark.read
    .format("delta")
    .load(delta_table_path)
)
df.show()

Initial Data:
+---+-----+---+
| id| name|age|
+---+-----+---+
|  3|Cathy| 25|
|  1|Alice| 30|
|  2|  Bob| 28|
+---+-----+---+



In [0]:
# Update transaction: open the table at `delta_table_path` and set age to 29
# on every row whose name is Bob, then re-read the table to show the result.
print("\nUpdate Transaction - Updating Bob's age to 29")
delta_table = DeltaTable.forPath(spark, delta_table_path)
delta_table.update(
    condition="name = 'Bob'",
    set={"age": "29"},
)
df = (
    spark.read
    .format("delta")
    .load(delta_table_path)
)
df.show()


Update Transaction - Updating Bob's age to 29
+---+-----+---+
| id| name|age|
+---+-----+---+
|  3|Cathy| 25|
|  1|Alice| 30|
|  2|  Bob| 29|
+---+-----+---+



In [0]:
# Delete transaction: remove every row whose name is Cathy, using the
# `delta_table` handle created in the update cell, then show the table again.
print("\nDelete Transaction - Deleting Cathy")
delta_table.delete(condition="name = 'Cathy'")
df = (
    spark.read
    .format("delta")
    .load(delta_table_path)
)
df.show()


Delete Transaction - Deleting Cathy
+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 30|
|  2|  Bob| 29|
+---+-----+---+



In [0]:
# Merge Operation (Upsert Transaction)
# Rows in `new_df` whose id already exists in the table overwrite the stored
# row; rows with an unseen id are inserted.
print("\nMerge Operation - Upserting new and modified data")
new_data = [(2, "Bob", 30), (4, "David", 35)]
new_df = spark.createDataFrame(new_data, columns)
# The temp view is not needed by the DataFrame-API merge below; it is kept so
# the batch stays queryable from SQL as `updates`.
new_df.createOrReplaceTempView("updates")
# `merge()` only returns a builder. Nothing is written until the match
# clauses are declared and `.execute()` is called, so both are required for
# the upsert to actually happen.
(
    delta_table.alias("t")
    .merge(new_df.alias("u"), "t.id = u.id")
    .whenMatchedUpdateAll()      # id already present -> replace all columns
    .whenNotMatchedInsertAll()   # id not present -> insert the new row
    .execute()
)
df = spark.read.format("delta").load(delta_table_path)
df.show()



Merge Operation - Upserting new and modified data
+---+-----+---+
| id| name|age|
+---+-----+---+
|  4|David| 35|
|  2|  Bob| 30|
|  1|Alice| 30|
+---+-----+---+



In [0]:
# Time travel: load the table as it was at version 0 instead of the latest
# snapshot, and display it.
print("\nTime Travel - Reading previous version")
old_df = (
    spark.read
    .format("delta")
    .option("versionAsOf", 0)
    .load(delta_table_path)
)
old_df.show()


Time Travel - Reading previous version
+---+-----+---+
| id| name|age|
+---+-----+---+
|  3|Cathy| 25|
|  1|Alice| 30|
|  2|  Bob| 28|
+---+-----+---+



In [0]:
# Time travel again, this time pinning the read to version 2 of the table.
# NOTE(review): per the recorded output this is the state after the delete;
# that mapping assumes the table path was empty before the first write.
print("\nTime Travel - Reading previous version")
old_df = (
    spark.read
    .format("delta")
    .option("versionAsOf", 2)
    .load(delta_table_path)
)
old_df.show()


Time Travel - Reading previous version
+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 30|
|  2|  Bob| 29|
+---+-----+---+



In [0]:
# Save the current contents of `df` (assigned by the most recently executed
# cell that re-read the table) as the catalog table `delta_table_demo`,
# in Delta format, overwriting any existing table of that name.
df.write.saveAsTable("delta_table_demo", format="delta", mode="overwrite")

In [0]:
# Run OPTIMIZE on the catalog table with Z-ordering on `age`. The call is the
# cell's last expression, so the returned DataFrame's repr is the cell output.
optimize_stmt = """
    OPTIMIZE delta_table_demo
    ZORDER BY (age)
"""
spark.sql(optimize_stmt)

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,totalClusterParallelism:bigint,totalScheduledTasks:bigint,autoCompactParallelismStats:struct<maxClusterActiveParallelism:bigint,minClusterActiveParallelism:bigint,maxSessionActiveParallelism:bigint,minSessionActiveParallelism:bigint>,de

In [0]:
# Print the table-level metadata row returned by DESCRIBE DETAIL
# (format, location, numFiles, sizeInBytes, ... as seen in the output).
table_detail = spark.sql("DESCRIBE DETAIL delta_table_demo")
table_detail.show()

+------+--------------------+--------------------+-----------+--------------------+--------------------+-------------------+----------------+-----------------+--------+-----------+----------+----------------+----------------+--------------------+----------+
|format|                  id|                name|description|            location|           createdAt|       lastModified|partitionColumns|clusteringColumns|numFiles|sizeInBytes|properties|minReaderVersion|minWriterVersion|       tableFeatures|statistics|
+------+--------------------+--------------------+-----------+--------------------+--------------------+-------------------+----------------+-----------------+--------+-----------+----------+----------------+----------------+--------------------+----------+
| delta|35fd1672-8399-4c0...|spark_catalog.def...|       NULL|dbfs:/user/hive/w...|2025-03-18 12:59:...|2025-03-18 13:04:31|              []|               []|       1|       1201|        {}|               1|               2|[