In [0]:
from pyspark.sql import SparkSession
from delta.tables import DeltaTable

In [0]:
# Obtain the notebook's Spark session (getOrCreate reuses one if it already
# exists), registered under the app name "ZOrderingExample".
spark = (
    SparkSession.builder
    .appName("ZOrderingExample")
    .getOrCreate()
)

In [0]:
# Sample rows for the demo table. The values are listed column-wise and zipped
# into one tuple per row; dates stay as plain "YYYY-MM-DD" strings.
data = list(
    zip(
        (1, 2, 3, 4, 5),
        ("Virat", "Rohit", "Shreyas", "Sundar", "Axar"),
        (25, 30, 35, 28, 22),
        ("2023-01-01", "2023-01-02", "2023-01-03", "2023-01-04", "2023-01-05"),
    )
)

In [0]:
# Column labels, given in the same positional order as the fields of each row
# tuple in `data`.
columns = [
    "id",
    "name",
    "age",
    "date",
]
# Only names are supplied, so Spark infers the column types from the values.
df = spark.createDataFrame(data, schema=columns)

In [0]:
# Location of the demo Delta table; later cells load it from this same path.
delta_path = "/mnt/delta/zorder_example"

# "overwrite" replaces whatever is already stored at the path, which keeps
# this cell re-runnable.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save(delta_path)
)

In [0]:
# Handle to the Delta table written above; used below for optimize() and
# history().
delta_table = DeltaTable.forPath(
    spark,
    delta_path,
)

In [0]:
# Run OPTIMIZE on the table with Z-ordering on the "age" column. The call is
# the cell's last expression, so the returned (path, metrics) DataFrame is
# shown as the cell output.
(
    delta_table
    .optimize()
    .executeZOrderBy("age")
)

Out[7]: DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,totalClusterParallelism:bigint,totalScheduledTasks:bigint,autoCompactParallelismStats:struct<maxClusterActiveParallelism:bigint,minClusterActiveParallelism:bigint,maxSessionActiveParallelism:bigint,minSessionActiveParallelism:bi

In [0]:
# Load the table back from the same path and print its rows.
optimized_df = spark.read.load(delta_path, format="delta")
optimized_df.show()

+---+-------+---+----------+
| id|   name|age|      date|
+---+-------+---+----------+
|  3|Shreyas| 35|2023-01-03|
|  4| Sundar| 28|2023-01-04|
|  1|  Virat| 25|2023-01-01|
|  2|  Rohit| 30|2023-01-02|
|  5|   Axar| 22|2023-01-05|
+---+-------+---+----------+



In [0]:
# Commit history of the Delta table, kept in `history` for the next cell too.
history = delta_table.history()

# Show only the operation columns; truncate=False prints the full parameter
# and metric maps instead of cutting them off.
(
    history
    .select("operation", "operationParameters", "operationMetrics")
    .show(truncate=False)
)

+---------+-------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|operation|operationParameters                                                |operationMetrics                                                                                                                                                                                                                   |
+---------+-------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|OPTIMIZE |{predicate -> [], zOrderBy -> ["age"], batchId -> 0, auto -> fals

In [0]:
# Print every history column, with show()'s defaults written out explicitly:
# up to 20 rows, long cell values truncated.
history.show(n=20, truncate=True)

+-------+-------------------+----------------+--------------------+---------+--------------------+----+-----------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|version|          timestamp|          userId|            userName|operation| operationParameters| job|         notebook|           clusterId|readVersion|   isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+-------------------+----------------+--------------------+---------+--------------------+----+-----------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|      1|2025-03-18 10:44:11|2034898594157119|mohithgowda265@gm...| OPTIMIZE|{predicate -> [],...|null|{280381709156568}|0318-090519-zet9epzz|          0|SnapshotIsolation|        false|{numRemovedFiles ...|        null|Databricks-Runtim...|
|      0|2025-03-18 10:40:49|203