# Delta Lake Z ORDER

* x0: Delta table that was initially created
* x1: Delta table after it's been optimized via small file compaction
* x2: Delta table Z Ordered by id1
* x3: Delta table Z Ordered by id1 and id2

In [1]:
from pathlib import Path

import delta
import deltalake
import levi
import pyspark
from delta import *
from pyspark.sql import functions as F

In [2]:
# Settings for a Delta-enabled Spark session: the Delta SQL extension and
# catalog implementation, plus explicit executor and driver memory sizes.
spark_settings = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.executor.memory": "10G",
    "spark.driver.memory": "25G",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

builder = pyspark.sql.SparkSession.builder.appName("MyApp")
for setting_name, setting_value in spark_settings.items():
    builder = builder.config(setting_name, setting_value)

# Let the delta pip helper finish configuring the builder, then start (or
# reuse) the session that every later cell refers to as `spark`.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-332-delta-230/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-5679ff7a-aec6-439e-b434-aa8de72cf6c2;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.3.0 in central
	found io.delta#delta-storage;2.3.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
:: resolution report :: resolve 106ms :: artifacts dl 4ms
	:: modules in use:
	io.delta#delta-core_2.12;2.3.0 from central in [default]
	io.delta#delta-storage;2.3.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |  

23/05/24 20:37:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Directory of the Delta table used throughout the notebook, kept under the
# current user's home directory.
home_dir = Path.home()
delta_path = f"{home_dir}/data/delta_zorder_G1_1e9_1e2_0_0"

## Create the Delta table

In [4]:
# Load the source CSV from the home data directory, treating its first line
# as the column names.
# NOTE(review): no schema or inferSchema option is set, so the columns are
# presumably loaded as strings - confirm this is intended for the benchmark.
csv_path = f"{Path.home()}/data/G1_1e9_1e2_0_0.csv"
csv_reader = spark.read.format("csv").option("header", True)
df = csv_reader.load(csv_path)

In [5]:
# Print a sample of the loaded rows to check that the CSV header was picked
# up as the column names (id1 ... v3).
df.show()

+-----+-----+------------+---+---+-------+---+---+---------+
|  id1|  id2|         id3|id4|id5|    id6| v1| v2|       v3|
+-----+-----+------------+---+---+-------+---+---+---------+
|id016|id059|id0009584273| 31| 54|7579268|  5|  2|92.709317|
|id039|id028|id0008226858| 32| 73|1462759|  3| 14|23.308717|
|id047|id073|id0004357983| 52| 71| 354157|  2| 15|98.462728|
|id043|id069|id0006903604| 37| 35| 372382|  5|  5|32.566149|
|id054|id095|id0005719264| 94| 99|6957127|  5| 11| 97.89284|
|id029|id027|id0007119528| 11| 41|6768037|  2|  7|26.394021|
|id047|id053|id0003186028| 93| 64|3300443|  3| 14|79.319642|
|id091|id097|id0007718026| 22| 50|3609381|  5| 15|94.510853|
|id090|id033|id0007857423|  5| 65|3618630|  1|  5| 1.579951|
|id070|id062|id0001399833| 90| 99|6131090|  5| 14|24.892749|
|id039|id030|id0000654974| 22| 18|1298417|  1| 10|15.321252|
|id023|id095|id0005131426| 52| 38|7811474|  4| 12| 25.65414|
|id070|id013|id0009420524| 14| 33|7075062|  5| 14|43.468912|
|id022|id026|id000519192

In [6]:
# Write the CSV data out as a Delta table; this first commit is the version
# the notebook later exposes as view x0.
#
# A plain save() raises when a table already exists at delta_path, which made
# this cell fail on every "Restart Kernel -> Run All" after the first run.
# Only write when no Delta table is present yet. Delete the directory at
# delta_path to rebuild the table (and its version history) from scratch.
if not DeltaTable.isDeltaTable(spark, delta_path):
    df.write.format("delta").save(delta_path)

                                                                                

23/05/24 20:43:59 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


## Compact small files

In [8]:
# Open a handle on the Delta table at delta_path; the next cell uses it to
# run small-file compaction.
delta_table = DeltaTable.forPath(spark, delta_path)

In [9]:
# Compact the table's small files into fewer, larger ones. The resulting
# table version is the one exposed later as view x1.
compaction_builder = delta_table.optimize()
compaction_builder.executeCompaction()

                                                                                

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,totalClusterParallelism:bigint,totalScheduledTasks:bigint,autoCompactParallelismStats:struct<maxClusterActiveParallelism:bigint,minClusterActiveParallelism:bigint,maxSessionActiveParallelism:bigint,minSessionActiveParallelism:bigint>,de

## Z Order on id1

In [15]:
# Re-open the handle on the Delta table at delta_path before Z Ordering it
# by id1 in the next cell.
delta_table = DeltaTable.forPath(spark, delta_path)

In [16]:
# Z Order the table on id1 only. The resulting table version is the one
# exposed later as view x2.
zorder_columns_id1 = ["id1"]
delta_table.optimize().executeZOrderBy(*zorder_columns_id1)

                                                                                

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,totalClusterParallelism:bigint,totalScheduledTasks:bigint,autoCompactParallelismStats:struct<maxClusterActiveParallelism:bigint,minClusterActiveParallelism:bigint,maxSessionActiveParallelism:bigint,minSessionActiveParallelism:bigint>,de

## Z Order on id1 and id2

In [19]:
# Re-open the handle on the Delta table at delta_path before Z Ordering it
# by id1 and id2 in the next cell.
delta_table = DeltaTable.forPath(spark, delta_path)

In [20]:
# Z Order the table on both id1 and id2. The resulting table version is the
# one exposed later as view x3.
zorder_columns_id1_id2 = ["id1", "id2"]
delta_table.optimize().executeZOrderBy(*zorder_columns_id1_id2)

                                                                                

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,totalClusterParallelism:bigint,totalScheduledTasks:bigint,autoCompactParallelismStats:struct<maxClusterActiveParallelism:bigint,minClusterActiveParallelism:bigint,maxSessionActiveParallelism:bigint,minSessionActiveParallelism:bigint>,de

## Create views for all four versions of the Delta table

In [11]:
# x0: time-travel to version 0 (the table as initially created) and register
# it as a temp view for the SQL benchmarks.
table_v0 = spark.read.format("delta").option("versionAsOf", "0").load(delta_path)
table_v0.createOrReplaceTempView("x0")

In [13]:
# x1: time-travel to version 1 (after small file compaction) and register it
# as a temp view for the SQL benchmarks.
table_v1 = spark.read.format("delta").option("versionAsOf", "1").load(delta_path)
table_v1.createOrReplaceTempView("x1")

In [17]:
# x2: time-travel to version 2 (Z Ordered by id1) and register it as a temp
# view for the SQL benchmarks.
table_v2 = spark.read.format("delta").option("versionAsOf", "2").load(delta_path)
table_v2.createOrReplaceTempView("x2")

In [21]:
# x3: time-travel to version 3 (Z Ordered by id1 and id2) and register it as
# a temp view for the SQL benchmarks.
table_v3 = spark.read.format("delta").option("versionAsOf", "3").load(delta_path)
table_v3.createOrReplaceTempView("x3")

## query_a benchmarks

In [12]:
%%time

# query_a on x0 (table as initially created): sum v1 for a single id1 value.
query_a_x0 = "select id1, sum(v1) as v1 from x0 where id1 = 'id016' group by id1"
spark.sql(query_a_x0).collect()



CPU times: user 19.5 ms, sys: 8.91 ms, total: 28.4 ms
Wall time: 5.97 s


                                                                                

[Row(id1='id016', v1=30003304.0)]

In [14]:
%%time

# query_a on x1 (compacted table): sum v1 for a single id1 value.
query_a_x1 = "select id1, sum(v1) as v1 from x1 where id1 = 'id016' group by id1"
spark.sql(query_a_x1).collect()



CPU times: user 18.4 ms, sys: 8.4 ms, total: 26.8 ms
Wall time: 4.73 s


                                                                                

[Row(id1='id016', v1=30003304.0)]

In [25]:
%%time

# query_a on x2 (Z Ordered by id1): sum v1 for a single id1 value.
query_a_x2 = "select id1, sum(v1) as v1 from x2 where id1 = 'id016' group by id1"
spark.sql(query_a_x2).collect()

CPU times: user 1.87 ms, sys: 1.9 ms, total: 3.77 ms
Wall time: 526 ms


[Row(id1='id016', v1=30003304.0)]

In [26]:
%%time

# query_a on x3 (Z Ordered by id1 and id2): sum v1 for a single id1 value.
query_a_x3 = "select id1, sum(v1) as v1 from x3 where id1 = 'id016' group by id1"
spark.sql(query_a_x3).collect()

CPU times: user 1.81 ms, sys: 2.02 ms, total: 3.83 ms
Wall time: 561 ms


[Row(id1='id016', v1=30003304.0)]

## Use levi to analyze file skipping for query_a

In [31]:
# Open the table pinned at version 0 (as initially created) and report how
# its files are distributed across size buckets. `dt` is reused by the next
# cell.
snapshot_version = 0
dt = deltalake.DeltaTable(delta_path, version=snapshot_version)
levi.delta_file_sizes(dt)

{'num_files_<1mb': 0,
 'num_files_1mb-500mb': 395,
 'num_files_500mb-1gb': 0,
 'num_files_1gb-2gb': 0,
 'num_files_>2gb': 0}

In [32]:
# Report how many files and bytes levi says can be skipped for the query_a
# predicate on the version-0 snapshot opened in the previous cell.
query_a_filters = [("id1", "=", "'id016'")]
levi.skipped_stats(dt, filters=query_a_filters)

{'num_files': 395, 'num_files_skipped': 0, 'num_bytes_skipped': 0}

In [33]:
# Open the table pinned at version 1 (after compaction) and report how its
# files are distributed across size buckets. `dt` is reused by the next cell.
snapshot_version = 1
dt = deltalake.DeltaTable(delta_path, version=snapshot_version)
levi.delta_file_sizes(dt)

{'num_files_<1mb': 0,
 'num_files_1mb-500mb': 1,
 'num_files_500mb-1gb': 0,
 'num_files_1gb-2gb': 26,
 'num_files_>2gb': 0}

In [34]:
# Report how many files and bytes levi says can be skipped for the query_a
# predicate on the version-1 snapshot opened in the previous cell.
query_a_filters = [("id1", "=", "'id016'")]
levi.skipped_stats(dt, filters=query_a_filters)

{'num_files': 27, 'num_files_skipped': 0, 'num_bytes_skipped': 0}

In [35]:
# Open the table pinned at version 2 (Z Ordered by id1) and report how its
# files are distributed across size buckets. `dt` is reused by the next cell.
snapshot_version = 2
dt = deltalake.DeltaTable(delta_path, version=snapshot_version)
levi.delta_file_sizes(dt)

{'num_files_<1mb': 0,
 'num_files_1mb-500mb': 0,
 'num_files_500mb-1gb': 4,
 'num_files_1gb-2gb': 21,
 'num_files_>2gb': 0}

In [36]:
# Report how many files and bytes levi says can be skipped for the query_a
# predicate on the version-2 snapshot opened in the previous cell.
query_a_filters = [("id1", "=", "'id016'")]
levi.skipped_stats(dt, filters=query_a_filters)

{'num_files': 25, 'num_files_skipped': 24, 'num_bytes_skipped': 26071485607}

In [37]:
# Open the table pinned at version 3 (Z Ordered by id1 and id2) and report
# how its files are distributed across size buckets. `dt` is reused by the
# next cell.
snapshot_version = 3
dt = deltalake.DeltaTable(delta_path, version=snapshot_version)
levi.delta_file_sizes(dt)

{'num_files_<1mb': 0,
 'num_files_1mb-500mb': 0,
 'num_files_500mb-1gb': 2,
 'num_files_1gb-2gb': 23,
 'num_files_>2gb': 0}

In [38]:
# Report how many files and bytes levi says can be skipped for the query_a
# predicate on the version-3 snapshot opened in the previous cell.
query_a_filters = [("id1", "=", "'id016'")]
levi.skipped_stats(dt, filters=query_a_filters)

{'num_files': 25, 'num_files_skipped': 18, 'num_bytes_skipped': 19445472638}

## query_b benchmarks

In [39]:
%%time

# query_b on x0 (table as initially created): sum v1 for a single id2 value.
query_b_x0 = "select id2, sum(v1) as v1 from x0 where id2 = 'id047' group by id2"
spark.sql(query_b_x0).collect()



CPU times: user 15.6 ms, sys: 7.02 ms, total: 22.6 ms
Wall time: 4.74 s


                                                                                

[Row(id2='id047', v1=29996255.0)]

In [40]:
%%time

# query_b on x1 (compacted table): sum v1 for a single id2 value.
query_b_x1 = "select id2, sum(v1) as v1 from x1 where id2 = 'id047' group by id2"
spark.sql(query_b_x1).collect()



CPU times: user 15.2 ms, sys: 7.12 ms, total: 22.4 ms
Wall time: 4.53 s


                                                                                

[Row(id2='id047', v1=29996255.0)]

In [41]:
%%time

# query_b on x2 (Z Ordered by id1 only): sum v1 for a single id2 value.
query_b_x2 = "select id2, sum(v1) as v1 from x2 where id2 = 'id047' group by id2"
spark.sql(query_b_x2).collect()



CPU times: user 15.4 ms, sys: 7.42 ms, total: 22.8 ms
Wall time: 4.59 s


                                                                                

[Row(id2='id047', v1=29996255.0)]

In [42]:
%%time

# query_b on x3 (Z Ordered by id1 and id2): sum v1 for a single id2 value.
query_b_x3 = "select id2, sum(v1) as v1 from x3 where id2 = 'id047' group by id2"
spark.sql(query_b_x3).collect()



CPU times: user 6.28 ms, sys: 4.53 ms, total: 10.8 ms
Wall time: 1.6 s


                                                                                

[Row(id2='id047', v1=29996255.0)]

## query_c benchmarks

In [47]:
%%time

# query_c on x0 (table as initially created): sum v1 for one (id1, id2) pair.
query_c_x0 = "select id1, id2, sum(v1) from x0 where id1 = 'id016' and id2 = 'id047' group by id1, id2"
spark.sql(query_c_x0).collect()



CPU times: user 16.9 ms, sys: 6.69 ms, total: 23.6 ms
Wall time: 4.92 s


                                                                                

[Row(id1='id016', id2='id047', sum(v1)=298361.0)]

In [48]:
%%time

# query_c on x1 (compacted table): sum v1 for one (id1, id2) pair.
query_c_x1 = "select id1, id2, sum(v1) from x1 where id1 = 'id016' and id2 = 'id047' group by id1, id2"
spark.sql(query_c_x1).collect()



CPU times: user 18.1 ms, sys: 7.21 ms, total: 25.3 ms
Wall time: 4.73 s


                                                                                

[Row(id1='id016', id2='id047', sum(v1)=298361.0)]

In [49]:
%%time

# query_c on x2 (Z Ordered by id1): sum v1 for one (id1, id2) pair.
query_c_x2 = "select id1, id2, sum(v1) from x2 where id1 = 'id016' and id2 = 'id047' group by id1, id2"
spark.sql(query_c_x2).collect()

CPU times: user 1.81 ms, sys: 2.39 ms, total: 4.2 ms
Wall time: 498 ms


[Row(id1='id016', id2='id047', sum(v1)=298361.0)]

In [50]:
%%time

# query_c on x3 (Z Ordered by id1 and id2): sum v1 for one (id1, id2) pair.
query_c_x3 = "select id1, id2, sum(v1) from x3 where id1 = 'id016' and id2 = 'id047' group by id1, id2"
spark.sql(query_c_x3).collect()

CPU times: user 1.72 ms, sys: 2.12 ms, total: 3.84 ms
Wall time: 345 ms


[Row(id1='id016', id2='id047', sum(v1)=298361.0)]

## h2o groupby query #1

In [27]:
%%time

# Groupby query #1 on x0 (table as initially created): sum v1 per id1 with no
# filter.
groupby_q1_x0 = "select id1, sum(v1) as v1 from x0 group by id1"
spark.sql(groupby_q1_x0).collect()



CPU times: user 90.6 ms, sys: 31.1 ms, total: 122 ms
Wall time: 28.5 s


                                                                                

[Row(id1='id089', v1=29990077.0),
 Row(id1='id080', v1=29979880.0),
 Row(id1='id087', v1=29997379.0),
 Row(id1='id073', v1=30006820.0),
 Row(id1='id043', v1=30005705.0),
 Row(id1='id064', v1=29985828.0),
 Row(id1='id051', v1=29994785.0),
 Row(id1='id045', v1=29992441.0),
 Row(id1='id074', v1=30006309.0),
 Row(id1='id023', v1=29988818.0),
 Row(id1='id006', v1=30006882.0),
 Row(id1='id013', v1=29989026.0),
 Row(id1='id055', v1=30009993.0),
 Row(id1='id099', v1=30009485.0),
 Row(id1='id056', v1=29987234.0),
 Row(id1='id052', v1=30014118.0),
 Row(id1='id093', v1=29988829.0),
 Row(id1='id075', v1=30013372.0),
 Row(id1='id034', v1=30010786.0),
 Row(id1='id036', v1=29994349.0),
 Row(id1='id032', v1=29986434.0),
 Row(id1='id097', v1=30015928.0),
 Row(id1='id059', v1=30010798.0),
 Row(id1='id065', v1=30007777.0),
 Row(id1='id005', v1=29993888.0),
 Row(id1='id003', v1=30003365.0),
 Row(id1='id037', v1=29996759.0),
 Row(id1='id062', v1=29996661.0),
 Row(id1='id002', v1=29996534.0),
 Row(id1='id09

In [28]:
%%time

# Groupby query #1 on x1 (compacted table): sum v1 per id1 with no filter.
groupby_q1_x1 = "select id1, sum(v1) as v1 from x1 group by id1"
spark.sql(groupby_q1_x1).collect()



CPU times: user 70.4 ms, sys: 25.7 ms, total: 96.1 ms
Wall time: 23 s


                                                                                

[Row(id1='id089', v1=29990077.0),
 Row(id1='id080', v1=29979880.0),
 Row(id1='id087', v1=29997379.0),
 Row(id1='id073', v1=30006820.0),
 Row(id1='id064', v1=29985828.0),
 Row(id1='id043', v1=30005705.0),
 Row(id1='id051', v1=29994785.0),
 Row(id1='id045', v1=29992441.0),
 Row(id1='id074', v1=30006309.0),
 Row(id1='id023', v1=29988818.0),
 Row(id1='id006', v1=30006882.0),
 Row(id1='id013', v1=29989026.0),
 Row(id1='id055', v1=30009993.0),
 Row(id1='id099', v1=30009485.0),
 Row(id1='id056', v1=29987234.0),
 Row(id1='id052', v1=30014118.0),
 Row(id1='id093', v1=29988829.0),
 Row(id1='id034', v1=30010786.0),
 Row(id1='id075', v1=30013372.0),
 Row(id1='id036', v1=29994349.0),
 Row(id1='id032', v1=29986434.0),
 Row(id1='id097', v1=30015928.0),
 Row(id1='id059', v1=30010798.0),
 Row(id1='id065', v1=30007777.0),
 Row(id1='id005', v1=29993888.0),
 Row(id1='id003', v1=30003365.0),
 Row(id1='id037', v1=29996759.0),
 Row(id1='id062', v1=29996661.0),
 Row(id1='id094', v1=30005130.0),
 Row(id1='id00

In [29]:
%%time

# Groupby query #1 on x2 (Z Ordered by id1): sum v1 per id1 with no filter.
groupby_q1_x2 = "select id1, sum(v1) as v1 from x2 group by id1"
spark.sql(groupby_q1_x2).collect()



CPU times: user 95.8 ms, sys: 32.7 ms, total: 129 ms
Wall time: 24.5 s


                                                                                

[Row(id1='id013', v1=29989026.0),
 Row(id1='id014', v1=29998476.0),
 Row(id1='id016', v1=30003304.0),
 Row(id1='id017', v1=29995061.0),
 Row(id1='id015', v1=30006177.0),
 Row(id1='id023', v1=29988818.0),
 Row(id1='id021', v1=29982118.0),
 Row(id1='id025', v1=30016745.0),
 Row(id1='id022', v1=29994847.0),
 Row(id1='id024', v1=30003956.0),
 Row(id1='id093', v1=29988829.0),
 Row(id1='id094', v1=30005130.0),
 Row(id1='id096', v1=29993372.0),
 Row(id1='id092', v1=29996666.0),
 Row(id1='id095', v1=30010887.0),
 Row(id1='id064', v1=29985828.0),
 Row(id1='id062', v1=29996661.0),
 Row(id1='id063', v1=30006173.0),
 Row(id1='id060', v1=30021845.0),
 Row(id1='id061', v1=30012298.0),
 Row(id1='id099', v1=30009485.0),
 Row(id1='id097', v1=30015928.0),
 Row(id1='id100', v1=29987827.0),
 Row(id1='id098', v1=29997789.0),
 Row(id1='id006', v1=30006882.0),
 Row(id1='id005', v1=29993888.0),
 Row(id1='id009', v1=29994474.0),
 Row(id1='id007', v1=29992448.0),
 Row(id1='id008', v1=29999024.0),
 Row(id1='id05

In [30]:
%%time

# Groupby query #1 on x3 (Z Ordered by id1 and id2): sum v1 per id1 with no
# filter.
groupby_q1_x3 = "select id1, sum(v1) as v1 from x3 group by id1"
spark.sql(groupby_q1_x3).collect()



CPU times: user 69.4 ms, sys: 26.8 ms, total: 96.2 ms
Wall time: 19.2 s


                                                                                

[Row(id1='id064', v1=29985828.0),
 Row(id1='id013', v1=29989026.0),
 Row(id1='id014', v1=29998476.0),
 Row(id1='id016', v1=30003304.0),
 Row(id1='id015', v1=30006177.0),
 Row(id1='id009', v1=29994474.0),
 Row(id1='id010', v1=30006196.0),
 Row(id1='id012', v1=29990141.0),
 Row(id1='id011', v1=30002510.0),
 Row(id1='id005', v1=29993888.0),
 Row(id1='id003', v1=30003365.0),
 Row(id1='id002', v1=29996534.0),
 Row(id1='id004', v1=30015990.0),
 Row(id1='id001', v1=30009448.0),
 Row(id1='id006', v1=30006882.0),
 Row(id1='id007', v1=29992448.0),
 Row(id1='id008', v1=29999024.0),
 Row(id1='id021', v1=29982118.0),
 Row(id1='id018', v1=29992469.0),
 Row(id1='id020', v1=29993667.0),
 Row(id1='id017', v1=29995061.0),
 Row(id1='id019', v1=29998785.0),
 Row(id1='id023', v1=29988818.0),
 Row(id1='id022', v1=29994847.0),
 Row(id1='id024', v1=30003956.0),
 Row(id1='id089', v1=29990077.0),
 Row(id1='id090', v1=29994958.0),
 Row(id1='id091', v1=29995955.0),
 Row(id1='id092', v1=29996666.0),
 Row(id1='id08