# Delta Lake Compaction with OPTIMIZE

In [8]:
import delta
import pyspark
from delta import configure_spark_with_delta_pip

In [2]:
# Start from a plain SparkSession builder named "MyApp", then register Delta
# Lake's SQL extension and point `spark_catalog` at Delta's catalog class.
builder = pyspark.sql.SparkSession.builder.appName("MyApp")
builder = builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

In [3]:
# Let the delta helper attach the Delta Lake package to the builder, then
# create the session (or reuse one that is already running).
builder_with_delta = configure_spark_with_delta_pip(builder)
spark = builder_with_delta.getOrCreate()

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-330-delta-210/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-7720b555-80ed-4785-92d5-6babbeaaca79;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.1.0 in central
	found io.delta#delta-storage;2.1.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 348ms :: artifacts dl 21ms
	:: modules in use:
	io.delta#delta-core_2.12;2.1.0 from central in [default]
	io.delta#delta-storage;2.1.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number|

22/12/19 11:38:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## Create Delta table

In [4]:
# Five-row DataFrame with a single `id` column holding 0 through 4.
df = spark.range(5)

In [5]:
# Preview the DataFrame contents (a single `id` column).
df.show()

[Stage 0:>                                                        (0 + 10) / 10]

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



                                                                                

In [6]:
# Write the rows as a Delta table spread over several small files so there is
# something to compact. repartition(5) requests five partitions; with only
# five rows a partition can end up empty and produce no data file.
# mode("overwrite") keeps this cell re-runnable: without an explicit mode the
# default save mode raises an error once tmp/table1 already holds a table.
df.repartition(5).write.format("delta").mode("overwrite").save("tmp/table1")

                                                                                

In [7]:
# List the table directory: the `_delta_log` folder plus the small Parquet data files.
!tree tmp/table1

[01;34mtmp/table1[0m
├── [01;34m_delta_log[0m
│   └── [00m00000000000000000000.json[0m
├── [00mpart-00000-4dc9742a-7c76-4527-a2c6-d7904f56d05d-c000.snappy.parquet[0m
├── [00mpart-00001-18179b66-48f0-4d47-8f21-762678a7df66-c000.snappy.parquet[0m
├── [00mpart-00002-03f9116e-189e-4e55-bfe7-d501fffe4ced-c000.snappy.parquet[0m
└── [00mpart-00003-81cad732-eeaf-4708-9111-aa2e8136e304-c000.snappy.parquet[0m

1 directory, 5 files


## Compact the small files

In [10]:
# Bind a DeltaTable handle to the table written at tmp/table1.
delta_table = delta.DeltaTable.forPath(spark, "tmp/table1")

In [12]:
# Run OPTIMIZE-style compaction on the table. The call returns a DataFrame
# with `path` and `metrics` columns describing what was rewritten.
delta_table.optimize().executeCompaction()

                                                                                

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,totalClusterParallelism:bigint,totalScheduledTasks:bigint,autoCompactParallelismStats:struct<maxClusterActiveParallelism:bigint,minClusterActiveParallelism:bigint,maxSessionActiveParallelism:bigint,minSessionActiveParallelism:bigint>>]

In [13]:
# List the directory again: compaction adds a new data file and a second log
# entry, while the original small files are still present on disk.
!tree tmp/table1

[01;34mtmp/table1[0m
├── [01;34m_delta_log[0m
│   ├── [00m00000000000000000000.json[0m
│   └── [00m00000000000000000001.json[0m
├── [00mpart-00000-2b164d47-0f85-47e1-99dc-d136c784baaa-c000.snappy.parquet[0m
├── [00mpart-00000-4dc9742a-7c76-4527-a2c6-d7904f56d05d-c000.snappy.parquet[0m
├── [00mpart-00001-18179b66-48f0-4d47-8f21-762678a7df66-c000.snappy.parquet[0m
├── [00mpart-00002-03f9116e-189e-4e55-bfe7-d501fffe4ced-c000.snappy.parquet[0m
└── [00mpart-00003-81cad732-eeaf-4708-9111-aa2e8136e304-c000.snappy.parquet[0m

1 directory, 7 files


## Vacuum to see the compaction more clearly

In [14]:
# Turn off Delta's retention-duration check for this session; the `vacuum(0)`
# calls in this notebook depend on it.
# NOTE(review): presumably unsafe outside a demo, since a zero retention can
# delete files that concurrent readers or time travel still need — confirm
# before reusing this setting elsewhere.
spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")

In [16]:
# Vacuum with a retention argument of 0 so the data files no longer referenced
# by the current table version (the pre-compaction files) are deleted now.
delta_table.vacuum(0)

                                                                                

Deleted 4 files and directories in a total of 1 directories.


DataFrame[]

In [17]:
# After the vacuum only the compacted data file and the two log entries remain.
!tree tmp/table1

[01;34mtmp/table1[0m
├── [01;34m_delta_log[0m
│   ├── [00m00000000000000000000.json[0m
│   └── [00m00000000000000000001.json[0m
└── [00mpart-00000-2b164d47-0f85-47e1-99dc-d136c784baaa-c000.snappy.parquet[0m

1 directory, 3 files


## View transaction log entry for OPTIMIZE

In [18]:
# Pretty-print the commit written by the compaction: it contains an `add`
# action for the compacted file and `remove` actions for replaced files, each
# marked with dataChange=false.
!jq . tmp/table1/_delta_log/00000000000000000001.json

[1;39m{
  [0m[34;1m"add"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"path"[0m[1;39m: [0m[0;32m"part-00000-2b164d47-0f85-47e1-99dc-d136c784baaa-c000.snappy.parquet"[0m[1;39m,
    [0m[34;1m"partitionValues"[0m[1;39m: [0m[1;39m{}[0m[1;39m,
    [0m[34;1m"size"[0m[1;39m: [0m[0;39m504[0m[1;39m,
    [0m[34;1m"modificationTime"[0m[1;39m: [0m[0;39m1671468211798[0m[1;39m,
    [0m[34;1m"dataChange"[0m[1;39m: [0m[0;39mfalse[0m[1;39m,
    [0m[34;1m"stats"[0m[1;39m: [0m[0;32m"{\"numRecords\":5,\"minValues\":{\"id\":0},\"maxValues\":{\"id\":4},\"nullCount\":{\"id\":0}}"[0m[1;39m
  [1;39m}[0m[1;39m
[1;39m}[0m
[1;39m{
  [0m[34;1m"remove"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"path"[0m[1;39m: [0m[0;32m"part-00003-81cad732-eeaf-4708-9111-aa2e8136e304-c000.snappy.parquet"[0m[1;39m,
    [0m[34;1m"deletionTimestamp"[0m[1;39m: [0m[0;39m1671468210066[0m[1;39m,
    [0m[34;1m"dataChange"[0m[1;39m: [0m[0;39mfalse[0m[1;39m,
    [0m[34

## Compacting pre-Delta 2.0

In [26]:
# Rebuild the same five-row `id` DataFrame (0 through 4) for the second table.
df = spark.range(5)

In [27]:
# Write the rows as a second Delta table made of several small files, to be
# compacted manually. mode("overwrite") keeps this cell re-runnable: without
# an explicit mode the default save mode raises an error once tmp/table2
# already holds a table.
df.repartition(5).write.format("delta").mode("overwrite").save("tmp/table2")

                                                                                

In [28]:
path = "tmp/table2"
numFiles = 1

# Manual compaction: read the whole table, collapse it to `numFiles`
# partitions, and overwrite the table in place. dataChange=false is passed so
# the commit is recorded as a rearrangement of existing data.
compacted_rows = spark.read.format("delta").load(path).repartition(numFiles)
compaction_writer = (
    compacted_rows.write.format("delta")
    .mode("overwrite")
    .option("dataChange", "false")
)
compaction_writer.save(path)

                                                                                

In [33]:
# Rebind `delta_table` to the second table; from here on the name refers to
# tmp/table2 rather than tmp/table1.
delta_table = delta.DeltaTable.forPath(spark, "tmp/table2")

In [34]:
# Vacuum with a retention argument of 0 to delete the files the overwrite left
# behind. Relies on the retention-duration check having been disabled earlier
# in this session.
delta_table.vacuum(0)

                                                                                

Deleted 4 files and directories in a total of 1 directories.


DataFrame[]

In [35]:
# Confirm that a single data file and two log entries remain for tmp/table2.
!tree tmp/table2

[01;34mtmp/table2[0m
├── [01;34m_delta_log[0m
│   ├── [00m00000000000000000000.json[0m
│   └── [00m00000000000000000001.json[0m
└── [00mpart-00000-2f60eca6-07ba-4f66-a89c-eab5caa94d25-c000.snappy.parquet[0m

1 directory, 3 files


In [36]:
# Read the compacted table back to check that all five ids are still present;
# the rows are not returned in sorted order.
spark.read.format("delta").load("tmp/table2").show()

+---+
| id|
+---+
|  1|
|  4|
|  2|
|  3|
|  0|
+---+

