# Speed up Spark with Delta

In [1]:
import shutil

import pyspark
import pyspark.sql.functions as F
from delta import *

In [2]:
# Session builder for this notebook: registers Delta Lake's SQL extension and
# makes the Delta catalog the implementation behind `spark_catalog`.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

In [3]:
# Let the delta-spark helper finish configuring the builder, then start
# (or reuse) the session.
spark = (
    configure_spark_with_delta_pip(builder)
    .getOrCreate()
)
# Only surface errors so Spark's log chatter does not drown the cell outputs.
spark.sparkContext.setLogLevel("ERROR")

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-300-delta-200/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-66faaf5a-862d-4c56-ae0c-7494e4d58d08;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.0.0 in central
	found io.delta#delta-storage;2.0.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 306ms :: artifacts dl 23ms
	:: modules in use:
	io.delta#delta-core_2.12;2.0.0 from central in [default]
	io.delta#delta-storage;2.0.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number|

In [4]:
# Baseline input: one uncompressed CSV file whose first row is the header.
# NOTE(review): no schema / inferSchema is supplied, so columns are presumably
# loaded as strings (sum(v1) prints as a double below) — confirm this is intended.
single_csv_df = spark.read.csv(
    "../../tmp/G1_1e7_1e2_0_0.csv",
    header=True,
)

                                                                                

In [5]:
# Untimed preview of the benchmark aggregation: per-id3 sum of v1 and mean of v3,
# showing five groups with no ordering applied.
(
    single_csv_df
    .groupBy("id3")
    .agg(F.sum("v1"), F.avg("v3"))
    .limit(5)
    .show()
)



+------------+-------+-----------------+
|         id3|sum(v1)|          avg(v3)|
+------------+-------+-----------------+
|id0000023873|  264.0|49.88283578947368|
|id0000043072|  320.0| 53.3955363106796|
|id0000032560|  270.0|        50.261636|
|id0000080944|  296.0|47.64060392783505|
|id0000098376|  304.0|49.43126506060605|
+------------+-------+-----------------+



                                                                                

## Single uncompressed CSV file

In [6]:
%%time

single_csv_df.groupby("id3").agg(F.sum("v1"), F.mean("v3")).orderBy(F.col("avg(v3)").desc()).limit(5).show()



+------------+-------+------------------+
|         id3|sum(v1)|           avg(v3)|
+------------+-------+------------------+
|id0000069985|  323.0|62.673885595744686|
|id0000061798|  276.0| 61.72031902173914|
|id0000061604|  270.0| 61.42980177777777|
|id0000042330|  219.0| 61.34167918421052|
|id0000040861|  283.0| 61.27588561538461|
+------------+-------+------------------+

CPU times: user 5.24 ms, sys: 3.05 ms, total: 8.29 ms
Wall time: 4.73 s


                                                                                

## Multiple CSV files

In [7]:
# Number of partitions backing the DataFrame read from the single CSV file.
single_csv_df.rdd.getNumPartitions()

10

In [8]:
# Re-write the data as a directory of CSV files (header row kept in each file).
# mode("overwrite") makes the cell re-runnable: the default save mode raises
# as soon as tmp/csvs already exists from a previous or interrupted run.
(
    single_csv_df.write
    .mode("overwrite")
    .option("header", True)
    .csv("tmp/csvs")
)

                                                                                

In [9]:
# Load the CSV directory written above; each file starts with a header row.
csv_df = spark.read.csv("tmp/csvs", header=True)

In [10]:
%%time

csv_df.groupby("id3").agg(F.sum("v1"), F.mean("v3")).orderBy(F.col("avg(v3)").desc()).limit(5).show()

[Stage 13:>                                                       (0 + 10) / 10]

+------------+-------+-----------------+
|         id3|sum(v1)|          avg(v3)|
+------------+-------+-----------------+
|id0000069985|  323.0|62.67388559574468|
|id0000061798|  276.0|61.72031902173912|
|id0000061604|  270.0|61.42980177777779|
|id0000042330|  219.0|61.34167918421052|
|id0000040861|  283.0|61.27588561538461|
+------------+-------+-----------------+

CPU times: user 4.2 ms, sys: 2.33 ms, total: 6.54 ms
Wall time: 3.99 s


                                                                                

## Multiple Parquet files

In [11]:
# Convert the CSV data to Parquet under tmp/parquet.
# mode("overwrite") makes the cell re-runnable: the default save mode raises
# as soon as tmp/parquet already exists from a previous or interrupted run.
(
    csv_df.write
    .mode("overwrite")
    .parquet("tmp/parquet")
)

                                                                                

In [12]:
# Load the Parquet copy of the dataset.
parquet_df = spark.read.format("parquet").load("tmp/parquet")

In [13]:
%%time

parquet_df.groupby("id3").agg(F.sum("v1"), F.mean("v3")).orderBy(F.col("avg(v3)").desc()).limit(5).show()

[Stage 19:=====>                                                   (1 + 9) / 10]

+------------+-------+------------------+
|         id3|sum(v1)|           avg(v3)|
+------------+-------+------------------+
|id0000069985|  323.0| 62.67388559574468|
|id0000061798|  276.0| 61.72031902173914|
|id0000061604|  270.0|61.429801777777776|
|id0000042330|  219.0| 61.34167918421052|
|id0000040861|  283.0| 61.27588561538461|
+------------+-------+------------------+

CPU times: user 3.25 ms, sys: 2 ms, total: 5.24 ms
Wall time: 3.54 s


                                                                                

## Delta Lake

In [14]:
# Save the Parquet-backed DataFrame as a Delta table under tmp/delta.
# mode("overwrite") makes the cell re-runnable: the default save mode raises
# as soon as a table already exists at tmp/delta from a previous or interrupted run.
(
    parquet_df.write
    .format("delta")
    .mode("overwrite")
    .save("tmp/delta")
)

                                                                                

In [15]:
# Load the Delta table as written, before any OPTIMIZE / ZORDER is applied.
delta_df = spark.read.load("tmp/delta", format="delta")

In [16]:
%%time

delta_df.groupby("id3").agg(F.sum("v1"), F.mean("v3")).orderBy(F.col("avg(v3)").desc()).limit(5).show()

[Stage 32:>                                                       (0 + 10) / 10]

+------------+-------+------------------+
|         id3|sum(v1)|           avg(v3)|
+------------+-------+------------------+
|id0000069985|  323.0| 62.67388559574468|
|id0000061798|  276.0| 61.72031902173914|
|id0000061604|  270.0|61.429801777777776|
|id0000042330|  219.0| 61.34167918421052|
|id0000040861|  283.0| 61.27588561538461|
+------------+-------+------------------+

CPU times: user 2.92 ms, sys: 1.82 ms, total: 4.74 ms
Wall time: 3.99 s


                                                                                

## Delta Lake with ZORDER

In [17]:
# Handle on the Delta table stored at tmp/delta, used to run table-level operations.
deltaTable = DeltaTable.forPath(spark, "tmp/delta")

In [18]:
# Run OPTIMIZE with Z-Ordering on id3, the column the benchmark query groups by.
# The call returns a DataFrame of (path, metrics), which is what the cell displays.
deltaTable.optimize().executeZOrderBy("id3")

                                                                                

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint>]

In [19]:
# Load the same Delta path again, now that the ZORDER optimize has run.
optimized_df = spark.read.load("tmp/delta", format="delta")

In [20]:
%%time

optimized_df.groupby("id3").agg(F.sum("v1"), F.mean("v3")).orderBy(F.col("avg(v3)").desc()).limit(5).show()



+------------+-------+-----------------+
|         id3|sum(v1)|          avg(v3)|
+------------+-------+-----------------+
|id0000069985|  323.0|62.67388559574467|
|id0000061798|  276.0|61.72031902173916|
|id0000061604|  270.0|61.42980177777777|
|id0000042330|  219.0|61.34167918421052|
|id0000040861|  283.0|61.27588561538461|
+------------+-------+-----------------+

CPU times: user 3.88 ms, sys: 2.6 ms, total: 6.48 ms
Wall time: 4.65 s


                                                                                

In [21]:
# NOTE(review): scratch expression unrelated to the benchmark — candidate for deletion.
1 + 1

2

## Cleanup

In [22]:
# Delete everything this notebook wrote under ./tmp (CSV, Parquet and Delta copies).
# shutil replaces the %rm alias so cleanup does not depend on a POSIX `rm` binary;
# ignore_errors=True keeps the cell quiet when the directory is already gone.
shutil.rmtree("tmp", ignore_errors=True)

In [23]:
# Delete the ./spark-warehouse directory (presumably the default warehouse
# directory Spark created in the working directory — verify).
# shutil replaces the %rm alias so cleanup does not depend on a POSIX `rm` binary;
# ignore_errors=True keeps the cell quiet when the directory does not exist.
shutil.rmtree("spark-warehouse", ignore_errors=True)