# Speed up Spark with Delta

In [1]:
import pyspark
import pyspark.sql.functions as F
from delta import *

In [2]:
builder = (
    pyspark.sql.SparkSession.builder.appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

In [3]:
spark = configure_spark_with_delta_pip(builder).getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-300-delta-200/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-94da1aad-7f17-471e-b52d-eff5d15b02e7;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.0.0 in central
	found io.delta#delta-storage;2.0.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 305ms :: artifacts dl 23ms
	:: modules in use:
	io.delta#delta-core_2.12;2.0.0 from central in [default]
	io.delta#delta-storage;2.0.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number|

In [4]:
single_csv_df = spark.read.option("header", True).csv("../../tmp/G1_1e8_1e2_0_0.csv")

                                                                                

In [5]:
single_csv_df.groupby("id3").agg(F.sum("v1"), F.mean("v3")).limit(5).show()



+------------+-------+------------------+
|         id3|sum(v1)|           avg(v3)|
+------------+-------+------------------+
|id0000156724|  309.0|52.661497952380955|
|id0000786281|  336.0|53.478915250000014|
|id0000541395|  314.0| 50.35518390909091|
|id0000929333|  270.0| 49.81497536263737|
|id0000235059|  297.0|50.964993889999995|
+------------+-------+------------------+



                                                                                

## Single uncompressed CSV file

In [6]:
%%time

single_csv_df.groupby("id3").agg(F.sum("v1"), F.mean("v3")).orderBy(F.col("avg(v3)").desc()).limit(5).show()



+------------+-------+-----------------+
|         id3|sum(v1)|          avg(v3)|
+------------+-------+-----------------+
|id0000600028|  297.0|64.44938332989692|
|id0000575479|  220.0|63.40989807792207|
|id0000976055|  310.0| 63.4078843069307|
|id0000966474|  291.0|63.38104397802196|
|id0000994485|  255.0|63.15061278260868|
+------------+-------+-----------------+

CPU times: user 20.4 ms, sys: 8.69 ms, total: 29.1 ms
Wall time: 47.4 s


                                                                                

## Multiple CSV files

In [7]:
single_csv_df.rdd.getNumPartitions()

39

In [8]:
single_csv_df.write.option("header", True).csv("tmp/csvs")

                                                                                

In [9]:
csv_df = spark.read.option("header", True).csv("tmp/csvs")

In [10]:
%%time

csv_df.groupby("id3").agg(F.sum("v1"), F.mean("v3")).orderBy(F.col("avg(v3)").desc()).limit(5).show()



+------------+-------+------------------+
|         id3|sum(v1)|           avg(v3)|
+------------+-------+------------------+
|id0000600028|  297.0| 64.44938332989689|
|id0000575479|  220.0| 63.40989807792206|
|id0000976055|  310.0| 63.40788430693068|
|id0000966474|  291.0| 63.38104397802197|
|id0000994485|  255.0|63.150612782608704|
+------------+-------+------------------+

CPU times: user 17.8 ms, sys: 7.81 ms, total: 25.7 ms
Wall time: 47 s


                                                                                

## Multiple Parquet files

In [11]:
csv_df.write.parquet("tmp/parquet")

                                                                                

In [15]:
parquet_df = spark.read.parquet("tmp/parquet")

ERROR:root:KeyboardInterrupt while sending command.               (0 + 10) / 38]
Traceback (most recent call last):
  File "/Users/matthew.powers/opt/miniconda3/envs/pyspark-300-delta-200/lib/python3.9/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/matthew.powers/opt/miniconda3/envs/pyspark-300-delta-200/lib/python3.9/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/Users/matthew.powers/opt/miniconda3/envs/pyspark-300-delta-200/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [13]:
%%time

parquet_df.groupby("id3").agg(F.sum("v1"), F.mean("v3")).orderBy(F.col("avg(v3)").desc()).limit(5).show()



+------------+-------+------------------+
|         id3|sum(v1)|           avg(v3)|
+------------+-------+------------------+
|id0000600028|  297.0|  64.4493833298969|
|id0000575479|  220.0| 63.40989807792207|
|id0000976055|  310.0|  63.4078843069307|
|id0000966474|  291.0| 63.38104397802197|
|id0000994485|  255.0|63.150612782608704|
+------------+-------+------------------+

CPU times: user 20.2 ms, sys: 8.75 ms, total: 28.9 ms
Wall time: 54.1 s


                                                                                

## Delta Lake

In [14]:
parquet_df.write.format("delta").save("tmp/delta")

ERROR:root:KeyboardInterrupt while sending command.               (0 + 10) / 38]
Traceback (most recent call last):
  File "/Users/matthew.powers/opt/miniconda3/envs/pyspark-300-delta-200/lib/python3.9/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/matthew.powers/opt/miniconda3/envs/pyspark-300-delta-200/lib/python3.9/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/Users/matthew.powers/opt/miniconda3/envs/pyspark-300-delta-200/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
delta_df = spark.read.format("delta").load("tmp/delta")

In [None]:
%%time

delta_df.groupby("id3").agg(F.sum("v1"), F.mean("v3")).orderBy(F.col("avg(v3)").desc()).limit(5).show()

## Delta Lake with ZORDER

In [None]:
deltaTable = DeltaTable.forPath(spark, "tmp/delta")

In [None]:
deltaTable.optimize().executeZOrderBy("id3")

In [None]:
optimized_df = spark.read.format("delta").load("tmp/delta")

In [None]:
%%time

optimized_df.groupby("id3").agg(F.sum("v1"), F.mean("v3")).orderBy(F.col("avg(v3)").desc()).limit(5).show()

In [None]:
1+1