# Speed up Spark with Delta

In [24]:
import pyspark
import pyspark.sql.functions as F
from delta import *

In [25]:
builder = (
    pyspark.sql.SparkSession.builder.appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

In [26]:
spark = configure_spark_with_delta_pip(builder).getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

In [27]:
single_csv_df = spark.read.option("header", True).csv("../../tmp/G1_1e8_1e2_0_0.csv")

In [28]:
single_csv_df.groupby("id3").agg(F.sum("v1"), F.mean("v3")).limit(5).show()



+------------+-------+------------------+
|         id3|sum(v1)|           avg(v3)|
+------------+-------+------------------+
|id0000156724|  309.0|52.661497952380955|
|id0000786281|  336.0|53.478915250000014|
|id0000541395|  314.0| 50.35518390909091|
|id0000929333|  270.0| 49.81497536263737|
|id0000235059|  297.0|50.964993889999995|
+------------+-------+------------------+



                                                                                

## Single uncompressed CSV file

In [29]:
%%time

single_csv_df.groupby("id3").agg(F.sum("v1"), F.mean("v3")).orderBy(F.col("avg(v3)").desc()).limit(5).show()



+------------+-------+-----------------+
|         id3|sum(v1)|          avg(v3)|
+------------+-------+-----------------+
|id0000600028|  297.0|64.44938332989692|
|id0000575479|  220.0|63.40989807792206|
|id0000976055|  310.0| 63.4078843069307|
|id0000966474|  291.0|63.38104397802196|
|id0000994485|  255.0|63.15061278260869|
+------------+-------+-----------------+

CPU times: user 21.7 ms, sys: 9.85 ms, total: 31.6 ms
Wall time: 1min 6s


                                                                                

## Multiple CSV files

In [30]:
single_csv_df.rdd.getNumPartitions()

39

In [31]:
single_csv_df.write.option("header", True).csv("tmp/csvs")

                                                                                

In [32]:
csv_df = spark.read.option("header", True).csv("tmp/csvs")

In [33]:
%%time

csv_df.groupby("id3").agg(F.sum("v1"), F.mean("v3")).orderBy(F.col("avg(v3)").desc()).limit(5).show()



+------------+-------+------------------+
|         id3|sum(v1)|           avg(v3)|
+------------+-------+------------------+
|id0000600028|  297.0|  64.4493833298969|
|id0000575479|  220.0| 63.40989807792206|
|id0000976055|  310.0| 63.40788430693068|
|id0000966474|  291.0| 63.38104397802197|
|id0000994485|  255.0|63.150612782608704|
+------------+-------+------------------+

CPU times: user 23.9 ms, sys: 10.3 ms, total: 34.2 ms
Wall time: 1min 6s


                                                                                

## Multiple Parquet files

In [34]:
csv_df.write.parquet("tmp/parquet")

                                                                                

In [35]:
parquet_df = spark.read.parquet("tmp/parquet")

In [36]:
%%time

parquet_df.groupby("id3").agg(F.sum("v1"), F.mean("v3")).orderBy(F.col("avg(v3)").desc()).limit(5).show()



+------------+-------+------------------+
|         id3|sum(v1)|           avg(v3)|
+------------+-------+------------------+
|id0000600028|  297.0|  64.4493833298969|
|id0000575479|  220.0| 63.40989807792207|
|id0000976055|  310.0|  63.4078843069307|
|id0000966474|  291.0| 63.38104397802197|
|id0000994485|  255.0|63.150612782608704|
+------------+-------+------------------+

CPU times: user 24.9 ms, sys: 10.9 ms, total: 35.8 ms
Wall time: 1min 28s


                                                                                

## Delta Lake

In [37]:
parquet_df.write.format("delta").save("tmp/delta")

Exception in thread "refresh progress" java.lang.OutOfMemoryError: Java heap space
Exception in thread "netty-rpc-env-timeout" java.lang.OutOfMemoryError: Java heap space
22/08/19 13:54:39 ERROR Utils: uncaught error in thread spark-listener-group-appStatus, stopping SparkContext
java.lang.OutOfMemoryError: Java heap space
22/08/19 13:54:39 ERROR Utils: Uncaught exception in thread executor-heartbeater
java.lang.OutOfMemoryError: Java heap space
22/08/19 13:54:39 ERROR Utils: Uncaught exception in thread driver-heartbeater
java.lang.OutOfMemoryError: Java heap space
22/08/19 13:54:39 ERROR Utils: uncaught error in thread Spark Context Cleaner, stopping SparkContext
java.lang.OutOfMemoryError: Java heap space
22/08/19 13:54:39 ERROR Executor: Exception in task 6.0 in stage 78.0 (TID 759)
java.lang.OutOfMemoryError: Java heap space
22/08/19 13:54:39 ERROR Executor: Exception in task 9.0 in stage 78.0 (TID 762)
org.apache.spark.SparkException: Task failed while writing rows.
	at org.apach

ConnectionRefusedError: [Errno 61] Connection refused

In [None]:
delta_df = spark.read.format("delta").load("tmp/delta")

In [None]:
%%time

delta_df.groupby("id3").agg(F.sum("v1"), F.mean("v3")).orderBy(F.col("avg(v3)").desc()).limit(5).show()

## Delta Lake with ZORDER

In [None]:
deltaTable = DeltaTable.forPath(spark, "tmp/delta")

In [None]:
deltaTable.optimize().executeZOrderBy("id3")

In [None]:
optimized_df = spark.read.format("delta").load("tmp/delta")

In [None]:
%%time

optimized_df.groupby("id3").agg(F.sum("v1"), F.mean("v3")).orderBy(F.col("avg(v3)").desc()).limit(5).show()

In [None]:
1+1

## Cleanup

In [None]:
%rm -rf tmp

In [None]:
%rm -rf spark-warehouse

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/Users/matthew.powers/opt/miniconda3/envs/pyspark-300-delta-200/lib/python3.9/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/matthew.powers/opt/miniconda3/envs/pyspark-300-delta-200/lib/python3.9/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/matthew.powers/opt/miniconda3/envs/pyspark-300-delta-200/lib/python3.9/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
