# Lab : Spark AQE

Experiment with Spark Adaptive Query Execution (AQE).

References:
- http://blog.madhukaraphatak.com/spark-aqe-part-2/
- https://databricks.com/blog/2020/05/29/adaptive-query-execution-speeding-up-spark-sql-at-runtime.html
- https://docs.databricks.com/spark/latest/spark-sql/aqe.html
- https://docs.databricks.com/_static/notebooks/aqe-demo.html
- https://spark.apache.org/docs/latest/sql-performance-tuning.html#adaptive-query-execution

In [None]:
# Locate the Spark installation first; findspark.init() runs before the
# pyspark imports below (presumably so that pyspark becomes importable —
# see the findspark docs).
import findspark
findspark.init()  # uses SPARK_HOME
print("Spark found in : ", findspark.find())

# Spark entry points used to build the session in this notebook.
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession



# Use a unique temp dir as the warehouse dir, so multiple Spark sessions can
# be run from the same working directory.
# Keep the `tmpdir` reference alive for the lifetime of the session: the
# directory is removed once the TemporaryDirectory object is cleaned up.
import tempfile
tmpdir = tempfile.TemporaryDirectory()

# Build the Spark configuration: local master using all cores, with
# Adaptive Query Execution (AQE) and AQE partition coalescing switched on.
config = (
    SparkConf()
    .setAppName("TestApp")
    .setMaster("local[*]")
    #.setMaster("spark://f96e0987354e:7077")
    # Spark properties must carry the 'spark.' prefix; the bare key
    # 'executor.memory' is not the executor memory setting Spark reads.
    .set('spark.executor.memory', '2g')
    .set('spark.sql.warehouse.dir', tmpdir.name)
    .set('spark.sql.adaptive.enabled', 'true')
    .set('spark.sql.adaptive.coalescePartitions.enabled', 'true')
)

# Show the effective settings, one per line, indented for readability.
print("Spark config:\n\t", config.toDebugString().replace("\n", "\n\t"))

# Create the session (or reuse one that is already running in this process).
spark = SparkSession.builder.config(conf=config).getOrCreate()

# uiWebUrl looks like 'http://<host>:<port>'; the third ':'-separated field is the port.
print('Spark UI running on port ' + spark.sparkContext.uiWebUrl.split(':')[2])

In [None]:
# Check if AQE is enabled: read the setting back from the live session config.
spark.conf.get('spark.sql.adaptive.enabled')

# Optional experiment, left disabled. Presumably this sets a lower bound on the
# number of partitions after AQE coalescing — confirm against the Spark docs.
# spark.conf.set('spark.sql.adaptive.coalescePartitions.minPartitionNum', 1)

## AQE

In [None]:
%%time 
# generate the large clickstream dataset, but only if /data/click-stream/json/ does not already exist


! [ ! -d /data/click-stream/json/ ] && cd /data/click-stream  && python gen-clickstream-json.py 

! ls -lh  /data/click-stream/json/

In [None]:
%%time

# Read the clickstream JSON files (a large table, about 1.4 GB in size),
# then spread the rows across 500 partitions.
raw_clicks = spark.read.json("../data/click-stream/json/")
clickstream = raw_clicks.repartition(500)

# Display the resulting partition count as the cell output.
clickstream.rdd.getNumPartitions()

In [None]:
# Keep only the rows with cost above 100, then count them per domain.
# Nothing is executed yet; this only defines the query.
costly_clicks = clickstream.filter('cost > 100')
count = costly_clicks.groupBy('domain').count()

In [None]:
# Print the extended query plans for `count` before any action has run on it.
count.explain(extended=True)

In [None]:
# Run the aggregation and print the resulting per-domain counts.
count.show()

In [None]:
# Print the plans again now that the query has been executed by show().
# NOTE(review): with AQE enabled the physical plan is expected to reflect
# runtime re-optimization — compare with the explain output from before show().
count.explain(extended=True)