# Chapter 7: Optimizing and Tuning Spark Applications
Christoph Windheuser    
May, 2022   
Python examples of chapter 7 (page 173 ff) in the book *Learning Spark*

In [1]:
# Import required python spark libraries
import pyspark
from pyspark.sql import SparkSession


In [2]:
# Create (or reuse) a SparkSession with Hive support enabled.
# `SparkSession` comes from `pyspark.sql` and must be imported in the import cell.
# The builder chain sits inside parentheses, so no line-continuation backslashes are needed.
# NOTE(review): getOrCreate() hands back an already-running session if there is one;
# the config dump further down shows spark.app.name = PySparkShell, so the appName
# given here presumably did not take effect in the recorded run -- confirm.
spark = (
    SparkSession.builder
    .enableHiveSupport()
    .appName("Chapter_7")
    .getOrCreate()
)


In [4]:
# Show the content of the environment variable $SPARK_HOME
# (the leading `!` runs the rest of the line as a shell command):
!echo $SPARK_HOME

/opt/spark


In [6]:
# List all files in Spark's configuration directory ($SPARK_HOME/conf):
!ls -l $SPARK_HOME/conf

total 44
-rw-r--r-- 1 christoph christoph 1105 Jan 20 21:10 fairscheduler.xml.template
-rw-r--r-- 1 christoph christoph 2471 Mai  5 15:49 log4j.properties
-rw-r--r-- 1 christoph christoph 2471 Jan 20 21:10 log4j.properties.template
-rw-r--r-- 1 christoph christoph 9141 Jan 20 21:10 metrics.properties.template
-rw-r--r-- 1 christoph christoph 1353 Mai  5 15:57 spark-defaults.conf
-rw-r--r-- 1 christoph christoph 1292 Jan 20 21:10 spark-defaults.conf.template
-rwxr-xr-x 1 christoph christoph 4428 Jan 20 21:10 spark-env.sh.template
-rw-r--r-- 1 christoph christoph  865 Jan 20 21:10 workers.template


In [7]:
# Get a single Spark configuration value (here: the SQL warehouse directory):
print(spark.conf.get("spark.sql.warehouse.dir"))

file:/home/christoph/Dev/LearningSpark/spark-warehouse


In [15]:
# Get the whole configuration of the SparkContext as (key, value) pairs.
# The context is taken from the session (`spark.sparkContext`) instead of a bare
# `sc`, which this notebook never defines itself.
scConf = spark.sparkContext.getConf().getAll()

# Print every setting as "key:" followed by its value and an empty separator line.
for key, value in scConf:
    print(key + ":")
    print(value)
    print()


spark.jars.packages:
org.apache.spark:spark-avro_2.12:3.2.1

spark.jars:
file:///home/christoph/.ivy2/jars/org.apache.spark_spark-avro_2.12-3.2.1.jar,file:///home/christoph/.ivy2/jars/org.tukaani_xz-1.8.jar,file:///home/christoph/.ivy2/jars/org.spark-project.spark_unused-1.0.0.jar

spark.app.startTime:
1652550501773

spark.executor.id:
driver

spark.app.id:
local-1652550502663

spark.app.name:
PySparkShell

spark.files:
file:///home/christoph/.ivy2/jars/org.apache.spark_spark-avro_2.12-3.2.1.jar,file:///home/christoph/.ivy2/jars/org.tukaani_xz-1.8.jar,file:///home/christoph/.ivy2/jars/org.spark-project.spark_unused-1.0.0.jar

spark.driver.host:
192.168.0.125

spark.driver.port:
35787

spark.sql.catalogImplementation:
hive

spark.rdd.compress:
True

spark.app.initial.jar.urls:
spark://192.168.0.125:35787/jars/org.apache.spark_spark-avro_2.12-3.2.1.jar,spark://192.168.0.125:35787/jars/org.spark-project.spark_unused-1.0.0.jar,spark://192.168.0.125:35787/jars/org.tukaani_xz-1.8.jar

spark.

In [16]:
# Change a single Spark config variable:
# set the number of SQL shuffle partitions to the SparkContext's default parallelism.
spark.conf.set("spark.sql.shuffle.partitions", spark.sparkContext.defaultParallelism)

In [21]:
# Show the Spark SQL-specific configs (key and value columns only, untruncated):
spark.sql("SET -v").select("key", "value").show(truncate=False)

+-------------------------------------------------------------+----------------------------------------------------------------+
|key                                                          |value                                                           |
+-------------------------------------------------------------+----------------------------------------------------------------+
|spark.sql.adaptive.advisoryPartitionSizeInBytes              |<value of spark.sql.adaptive.shuffle.targetPostShuffleInputSize>|
|spark.sql.adaptive.autoBroadcastJoinThreshold                |<undefined>                                                     |
|spark.sql.adaptive.coalescePartitions.enabled                |true                                                            |
|spark.sql.adaptive.coalescePartitions.initialPartitionNum    |<undefined>                                                     |
|spark.sql.adaptive.coalescePartitions.minPartitionSize       |1MB                               

## Spark's Web Interface
To see Spark's Web Interface, go to the web address: http://127.0.0.1:4040    
The tab *Environment* shows all environment variables. In the web interface, the variables are *read-only*; they cannot be modified.

## Set configuration variables in a Spark program

In [23]:
# First, check whether a configuration variable can be modified in the current session:

# Example:
spark.conf.isModifiable("spark.sql.shuffle.partitions")

True

In [24]:
# Get the current value of the variable:
spark.conf.get("spark.sql.shuffle.partitions")

'16'

In [25]:
# Set the variable to a new value and read it back to check:
spark.conf.set("spark.sql.shuffle.partitions", 5)
spark.conf.get("spark.sql.shuffle.partitions")

'5'

In [26]:
# Set it back to the old value and read it back to check.
# The old value was not a fixed number: it was set earlier in this notebook from the
# SparkContext's default parallelism (16 on the machine that produced the saved
# output), so restore it from the same source instead of hard-coding 16.
spark.conf.set("spark.sql.shuffle.partitions", spark.sparkContext.defaultParallelism)
spark.conf.get("spark.sql.shuffle.partitions")

'16'