# Chapter 7: Optimizing and Tuning Spark Applications
Christoph Windheuser    
May, 2022   
Python examples of chapter 7 (page 173 ff) in the book *Learning Spark*

In [1]:
# Import required python spark libraries
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, when, concat, lit, avg
from pyspark.sql.types import StructType,StructField, StringType, IntegerType


In [2]:
# Create (or reuse) the SparkSession for this notebook.
# Hive support is enabled and the application is named "Chapter_7".
# NOTE: SparkSession must be imported (from pyspark.sql) before this cell runs.

spark = (
    SparkSession.builder
    .enableHiveSupport()
    .appName("Chapter_7")
    .getOrCreate()
)


In [None]:
# Show the content of the environment variable $SPARK_HOME
# (the leading "!" runs the line as a shell command):
!echo $SPARK_HOME

In [None]:
# List all config files in the conf directory below $SPARK_HOME
# (the leading "!" runs the line as a shell command):
!ls -l $SPARK_HOME/conf

In [None]:
# Get a single Spark configuration value:
warehouse_dir = spark.conf.get("spark.sql.warehouse.dir")
print(warehouse_dir)

In [None]:
# Get the whole configuration of the Spark context.
# The context is taken from the session object: no global `sc` is defined
# anywhere in this notebook, so `sc.getConf()` would raise a NameError.
scConf = spark.sparkContext.getConf().getAll()

# Each entry is a (key, value) pair; print it as "key:", value, blank line.
for key, value in scConf:
    print(key + ":")
    print(value)
    print()


In [None]:
# Change a single Spark config variable:
# use the context's default parallelism as the number of shuffle partitions.
default_parallelism = spark.sparkContext.defaultParallelism
spark.conf.set("spark.sql.shuffle.partitions", default_parallelism)

In [None]:
# Show the Spark SQL-specific configs (key and value columns only, untruncated):
sql_configs = spark.sql("SET -v").select("key", "value")
sql_configs.show(truncate=False)

## Spark's Web Interface
To see Spark's Web Interface, go to the web address: http://127.0.0.1:4040    
The tab *Environment* shows all environment variables. In the web interface, the variables are *read-only*, they cannot be modified.

## Set configuration variables in a Spark program

In [None]:
# First check whether a configuration variable can be modified at runtime:

# Example (the result is displayed as the cell output):
spark.conf.isModifiable("spark.sql.shuffle.partitions")

In [None]:
# Get the current value of the variable (displayed as the cell output):
spark.conf.get("spark.sql.shuffle.partitions")

In [None]:
# Set the variable to a new value and read it back to check:
spark.conf.set("spark.sql.shuffle.partitions", 5)
spark.conf.get("spark.sql.shuffle.partitions")

In [None]:
# Set it back to the old value. Earlier in this notebook the variable was set
# to the context's default parallelism, so restore exactly that instead of a
# hard-coded, machine-specific number:
spark.conf.set("spark.sql.shuffle.partitions", spark.sparkContext.defaultParallelism)
spark.conf.get("spark.sql.shuffle.partitions")

## Partitions
Page 181 ff.

In [None]:
# Create a big DataFrame with one million rows:
row_count = 1000 * 1000
numDF = spark.range(row_count)

In [None]:
# Get the default number of partitions of this DataFrame
# (read from its underlying RDD; displayed as the cell output):
numDF.rdd.getNumPartitions()

In [None]:
# Now rebuild the DataFrame with an explicit number of partitions:
target_partitions = 32
numDF = spark.range(1000 * 1000).repartition(target_partitions)

In [None]:
# Check the number of partitions after repartitioning:
numDF.rdd.getNumPartitions()

## Caching of Data
Page 183 ff.

Create a DataFrame with 10M records.  

The time difference (approx. 10x faster) between *Count and load into cache* and *Count in cache*
can only be demonstrated when this code is run for the first time in the notebook. In subsequent executions the DataFrame is already cached and there is basically no time difference.

In [None]:


import time

# Every step is timed with time.perf_counter(): a monotonic, high-resolution
# clock meant for measuring durations. time.time() is wall-clock time and can
# jump when the system clock is adjusted, which would distort the results.

# Step 1: define a DataFrame with 10M ids.
start = time.perf_counter()
df = spark.range(1 * 10000000).toDF("id")
end = time.perf_counter()
print("Step 1 - Create:                    %f seconds" %(end - start))

# Step 2: add a derived column holding the square of the id.
start = time.perf_counter()
df = df.withColumn("square", df.id * df.id)
end = time.perf_counter()
print("Step 2 - Add Column:                %f seconds" %(end - start))

# Step 3: request caching of the DataFrame.
start = time.perf_counter()
df.cache()
end = time.perf_counter()
print("Step 3 - Cache df:                  %f seconds" %(end - start))

# Step 4: the first count() also loads the data into the cache.
start = time.perf_counter()
df.count()
end = time.perf_counter()
print("Step 4 - Count and load into cache: %f seconds" %(end - start))

# Step 5: the second count() is served from the cache.
start = time.perf_counter()
df.count()
end = time.perf_counter()
print("Step 5 - Count in cache:            %f seconds" %(end - start))


### Caching Tables and Views in SQL
It is also possible to cache tables or views:

In [None]:
# Register the DataFrame as a temporary view, cache it via SQL,
# then count its rows through the view.
df.createOrReplaceTempView("dfTable")
spark.sql("CACHE TABLE dfTable")
cached_row_count = spark.sql("SELECT count(*) FROM dfTable")
cached_row_count.show()

## Persistence of Data
Page 184 ff

Persisting data is synonymous with caching data, but lets you specify how the data is persisted with the parameter `pyspark.StorageLevel.LEVEL`. 

As we have specified persistence on disk only, the time difference is much lower compared to the example above (this time approx. 5x faster compared to 12x faster above). Under the link http://127.0.0.1:4040/ you can see that the data is persisted on disk and not in memory for all partitions.

In [None]:
# Same experiment as the caching example, but persisting to disk only.
# Durations are measured with time.perf_counter(), a monotonic clock intended
# for timing intervals (time.time() is wall-clock time and may jump).
# Relies on `time` having been imported by an earlier cell.

# Step 1: define a DataFrame with 10M ids.
start = time.perf_counter()
df2 = spark.range(1 * 10000000).toDF("id")
end = time.perf_counter()
print("Step 1 - Create:                    %f seconds" %(end - start))

# Step 2: add a derived column holding the square of the id.
start = time.perf_counter()
df2 = df2.withColumn("square", df2.id * df2.id)
end = time.perf_counter()
print("Step 2 - Add Column:                %f seconds" %(end - start))

# Step 3: request persistence with storage level DISK_ONLY.
start = time.perf_counter()
df2.persist(storageLevel=pyspark.StorageLevel.DISK_ONLY)
end = time.perf_counter()
print("Step 3 - Persist df DISK_ONLY:      %f seconds" %(end - start))

# Step 4: the first count() also writes the persisted data.
start = time.perf_counter()
df2.count()
end = time.perf_counter()
print("Step 4 - Count and load into cache: %f seconds" %(end - start))

# Step 5: the second count() reads the persisted data.
start = time.perf_counter()
df2.count()
end = time.perf_counter()
print("Step 5 - Count in cache:            %f seconds" %(end - start))


## Shuffle Sort Merge Join (SMJ)
Page 189 ff.

In [3]:
# randint is used further down to generate the synthetic users and orders.
from random import randint

# Disable broadcast joins by setting the auto-broadcast threshold to -1.
# The physical plan printed later in this section shows a SortMergeJoin.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")


In [4]:
# Lookup values used to generate the synthetic data for the two DataFrames:
# six US state codes and five item SKUs (SKU-0 ... SKU-4).
states = "AZ CO CA TX NY MI".split()
items = ["SKU-{}".format(sku_no) for sku_no in range(5)]


In [5]:
# Every column in both schemas is a nullable string, so each schema is built
# from its ordered list of column names.
usersDF_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ("uid", "login", "email", "user_state")
])

ordersDF_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "transaction_id",
        "quantity",
        "users_id",
        "amount",
        "state",
        "items",
    )
])


In [6]:
def _make_user_row(user_no):
    """Return one synthetic user as a (uid, login, email, user_state) tuple of strings."""
    login_name = "user_{}".format(user_no)
    email = login_name + "@databricks.com"
    # Pick one of the six states at random.
    return (str(user_no), login_name, email, states[randint(0, 5)])


# 100,000 synthetic users with uids "0" .. "99999".
usersDF_data = [_make_user_row(user_no) for user_no in range(100000)]


In [7]:
# Turn the list of user tuples into a DataFrame with the explicit users schema.
usersDF = spark.createDataFrame(usersDF_data, schema=usersDF_schema)

In [8]:
# Preview the first 10 users.
usersDF.show(10)

+---+------+--------------------+----------+
|uid| login|               email|user_state|
+---+------+--------------------+----------+
|  0|user_0|user_0@databricks...|        MI|
|  1|user_1|user_1@databricks...|        CA|
|  2|user_2|user_2@databricks...|        CO|
|  3|user_3|user_3@databricks...|        NY|
|  4|user_4|user_4@databricks...|        NY|
|  5|user_5|user_5@databricks...|        CO|
|  6|user_6|user_6@databricks...|        CO|
|  7|user_7|user_7@databricks...|        MI|
|  8|user_8|user_8@databricks...|        AZ|
|  9|user_9|user_9@databricks...|        CA|
+---+------+--------------------+----------+
only showing top 10 rows



In [9]:
# Build 100,000 synthetic orders. Every value is converted to a string because
# all columns of ordersDF_schema are string columns.
# Note: users_id is drawn from 0..9999 only, so orders reference just the
# first 10,000 of the 100,000 generated users.
# (The unused per-iteration `login` string of the original loop was removed.)
ordersDF_data = []
for _ in range(100000):
    ordersDF_row = (
        str(randint(100000, 999999)),      # transaction_id
        str(randint(1, 100)),              # quantity
        str(randint(0, 9999)),             # users_id
        str((randint(10, 9999)/3.14)),     # amount
        states[randint(0, 5)],             # state
        items[randint(0, 4)],              # items
    )
    ordersDF_data.append(ordersDF_row)


In [10]:
# Turn the list of order tuples into a DataFrame with the explicit orders schema.
ordersDF = spark.createDataFrame(ordersDF_data, schema=ordersDF_schema)


In [11]:
# Preview the first 10 orders.
ordersDF.show(10)

+--------------+--------+--------+------------------+-----+-----+
|transaction_id|quantity|users_id|            amount|state|items|
+--------------+--------+--------+------------------+-----+-----+
|        316601|       6|    5767|3065.9235668789806|   MI|SKU-1|
|        188678|      81|    3768|2654.7770700636943|   TX|SKU-1|
|        232450|      65|    7670|2457.9617834394903|   CO|SKU-3|
|        104382|      11|    9711| 2914.012738853503|   TX|SKU-1|
|        183808|      26|    7283|1225.4777070063694|   TX|SKU-4|
|        495994|      56|     808|1615.2866242038217|   MI|SKU-1|
|        455218|      93|    2538| 59.23566878980891|   CO|SKU-2|
|        129649|      67|     900| 2325.796178343949|   TX|SKU-3|
|        331143|      64|    5184| 605.0955414012739|   TX|SKU-1|
|        653319|      18|    7037|414.01273885350315|   CO|SKU-0|
+--------------+--------+--------+------------------+-----+-----+
only showing top 10 rows



In [12]:
# Join every order to its user: orders.users_id must equal users.uid.
users_orders_condition = ordersDF["users_id"] == usersDF["uid"]
usersOrdersDF = ordersDF.join(usersDF, users_orders_condition)

In [13]:
# Display the first rows of the joined result (this triggers the join).
usersOrdersDF.show()

+--------------+--------+--------+------------------+-----+-----+----+---------+--------------------+----------+
|transaction_id|quantity|users_id|            amount|state|items| uid|    login|               email|user_state|
+--------------+--------+--------+------------------+-----+-----+----+---------+--------------------+----------+
|        787788|      60|    1008|1288.8535031847134|   TX|SKU-4|1008|user_1008|user_1008@databri...|        MI|
|        947098|      27|    1008|3172.9299363057326|   TX|SKU-2|1008|user_1008|user_1008@databri...|        MI|
|        802590|       5|    1008| 3167.515923566879|   NY|SKU-0|1008|user_1008|user_1008@databri...|        MI|
|        327971|       3|    1008| 1220.063694267516|   CA|SKU-1|1008|user_1008|user_1008@databri...|        MI|
|        284095|      16|    1008| 618.7898089171974|   CO|SKU-3|1008|user_1008|user_1008@databri...|        MI|
|        783847|      39|    1008|2329.2993630573246|   MI|SKU-3|1008|user_1008|user_1008@databr

In [14]:
# Print the physical plan of the join. In the output below it is a
# SortMergeJoin with an Exchange (shuffle) and a Sort on each side.
usersOrdersDF.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- SortMergeJoin [users_id#27], [uid#0], Inner
   :- Sort [users_id#27 ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(users_id#27, 200), ENSURE_REQUIREMENTS, [id=#136]
   :     +- Filter isnotnull(users_id#27)
   :        +- Scan ExistingRDD[transaction_id#25,quantity#26,users_id#27,amount#28,state#29,items#30]
   +- Sort [uid#0 ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(uid#0, 200), ENSURE_REQUIREMENTS, [id=#137]
         +- Filter isnotnull(uid#0)
            +- Scan ExistingRDD[uid#0,login#1,email#2,user_state#3]




## Optimizing the Shuffle Sort Merge Join
Page 193 ff.

In [15]:
# Save the users as a Parquet table "UserTbl": ordered by uid, bucketed into
# 8 buckets on uid, replacing any existing table of that name.
users_sorted = usersDF.orderBy(col("uid").asc())
users_writer = (
    users_sorted.write
    .format("parquet")
    .bucketBy(8, "uid")
    .mode("overWrite")
)
users_writer.saveAsTable("UserTbl")

In [16]:
# Save the orders as a Parquet table "OrderTbl": ordered by users_id, bucketed
# into 8 buckets on users_id, replacing any existing table of that name.
orders_sorted = ordersDF.orderBy(col("users_id").asc())
orders_writer = (
    orders_sorted.write
    .format("parquet")
    .bucketBy(8, "users_id")
    .mode("overWrite")
)
orders_writer.saveAsTable("OrderTbl")

In [17]:
# Cache both bucketed tables via SQL. The plan printed further down scans
# OrderTbl as an in-memory table.
spark.sql("CACHE TABLE UserTbl")
spark.sql("CACHE TABLE OrderTbl")

DataFrame[]

In [18]:
# Read the two bucketed tables back as DataFrames.
userBucketDF, orderBucketDF = spark.table("UserTbl"), spark.table("OrderTbl")


In [19]:
# Same join as before, this time on the bucketed tables: users_id == uid.
bucket_join_condition = orderBucketDF["users_id"] == userBucketDF["uid"]
joinUserOrderBucketDF = orderBucketDF.join(userBucketDF, bucket_join_condition)


In [20]:
# Display the first rows of the join over the bucketed tables.
joinUserOrderBucketDF.show()

+--------------+--------+--------+------------------+-----+-----+----+---------+--------------------+----------+
|transaction_id|quantity|users_id|            amount|state|items| uid|    login|               email|user_state|
+--------------+--------+--------+------------------+-----+-----+----+---------+--------------------+----------+
|        555381|      49|       1| 678.9808917197452|   CA|SKU-2|   1|   user_1|user_1@databricks...|        CA|
|        729101|      17|       1| 721.3375796178344|   CO|SKU-3|   1|   user_1|user_1@databricks...|        CA|
|        213429|      76|       1| 1529.936305732484|   CA|SKU-3|   1|   user_1|user_1@databricks...|        CA|
|        809826|      24|       1|2372.6114649681526|   CA|SKU-4|   1|   user_1|user_1@databricks...|        CA|
|        504720|      30|       1|447.77070063694265|   NY|SKU-1|   1|   user_1|user_1@databricks...|        CA|
|        336000|      66|       1| 575.4777070063694|   AZ|SKU-1|   1|   user_1|user_1@databrick

In [21]:
# Print the physical plan of the bucketed join. In the (truncated) output
# below, the OrderTbl side of the SortMergeJoin is read from the in-memory
# table and shows no Exchange step.
joinUserOrderBucketDF.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- SortMergeJoin [users_id#292], [uid#153], Inner
   :- Sort [users_id#292 ASC NULLS FIRST], false, 0
   :  +- Filter isnotnull(users_id#292)
   :     +- Scan In-memory table OrderTbl [transaction_id#290, quantity#291, users_id#292, amount#293, state#294, items#295], [isnotnull(users_id#292)]
   :           +- InMemoryRelation [transaction_id#290, quantity#291, users_id#292, amount#293, state#294, items#295], StorageLevel(disk, memory, deserialized, 1 replicas)
   :                 +- *(1) ColumnarToRow
   :                    +- FileScan parquet default.ordertbl[transaction_id#290,quantity#291,users_id#292,amount#293,state#294,items#295] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/Users/cwi/Dev/LearningSpark/spark-warehouse/ordertbl], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<transaction_id:string,quantity:string,users_id:string,amount:string,state:string,items:str

### Visualization in Spark UI
1. Go to http://localhost:4040
2. Under the *Jobs* tab, click on the latest job
3. Open *DAG Visualization*
4. Click inside a block to get the details
