# 0. Set-Ups

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import pyspark

In [2]:
spark = SparkSession.builder.master("local[3]").config("spark.sql.adapative.enabled", "false").enableHiveSupport().getOrCreate()
sc = spark.sparkContext


In [4]:
sc.getConf().getAll()


[('spark.driver.host', 'DESKTOP-PNH8CDK'),
 ('spark.app.id', 'local-1704834054160'),
 ('spark.app.submitTime', '1704834052500'),
 ('spark.master', 'local[3]'),
 ('spark.executor.id', 'driver'),
 ('spark.app.name', 'pyspark-shell'),
 ('spark.driver.extraJavaOptions',
  '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jg

In [17]:
conf = pyspark.SparkConf().setAll([("spark.sql.adapative.enabled", "false")])
sc.stop()
sc = pyspark.SparkContext(conf=conf)


In [3]:
#Turning off AQE as it does some optimizations in the background and creates more jobs
spark.conf.set("spark.sql.adapative.enabled", "false")
#If in databricks turn off Databricks IO chaching which would cache table and not create repeatable results
spark.conf.set("spark.databricks.io.cache.enabled", "false")

In [11]:
spark.conf.getOption("spark.sql.adapative.enabled")

AttributeError: 'RuntimeConfig' object has no attribute 'getOption'

In [5]:
import os, sys
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

In [18]:
spark.conf.get("spark.sql.adapative.enabled")

'false'

In [4]:
d = [
    {"a":"a", "b": 1},
    {"a":"b", "b": 2},
    {"a":"c", "b": 3},
    {"a":"d", "b": 4},
    {"a":"e", "b": 5},
    {"a":"e", "b": 6},
    {"a":"f", "b": 7},
    {"a":"g", "b": 8},
    {"a":"h", "b": 9},
    {"a":"i", "b": 10},
    {"a":"j", "b": 11},
    {"a":"k", "b": 12},
    {"a":"a", "b": 13},
    {"a":"b", "b": 13},
]
ddl_schema = "a string, b int"
sdf = spark.createDataFrame(d, schema=ddl_schema)

# 1. Lazy Execution and actions

In [9]:
sdf_exec = sdf.filter(f.col("b") > 5)

In [10]:
sc.setJobDescription("LazyExecution")
sdf_exec.count()

9

In [11]:
sdf_exec.rdd.getNumPartitions()

4

# 2. Noop Write

In [9]:
sc.setJobDescription("NoopWrite")
sdf.write.format("noop").mode("overwrite").save()

# 3. Narrow transformation with noop write

In [10]:
sc.setJobDescription("FilterNoopWrite")
sdf_exec = sdf.filter(f.col("b") > 5)
sdf_exec.write.format("noop").mode("overwrite").save()

# 4. Count

In [11]:
sc.setJobDescription("Count")
sdf.count()

14

# 5. Wide transformation

In [5]:
spark.conf.set("spark.sql.adapative.enabled", "true")
sdf_exec = sdf.groupBy("a").agg(f.count("*").alias("sum"))
sc.setJobDescription("WideNoop")
sdf_exec.write.format("noop").mode("overwrite").save()
#sc.setJobDescription("WideShow")
#sdf_exec.show()

In [6]:
sdf_exec.rdd.getNumPartitions()

1

# 6. Wide transformation with AQE

In [13]:
spark.conf.set("spark.sql.adapative.enabled", "true")

In [14]:
sdf_exec = sdf.groupBy("a").agg(f.count("*").alias("sum"))
sc.setJobDescription("WideAQENoop")
sdf_exec.write.format("noop").mode("overwrite").save()
sc.setJobDescription("WideAQEShow")
sdf_exec.show()

+---+---+
|  a|sum|
+---+---+
|  b|  2|
|  a|  2|
|  d|  1|
|  c|  1|
|  e|  2|
|  g|  1|
|  f|  1|
|  h|  1|
|  i|  1|
|  k|  1|
|  j|  1|
+---+---+



In [12]:
sdf_exec.rdd.getNumPartitions()

1