# 0. Setup

In [10]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

In [11]:
# Reuse the active SparkSession if one exists, otherwise create a new one.
spark = SparkSession.builder.getOrCreate()
# Handle to the underlying SparkContext; used below to label jobs via setJobDescription.
sc = spark.sparkContext


In [12]:
# Turn off Adaptive Query Execution (AQE): it applies runtime optimizations in the
# background and creates additional jobs, which would obscure the job counts shown here.
# The key must be spelled exactly; a misspelled key is stored without error and has no effect.
spark.conf.set("spark.sql.adaptive.enabled", "false")
# On Databricks, also turn off the Databricks IO cache, which would cache the table
# and make the results non-repeatable.
spark.conf.set("spark.databricks.io.cache.enabled", "false")

In [13]:
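# Optional: uncomment to point both the PySpark driver and workers at the Python
# interpreter that is running this notebook.
#import os, sys
#os.environ["PYSPARK_PYTHON"] = sys.executable
#os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable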

In [21]:
# Sample rows as (a, b) pairs, expanded into the list of dicts used to build the DataFrame.
# Keys "a", "b" and "e" each occur twice so the later groupBy has groups with more than one row.
d = [
    {"a": key, "b": value}
    for key, value in [
        ("a", 1),
        ("b", 2),
        ("c", 3),
        ("d", 4),
        ("e", 5),
        ("e", 6),
        ("f", 7),
        ("g", 8),
        ("h", 9),
        ("i", 10),
        ("j", 11),
        ("k", 12),
        ("a", 13),
        ("b", 13),
    ]
]

# Explicit DDL schema so column types are fixed rather than inferred.
ddl_schema = "a string, b int"

sdf = spark.createDataFrame(
    d,
    schema=ddl_schema,
)

# 1. Lazy Execution and actions

In [15]:
# Transformation only: this defines the filtered DataFrame (rows with b > 5);
# no action is called in this cell.
sdf_exec = sdf.filter(f.col("b") > 5)

In [16]:
# Label the jobs triggered from here on so they are easy to find in the Spark UI.
sc.setJobDescription("LazyExecution")
# count() is an action: it executes the filter defined above and returns the row count.
sdf_exec.count()

7

# 2. Noop Write

In [None]:
# Label the job, then write the unfiltered DataFrame to the "noop" format.
sc.setJobDescription("NoopWrite")
(
    sdf.write
    .format("noop")
    .mode("overwrite")
    .save()
)

# 3. Narrow transformation with noop write

In [None]:
# Narrow transformation: keep only the rows with b > 5.
sdf_exec = sdf.filter(f.col("b") > 5)

# Label the job, then write the filtered rows to the "noop" format.
sc.setJobDescription("FilterNoopWrite")
(
    sdf_exec.write
    .format("noop")
    .mode("overwrite")
    .save()
)

# 4. Count

In [None]:
# Label the job triggered by the count below.
sc.setJobDescription("FilterCount")
# NOTE(review): the job label says "FilterCount", but this counts the unfiltered `sdf`,
# not the filtered `sdf_exec` — confirm which of the two was intended.
sdf.count()

# 5. Wide transformation

In [23]:
# Wide transformation: group by "a" and count the rows in each group.
# The result column is aliased "sum" although it holds a row count.
sdf_exec = (
    sdf
    .groupBy("a")
    .agg(f.count("*").alias("sum"))
)

# First action: write the aggregate to the "noop" format.
sc.setJobDescription("WideNoop")
(
    sdf_exec.write
    .format("noop")
    .mode("overwrite")
    .save()
)

# Second action: print the aggregated rows.
sc.setJobDescription("WideShow")
sdf_exec.show()

+---+---+
|  a|sum|
+---+---+
|  b|  2|
|  a|  2|
|  d|  1|
|  c|  1|
|  e|  2|
|  g|  1|
|  f|  1|
|  h|  1|
|  i|  1|
|  k|  1|
|  j|  1|
+---+---+



# 6. Wide transformation with AQE

In [24]:
# Turn Adaptive Query Execution (AQE) on so the same wide transformation can be
# run again with AQE enabled. The key must be spelled exactly; a misspelled key is
# stored without error and has no effect.
spark.conf.set("spark.sql.adaptive.enabled", "true")

In [22]:
# Same wide transformation as in section 5: group by "a" and count the rows in each group.
# The result column is aliased "sum" although it holds a row count.
sdf_exec = (
    sdf
    .groupBy("a")
    .agg(f.count("*").alias("sum"))
)

# First action: write the aggregate to the "noop" format.
sc.setJobDescription("WideAQENoop")
(
    sdf_exec.write
    .format("noop")
    .mode("overwrite")
    .save()
)

# Second action: print the aggregated rows.
sc.setJobDescription("WideAQEShow")
sdf_exec.show()

6