# 0. Setup

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import pyspark

In [2]:
# Create (or reuse) the Spark session for this notebook: local master with
# 3 worker threads and Hive support enabled.
spark = (
    SparkSession.builder
    .appName("Data with Nikk the Greek Spark Session")
    .master("local[3]")
    .enableHiveSupport()
    .getOrCreate()
)

# Handle on the underlying SparkContext; used in later cells to label jobs.
sc = spark.sparkContext

In [3]:
# Turn off Adaptive Query Execution (AQE): it generates additional jobs, which would be confusing for this scenario.
spark.conf.set("spark.sql.adaptive.enabled", "false")
# Disable the IO cache so DataFrames are not cached; with caching, results may not be repeatable.
# NOTE(review): this is a Databricks-specific key - presumably it has no effect on the local[3] session above; confirm.
spark.conf.set("spark.databricks.io.cache.enabled", "false")

In [4]:
# Sample rows for the demo, built from (a, b) pairs.
# The keys "a", "b" and "e" each occur twice, so grouping by "a" yields 11 groups.
d = [
    {"a": letter, "b": number}
    for letter, number in [
        ("a", 1), ("b", 2), ("c", 3), ("d", 4), ("e", 5), ("e", 6), ("f", 7),
        ("g", 8), ("h", 9), ("i", 10), ("j", 11), ("k", 12), ("a", 13), ("b", 13),
    ]
]

# Explicit DDL schema: column "a" is a string, column "b" an int.
ddl_schema = "a string, b int"
sdf = spark.createDataFrame(data=d, schema=ddl_schema)

In [5]:
# Number of partitions backing sdf (recorded output: 3).
# NOTE(review): presumably this follows from the local[3] master - confirm.
sdf.rdd.getNumPartitions()

3

# 1. Lazy Execution and actions

In [6]:
# Only defines the filter (rows with b > 5); as a transformation it is evaluated lazily,
# so the work happens when an action such as count() is called on sdf_lazy.
sdf_lazy = sdf.filter(f.col("b") > 5)

In [7]:
# Label the jobs started from here on so they are easy to find in the Spark UI.
# The label must be set before the action below.
sc.setJobDescription("LazyExecution")
# count() is an action: it executes the filter defined on sdf_lazy (recorded output: 9).
sdf_lazy.count()

9

In [8]:
# Partition count after the filter (recorded output: 3, the same as for sdf).
sdf_lazy.rdd.getNumPartitions()

3

# 2. Noop write

In [9]:
# Label the job, then write sdf to the "noop" format.
# NOTE(review): "noop" is understood to run the full plan and discard the rows - confirm.
sc.setJobDescription("NoopWrite")
(
    sdf.write
    .format("noop")
    .mode("overwrite")
    .save()
)

# 3. Narrow transformation with noop write

In [10]:
# Narrow transformation (filter on b > 5) followed by a write to the "noop" format.
sc.setJobDescription("FilterNoopWrite")
sdf_narrowNoop = sdf.filter(f.col("b") > 5)
(
    sdf_narrowNoop.write
    .format("noop")
    .mode("overwrite")
    .save()
)

# 4. Count

In [11]:
# Label the job before triggering it.
sc.setJobDescription("Count")
# Action on the unfiltered DataFrame (recorded output: 14, one per input row).
sdf.count()

14

# 5. Wide transformation

In [12]:
# Keep AQE disabled for this section: it would generate extra jobs and make
# the job list harder to follow for this scenario.
spark.conf.set("spark.sql.adaptive.enabled", "false")

# Wide transformation: group by "a" and count the rows per key,
# then run it by writing to the "noop" format.
sc.setJobDescription("Wide")
sdf_w = sdf.groupBy("a").count()
(
    sdf_w.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [13]:
# Label the job before triggering it.
sc.setJobDescription("WideShow")
# show() is an action: it runs the aggregation again and prints the per-key counts.
sdf_w.show()

+---+-----+
|  a|count|
+---+-----+
|  g|    1|
|  f|    1|
|  k|    1|
|  e|    2|
|  h|    1|
|  d|    1|
|  c|    1|
|  i|    1|
|  j|    1|
|  b|    2|
|  a|    2|
+---+-----+



In [14]:
# Partition count of the aggregated DataFrame with AQE disabled (recorded output: 200,
# the same value the next cell reports for spark.sql.shuffle.partitions).
sdf_w.rdd.getNumPartitions()

200

In [15]:
# Read the configured number of shuffle partitions (recorded output: '200', returned as a string).
spark.conf.get("spark.sql.shuffle.partitions")

'200'

# 6. Wide transformation with AQE

In [16]:
# Turn Adaptive Query Execution (AQE) on to compare against the non-AQE run in section 5.
spark.conf.set("spark.sql.adaptive.enabled", "true")

# Use a job description distinct from section 5's "Wide"; with the same label the
# AQE and non-AQE jobs cannot be told apart in the Spark UI.
sc.setJobDescription("WideAQE")
# Same wide transformation as in section 5: group by "a" and count the rows per key.
sdf_w_aqe = sdf.groupBy("a").count()
# Run the aggregation by writing to the "noop" format.
sdf_w_aqe.write.format("noop").mode("overwrite").save()

In [17]:
# Use a job description distinct from section 5's "WideShow" so that this AQE run
# can be identified separately in the Spark UI.
sc.setJobDescription("WideShowAQE")
# show() is an action: it runs the aggregation again and prints the per-key counts.
sdf_w_aqe.show()

+---+-----+
|  a|count|
+---+-----+
|  d|    1|
|  c|    1|
|  b|    2|
|  a|    2|
|  g|    1|
|  f|    1|
|  e|    2|
|  k|    1|
|  h|    1|
|  i|    1|
|  j|    1|
+---+-----+



In [18]:
# Partition count of the aggregated DataFrame with AQE enabled (recorded output: 1,
# versus 200 with AQE disabled).
# NOTE(review): presumably AQE coalesced the small shuffle partitions - confirm.
sdf_w_aqe.rdd.getNumPartitions()

1