In [13]:
from pyspark.sql import (
    functions as f,
    SparkSession,
    types as t
)

# Obtain the Spark session for this notebook (getOrCreate returns the
# already-active session when there is one).
spark = (
    SparkSession.builder
    .appName("logical_plan_and_physical_plan")
    .getOrCreate()
)

# ---------------------------------------------------------------
# Code block 1: load the sample text file and inspect its plan
# ---------------------------------------------------------------
# Local-filesystem URI of the sample input file.
file_path = "file:///home/jovyan/work/sample/lorem_ipsum.txt"

# The text source exposes a single string column named `value`
# (see ReadSchema in the plan printed below).
df = spark.read.text(file_path)

# "formatted" mode prints the operator tree followed by per-operator details.
df.explain(mode="formatted")

== Physical Plan ==
Scan text  (1)


(1) Scan text 
Output [1]: [value#134]
Batched: false
Location: InMemoryFileIndex [file:/home/jovyan/work/sample/lorem_ipsum.txt]
ReadSchema: struct<value:string>




In [17]:
# ---------------------------------------------------------------
# Code block 2: word count and its logical / physical plans
# ---------------------------------------------------------------
# Split every `value` string on a single space character ...
tokens = f.split(df["value"], ' ')

# ... and explode the resulting array so each token becomes its own row.
words = df.select(f.explode(tokens).alias("word"))

# Number of rows per distinct token.
word_counts = words.groupBy("word").count()

# Extended mode prints the parsed, analyzed and optimized logical plans
# as well as the physical plan.
word_counts.explain(mode="extended")

== Parsed Logical Plan ==
'Aggregate ['word], ['word, count(1) AS count#160L]
+- Project [word#156]
   +- Generate explode(split(value#134,  , -1)), false, [word#156]
      +- Relation [value#134] text

== Analyzed Logical Plan ==
word: string, count: bigint
Aggregate [word#156], [word#156, count(1) AS count#160L]
+- Project [word#156]
   +- Generate explode(split(value#134,  , -1)), false, [word#156]
      +- Relation [value#134] text

== Optimized Logical Plan ==
Aggregate [word#156], [word#156, count(1) AS count#160L]
+- Generate explode(split(value#134,  , -1)), [0], false, [word#156]
   +- Relation [value#134] text

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[word#156], functions=[count(1)], output=[word#156, count#160L])
   +- Exchange hashpartitioning(word#156, 200), ENSURE_REQUIREMENTS, [plan_id=342]
      +- HashAggregate(keys=[word#156], functions=[partial_count(1)], output=[word#156, count#164L])
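         +- Generate explode(split(value#134,  , -1)), false, [word#156]
            +- FileScan text [value#134] Batched: false, DataFilters: [], Format: Text, Location: InMemoryFileIndex(1 paths)[file:/home/jovyan/work/sample/lorem_ipsum.txt], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<value:string>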

In [15]:
# Print the physical plan of the word-count query in "formatted" mode:
# an operator tree followed by a numbered detail section per operator.
word_counts.explain(mode="formatted")
# Intentionally left disabled; uncomment to run an action on the query
# (presumably to observe the job it triggers - confirm before enabling).
# word_counts.show()

== Physical Plan ==
AdaptiveSparkPlan (6)
+- HashAggregate (5)
   +- Exchange (4)
      +- HashAggregate (3)
         +- Generate (2)
            +- Scan text  (1)


(1) Scan text 
Output [1]: [value#134]
Batched: false
Location: InMemoryFileIndex [file:/home/jovyan/work/sample/lorem_ipsum.txt]
ReadSchema: struct<value:string>

(2) Generate
Input [1]: [value#134]
Arguments: explode(split(value#134,  , -1)), false, [word#137]

(3) HashAggregate
Input [1]: [word#137]
Keys [1]: [word#137]
Functions [1]: [partial_count(1)]
Aggregate Attributes [1]: [count#145L]
Results [2]: [word#137, count#146L]

(4) Exchange
Input [2]: [word#137, count#146L]
Arguments: hashpartitioning(word#137, 200), ENSURE_REQUIREMENTS, [plan_id=325]

(5) HashAggregate
Input [2]: [word#137, count#146L]
Keys [1]: [word#137]
Functions [1]: [count(1)]
Aggregate Attributes [1]: [count(1)#141L]
Results [2]: [word#137, count(1)#141L AS count#142L]

(6) AdaptiveSparkPlan
Output [2]: [word#137, count#142L]
Arguments: isFinalPl