# Spark UI

## Task I

* run the query below and go to the Spark UI
    * see the physical plan in SQL tab
    * see stages and tasks
    * see also the logical plan by calling explain(True)
    
## Task II
* set `spark.sql.autoBroadcastJoinThreshold` to -1 (the notebook also turns off `spark.sql.adaptive.enabled`) to get a SortMergeJoin, then look at the plan and job details again

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, initcap, year
import os

In [None]:
# Obtain the session for this notebook: getOrCreate() hands back an already
# running SparkSession if there is one, otherwise it starts a new one. The
# application name is what identifies this run in the Spark UI.
spark = SparkSession.builder.appName('SparkUI').getOrCreate()

In [None]:
# Directory the notebook is executed from; the project root is expected to sit
# three levels above it.
base_path = os.getcwd()

# Walk up three levels with os.path rather than splitting on a hard-coded '/'.
# This works with any OS path separator and never collapses to an empty
# (relative) path when the working directory is shallow.
project_path = os.path.normpath(
    os.path.join(base_path, os.pardir, os.pardir, os.pardir)
)

# Input datasets, kept as plain strings because they are passed to spark.read.
users_input_path = os.path.join(project_path, 'data', 'users')

questions_input_path = os.path.join(project_path, 'data', 'questions-json')

In [None]:
# Users: no format is given, so load() falls back to the session's default
# data source (`spark.sql.sources.default`, Parquet unless overridden).
usersDF = (
    spark
    .read
    .load(users_input_path)
)

# Questions: read explicitly as JSON.
questionsDF = (
    spark
    .read
    .json(questions_input_path)
)

In [None]:
# Query under inspection (lazy - nothing runs until an action is called):
#   1. join users with questions on user_id (default join type, i.e. inner)
#   2. derive `name` as the init-capped display_name
#   3. keep only user_id, creation_date and name
#   4. keep rows whose creation_date falls in 2018 or later
resultDF = (
    usersDF
    .join(questionsDF, on='user_id')
    .withColumn('name', initcap(col('display_name')))
    .select('user_id', 'creation_date', 'name')
    .where(year('creation_date') >= 2018)
)

In [None]:
# Print the logical plans as well as the physical plan (extended output).
resultDF.explain(extended=True)

In [None]:
# Trigger execution so the job shows up in the Spark UI. The 'noop' sink runs
# the whole query but does not persist any output.
resultDF.write.format('noop').mode('overwrite').save()

In [None]:
# Task II settings, applied in order:
#   * autoBroadcastJoinThreshold = -1 turns automatic broadcast joins off, so
#     the join is planned as a SortMergeJoin.
#   * adaptive.enabled = False switches adaptive query execution off
#     (presumably so the plan is not re-optimised at runtime - confirm intent).
for option, value in (
    ('spark.sql.autoBroadcastJoinThreshold', -1),
    ('spark.sql.adaptive.enabled', False),
):
    spark.conf.set(option, value)

In [None]:
# Rebuild the same join / project / filter query and run it against the
# 'noop' sink again, this time under the configuration set in the previous
# cell, so the new plan and job details can be compared in the Spark UI.
(
    usersDF
    .join(questionsDF, on='user_id')
    .withColumn('name', initcap(col('display_name')))
    .select('user_id', 'creation_date', 'name')
    .where(year('creation_date') >= 2018)
    .write
    .format('noop')
    .mode('overwrite')
    .save()
)

In [None]:
# Shut the session down once the exercise is finished.
spark.stop()