# Spark UI

## Task I

* run the query below and go to the Spark UI
    * see the physical plan in SQL tab
    * see stages and tasks
    * see also the logical plan by calling explain(True)
    
## Task II
* set `spark.sql.autoBroadcastJoinThreshold` to -1 (this disables broadcast joins) to get a SortMergeJoin, then re-run the query and see the plan and job details again

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, initcap, year
import os

In [2]:
# Obtain the Spark session for this notebook (an existing one is reused if present).
# The application name 'SparkUI' is the label to look for in the Spark UI.
spark = SparkSession.builder.appName('SparkUI').getOrCreate()



In [3]:
# Resolve the project root as the directory three levels above the notebook's
# working directory. os.path is used instead of splitting the path on '/' so the
# result is correct for any OS path separator and is always an absolute path,
# even when the working directory is fewer than three levels deep.
base_path = os.getcwd()

project_path = os.path.abspath(
    os.path.join(base_path, os.pardir, os.pardir, os.pardir)
)

# Input datasets, addressed relative to the project root.
users_input_path = os.path.join(project_path, 'data', 'users')

questions_input_path = os.path.join(project_path, 'output', 'questions-transformed')

In [4]:
# Read the two input datasets into DataFrames using the session's default data source.
# (The explain() output further down reports both relations as parquet.)
usersDF = spark.read.load(path=users_input_path)
questionsDF = spark.read.load(path=questions_input_path)

In [5]:
# Query under study: join users to their questions on user_id, add a `name`
# column holding the display name with each word capitalised, keep three
# columns, and retain only rows whose creation_date is in 2018 or later.
# This cell only defines the DataFrame; the later explain()/collect() cells use it.
resultDF = (
    usersDF.join(questionsDF, on='user_id')
    .withColumn('name', initcap(col('display_name')))
    .select('user_id', 'creation_date', 'name')
    .where(year('creation_date') >= 2018)
)

In [6]:
# Print the extended plan: the logical plans in addition to the physical plan (Task I).
resultDF.explain(extended=True)

== Parsed Logical Plan ==
'Filter (year('creation_date) >= 2018)
+- Project [user_id#0L, creation_date#18, name#53]
   +- Project [user_id#0L, display_name#1, about#2, location#3, downvotes#4L, upvotes#5L, reputation#6L, views#7L, question_id#16L, tags#17, creation_date#18, title#19, body#20, accepted_answer_id#21L, answers#22L, comments#23L, views#25L, initcap(display_name#1) AS name#53]
      +- Project [user_id#0L, display_name#1, about#2, location#3, downvotes#4L, upvotes#5L, reputation#6L, views#7L, question_id#16L, tags#17, creation_date#18, title#19, body#20, accepted_answer_id#21L, answers#22L, comments#23L, views#25L]
         +- Join Inner, (user_id#0L = user_id#24L)
            :- Relation [user_id#0L,display_name#1,about#2,location#3,downvotes#4L,upvotes#5L,reputation#6L,views#7L] parquet
            +- Relation [question_id#16L,tags#17,creation_date#18,title#19,body#20,accepted_answer_id#21L,answers#22L,comments#23L,user_id#24L,views#25L] parquet

== Analyzed Logical Plan 

In [7]:
# Action: executes the query and returns all result rows as a list of Row objects,
# which produces the job/stages/tasks to inspect in the Spark UI (Task I).
resultDF.collect()

[Row(user_id=5493171, creation_date=datetime.datetime(2020, 5, 25, 19, 21, 43, 77000), name='Hejo'),
 Row(user_id=1092865, creation_date=datetime.datetime(2020, 1, 6, 12, 30, 23, 440000), name='Morgoe'),
 Row(user_id=4522398, creation_date=datetime.datetime(2019, 6, 6, 10, 43, 43, 890000), name='Andy  Peng'),
 Row(user_id=193189, creation_date=datetime.datetime(2020, 1, 27, 22, 34, 13, 913000), name='Punkouter'),
 Row(user_id=11839566, creation_date=datetime.datetime(2019, 12, 4, 13, 53, 10, 943000), name='John Walker'),
 Row(user_id=10128619, creation_date=datetime.datetime(2019, 11, 12, 12, 8, 32, 963000), name='Cbdeveloper'),
 Row(user_id=5881773, creation_date=datetime.datetime(2018, 10, 10, 15, 13, 17, 947000), name='Vipul'),
 Row(user_id=4776977, creation_date=datetime.datetime(2018, 5, 25, 17, 13, 7, 623000), name='Tom Wagstaff'),
 Row(user_id=10104341, creation_date=datetime.datetime(2019, 7, 4, 13, 52, 37, 910000), name='Dimitris Efst'),
 Row(user_id=426422, creation_date=date

In [8]:
# Task II: set the broadcast-join threshold to -1 so that the next run of the
# query is expected to use a SortMergeJoin instead (see the task description).
spark.conf.set('spark.sql.autoBroadcastJoinThreshold', -1)

In [9]:
# Same query as resultDF, built again as a fresh DataFrame and executed immediately.
# NOTE(review): presumably rebuilt (rather than reusing resultDF) so that it is
# planned under the new autoBroadcastJoinThreshold -- confirm before deduplicating.
(
    usersDF.join(questionsDF, on='user_id')
    .withColumn('name', initcap(col('display_name')))
    .select('user_id', 'creation_date', 'name')
    .where(year('creation_date') >= 2018)
).collect()

[Row(user_id=1532, creation_date=datetime.datetime(2019, 9, 13, 10, 29, 52, 853000), name='Pierre Spring'),
 Row(user_id=1532, creation_date=datetime.datetime(2018, 3, 21, 14, 15, 39, 653000), name='Pierre Spring'),
 Row(user_id=1532, creation_date=datetime.datetime(2019, 9, 12, 15, 17, 21, 710000), name='Pierre Spring'),
 Row(user_id=1694, creation_date=datetime.datetime(2018, 12, 18, 16, 38, 43, 967000), name='Eli Courtwright'),
 Row(user_id=1694, creation_date=datetime.datetime(2018, 4, 6, 0, 56, 11, 557000), name='Eli Courtwright'),
 Row(user_id=3312, creation_date=datetime.datetime(2018, 3, 21, 16, 2, 3, 537000), name='Jesse C. Slicer'),
 Row(user_id=7552, creation_date=datetime.datetime(2019, 9, 24, 0, 50, 5, 400000), name='Glenn Jackman'),
 Row(user_id=7552, creation_date=datetime.datetime(2018, 11, 19, 18, 40, 53, 630000), name='Glenn Jackman'),
 Row(user_id=7552, creation_date=datetime.datetime(2020, 2, 12, 19, 51, 19, 483000), name='Glenn Jackman'),
 Row(user_id=7552, creatio

In [10]:
# Shut down the Spark session now that the exercise is finished.
# NOTE(review): the Spark UI likely becomes unavailable once the session is
# stopped -- inspect the plans and jobs before running this cell.
spark.stop()