# Task - Union two aggregated subresults

* Work with the answers dataset
* For each question compute how many answers it has
* For each question compute the sum of its answers' scores
* Combine these subresults into one final output using Union
 * The output should have three columns: question_id, metricValue, metricName (which is either sum or count)

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

from pyspark.sql import Window
from pyspark.sql.types import IntegerType

import os

In [None]:
# Configure the session builder (application name + Hive support), then
# obtain the session from it via getOrCreate().
session_builder = SparkSession.builder.appName('Optimize II').enableHiveSupport()
spark = session_builder.getOrCreate()

In [None]:
# Directory the notebook is running from.
base_path = os.getcwd()

# Drop the last three '/'-separated components of the working directory to
# get the project root (this relies on '/' being the path separator).
path_segments = base_path.split('/')
project_path = '/'.join(path_segments[:-3])

# Location of the answers dataset inside the project.
answers_input_path = os.path.join(project_path, 'data/answers')

In [None]:
# Load the answers dataset from its Parquet files.
answers_reader = spark.read
answersDF = answers_reader.parquet(answers_input_path)

In [None]:
# Baseline: compute each metric with its own groupBy over answersDF and
# stack the two subresults with union.

# Sum of scores per question, labelled "sum".
score_total = sum("score").alias("metricValue")
dfSum = answersDF.groupBy("question_id").agg(score_total)
dfSum = dfSum.withColumn("metricName", lit("sum"))

# Number of answers per question, labelled "count".
# count("score") only counts rows whose score is not NULL.
score_count = count("score").alias("metricValue")
dfCount = answersDF.groupBy("question_id").agg(score_count)
dfCount = dfCount.withColumn("metricName", lit("count"))

# union() matches columns by position; both inputs are laid out as
# (question_id, metricValue, metricName).
resultDF = dfSum.union(dfCount)

# Run the final query. The 'noop' format is used so that nothing is
# persisted anywhere.
resultDF.write.format('noop').mode('overwrite').save()

In [None]:
# Same query as the baseline, with the shared grouping step factored out.
# NOTE: groupBy() returns a GroupedData object rather than a DataFrame,
# despite the "df" prefix in the name.
dfCommon = answersDF.groupBy("question_id")

# Sum of scores per question, labelled "sum".
sum_expr = sum("score").alias("metricValue")
dfSum = dfCommon.agg(sum_expr).withColumn("metricName", lit("sum"))

# Count of (non-NULL) scores per question, labelled "count".
count_expr = count("score").alias("metricValue")
dfCount = dfCommon.agg(count_expr).withColumn("metricName", lit("count"))

# Stack both subresults (columns are matched by position).
resultDF = dfSum.union(dfCount)

# Run the final query with the 'noop' format.
resultDF.write.format('noop').mode('overwrite').save()

#### Rewrite the query in a more optimal way

Hint:
* check the query plan
* try to reuse the `Exchange` operator

In [None]:
# Optimized variant: each branch explicitly repartitions answersDF by
# question_id before grouping. The goal is for Spark to reuse the Exchange
# across both branches of the union - confirm this in the query plan.

# Sum of scores per question, labelled "sum".
sum_input = answersDF.repartition("question_id")
dfSum = sum_input.groupBy("question_id").agg(sum("score").alias("metricValue"))
dfSum = dfSum.withColumn("metricName", lit("sum"))

# Count of (non-NULL) scores per question, labelled "count".
count_input = answersDF.repartition("question_id")
dfCount = count_input.groupBy("question_id").agg(count("score").alias("metricValue"))
dfCount = dfCount.withColumn("metricName", lit("count"))

# Stack both subresults (columns are matched by position).
resultDF = dfSum.union(dfCount)

# Run the final query with the 'noop' format.
resultDF.write.format('noop').mode('overwrite').save()

In [None]:
# Optimized and refactored: the repartition + groupBy prefix is built once
# and shared by both aggregations.
# NOTE: dfCommon is a GroupedData object (the result of groupBy), not a
# DataFrame.
dfCommon = answersDF.repartition("question_id").groupBy("question_id")

# Sum of scores per question, labelled "sum".
sum_expr = sum("score").alias("metricValue")
dfSum = dfCommon.agg(sum_expr).withColumn("metricName", lit("sum"))

# Count of (non-NULL) scores per question, labelled "count".
count_expr = count("score").alias("metricValue")
dfCount = dfCommon.agg(count_expr).withColumn("metricName", lit("count"))

# Stack both subresults (columns are matched by position).
resultDF = dfSum.union(dfCount)

# Run the final query with the 'noop' format.
resultDF.write.format('noop').mode('overwrite').save()

In [None]:
# Stop the Spark session now that all queries in this notebook have run.
spark.stop()