# Higher Order Functions II

In this notebook you will solve one more problem using higher-order functions (HOFs).

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, desc, count, explode, sum, collect_list
)
from pyspark.sql import Window
from pyspark.sql.types import StructType, StructField, StringType, LongType, TimestampType

import os

In [2]:
# Obtain a Spark session (reusing an existing one when available),
# registered under the application name 'HOF II'.
spark = SparkSession.builder.appName('HOF II').getOrCreate()

# Task

* For each of the 15 most frequent tags, compute the entropy of the 'comments' field in the questions dataset using HOFs.

In [None]:
def ancestor_dir(path, levels):
    """Return the directory located `levels` levels above `path`.

    Relies on os.path.dirname, so the platform's own path separator is
    honoured instead of assuming '/'.

    Parameters
    ----------
    path : str
        Starting directory.
    levels : int
        How many parent directories to climb; 0 returns `path` unchanged.

    Returns
    -------
    str
        The ancestor directory. Climbing past the filesystem root stays at
        the root.
    """
    for _ in range(levels):
        path = os.path.dirname(path)
    return path


base_path = os.getcwd()

# The project root sits three directory levels above the working directory.
project_path = ancestor_dir(base_path, 3)

# Location of the transformed questions dataset produced earlier in the project.
questions_input_path = os.path.join(project_path, 'output/questions-transformed')

In [None]:
# Load the transformed questions dataset. No explicit format is given,
# so the reader falls back to whatever source format the session defaults to.
questionsDF = spark.read.options(path=questions_input_path).load()

<b>Find the 15 most frequent tags</b>

Hint:
* explode tags
* use group by with count
* sort and use limit
* cache the result

In [None]:
# One row per tag for the 15 tags that occur most often across all questions.
tagsDF = (
    questionsDF
    # one output row per (question, tag) pair
    .select(explode('tags').alias('tag'))
    .groupBy('tag')
    .agg(count('*').alias('frequency'))
    # 'tag' is a secondary sort key so that ties in frequency are broken
    # deterministically and the top-15 set is reproducible between runs
    .orderBy(desc('frequency'), 'tag')
    .limit(15)
    .select('tag')
    # the result is reused by the join further down
    .cache()
)

In [None]:
# Run an action on tagsDF; because it was marked with cache() above,
# this should also materialize the cached result for later reuse.
tagsDF.count()

<b>Compute the probabilities:</b>

Hint:
* join questions with frequent tags
* groupBy tag and comments to compute comments_frequency
* use sum(comments_frequency) as a window function (partitionBy tag) to compute the number of rows for each tag
* use this to compute the probability for each comments value (frequency divided by total count)
* group by tag and use collect_list to gather the probabilities into an array for each tag

In [None]:
# Window covering every row of a tag's partition, so an aggregate evaluated
# over it returns the same per-tag total on each row of that tag.
# Window's builder methods and frame boundaries are used directly on the
# class; no Window instance is needed.
w = (
    Window
    .partitionBy('tag')
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)

# For each frequent tag: an array holding the probability of every distinct
# 'comments' value observed among the questions carrying that tag.
tag_probabilities = (
    questionsDF
    .withColumn('tag', explode('tags'))
    # inner join keeps only rows whose tag is among the most frequent ones
    .join(tagsDF, 'tag')
    # how many times each 'comments' value occurs within a tag
    .groupBy('tag', 'comments')
    .agg(count('*').alias('comments_frequency'))
    # total number of rows for the tag (sum of all its frequencies)
    .withColumn('tag_count', sum('comments_frequency').over(w))
    # probability = frequency of the value / total rows of the tag
    .withColumn('comments_probability', col('comments_frequency') / col('tag_count'))
    .groupBy('tag')
    .agg(collect_list('comments_probability').alias('probabilities'))
)

<b>Compute the entropy:</b>

Hint:
* use AGGREGATE
* define the anonymous function to compute the entropy
 * S = -sum(x * log(x))

In [None]:
# SQL higher-order function that folds the probabilities array into the
# entropy S = -sum(p * log(p)): the buffer starts at 0.0 and each element
# subtracts its own p * log(p) from it.
entropy_expr = "AGGREGATE(probabilities, cast(0 as double), (buffer, value) -> (buffer - log(value) * value)) AS entropy"

# One row per tag with its entropy.
resultDF = tag_probabilities.selectExpr('tag', entropy_expr)

In [None]:
# Print the tags ordered by their entropy (orderBy sorts ascending by default).
resultDF.orderBy('entropy').show()

In [26]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()