In [36]:
# Runs before the pyspark imports in the following cell.
# NOTE(review): presumably findspark.init() makes the local Spark installation
# importable from this kernel — confirm against the findspark documentation.
import findspark
findspark.init()

In [37]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, desc, count, explode, split, regexp_replace, collect_list, array_sort, reverse, unix_timestamp, expr, sum
)
from pyspark.sql import Window
from pyspark.sql.types import StructType, StructField, StringType, LongType, TimestampType

import os

In [38]:
# Obtain the SparkSession for this notebook: reuse an existing session if one
# is available, otherwise create one registered under the app name 'HOF II'.
session_builder = SparkSession.builder.appName('HOF II')
spark = session_builder.getOrCreate()

# Task

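* For each of the 15 most frequent tags, compute the entropy of the `comments` field in the questions dataset, i.e. $H = -\sum_i p_i \ln p_i$, where $p_i$ is the share of the tag's questions having the $i$-th distinct `comments` value.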

In [None]:
# Resolve the input location relative to the project root, which is taken to be
# two directory levels above the notebook's current working directory.
base_path = os.getcwd()

# os.path.dirname is separator-aware, so this works regardless of the platform's
# path separator and never collapses to an empty (relative) path the way
# splitting and re-joining on a hard-coded '/' can.
project_path = os.path.dirname(os.path.dirname(base_path))

# Pass path components separately so os.path.join inserts the native separator.
data_input_path = os.path.join(project_path, 'output', 'questions-transformed')

In [40]:
# Load the transformed questions dataset from data_input_path. No format is
# specified, so the reader's default data source is used.
questions_reader = spark.read.option('path', data_input_path)
questionsDF = questions_reader.load()

In [41]:
# Inspect the loaded columns; 'tags' (array of strings) and 'comments' (long)
# are the two fields used in the rest of the notebook.
questionsDF.printSchema()

root
 |-- question_id: long (nullable = true)
 |-- tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- creation_date: timestamp (nullable = true)
 |-- title: string (nullable = true)
 |-- body: string (nullable = true)
 |-- accepted_answer_id: long (nullable = true)
 |-- answers: long (nullable = true)
 |-- comments: long (nullable = true)
 |-- user_id: long (nullable = true)
 |-- views: long (nullable = true)



In [42]:
# Find the 15 most frequent tags and cache the result, since it is reused below.

tagsDF = (
    questionsDF
    .select(explode('tags').alias('tag'))
    # The schema allows null array elements. A null tag can never match in the
    # later inner join on 'tag', so it would silently take one of the 15 slots
    # and yield fewer result rows; drop nulls before ranking.
    .where(col('tag').isNotNull())
    .groupBy('tag')
    .agg(
        count('*').alias('frequency')
    )
    # Secondary sort on the tag name makes the top-15 cut deterministic when
    # several tags tie on frequency at the boundary.
    .orderBy(desc('frequency'), col('tag'))
    .limit(15)
    .select('tag')
).cache()

In [43]:
# Sanity check: exactly 15 rows are expected because of the limit(15) above.
# NOTE(review): count() is a Spark action, so it should also populate the cache
# requested on tagsDF — confirm in the Spark UI if that matters.
tagsDF.count()

15

In [44]:
# Compute, per tag, the probability of each distinct 'comments' value using an
# aggregate followed by a window function.

# Frame covering the whole 'tag' partition, so a windowed sum yields the tag total.
# partitionBy and the frame-boundary constants are accessed on the Window class
# itself; instantiating Window() is unnecessary.
w = (
    Window
    .partitionBy('tag')
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)

tag_probabilities = (
    questionsDF
    .withColumn('tag', explode('tags'))
    # Inner join keeps only questions carrying one of the top tags.
    .join(tagsDF, 'tag')
    # Number of questions for every (tag, comments value) pair.
    .groupBy('tag', 'comments')
    .agg(
        count('*').alias('comments_frequency')
    )
    # NOTE: `sum` here is pyspark.sql.functions.sum (imported at the top of the
    # notebook, shadowing the Python builtin), applied over the tag-wide window.
    .withColumn('tag_count', sum('comments_frequency').over(w))
    .withColumn('comments_probability', col('comments_frequency') / col('tag_count'))
    # Gather each tag's probabilities into one array column for the HOF step.
    .groupBy('tag')
    .agg(
        collect_list('comments_probability').alias('probabilities')
    )
)

In [45]:
# Fold each tag's probability array into a single entropy value with the SQL
# AGGREGATE higher-order function: start from 0.0 and subtract log(p) * p for
# every element p.

entropy_expr = expr(
    "AGGREGATE(probabilities, cast(0 as double), (buffer, value) -> (buffer - log(value) * value)) AS entropy"
)

resultDF = tag_probabilities.select('tag', entropy_expr)

In [46]:
# Display the tags from lowest to highest entropy.
entropy_ranked = resultDF.sort('entropy')
entropy_ranked.show()

+--------------------+------------------+
|                 tag|           entropy|
+--------------------+------------------+
|      electrostatics|1.9022936887333617|
|              optics|1.9477363689971119|
|    electromagnetism|1.9710115136748623|
|quantum-field-theory|1.9964464573506688|
|homework-and-exer...|2.0487125613113504|
|    particle-physics| 2.049514121748252|
|      thermodynamics| 2.074663390425913|
|      fluid-dynamics| 2.076368910652297|
|              energy|2.0837510233131296|
|              forces| 2.089208712932891|
|   quantum-mechanics| 2.095853015437911|
| classical-mechanics| 2.097074538330614|
| newtonian-mechanics| 2.147981655731562|
|  general-relativity|2.1630789428167696|
|  special-relativity|2.2160193594486066|
+--------------------+------------------+



In [47]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()