In [None]:
# Must run before the pyspark imports in the next cell.
# NOTE(review): findspark.init() is expected to make the pyspark package of the
# local Spark installation importable — confirm for the target environment.
import findspark
findspark.init()

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, desc, count, explode, split, regexp_replace, collect_list, array_sort, reverse, unix_timestamp, row_number,
    when, lit, lead, avg, pandas_udf, PandasUDFType, rank
)

from pyspark.sql import Window

from pyspark.sql.types import StructType, StructField, StringType, LongType, TimestampType, DoubleType, IntegerType

import os
import pandas as pd
import scipy.stats

In [None]:
# Obtain the Spark session used throughout the notebook, registered under the
# application name 'UDFs II'.
spark = SparkSession.builder.appName('UDFs II').getOrCreate()

# Task 1

* Compute the average time between two consecutive answers for each user who answered at least 2 questions.

In [None]:
# Resolve the input location relative to the current working directory, which
# is assumed to sit two levels below the project root.
base_path = os.getcwd()

# Walk up two directory levels. os.path.dirname is separator-aware, unlike
# splitting on a hard-coded '/', and never collapses a shallow path to ''.
project_path = os.path.dirname(os.path.dirname(base_path))

# Directory holding the answers dataset.
data_input_path = os.path.join(project_path, 'data', 'answers')

In [None]:
# Load the answers dataset from data_input_path (no explicit format is set, so
# the reader's default data source applies) and mark it for caching because
# several later cells query it.
answersDF = spark.read.load(data_input_path).cache()

In [None]:
# Take users that answered at least 2 questions.
#
# The number of answers per user is computed with a window partitioned by
# user_id. When a window has no ordering, its default frame is the whole
# partition, so neither orderBy nor an explicit rowsBetween(...) is needed to
# count all of a user's rows. Window's helpers are class-level, so the class
# is used directly instead of instantiating it.
w = Window.partitionBy('user_id')

data = (
    answersDF
    # 'r' = total number of answers written by the row's user
    .withColumn('r', count('*').over(w))
    .filter(col('r') > 1)
)

In [None]:
# Action: evaluates `data` and displays how many answers belong to users with
# at least 2 answers.
data.count()

In [None]:
# Output schema of the pandas UDF defined below: the columns the UDF passes
# through unchanged, followed by the computed 'result' column.
schema = StructType(
    [
        StructField(column_name, column_type)
        for column_name, column_type in (
            ('answer_id', LongType()),
            ('creation_date', TimestampType()),
            ('body', StringType()),
            ('comments', LongType()),
            ('user_id', LongType()),
            ('score', LongType()),
            ('question_id', LongType()),
            ('result', DoubleType()),
        )
    ]
)

@pandas_udf(schema, PandasUDFType.GROUPED_MAP)
def compute_avg_response(pdf):
    """Add the mean gap between consecutive answers to one group of rows.

    Parameters
    ----------
    pdf : pandas.DataFrame
        All rows of one group (the notebook groups by ``user_id``); must
        contain a datetime ``creation_date`` column.

    Returns
    -------
    pandas.DataFrame
        The same frame with an extra ``result`` column. Every row carries the
        same value: the mean difference, in seconds, between chronologically
        consecutive ``creation_date`` values of the group.
    """
    # Order the timestamps so that diff() measures consecutive answers.
    ordered_dates = pdf['creation_date'].sort_values()
    gaps_in_seconds = ordered_dates.diff().dt.total_seconds()
    # The first diff is NaT/NaN and is ignored by mean().
    pdf['result'] = gaps_in_seconds.mean()
    return pdf

In [None]:
# Apply the UDF once per user. The UDF repeats the user's average on each of
# that user's rows, so the output is reduced to distinct (user_id, result)
# pairs.
resultDF = (
    data
    .drop('r')
    .groupBy('user_id')
    .apply(compute_avg_response)
    .select('user_id', 'result')
    .dropDuplicates(['user_id', 'result'])
)

In [None]:
# Display the first rows, ordered by ascending average gap (in seconds).
resultDF.orderBy('result').show()

<b>Verify that the result makes sense:</b>

In [None]:
# Spot-check a single user: list that user's answer timestamps in
# chronological order next to their epoch seconds ('t'), so the gaps can be
# compared by hand with the UDF result.
(
    answersDF
    .filter(col('user_id') == 142017)
    .select(
        'creation_date',
        unix_timestamp('creation_date').alias('t'),
    )
    .orderBy('creation_date')
    .show(truncate=False)
)

# Task 2

* For each of the 15 most frequent tags, compute the entropy of the 'comments' field in the questions dataset.

In [None]:
# Re-point data_input_path at the transformed questions dataset and load it
# with the reader's default data source.
data_input_path = os.path.join(project_path, 'output/questions-transformed')

questionsDF = spark.read.load(data_input_path)

In [None]:
# Determine the 15 tags that occur most often across all questions.
# Only the tag names are kept, and the result is marked for caching.
tagsDF = (
    questionsDF
    .select(explode('tags').alias('tag'))  # one row per (question, tag)
    .groupBy('tag')
    .agg(count('*').alias('frequency'))
    .orderBy('frequency', ascending=False)
    .limit(15)
    .select('tag')
    .cache()
)

In [None]:
# Action: evaluates the cached tagsDF and displays its row count (at most 15
# because of the limit applied when it was built).
tagsDF.count()

In [None]:
# One row per (question, tag) pair carrying the question's 'comments' value,
# restricted by an inner join to the tags present in tagsDF.
tags_with_commentsDF = (
    questionsDF
    .select(explode('tags').alias('tag'), 'comments')
    .join(tagsDF, 'tag')
)

In [None]:
# Output schema of the entropy UDF: the two input columns plus the computed
# 'entropy' column.
schema = StructType(
    [
        StructField(column_name, column_type)
        for column_name, column_type in (
            ('tag', StringType()),
            ('comments', IntegerType()),
            ('entropy', DoubleType()),
        )
    ]
)

@pandas_udf(schema, PandasUDFType.GROUPED_MAP)
def compute_entropy(pdf):
    """Add the entropy of the 'comments' distribution to one group of rows.

    Parameters
    ----------
    pdf : pandas.DataFrame
        All rows of one group (the notebook groups by ``tag``); must contain
        a ``comments`` column.

    Returns
    -------
    pandas.DataFrame
        The same frame with an extra ``entropy`` column. Every row carries
        the same value: ``scipy.stats.entropy`` of the relative frequencies
        of the distinct ``comments`` values in the group.
    """
    # How often each distinct 'comments' value occurs within the group.
    occurrences = pdf['comments'].value_counts()
    pdf['entropy'] = scipy.stats.entropy(occurrences / len(pdf))
    return pdf

In [None]:
# Apply the entropy UDF once per tag. The UDF repeats the tag's entropy on
# every row of the group, so the output is reduced to distinct (tag, entropy)
# pairs and sorted by ascending entropy.
resultDF = (
    tags_with_commentsDF
    .groupBy('tag')
    .apply(compute_entropy)
    .select('tag', 'entropy')
    .dropDuplicates()
    .orderBy('entropy')
)

In [None]:
# Display the per-tag entropy values (already ordered by ascending entropy).
resultDF.show()

In [None]:
# Stop the Spark session now that both tasks are finished.
spark.stop()