# User Defined Functions

In this notebook you will solve two problems using Pandas Grouped Map UDFs.

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, pandas_udf, PandasUDFType, count, unix_timestamp, explode, desc
    #, desc, count, explode, split, regexp_replace, collect_list, array_sort, reverse, unix_timestamp, row_number,
    #when, lit, lead, avg, pandas_udf, PandasUDFType, rank
)

from pyspark.sql import Window

from pyspark.sql.types import StructType, StructField, StringType, LongType, TimestampType, DoubleType, IntegerType

import os
import pandas as pd
import scipy.stats

In [None]:
# Obtain the SparkSession for this notebook (an already running session is
# reused by getOrCreate).
spark = SparkSession.builder.appName('UDFs II').getOrCreate()

# Task 1

* compute avg time between two consecutive answers for each user that answered at least 2 questions
* use grouped map Pandas UDF

In [None]:
# Locate the project root two directory levels above the current working
# directory and derive the dataset locations from it.
# os.path.dirname is used instead of splitting on '/' so that the result stays
# an absolute path for shallow working directories and on platforms whose path
# separator is not '/'.
base_path = os.getcwd()

project_path = os.path.dirname(os.path.dirname(base_path))

# Input for Task 1.
answers_input_path = os.path.join(project_path, 'data', 'answers')

# Input for Task 2.
questions_input_path = os.path.join(project_path, 'output', 'questions-transformed')

In [None]:
# Read the answers dataset, keep only rows that have a user_id, and mark the
# frame for caching because several later cells reuse it.
answersDF = (
    spark.read
    .option('path', answers_input_path)
    .load()
    .where(col('user_id').isNotNull())
    .cache()
)

In [None]:
# Action that forces answersDF to be evaluated, which also populates the cache
# requested with .cache() where answersDF is defined.
answersDF.count()

In [None]:
# Schema of the pandas DataFrame returned by the grouped-map pandas UDF:
# the answer columns plus the computed double column 'result'.
schema = StructType(
    [
        StructField(field_name, field_type)
        for field_name, field_type in (
            ('answer_id', LongType()),
            ('creation_date', TimestampType()),
            ('body', StringType()),
            ('comments', LongType()),
            ('user_id', LongType()),
            ('score', LongType()),
            ('question_id', LongType()),
            ('result', DoubleType()),
        )
    ]
)

@pandas_udf(schema, PandasUDFType.GROUPED_MAP)
def compute_avg_response(pdf):
    """Add a 'result' column with the mean gap between consecutive answers.

    Args:
        pdf: pandas DataFrame holding the rows of one group; it must contain a
            datetime column 'creation_date'.

    Returns:
        The same DataFrame with a new column 'result' that carries, on every
        row, the mean difference in seconds between consecutive
        'creation_date' values taken in chronological order.
    """
    answer_times = pdf.sort_values(by=['creation_date'])['creation_date']
    gaps_in_seconds = answer_times.diff().dt.total_seconds()
    # A group with a single row has no gaps, so the mean is null; those groups
    # are filtered out later.
    pdf['result'] = gaps_in_seconds.mean()
    return pdf

In [None]:
# Average response time per user.
resultDF = (
    answersDF
    # 'r' is not part of the UDF schema; drop() is a no-op when the column is absent.
    .drop('r')
    .groupBy('user_id')
    .apply(compute_avg_response)
    .select('user_id', 'result')
    # Users with only 1 answer have a null result; remove them.
    .where(col('result').isNotNull())
    # The UDF repeats the same value on every row of a user; keep one row each.
    .distinct()
    .orderBy('user_id')
)

In [None]:
# Print the first 5 rows of the result, ordered by user_id.
resultDF.orderBy('user_id').show(n=5)

<b>Verify that the result makes sense:</b>

In [None]:
# Spot-check one user (user_id 4): list the answer timestamps in chronological
# order next to their epoch-seconds value 't'.
(
    answersDF
    .where(col('user_id') == 4)
    .select('creation_date', unix_timestamp('creation_date').alias('t'))
    .orderBy('creation_date')
    .show(truncate=False)
)

After verification you may find that there is a difference between the result computed by Pandas and the one computed by the PySpark DataFrame function unix_timestamp because of the difference in time zone. See the related <a href = "https://issues.apache.org/jira/browse/SPARK-21722" target="_blank">jira</a> for details.

<b>You may try to optimize this:</b>

Hint
* review this notebook after the Performance lecture
* you may filter out users with only one answer before you send the data to the UDF and reduce the cost in Python execution and data movement
 * to do this you can use window function with partitionBy('user_id') and use count(*) and keep only records with count > 1
 * realize that Exchange generated by the window is the same as Exchange generated by groupBy, so there will be only 1 Exchange
 * in other words the window function call will not require an additional shuffle
 * check the physical plan

In [None]:
# Optimized version:
# Take users that answered at least 2 questions, so that single-answer users
# never reach the Python UDF.

# The count covers the whole user partition and does not depend on row order,
# so the window is left unordered: ordering by creation_date would only add
# work. Window's builder methods and frame boundaries are used directly on the
# class; no Window instance is needed.
w = (
    Window
    .partitionBy('user_id')
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)

# 'r' holds the number of answers of the row's user. The cell that applies
# compute_avg_response drops 'r' again, so re-run that cell after this one to
# use the reduced input.
answersDF = (
    answersDF
    .withColumn('r', count('*').over(w))
    .filter(col('r') > 1)
)

# Task 2

* For each of the 15 most frequent tags compute the entropy of the 'comments' field in questions dataset using Pandas UDF.

In [None]:

# Read the transformed questions dataset.
questionsDF = spark.read.option('path', questions_input_path).load()

<b>Find 15 most frequent tags</b>

Hint:
* explode tags
* use group by with count
* sort and use limit
* cache the result

In [None]:
# One row per tag occurrence -> occurrences per tag -> the 15 tags with the
# highest count. Only the tag name is kept; the frame is marked for caching.
tagsDF = (
    questionsDF
    .select(explode('tags').alias('tag'))
    .groupBy('tag')
    .agg(count('*').alias('frequency'))
    .orderBy(col('frequency').desc())
    .limit(15)
    .select('tag')
    .cache()
)

In [None]:
# Action that forces tagsDF to be evaluated, which also populates the cache
# requested with .cache() where tagsDF is defined.
tagsDF.count()

<b>Take only questions with frequent tags:</b>

Hint:
* join the questions with the frequent tags

In [None]:
# One row per (tag, comments) pair of every question, restricted by an inner
# join to the tags present in tagsDF.
tags_with_commentsDF = (
    questionsDF
    .select(explode('tags').alias('tag'), 'comments')
    .join(tagsDF, 'tag')
    .select('tag', 'comments')
)

<b>Implement the Pandas Grouped Map UDF:</b>
    
Hint:
* use pdf['comments'].value_counts() to compute the histogram and then the probabilities
* use scipy.stats.entropy to compute the entropy

In [None]:
# Schema of the pandas DataFrame returned by the entropy UDF: the two input
# columns plus the computed double column 'entropy'.
# NOTE(review): 'comments' is declared as an integer here - confirm that this
# matches the column type in the questions dataset.
schema = StructType(
    [
        StructField(field_name, field_type)
        for field_name, field_type in (
            ('tag', StringType()),
            ('comments', IntegerType()),
            ('entropy', DoubleType()),
        )
    ]
)

@pandas_udf(schema, PandasUDFType.GROUPED_MAP)
def compute_entropy(pdf):
    """Add an 'entropy' column describing the distribution of 'comments'.

    Args:
        pdf: pandas DataFrame holding the rows of one group; it must contain a
            'comments' column.

    Returns:
        The same DataFrame with a new column 'entropy' that carries, on every
        row, the value of scipy.stats.entropy (default logarithm base) for the
        relative frequencies of the distinct 'comments' values.
    """
    # Histogram of the distinct 'comments' values in this group.
    value_frequencies = pdf['comments'].value_counts()
    probability = value_frequencies / len(pdf)
    pdf['entropy'] = scipy.stats.entropy(probability)
    return pdf

<b>Apply the UDF:</b>

Hint:
* apply the udf
* deduplicate the result
* sort by entropy
* see the result

In [None]:
# Entropy of 'comments' per tag, sorted by ascending entropy.
resultDF = (
    tags_with_commentsDF
    .groupBy('tag')
    .apply(compute_entropy)
    .select('tag', 'entropy')
    # The UDF repeats the entropy on every row of a tag; keep one row per pair.
    .distinct()
    .orderBy(col('entropy'))
)

In [None]:
# Run the job and print the tag / entropy rows.
resultDF.show()

In [100]:
# Shut down the Spark session at the end of the notebook.
spark.stop()