In [48]:
# Make the local Spark installation importable as `pyspark`.
# This runs before the pyspark imports in the next cell.
import findspark
findspark.init()

In [49]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, desc, count, explode, split, regexp_replace, collect_list, array_sort, reverse, unix_timestamp, row_number,
    when, lit, lead, avg, pandas_udf, PandasUDFType, rank
)

from pyspark.sql import Window

from pyspark.sql.types import StructType, StructField, StringType, LongType, TimestampType, DoubleType, IntegerType

import os
import pandas as pd
import scipy.stats

In [50]:
# Start the Spark session used by every cell below, or attach to an
# already-running one (getOrCreate).
session_builder = SparkSession.builder.appName('UDFs II')
spark = session_builder.getOrCreate()

# Task 1

* Compute the average time (in seconds) between consecutive answers for each user who has posted at least two answers.

In [51]:
# Resolve the project root as the directory two levels above the notebook's
# working directory. os.path.dirname is used instead of splitting on '/', so
# the lookup also works where the path separator is not '/' and never
# collapses to an empty (relative) path for a shallow working directory.
base_path = os.getcwd()

project_path = os.path.dirname(os.path.dirname(base_path))

# Location of the answers dataset used in Task 1.
data_input_path = os.path.join(project_path, 'data', 'answers')

In [52]:
# Read the answers dataset with the default data source format and cache it,
# since it is queried more than once in this notebook.
answersDF = spark.read.load(data_input_path).cache()

In [53]:
# Keep only the answers of users that posted at least two answers.
#
# The window spans the whole 'user_id' partition, so count('*') attaches the
# user's total number of answers to each of their rows. Neither an ordering
# nor an explicit frame is needed for that: an unordered window already covers
# the full partition. Window exposes partitionBy as a static method, so the
# class is used directly rather than instantiated.
w = Window.partitionBy('user_id')

data = (
    answersDF
    .withColumn('r', count('*').over(w))  # 'r' = number of answers by this user
    .filter(col('r') > 1)
)

In [54]:
# Number of answer rows that belong to users with more than one answer.
data.count()

175943

In [55]:
# Output schema of the grouped-map pandas UDF: exactly one row per user.

avg_response_schema = StructType(
    [
        StructField('user_id', LongType()),
        StructField('result', DoubleType())
    ]
)

@pandas_udf(avg_response_schema, PandasUDFType.GROUPED_MAP)
def compute_avg_response(pdf):
    """Return the mean gap, in seconds, between one user's consecutive answers.

    Parameters
    ----------
    pdf : pandas.DataFrame
        All rows of a single 'user_id' group. Only the 'user_id' and
        'creation_date' columns are read.

    Returns
    -------
    pandas.DataFrame
        A single row with columns 'user_id' and 'result'. 'result' is NaN
        when the group has fewer than two timestamps to compare.

    Notes
    -----
    Only one row per group is returned instead of the whole input frame, so
    the other input columns are not sent back to Spark and no de-duplication
    is needed afterwards (a later dropDuplicates() is a harmless no-op).
    """
    # diff() yields NaT for the first (earliest) row; mean() skips it.
    gaps_in_seconds = pdf['creation_date'].sort_values().diff().dt.total_seconds()
    return pd.DataFrame(
        {
            'user_id': [pdf['user_id'].iloc[0]],
            'result': [gaps_in_seconds.mean()],
        }
    )

In [56]:
# Run the UDF once per user, then reduce the output to one distinct
# (user_id, result) pair per user.
avg_response_per_userDF = (
    data.drop('r')
    .groupBy('user_id')
    .apply(compute_avg_response)
)

resultDF = avg_response_per_userDF.select('user_id', 'result').dropDuplicates()

In [57]:
# Users with the shortest average gap first; show() prints the first 20 rows.
resultDF.orderBy('result').show()

+-------+------------------+
|user_id|            result|
+-------+------------------+
| 142017|            134.81|
|  76258|146.60000000000002|
| 127301|           150.733|
|   7012|151.23000000000002|
|  47471|             154.8|
|  81035|154.85600000000002|
| 155429|164.44400000000002|
|  43758|179.00666666666666|
|  71344|            179.78|
|   4907|          180.8985|
|  59681|           187.866|
|  28339|197.38000000000002|
| 202454|197.92700000000002|
| 105605|           206.097|
| 128124|           210.497|
|  12918|           218.453|
| 174677|            221.15|
| 204225|           223.953|
|  98477|           234.288|
| 207428|234.48000000000002|
+-------+------------------+
only showing top 20 rows



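<b>Verify that the result makes sense:</b> user 142017 has exactly two answers, posted 134.81 seconds apart (09:59:54.697 → 10:02:09.507), which matches the `result` value reported for that user above.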

In [58]:
# Spot-check the top-ranked user: list their answer timestamps next to the
# corresponding epoch seconds, oldest first.
answers_of_userDF = answersDF.where(col('user_id') == 142017)

(
    answers_of_userDF
    .select('creation_date', unix_timestamp('creation_date').alias('t'))
    .orderBy('creation_date')
    .show(truncate=False)
)

+-----------------------+----------+
|creation_date          |t         |
+-----------------------+----------+
|2017-01-14 09:59:54.697|1484384394|
|2017-01-14 10:02:09.507|1484384529|
+-----------------------+----------+



# Task 2

* For each of the 15 most frequent tags, compute the entropy of the 'comments' field in the questions dataset.

In [59]:
# Location of the transformed questions dataset used in Task 2.
# A dedicated variable is used so that `data_input_path` keeps pointing at the
# answers dataset; re-running the answers-loading cell therefore still reads
# the right data.
questions_input_path = os.path.join(project_path, 'output/questions-transformed')

questionsDF = (
    spark
    .read
    .option('path', questions_input_path)
    .load()
)

In [60]:
# Find the 15 most frequent tags and cache them in memory.
# 'tag' is used as a secondary sort key so that the selection is deterministic
# when several tags share the same frequency at the cut-off.

tagsDF = (
    questionsDF
    .select(explode('tags').alias('tag'))
    .groupBy('tag')
    .agg(
        count('*').alias('frequency')
    )
    .orderBy(desc('frequency'), 'tag')
    .limit(15)
    .select('tag')
).cache()

In [62]:
# count() is an action, so it also materialises the cached tagsDF.
tagsDF.count()

15

In [63]:
# One row per (question, tag) pair carrying the question's 'comments' value,
# restricted to the tags present in tagsDF via an inner join.
question_tagsDF = questionsDF.withColumn('tag', explode('tags'))

tags_with_commentsDF = (
    question_tagsDF
    .join(tagsDF, 'tag')
    .select('tag', 'comments')
)

In [65]:
# Output schema of the grouped-map pandas UDF: exactly one row per tag.
entropy_schema = StructType(
    [
        StructField('tag', StringType()),
        StructField('entropy', DoubleType())
    ]
)

@pandas_udf(entropy_schema, PandasUDFType.GROUPED_MAP)
def compute_entropy(pdf):
    """Return the entropy of the 'comments' values observed for one tag.

    Parameters
    ----------
    pdf : pandas.DataFrame
        All rows of a single 'tag' group. Only the 'tag' and 'comments'
        columns are read.

    Returns
    -------
    pandas.DataFrame
        A single row with columns 'tag' and 'entropy'.

    Notes
    -----
    Only one row per group is returned instead of the whole input frame, so
    the 'comments' column is no longer part of the output (and its Spark type
    no longer has to be declared), and no de-duplication is needed afterwards
    (a later dropDuplicates() is a harmless no-op).
    """
    # Relative frequency of each distinct 'comments' value within the group.
    # value_counts() ignores missing values.
    probability = pdf['comments'].value_counts() / len(pdf)
    return pd.DataFrame(
        {
            'tag': [pdf['tag'].iloc[0]],
            'entropy': [scipy.stats.entropy(probability)],
        }
    )

In [66]:
# Run the UDF once per tag, then reduce the output to one distinct
# (tag, entropy) pair per tag, listed from lowest to highest entropy.
entropy_per_tagDF = tags_with_commentsDF.groupBy('tag').apply(compute_entropy)

resultDF = (
    entropy_per_tagDF
    .select('tag', 'entropy')
    .dropDuplicates(['tag', 'entropy'])
    .orderBy('entropy')
)

In [67]:
# Print the per-tag entropies (already ordered by ascending entropy).
resultDF.show()

+--------------------+------------------+
|                 tag|           entropy|
+--------------------+------------------+
|      electrostatics| 1.902293688733362|
|              optics|1.9477363689971114|
|    electromagnetism|1.9710115136748623|
|quantum-field-theory|1.9964464573506686|
|homework-and-exer...| 2.048712561311351|
|    particle-physics| 2.049514121748252|
|      thermodynamics| 2.074663390425912|
|      fluid-dynamics| 2.076368910652297|
|              energy|2.0837510233131304|
|              forces| 2.089208712932891|
|   quantum-mechanics|2.0958530154379114|
| classical-mechanics|2.0970745383306144|
| newtonian-mechanics| 2.147981655731562|
|  general-relativity| 2.163078942816769|
|  special-relativity| 2.216019359448606|
+--------------------+------------------+



In [68]:
# Shut down the Spark session now that both tasks are finished.
spark.stop()