In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, year, window, count, countDistinct, unix_timestamp, when, lit, round, floor, ceil
)

import os

In [None]:
# Reuse the active SparkSession if one already exists, otherwise start a new
# one registered under this notebook's application name.
session_builder = SparkSession.builder.appName('Interactive Analytics I')
spark = session_builder.getOrCreate()

In [None]:
# Directory the notebook kernel was started in.
base_path = os.getcwd()

# The project root sits two directory levels above the working directory.
# os.path.dirname is separator-agnostic and never collapses to an empty
# string (which would silently turn the data paths below into relative ones).
project_path = os.path.dirname(os.path.dirname(base_path))

# Input location of the raw answers dataset.
answers_input_path = os.path.join(project_path, 'data', 'answers')

# Input location of the pre-transformed questions dataset.
questions_input_path = os.path.join(project_path, 'output', 'questions-transformed')

# Task I

* Find out how many answers are being produced per week
* Plot the time evolution: put the date dimension on the x axis and the number of answers per week on the y axis

In [None]:
# Load the answers dataset. No format is given, so Spark uses the session's
# default data source (spark.sql.sources.default).
answersDF = spark.read.load(answers_input_path)

In [None]:
# Preview the first 5 answers; string cells longer than 8 characters are cut
# to keep the table narrow.
answersDF.show(truncate=8, n=5)

In [None]:
# Weekly answer volume: one row per 1-week tumbling window over creation_date.
# NOTE(review): Spark aligns windows to the Unix epoch unless a start offset is
# given, so week boundaries are not calendar (Mon-Sun) weeks - confirm that is
# acceptable for the plot.
groupedDF = (
    answersDF
    # only rows with a non-null user_id are counted
    .where(col('user_id').isNotNull())
    .groupBy(window(col('creation_date'), '1 week'))
    .agg(count(lit(1)).alias('answers'))
    # keep the window struct for ordering and expose its start as a plain date
    .select(
        'window',
        'answers',
        col('window.start').cast('date').alias('date'),
    )
    .sort('window')
)

In [None]:
# Check the column names and types of the weekly aggregate
# (window struct, answers count, date).
groupedDF.printSchema()

In [None]:
# Preview the first 5 weekly rows without truncating cell contents.
groupedDF.show(truncate=False, n=5)

In [None]:
# Number of rows in the weekly aggregate, i.e. how many one-week windows
# contain at least one counted answer. This is also the number of rows that
# will be collected to pandas for plotting.
groupedDF.count()

In [None]:
# Collect the (small) weekly aggregate into a local pandas DataFrame so it can
# be plotted.
local_data = groupedDF.toPandas()

In [None]:
# Time evolution of weekly answer volume. Title and axis labels are set so the
# figure stands on its own; the trailing ';' hides the Axes repr in the output.
ax = local_data.plot(x='date', y='answers', title='Answers per week')
ax.set_xlabel('week start date')
ax.set_ylabel('number of answers');

# Task II

* Compute the response time
* Plot number of answered questions as a function of response time
 * choose hour as time unit
 * create a bar chart
 * create a cumulative sum

In [None]:
# Load the transformed questions dataset. No format is given, so Spark uses the
# session's default data source (spark.sql.sources.default).
questionsDF = spark.read.load(questions_input_path)

In [None]:
# Pair every question with its accepted answer and derive the response time.
hourly_data = (
    questionsDF.alias('questions')
    .join(
        answersDF.alias('answers'),
        on=col('questions.accepted_answer_id') == col('answers.answer_id'),
        how='inner',
    )
    .select(
        'questions.tags',
        col('questions.creation_date').alias('question_time'),
        'questions.title',
        col('answers.creation_date').alias('answer_time'),
    )
    # response time in seconds between posting the question and the accepted answer
    .withColumn(
        'response_time',
        unix_timestamp(col('answer_time')) - unix_timestamp(col('question_time')),
    )
    # drop pairs where the answer is not strictly later than the question
    .where(col('response_time') > 0)
    # bucket into whole hours, rounding up (1 = answered within the first hour)
    .withColumn('hours', ceil(col('response_time') / 3600))
)

In [None]:
# Number of answered questions per response-time bucket, ascending by hour.
hourly_data_grouped = (
    hourly_data
    .groupBy(col('hours'))
    .count()
    .withColumnRenamed('count', 'cnt')
    .sort(col('hours').asc())
)

In [None]:
# Collect the per-hour counts into a local pandas DataFrame for plotting.
hourly_data_local = hourly_data_grouped.toPandas()

In [None]:
%matplotlib inline

hourly_data_local.head(50).plot.bar(x='hours', y='cnt')

In [None]:
# Cumulative number of answered questions over the first 100 response-time
# buckets. Index by 'hours' before plotting so the x axis shows the actual hour
# bucket rather than the positional row index (which starts at 0 and drifts
# further whenever an hour bucket has no rows).
ax = (
    hourly_data_local
    .head(100)
    .set_index('hours')['cnt']
    .cumsum()
    .plot(title='Cumulative answered questions by response time')
)
ax.set_xlabel('response time (hours, rounded up)')
ax.set_ylabel('cumulative answered questions');

In [None]:
# Shut down the SparkSession now that all results have been collected locally.
spark.stop()