# Interactive Analytics

In this notebook you will answer two basic analytical questions about the data and visualise the results.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, window, count, unix_timestamp, when, lit, ceil

import os
import matplotlib.pyplot as plt

In [None]:
# Reuse the active Spark session if one already exists; otherwise create a
# new one registered under this notebook's application name.
session_builder = SparkSession.builder.appName('Interactive Analytics I')
spark = session_builder.getOrCreate()

In [None]:
# Locate the input datasets relative to the project root, which sits three
# directory levels above the notebook's working directory.
base_path = os.getcwd()

# Walk up with os.path instead of splitting on '/': this respects the
# platform's path separator and always yields an absolute path, even when the
# working directory is fewer than three levels deep (it then stops at the
# filesystem root instead of collapsing to an empty string).
project_path = os.path.normpath(
    os.path.join(base_path, os.pardir, os.pardir, os.pardir)
)

answers_input_path = os.path.join(project_path, 'data/answers')

questions_input_path = os.path.join(project_path, 'output/questions-transformed')

# Task I

* Find out how many answers are being produced per week
* Plot the time evolution: put the date dimension on the x axis and the number of answers per week on the y axis

In [None]:
# Load the answers dataset. No format is specified, so Spark falls back to
# its default data source.
answers_reader = spark.read.option('path', answers_input_path)
answersDF = answers_reader.load()

In [None]:
# Preview the first five rows, cutting long string values at 25 characters.
answersDF.show(5, truncate=25)

<b>Group the data</b>

Hint:
* use groupBy(window)

In [None]:
# Only answers that carry a user id are counted.
answers_with_user = answersDF.where(col('user_id').isNotNull())

# Tumbling one-week buckets over the answer creation time.
# NOTE: Spark aligns windows to the Unix epoch (1970-01-01 UTC, a Thursday),
# so each weekly bucket starts on a Thursday unless a startTime offset is given.
weekly_window = window('creation_date', "1 week")

groupedDF = (
    answers_with_user
    .groupBy(weekly_window)
    .agg(count('*').alias('answers'))
    # Expose the start of each bucket as a plain date for plotting.
    .withColumn('date', col('window.start').cast('date'))
    .orderBy('window')
)

In [None]:
# Preview the first five weekly buckets without truncating the window column.
groupedDF.show(5, truncate=False)

In [None]:
# Number of rows in the aggregate, i.e. how many weekly buckets contain answers.
groupedDF.count()

<b>Visualise the data:</b>

Hint
* convert the aggregated data to Pandas dataframe using toPandas()
* use the plotting options of the Pandas dataframe

In [None]:
# Collect the weekly aggregate to the driver as a pandas DataFrame for plotting.
local_data = groupedDF.toPandas()

In [None]:
# Line chart of the weekly answer counts over time.
weekly_ax = local_data.plot(
    x='date',
    y='answers',
    title='Number of answers per week',
    figsize=(12, 6),
    legend=False,
)
weekly_ax.set_xlabel('Date')
weekly_ax.set_ylabel('Number of answers')
plt.show()

# Task II

* Compute the response time
 * for each question compute the time it took to get an accepted answer
 * consider only questions with an accepted answer
* Plot number of answered questions as a function of response time
 * choose hour as time unit
 * create a bar chart (to see how many questions were answered within one hour, within 2 hours and so on)
 * create a cumulative sum

In [None]:
# Load the transformed questions dataset. No format is specified, so Spark
# falls back to its default data source.
questions_reader = spark.read.option('path', questions_input_path)
questionsDF = questions_reader.load()

<b>Compute response time:</b>

Hint:
* join questions with answers
* use unix_timestamp to compare the times
* convert to hours
* ceil the numbers

In [None]:
# Pair every question with its accepted answer. The join is an inner join, so
# questions without a matching accepted answer are dropped.
accepted_answer_match = questionsDF['accepted_answer_id'] == answersDF['answer_id']
questions_with_answers = (
    questionsDF.alias('questions')
    .join(answersDF.alias('answers'), accepted_answer_match)
)

# Seconds elapsed between asking the question and posting the accepted answer.
response_seconds = unix_timestamp('answer_time') - unix_timestamp('question_time')

hourly_data = (
    questions_with_answers
    .select(
        col('questions.tags'),
        col('questions.creation_date').alias('question_time'),
        col('questions.title'),
        col('answers.creation_date').alias('answer_time'),
    )
    .withColumn('response_time', response_seconds)
    # Keep only answers posted strictly after the question was created.
    .where(col('response_time') > 0)
    # Round up to whole hours (3600 s each): bucket 1 means "within the first hour".
    .withColumn('hours', ceil(col('response_time') / 3600))
)

<b>Aggregate the data and visualise</b>

Hint:
* group by hour
* count
* convert to Pandas
* visualize (take first 50 records)

In [None]:
# Count how many questions fall into each whole-hour response bucket, then
# sort from the fastest bucket upwards.
questions_per_hour = hourly_data.groupBy('hours').agg(count('*').alias('cnt'))
hourly_data_grouped = questions_per_hour.orderBy('hours')

In [None]:
# Collect the per-hour counts to the driver as a pandas DataFrame for plotting.
hourly_data_local = hourly_data_grouped.toPandas()

In [None]:
# Inspect the five smallest hour buckets (the frame is already sorted by 'hours').
hourly_data_local.head(5)

For the bar chart you can use `df.plot.bar`.

In [None]:
# Bar chart of how many questions were answered in each of the first 50
# hour buckets.
first_hours = hourly_data_local.head(50)
response_ax = first_hours.plot.bar(
    x='hours',
    y='cnt',
    title='Response time of questions',
    figsize=(12, 6),
    legend=False,
)
response_ax.set_xlabel('Hours')
response_ax.set_ylabel('Number of answered questions')
plt.show()

For the cumulative sum you can use `df['col'].cumsum().plot()`.

In [None]:
# Cumulative number of answered questions over the first 50 hour buckets.
# Index by 'hours' before plotting so the x axis shows the actual response-time
# bucket. Plotting the bare 'cnt' Series would use the positional index 0..49,
# which does not match the hour values (buckets are ceil'ed positive durations,
# so they start at 1) even though the axis is labelled "Hours".
cumulative_ax = (
    hourly_data_local
    .head(50)
    .set_index('hours')['cnt']
    .cumsum()
    .plot(
        figsize=(12, 6),
        title='Cumulative size of answered questions',
    )
)
cumulative_ax.set_xlabel('Hours')
cumulative_ax.set_ylabel('Number of answered questions')
plt.show()

In [None]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()