In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, year, window, count, countDistinct
)

import os

In [None]:
# Obtain the Spark session for this notebook; getOrCreate() hands back an
# already-running session if there is one instead of starting a second.
spark = SparkSession.builder.appName('Interactive Analytics I').getOrCreate()

In [None]:
# Directory the notebook kernel was started from.
base_path = os.getcwd()

# The project root sits two directory levels above the working directory.
# os.path.dirname is used (rather than splitting/joining on a literal '/')
# so the separator of the host OS is respected and a shallow working
# directory resolves to the filesystem root instead of an empty string.
project_path = os.path.dirname(os.path.dirname(base_path))

# Location of the answers dataset inside the project tree.
answers_input_path = os.path.join(project_path, 'data', 'answers')

# Task 1

* Find out how many answers are being produced per week
* Plot the time evolution: put the date dimension on the x axis and the number of answers per week on the y axis

In [None]:
# Read the answers dataset. No format is given, so Spark falls back to its
# configured default data source.
answersDF = spark.read.load(answers_input_path)

In [None]:
# Peek at the first 5 rows, clipping every cell to 8 characters.
answersDF.show(5, truncate=8)

In [None]:
# Bucket answers into consecutive one-week windows keyed on creation_date.
# NOTE(review): Spark aligns window boundaries to the Unix epoch by default,
# so weeks may not start on a Monday -- confirm this is acceptable.
weekly_window = window(col('creation_date'), '1 week')

# Number of rows falling into each weekly bucket.
answers_per_week = count('*').alias('answers')

groupedDF = (
    answersDF
    # Only answers that still carry a user_id are counted.
    .where(col('user_id').isNotNull())
    .groupBy(weekly_window)
    .agg(answers_per_week)
    # Expose the start of each window as a plain date for plotting.
    .withColumn('date', col('window.start').cast('date'))
    .orderBy('window')
)

In [None]:
# Inspect the column names and types of the weekly aggregate.
groupedDF.printSchema()

In [None]:
# Show the first 5 weekly buckets without clipping cell contents.
groupedDF.show(5, truncate=False)

In [None]:
# Number of rows in the aggregate, i.e. how many weekly buckets were produced.
groupedDF.count()

In [None]:
# Convert the weekly aggregate to a pandas DataFrame so it can be plotted
# locally. NOTE(review): this pulls every row to the driver -- fine for one
# row per week, but confirm the aggregate stays small.
local_data = groupedDF.toPandas()

In [None]:
# Time evolution of answer volume: week start date on the x axis, number of
# answers in that week on the y axis. Title and axis labels are set so the
# figure can be read on its own; the trailing ';' hides the repr of the
# value returned by ax.set().
ax = local_data.plot(x='date', y='answers')
ax.set(
    title='Answers per week',
    xlabel='Week (start date)',
    ylabel='Number of answers',
);