# Window functions

In this notebook you will:
* solve an analytical question using window functions

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, unix_timestamp, row_number, lead, avg
from pyspark.sql import Window

import os

In [None]:
# Reuse the active Spark session if one already exists; otherwise start a
# new one registered under the application name 'WF I'.
spark = SparkSession.builder.appName('WF I').getOrCreate()

# Task 1

* compute the average time between two consecutive answers for each user who answered at least 2 questions

In [None]:
# Directory the notebook kernel was started in.
base_path = os.getcwd()

# Walk three directory levels up from the working directory (presumably the
# project root -- confirm against the repository layout). os.path is used
# instead of splitting on a hard-coded '/' so the result is always an absolute
# path, regardless of the platform's separator or how deep the working
# directory is.
project_path = os.path.abspath(
    os.path.join(base_path, os.pardir, os.pardir, os.pardir)
)

# Location of the answers dataset, relative to that directory.
data_input_path = os.path.join(project_path, 'data', 'answers')

In [None]:
# Load the answers dataset found at `data_input_path`. No format is given, so
# Spark uses its default data source (parquet unless reconfigured).
# cache() only marks the DataFrame for caching; it is materialized lazily by
# the first action that touches it.
answersDF = spark.read.option('path', data_input_path).load().cache()

<b>Take only users that answered at least 2 questions:</b>

Hint:
* Use count as a window function
* Create the window per user
* Filter only users with count > 1

In [None]:
# Window spanning *all* rows of a user. A plain count needs no ordering:
# without orderBy the frame defaults to the whole partition, so every row of a
# user receives that user's total number of answers (and Spark does not have
# to sort each partition by creation_date just to count it).
w = Window.partitionBy('user_id')

# `r` = number of answers written by the row's user. Keep only users with at
# least two answers, because a single answer has no successor to compare with.
data = (
    answersDF
    .withColumn('r', count('*').over(w))
    .filter(col('r') > 1)
)

In [None]:
# Preview the first five rows that survived the filter, sorted by user and
# answer id.
data.sort('user_id', 'answer_id').show(5)

<b>Compute the average time between answers:</b>

Hint:
* Define a new window, also per user, but sorted by creation_date
* Use the lead function to add a new column that contains the creation date of the user's next answer
* Compute the time difference (use unix_timestamp)
* Group by user and compute the average

In [None]:
# Per-user window in chronological order, so lead() can look one answer ahead.
w2 = Window.partitionBy('user_id').orderBy('creation_date')

# Average gap between consecutive answers of each user, smallest gap first.
resultDF = (
    data
    # creation_date of the same user's next answer; null on the user's last one
    .withColumn('next_answer', lead('creation_date').over(w2))
    # the last answer of every user has no successor and contributes no gap
    .filter(col('next_answer').isNotNull())
    # unix_timestamp yields seconds, so `diff` (and the average) is in seconds
    .withColumn('diff', unix_timestamp(col('next_answer')) - unix_timestamp(col('creation_date')))
    .groupBy('user_id')
    .agg(
        avg('diff').alias('avg_response_period')
    )
    .orderBy('avg_response_period')
)

In [None]:
# Print the per-user averages; show() displays the first 20 rows by default.
resultDF.show()

In [None]:
# Shut down the Spark session now that the notebook is finished.
spark.stop()