# Window functions

In this notebook you will:
* solve an analytical question using window functions

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, count, unix_timestamp, row_number, lead, avg
)
from pyspark.sql import Window

import os

In [2]:
# Reuse the active Spark session if one exists, otherwise start one named 'WF I'.
spark = SparkSession.builder.appName('WF I').getOrCreate()

# Task 1

* compute the average time between two consecutive answers for each user who answered at least 2 questions

In [3]:
# The notebook's working directory.
base_path = os.getcwd()

# Project root is two levels above the working directory. os.path.dirname
# honours the platform path separator and never collapses to an empty string
# for a shallow working directory, unlike splitting the path on '/'.
project_path = os.path.dirname(os.path.dirname(base_path))

# Location of the answers dataset inside the project.
data_input_path = os.path.join(project_path, 'data/answers')

In [4]:
# Read the answers dataset from data_input_path using the reader's default
# format, and cache it because the DataFrame is reused by the cells below.
answersDF = spark.read.option('path', data_input_path).load().cache()

<b>Take only users that answered at least 2 questions:</b>

Hint:
* Use count as a window function
* Create the window per user
* Filter only users with count > 1

In [5]:
# Window spanning all answers of a single user. The explicit unbounded frame
# makes count('*') return the user's total number of answers on every row;
# with an orderBy and no explicit frame, the default frame would produce a
# running count instead.
# Window's builders and frame boundaries are accessed on the class itself,
# so no instance is created.
w = (
    Window
    .partitionBy('user_id')
    .orderBy('creation_date')
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)

# Keep only the answers of users who answered more than once;
# column 'r' holds the per-user answer count.
data = (
    answersDF
    .withColumn('r', count('*').over(w))
    .filter(col('r') > 1)
)

In [7]:
# Preview the first five rows of the filtered answers.
data.show(5)

+---------+--------------------+--------------------+--------+-------+-----+-----------+---+
|answer_id|       creation_date|                body|comments|user_id|score|question_id|  r|
+---------+--------------------+--------------------+--------+-------+-----+-----------+---+
|     2251|2010-12-25 00:27:...|<p>If you are wil...|       0|     26|    1|       1354|  3|
|     3600|2011-01-22 15:14:...|<p>What is descri...|       0|     26|    2|       3526|  3|
|   234970|2016-02-10 10:25:...|<p>I don't know a...|       0|     26|    1|       8930|  3|
|       66|2010-11-02 22:09:...|<p>When you accel...|       6|     29|   14|         62| 77|
|       80|2010-11-02 22:50:...|<p>Neutrons have ...|       3|     29|   21|         78| 77|
+---------+--------------------+--------------------+--------+-------+-----+-----------+---+
only showing top 5 rows



<b>Compute the average time between answers:</b>

Hint:
* Define a new window, also per user, but sorted by creation_date
* Use the lead function to add a new column containing the creation date of the user's next answer
* Compute the time difference between the two dates (use unix_timestamp)
* Group by user and compute the average

In [None]:
# Per-user window ordered chronologically, so lead() looks at the same
# user's following answer.
w2 = Window.partitionBy('user_id').orderBy('creation_date')

resultDF = (
    data
    # Creation date of the user's next answer (null for the user's last answer).
    .withColumn('next_answer', lead('creation_date').over(w2))
    # Drop each user's last answer, which has no successor.
    .filter(col('next_answer').isNotNull())
    # Gap between two consecutive answers, in seconds.
    .withColumn('diff', unix_timestamp(col('next_answer')) - unix_timestamp(col('creation_date')))
    # Only the grouping key and the gap are needed for the aggregation.
    .select('user_id', 'diff')
    .groupBy('user_id')
    .agg(
        avg('diff').alias('avg_response_period')
    )
    .orderBy('avg_response_period')
)

In [None]:
# Display the first 20 rows (show's default) of the per-user averages.
resultDF.show(n=20)

In [None]:
# Stop the Spark session created at the top of the notebook.
spark.stop()