# Window functions

In this notebook you will:
* solve an analytical question using window functions

In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, unix_timestamp, row_number, lead, avg
from pyspark.sql import Window

import os

In [12]:
# Build the Spark session named 'WF I'; getOrCreate() hands back an already
# active session if there is one instead of starting a second one.
spark = SparkSession.builder.appName('WF I').getOrCreate()

# Task 1

* compute the average time between two consecutive answers for each user who answered at least 2 questions

In [13]:
# Resolve the input dataset location relative to the notebook's working directory.
base_path = os.getcwd()

# The project root is three directory levels above the working directory.
# os.path.dirname is used instead of splitting the path on '/' so that the
# lookup also works where the path separator is not '/' (e.g. on Windows).
project_path = os.path.dirname(os.path.dirname(os.path.dirname(base_path)))

# Directory holding the answers dataset: <project root>/data/answers
data_input_path = os.path.join(project_path, 'data', 'answers')

In [14]:
# Read the answers dataset from data_input_path (no format is given, so the
# session's default data source is used) and mark the DataFrame for caching.
answers_reader = spark.read.option('path', data_input_path)
answersDF = answers_reader.load().cache()

#### Take only users that answered at least 2 questions:

Hint:
* Define a [window](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.Window) per user_id
* Use count as a window function
* Filter only users with count > 1

In [15]:
# Count the answers of each user with a window function and keep only the
# users that have more than one answer.
#
# The window is deliberately left unsorted: without an ordering the frame
# covers every row of the partition, so count('*') gives the user's total
# number of answers on each row.
# With an orderBy the default frame becomes
# .rangeBetween(Window.unboundedPreceding, Window.currentRow), i.e. a running
# count, and an explicit
# .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
# would be needed to get the total again.

w = Window.partitionBy('user_id')

data = (
    answersDF
    .filter(col('user_id').isNotNull())   # ignore answers without a user_id
    .withColumn('r', count('*').over(w))  # r = number of answers in the user's partition
    .filter(col('r') > 1)                 # keep users with at least 2 answers
)

In [16]:
# Preview the first 5 rows of the filtered answers, ordered by user and answer id.
(
    data
    .orderBy('user_id', 'answer_id')
    .show(n=5)
)

+-----------+---------+--------------------+--------------------+--------+-------+-----+---+
|question_id|answer_id|       creation_date|                body|comments|user_id|score|  r|
+-----------+---------+--------------------+--------------------+--------+-------+-----+---+
|     286401|   789058|2009-04-25 16:46:...|<p>If the compute...|       0|      5|    1|  4|
|    1166858|  1166871|2009-07-22 19:23:...|<p>I'd add a catc...|       5|      5|    1|  4|
|    1167625|  1167645|2009-07-22 21:29:...|<p>Use the <a hre...|       0|      5|    2|  4|
|    1434284|  1435853|2009-09-17 00:29:...|<p>Expired items ...|       4|      5|    5|  4|
|     113385|   113398|2008-09-22 08:08:...|<p>Is this close ...|       1|     13|    0| 14|
+-----------+---------+--------------------+--------------------+--------+-------+-----+---+
only showing top 5 rows



#### Compute the average time between answers:

Hint:
* Define a new window, also per user, but sorted by creation_date
* Use the [lead](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.lead) function to add a new column containing the creation date of the user's next answer
* Compute the time difference (use [unix_timestamp](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.unix_timestamp), which returns the timestamp in seconds)
* Group by user and compute the average

In [17]:
# Average time (in seconds) between consecutive answers of each user.

# Per-user window ordered by creation time, so lead() looks at the user's next answer.
w2 = Window.partitionBy('user_id').orderBy('creation_date')

resultDF = (
    data
    .withColumn('next_answer', lead('creation_date').over(w2))
    # the last answer of each user has no successor, so lead() yields null there
    .filter(col('next_answer').isNotNull())
    .withColumn('diff', unix_timestamp(col('next_answer')) - unix_timestamp(col('creation_date')))  # in sec
    # only the user and the gap are needed for the aggregation
    .select('user_id', 'diff')
    .groupBy('user_id')
    .agg(avg('diff').alias('avg_response_period'))
    .orderBy('avg_response_period')
)

In [18]:
# Display the 10 users with the shortest average time between answers.
resultDF.show(10)

+-------+-------------------+
|user_id|avg_response_period|
+-------+-------------------+
| 731255|               56.0|
| 240418|               58.0|
| 704327|               63.0|
|   9706|               83.0|
|2592424|               85.0|
| 132684|               87.0|
|    943|               89.0|
|1077236|              107.0|
|  10960|              107.0|
|3639557|              143.0|
+-------+-------------------+
only showing top 10 rows



In [19]:
# Stop the Spark session created at the top of the notebook.
spark.stop()