- Author: Ben Du
- Date: 2020-05-03 23:52:16
- Title: Window Functions in Spark
- Slug: window-functions-in-spark
- Category: Computer Science
- Tags: Computer Science, Spark, window function, partition, over, analytics functions, big data

http://xinhstechblog.blogspot.com/2016/04/spark-window-functions-for-dataframes.html

https://spark.apache.org/docs/2.1.1/api/scala/index.html#org.apache.spark.sql.functions$

In [2]:
%%classpath add mvn
org.apache.spark spark-core_2.11 2.4.3
org.apache.spark spark-sql_2.11 2.4.3

In [3]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

// Create (or reuse, via getOrCreate) the SparkSession that every later cell relies on.
// "local[2]" runs Spark inside this process instead of on a cluster
// (presumably with two worker threads, per Spark's master-URL convention).
// NOTE(review): the app name says "Column" although this notebook is about window
// functions, and "spark.some.config.option" looks like a placeholder rather than a
// real setting -- confirm whether both were copied from another example.
val spark = SparkSession.builder()
    .master("local[2]")
    .appName("Spark Column Example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()

import spark.implicits._

org.apache.spark.sql.SparkSession$implicits$@3de37c80

In [4]:
import org.apache.spark.sql.expressions.Window

import org.apache.spark.sql.expressions.Window


In [5]:
// Toy purchase records as (name, date, amount, id). Every (name, date) pair has
// exactly two rows, which keeps the window-function results easy to check by eye.
val customers = Seq(
    ("Alice", "2016-05-01",  50.0, 1),
    ("Alice", "2016-05-01",  45.0, 2),
    ("Alice", "2016-05-02",  55.0, 3),
    ("Alice", "2016-05-02", 100.0, 4),
    ("Bob",   "2016-05-01",  25.0, 5),
    ("Bob",   "2016-05-01",  29.0, 6),
    ("Bob",   "2016-05-02",  27.0, 7),
    ("Bob",   "2016-05-02",  30.0, 8)
).toDF("name", "date", "amount", "id")

// The sort uses (name, date) only, so rows that tie on both keys may print in either order.
customers
    .orderBy("name", "date")
    .show()

+-----+----------+------+---+
| name|      date|amount| id|
+-----+----------+------+---+
|Alice|2016-05-01|  50.0|  1|
|Alice|2016-05-01|  45.0|  2|
|Alice|2016-05-02|  55.0|  3|
|Alice|2016-05-02| 100.0|  4|
|  Bob|2016-05-01|  29.0|  6|
|  Bob|2016-05-01|  25.0|  5|
|  Bob|2016-05-02|  27.0|  7|
|  Bob|2016-05-02|  30.0|  8|
+-----+----------+------+---+



null

In [6]:
// Register the DataFrame as a temporary view named "customers" so the spark.sql cells below can query it.
customers.createOrReplaceTempView("customers")

## Window with orderBy

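It is tricky!

If a window has an ORDER BY clause but no explicit frame, the default frame is `RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW`, so aggregates such as `max` and `last` only see the rows from the start of the partition up to the current row, not the whole partition. Without ORDER BY, the frame is the whole partition. See: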


https://stackoverflow.com/questions/52273186/pyspark-spark-window-function-first-last-issue

1. Avoid using `last`; use `first` with a descending `order by` instead.
   This leads to fewer surprises.

2. Do NOT use `order by` in a window if it is not necessary.
   It introduces unnecessary sorting and changes the default window frame.

In [7]:
// Window over each (name, date) partition, ordered by id within the partition.
// NOTE(review): wSpec is not referenced by the later cells shown here, which build
// their window specs inline -- confirm whether it is still needed.
val wSpec = Window.partitionBy("name", "date").orderBy("id")

org.apache.spark.sql.expressions.WindowSpec@295c46f0

In [8]:
// Largest amount within each (name, date) partition. The window has no ordering,
// so every row sees the maximum over its whole partition.
customers
    .select(
        $"name", $"date", $"amount", $"id",
        max($"amount")
            .over(Window.partitionBy("name", "date"))
            .as("max_amount")
    )
    .orderBy("name", "date")
    .show()

+-----+----------+------+---+----------+
| name|      date|amount| id|max_amount|
+-----+----------+------+---+----------+
|Alice|2016-05-01|  50.0|  1|      50.0|
|Alice|2016-05-01|  45.0|  2|      50.0|
|Alice|2016-05-02|  55.0|  3|     100.0|
|Alice|2016-05-02| 100.0|  4|     100.0|
|  Bob|2016-05-01|  25.0|  5|      29.0|
|  Bob|2016-05-01|  29.0|  6|      29.0|
|  Bob|2016-05-02|  27.0|  7|      30.0|
|  Bob|2016-05-02|  30.0|  8|      30.0|
+-----+----------+------+---+----------+



In [45]:
// SQL form of the unordered window: max(amount) per (name, date) partition.
// With no ORDER BY inside the window, every row gets the maximum of its whole partition.
spark.sql("""
    select
        name,
        date,
        amount,
        id,
        max(amount) over (partition by name, date) as max_amount
    from
        customers
    """).orderBy("name", "date").show

+-----+----------+------+---+----------+
| name|      date|amount| id|max_amount|
+-----+----------+------+---+----------+
|Alice|2016-05-01|  50.0|  1|      50.0|
|Alice|2016-05-01|  45.0|  2|      50.0|
|Alice|2016-05-02|  55.0|  3|     100.0|
|Alice|2016-05-02| 100.0|  4|     100.0|
|  Bob|2016-05-01|  25.0|  5|      29.0|
|  Bob|2016-05-01|  29.0|  6|      29.0|
|  Bob|2016-05-02|  27.0|  7|      30.0|
|  Bob|2016-05-02|  30.0|  8|      30.0|
+-----+----------+------+---+----------+



In [65]:
// Same aggregate, but the window is now ordered by id. The default frame of an
// ordered window ends at the current row, so max_amount becomes a running maximum
// (e.g. 55.0 and then 100.0 for Alice on 2016-05-02) instead of the partition maximum.
customers
    .select(
        $"name", $"date", $"amount", $"id",
        max($"amount")
            .over(Window.partitionBy("name", "date").orderBy("id"))
            .as("max_amount")
    )
    .orderBy("name", "date")
    .show()

+-----+----------+------+---+----------+
| name|      date|amount| id|max_amount|
+-----+----------+------+---+----------+
|Alice|2016-05-01|  50.0|  1|      50.0|
|Alice|2016-05-01|  45.0|  2|      50.0|
|Alice|2016-05-02|  55.0|  3|      55.0|
|Alice|2016-05-02| 100.0|  4|     100.0|
|  Bob|2016-05-01|  25.0|  5|      25.0|
|  Bob|2016-05-01|  29.0|  6|      29.0|
|  Bob|2016-05-02|  27.0|  7|      27.0|
|  Bob|2016-05-02|  30.0|  8|      30.0|
+-----+----------+------+---+----------+



In [47]:
// SQL form of the ordered window: adding ORDER BY id turns max(amount) into a
// running maximum up to the current row rather than the maximum of the whole partition.
spark.sql("""
    select
        name,
        date,
        amount,
        id,
        max(amount) over (partition by name, date order by id) as max_amount
    from
        customers
    """).orderBy("name", "date").show

+-----+----------+------+---+----------+
| name|      date|amount| id|max_amount|
+-----+----------+------+---+----------+
|Alice|2016-05-01|  50.0|  1|      50.0|
|Alice|2016-05-01|  45.0|  2|      50.0|
|Alice|2016-05-02|  55.0|  3|      55.0|
|Alice|2016-05-02| 100.0|  4|     100.0|
|  Bob|2016-05-01|  25.0|  5|      25.0|
|  Bob|2016-05-01|  29.0|  6|      29.0|
|  Bob|2016-05-02|  27.0|  7|      27.0|
|  Bob|2016-05-02|  30.0|  8|      30.0|
+-----+----------+------+---+----------+



In [48]:
// first() over a window ordered by ascending id: every row in a (name, date)
// partition receives the amount of the row with the smallest id.
customers
    .select(
        $"name", $"date", $"amount", $"id",
        first($"amount")
            .over(Window.partitionBy("name", "date").orderBy("id"))
            .as("first_amount")
    )
    .orderBy("name", "date")
    .show()

+-----+----------+------+---+------------+
| name|      date|amount| id|first_amount|
+-----+----------+------+---+------------+
|Alice|2016-05-01|  50.0|  1|        50.0|
|Alice|2016-05-01|  45.0|  2|        50.0|
|Alice|2016-05-02|  55.0|  3|        55.0|
|Alice|2016-05-02| 100.0|  4|        55.0|
|  Bob|2016-05-01|  25.0|  5|        25.0|
|  Bob|2016-05-01|  29.0|  6|        25.0|
|  Bob|2016-05-02|  27.0|  7|        27.0|
|  Bob|2016-05-02|  30.0|  8|        27.0|
+-----+----------+------+---+------------+



In [49]:
// SQL form: first(amount) over a window ordered by id returns, for every row,
// the amount of the row with the smallest id in its (name, date) partition.
spark.sql("""
    select
        name,
        date,
        amount,
        id,
        first(amount) over (partition by name, date order by id) as first_amount
    from
        customers
    """).orderBy("name", "date").show

+-----+----------+------+---+------------+
| name|      date|amount| id|first_amount|
+-----+----------+------+---+------------+
|Alice|2016-05-01|  50.0|  1|        50.0|
|Alice|2016-05-01|  45.0|  2|        50.0|
|Alice|2016-05-02|  55.0|  3|        55.0|
|Alice|2016-05-02| 100.0|  4|        55.0|
|  Bob|2016-05-01|  25.0|  5|        25.0|
|  Bob|2016-05-01|  29.0|  6|        25.0|
|  Bob|2016-05-02|  27.0|  7|        27.0|
|  Bob|2016-05-02|  30.0|  8|        27.0|
+-----+----------+------+---+------------+



In [64]:
// The surprising case: last() over a window ordered by id. Because the default
// frame of an ordered window stops at the current row, last_amount is simply each
// row's own amount, not the amount of the final row in the partition.
customers
    .select(
        $"name", $"date", $"amount", $"id",
        last($"amount")
            .over(Window.partitionBy("name", "date").orderBy("id"))
            .as("last_amount")
    )
    .orderBy("name", "date")
    .show()

+-----+----------+------+---+-----------+
| name|      date|amount| id|last_amount|
+-----+----------+------+---+-----------+
|Alice|2016-05-01|  50.0|  1|       50.0|
|Alice|2016-05-01|  45.0|  2|       45.0|
|Alice|2016-05-02| 100.0|  4|      100.0|
|Alice|2016-05-02|  55.0|  3|       55.0|
|  Bob|2016-05-01|  25.0|  5|       25.0|
|  Bob|2016-05-01|  29.0|  6|       29.0|
|  Bob|2016-05-02|  27.0|  7|       27.0|
|  Bob|2016-05-02|  30.0|  8|       30.0|
+-----+----------+------+---+-----------+



In [54]:
// SQL form of the surprising case: with ORDER BY id the frame ends at the current
// row, so last(amount) echoes each row's own amount instead of the partition's last value.
spark.sql("""
    select
        name,
        date,
        amount,
        id,
        last(amount) over (partition by name, date order by id) as last_amount
    from
        customers
    """).orderBy("name", "date").show

+-----+----------+------+---+-----------+
| name|      date|amount| id|last_amount|
+-----+----------+------+---+-----------+
|Alice|2016-05-01|  50.0|  1|       50.0|
|Alice|2016-05-01|  45.0|  2|       45.0|
|Alice|2016-05-02|  55.0|  3|       55.0|
|Alice|2016-05-02| 100.0|  4|      100.0|
|  Bob|2016-05-01|  25.0|  5|       25.0|
|  Bob|2016-05-01|  29.0|  6|       29.0|
|  Bob|2016-05-02|  27.0|  7|       27.0|
|  Bob|2016-05-02|  30.0|  8|       30.0|
+-----+----------+------+---+-----------+



In [58]:
// Workaround for the last() pitfall: order the window by descending id and take
// first(). Every row then receives the amount of the row with the largest id in
// its (name, date) partition, which is the intended "last" value.
customers
    .select(
        $"name", $"date", $"amount", $"id",
        first($"amount")
            .over(Window.partitionBy("name", "date").orderBy($"id".desc))
            .as("last_amount")
    )
    .orderBy("name", "date")
    .show()

+-----+----------+------+---+-----------+
| name|      date|amount| id|last_amount|
+-----+----------+------+---+-----------+
|Alice|2016-05-01|  45.0|  2|       45.0|
|Alice|2016-05-01|  50.0|  1|       45.0|
|Alice|2016-05-02| 100.0|  4|      100.0|
|Alice|2016-05-02|  55.0|  3|      100.0|
|  Bob|2016-05-01|  29.0|  6|       29.0|
|  Bob|2016-05-01|  25.0|  5|       29.0|
|  Bob|2016-05-02|  30.0|  8|       30.0|
|  Bob|2016-05-02|  27.0|  7|       30.0|
+-----+----------+------+---+-----------+



In [67]:
// SQL form of the workaround: first(amount) over a window ordered by id desc gives
// every row the amount of the highest-id row in its (name, date) partition.
spark.sql("""
    select
        name,
        date,
        amount,
        id,
        first(amount) over (partition by name, date order by id desc) as last_amount
    from
        customers
    """).orderBy("name", "date").show

+-----+----------+------+---+-----------+
| name|      date|amount| id|last_amount|
+-----+----------+------+---+-----------+
|Alice|2016-05-01|  45.0|  2|       45.0|
|Alice|2016-05-01|  50.0|  1|       45.0|
|Alice|2016-05-02| 100.0|  4|      100.0|
|Alice|2016-05-02|  55.0|  3|      100.0|
|  Bob|2016-05-01|  29.0|  6|       29.0|
|  Bob|2016-05-01|  25.0|  5|       29.0|
|  Bob|2016-05-02|  30.0|  8|       30.0|
|  Bob|2016-05-02|  27.0|  7|       30.0|
+-----+----------+------+---+-----------+



## partition by with group by

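Avoid doing so! When a query has a `group by`, a window function may only reference grouping columns or aggregates, so the first two queries below fail with `AnalysisException`. To pick one row per group, use `row_number()` in a subquery instead, as shown at the end of this section.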

In [7]:
// Expected to fail: the query groups by (name, date), but the window function reads
// amount, which is neither a grouping column nor wrapped in an aggregate, so Spark
// raises an AnalysisException.
spark.sql("""
    select
        name,
        date,
        first(amount) over (partition by name, date order by id desc) as last_amount
    from
        customers
    group by
        name, date
    """).orderBy("name", "date").show

org.apache.spark.sql.AnalysisException:  expression 'customers.`amount`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.;;

In [9]:
// Still expected to fail: wrapping amount in max() fixes that reference, but the
// window's ORDER BY id uses a column that is neither grouped nor aggregated, so
// Spark raises an AnalysisException for id instead.
spark.sql("""
    select
        name,
        date,
        first(max(amount)) over (partition by name, date order by id desc) as last_amount
    from
        customers
    group by
        name, date
    """).orderBy("name", "date").show

org.apache.spark.sql.AnalysisException:  expression 'customers.`id`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.;;

In [10]:
// Show the raw rows again for reference. The sort is on (name, date) only, so the
// two rows of a pair may print in either order.
customers.orderBy("name", "date").show

+-----+----------+------+---+
| name|      date|amount| id|
+-----+----------+------+---+
|Alice|2016-05-01|  50.0|  1|
|Alice|2016-05-01|  45.0|  2|
|Alice|2016-05-02|  55.0|  3|
|Alice|2016-05-02| 100.0|  4|
|  Bob|2016-05-01|  29.0|  6|
|  Bob|2016-05-01|  25.0|  5|
|  Bob|2016-05-02|  27.0|  7|
|  Bob|2016-05-02|  30.0|  8|
+-----+----------+------+---+



In [12]:
// Number the rows of each (name, date) partition by descending id, so rownum = 1
// marks the row with the largest id in its partition.
spark.sql("""
    select
        name,
        date,
        amount,
        id,
        row_number() over (partition by name, date order by id desc) as rownum
    from
        customers
    """).orderBy("name", "date").show

+-----+----------+------+---+------+
| name|      date|amount| id|rownum|
+-----+----------+------+---+------+
|Alice|2016-05-01|  45.0|  2|     1|
|Alice|2016-05-01|  50.0|  1|     2|
|Alice|2016-05-02| 100.0|  4|     1|
|Alice|2016-05-02|  55.0|  3|     2|
|  Bob|2016-05-01|  29.0|  6|     1|
|  Bob|2016-05-01|  25.0|  5|     2|
|  Bob|2016-05-02|  30.0|  8|     1|
|  Bob|2016-05-02|  27.0|  7|     2|
+-----+----------+------+---+------+



In [13]:
// Keep only rownum = 1 from the numbered subquery, i.e. the row with the largest id
// in each (name, date) partition. This yields one row per group without mixing
// GROUP BY with a window function.
spark.sql("""
    select 
        *
    from (
        select
            name,
            date,
            amount,
            id,
            row_number() over (partition by name, date order by id desc) as rownum
        from
            customers
        ) A
    where 
        rownum = 1
    """).orderBy("name", "date").show

+-----+----------+------+---+------+
| name|      date|amount| id|rownum|
+-----+----------+------+---+------+
|Alice|2016-05-01|  45.0|  2|     1|
|Alice|2016-05-02| 100.0|  4|     1|
|  Bob|2016-05-01|  29.0|  6|     1|
|  Bob|2016-05-02|  30.0|  8|     1|
+-----+----------+------+---+------+

