In [1]:
from common.session import get_spark_session

# Obtain the session object that the cells below use to build DataFrames.
# NOTE(review): presumably returns a SparkSession and uses 'Aggregated window'
# as the application name — confirm against common.session.get_spark_session.
spark = get_spark_session('Aggregated window')

In [7]:
from pyspark.sql.functions import col

# Sample events, one tuple per row: (user_id, time as a string, amount).
data = [
    ('a', '2025-01-01 09:00:00', 1),
    ('a', '2025-01-01 09:06:00', 1),
    ('a', '2025-01-01 09:11:00', 1),
    ('a', '2025-01-01 09:12:00', 1),
    ('a', '2025-01-01 09:23:00', 1),
    ('b', '2025-01-01 09:11:00', 1),
    ('b', '2025-01-01 09:19:11', 1),
    ('b', '2025-01-01 09:23:11', 1),
]

# Column names, in the same order as the tuple fields above.
cols = ["user_id", "time", "amount"]

# Build the DataFrame and cast the string "time" column to a timestamp
# in a single chain, so df_sales is only ever bound to the final frame.
df_sales = (
    spark.createDataFrame(data, cols)
    .withColumn("time", col("time").cast("timestamp"))
)
df_sales.show()

+-------+-------------------+------+
|user_id|               time|amount|
+-------+-------------------+------+
|      a|2025-01-01 09:00:00|     1|
|      a|2025-01-01 09:06:00|     1|
|      a|2025-01-01 09:11:00|     1|
|      a|2025-01-01 09:12:00|     1|
|      a|2025-01-01 09:23:00|     1|
|      b|2025-01-01 09:11:00|     1|
|      b|2025-01-01 09:19:11|     1|
|      b|2025-01-01 09:23:11|     1|
+-------+-------------------+------+



In [4]:
''' Types: 

    Tumbling: fixed size, non-overlapping (by 10 minutes)
    Sliding: fixed size, sliding window (by 10 minutes, sliding 5 minutes)
    Session: based on an activity with time-out

'''

' Types: \n\n    Tumbling: fixed size, non-overlapping (by 10 minutes)\n    Sliding: fixed size, sliding window (by 10 minutes, sliding 5 minutes)\n    Session: based on an activity with time-out\n\n'

In [15]:
from pyspark.sql.functions import window, sum, session_window


# NOTE: `sum` below is pyspark.sql.functions.sum (imported at the top of this
# cell), not Python's builtin sum, which it shadows.
# The same aggregate is applied to every window type: total amount per group.
total_of_window = sum("amount").alias("total_of_window")

# Tumbling: fixed 10-minute windows (no slide interval given).
tumbling_10m = window(col("time"), "10 minutes")

# Sliding: 10-minute windows that start every 5 minutes.
sliding_10m_5m = window(col("time"), "10 minutes", "5 minutes")

# Session: windows closed after 10 minutes of inactivity.
session_10m = session_window(col("time"), "10 minutes")

# Show one result table per window type, in the order defined above.
for time_window in (tumbling_10m, sliding_10m_5m, session_10m):
    (
        df_sales
        .groupBy("user_id", time_window)
        .agg(total_of_window)
        .show(truncate=False)
    )

+-------+------------------------------------------+---------------+
|user_id|window                                    |total_of_window|
+-------+------------------------------------------+---------------+
|a      |{2025-01-01 09:00:00, 2025-01-01 09:10:00}|2              |
|a      |{2025-01-01 09:10:00, 2025-01-01 09:20:00}|2              |
|a      |{2025-01-01 09:20:00, 2025-01-01 09:30:00}|1              |
|b      |{2025-01-01 09:10:00, 2025-01-01 09:20:00}|2              |
|b      |{2025-01-01 09:20:00, 2025-01-01 09:30:00}|1              |
+-------+------------------------------------------+---------------+

+-------+------------------------------------------+---------------+
|user_id|window                                    |total_of_window|
+-------+------------------------------------------+---------------+
|a      |{2025-01-01 09:00:00, 2025-01-01 09:10:00}|2              |
|a      |{2025-01-01 08:55:00, 2025-01-01 09:05:00}|1              |
|a      |{2025-01-01 09:05:00, 20