In [4]:
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F

In [5]:
spark = SparkSession \
    .builder \
    .appName("PySpark Experimentation") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/04 11:06:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
data_df = spark.createDataFrame([
    (1, "2023-06-01", 5, 4, "option_1", "option_3"),
    (1, "2023-06-01", 6, 1, "option_2", "option_3"),
    (1, "2023-06-02", 7, 6, "option_1", "option_3"),
    (1, "2023-06-06", 4, 2, "option_1", "option_4"),
    (2, "2023-06-03", 10, 12, "option_2", "option_3"),
    (2, "2023-06-03", 13, 15, "option_2", "option_4"),
], schema=["id", "timestamp", "numerical_1", "numerical_2", "categorical_feature_1", "categorical_feature_2"])

data_df = data_df.withColumn("timestamp", F.to_timestamp(F.col("timestamp"), "yyyy-MM-dd"))

In [7]:
data_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- numerical_1: long (nullable = true)
 |-- numerical_2: long (nullable = true)
 |-- categorical_feature_1: string (nullable = true)
 |-- categorical_feature_2: string (nullable = true)



In [8]:
data_df.show()

                                                                                

+---+-------------------+-----------+-----------+---------------------+---------------------+
| id|          timestamp|numerical_1|numerical_2|categorical_feature_1|categorical_feature_2|
+---+-------------------+-----------+-----------+---------------------+---------------------+
|  1|2023-06-01 00:00:00|          5|          4|             option_1|             option_3|
|  1|2023-06-01 00:00:00|          6|          1|             option_2|             option_3|
|  1|2023-06-02 00:00:00|          7|          6|             option_1|             option_3|
|  1|2023-06-06 00:00:00|          4|          2|             option_1|             option_4|
|  2|2023-06-03 00:00:00|         10|         12|             option_2|             option_3|
|  2|2023-06-03 00:00:00|         13|         15|             option_2|             option_4|
+---+-------------------+-----------+-----------+---------------------+---------------------+



In [9]:
utf_shift_hours = 2
windows_size = "2 days"

In [10]:
window_column = F.window(timeColumn=F.col("timestamp"), windowDuration=windows_size, startTime=f"-{utf_shift_hours} hours")

In [11]:
data_df = data_df.withColumn("window", window_column)

In [12]:
data_df.show(truncate=False)

+---+-------------------+-----------+-----------+---------------------+---------------------+------------------------------------------+
|id |timestamp          |numerical_1|numerical_2|categorical_feature_1|categorical_feature_2|window                                    |
+---+-------------------+-----------+-----------+---------------------+---------------------+------------------------------------------+
|1  |2023-06-01 00:00:00|5          |4          |option_1             |option_3             |{2023-05-31 00:00:00, 2023-06-02 00:00:00}|
|1  |2023-06-01 00:00:00|6          |1          |option_2             |option_3             |{2023-05-31 00:00:00, 2023-06-02 00:00:00}|
|1  |2023-06-02 00:00:00|7          |6          |option_1             |option_3             |{2023-06-02 00:00:00, 2023-06-04 00:00:00}|
|1  |2023-06-06 00:00:00|4          |2          |option_1             |option_4             |{2023-06-06 00:00:00, 2023-06-08 00:00:00}|
|2  |2023-06-03 00:00:00|10         |12  

23/07/04 11:07:06 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [20]:
# Solution 3

cat_grouped_1_df = data_df.groupBy(F.col("id"), window_column).pivot("categorical_feature_1", ["option_1"]).agg({"numerical_1": "sum", "numerical_2": "mean"})
cat_grouped_2_df = data_df.groupBy(F.col("id"), window_column).pivot("categorical_feature_2").agg({"numerical_2": "sum"})

cat_grouped_df = cat_grouped_1_df.join(cat_grouped_2_df, on=["id", "window"])
cat_grouped_df.show()

+---+--------------------+-------------------------+-------------------------+--------+--------+
| id|              window|option_1_avg(numerical_2)|option_1_sum(numerical_1)|option_3|option_4|
+---+--------------------+-------------------------+-------------------------+--------+--------+
|  1|{2023-05-31 00:00...|                      4.0|                        5|       5|    null|
|  1|{2023-06-06 00:00...|                      2.0|                        4|    null|       2|
|  2|{2023-06-02 00:00...|                     null|                     null|      12|      15|
|  1|{2023-06-02 00:00...|                      6.0|                        7|       6|    null|
+---+--------------------+-------------------------+-------------------------+--------+--------+



In [23]:
num_grouped_df = data_df.groupBy(F.col("id"), window_column).agg(F.sum("numerical_1").alias("tot"))
num_grouped_df.show()

+---+--------------------+---+
| id|              window|tot|
+---+--------------------+---+
|  1|{2023-05-31 00:00...| 11|
|  1|{2023-06-02 00:00...|  7|
|  1|{2023-06-06 00:00...|  4|
|  2|{2023-06-02 00:00...| 23|
+---+--------------------+---+



In [24]:
grouped_df = cat_grouped_df.join(num_grouped_df, on=["id", "window"])
grouped_df.show()

+---+--------------------+--------+--------+---+
| id|              window|option_1|option_2|tot|
+---+--------------------+--------+--------+---+
|  1|{2023-05-31 00:00...|       5|       6| 11|
|  1|{2023-06-02 00:00...|       7|    null|  7|
|  1|{2023-06-06 00:00...|       4|    null|  4|
|  2|{2023-06-02 00:00...|    null|      23| 23|
+---+--------------------+--------+--------+---+



In [25]:
grouped_df = grouped_df.withColumn("timestamp", F.col("window").start)
# grouped_df = grouped_df.withColumn("window_end", F.col("window").end)

grouped_df.show()

+---+--------------------+--------+--------+---+-------------------+
| id|              window|option_1|option_2|tot|          timestamp|
+---+--------------------+--------+--------+---+-------------------+
|  1|{2023-05-31 00:00...|       5|       6| 11|2023-05-31 00:00:00|
|  1|{2023-06-02 00:00...|       7|    null|  7|2023-06-02 00:00:00|
|  1|{2023-06-06 00:00...|       4|    null|  4|2023-06-06 00:00:00|
|  2|{2023-06-02 00:00...|    null|      23| 23|2023-06-02 00:00:00|
+---+--------------------+--------+--------+---+-------------------+



In [26]:
ids_df = data_df.select(F.col("id"), F.col("window").start.alias("window_start"), F.col("window").end.alias("window_end")).distinct()
ids_df = ids_df.groupBy("id").agg(F.min("window_start").alias("min_window_start"), F.max("window_end").alias("max_window_end"))
ids_df.show()

+---+-------------------+-------------------+
| id|   min_window_start|     max_window_end|
+---+-------------------+-------------------+
|  1|2023-05-31 00:00:00|2023-06-08 00:00:00|
|  2|2023-06-02 00:00:00|2023-06-04 00:00:00|
+---+-------------------+-------------------+



In [27]:
ids_timestamps_df = ids_df.withColumn("timestamps", F.expr(f"sequence(to_timestamp(min_window_start), to_timestamp(max_window_end), interval {windows_size})")).drop("min_window_start", "max_window_end")
ids_timestamps_df = ids_timestamps_df.withColumn("timestamp", F.explode(F.col("timestamps"))).drop("window_start", "window_end", "timestamps")
ids_timestamps_df.show()

+---+-------------------+
| id|          timestamp|
+---+-------------------+
|  1|2023-05-31 00:00:00|
|  1|2023-06-02 00:00:00|
|  1|2023-06-04 00:00:00|
|  1|2023-06-06 00:00:00|
|  1|2023-06-08 00:00:00|
|  2|2023-06-02 00:00:00|
|  2|2023-06-04 00:00:00|
+---+-------------------+



In [28]:
data_df.show()

+---+-------------------+-----------+-----------+-------------------+--------------------+
| id|          timestamp|numerical_1|numerical_2|categorical_feature|              window|
+---+-------------------+-----------+-----------+-------------------+--------------------+
|  1|2023-06-01 00:00:00|          5|          4|           option_1|{2023-05-31 00:00...|
|  1|2023-06-01 00:00:00|          6|          1|           option_2|{2023-05-31 00:00...|
|  1|2023-06-02 00:00:00|          7|          6|           option_1|{2023-06-02 00:00...|
|  1|2023-06-06 00:00:00|          4|          2|           option_1|{2023-06-06 00:00...|
|  2|2023-06-03 00:00:00|         10|         12|           option_2|{2023-06-02 00:00...|
|  2|2023-06-03 00:00:00|         13|         15|           option_2|{2023-06-02 00:00...|
+---+-------------------+-----------+-----------+-------------------+--------------------+



In [29]:
group_on = (grouped_df.timestamp == ids_timestamps_df.timestamp) & (grouped_df.id == ids_timestamps_df.id)
grouped_df.join(ids_timestamps_df, on=["id", "timestamp"], how='right').fillna(0, subset=["tot", "option_1", "option_2"]).drop("window").show(truncate=False)

+---+-------------------+--------+--------+---+
|id |timestamp          |option_1|option_2|tot|
+---+-------------------+--------+--------+---+
|1  |2023-05-31 00:00:00|5       |6       |11 |
|1  |2023-06-02 00:00:00|7       |0       |7  |
|1  |2023-06-04 00:00:00|0       |0       |0  |
|1  |2023-06-06 00:00:00|4       |0       |4  |
|1  |2023-06-08 00:00:00|0       |0       |0  |
|2  |2023-06-02 00:00:00|0       |23      |23 |
|2  |2023-06-04 00:00:00|0       |0       |0  |
+---+-------------------+--------+--------+---+

