In [18]:
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F

In [19]:
spark = SparkSession \
    .builder \
    .appName("PySpark Experimentation") \
    .getOrCreate()

In [20]:
data_df = spark.createDataFrame([
    (1, "2023-06-01", 5, "option_1"),
    (1, "2023-06-01", 6, "option_2"),
    (1, "2023-06-02", 7, "option_1"),
    (1, "2023-06-06", 4, "option_1"),
    (2, "2023-06-03", 10, "option_2"),
    (2, "2023-06-03", 13, "option_2"),
], schema=['id', 'timestamp', 'amount', "categorical_feature"])

data_df = data_df.withColumn("timestamp", F.to_timestamp(F.col("timestamp"), "yyyy-MM-dd"))

In [21]:
data_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- amount: long (nullable = true)
 |-- categorical_feature: string (nullable = true)



In [22]:
data_df.show()

+---+-------------------+------+-------------------+
| id|          timestamp|amount|categorical_feature|
+---+-------------------+------+-------------------+
|  1|2023-06-01 00:00:00|     5|           option_1|
|  1|2023-06-01 00:00:00|     6|           option_2|
|  1|2023-06-02 00:00:00|     7|           option_1|
|  1|2023-06-06 00:00:00|     4|           option_1|
|  2|2023-06-03 00:00:00|    10|           option_2|
|  2|2023-06-03 00:00:00|    13|           option_2|
+---+-------------------+------+-------------------+



In [23]:
utf_shift_hours = 2
windows_size = "2 days"

In [24]:
window_column = F.window(timeColumn=F.col("timestamp"), windowDuration=windows_size, startTime=f"-{utf_shift_hours} hours")

In [25]:
data_df = data_df.withColumn("window", window_column)

In [26]:
data_df.show(truncate=False)

+---+-------------------+------+-------------------+------------------------------------------+
|id |timestamp          |amount|categorical_feature|window                                    |
+---+-------------------+------+-------------------+------------------------------------------+
|1  |2023-06-01 00:00:00|5     |option_1           |{2023-05-31 00:00:00, 2023-06-02 00:00:00}|
|1  |2023-06-01 00:00:00|6     |option_2           |{2023-05-31 00:00:00, 2023-06-02 00:00:00}|
|1  |2023-06-02 00:00:00|7     |option_1           |{2023-06-02 00:00:00, 2023-06-04 00:00:00}|
|1  |2023-06-06 00:00:00|4     |option_1           |{2023-06-06 00:00:00, 2023-06-08 00:00:00}|
|2  |2023-06-03 00:00:00|10    |option_2           |{2023-06-02 00:00:00, 2023-06-04 00:00:00}|
|2  |2023-06-03 00:00:00|13    |option_2           |{2023-06-02 00:00:00, 2023-06-04 00:00:00}|
+---+-------------------+------+-------------------+------------------------------------------+



In [27]:
cat_grouped_df = data_df.groupBy(F.col("id"), window_column).pivot("categorical_feature").count()
cat_grouped_df.show()

+---+--------------------+--------+--------+
| id|              window|option_1|option_2|
+---+--------------------+--------+--------+
|  1|{2023-05-31 00:00...|       1|       1|
|  1|{2023-06-06 00:00...|       1|    null|
|  2|{2023-06-02 00:00...|    null|       2|
|  1|{2023-06-02 00:00...|       1|    null|
+---+--------------------+--------+--------+



In [28]:
num_grouped_df = data_df.groupBy(F.col("id"), window_column).agg(F.sum("amount").alias("tot"))
num_grouped_df.show()

+---+--------------------+---+
| id|              window|tot|
+---+--------------------+---+
|  1|{2023-05-31 00:00...| 11|
|  1|{2023-06-02 00:00...|  7|
|  1|{2023-06-06 00:00...|  4|
|  2|{2023-06-02 00:00...| 23|
+---+--------------------+---+



In [29]:
grouped_df = cat_grouped_df.join(num_grouped_df, on=["id", "window"])
grouped_df.show()

+---+--------------------+--------+--------+---+
| id|              window|option_1|option_2|tot|
+---+--------------------+--------+--------+---+
|  1|{2023-05-31 00:00...|       1|       1| 11|
|  1|{2023-06-02 00:00...|       1|    null|  7|
|  1|{2023-06-06 00:00...|       1|    null|  4|
|  2|{2023-06-02 00:00...|    null|       2| 23|
+---+--------------------+--------+--------+---+



In [30]:
grouped_df = grouped_df.withColumn("timestamp", F.col("window").start)
# grouped_df = grouped_df.withColumn("window_end", F.col("window").end)

grouped_df.show()

+---+--------------------+--------+--------+---+-------------------+
| id|              window|option_1|option_2|tot|          timestamp|
+---+--------------------+--------+--------+---+-------------------+
|  1|{2023-05-31 00:00...|       1|       1| 11|2023-05-31 00:00:00|
|  1|{2023-06-02 00:00...|       1|    null|  7|2023-06-02 00:00:00|
|  1|{2023-06-06 00:00...|       1|    null|  4|2023-06-06 00:00:00|
|  2|{2023-06-02 00:00...|    null|       2| 23|2023-06-02 00:00:00|
+---+--------------------+--------+--------+---+-------------------+



In [36]:
ids_df = data_df.select(F.col("id"), F.col("window").start.alias("window_start"), F.col("window").end.alias("window_end"))
ids_df = ids_df.groupBy("id").agg(F.min("window_start").alias("min_window_start"), F.max("window_end").alias("max_window_end"))
ids_df.show()

+---+-------------------+-------------------+
| id|   min_window_start|     max_window_end|
+---+-------------------+-------------------+
|  1|2023-05-31 00:00:00|2023-06-08 00:00:00|
|  2|2023-06-02 00:00:00|2023-06-04 00:00:00|
+---+-------------------+-------------------+



In [37]:
ids_timestamps_df = ids_df.withColumn("timestamps", F.expr(f"sequence(to_timestamp(min_window_start), to_timestamp(max_window_end), interval {windows_size})")).drop("min_window_start", "max_window_end")
ids_timestamps_df = ids_timestamps_df.withColumn("timestamp", F.explode(F.col("timestamps"))).drop("timestamps")
ids_timestamps_df.show()

+---+-------------------+
| id|          timestamp|
+---+-------------------+
|  1|2023-05-31 00:00:00|
|  1|2023-06-02 00:00:00|
|  1|2023-06-04 00:00:00|
|  1|2023-06-06 00:00:00|
|  1|2023-06-08 00:00:00|
|  2|2023-06-02 00:00:00|
|  2|2023-06-04 00:00:00|
+---+-------------------+



In [33]:
data_df.show()

+---+-------------------+------+-------------------+--------------------+
| id|          timestamp|amount|categorical_feature|              window|
+---+-------------------+------+-------------------+--------------------+
|  1|2023-06-01 00:00:00|     5|           option_1|{2023-05-31 00:00...|
|  1|2023-06-01 00:00:00|     6|           option_2|{2023-05-31 00:00...|
|  1|2023-06-02 00:00:00|     7|           option_1|{2023-06-02 00:00...|
|  1|2023-06-06 00:00:00|     4|           option_1|{2023-06-06 00:00...|
|  2|2023-06-03 00:00:00|    10|           option_2|{2023-06-02 00:00...|
|  2|2023-06-03 00:00:00|    13|           option_2|{2023-06-02 00:00...|
+---+-------------------+------+-------------------+--------------------+



In [38]:
group_on = (grouped_df.timestamp == ids_timestamps_df.timestamp) & (grouped_df.id == ids_timestamps_df.id)
grouped_df.join(ids_timestamps_df, on=["id", "timestamp"], how='right').fillna(0, subset=["tot", "option_1", "option_2"]).show(truncate=False)

+---+-------------------+------------------------------------------+--------+--------+---+
|id |timestamp          |window                                    |option_1|option_2|tot|
+---+-------------------+------------------------------------------+--------+--------+---+
|1  |2023-05-31 00:00:00|{2023-05-31 00:00:00, 2023-06-02 00:00:00}|1       |1       |11 |
|1  |2023-06-02 00:00:00|{2023-06-02 00:00:00, 2023-06-04 00:00:00}|1       |0       |7  |
|1  |2023-06-04 00:00:00|null                                      |0       |0       |0  |
|1  |2023-06-06 00:00:00|{2023-06-06 00:00:00, 2023-06-08 00:00:00}|1       |0       |4  |
|1  |2023-06-08 00:00:00|null                                      |0       |0       |0  |
|2  |2023-06-02 00:00:00|{2023-06-02 00:00:00, 2023-06-04 00:00:00}|0       |2       |23 |
|2  |2023-06-04 00:00:00|null                                      |0       |0       |0  |
+---+-------------------+------------------------------------------+--------+--------+---+