In [5]:
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F

In [6]:
spark = SparkSession \
    .builder \
    .appName("PySpark Experimentation") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/11 12:25:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [18]:
data_df = spark.createDataFrame([
    (348272371, "2023-01-01", 5.50, "shopping", "true"),
    (348272371, "2023-01-01", 6.10, "salute", "false"),
    (348272371, "2023-01-01", 8.20, "trasporti", "false"),
    (348272371, "2023-01-01", 1.50, "trasporti", "true"),
    (348272371, "2023-01-06", 20.20, "shopping", "false"),
    (348272371, "2023-01-06", 43.00, "shopping", "true"),
    (348272371, "2023-01-06", 72.20, "shopping", "false"),
    (234984832, "2023-01-01", 15.34, "salute", "true"),
    (234984832, "2023-01-01", 36.22, "salute", "true"),
    (234984832, "2023-01-01", 78.35, "salute", "false"),
    (234984832, "2023-01-02", 2.20, "trasporti", "true"),
], schema=["ID_BIC_CLIENTE", "DATA_TRANSAZIONE", "IMPORTO", "CA_CATEGORY_LIV0", "IS_CARTA"])

data_df = data_df.withColumn("DATA_TRANSAZIONE", F.to_timestamp(F.col("DATA_TRANSAZIONE"), "yyyy-MM-dd"))

In [19]:
data_df.printSchema()

root
 |-- ID_BIC_CLIENTE: long (nullable = true)
 |-- DATA_TRANSAZIONE: timestamp (nullable = true)
 |-- IMPORTO_SEGNO: double (nullable = true)
 |-- CA_CATEGORY_LIV0: string (nullable = true)
 |-- IS_CARTA: string (nullable = true)



In [20]:
data_df.show()

+--------------+-------------------+-------------+----------------+--------+
|ID_BIC_CLIENTE|   DATA_TRANSAZIONE|IMPORTO_SEGNO|CA_CATEGORY_LIV0|IS_CARTA|
+--------------+-------------------+-------------+----------------+--------+
|     348272371|2023-01-01 00:00:00|          5.5|        shopping|    true|
|     348272371|2023-01-01 00:00:00|          6.1|          salute|   false|
|     348272371|2023-01-01 00:00:00|          8.2|       trasporti|   false|
|     348272371|2023-01-01 00:00:00|          1.5|       trasporti|    true|
|     348272371|2023-01-04 00:00:00|         20.2|        shopping|   false|
|     348272371|2023-01-04 00:00:00|         43.0|        shopping|    true|
|     348272371|2023-01-04 00:00:00|         72.2|        shopping|   false|
|     234984832|2023-01-01 00:00:00|        15.34|          salute|    true|
|     234984832|2023-01-01 00:00:00|        36.22|          salute|    true|
|     234984832|2023-01-01 00:00:00|        78.35|          salute|   false|

In [21]:
utf_shift_hours = 1
windows_size = "2 days"

In [22]:
window_column = F.window(timeColumn=F.col("DATA_TRANSAZIONE"), windowDuration=windows_size, startTime=f"-{utf_shift_hours} hours")

In [23]:
data_df = data_df.withColumn("window", window_column)

In [24]:
data_df.show(truncate=False)

+--------------+-------------------+-------------+----------------+--------+------------------------------------------+
|ID_BIC_CLIENTE|DATA_TRANSAZIONE   |IMPORTO_SEGNO|CA_CATEGORY_LIV0|IS_CARTA|window                                    |
+--------------+-------------------+-------------+----------------+--------+------------------------------------------+
|348272371     |2023-01-01 00:00:00|5.5          |shopping        |true    |{2023-01-01 00:00:00, 2023-01-03 00:00:00}|
|348272371     |2023-01-01 00:00:00|6.1          |salute          |false   |{2023-01-01 00:00:00, 2023-01-03 00:00:00}|
|348272371     |2023-01-01 00:00:00|8.2          |trasporti       |false   |{2023-01-01 00:00:00, 2023-01-03 00:00:00}|
|348272371     |2023-01-01 00:00:00|1.5          |trasporti       |true    |{2023-01-01 00:00:00, 2023-01-03 00:00:00}|
|348272371     |2023-01-04 00:00:00|20.2         |shopping        |false   |{2023-01-03 00:00:00, 2023-01-05 00:00:00}|
|348272371     |2023-01-04 00:00:00|43.0

In [13]:
# Solution 3

cat_grouped_1_df = data_df.groupBy(F.col("ID_BIC_CLIENTE"), window_column).pivot("categorical_feature_1", ["option_1"]).agg({"numerical_1": "sum", "numerical_2": "mean"})
cat_grouped_2_df = data_df.groupBy(F.col("ID_BIC_CLIENTE"), window_column).pivot("categorical_feature_2").agg({"numerical_2": "sum"})

cat_grouped_df = cat_grouped_1_df.join(cat_grouped_2_df, on=["ID_BIC_CLIENTE", "window"])
cat_grouped_df.show()

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `numerical_2` cannot be resolved. Did you mean one of the following? [`ID_BIC_CLIENTE`, `DATA_TRANSAZIONE`, `IMPORTO_SEGNO`, `CA_CATEGORY_LIV0`, `IS_CARTA`, `window`].

In [23]:
num_grouped_df = data_df.groupBy(F.col("id"), window_column).agg(F.sum("numerical_1").alias("tot"))
num_grouped_df.show()

+---+--------------------+---+
| id|              window|tot|
+---+--------------------+---+
|  1|{2023-05-31 00:00...| 11|
|  1|{2023-06-02 00:00...|  7|
|  1|{2023-06-06 00:00...|  4|
|  2|{2023-06-02 00:00...| 23|
+---+--------------------+---+



In [24]:
grouped_df = cat_grouped_df.join(num_grouped_df, on=["id", "window"])
grouped_df.show()

+---+--------------------+--------+--------+---+
| id|              window|option_1|option_2|tot|
+---+--------------------+--------+--------+---+
|  1|{2023-05-31 00:00...|       5|       6| 11|
|  1|{2023-06-02 00:00...|       7|    null|  7|
|  1|{2023-06-06 00:00...|       4|    null|  4|
|  2|{2023-06-02 00:00...|    null|      23| 23|
+---+--------------------+--------+--------+---+



In [25]:
grouped_df = grouped_df.withColumn("timestamp", F.col("window").start)
# grouped_df = grouped_df.withColumn("window_end", F.col("window").end)

grouped_df.show()

+---+--------------------+--------+--------+---+-------------------+
| id|              window|option_1|option_2|tot|          timestamp|
+---+--------------------+--------+--------+---+-------------------+
|  1|{2023-05-31 00:00...|       5|       6| 11|2023-05-31 00:00:00|
|  1|{2023-06-02 00:00...|       7|    null|  7|2023-06-02 00:00:00|
|  1|{2023-06-06 00:00...|       4|    null|  4|2023-06-06 00:00:00|
|  2|{2023-06-02 00:00...|    null|      23| 23|2023-06-02 00:00:00|
+---+--------------------+--------+--------+---+-------------------+



In [26]:
ids_df = data_df.select(F.col("id"), F.col("window").start.alias("window_start"), F.col("window").end.alias("window_end")).distinct()
ids_df = ids_df.groupBy("id").agg(F.min("window_start").alias("min_window_start"), F.max("window_end").alias("max_window_end"))
ids_df.show()

+---+-------------------+-------------------+
| id|   min_window_start|     max_window_end|
+---+-------------------+-------------------+
|  1|2023-05-31 00:00:00|2023-06-08 00:00:00|
|  2|2023-06-02 00:00:00|2023-06-04 00:00:00|
+---+-------------------+-------------------+



In [27]:
ids_timestamps_df = ids_df.withColumn("timestamps", F.expr(f"sequence(to_timestamp(min_window_start), to_timestamp(max_window_end), interval {windows_size})")).drop("min_window_start", "max_window_end")
ids_timestamps_df = ids_timestamps_df.withColumn("timestamp", F.explode(F.col("timestamps"))).drop("window_start", "window_end", "timestamps")
ids_timestamps_df.show()

+---+-------------------+
| id|          timestamp|
+---+-------------------+
|  1|2023-05-31 00:00:00|
|  1|2023-06-02 00:00:00|
|  1|2023-06-04 00:00:00|
|  1|2023-06-06 00:00:00|
|  1|2023-06-08 00:00:00|
|  2|2023-06-02 00:00:00|
|  2|2023-06-04 00:00:00|
+---+-------------------+



In [28]:
data_df.show()

+---+-------------------+-----------+-----------+-------------------+--------------------+
| id|          timestamp|numerical_1|numerical_2|categorical_feature|              window|
+---+-------------------+-----------+-----------+-------------------+--------------------+
|  1|2023-06-01 00:00:00|          5|          4|           option_1|{2023-05-31 00:00...|
|  1|2023-06-01 00:00:00|          6|          1|           option_2|{2023-05-31 00:00...|
|  1|2023-06-02 00:00:00|          7|          6|           option_1|{2023-06-02 00:00...|
|  1|2023-06-06 00:00:00|          4|          2|           option_1|{2023-06-06 00:00...|
|  2|2023-06-03 00:00:00|         10|         12|           option_2|{2023-06-02 00:00...|
|  2|2023-06-03 00:00:00|         13|         15|           option_2|{2023-06-02 00:00...|
+---+-------------------+-----------+-----------+-------------------+--------------------+



In [29]:
group_on = (grouped_df.timestamp == ids_timestamps_df.timestamp) & (grouped_df.id == ids_timestamps_df.id)
grouped_df.join(ids_timestamps_df, on=["id", "timestamp"], how='right').fillna(0, subset=["tot", "option_1", "option_2"]).drop("window").show(truncate=False)

+---+-------------------+--------+--------+---+
|id |timestamp          |option_1|option_2|tot|
+---+-------------------+--------+--------+---+
|1  |2023-05-31 00:00:00|5       |6       |11 |
|1  |2023-06-02 00:00:00|7       |0       |7  |
|1  |2023-06-04 00:00:00|0       |0       |0  |
|1  |2023-06-06 00:00:00|4       |0       |4  |
|1  |2023-06-08 00:00:00|0       |0       |0  |
|2  |2023-06-02 00:00:00|0       |23      |23 |
|2  |2023-06-04 00:00:00|0       |0       |0  |
+---+-------------------+--------+--------+---+

