In [4]:
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F

In [5]:
spark = SparkSession \
    .builder \
    .appName("PySpark Experimentation") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/18 09:53:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
data_df = spark.createDataFrame([
    (1, "2023-06-01", 5, 4, "option_1", "option_3"),
    (1, "2023-06-01", 6, 1, "option_2", "option_3"),
    (1, "2023-06-02", 7, 6, "option_1", "option_3"),
    (1, "2023-06-06", 4, 2, "option_1", "option_4"),
    (2, "2023-06-03", 10, 12, "option_2", "option_3"),
    (2, "2023-06-03", 13, 15, "option_2", "option_4"),
], schema=["id", "timestamp", "numerical_1", "numerical_2", "categorical_feature_1", "categorical_feature_2"])

data_df = data_df.withColumn("timestamp", F.to_timestamp(F.col("timestamp"), "yyyy-MM-dd"))

In [7]:
data_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- numerical_1: long (nullable = true)
 |-- numerical_2: long (nullable = true)
 |-- categorical_feature_1: string (nullable = true)
 |-- categorical_feature_2: string (nullable = true)



In [8]:
data_df.show()

                                                                                

+---+-------------------+-----------+-----------+---------------------+---------------------+
| id|          timestamp|numerical_1|numerical_2|categorical_feature_1|categorical_feature_2|
+---+-------------------+-----------+-----------+---------------------+---------------------+
|  1|2023-06-01 00:00:00|          5|          4|             option_1|             option_3|
|  1|2023-06-01 00:00:00|          6|          1|             option_2|             option_3|
|  1|2023-06-02 00:00:00|          7|          6|             option_1|             option_3|
|  1|2023-06-06 00:00:00|          4|          2|             option_1|             option_4|
|  2|2023-06-03 00:00:00|         10|         12|             option_2|             option_3|
|  2|2023-06-03 00:00:00|         13|         15|             option_2|             option_4|
+---+-------------------+-----------+-----------+---------------------+---------------------+



In [9]:
utf_shift_hours = 2
windows_size = "2 days"

In [10]:
window_column = F.window(timeColumn=F.col("timestamp"), windowDuration=windows_size, startTime=f"-{utf_shift_hours} hours")

In [11]:
data_df = data_df.withColumn("window", window_column)

In [12]:
data_df.show(truncate=False)

+---+-------------------+-----------+-----------+---------------------+---------------------+------------------------------------------+
|id |timestamp          |numerical_1|numerical_2|categorical_feature_1|categorical_feature_2|window                                    |
+---+-------------------+-----------+-----------+---------------------+---------------------+------------------------------------------+
|1  |2023-06-01 00:00:00|5          |4          |option_1             |option_3             |{2023-05-31 00:00:00, 2023-06-02 00:00:00}|
|1  |2023-06-01 00:00:00|6          |1          |option_2             |option_3             |{2023-05-31 00:00:00, 2023-06-02 00:00:00}|
|1  |2023-06-02 00:00:00|7          |6          |option_1             |option_3             |{2023-06-02 00:00:00, 2023-06-04 00:00:00}|
|1  |2023-06-06 00:00:00|4          |2          |option_1             |option_4             |{2023-06-06 00:00:00, 2023-06-08 00:00:00}|
|2  |2023-06-03 00:00:00|10         |12  

In [2]:
#[ [ {"cate1": [opt1,opt2]}, {"cat2":[opt2]} ],[ {"cate1": [opt1]} ] ]
process(filters: list[list[dict[str,list[str]]]], aggr_functions: dict[str,str]) -> final_df


######## Generazione delle aggregazioni
# Algoritmo:
# genero il prodotto cartesiano di tutte le possibili funzioni di aggregazioni su tutte le possibili colonne numeriche
# lista di tutti dizionari di aggregazioni possibili

########  Fase di grouping 
# Algoritmo:
# Definisci la lista di colonne ([id_columns] + windows_column)

### Metodi
# grouping(df) -> grouped_df
## colonne = identifires_column.append(windows_column)
## grouped_columns = [F.col(col) for col in colonne]
## grouped_df = data_df.groupBy(*grouped_columns)ù
# capire se ha senso cachare

########  Fase di Aggregation
# Algoritmo:
# ciclo su tutte le possibili aggregazioni
##### creo una colonna per ogni elemento della lista
##### se l'elemento della lista contiene solo una tuple allora la mando a fare pivot
######## il pivot lo fa sulla colonna messa come secondo elemento della tupla 
##### se come secondo elemento troviamo * allora non si mettono filtri nel pivot e lo fa su tutte
####### se l'elemento della lista contiene più di una tupla allora la mando a fare la select
####### tutte le opzioni di una stessa categorica vanno in or (prendi la somma di tutti i soldi spesi in cibo o in sigarette)
####### tutti gli elementi di categorie diverse vanno in and (prendi la somma di tutti i soldi spesi in (cibo or sigarette) and (pagari in contanti)
# faccio il join tra tutti i df che ho creato
from functools import reduce
def join_dataframes(df1: DataFrame, df2: DataFrame):
    return df1.join(df2, "primary_key_column")
df_final = reduce(join_dataframes, df_list)

### Metodi
pivoting(grouped_df, filter: dict[str,str], aggr_functions: dict[str,str]) -> final_df
    return grouped_df.pivot(categorical_var_names,options).agg(aggr_functions)

selecting(grouped_df, filters: list[dict[str,list[str]]], aggr_functions: dict[str,str]) -> final_df
# Filtra le righe che soddisfano le condizioni desiderate
filter_conditions = [
    (F.col("categorical_feature_1").isin(["pasta", "spezie"])),
    (F.col("categorical_feature_2").isin(["bitcoin", "cash"]))
]
# Combina tutti i filtri in una unica espressione logica utilizzando l'operatore logico AND
filter_expression = reduce(lambda f1, f2: f1 & f2, filter_conditions)
# lambda lambda f1, f2: f1 & f2 -> funzione anonima che prende un due filtri e una condizione e ritorna l'unione dei due filtri secondo la condizione data
# reduce(funzione, lista) -> applica la funzione a tutti gli elementi della lista e ritorna il risultato
# Applica il filtro complessivo una volta sola 
 filtered_df = data_df.filter(filter_expression)





[1, 2, 3, 4]

In [29]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from functools import reduce

# Crea la sessione di Spark
spark = SparkSession.builder.getOrCreate()

# Definisci lo schema del DataFrame di esempio
schema = ["id", "timestamp", "numerical_1", "numerical_2", "categorical_feature_1", "categorical_feature_2"]

# Crea il DataFrame di esempio
data_df = spark.createDataFrame([
    (1, "2023-06-01", 5, 4, "pasta", "bitcoin"),
    (1, "2023-06-01", 4, 1, "spezie", "bancomat"),
    (1, "2023-06-01", 6, 1, "spazzolini", "bancomat"),
    (1, "2023-06-02", 7, 6, "pasta", "bancomat"),
    (1, "2023-06-06", 4, 2, "pasta", "cash"),
    #(2, "2023-06-03", 10, 12, "option_2", "option_3"),
    #(2, "2023-06-03", 13, 15, "option_2", "option_4"),
], schema=schema)




# Converti la colonna "timestamp" in formato data
data_df = data_df.withColumn("timestamp", F.to_timestamp(F.col("timestamp"), "yyyy-MM-dd"))

filter_conditions = [
    (F.col("categorical_feature_1").isin(["pasta", "spezie"])),
    (F.col("categorical_feature_2").isin(["bitcoin", "cash"]))
]

# Applica i filtri utilizzando l'operatore logico AND
filter_expression = reduce(lambda f1, f2: f1 & f2, filter_conditions)
# Applica il filtro complessivo una volta solo su una condizioni logica creata unendo tutti i possibili filtri
filtered_df = data_df.filter(filter_expression)


# Calcola la media di numerical_1
aggregated_df = filtered_df.groupBy().avg("numerical_1")

# Visualizza il risultato
aggregated_df.show()

data_df.show()



+----------------+
|avg(numerical_1)|
+----------------+
|             4.5|
+----------------+

+---+-------------------+-----------+-----------+---------------------+---------------------+
| id|          timestamp|numerical_1|numerical_2|categorical_feature_1|categorical_feature_2|
+---+-------------------+-----------+-----------+---------------------+---------------------+
|  1|2023-06-01 00:00:00|          5|          4|                pasta|              bitcoin|
|  1|2023-06-01 00:00:00|          4|          1|               spezie|             bancomat|
|  1|2023-06-01 00:00:00|          6|          1|           spazzolini|             bancomat|
|  1|2023-06-02 00:00:00|          7|          6|                pasta|             bancomat|
|  1|2023-06-06 00:00:00|          4|          2|                pasta|                 cash|
+---+-------------------+-----------+-----------+---------------------+---------------------+



In [17]:
# Solution 3

cat_grouped_1_df = data_df.groupBy(F.col("id"), window_column).pivot("categorical_feature_1", []).agg({"numerical_1": "sum"})
cat_grouped_2_df = data_df.groupBy(F.col("id"), window_column).pivot("categorical_feature_2").agg({"numerical_2": "sum"})

cat_grouped_df = cat_grouped_1_df.join(cat_grouped_2_df, on=["id", "window"])
cat_grouped_1_df.show()

+---+--------------------+
| id|              window|
+---+--------------------+
|  1|{2023-05-31 00:00...|
|  1|{2023-06-02 00:00...|
|  1|{2023-06-06 00:00...|
|  2|{2023-06-02 00:00...|
+---+--------------------+



In [23]:
num_grouped_df = data_df.groupBy(F.col("id"), window_column).agg(F.sum("numerical_1").alias("tot"))
num_grouped_df.show()

+---+--------------------+---+
| id|              window|tot|
+---+--------------------+---+
|  1|{2023-05-31 00:00...| 11|
|  1|{2023-06-02 00:00...|  7|
|  1|{2023-06-06 00:00...|  4|
|  2|{2023-06-02 00:00...| 23|
+---+--------------------+---+



In [24]:
grouped_df = cat_grouped_df.join(num_grouped_df, on=["id", "window"])
grouped_df.show()

+---+--------------------+--------+--------+---+
| id|              window|option_1|option_2|tot|
+---+--------------------+--------+--------+---+
|  1|{2023-05-31 00:00...|       5|       6| 11|
|  1|{2023-06-02 00:00...|       7|    null|  7|
|  1|{2023-06-06 00:00...|       4|    null|  4|
|  2|{2023-06-02 00:00...|    null|      23| 23|
+---+--------------------+--------+--------+---+



In [25]:
grouped_df = grouped_df.withColumn("timestamp", F.col("window").start)
# grouped_df = grouped_df.withColumn("window_end", F.col("window").end)

grouped_df.show()

+---+--------------------+--------+--------+---+-------------------+
| id|              window|option_1|option_2|tot|          timestamp|
+---+--------------------+--------+--------+---+-------------------+
|  1|{2023-05-31 00:00...|       5|       6| 11|2023-05-31 00:00:00|
|  1|{2023-06-02 00:00...|       7|    null|  7|2023-06-02 00:00:00|
|  1|{2023-06-06 00:00...|       4|    null|  4|2023-06-06 00:00:00|
|  2|{2023-06-02 00:00...|    null|      23| 23|2023-06-02 00:00:00|
+---+--------------------+--------+--------+---+-------------------+



In [26]:
ids_df = data_df.select(F.col("id"), F.col("window").start.alias("window_start"), F.col("window").end.alias("window_end")).distinct()
ids_df = ids_df.groupBy("id").agg(F.min("window_start").alias("min_window_start"), F.max("window_end").alias("max_window_end"))
ids_df.show()

+---+-------------------+-------------------+
| id|   min_window_start|     max_window_end|
+---+-------------------+-------------------+
|  1|2023-05-31 00:00:00|2023-06-08 00:00:00|
|  2|2023-06-02 00:00:00|2023-06-04 00:00:00|
+---+-------------------+-------------------+



In [27]:
ids_timestamps_df = ids_df.withColumn("timestamps", F.expr(f"sequence(to_timestamp(min_window_start), to_timestamp(max_window_end), interval {windows_size})")).drop("min_window_start", "max_window_end")
ids_timestamps_df = ids_timestamps_df.withColumn("timestamp", F.explode(F.col("timestamps"))).drop("window_start", "window_end", "timestamps")
ids_timestamps_df.show()

+---+-------------------+
| id|          timestamp|
+---+-------------------+
|  1|2023-05-31 00:00:00|
|  1|2023-06-02 00:00:00|
|  1|2023-06-04 00:00:00|
|  1|2023-06-06 00:00:00|
|  1|2023-06-08 00:00:00|
|  2|2023-06-02 00:00:00|
|  2|2023-06-04 00:00:00|
+---+-------------------+



In [28]:
data_df.show()

+---+-------------------+-----------+-----------+-------------------+--------------------+
| id|          timestamp|numerical_1|numerical_2|categorical_feature|              window|
+---+-------------------+-----------+-----------+-------------------+--------------------+
|  1|2023-06-01 00:00:00|          5|          4|           option_1|{2023-05-31 00:00...|
|  1|2023-06-01 00:00:00|          6|          1|           option_2|{2023-05-31 00:00...|
|  1|2023-06-02 00:00:00|          7|          6|           option_1|{2023-06-02 00:00...|
|  1|2023-06-06 00:00:00|          4|          2|           option_1|{2023-06-06 00:00...|
|  2|2023-06-03 00:00:00|         10|         12|           option_2|{2023-06-02 00:00...|
|  2|2023-06-03 00:00:00|         13|         15|           option_2|{2023-06-02 00:00...|
+---+-------------------+-----------+-----------+-------------------+--------------------+



In [29]:
group_on = (grouped_df.timestamp == ids_timestamps_df.timestamp) & (grouped_df.id == ids_timestamps_df.id)
grouped_df.join(ids_timestamps_df, on=["id", "timestamp"], how='right').fillna(0, subset=["tot", "option_1", "option_2"]).drop("window").show(truncate=False)

+---+-------------------+--------+--------+---+
|id |timestamp          |option_1|option_2|tot|
+---+-------------------+--------+--------+---+
|1  |2023-05-31 00:00:00|5       |6       |11 |
|1  |2023-06-02 00:00:00|7       |0       |7  |
|1  |2023-06-04 00:00:00|0       |0       |0  |
|1  |2023-06-06 00:00:00|4       |0       |4  |
|1  |2023-06-08 00:00:00|0       |0       |0  |
|2  |2023-06-02 00:00:00|0       |23      |23 |
|2  |2023-06-04 00:00:00|0       |0       |0  |
+---+-------------------+--------+--------+---+

