In [1]:
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
from functools import reduce
from typing import List, Tuple, Union, Dict
from pyspark.sql.dataframe import DataFrame


In [2]:
spark = SparkSession \
    .builder \
    .appName("PySpark Experimentation") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/24 10:26:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/07/24 10:27:00 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Crea la sessione di Spark
spark = SparkSession.builder.getOrCreate()

# Definisci lo schema del DataFrame di esempio
schema = ["id", "timestamp", "numerical_1", "numerical_2", "categorical_feature_1", "categorical_feature_2"]

# Crea il DataFrame di esempio
data_df = spark.createDataFrame([
    (1, "2023-06-01", 1, 2, "pasta", "bitcoin"),
    (1, "2023-06-01", 2, 3, "pasta", "cash"),
    (1, "2023-06-01", 4, 1, "spezie", "bancomat"),
    (1, "2023-06-01", 6, 1, "spazzolini", "bancomat"),
    (1, "2023-06-02", 7, 6, "pasta", "bancomat"),
    (1, "2023-06-02", 7, 6, "spezie", "bancomat"),
    (1, "2023-06-06", 4, 2, "pasta", "cash"),
    (2, "2023-06-03", 10, 12, "pasta", "cash"),
    (2, "2023-06-03", 13, 15, "spazzolini", "cash"),
    (2, "2023-06-03", 1, 15, "spazzolini", "cash"),
], schema=schema)


# Converti la colonna "timestamp" in formato data
data_df = data_df.withColumn("timestamp", F.to_timestamp(F.col("timestamp"), "yyyy-MM-dd"))

In [4]:
data_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- numerical_1: long (nullable = true)
 |-- numerical_2: long (nullable = true)
 |-- categorical_feature_1: string (nullable = true)
 |-- categorical_feature_2: string (nullable = true)



In [5]:
data_df.show()

+---+-------------------+-----------+-----------+---------------------+---------------------+
| id|          timestamp|numerical_1|numerical_2|categorical_feature_1|categorical_feature_2|
+---+-------------------+-----------+-----------+---------------------+---------------------+
|  1|2023-06-01 00:00:00|          1|          2|                pasta|              bitcoin|
|  1|2023-06-01 00:00:00|          2|          3|                pasta|                 cash|
|  1|2023-06-01 00:00:00|          4|          1|               spezie|             bancomat|
|  1|2023-06-01 00:00:00|          6|          1|           spazzolini|             bancomat|
|  1|2023-06-02 00:00:00|          7|          6|                pasta|             bancomat|
|  1|2023-06-02 00:00:00|          7|          6|               spezie|             bancomat|
|  1|2023-06-06 00:00:00|          4|          2|                pasta|                 cash|
|  2|2023-06-03 00:00:00|         10|         12|           

STEP1: Bucketing

In [6]:
utf_shift_hours = 2
windows_size = "2 days"

window_column = F.window(timeColumn=F.col("timestamp"), windowDuration=windows_size, startTime=f"-{utf_shift_hours} hours")
data_df = data_df.withColumn("bucket", window_column)
data_df.show(truncate=False)

+---+-------------------+-----------+-----------+---------------------+---------------------+------------------------------------------+
|id |timestamp          |numerical_1|numerical_2|categorical_feature_1|categorical_feature_2|bucket                                    |
+---+-------------------+-----------+-----------+---------------------+---------------------+------------------------------------------+
|1  |2023-06-01 00:00:00|1          |2          |pasta                |bitcoin              |{2023-05-31 00:00:00, 2023-06-02 00:00:00}|
|1  |2023-06-01 00:00:00|2          |3          |pasta                |cash                 |{2023-05-31 00:00:00, 2023-06-02 00:00:00}|
|1  |2023-06-01 00:00:00|4          |1          |spezie               |bancomat             |{2023-05-31 00:00:00, 2023-06-02 00:00:00}|
|1  |2023-06-01 00:00:00|6          |1          |spazzolini           |bancomat             |{2023-05-31 00:00:00, 2023-06-02 00:00:00}|
|1  |2023-06-02 00:00:00|7          |6   

STEP2: Aggregations

In [7]:
########  Generazione di tutte le aggregazioni 
# genero il prodotto cartesiano di tutte le possibili funzioni di aggregazioni su tutte le possibili colonne numeriche
# lista di tutti dizionari di aggregazioni possibili
def _all_aggregation_combination(numerical_col_name: list[str], aggr_functions: list[str])->list[dict[str,str]]:
    all_aggregations = []
    for col in numerical_col_name:
        for func in aggr_functions:
            all_aggregations.append({col: func})

    return all_aggregations

In [8]:
########  Fase di grouping 
# Algoritmo:
# Definisci la lista di colonne ([id_columns] + windows_column)
def _grouping(data_df: DataFrame, identifier_cols_name: list[str]) -> DataFrame:
    grouped_columns = [F.col(col) for col in identifier_cols_name] 
    grouped_df = data_df.groupBy(*grouped_columns)
    return grouped_df
    #capire se ha senso cachare

In [9]:
########  Fase di Aggregation

def _pivoting(grouped_df: DataFrame, filter: list[tuple[str,list[str]]], aggregation: dict[str,str]) -> DataFrame:
    cat_var_name,cat_options = filter[0]
    if len(cat_options) == 0:
        print("pivot senza filtri")
        pivoted_df = grouped_df.pivot(cat_var_name).agg(aggregation)
    else: 
        print("pivot con filtri di opzioni")
        pivoted_df = grouped_df.pivot(cat_var_name,cat_options).agg(aggregation)
    return pivoted_df 

#schema = ["id", "timestamp", "numerical_1", "numerical_2", "categorical_feature_1", "categorical_feature_2"]
_pivoting(data_df.groupBy("id"),[("categorical_feature_1",[])],{"numerical_1":"sum"}).show()

pivot senza filtri
+---+-----+----------+------+
| id|pasta|spazzolini|spezie|
+---+-----+----------+------+
|  1|   14|         6|    11|
|  2|   10|        14|  null|
+---+-----+----------+------+



In [10]:

def create_column_name(operation: str, filters: list[tuple[str,list[str]]], numerical_col_name: str ) -> str:
    new_col_name = f"{operation}_of_{numerical_col_name}"
    for filter in filters:
        create_column_name, categotrical_options = filter
        if len(create_column_name) >= 0:
            new_col_name += f"_by_{create_column_name}"
        if len(categotrical_options) > 0:
            new_col_name += f"_({'_'.join(categotrical_options)})"
        new_col_name += f"_and"
    new_col_name = new_col_name[:-4] # remove last "_and_"
    return new_col_name


create_column_name("sum",[("cat_lv0",[]), ("cat_lv1", ["pasta", "penne"])],"importo")


'sum_of_importo_by_cat_lv0_and_by_cat_lv1_(pasta_penne)'

In [11]:
### VERSIONE SU SINGOLO UTENTE
def _selecting(
        self,
        df: DataFrame,
        filters: list[tuple[str, list[str]]],
        aggregation: dict[str, str],
    ) -> DataFrame:
        # Filtra le righe che soddisfano le condizioni desiderate
        filter_conditions = []
        for single_filter in filters:
            filter_column, filter_options = single_filter
            # controllare se filter_options non sia vuota
            filter_conditions.append(F.col(filter_column).isin(filter_options))

        # Combina tutti i filtri in una unica espressione logica utilizzando l'operatore logico AND
        # lambda lambda f1, f2: f1 & f2 -> funzione anonima che prende un due filtri e una condizione e ritorna l'unione dei due filtri secondo la condizione data
        # reduce(funzione, lista) -> applica la funzione a tutti gli elementi della lista e ritorna il risultato
        # Applica il filtro complessivo una volta sola
        filter_expression = reduce(lambda f1, f2: f1 & f2, filter_conditions)

        filtered_df = df.filter(filter_expression)
        return filtered_df

In [12]:
### VERSIONE FUNZIONANTE MA POCO EFFICIENTE

def _selecting(df: DataFrame, identifier_cols_name: List[str], filters: List[Tuple[str, List[str]]], aggregation: Dict[str, str]) -> DataFrame:
    # Applicare i filtri alle variabili categoriche
    # Raggruppa i dati in base all'ID utente
    
    unique_ids = data_df.select("id").distinct().rdd.flatMap(lambda x: x).collect()

    list_of_dataframes = []
    for uid in unique_ids:
        df= data_df.filter(data_df["id"] == uid)
        
        filter_conditions = []
        for single_filter in filters:
            filter_column, filter_options = single_filter
            # controllare se filter_options non sia vuota
            filter_conditions.append(F.col(filter_column).isin(filter_options))
    
        filter_expression = reduce(lambda f1, f2: f1 & f2, filter_conditions)
        filtered_df = df.filter(filter_expression)
        
        # Aggregare i dati per ottenere la somma su ogni finestra
        result_df = filtered_df.groupBy(*identifier_cols_name).agg(aggregation)
        

        # Rinominare le colonne risultanti usando la funzione di utilità
        numerical_col_name = list(aggregation.keys())[0]
        agg_func = list(aggregation.values())[0]
        new_col_name = create_column_name(agg_func, filters, numerical_col_name)
        spark_auto_col_name = agg_func + "(" + numerical_col_name + ")" # result_df.columns[-1]
        result_df = result_df.withColumnRenamed(spark_auto_col_name, new_col_name)

        list_of_dataframes.append(result_df)

    all_combined_result_df = reduce(lambda df1, df2: df1.union(df2), list_of_dataframes)
    return all_combined_result_df



# Chiamata al metodo _selecting
data_df.show()

result_df = _selecting(data_df, ["id", "bucket"], [("categorical_feature_1", ["spazzolini", "spezie"])], {"numerical_1": "sum"})
result_df.show()



+---+-------------------+-----------+-----------+---------------------+---------------------+--------------------+
| id|          timestamp|numerical_1|numerical_2|categorical_feature_1|categorical_feature_2|              bucket|
+---+-------------------+-----------+-----------+---------------------+---------------------+--------------------+
|  1|2023-06-01 00:00:00|          1|          2|                pasta|              bitcoin|{2023-05-31 00:00...|
|  1|2023-06-01 00:00:00|          2|          3|                pasta|                 cash|{2023-05-31 00:00...|
|  1|2023-06-01 00:00:00|          4|          1|               spezie|             bancomat|{2023-05-31 00:00...|
|  1|2023-06-01 00:00:00|          6|          1|           spazzolini|             bancomat|{2023-05-31 00:00...|
|  1|2023-06-02 00:00:00|          7|          6|                pasta|             bancomat|{2023-06-02 00:00...|
|  1|2023-06-02 00:00:00|          7|          6|               spezie|         

In [13]:
from pyspark.sql import DataFrame, functions as F

### VERSIONE FUNZIONANTE PIU EFFICIENTE


def _selecting(df: DataFrame, identifier_cols_name: List[str], filters: List[Tuple[str, List[str]]], aggregation: Dict[str, str]) -> DataFrame:
    # Applicare i filtri alle variabili categoriche
    filter_conditions = []
    for single_filter in filters:
        filter_column, filter_options = single_filter
        filter_conditions.append(F.col(filter_column).isin(filter_options))
    
    filter_expression = reduce(lambda f1, f2: f1 & f2, filter_conditions)
    filtered_df = df.filter(filter_expression)

    # Aggregare i dati per ottenere la somma su ogni finestra
    result_df = filtered_df.groupBy(*identifier_cols_name).agg(aggregation)

    # Rinominare le colonne risultanti usando la funzione di utilità
    numerical_col_name = list(aggregation.keys())[0]
    agg_func = list(aggregation.values())[0]
    new_col_name = create_column_name(agg_func, filters, numerical_col_name)
    result_df = result_df.withColumnRenamed(agg_func + "(" + numerical_col_name + ")", new_col_name)

    return result_df

data_df.show(truncate=False)
result_df = _selecting(data_df, ["id", "bucket"], [("categorical_feature_1", ["spazzolini", "spezie"])], {"numerical_1": "sum", "numerical_2": "avg"})
result_df.show(truncate=False)

+---+-------------------+-----------+-----------+---------------------+---------------------+------------------------------------------+
|id |timestamp          |numerical_1|numerical_2|categorical_feature_1|categorical_feature_2|bucket                                    |
+---+-------------------+-----------+-----------+---------------------+---------------------+------------------------------------------+
|1  |2023-06-01 00:00:00|1          |2          |pasta                |bitcoin              |{2023-05-31 00:00:00, 2023-06-02 00:00:00}|
|1  |2023-06-01 00:00:00|2          |3          |pasta                |cash                 |{2023-05-31 00:00:00, 2023-06-02 00:00:00}|
|1  |2023-06-01 00:00:00|4          |1          |spezie               |bancomat             |{2023-05-31 00:00:00, 2023-06-02 00:00:00}|
|1  |2023-06-01 00:00:00|6          |1          |spazzolini           |bancomat             |{2023-05-31 00:00:00, 2023-06-02 00:00:00}|
|1  |2023-06-02 00:00:00|7          |6   

In [14]:
def _join_dataframes(df1: DataFrame, df2: DataFrame, join_columns: list[str]) -> DataFrame:
    return df1.join(df2, join_columns) 


In [15]:
# Algoritmo:
# ciclo su tutte le possibili aggregazioni
##### creo una colonna per ogni elemento della lista
##### se l'elemento della lista contiene solo una tuple allora la mando a fare pivot
######## il pivot lo fa sulla colonna messa come secondo elemento della tupla 
##### se come secondo elemento troviamo * allora non si mettono filtri nel pivot e lo fa su tutte
####### se l'elemento della lista contiene più di una tupla allora la mando a fare la select
####### tutte le opzioni di una stessa categorica vanno in or (prendi la somma di tutti i soldi spesi in cibo o in sigarette)
####### tutti gli elementi di categorie diverse vanno in and (prendi la somma di tutti i soldi spesi in (cibo or sigarette) and (pagari in contanti)
# faccio il join tra tutti i df che ho creato

def _aggregation_with_filter(grouped_df: DataFrame, all_aggregation_filters: list[list[tuple[str,list[str]]]], numerical_col_name: list[str],agg_funcs: list[str], identifier_cols_name: list[str]) -> DataFrame:
    extended_id_cols_name = identifier_cols_name + ["bucket"]
    
    all_aggregations = _all_aggregation_combination(numerical_col_name, agg_funcs)
    #grouped_df = _grouping(data_df, extended_id_cols_name)

    all_aggregated_df = []

    for aggregation in all_aggregations:
        for aggregation_filter in all_aggregation_filters:
            if len(aggregation_filter) > 1:
                print("in selectiong")
                filtered_df = _selecting(grouped_df, aggregation_filter, aggregation)
                all_aggregated_df.append(filtered_df)
            else: 
                print("in pivoting")
                pivoted_df = _pivoting(grouped_df, aggregation_filter, aggregation)
                all_aggregated_df.append(pivoted_df)
               
    
    df_final = reduce(lambda df1, df2: _join_dataframes(df1, df2, extended_id_cols_name), all_aggregated_df)

    return df_final



In [16]:
#pipeline
#configurazione di input:
''' 
numerical_col_name = ["numerical_1", "numerical_2"]
identifier_cols_name = ["id"]  # type: ignore
aggregation_filters =   [   
                            [ ("categorical_feature_1", ["pasta","spezie"])],
                            [ ("categorical_feature_1", ["pasta"]), ("categorical_feature_1",["spezie"]) ],
                            [("categorical_feature_1",["pasta","spezie"]), ("categorical_feature_2",["cash"]) ],
                ]
agg_funcs = ["sum", "avg"]  # type: ignore
'''

# bug noto: non mi rinomina le colonne che crea
numerical_col_name = ["numerical_1", "numerical_2"]
identifier_cols_name = ["id"]  # type: ignore
aggregation_filters: list[list[tuple[str, list[str]]]] =   [   
                            [("categorical_feature_1",[])],
                        ]
agg_funcs = ["sum"]  # type: ignore


aggregated_df = _aggregation_with_filter(data_df, aggregation_filters, numerical_col_name, agg_funcs, identifier_cols_name)
aggregated_df.show()

in pivoting
pivot senza filtri


AttributeError: 'DataFrame' object has no attribute 'pivot'

23/07/24 10:27:12 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [None]:
data_df.show()

+---+-------------------+-----------+-----------+---------------------+---------------------+--------------------+
| id|          timestamp|numerical_1|numerical_2|categorical_feature_1|categorical_feature_2|              window|
+---+-------------------+-----------+-----------+---------------------+---------------------+--------------------+
|  1|2023-06-01 00:00:00|          1|          2|                pasta|              bitcoin|{2023-05-31 00:00...|
|  1|2023-06-01 00:00:00|          2|          3|                pasta|                 cash|{2023-05-31 00:00...|
|  1|2023-06-01 00:00:00|          4|          1|               spezie|             bancomat|{2023-05-31 00:00...|
|  1|2023-06-01 00:00:00|          6|          1|           spazzolini|             bancomat|{2023-05-31 00:00...|
|  1|2023-06-02 00:00:00|          7|          6|                pasta|             bancomat|{2023-06-02 00:00...|
|  1|2023-06-06 00:00:00|          4|          2|                pasta|         

23/07/21 10:49:57 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
