# Import di tutte le librerie necessarie, inclusa la nostra libreria di supporto

In [108]:
# Nostre librerie
from ts_train.step.time_bucketing import TimeBucketing
from ts_train.step.aggregating import Aggregating, Aggregation, Filter, AndGroup, OrGroup, Pivot
from ts_train.step.filling import Filling

# Librerie terze
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import count
from pyspark.sql.functions import col, lit
from tsfresh import defaults
from tsfresh.feature_selection.relevance import calculate_relevance_table
from tsfresh.utilities.dataframe_functions import check_for_nans_in_columns


from pyspark.sql import SparkSession

# Codice per visualizzazione su notebook
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [5]:
# Create a SparkSession
spark = SparkSession.builder \
    .appName("feature_generation") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/28 15:06:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [64]:
original_data_df = spark.read.parquet("original_transactions.parquet") 
original_data_df.show(truncate=False)
original_data_df.printSchema()

+---------------+--------------+----------------+-------------------+-------+-----+--------------+--------+--------------+---------------+------+------+-------------------------+-------------------------------+-------------+-----+------+----------------+
|ARCA_TIPO_CARTA|DATA_CONTABILE|DATA_TRANSAZIONE|ORA_TRANSAZIONE    |IMPORTO|SEGNO|ID_CLIENTE_BIC|IS_CARTA|TIPO_CANALE   |TIPO_CANALE_AGG|IS_BON|IS_SDD|CATEGORY_LIV0            |CATEGORY_LIV1                  |CATEGORY_LIV2|IS_CC|IS_LIB|MERCHANT        |
+---------------+--------------+----------------+-------------------+-------+-----+--------------+--------+--------------+---------------+------+------+-------------------------+-------------------------------+-------------+-----+------+----------------+
|EVOLUTION      |2021-11-23    |2021-11-20      |2023-08-28 15:33:52|32.25  |-    |1302320901    |true    |FISICO_ESTERNO|FISICO_ESTERNO |false |false |utenze                   |telefono_internet              |null         |false|false

### Metodi secondari utili solo per questa demo

In [14]:
prelievi_aggregated_df = spark.read.parquet("timeseries.parquet") 
prelievi_aggregated_df.show()
prelievi_aggregated_df.printSchema()

+--------------+-------------------+-------------------+-----------------+---------------------------------------------------------+------------------+
|ID_CLIENTE_BIC|       bucket_start|         bucket_end|prelievo_contante|IMPORTO_spese_regolari&IS_CARTA=True&SEGNO=-&tra_20_e_100|conteggio_prelievo|
+--------------+-------------------+-------------------+-----------------+---------------------------------------------------------+------------------+
|      43296171|2022-08-15 00:00:00|2022-08-15 23:59:59|              0.0|                                                      0.0|               0.0|
|      43296171|2022-08-16 00:00:00|2022-08-16 23:59:59|              0.0|                                                      0.0|               0.0|
|      43296171|2022-08-17 00:00:00|2022-08-17 23:59:59|            200.0|                                                      0.0|             200.0|
|      43296171|2022-08-18 00:00:00|2022-08-18 23:59:59|              0.0|              

# Inizio notebook ts-fresh

In [109]:
import tsfresh
from tsfresh.convenience.bindings import spark_feature_extraction_on_chunk
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters, MinimalFCParameters, EfficientFCParameters
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType
from tsfresh.feature_extraction import ComprehensiveFCParameters, MinimalFCParameters, EfficientFCParameters
from tsfresh import extract_features, select_features
import pandas as pd
from tsfresh import defaults
from tsfresh.feature_selection.relevance import calculate_relevance_table
from tsfresh.utilities.dataframe_functions import check_for_nans_in_columns


FEATURE_EXTRACTION_LEVEL = MinimalFCParameters

In [136]:
# Parametri 
identifier_col_name = "ID_CLIENTE_BIC"
time_col_name = "bucket_start"

# Feature che vogliamo generare
feature_calculators = [
  'longest_strike_below_mean'
  'benford_correlation',
  'c3',
  'cid_ce',
  'kurtosis',
  'large_standard_deviation',
  'lempel_ziv_complexity',
  'percentage_of_reoccurring_datapoints_to_all_datapoints',
  'range_count',
  'ratio_value_number_to_time_series_length',
  'sample_entropy',
  'sum_of_reoccurring_values',
  'symmetry_looking',
  'variance_larger_than_standard_deviation',
  'variation_coefficient'
  'count_below_mean',
  'sum_values',
  'standard_deviation',
  'number_peaks',
  'longest_strike_below_mean',
  'abs_energy',
  'absolute_sum_of_changes',
  'agg_autocorrelation',
  #'agg_linear_trend',
  #'augmented_dickey_fuller',
  'kurtosis',
  #'large_standard_deviation',
  #'mean_second_derivative_central',
  #'query_similarity_count',
  #'cid_ce',
  #'count_above'
]

In [137]:
# load y_target from y_target.pkl
with open('y_target.pkl', 'rb') as f:
    y_target = pickle.load(f)
    y_target = y_target.squeeze()
    

In [138]:
def stack_df(df, identifier_col_name, time_col_name):
    # drop bucket_end column
    df = df.drop("bucket_end")

    # colonne da usare come chiavi primarie nella nuova tabella
    new_primary_keys = [identifier_col_name] +[time_col_name]
    all_cols = df.columns

    # colonne a cui applicare la pivot
    not_idx_cols = list(set(all_cols) - set(new_primary_keys))
    
    stacked_df = df.unpivot(new_primary_keys,not_idx_cols,variableColumnName="kind", valueColumnName="value")
    
    # cast di valori numerici in double per evitare il lancio di eccezioni
    stacked_df = stacked_df.withColumn("value",stacked_df.value.cast(DoubleType()))

    return stacked_df

#def get_features_setting(feature_calculators: List[str]) -> PickableSettings:
def get_features_setting(feature_calculators):
    default_settings = ComprehensiveFCParameters()
    new_settings = {}
    for feature_calculator_name, feature_calculator_settings in default_settings.items():
        if feature_calculator_name in feature_calculators:
            new_settings[feature_calculator_name] = feature_calculator_settings
  
    return new_settings

def generate_features(stacked_df,feature_calculators,identifier_col_name,time_col_name):
    default_fc_parameters = get_features_setting(feature_calculators)
    grouped_stacked_df = stacked_df.groupby(identifier_col_name, "kind")
    features_df = spark_feature_extraction_on_chunk(grouped_stacked_df, 
                                                    column_id=identifier_col_name,
                                                    column_kind="kind",
                                                    column_sort=time_col_name,
                                                    column_value="value",
                                                    default_fc_parameters=default_fc_parameters
    )
    pivoted_features = features_df.groupby(identifier_col_name).pivot("variable")
    feature_table_df = pivoted_features.agg(F.first("value"))
    return feature_table_df

def _drop_null_columns(df):
    """
    This function drops all columns which contain null values.
    :param df: A PySpark DataFrame
    """    
    null_counts = df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df.columns]).collect()[0].asDict()
    to_drop = [k for k, v in null_counts.items() if v > 0]
    df = df.drop(*to_drop)
    return df

def sanitize_null_features(feature_table_df):
    # renaming colonne per sostituire i "." 
    new_cols = [F.col(f"`{c}`").alias(c.replace(".", "dot")) for c in feature_table_df.columns]
    feature_table = feature_table_df.select(new_cols)

    # Drop all null features
    sanitized_feature_table_df = _drop_null_columns(feature_table)
    return sanitized_feature_table_df


def calculate_relevance_feature(x_data, y_target):
    


    #assert isinstance(x_data, pd.DataFrame), "Please pass features in X as pandas.DataFrame."
    check_for_nans_in_columns(x_data)
    assert isinstance(y_target, (pd.Series, np.ndarray)), (
        "The type of target vector y must be one of: " "pandas.Series, numpy.ndarray"
    )
    assert len(y_target) > 1, "y must contain at least two samples."
    assert len(x_data) == len(y_target), "X and y must contain the same number of samples."
    assert (
        len(set(y_target)) > 1
    ), "Feature selection is only possible if more than 1 label/class is provided"

    if isinstance(y_target, pd.Series) and set(x_data.index) != set(y_target.index):
        raise ValueError("Index of X and y must be identical if provided")

    if isinstance(y_target, np.ndarray):
        y_target = pd.Series(y_target, index=x_data.index)

    relevance_table = calculate_relevance_table(
        x_data, 
        y_target
    )

    return relevance_table

def select_features(features_panda_df, relevance_table_pandas_df):
    relevant_features = relevance_table[relevance_table.relevant].feature
    return features_panda_df.loc[:, relevant_features]

In [139]:
stacked_df = stack_df(prelievi_aggregated_df, identifier_col_name, time_col_name)
feature_table_df = generate_features(stacked_df,feature_calculators, identifier_col_name,time_col_name)
sanitized_feature_table_df = sanitize_null_features(feature_table_df)



features_panda_df = sanitized_feature_table_df.toPandas()
relevance_table = calculate_relevance_feature(features_panda_df, y_target)
selected_features = select_features(features_panda_df, relevance_table)

  return -np.log(A / B)
  return -np.log(A / B)
  return -np.log(A / B)
  return -np.log(A / B)
  return -np.log(A / B)
  return -np.log(A / B)
  return -np.log(A / B)
  return -np.log(A / B)
  return -np.log(A / B)
                                                                                

In [140]:
relevance_table

Unnamed: 0_level_0,feature,type,p_value,relevant
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
prelievo_contante__longest_strike_below_mean,prelievo_contante__longest_strike_below_mean,real,7.919862e-137,True
prelievo_contante__symmetry_looking__r_0dot45,prelievo_contante__symmetry_looking__r_0dot45,binary,4.024203e-40,True
prelievo_contante__symmetry_looking__r_0dot55,prelievo_contante__symmetry_looking__r_0dot55,binary,4.024203e-40,True
prelievo_contante__symmetry_looking__r_0dot5,prelievo_contante__symmetry_looking__r_0dot5,binary,4.024203e-40,True
prelievo_contante__symmetry_looking__r_0dot65,prelievo_contante__symmetry_looking__r_0dot65,binary,4.024203e-40,True
...,...,...,...,...
prelievo_contante__large_standard_deviation__r_0dot8500000000000001,prelievo_contante__large_standard_deviation__r...,constant,,False
prelievo_contante__large_standard_deviation__r_0dot9,prelievo_contante__large_standard_deviation__r...,constant,,False
prelievo_contante__large_standard_deviation__r_0dot9500000000000001,prelievo_contante__large_standard_deviation__r...,constant,,False
prelievo_contante__range_count__max_0__min_-1000000000000dot0,prelievo_contante__range_count__max_0__min_-10...,constant,,False


In [156]:
print(f"NUm features PRIMA del pruning: {len(features_panda_df.columns)}")

remaining_features = relevance_table[relevance_table["relevant"] == True].set_index('feature')

print(f"NUm features DOPO del pruning: {len(remaining_features)}")
remaining_features.head(100)


NUm features PRIMA del pruning: 208
NUm features DOPO del pruning: 87


Unnamed: 0_level_0,type,p_value,relevant
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
prelievo_contante__longest_strike_below_mean,real,7.919862e-137,True
prelievo_contante__symmetry_looking__r_0dot45,binary,4.024203e-40,True
prelievo_contante__symmetry_looking__r_0dot55,binary,4.024203e-40,True
prelievo_contante__symmetry_looking__r_0dot5,binary,4.024203e-40,True
prelievo_contante__symmetry_looking__r_0dot65,binary,4.024203e-40,True
...,...,...,...
conteggio_prelievo__range_count__max_1000000000000dot0__min_0,real,4.309084e-03,True
IMPORTO_spese_regolari&IS_CARTA=True&SEGNO=-&tra_20_e_100__range_count__max_1000000000000dot0__min_0,real,4.309084e-03,True
prelievo_contante__range_count__max_1000000000000dot0__min_0,real,4.309084e-03,True
prelievo_contante__abs_energy,real,4.333242e-03,True


In [81]:
# Creazioen target
''' 
import numpy as np 
import pickle

y_target = np.array(sanitized_feature_table_df.select("prelievo_contante__longest_strike_below_mean").collect())
# dump y_target to file 
with open('y_target.pkl', 'wb') as f:
    pickle.dump(y_target, f)
'''

                                                                                