In [1]:
from pyspark.sql.functions import col

from pyspark.sql import SparkSession
from ts_train.tr2ts.time_bucketing import TimeBucketing
from ts_train.tr2ts.aggregating import Aggregating, Aggregation, Filter, AndGroup
from ts_train.tr2ts.filling import Filling
import numpy as np 
from pyspark.sql import functions as F
import pandas as pd
from ts_train.ts2ft.feature_generating import FeatureGenerating
from ts_train.ts2ft.feature_pruning import FeaturePruning
# Codice per visualizzazione su notebook
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

path_to_data = "../../../dataset_offline/tr2ft/demo_v1/"
DATA_COLUMN_NAME = "DATA_TRANSAZIONE"

# Create a SparkSession
spark = SparkSession.builder \
    .appName("feature_generation") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/14 15:12:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Target prima del downampling
target totali: 100746     
Target a 1 = 99746     
Target a 0 = 1000  

In [2]:
targets_df = spark.read.parquet(path_to_data + "targets_df.parquet")
positive_target = targets_df.filter(targets_df.TARGET == 1)
negative_target = targets_df.filter(targets_df.TARGET == 0)

   

# Target attuali:
Target a 1 = 1000     
Target a 0 = 1000  

In [72]:
#downsampling fatto male alla veloce
positive_target = targets_df.filter(targets_df.TARGET == 1).limit(1000)
negative_target = targets_df.filter(targets_df.TARGET == 0).limit(1000)
filtered_target = positive_target.union(negative_target)
filtered_target.toPandas().to_parquet("target_pandas.parquet", index=False)


In [75]:
filtered_target

DataFrame[ID_CLIENTE_BIC: int, TARGET: int]

# Transazioni prima del downsampling
transazioni negativi: 1000 utenti con 99746 transazioni      
transazioni positivi: 99746 utenti con 39682110 transazioni  

In [41]:
negative_tr_df = spark.read.parquet(path_to_data + "negative_target/sample_transactions.parquet")
positive_tr_df = spark.read.parquet(path_to_data + "positive_target/100k/100k_user_transactions.parquet")

# Transazioni dopo del downsampling
transazioni negativi: 1000 utenti con 99746 transazioni      
transazioni positivi: 1000 utenti con 405907 transazioni  
totali: 1366311

In [42]:
lista_id_pos = positive_target.select("ID_CLIENTE_BIC").rdd.flatMap(lambda x: x).collect()
filtered_positive_tr_df = positive_tr_df.where(col("ID_CLIENTE_BIC").isin(lista_id_pos))
df_tr = filtered_positive_tr_df.union(negative_tr_df)

In [59]:
# save transactions
df_tr.write.format("parquet").save("filtered_transaction.parquet")

23/09/14 16:00:50 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/09/14 16:00:52 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/09/14 16:00:52 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
                                                                                

In [43]:
from ts_train.tr2ts.time_bucketing import TimeBucketing
from ts_train.tr2ts.aggregating import Aggregating, Aggregation, Filter, AndGroup, Pivot
from ts_train.tr2ts.filling import Filling
from pyspark.sql import functions as F
from ts_train.ts2ft.feature_generating import FeatureGenerating
from ts_train.ts2ft.feature_pruning import FeaturePruning
import pandas as pd

In [44]:
time_bucketing_step = TimeBucketing(
  time_col_name="DATA_TRANSAZIONE",
  time_bucket_size=1,
  time_bucket_granularity="week",
)

time_bucketed_df = time_bucketing_step(df_tr, spark)

                                                                                

In [45]:
aggregating_step = Aggregating(
  identifier_cols_name=["ID_CLIENTE_BIC"],
  time_bucket_cols_name=["bucket_start", "bucket_end"],
  aggregations=[       
    Aggregation(
      numerical_col_name="IMPORTO",
      agg_function="sum",
      filters=[Filter("SEGNO", "=", "-")],
      pivot=Pivot("CATEGORY_LIV0","in",[
        'altre_spese',
        'tasse',
        'investimenti_patrimonio',
        'scambio_soldi_tra_privati',
      ]),
      new_col_name="somma_uscite_PIVOTVALUE",
    ),
    Aggregation(
      numerical_col_name="IMPORTO",
      agg_function="sum",
      filters=[Filter("SEGNO", "=", "+")],
      pivot=Pivot("CATEGORY_LIV0","in",[
        'entrate_occasionali',
        'entrate_regolari',
        'investimenti_patrimonio',
        'scambio_soldi_tra_privati',
      ]),
      new_col_name="somma_entrate_PIVOTVALUE",
    ),
  ],
)

aggregated_df = aggregating_step(time_bucketed_df, spark)

                                                                                

In [46]:
filling_step = Filling(
  identifier_cols_name=["ID_CLIENTE_BIC"],
  time_bucket_step=time_bucketing_step
)

filled_df = filling_step(df=aggregated_df, spark=spark)

                                                                                

In [47]:
filled_df.write.format("parquet").save("filtered_timeseries.parquet")

                                                                                

In [None]:

from ts_train.ts2ft.feature_generating import FeatureGenerating

feature_generating_step = FeatureGenerating(
  identifier_col_name="ID_CLIENTE_BIC",
  time_col_name = "bucket_start",
  feature_calculators= [
    'minimum',
    'c3', 
    'last_location_of_maximum',
    'last_location_of_minimum',
    'longest_strike_below_mean',
    'median',
    'variance', 
    'kurtosis', 
    'number_peaks', 
    'linear_trend',
    'ar_coefficient',
  ]
)

# drop bucket end 
timeseries_df = filled_df.drop("bucket_end")

features_generated_df = feature_generating_step(timeseries_df)


In [None]:
features_generated_df = features_generated_df.join(targets_df, on="ID_CLIENTE_BIC", how="inner")
time_series_df = features_generated_df.toPandas()

# drop target columns from time_series_df
targets = pd.Series(time_series_df["TARGET"].values)
pandas_feats_df = time_series_df.drop(["TARGET"], axis=1)

# Feature selection

In [51]:
feature_pruning_step = FeaturePruning(
  identifier_col_name="ID_CLIENTE_BIC"
)

pruned_df, relevance_table = feature_pruning_step(pandas_feats_df, targets)

In [52]:
dropped_features = set(time_series_df.columns) - set(pruned_df.columns)
# original features
print(f"Total generated features: {len(time_series_df.columns)}")
print(f"Dropped {len(dropped_features)} features: {dropped_features}")
print(f"Total final features: {len(pruned_df.columns)}")

Total generated features: 130
Dropped 51 features: {'somma_entrate_entrate_regolari__c3__lag_2', 'somma_uscite_tasse__c3__lag_3', 'TARGET', 'somma_uscite_tasse__number_peaks__n_10', 'somma_uscite_investimenti_patrimonio__last_location_of_minimum', 'somma_uscite_tasse__ar_coefficient__coeff_10__k_10', 'somma_entrate_entrate_regolari__c3__lag_3', 'somma_entrate_investimenti_patrimonio__last_location_of_minimum', 'somma_entrate_entrate_regolari__median', 'somma_entrate_entrate_occasionali__number_peaks__n_50', 'somma_uscite_tasse__c3__lag_2', 'somma_entrate_investimenti_patrimonio__c3__lag_3', 'somma_uscite_investimenti_patrimonio__c3__lag_2', 'somma_uscite_scambio_soldi_tra_privati__number_peaks__n_50', 'somma_uscite_tasse__number_peaks__n_50', 'somma_entrate_entrate_regolari__linear_trend__attr_"rvalue"', 'somma_uscite_tasse__last_location_of_maximum', 'somma_uscite_investimenti_patrimonio__number_peaks__n_50', 'somma_uscite_altre_spese__minimum', 'somma_uscite_investimenti_patrimonio__

In [58]:
pruned_df.to_parquet("filtered_features.parquet", index=False)  # Set index=False to exclude the DataFrame index

In [69]:
targets_df_pandas = targets_df.toPandas()
