In [1]:
from typing import Tuple, Optional

from xgboost.spark import SparkXGBClassifier
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as F
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder, CrossValidator
from pyspark.ml.feature import VectorAssembler

# Dataset reading and preparation 

In [2]:
spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/27 15:34:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
path_to_data = "../../../dataset_offline/ft2model/"

targets_df = spark.read.parquet(path_to_data + "target_pandas.parquet")
df = spark.read.parquet(path_to_data + "filtered_features.parquet").cache()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/Users/vaccaro/anaconda3/envs/transactionsanalysis/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/vaccaro/anaconda3/envs/transactionsanalysis/lib/python3.11/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/vaccaro/anaconda3/envs/transactionsanalysis/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

23/09/27 15:34:32 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [None]:
df = df.dropDuplicates(subset=['ID_CLIENTE_BIC'])
df = df.join(targets_df, on="ID_CLIENTE_BIC", how="left")

df.show()

+--------------+-------------------------------------------------+----------------------------------------------------------+----------------------------------------------------------+----------------------------------------------------------+-----------------------------------------------------------+-------------------------------------------------+--------------------------------------------------+-------------------------------------------------+----------------------------------------+-----------------------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+-------------------------------------------+-------------------------------------------------+----------------------------------------------------+--------------------------------------------+----------------------------------------------------+-----------------------------------------

In [None]:
df = df.select("somma_entrate_entrate_regolari__number_peaks__n_3", "somma_entrate_scambio_soldi_tra_privati__number_peaks__n_10", "ID_CLIENTE_BIC", "TARGET")
# df = df.select("somma_entrate_scambio_soldi_tra_privati__number_peaks__n_10", "ID_CLIENTE_BIC", "TARGET")

In [None]:
# Un-comment for multiclass problem

# df = df.withColumn(
#     "TARGET",
#     F.when(
#         F.col("ID_CLIENTE_BIC") < 40000000,
#         2
#     ).otherwise(F.col("TARGET"))
# )

In [None]:
def get_features_cols_name(df: DataFrame, excluded_cols_name: list[str]) -> list[str]:
    return [
        col_name for col_name in df.columns if col_name not in excluded_cols_name
    ]

In [None]:
features_cols_name = get_features_cols_name(df, excluded_cols_name=["ID_CLIENTE_BIC", "TARGET"])
len(features_cols_name)

2

# Dataset preparation

## Splitting Dataset

In [None]:
def train_test_split(df: DataFrame, test_size=0.3):
    return df.randomSplit(weights=[1-test_size, test_size])

In [None]:
train_df, test_df = train_test_split(df)

## Categorical encoding

In [None]:
from pyspark.ml.feature import StringIndexer, StringIndexerModel, OneHotEncoder, OneHotEncoderModel
from pyspark.sql.types import StringType
from typing import Union


def _cols_name_encode(cols_name: list[str]) -> list[str]:
    return [col_name + "_encoded" for col_name in cols_name]

def _cols_replace(df, temp_cols_name, cols_name):
    df = df.drop(*cols_name)
    for temp_col_name, col_name in zip(temp_cols_name, cols_name):
        df = df.withColumnRenamed(temp_col_name, col_name)
    
    return df

def _compute_cols_encoding(df: DataFrame, cols_name: list[str], order: str = "ascending") -> dict[str, list[Union[str, int, float]]]:
    if order == "ascending":
        ascending = True
    elif order == "descending":
        ascending = False
    else:
        raise ValueError('order param should be "ascending" or "descending"')
    
    cols_encoding = {}
    for col_name in cols_name:
        categ_options_row = df.select(col_name).distinct().orderBy(col_name, ascending=ascending).collect()
        for categ_option_row in categ_options_row:
            categ_option = categ_option_row[col_name]
            if col_name not in cols_encoding:
                cols_encoding[col_name] = [categ_option]
            else:
                cols_encoding[col_name].append(categ_option)

    return cols_encoding

### Label Encoding

In [None]:
def label_encode(df, cols_name: list[str], string_indexer_model: Optional[StringIndexerModel] = None, order: str = "ascending") -> Tuple[DataFrame, StringIndexerModel, dict[str, list[Union[str, int, float]]]]:
    cols_encoding = _compute_cols_encoding(df=df, cols_name=cols_name, order="ascending")
    temp_cols_name = _cols_name_encode(cols_name)

    if not string_indexer_model:
        # frequencyDesc, frequencyAsc, alphabetDesc, alphabetAsc
        if order == "ascending":
            stringOrderType = "frequencyAsc"
        elif order == "descending":
            stringOrderType = "frequencyDesc"
        else:
            raise ValueError('order param should be "ascending" or "descending"')
        
        string_indexer_model = StringIndexer(inputCols=cols_name, outputCols=temp_cols_name, stringOrderType=stringOrderType).fit(df)

    df = string_indexer_model.transform(df)
    df = _cols_replace(df, temp_cols_name, cols_name)

    return df, string_indexer_model, cols_encoding

### One-Hot-Encoding

In [None]:
def one_hot_encode(
        df: DataFrame, 
        cols_name: list[str],
        one_hot_encoder_model: Optional[OneHotEncoderModel] = None,
        string_indexer_model: Optional[StringIndexerModel] = None,
    ) -> Tuple[DataFrame, OneHotEncoderModel, Optional[StringIndexerModel], dict[str, list[Union[str, int, float]]]]:

    df, string_indexer_model, cols_encoding = label_encode(df, cols_name, string_indexer_model)

    temp_cols_name = _cols_name_encode(cols_name)

    if not one_hot_encoder_model:
        one_hot_encoder_model = OneHotEncoder(inputCols=cols_name, outputCols=temp_cols_name).fit(df)

    df = one_hot_encoder_model.transform(df)
    df = _cols_replace(df, temp_cols_name, cols_name)

    return df, one_hot_encoder_model, string_indexer_model, cols_encoding

### Example

In [None]:
# encoding_df = spark.createDataFrame(data=df.select("somma_uscite_altre_spese__number_peaks__n_5").take(5)).withColumnRenamed("somma_uscite_altre_spese__number_peaks__n_5", "categ_feature")
encoding_expample_df = spark.createDataFrame([("a",), ("b",), ("a",), ("c",)], ["categ_feature"])
encoding_expample_df.show()

+-------------+
|categ_feature|
+-------------+
|            a|
|            b|
|            a|
|            c|
+-------------+



In [None]:
test = spark.createDataFrame([(1,), (1,), (6,), (3,)], ["categ_feature"])
one_hot_encoder_model = OneHotEncoder(inputCols=["categ_feature"], outputCols=["temp_cols_name_encoded"]).fit(test)
one_hot_encoder_model.transform(test).show()

+-------------+----------------------+
|categ_feature|temp_cols_name_encoded|
+-------------+----------------------+
|            1|         (6,[1],[1.0])|
|            1|         (6,[1],[1.0])|
|            6|             (6,[],[])|
|            3|         (6,[3],[1.0])|
+-------------+----------------------+



In [None]:
test = spark.createDataFrame([(2,), (0,), (6,), (4,)], ["categ_feature"])
test_result, _, _, labels = one_hot_encode(test, cols_name=["categ_feature"])
test.show()
test_result.show()

+-------------+
|categ_feature|
+-------------+
|            2|
|            0|
|            6|
|            4|
+-------------+

+-------------+
|categ_feature|
+-------------+
|(3,[1],[1.0])|
|(3,[0],[1.0])|
|    (3,[],[])|
|(3,[2],[1.0])|
+-------------+



In [None]:
test = spark.createDataFrame([("a",), ("c",), ("d",)], ["categ_feature"])
test_result, _, _, labels = one_hot_encode(test, cols_name=["categ_feature"])
test.show()
test_result.show()

+-------------+
|categ_feature|
+-------------+
|            a|
|            c|
|            d|
+-------------+

+-------------+
|categ_feature|
+-------------+
|(2,[0],[1.0])|
|(2,[1],[1.0])|
|    (2,[],[])|
+-------------+



## Vector assembling

In [None]:
from pyspark.ml.linalg import DenseVector, SparseVector

def vector_assemble(
    df: DataFrame, features_cols_name: list[str], output_col_name="features", cols_encoding: Optional[dict[str, list[Union[str, int, float]]]] = None
) -> Tuple[DataFrame, list[str]]:
    vectorAssembler = VectorAssembler(
        inputCols=features_cols_name,
        outputCol=output_col_name,
        handleInvalid="keep",
    )

    new_features_cols_name = []
    for feature_col_name in features_cols_name:
        feature_first_element = df.select(feature_col_name).first()[feature_col_name] # type: ignore
        if isinstance(feature_first_element, (DenseVector, SparseVector)):
            vector_len = len(feature_first_element) + 1
            for idx in range(0, vector_len):
                if cols_encoding is not None and feature_col_name in cols_encoding and vector_len == len(cols_encoding[feature_col_name]):
                    new_features_cols_name.append(f"{feature_col_name}_{cols_encoding[feature_col_name][idx] }")
                else:
                    new_features_cols_name.append(f"{feature_col_name}_{idx}")
        else:
            new_features_cols_name.append(feature_col_name)

    df = vectorAssembler.transform(df)

    return df, new_features_cols_name

In [None]:
train_df, _, _, cols_encoding = one_hot_encode(train_df, cols_name=["somma_entrate_scambio_soldi_tra_privati__number_peaks__n_10"])
test_df, _, _, cols_encoding = one_hot_encode(test_df, cols_name=["somma_entrate_scambio_soldi_tra_privati__number_peaks__n_10"])

In [None]:
preprocessed_train_df, cols_encoding = vector_assemble(train_df, features_cols_name=features_cols_name, cols_encoding=cols_encoding)
preprocessed_test_df, _ = vector_assemble(test_df, features_cols_name=features_cols_name)

In [None]:
cols_encoding

['somma_entrate_entrate_regolari__number_peaks__n_3',
 'somma_entrate_scambio_soldi_tra_privati__number_peaks__n_10_0.0',
 'somma_entrate_scambio_soldi_tra_privati__number_peaks__n_10_1.0',
 'somma_entrate_scambio_soldi_tra_privati__number_peaks__n_10_2.0',
 'somma_entrate_scambio_soldi_tra_privati__number_peaks__n_10_3.0',
 'somma_entrate_scambio_soldi_tra_privati__number_peaks__n_10_4.0',
 'somma_entrate_scambio_soldi_tra_privati__number_peaks__n_10_5.0',
 'somma_entrate_scambio_soldi_tra_privati__number_peaks__n_10_6.0']

# Model training and predicting

In [None]:
from pyspark.ml import Transformer
from xgboost.spark.core import _SparkXGBEstimator

def get_estimator(features_col_name: str = "features", label_col_name: str = "label"):
    return SparkXGBClassifier(features_col=features_col_name, label_col=label_col_name, enable_sparse_data_optim=True, missing=0.0)

def train(df: DataFrame, estimator: _SparkXGBEstimator) -> Transformer:
    return estimator.fit(df)

def predict(df: DataFrame, model: Transformer) -> DataFrame:
    return model.transform(df)

In [None]:
estimator = get_estimator(label_col_name="TARGET")
model = train(preprocessed_train_df, estimator)

result_df = predict(preprocessed_test_df, model)
result_df.show()

2023-09-27 11:51:59,876 INFO XGBoost-PySpark: _fit Running xgboost-2.0.0 on 1 workers with
	booster params: {'objective': 'binary:logistic', 'device': 'cpu', 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': 0.0}
23/09/27 11:52:00 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
[11:52:02] task 0 got new rank 0                                    (0 + 1) / 1]
  if is_sparse(data):
2023-09-27 11:52:03,443 INFO XGBoost-PySpark: _fit Finished xgboost training!   


+-------------------------------------------------+--------------+------+-----------------------------------------------------------+-------------+--------------------+----------+--------------------+
|somma_entrate_entrate_regolari__number_peaks__n_3|ID_CLIENTE_BIC|TARGET|somma_entrate_scambio_soldi_tra_privati__number_peaks__n_10|     features|       rawPrediction|prediction|         probability|
+-------------------------------------------------+--------------+------+-----------------------------------------------------------+-------------+--------------------+----------+--------------------+
|                                              0.0|      36542722|     0|                                                  (6,[],[])|    (7,[],[])|[1.61482894420623...|       0.0|[0.83408075571060...|
|                                              0.0|      47986971|     0|                                                  (6,[],[])|    (7,[],[])|[1.61482894420623...|       0.0|[0.83408075571060

INFO:XGBoost-PySpark:Do the inference on the CPUs


# Evalutation

In [None]:
from pyspark.ml.evaluation import Evaluator, BinaryClassificationEvaluator, MulticlassClassificationEvaluator

def get_evaluator(metric: str, prediction_col_name: str = "prediction", rawPredictionCol: str = "rawPrediction", label_col_name: str = "label") -> Evaluator:
    if metric in [
        "f1", 
        "accuracy", 
        "weightedPrecision", 
        "weightedRecall", 
        "weightedTruePositiveRate", 
        "weightedFalsePositiveRate", 
        "weightedFMeasure", 
        "truePositiveRateByLabel", 
        "falsePositiveRateByLabel", 
        "precisionByLabel", 
        "recallByLabel", 
        "fMeasureByLabel", 
        "logLoss", 
        "hammingLoss"]:
        return MulticlassClassificationEvaluator(metricName=metric, predictionCol=prediction_col_name, labelCol=label_col_name) # type: ignore
    elif metric in ["areaUnderROC", "areaUnderPR"]:
        return BinaryClassificationEvaluator(metricName=metric, rawPredictionCol=rawPredictionCol, labelCol=label_col_name) # type: ignore
    else:
        raise ValueError("Metric not supported")

In [None]:
evaluator = get_evaluator(metric="f1", label_col_name="TARGET")

## Without Cross Validation

In [None]:
def evaluate(df: DataFrame, evaluator: Evaluator) -> float:
    return evaluator.evaluate(df)

evaluate(result_df, evaluator)

2023-09-26 10:19:13,678 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
                                                                                

0.8033125254107083

## With Cross Validation

In [None]:
# from pyspark.ml.tuning import CrossValidator

# def evaluate_cv(df: DataFrame, estimator: _SparkXGBEstimator, evaluator: Evaluator, num_folds: float = 3) -> float:
#     cv = CrossValidator(estimator=estimator, estimatorParamMaps=[None], evaluator=evaluator, parallelism=1, numFolds=num_folds) # type: ignore
#     model_cv = cv.fit(df)

#     return model_cv.avgMetrics[0]

# evaluate_cv(preprocessed_train_df, estimator, evaluator)

# Features importance

In [None]:
def get_features_importance(
    model: Transformer, cols_encoding: list[str]
) -> dict[str, float]:
    features_ids_and_importances = model.get_feature_importances(importance_type="weight") # type: ignore

    # if len(features_ids_and_importances) > len(features_cols_name):
    #     raise ValueError("features_cols_name has not the right number of elements")
    
    features_names_and_importances = {}
    for idx, feature_col_name in enumerate(cols_encoding):
        feature_id = f"f{idx}"
        if feature_id in features_ids_and_importances:
            features_names_and_importances[feature_col_name] = features_ids_and_importances[feature_id]
        else:
            features_names_and_importances[feature_col_name] = 0.0


    return features_names_and_importances

In [None]:
ft_imp = get_features_importance(model, cols_encoding)
ft_imp

{'somma_entrate_entrate_regolari__number_peaks__n_3': 922.0,
 'somma_entrate_scambio_soldi_tra_privati__number_peaks__n_10_0.0': 0.0,
 'somma_entrate_scambio_soldi_tra_privati__number_peaks__n_10_1.0': 82.0,
 'somma_entrate_scambio_soldi_tra_privati__number_peaks__n_10_2.0': 98.0,
 'somma_entrate_scambio_soldi_tra_privati__number_peaks__n_10_3.0': 88.0,
 'somma_entrate_scambio_soldi_tra_privati__number_peaks__n_10_4.0': 94.0,
 'somma_entrate_scambio_soldi_tra_privati__number_peaks__n_10_5.0': 124.0,
 'somma_entrate_scambio_soldi_tra_privati__number_peaks__n_10_6.0': 0.0}

# Tuning Parametri

https://spark.apache.org/docs/latest/ml-tuning.html#cross-validation

In [None]:
from typing import Union, Tuple
from pyspark.ml import Model

ParamValue = Union[str, int, float]
Params = dict[str, Union[list[ParamValue], ParamValue]]
BestParams = dict[str, ParamValue]

def tune_parameters(df: DataFrame, params: Params, estimator: _SparkXGBEstimator, evaluator: Evaluator, num_folds: int = 3) -> Tuple[BestParams, Model]:
    grid = ParamGridBuilder()
    for param_name, param_value in params.items():
        if isinstance(param_value, list):
            grid = grid.addGrid(estimator.getParam(param_name), param_value)
        else:
            grid = grid.baseOn((estimator.getParam(param_name), param_value))
    grid = grid.build()

    print(grid)
    
    cv = CrossValidator(estimator=estimator, estimatorParamMaps=grid, evaluator=evaluator, parallelism=1, numFolds=num_folds)
    cv_model = cv.fit(df)
    best_model = cv_model.bestModel

    best_params = {}
    for param_name in params.keys():
        best_params[param_name] = best_model.getOrDefault(param=best_model.getParam(paramName=param_name))

    return best_params, best_model

tune_parameters(df=preprocessed_train_df, params={"max_depth": [2, 3]}, estimator=estimator, evaluator=evaluator)

[{Param(parent='SparkXGBClassifier_ab015fde49d7', name='max_depth', doc='Refer to XGBoost doc of xgboost.sklearn.XGBClassifier for this param max_depth'): 2}, {Param(parent='SparkXGBClassifier_ab015fde49d7', name='max_depth', doc='Refer to XGBoost doc of xgboost.sklearn.XGBClassifier for this param max_depth'): 3}]


2023-09-20 18:25:19,858 INFO XGBoost-PySpark: _fit Running xgboost-2.0.0 on 1 workers with
	booster params: {'device': 'cpu', 'max_depth': 2, 'objective': 'multi:softprob', 'num_class': 3, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': 0.0}
[18:25:21] task 0 got new rank 0                                    (0 + 1) / 1]
2023-09-20 18:25:23,174 INFO XGBoost-PySpark: _fit Finished xgboost training!   
INFO:XGBoost-PySpark:Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs                 (3 + 8) / 200]
INFO:XGBoost-PySpark:Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs                 (5 + 8) / 200]
INFO:XGBoost-PySpark:Do the inference on the CPUs                 (6 + 8) / 200]
INFO:XGBoost-PySpark:Do the inference on the CPUs
INFO:XGBoost-PySpark

{'max_depth': 3}

23/09/20 20:33:54 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1018982 ms exceeds timeout 120000 ms
23/09/20 20:33:54 WARN SparkContext: Killing executors is not supported by current scheduler.
23/09/20 20:34:02 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:322)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:117)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:116)
	at org.apache.spark.storage.