In [1]:
from typing import Tuple, Optional

from xgboost.spark import SparkXGBClassifier
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as F
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder, CrossValidator
from pyspark.ml.feature import VectorAssembler

# Dataset reading and preparation 

In [2]:
# Reuse the active Spark session if there is one, otherwise create it with default settings.
spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/22 16:07:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Folder containing the parquet inputs, relative to the notebook's working directory.
path_to_data = "../../../dataset_offline/ft2model/"

# Targets and features are read separately and joined on ID_CLIENTE_BIC in a later cell.
targets_df = spark.read.parquet(f"{path_to_data}target_pandas.parquet")
# The feature table is cached because it is reused by the following cells.
df = spark.read.parquet(f"{path_to_data}filtered_features.parquet").cache()

23/09/22 16:07:29 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [4]:
# Keep one feature row per ID_CLIENTE_BIC, then attach the targets with a left join
# (feature rows without a matching target are kept).
# NOTE(review): if targets_df holds several rows per ID_CLIENTE_BIC the join will
# duplicate feature rows again - confirm the key is unique on the target side.
df = (
    df.dropDuplicates(subset=['ID_CLIENTE_BIC'])
    .join(targets_df, on="ID_CLIENTE_BIC", how="left")
)

df.show()

+--------------+-------------------------------------------------+----------------------------------------------------------+----------------------------------------------------------+----------------------------------------------------------+-----------------------------------------------------------+-------------------------------------------------+--------------------------------------------------+-------------------------------------------------+----------------------------------------+-----------------------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+-------------------------------------------+-------------------------------------------------+----------------------------------------------------+--------------------------------------------+----------------------------------------------------+-----------------------------------------

In [21]:
# Optional step, only meant for the multiclass experiment (skip or comment out this
# cell for the binary problem): rows whose ID_CLIENTE_BIC is below 40000000 get the
# label 2, every other row keeps its existing TARGET.

df = df.withColumn(
    "TARGET",
    F.when(F.col("ID_CLIENTE_BIC") < 40000000, 2).otherwise(F.col("TARGET")),
)

In [22]:
def get_features_cols_name(df: DataFrame, excluded_cols_name: list[str]) -> list[str]:
    """Return the column names of ``df`` that are not listed as excluded.

    Args:
        df: DataFrame whose ``columns`` attribute is inspected.
        excluded_cols_name: Column names to leave out of the result.

    Returns:
        The remaining column names, in the order they appear in ``df.columns``.
    """
    kept_cols_name = []
    for candidate in df.columns:
        if candidate in excluded_cols_name:
            continue
        kept_cols_name.append(candidate)
    return kept_cols_name

In [23]:
# Every column except the ID_CLIENTE_BIC key and the TARGET label is used as a feature.
features_cols_name = get_features_cols_name(df, excluded_cols_name=["ID_CLIENTE_BIC", "TARGET"])
# Cell output: number of feature columns.
len(features_cols_name)

78

# Dataset preparation

## Splitting Dataset

In [24]:
def train_test_split(df: DataFrame, test_size=0.3, seed: Optional[int] = None):
    """Randomly split ``df`` into a train and a test DataFrame.

    Args:
        df: DataFrame to split.
        test_size: Weight of the test split; the train split gets
            ``1 - test_size``. Must lie in ``[0, 1]``.
        seed: Optional seed forwarded to ``DataFrame.randomSplit`` so that the
            split (and every metric computed from it) can be reproduced.
            ``None`` (the default) keeps the previous, unseeded behaviour.

    Returns:
        The list ``[train_df, test_df]`` produced by ``DataFrame.randomSplit``.

    Raises:
        ValueError: If ``test_size`` is outside ``[0, 1]``.
    """
    # Fail early with a readable message instead of a JVM-side error on negative weights.
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

    return df.randomSplit(weights=[1 - test_size, test_size], seed=seed)

In [25]:
# Split with the default test_size of train_test_split (0.3).
train_df, test_df = train_test_split(df)

## Categorical encoding

In [22]:
from pyspark.ml.feature import StringIndexer, StringIndexerModel, OneHotEncoder, OneHotEncoderModel
from pyspark.sql.types import StringType


def _cols_name_encode(cols_name: list[str]) -> list[str]:
    return [col_name + "_encoded" for col_name in cols_name]

def _cols_replace(df, temp_cols_name, cols_name):
    df = df.drop(*cols_name)
    for temp_col_name, col_name in zip(temp_cols_name, cols_name):
        df = df.withColumnRenamed(temp_col_name, col_name)
    
    return df

### Label Encoding

In [21]:
def label_encode(df, cols_name: list[str], string_indexer_model: Optional[StringIndexerModel] = None) -> Tuple[DataFrame, StringIndexerModel]:
    """Replace the given columns with their label-encoded (indexed) values.

    Args:
        df: Input DataFrame.
        cols_name: Columns to encode; each one is overwritten by its index column.
        string_indexer_model: Already fitted indexer to reuse (e.g. the one fitted
            on the training split). When not provided, a new ``StringIndexer``
            with ``stringOrderType="alphabetAsc"`` is fitted on ``df``.

    Returns:
        A tuple ``(encoded_df, string_indexer_model)`` with the model that was
        actually applied, so it can be reused on other DataFrames.
    """
    encoded_names = _cols_name_encode(cols_name)

    if string_indexer_model:
        indexer = string_indexer_model
    else:
        # Supported orderings: frequencyDesc, frequencyAsc, alphabetDesc, alphabetAsc
        indexer = StringIndexer(
            inputCols=cols_name,
            outputCols=encoded_names,
            stringOrderType="alphabetAsc",
        ).fit(df)

    indexed_df = indexer.transform(df)
    return _cols_replace(indexed_df, encoded_names, cols_name), indexer

### One-Hot-Encoding

In [103]:
def one_hot_encode(
    df: DataFrame,
    cols_name: list[str],
    one_hot_encoder_model: Optional[OneHotEncoderModel] = None,
    string_indexer_model: Optional[StringIndexerModel] = None,
) -> Tuple[DataFrame, OneHotEncoderModel, Optional[StringIndexerModel]]:
    """Replace the given columns with their one-hot encoded vectors.

    String-typed columns are label-encoded first (via ``label_encode``), then
    all requested columns go through a ``OneHotEncoder``.

    Args:
        df: Input DataFrame.
        cols_name: Columns to one-hot encode; each one is overwritten.
        one_hot_encoder_model: Already fitted encoder to reuse. When not
            provided, a new ``OneHotEncoder`` is fitted on ``df``.
        string_indexer_model: Already fitted indexer forwarded to
            ``label_encode`` for the string-typed columns.

    Returns:
        A tuple ``(encoded_df, one_hot_encoder_model, string_indexer_model)``.
        The indexer is returned as received when no column is string-typed.
    """
    string_typed_cols = [
        name
        for name in cols_name
        if isinstance(df.schema[name].dataType, StringType)
    ]
    if string_typed_cols:
        df, string_indexer_model = label_encode(df, string_typed_cols, string_indexer_model)

    encoded_names = _cols_name_encode(cols_name)

    encoder = one_hot_encoder_model
    if not encoder:
        encoder = OneHotEncoder(inputCols=cols_name, outputCols=encoded_names).fit(df)

    encoded_df = _cols_replace(encoder.transform(df), encoded_names, cols_name)

    return encoded_df, encoder, string_indexer_model

### Example

In [121]:
# Alternative example built from a real feature column (kept for reference):
# encoding_df = spark.createDataFrame(data=df.select("somma_uscite_altre_spese__number_peaks__n_5").take(5)).withColumnRenamed("somma_uscite_altre_spese__number_peaks__n_5", "categ_feature")

# Tiny single-column frame with three categories, used to demo the encoders.
encoding_expample_df = spark.createDataFrame(
    data=[("a",), ("b",), ("a",), ("c",)],
    schema=["categ_feature"],
)
encoding_expample_df.show()

+-------------+
|categ_feature|
+-------------+
|            a|
|            b|
|            a|
|            c|
+-------------+



In [122]:
# one_hot_encode returns (encoded_df, one_hot_encoder_model, string_indexer_model);
# only the DataFrame (first element) can be shown.
one_hot_encode(encoding_expample_df, ["categ_feature"])[0].show()

+-------------+
|categ_feature|
+-------------+
|(2,[0],[1.0])|
|(2,[1],[1.0])|
|(2,[0],[1.0])|
|    (2,[],[])|
+-------------+



In [123]:
# label_encode returns (encoded_df, string_indexer_model); only the DataFrame
# (first element) can be shown.
label_encode(encoding_expample_df, ["categ_feature"])[0].show()

+-------------+
|categ_feature|
+-------------+
|          0.0|
|          1.0|
|          0.0|
|          2.0|
+-------------+



## Vector assembling

In [26]:
def vector_assemble(
    df: DataFrame, features_cols_name: list[str], output_col_name="features"
) -> DataFrame:
    """Append a single vector column built from the given feature columns.

    Args:
        df: Input DataFrame.
        features_cols_name: Columns merged into the vector, in this order.
        output_col_name: Name of the vector column that is added.

    Returns:
        ``df`` with the extra ``output_col_name`` column.
    """
    # handleInvalid="keep" is passed so that rows with invalid values are not
    # rejected by the assembler (see the VectorAssembler documentation).
    assembler_settings = {
        "inputCols": features_cols_name,
        "outputCol": output_col_name,
        "handleInvalid": "keep",
    }
    return VectorAssembler(**assembler_settings).transform(df)

In [27]:
# Assemble the same feature columns, in the same order, on both splits.
preprocessed_train_df, preprocessed_test_df = (
    vector_assemble(split_df, features_cols_name=features_cols_name)
    for split_df in (train_df, test_df)
)

# Model training and predicting

In [28]:
from pyspark.ml import Transformer
from xgboost.spark.core import _SparkXGBEstimator

def get_estimator(features_col_name: str = "features", label_col_name: str = "label"):
    """Build the (unfitted) XGBoost classifier used throughout the notebook.

    Args:
        features_col_name: Name of the assembled feature-vector column.
        label_col_name: Name of the label column.

    Returns:
        A ``SparkXGBClassifier`` with sparse-data optimisation enabled and
        ``missing=0.0``.
    """
    # NOTE(review): with missing=0.0 XGBoost treats zeros as missing values; this
    # appears to be required by enable_sparse_data_optim - confirm in the
    # xgboost.spark docs.
    classifier = SparkXGBClassifier(
        features_col=features_col_name,
        label_col=label_col_name,
        enable_sparse_data_optim=True,
        missing=0.0,
    )
    return classifier

def train(df: DataFrame, estimator: _SparkXGBEstimator) -> Transformer:
    """Fit ``estimator`` on ``df`` and return the fitted model.

    Args:
        df: Training DataFrame.
        estimator: Unfitted estimator exposing ``fit``.

    Returns:
        Whatever ``estimator.fit(df)`` returns (the fitted model).
    """
    fitted_model = estimator.fit(df)
    return fitted_model

def predict(df: DataFrame, model: Transformer) -> DataFrame:
    """Apply a fitted model to ``df``.

    Args:
        df: DataFrame to score.
        model: Fitted model exposing ``transform``.

    Returns:
        Whatever ``model.transform(df)`` returns (the scored DataFrame).
    """
    scored_df = model.transform(df)
    return scored_df

In [29]:
# Train on the assembled training split using TARGET as the label column.
estimator = get_estimator(label_col_name="TARGET")
model = train(preprocessed_train_df, estimator)

# Score the held-out split; result_df is reused by the evaluation cells below.
result_df = predict(preprocessed_test_df, model)
result_df.show()

2023-09-22 16:09:58,300 INFO XGBoost-PySpark: _fit Running xgboost-2.0.0 on 1 workers with
	booster params: {'objective': 'multi:softprob', 'device': 'cpu', 'num_class': 3, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': 0.0}
[16:10:00] task 0 got new rank 0                                    (0 + 1) / 1]
  if is_sparse(data):
2023-09-22 16:10:02,173 INFO XGBoost-PySpark: _fit Finished xgboost training!   


+--------------+-------------------------------------------------+----------------------------------------------------------+----------------------------------------------------------+----------------------------------------------------------+-----------------------------------------------------------+-------------------------------------------------+--------------------------------------------------+-------------------------------------------------+----------------------------------------+-----------------------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+-------------------------------------------+-------------------------------------------------+----------------------------------------------------+--------------------------------------------+----------------------------------------------------+-----------------------------------------

INFO:XGBoost-PySpark:Do the inference on the CPUs


# Evaluation

In [30]:
from pyspark.ml.evaluation import Evaluator, BinaryClassificationEvaluator, MulticlassClassificationEvaluator

def get_evaluator(metric: str, prediction_col_name: str = "prediction", rawPredictionCol: str = "rawPrediction", label_col_name: str = "label") -> Evaluator:
    """Return the Spark ML evaluator that computes ``metric``.

    Args:
        metric: Metric name. Multiclass metrics (e.g. ``"f1"``, ``"accuracy"``)
            map to ``MulticlassClassificationEvaluator``; ``"areaUnderROC"`` and
            ``"areaUnderPR"`` map to ``BinaryClassificationEvaluator``.
        prediction_col_name: Prediction column (multiclass evaluator only).
        rawPredictionCol: Raw prediction column (binary evaluator only).
        label_col_name: Label column.

    Returns:
        The configured evaluator.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.
    """
    multiclass_metrics = (
        "f1",
        "accuracy",
        "weightedPrecision",
        "weightedRecall",
        "weightedTruePositiveRate",
        "weightedFalsePositiveRate",
        "weightedFMeasure",
        "truePositiveRateByLabel",
        "falsePositiveRateByLabel",
        "precisionByLabel",
        "recallByLabel",
        "fMeasureByLabel",
        "logLoss",
        "hammingLoss",
    )
    binary_metrics = ("areaUnderROC", "areaUnderPR")

    if metric in multiclass_metrics:
        return MulticlassClassificationEvaluator(metricName=metric, predictionCol=prediction_col_name, labelCol=label_col_name)  # type: ignore
    if metric in binary_metrics:
        return BinaryClassificationEvaluator(metricName=metric, rawPredictionCol=rawPredictionCol, labelCol=label_col_name)  # type: ignore

    raise ValueError("Metric not supported")

In [31]:
# f1 is in the multiclass metric list of get_evaluator, so this is a MulticlassClassificationEvaluator on TARGET.
evaluator = get_evaluator(metric="f1", label_col_name="TARGET")

## Without Cross Validation

In [32]:
def evaluate(df: DataFrame, evaluator: Evaluator) -> float:
    """Compute the evaluator's metric on an already scored DataFrame.

    Args:
        df: DataFrame holding the label and prediction columns the evaluator reads.
        evaluator: Configured evaluator exposing ``evaluate``.

    Returns:
        The value returned by ``evaluator.evaluate(df)``.
    """
    metric_value = evaluator.evaluate(df)
    return metric_value

# Metric on the held-out predictions (single train/test split, no cross validation).
evaluate(result_df, evaluator)

2023-09-22 16:10:05,697 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
                                                                                

0.8306406174319174

## With Cross Validation

In [17]:
from pyspark.ml.tuning import CrossValidator

def evaluate_cv(df: DataFrame, estimator: _SparkXGBEstimator, evaluator: Evaluator, num_folds: int = 3) -> float:
    """Estimate the evaluator's metric with k-fold cross validation.

    The estimator is evaluated with its current parameters only: the parameter
    grid holds a single empty param map, i.e. "no overrides".

    Args:
        df: DataFrame to cross-validate on (features and label columns present).
        estimator: Unfitted estimator to validate.
        evaluator: Evaluator computing the metric on each validation fold.
        num_folds: Number of folds passed to ``CrossValidator`` as ``numFolds``.

    Returns:
        The metric averaged over the folds (``avgMetrics[0]``).
    """
    cv = CrossValidator(
        estimator=estimator,
        # One empty param map (what ParamGridBuilder().build() yields for an empty
        # grid) instead of [None], which is not a valid param map.
        estimatorParamMaps=[{}],
        evaluator=evaluator,
        parallelism=1,
        numFolds=num_folds,
    )
    model_cv = cv.fit(df)

    return model_cv.avgMetrics[0]

# Cross-validated metric on the training split, with the default 3 folds of evaluate_cv.
evaluate_cv(preprocessed_train_df, estimator, evaluator)

2023-09-21 09:51:38,804 INFO XGBoost-PySpark: _fit Running xgboost-2.0.0 on 1 workers with
	booster params: {'device': 'cpu', 'objective': 'binary:logistic', 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': 0.0}
[09:51:41] task 0 got new rank 0                                    (0 + 1) / 1]
2023-09-21 09:51:43,005 INFO XGBoost-PySpark: _fit Finished xgboost training!   
INFO:XGBoost-PySpark:Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs                 (2 + 8) / 200]
INFO:XGBoost-PySpark:Do the inference on the CPUs                 (3 + 8) / 200]
INFO:XGBoost-PySpark:Do the inference on the CPUs                 (4 + 8) / 200]
INFO:XGBoost-PySpark:Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs                 (6 + 8) / 200]
INFO:XGBoost-PySpark:Do the inference on the CPUs
2023-09-21 09:51:45,

0.9011699897429257

# Features importance

In [33]:
def get_features_importance(
    model: Transformer, features_cols_name: list[str], importance_type: str = "weight"
) -> dict[str, float]:
    """Map the model's feature importances back to feature column names.

    The model reports importances keyed by positional ids (``"f0"``, ``"f1"``,
    ...); ``features_cols_name[i]`` is taken to be the name of feature ``f{i}``.

    Args:
        model: Fitted model exposing ``get_feature_importances``.
        features_cols_name: Feature names, in the order used to build the
            feature vector.
        importance_type: Importance type forwarded to the model. Defaults to
            ``"weight"``, the value previously hard-coded.

    Returns:
        A dict from feature name to importance; features the model does not
        report get ``0.0``.

    Raises:
        ValueError: If the model reports an id outside
            ``f0 .. f{len(features_cols_name) - 1}``, which means
            ``features_cols_name`` does not describe the model's features.
    """
    features_ids_and_importances = model.get_feature_importances(importance_type=importance_type)  # type: ignore

    # Validate by id rather than by count: an out-of-range id would otherwise be
    # dropped silently whenever the model reports only a few features.
    expected_ids = {f"f{idx}" for idx in range(len(features_cols_name))}
    unknown_ids = sorted(set(features_ids_and_importances) - expected_ids)
    if unknown_ids:
        raise ValueError(
            "features_cols_name has not the right number of elements: "
            f"the model reports importances for unknown feature ids {unknown_ids}"
        )

    return {
        feature_col_name: features_ids_and_importances.get(f"f{idx}", 0.0)
        for idx, feature_col_name in enumerate(features_cols_name)
    }

In [34]:
# Importance of every feature column for the model trained above.
get_features_importance(model, features_cols_name)

{'somma_entrate_entrate_regolari__number_peaks__n_3': 171.0,
 'somma_entrate_scambio_soldi_tra_privati__number_peaks__n_1': 81.0,
 'somma_entrate_scambio_soldi_tra_privati__number_peaks__n_3': 41.0,
 'somma_entrate_scambio_soldi_tra_privati__number_peaks__n_5': 47.0,
 'somma_entrate_scambio_soldi_tra_privati__number_peaks__n_10': 22.0,
 'somma_entrate_entrate_regolari__number_peaks__n_1': 101.0,
 'somma_entrate_entrate_regolari__number_peaks__n_10': 66.0,
 'somma_entrate_scambio_soldi_tra_privati__variance': 112.0,
 'somma_entrate_entrate_regolari__variance': 536.0,
 'somma_entrate_scambio_soldi_tra_privati__last_location_of_maximum': 63.0,
 'somma_entrate_scambio_soldi_tra_privati__c3__lag_2': 34.0,
 'somma_entrate_scambio_soldi_tra_privati__c3__lag_1': 28.0,
 'somma_entrate_scambio_soldi_tra_privati__c3__lag_3': 46.0,
 'somma_uscite_altre_spese__number_peaks__n_5': 61.0,
 'somma_entrate_entrate_regolari__number_peaks__n_5': 89.0,
 'somma_entrate_entrate_occasionali__number_peaks__n_1

# Parameter tuning

https://spark.apache.org/docs/latest/ml-tuning.html#cross-validation

In [67]:
from typing import Union, Tuple
from pyspark.ml import Model

# A single hyper-parameter value.
ParamValue = Union[str, int, float]
# Search space given to tune_parameters: a list is a set of candidate values to
# try, a scalar is a value applied to every candidate.
Params = dict[str, Union[list[ParamValue], ParamValue]]
# Value read back from the best model for each parameter name.
BestParams = dict[str, ParamValue]

def tune_parameters(df: DataFrame, params: Params, estimator: _SparkXGBEstimator, evaluator: Evaluator, num_folds: int = 3) -> Tuple[BestParams, Model]:
    """Grid-search estimator parameters with k-fold cross validation.

    Args:
        df: DataFrame to cross-validate on.
        params: Search space keyed by estimator parameter name. A list value is
            a set of candidates added to the grid; any other value is applied
            to every grid point.
        estimator: Unfitted estimator whose parameters are tuned.
        evaluator: Evaluator used to compare the candidates.
        num_folds: Number of folds passed to ``CrossValidator`` as ``numFolds``.

    Returns:
        A tuple ``(best_params, best_model)``: the value the best model holds
        for each key of ``params``, and the best model itself.
    """
    builder = ParamGridBuilder()
    for name, candidates in params.items():
        target_param = estimator.getParam(name)
        if isinstance(candidates, list):
            builder = builder.addGrid(target_param, candidates)
        else:
            builder = builder.baseOn((target_param, candidates))
    param_maps = builder.build()

    # Printed so the evaluated combinations are visible in the cell output.
    print(param_maps)

    cross_validator = CrossValidator(
        estimator=estimator,
        estimatorParamMaps=param_maps,
        evaluator=evaluator,
        parallelism=1,
        numFolds=num_folds,
    )
    best_model = cross_validator.fit(df).bestModel

    best_params = {
        name: best_model.getOrDefault(param=best_model.getParam(paramName=name))
        for name in params.keys()
    }

    return best_params, best_model

# Try max_depth 2 and 3; the returned (best_params, best_model) tuple is the cell output and is not stored.
tune_parameters(df=preprocessed_train_df, params={"max_depth": [2, 3]}, estimator=estimator, evaluator=evaluator)

[{Param(parent='SparkXGBClassifier_ab015fde49d7', name='max_depth', doc='Refer to XGBoost doc of xgboost.sklearn.XGBClassifier for this param max_depth'): 2}, {Param(parent='SparkXGBClassifier_ab015fde49d7', name='max_depth', doc='Refer to XGBoost doc of xgboost.sklearn.XGBClassifier for this param max_depth'): 3}]


2023-09-20 18:25:19,858 INFO XGBoost-PySpark: _fit Running xgboost-2.0.0 on 1 workers with
	booster params: {'device': 'cpu', 'max_depth': 2, 'objective': 'multi:softprob', 'num_class': 3, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': 0.0}
[18:25:21] task 0 got new rank 0                                    (0 + 1) / 1]
2023-09-20 18:25:23,174 INFO XGBoost-PySpark: _fit Finished xgboost training!   
INFO:XGBoost-PySpark:Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs                 (3 + 8) / 200]
INFO:XGBoost-PySpark:Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs                 (5 + 8) / 200]
INFO:XGBoost-PySpark:Do the inference on the CPUs                 (6 + 8) / 200]
INFO:XGBoost-PySpark:Do the inference on the CPUs
INFO:XGBoost-PySpark

{'max_depth': 3}

23/09/20 20:33:54 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1018982 ms exceeds timeout 120000 ms
23/09/20 20:33:54 WARN SparkContext: Killing executors is not supported by current scheduler.
23/09/20 20:34:02 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:322)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:117)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:116)
	at org.apache.spark.storage.

In [62]:
# Inspect the Param object that tune_parameters looks up by name.
estimator.getParam("max_depth")

Param(parent='SparkXGBClassifier_ab015fde49d7', name='max_depth', doc='Refer to XGBoost doc of xgboost.sklearn.XGBClassifier for this param max_depth')

In [46]:
def get_param_value(param_name: str, model: Optional[Model] = None) -> Union[str, int, float]:
    """Read the value a fitted model holds for one of its parameters.

    Args:
        param_name: Name of the parameter to read (e.g. ``"max_depth"``).
        model: Model to read from, e.g. the best model returned by
            ``tune_parameters``. When omitted, the best model of the
            notebook-global ``cvModel`` is used, as before.

    Returns:
        The parameter's explicitly set value, or its default.

    Raises:
        NameError: If ``model`` is omitted and no global ``cvModel`` exists.
    """
    if model is None:
        # Legacy behaviour: depend on a `cvModel` left in the kernel by an
        # earlier cell. Prefer passing `model` explicitly so the cell also
        # works after Restart & Run All.
        model = cvModel.bestModel

    return model.getOrDefault(param=model.getParam(paramName=param_name))

# NOTE(review): get_param_value reads a global `cvModel` that no visible cell defines - make sure it exists before running this cell.
get_param_value("max_depth")

2

# Threshold tuning

In [36]:
# Show the predictions without truncating long cell values.
result_df.show(truncate=False)

+--------------+-------------------------------------------------+----------------------------------------------------------+----------------------------------------------------------+----------------------------------------------------------+-----------------------------------------------------------+-------------------------------------------------+--------------------------------------------------+-------------------------------------------------+----------------------------------------+-----------------------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+-------------------------------------------+-------------------------------------------------+----------------------------------------------------+--------------------------------------------+----------------------------------------------------+-----------------------------------------

2023-09-22 16:10:17,961 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
                                                                                

In [19]:
# Keep only the model's probability column (its name is read from the model).
y_pred_prob = result_df.select(model.getProbabilityCol())
y_pred_prob

DataFrame[probability: vector]

In [72]:
# Rebinds the notebook-level `evaluator` to an f1 evaluator built directly,
# with TARGET as label and the default prediction column.
evaluator = MulticlassClassificationEvaluator(metricName="f1", labelCol="TARGET")
evaluator.evaluate(result_df)

2023-09-19 16:49:10,761 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2023-09-19 16:49:10,769 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
                                                                                

0.977

In [66]:
# result_df.withColumn("prediction_ttuning", F.max("probability")).show()

In [67]:
# Preview the scored rows again before extracting the most probable class.
result_df.show()

[Stage 496:>                                                        (0 + 1) / 1]

+--------------+-------------------------------------------------+----------------------------------------------------------+----------------------------------------------------------+----------------------------------------------------------+-----------------------------------------------------------+-------------------------------------------------+--------------------------------------------------+-------------------------------------------------+----------------------------------------+-----------------------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+-------------------------------------------+-------------------------------------------------+----------------------------------------------------+--------------------------------------------+----------------------------------------------------+-----------------------------------------

2023-09-19 16:46:55,864 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
                                                                                

In [81]:
from pyspark.ml.functions import vector_to_array

# Earlier variant of the same steps that overwrote result_df in place (kept for reference):
# result_df = result_df.withColumn("probability_arr", vector_to_array(F.col('probability')))
# result_df = result_df.withColumn("max_probability", F.array_max("probability_arr"))
# result_df = result_df.select('*', F.posexplode("probability_arr"))
# result_df = result_df.where('col==max_probability')

# 1) Turn the probability vector into a plain array column.
tst_arr = result_df.withColumn("arr",vector_to_array(F.col('probability')))

# 2) Add the highest probability of each row.
tst_max=tst_arr.withColumn("max_value",F.array_max("arr"))
# 3) Explode the array into one row per element (the filter below reads the value column `col`).
tst_max_exp = tst_max.select('*',F.posexplode("arr"))
# 4) Keep only the element equal to the row maximum.
# NOTE(review): if two classes tie on the maximum, the input row appears twice.
tst_fin = tst_max_exp.where('col==max_value')

In [82]:
# Rows kept after filtering on the maximum probability.
tst_fin.show()

INFO:XGBoost-PySpark:Do the inference on the CPUs
2023-09-19 16:52:50,327 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs


+--------------+-------------------------------------------------+----------------------------------------------------------+----------------------------------------------------------+----------------------------------------------------------+-----------------------------------------------------------+-------------------------------------------------+--------------------------------------------------+-------------------------------------------------+----------------------------------------+-----------------------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+-------------------------------------------+-------------------------------------------------+----------------------------------------------------+--------------------------------------------+----------------------------------------------------+-----------------------------------------

                                                                                