In [None]:
from pyspark.sql import SparkSession

from ts_train.ft2model.training_helper import TrainingHelper
from ts_train.ft2model.core import get_features_cols_name, train_test_split

# Dataset reading and preparation 

In [None]:
# Reuse the already-running Spark session if there is one, otherwise start a
# new one with the default builder configuration.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Folder holding the offline ft2model parquet files, relative to this notebook.
path_to_data = "../../../dataset_offline/ft2model/"

targets_df = spark.read.parquet(path_to_data + "target_pandas.parquet")

# Build the modelling frame in a single chain:
#   1. read the filtered features and mark that raw frame for caching,
#   2. keep a single row per ID_CLIENTE_BIC,
#   3. left-join the targets, so clients without a match in targets_df are
#      kept and get nulls in the columns coming from targets_df.
# NOTE(review): only the raw read is cached, not the de-duplicated/joined
# result that the later cells reuse — confirm this is the intended cache point.
# NOTE(review): presumably TARGET comes from targets_df; if so, unmatched
# clients reach training with a null label — verify this is handled downstream.
df = (
    spark.read.parquet(path_to_data + "filtered_features.parquet")
    .cache()
    .dropDuplicates(subset=["ID_CLIENTE_BIC"])
    .join(targets_df, on="ID_CLIENTE_BIC", how="left")
)

# Quick preview of the joined frame (show() prints only the first rows).
df.show()

In [None]:
# Columns that must not be used as model inputs: the client identifier and
# the label column.
non_feature_cols = ["ID_CLIENTE_BIC", "TARGET"]

# Presumably returns the names of all remaining columns of df — confirm
# against get_features_cols_name in ts_train.ft2model.core.
features_cols_name = get_features_cols_name(df=df, excluded_cols_name=non_feature_cols)

In [None]:
# Split the prepared frame into a training part and a test part.
# NOTE(review): split proportions and any random seed are whatever
# train_test_split uses by default — not visible here, confirm reproducibility.
train_df, test_df = train_test_split(df)

# Model training

In [None]:
# Hyper-parameter settings handed to TrainingHelper. Left empty here; the
# commented entries are example candidate values that can be re-enabled.
# NOTE(review): how TrainingHelper interprets lists of values (e.g. as a
# search grid) is not visible in this notebook — confirm before enabling them.
helper_params = {
    # "max_depth": [3, 5, 7, 9],
    # "min_child_weight": [1, 3, 5],
    # "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
    # "subsample": [0.6, 0.7, 0.8, 0.9],
    # "colsample_bytree": [0.6, 0.7, 0.8, 0.9],
}

# Columns passed to the helper as unordered categorical features.
# NOTE(review): the original note on this argument read "Activate to add
# categorical example", yet the argument is already active — confirm whether
# this example column should stay enabled.
categorical_cols = [
    "somma_entrate_scambio_soldi_tra_privati__number_peaks__n_10",
]

# Classification helper trained on the feature columns with TARGET as label.
training_helper = TrainingHelper(
    type="classification",
    features_cols_name=features_cols_name,
    label_col_name="TARGET",
    params=helper_params,
    unordered_categ_features_cols_name=categorical_cols,
)

In [None]:
# Fit the helper on the training split only; test_df is not passed here.
training_helper.fit(train_df)

# Model prediction and scoring

In [None]:
# Run the fitted helper on the test split and keep the result for scoring.
prediction_df = training_helper.predict(test_df)

In [None]:
# Score the predictions computed on the test split. As the last expression of
# the cell, whatever score() returns is rendered as the cell output.
# NOTE(review): which metric(s) are reported is defined by TrainingHelper.score.
training_helper.score(prediction_df)

In [None]:
# Print the feature importances of the fitted helper. The Spark session is
# passed in; presumably the helper uses it to build the frame that show()
# prints — confirm against TrainingHelper.get_feature_importance.
training_helper.get_feature_importance(spark).show()

# Model Saving

In [None]:
# Persist the fitted helper under the name "model".
# NOTE(review): on-disk format and overwrite behaviour are defined by
# TrainingHelper.save — confirm a re-run does not fail on an existing "model".
training_helper.save("model")

# Model Loading

In [None]:
# Reload the helper from the "model" location written by the save cell. This
# rebinds training_helper, replacing the in-memory instance fitted earlier.
training_helper = TrainingHelper.load("model")

In [None]:
# Predict on the same test split with the reloaded helper.
# NOTE(review): result_df is neither displayed nor compared with prediction_df
# here, so the save/load round trip is not actually verified by this cell.
result_df = training_helper.predict(test_df)

# Experimenting