# Vertically Federated XGB (SecureBoost) Analysis

>The following code is for demonstration only. It is **NOT for production** use due to system security concerns; please **DO NOT** use it directly in production.

## Comparison with XGB

In this notebook, we are going to compare the performance of our implementation of secureboost vs XGB in cleartext.

In the end we will give a comparison between the two models on the same datasets with respect to AUC.


In [1]:
import pprint
import time
from typing import List

import numpy as np
import pandas as pd
import secretflow as sf
import spu
import xgboost as xgb
from secretflow.data import FedNdarray, partition, PartitionWay
from secretflow.data.split import train_test_split as train_test_split_fed
from secretflow.data.vertical import VDataFrame
from secretflow.device.driver import reveal, wait
from secretflow.ml.boost.sgb_v import (
    get_classic_lightGBM_params,
    get_classic_XGB_params,
    Sgb,
)

from secretflow.ml.boost.sgb_v.core.params import xgb_params_converter
from secretflow.ml.boost.sgb_v.model import load_model
from secretflow.preprocessing import LabelEncoder
from secretflow.utils.simulation.datasets import (
    load_bank_marketing,
    load_bank_marketing_unpartitioned,
    load_creditcard,
    load_creditcard_unpartitioned,
)
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder as SkLabelEncoder

# Shared pretty-printer for nested structures, created with depth=4.
pp = pprint.PrettyPrinter(depth=4)

# Report which SecretFlow release this notebook is running against.
secretflow_version = sf.__version__
print('The version of SecretFlow: {}'.format(secretflow_version))

In [None]:
# Both parties use the loopback address in this demo.
alice_ip = '127.0.0.1'
bob_ip = '127.0.0.1'
# NOTE(review): alice_ip and bob_ip are equal here, so this dict ends up with a
# single entry; it is not referenced by the code visible in this notebook.
ip_party_map = {bob_ip: 'bob', alice_ip: 'alice'}

_system_config = {'lineage_pinning_enabled': False}

# Shut down any existing SecretFlow session before initialising a new one.
sf.shutdown()
sf.init(
    ['alice', 'bob'],
    address='local',
    _system_config=_system_config,
    object_store_memory=50 * 1024**3,
)

# SPU settings
spu_nodes = [
    {'party': 'alice', 'id': 'local:0', 'address': alice_ip + ':12945'},
    {'party': 'bob', 'id': 'local:1', 'address': bob_ip + ':12946'},
    # {'party': 'carol', 'id': 'local:2', 'address': '127.0.0.1:12347'},
]
spu_runtime_config = {
    # SEMI2K supports 2PC/3PC, ABY3 supports 3PC only, CHEETAH supports 2PC only.
    # The number of nodes above must match the chosen protocol.
    'protocol': spu.ProtocolKind.SEMI2K,
    'field': spu.FieldType.FM128,
}
cluster_def = {'nodes': spu_nodes, 'runtime_config': spu_runtime_config}

# HEU settings
he_parameters = {
    # OU is a fast encryption scheme that is as secure as Paillier.
    'schema': 'ou',
    # A 2048-bit key should be used to provide sufficient security.
    'key_pair': {'generate': {'bit_size': 2048}},
}
heu_encoding = {
    'cleartext_type': 'DT_I32',
    'encoder': 'IntegerEncoder',
    'encoder_args': {'scale': 1},
}
heu_config = {
    'sk_keeper': {'party': 'alice'},
    'evaluators': [{'party': 'bob'}],
    'mode': 'PHEU',
    'he_parameters': he_parameters,
    'encoding': heu_encoding,
}

In [3]:
# One PYU device handle per party.
alice, bob = sf.PYU('alice'), sf.PYU('bob')

# The HEU device is built from the HEU config plus the field type that the SPU
# runtime config declares.
spu_field = cluster_def['runtime_config']['field']
heu = sf.HEU(heu_config, spu_field)

In [4]:
# Feature columns, partitioned by column (axis=1) between the two parties.
# NOTE(review): bob's range (16, 16) looks empty, which would leave bob without
# any feature columns — confirm this split is intentional.
feature_parts = {alice: (0, 16), bob: (16, 16)}
data = load_bank_marketing(parts=feature_parts, axis=1, full=True)

# Column range (16, 17) is loaded for alice only; it is used as the label below.
label_parts = {alice: (16, 17)}
label = load_bank_marketing(parts=label_parts, axis=1, full=True)

# The same dataset as one unpartitioned table, for the cleartext XGBoost runs.
bank_unpartitioned = load_bank_marketing_unpartitioned(full=True)

In [5]:
# Display the dimensions of the unpartitioned bank marketing table.
bank_unpartitioned.shape

In [6]:
# Preview the first rows of the raw table. `head` must be called; the bare
# attribute only displays the bound method's repr.
bank_unpartitioned.head()

In [7]:
# Display the 'y' column, which is used as the label in the cells below.
bank_unpartitioned['y']

In [8]:
# Preprocess the federated data: label-encode each categorical feature column
# in turn, then the label itself, re-fitting the same encoder every time.
encoder = LabelEncoder()
for column in (
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'poutcome',
    'month',
):
    data[column] = encoder.fit_transform(data[column])
label = encoder.fit_transform(label)

In [9]:
# Apply the same label encoding to the cleartext table with scikit-learn's
# encoder, covering the categorical features and the label column 'y'.
encoder = SkLabelEncoder()
categorical_columns = (
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'poutcome',
    'y',
)
for col in categorical_columns:
    encoded_values = encoder.fit_transform(bank_unpartitioned[col])
    bank_unpartitioned[col] = encoded_values

In [10]:
# Preview the first rows after label encoding. `head` must be called; the bare
# attribute only displays the bound method's repr.
bank_unpartitioned.head()

## Find best convergence round with XGB

In [11]:
# Features are every column but the last; the last column is the label.
X = bank_unpartitioned.iloc[:, :-1]
y = bank_unpartitioned.iloc[:, -1]

# Hold out 20% for testing, then 20% of the remainder for validation, with the
# same seed for both splits.
split_options = {'test_size': 0.2, 'random_state': 94}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, **split_options)

# Histogram-based tree construction; early stopping is evaluated on the
# validation split passed to fit().
xgb_settings = dict(
    tree_method="hist",
    n_estimators=100,
    max_depth=5,
    learning_rate=0.3,
    max_bin=10,
    early_stopping_rounds=5,
    base_score=0.5,
    eval_metric="auc",
    reg_lambda=0.1,
    min_child_weight=0,
)
clf = xgb.XGBClassifier(**xgb_settings)
clf.fit(X_train, y_train, eval_set=[(X_val, y_val)])

# AUC is computed on the probability of the positive class (column 1).
train_auc = roc_auc_score(y_train, clf.predict_proba(X_train)[:, 1])
test_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
print(
    "train set AUC score: ",
    train_auc,
    "test set AUC score: ",
    test_auc,
    "num_trees: ",
    clf.best_iteration + 1,
)

In [12]:
# Train on the whole dataset and measure performance.

# Same hyper-parameters as the early-stopping run, but without early stopping:
# the number of trees is fixed to the best round found by `clf`.
final_xgb_settings = dict(
    tree_method="hist",
    n_estimators=clf.best_iteration + 1,
    max_depth=5,
    learning_rate=0.3,
    max_bin=10,
    eval_metric="auc",
    base_score=0.5,
    reg_lambda=0.1,
    min_child_weight=0,
)
clf2 = xgb.XGBClassifier(**final_xgb_settings)
clf2.fit(X, y)

In [13]:
# clf2 was fitted without early stopping. Newer XGBoost releases only define
# `best_iteration` when early stopping ran, so the number of boosted rounds is
# read from the underlying booster instead.
print(
    "train set AUC score: ",
    roc_auc_score(y, clf2.predict_proba(X)[:, 1]),
    "num_trees: ",
    clf2.get_booster().num_boosted_rounds(),
)

In [14]:
# Split features and label with identical options.
# NOTE(review): presumably the shared seed keeps feature and label rows
# aligned across the two calls — confirm against train_test_split_fed.
fed_split_options = {'test_size': 0.2, 'random_state': 94}
X_train_fed, X_test_fed = train_test_split_fed(data, **fed_split_options)
y_train_fed, y_test_fed = train_test_split_fed(label, **fed_split_options)

## Try to do the same using sgb

In [15]:
# Train SGB with settings mirroring the XGBoost early-stopping run above.
sgb = Sgb(heu)
params = get_classic_XGB_params()
sgb_overrides = {
    'num_boost_round': 100,
    'max_depth': 5,
    'base_score': 0.5,
    'reg_lambda': 0.1,
    'learning_rate': 0.3,
    'sketch_eps': 1 / 10,
    'enable_early_stop': True,
    'enable_monitor': True,
    'validation_fraction': 0.2,
    'stopping_rounds': 5,
    'stopping_tolerance': 1e-10,
    'seed': 94,
    'first_tree_with_label_holder_feature': False,
    'save_best_model': True,
}
for option, value in sgb_overrides.items():
    params[option] = value

model = sgb.train(params, X_train_fed, y_train_fed)

In [16]:
# The true labels are revealed from alice's partition and the federated
# predictions from the model before scoring in cleartext.
sgb_train_auc = roc_auc_score(
    reveal(y_train_fed.partitions[alice].data), reveal(model.predict(X_train_fed))
)
sgb_test_auc = roc_auc_score(
    reveal(y_test_fed.partitions[alice].data), reveal(model.predict(X_test_fed))
)
sgb_tree_count = len(model.get_trees())
print(
    "train set AUC score: ",
    sgb_train_auc,
    "test set AUC score: ",
    sgb_test_auc,
    "num_trees: ",
    sgb_tree_count,
)

In [17]:
# Train SGB on the whole dataset, without early stopping, for the number of
# rounds at which the XGBoost early-stopping run converged.
sgb = Sgb(heu)
params = get_classic_XGB_params()
params['num_boost_round'] = clf.best_iteration + 1
params['max_depth'] = 5
params['base_score'] = 0.5
# Set explicitly so this run uses the same regularisation as the SGB
# early-stopping run and the XGBoost full-data model (both use 0.1).
params['reg_lambda'] = 0.1
params['learning_rate'] = 0.3
params['sketch_eps'] = 1 / 10
params['seed'] = 94
params['first_tree_with_label_holder_feature'] = False

model2 = sgb.train(params, data, label)

In [18]:
# Score the full-data SGB model on the data it was trained on.
revealed_labels = reveal(label.partitions[alice].data)
revealed_predictions = reveal(model2.predict(data))
full_train_auc = roc_auc_score(revealed_labels, revealed_predictions)
print(
    "train set AUC score: ",
    full_train_auc,
    "num_trees: ",
    len(model2.get_trees()),
)

## Pipeline the test run

We have performed one test run on the bank dataset. In this test, we use the same parameters for XGB and SGB and compare the model results.
There are several steps:

1. run XGB on a set of params on training data with early stopping enabled
2. run SGB on a set of params on training data with early stopping enabled
3. run XGB on the whole training data (no early stopping) with the optimal rounds from step 1
4. run SGB on the whole training data (no early stopping) with the optimal rounds from step 1
5. collect results of convergence rounds and AUC scores

We have found that the convergence rounds are close and the AUC scores are similar between XGB and SGB, which adds evidence that SGB is comparable to XGB in terms of accuracy.

However, a single data point is not enough to make a conclusion about the performance of SGB.
It is possible to perform multiple runs and collect the data.

Now we are going to pipeline the test run process.

In [19]:
# we begin with a single np dataset
# we use label encoding for all the categorical variables
# we separate the labels from the features
# we then give it to alice and bob by halves, in particular give alice the label


def _is_string_column(series: pd.Series) -> bool:
    """Return True when a non-empty column's first value is a ``str``."""
    # .iloc is positional, so this works for any index labels; the original
    # label lookup `series[0]` needs an index entry that is literally 0.
    return len(series) > 0 and isinstance(series.iloc[0], str)


def preprocess_data(dataset: pd.DataFrame, label_column: str):
    """Label-encode string columns and split the features between alice and bob.

    The caller's ``dataset`` is not modified, so the same frame can safely be
    passed in again (as ``run_repeated_experiments`` does).

    Args:
        dataset: A single cleartext table holding both features and label.
        label_column: Name of the label column in ``dataset``.

    Returns:
        A tuple ``(X, y, vdata, label)``: the cleartext features, the
        cleartext label, a VDataFrame holding the first half of the feature
        columns on alice and the rest on bob, and a VDataFrame holding the
        label on alice only.
    """
    # Separate labels from features. `drop` returns a new frame, so encoding
    # below never writes into the caller's data.
    X = dataset.drop(label_column, axis=1)
    y = dataset[label_column]

    # Perform label encoding on categorical (string-valued) variables.
    for col in X.columns:
        if _is_string_column(X[col]):
            X[col] = SkLabelEncoder().fit_transform(X[col])
    if _is_string_column(y):
        y = pd.Series(SkLabelEncoder().fit_transform(y), index=y.index, name=y.name)

    # alice receives the first half of the columns (rounded down), bob the rest.
    split_count = X.shape[1] // 2
    vdata = VDataFrame(
        partitions={
            # An identity function run through each party's PYU hands that
            # party its slice of the table.
            alice: partition(alice(lambda x: x)(X.iloc[:, :split_count])),
            bob: partition(bob(lambda x: x)(X.iloc[:, split_count:])),
        }
    )
    label = VDataFrame(partitions={alice: partition(alice(lambda x: x)(y))})

    return X, y, vdata, label


# Baseline XGBoost hyper-parameters shared by every pipelined experiment.
# "random_state" also seeds the train/validation/test splits in fit_xgb and
# fit_sgb.
DEFAULT_XGB_PARAMS = dict(
    tree_method="hist",
    n_estimators=100,
    max_depth=5,
    learning_rate=0.3,
    max_bin=10,
    early_stopping_rounds=5,
    base_score=0.5,
    eval_metric="auc",
    reg_lambda=0.1,
    min_child_weight=0,
    random_state=95,
)


def fit_xgb(X, y, params, valid_frac=0.2, test_frac=0.2):
    """Fit an XGBoost classifier on a train/validation/test split of (X, y).

    Args:
        X: Cleartext feature table.
        y: Cleartext labels.
        params: Keyword arguments for ``xgb.XGBClassifier``; its
            ``"random_state"`` entry also seeds both splits.
        valid_frac: Fraction of the non-test rows held out for validation.
        test_frac: Fraction of all rows held out for testing.

    Returns:
        A tuple ``(clf, X_test, y_test)`` with the fitted classifier and the
        held-out test split.
    """
    seed = params["random_state"]
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=test_frac, random_state=seed
    )
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_rest, y_rest, test_size=valid_frac, random_state=seed
    )

    # The validation split is passed as eval_set for the classifier to
    # evaluate during fitting.
    classifier = xgb.XGBClassifier(**params)
    classifier.fit(X_fit, y_fit, eval_set=[(X_val, y_val)])
    return classifier, X_test, y_test


def find_xgb_auc_and_best_iter_round(clf, X_test, y_test):
    """Return ``(test AUC, number of trees at the best iteration)`` for ``clf``."""
    # best_iteration is zero-based, hence the +1 to get a tree count.
    tree_count = clf.best_iteration + 1
    positive_class_scores = clf.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, positive_class_scores), tree_count


def fit_sgb(vdata, label, params, valid_frac=0.2, test_frac=0.2):
    """Train an SGB model on a federated train/test split.

    Args:
        vdata: Federated feature frame.
        label: Federated label frame.
        params: XGBoost-style parameters; converted with
            ``xgb_params_converter`` and also read for ``"random_state"``.
        valid_frac: Value stored as SGB's ``validation_fraction``.
        test_frac: Fraction of rows held out for testing.

    Returns:
        A tuple ``(model, X_test_fed, y_test_fed)``.
    """
    split_seed = params["random_state"]
    # Features and label are split separately with the same fraction and seed.
    X_train_fed, X_test_fed = train_test_split_fed(
        vdata, test_size=test_frac, random_state=split_seed
    )
    y_train_fed, y_test_fed = train_test_split_fed(
        label, test_size=test_frac, random_state=split_seed
    )

    trainer = Sgb(heu)
    sgb_params = xgb_params_converter(params)
    sgb_params['validation_fraction'] = valid_frac
    return trainer.train(sgb_params, X_train_fed, y_train_fed), X_test_fed, y_test_fed


def find_sgb_auc_and_best_iter_round(model, X_test_fed, y_test_fed):
    """Return ``(test AUC, number of trees)`` for a trained SGB model."""
    # Labels come from alice's partition; both sides are revealed to cleartext
    # before scoring.
    true_labels = reveal(y_test_fed.partitions[alice].data)
    predicted = reveal(model.predict(X_test_fed))
    return roc_auc_score(true_labels, predicted), len(model.get_trees())


def find_sgb_auc_at_xgb_convergence_point(
    model, X_test_fed, y_test_fed, xgb_converge_num_trees
):
    """Return the test AUC of ``model`` sliced to ``xgb_converge_num_trees``."""
    true_labels = reveal(y_test_fed.partitions[alice].data)
    # Slice the model to `[:xgb_converge_num_trees]` before predicting.
    truncated_model = model[:xgb_converge_num_trees]
    predicted = reveal(truncated_model.predict(X_test_fed))
    return roc_auc_score(true_labels, predicted)


class ExperimentResult:
    """Metrics of one XGBoost-vs-SGB comparison run; every metric starts at 0."""

    # Metric names, in the order they are stored and exported.
    _FIELDS = (
        'xgb_test_auc',
        'xgb_num_trees',
        'xgboost_fit_time',
        'sgb_test_auc',
        'sgb_num_trees',
        'sgb_fit_time',
        'sgb_test_auc_at_xgb_convergence',
    )

    def __init__(self):
        for field_name in self._FIELDS:
            setattr(self, field_name, 0)

    def to_dict(self):
        """Return the metrics as a plain dict keyed by metric name."""
        return {field_name: getattr(self, field_name) for field_name in self._FIELDS}


def run_experiement(
    dataset: pd.DataFrame,
    label_column: str,
    params: dict,
    experiment_name: str,
    valid_frac=0.2,
    test_frac=0.2,
):
    """Run one XGBoost-vs-SGB comparison and return its ExperimentResult.

    Args:
        dataset: Cleartext table containing features and label.
        label_column: Name of the label column.
        params: XGBoost-style parameters used for both models.
        experiment_name: Name printed when the run starts.
        valid_frac: Validation fraction forwarded to both fits.
        test_frac: Test fraction forwarded to both fits.

    Returns:
        An ``ExperimentResult`` with AUC, tree count and fit time for both
        models, plus the SGB AUC when sliced to XGBoost's tree count.
    """
    X, y, vdata, label = preprocess_data(dataset, label_column)
    print("Starting {}".format(experiment_name))
    outcome = ExperimentResult()

    # Cleartext XGBoost: the timer covers fit_xgb only, not the scoring.
    xgb_started = time.perf_counter()
    clf, X_test, y_test = fit_xgb(X, y, params, valid_frac, test_frac)
    outcome.xgboost_fit_time = time.perf_counter() - xgb_started
    outcome.xgb_test_auc, outcome.xgb_num_trees = find_xgb_auc_and_best_iter_round(
        clf, X_test, y_test
    )
    print(
        "XGBoost Test AUC: {}, Num Trees: {}".format(
            outcome.xgb_test_auc, outcome.xgb_num_trees
        )
    )

    # Federated SGB: timed the same way.
    sgb_started = time.perf_counter()
    model, X_test_fed, y_test_fed = fit_sgb(vdata, label, params, valid_frac, test_frac)
    outcome.sgb_fit_time = time.perf_counter() - sgb_started
    outcome.sgb_test_auc, outcome.sgb_num_trees = find_sgb_auc_and_best_iter_round(
        model, X_test_fed, y_test_fed
    )
    print(
        "SGB Test AUC: {}, Num Trees: {}".format(
            outcome.sgb_test_auc, outcome.sgb_num_trees
        )
    )

    # SGB scored with only as many trees as XGBoost needed.
    outcome.sgb_test_auc_at_xgb_convergence = find_sgb_auc_at_xgb_convergence_point(
        model, X_test_fed, y_test_fed, outcome.xgb_num_trees
    )
    print(
        "SGB Test AUC at XGB Convergence Point: {}".format(
            outcome.sgb_test_auc_at_xgb_convergence
        )
    )
    return outcome


def collect_results(results: List[ExperimentResult]):
    """Build a DataFrame with one row per result and one column per metric."""
    return pd.DataFrame([entry.to_dict() for entry in results])


def run_repeated_experiments(
    dataset: pd.DataFrame,
    label_column: str,
    params: dict,
    experiment_name: str,
    num_repeats: int,
    valid_frac=0.2,
    test_frac=0.2,
    seed=1212,
):
    """Repeat the XGBoost-vs-SGB comparison with a fresh random state per run.

    Args:
        dataset: Cleartext table containing features and label.
        label_column: Name of the label column.
        params: XGBoost-style parameters; this dict is not modified.
        experiment_name: Base name; each run is suffixed ``_repeat_<k>``.
        num_repeats: Number of runs to perform.
        valid_frac: Validation fraction forwarded to every run.
        test_frac: Test fraction forwarded to every run.
        seed: Seed for NumPy's global generator, from which the per-run
            ``random_state`` values are drawn.

    Returns:
        A DataFrame with one row of metrics per run.
    """
    np.random.seed(seed)

    results = []
    for repeat in range(1, num_repeats + 1):
        random_state = np.random.randint(low=0, high=1000000)
        # Work on a copy so the caller's dict (for instance the module-level
        # DEFAULT_XGB_PARAMS) keeps its original "random_state".
        run_params = {**params, "random_state": random_state}
        results.append(
            run_experiement(
                dataset,
                label_column,
                run_params,
                f"{experiment_name}_repeat_{repeat}",
                valid_frac,
                test_frac,
            )
        )
    return collect_results(results)

In [None]:
# Outputs are cleared in the web demo for viewing purposes.

# 20 repeated comparisons on the credit card dataset; 'Class' is the label.
creditcard_experiement_results_table = run_repeated_experiments(
    dataset=load_creditcard_unpartitioned(),
    label_column='Class',
    params=DEFAULT_XGB_PARAMS,
    experiment_name='creditcard',
    num_repeats=20,
)

In [None]:
# Outputs are cleared in the web demo for viewing purposes.

# 15 repeated comparisons on the full bank marketing dataset; 'y' is the label.
bank_marketing_experiement_results_table = run_repeated_experiments(
    dataset=load_bank_marketing_unpartitioned(full=True),
    label_column='y',
    params=DEFAULT_XGB_PARAMS,
    experiment_name='bank_marketing',
    num_repeats=15,
)

In [22]:
# Summary statistics of the collected metrics across the credit card runs.
creditcard_experiement_results_table.describe()

In [23]:
# Summary statistics of the collected metrics across the bank marketing runs.
bank_marketing_experiement_results_table.describe()

In [24]:
import pandas as pd


def xy_to_dataframe(X, y, feature_names=None):
    """
    Convert (X, y) into a pandas DataFrame.

    X and y are combined positionally (row i of X with element i of y); the
    index or column labels of pandas inputs are ignored.

    Parameters:
    - X (array-like): A two-dimensional array-like structure containing feature data.
    - y (array-like): A one-dimensional array-like structure containing target variable.
    - feature_names (list of str, optional): A list of feature names for the DataFrame columns. If None, generic names are used.

    Returns:
    - df (pd.DataFrame): A pandas DataFrame containing X and y combined, with y in a column named 'target'.

    Raises:
    - ValueError: If X exposes a shape that is not two-dimensional, or if
      feature_names does not match the number of features in X.
    """
    # Drop pandas labels up front: otherwise `columns=` would select columns
    # by name and `concat` would align rows by index, both of which can fill
    # the result with NaN.
    if isinstance(X, pd.DataFrame):
        X = X.to_numpy()
    if isinstance(y, pd.Series):
        y = y.to_numpy()

    # Work out the number of features without assuming X[0] exists.
    shape = getattr(X, "shape", None)
    if shape is not None:
        if len(shape) != 2:
            raise ValueError("X must be two-dimensional.")
        n_features = shape[1]
    elif len(X) > 0:
        n_features = len(X[0])
    else:
        # An empty sequence carries no width; trust feature_names if given.
        n_features = 0 if feature_names is None else len(feature_names)

    # Check if feature_names is provided; if not, create generic feature names
    if feature_names is None:
        feature_names = [f"feature_{i}" for i in range(1, n_features + 1)]
    elif len(feature_names) != n_features:
        raise ValueError(
            "Length of feature_names does not match number of features in X."
        )

    # Convert X and y into pandas objects and place y after the features
    df_X = pd.DataFrame(X, columns=feature_names)
    df_y = pd.Series(y, name='target')

    return pd.concat([df_X, df_y], axis=1)

In [None]:
# outputs are cleared for viewing purpose

from sklearn.datasets import make_classification

# 10 repeated comparisons on a synthetic classification dataset with
# 2,000,000 samples and 200 features. The generated arrays are passed straight
# through without being bound to notebook-level names.
random_200w_200d_experiement_results_table = run_repeated_experiments(
    dataset=xy_to_dataframe(
        *make_classification(n_samples=2_000_000, n_features=200, random_state=42)
    ),
    label_column='target',
    params=DEFAULT_XGB_PARAMS,
    experiment_name='random_200w_200d',
    num_repeats=10,
)

In [26]:
# Summary statistics of the collected metrics across the synthetic-data runs.
random_200w_200d_experiement_results_table.describe()

## Conclusion

As we can see, SGB can perform similarly to XGBoost models in terms of AUC.

However, it is not as fast as XGBoost: in a LAN setting, SGB training takes roughly 8 to 12 times as long as XGBoost.

Welcome to contribute and run more analysis on more datasets and parameters!