Implement fair XGBoost. #7640

Open

wants to merge 20 commits into base: master
19 changes: 19 additions & 0 deletions doc/model.schema
@@ -204,6 +204,14 @@
}
}
},
"binary_regularized_param": {
"type": "object",
"properties": {
"fairness": {
"type": "number"
}
}
},
"aft_loss_param": {
"type": "object",
"properties": {
@@ -378,6 +386,17 @@
"reg_loss_param"
]
},
{
"type": "object",
"properties": {
"name": { "const": "binary:regularized" },
"binary_regularized_param": { "$ref": "#/definitions/binary_regularized_param"}
},
"required": [
"name",
"binary_regularized_param"
]
},
{
"type": "object",
"properties": {
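For orientation, this schema entry is meant to validate an objective block like the one produced below when a model is saved as JSON. A minimal sketch, assuming this branch is built and that the `sensitive_feature` keyword and the `fairness` parameter behave as documented later in this diff; the toy data and the value 0.5 are arbitrary.

```python
import json

import numpy as np
import xgboost as xgb

# Tiny synthetic problem: binary label plus a binary sensitive attribute.
X = np.random.rand(64, 4)
y = np.random.randint(0, 2, size=64)
s = np.random.randint(0, 2, size=64).astype(np.float32)

dtrain = xgb.DMatrix(X, label=y, sensitive_feature=s)
booster = xgb.train({"objective": "binary:regularized", "fairness": 0.5},
                    dtrain, num_boost_round=2)

# Save as JSON and look at the objective block that the schema entry validates.
booster.save_model("fair_model.json")
with open("fair_model.json") as fd:
    obj = json.load(fd)["learner"]["objective"]
print(obj["name"])                          # "binary:regularized"
print(obj.get("binary_regularized_param"))  # e.g. {"fairness": "0.5"}
```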
9 changes: 8 additions & 1 deletion doc/parameter.rst
@@ -352,6 +352,11 @@ Parameters for Tweedie Regression (``objective=reg:tweedie``)
- Set closer to 2 to shift towards a gamma distribution
- Set closer to 1 to shift towards a Poisson distribution.

Parameter for Fair Classification (``objective=binary:regularized``)
====================================================================

* ``fairness``: The strength of the fairness regularization. Must be greater than 0.

************************
Learning Task Parameters
************************
@@ -361,9 +366,10 @@ Specify the learning task and the corresponding learning objective. The objectiv

- ``reg:squarederror``: regression with squared loss.
- ``reg:squaredlogerror``: regression with squared log loss :math:`\frac{1}{2}[log(pred + 1) - log(label + 1)]^2`. All input labels are required to be greater than -1. Also, see metric ``rmsle`` for possible issue with this objective.
- ``reg:logistic``: logistic regression
- ``reg:logistic``: logistic regression.
- ``reg:pseudohubererror``: regression with Pseudo Huber loss, a twice differentiable alternative to absolute loss.
- ``binary:logistic``: logistic regression for binary classification, output probability
- ``binary:regularized``: regularized logistic regression for binary classification, output probability
- ``binary:logitraw``: logistic regression for binary classification, output score before logistic transformation
- ``binary:hinge``: hinge loss for binary classification. This makes predictions of 0 or 1, rather than producing probabilities.
- ``count:poisson``: Poisson regression for count data, output mean of Poisson distribution.
@@ -400,6 +406,7 @@ Specify the learning task and the corresponding learning objective. The objectiv
- ``mape``: `mean absolute percentage error <https://en.wikipedia.org/wiki/Mean_absolute_percentage_error>`_
- ``mphe``: `mean Pseudo Huber error <https://en.wikipedia.org/wiki/Huber_loss>`_. Default metric of ``reg:pseudohubererror`` objective.
- ``logloss``: `negative log-likelihood <http://en.wikipedia.org/wiki/Log-likelihood>`_
- ``regularized-logloss``: Default metric of ``binary:regularized`` objective.
- ``error``: Binary classification error rate. It is calculated as ``#(wrong cases)/#(all cases)``. For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances.
- ``error@t``: a different than 0.5 binary classification threshold value could be specified by providing a numerical value through 't'.
- ``merror``: Multiclass classification error rate. It is calculated as ``#(wrong cases)/#(all cases)``.
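Putting the documented pieces together (the ``binary:regularized`` objective, its ``fairness`` strength, and the ``regularized-logloss`` metric), a usage sketch under the assumption that the ``sensitive_feature`` keyword from the Python changes further down is available; the dataset and hyper-parameter values are illustrative only.

```python
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 10))
y = rng.integers(0, 2, size=1000)
s = rng.integers(0, 2, size=1000).astype(np.float32)  # sensitive attribute per row

dtrain = xgb.DMatrix(X, label=y, sensitive_feature=s)

params = {
    "objective": "binary:regularized",     # fair (regularized) logistic regression
    "fairness": 0.8,                       # regularization strength, must be > 0
    "eval_metric": "regularized-logloss",  # default metric for this objective
    "eta": 0.1,
    "max_depth": 4,
}
booster = xgb.train(params, dtrain, num_boost_round=50, evals=[(dtrain, "train")])
pred = booster.predict(dtrain)             # probabilities in [0, 1]
```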
4 changes: 3 additions & 1 deletion include/xgboost/data.h
@@ -46,7 +46,7 @@ enum class FeatureType : uint8_t { kNumerical = 0, kCategorical = 1 };
class MetaInfo {
public:
/*! \brief number of data fields in MetaInfo */
static constexpr uint64_t kNumField = 12;
static constexpr uint64_t kNumField = 13;

/*! \brief number of rows in the data */
uint64_t num_row_{0}; // NOLINT
@@ -63,6 +63,8 @@ class MetaInfo {
std::vector<bst_group_t> group_ptr_; // NOLINT
/*! \brief weights of each instance, optional */
HostDeviceVector<bst_float> weights_; // NOLINT
/*! \brief sensitive feature of each instance, optional */
linalg::Tensor<float, 1> sensitive_features; // NOLINT
/*!
* \brief initialized margins,
* if specified, xgboost will start from this init margin
2 changes: 2 additions & 0 deletions include/xgboost/generic_parameters.h
@@ -42,6 +42,8 @@ struct GenericParameter : public XGBoostParameter<GenericParameter> {
*/
int32_t Threads() const;

bool IsCPU() const { return gpu_id == kCpuId; }

// declare parameters
DMLC_DECLARE_PARAMETER(GenericParameter) {
DMLC_DECLARE_FIELD(seed).set_default(kDefaultSeed).describe(
15 changes: 13 additions & 2 deletions include/xgboost/linalg.h
@@ -545,8 +545,19 @@ using VectorView = TensorView<T, 1>;
*/
template <typename T>
auto MakeVec(T *ptr, size_t s, int32_t device = -1) {
using U = std::remove_const_t<std::remove_pointer_t<decltype(ptr)>> const;
return linalg::TensorView<U, 1>{{ptr, s}, {s}, device};
return linalg::TensorView<T, 1>{{ptr, s}, {s}, device};
}

template <typename T>
auto MakeVec(HostDeviceVector<T> *data) {
return MakeVec(data->DeviceIdx() == -1 ? data->HostPointer() : data->DevicePointer(),
data->Size(), data->DeviceIdx());
}

template <typename T>
auto MakeVec(HostDeviceVector<T> const *data) {
return MakeVec(data->DeviceIdx() == -1 ? data->ConstHostPointer() : data->ConstDevicePointer(),
data->Size(), data->DeviceIdx());
}

/**
5 changes: 4 additions & 1 deletion include/xgboost/metric.h
@@ -48,7 +48,10 @@ class Metric : public Configurable {
* override this function to maintain internal configuration
* \param out pointer to output JSON object
*/
void SaveConfig(Json*) const override {}
void SaveConfig(Json* p_out) const override {
auto& out = *p_out;
out["name"] = String(this->Name());
}

/*!
* \brief evaluate a specific metric
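The base-class ``SaveConfig`` now records at least the metric's name instead of writing nothing. A rough sketch of how that could be observed from Python; the exact location and layout of the ``metrics`` list inside ``save_config()`` output is an assumption, not taken from the PR.

```python
import json

import numpy as np
import xgboost as xgb

X = np.random.rand(32, 4)
y = np.random.randint(0, 2, size=32)
dtrain = xgb.DMatrix(X, label=y)
booster = xgb.train({"objective": "binary:logistic", "eval_metric": "logloss"},
                    dtrain, num_boost_round=2)

cfg = json.loads(booster.save_config())
# With the default implementation above, each configured metric serializes
# with a "name" field, e.g. [{"name": "logloss"}].
print(cfg["learner"].get("metrics"))
```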
32 changes: 30 additions & 2 deletions python-package/xgboost/core.py
@@ -501,7 +501,7 @@ def inner_f(*args, **kwargs):
return inner_f


class DMatrix: # pylint: disable=too-many-instance-attributes
class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-methods
"""Data Matrix used in XGBoost.

DMatrix is an internal data structure that is used by XGBoost,
@@ -526,6 +526,7 @@ def __init__(
qid=None,
label_lower_bound=None,
label_upper_bound=None,
sensitive_feature=None,
feature_weights=None,
enable_categorical: bool = False,
) -> None:
@@ -575,6 +576,8 @@ def __init__(
Lower bound for survival training.
label_upper_bound : array_like
Upper bound for survival training.
sensitive_feature : array_like
Sensitive feature for each training sample.
feature_weights : array_like, optional
Set feature weights for column sampling.
enable_categorical: boolean, optional
@@ -625,6 +628,7 @@ def __init__(
qid=qid,
label_lower_bound=label_lower_bound,
label_upper_bound=label_upper_bound,
sensitive_feature=sensitive_feature,
feature_weights=feature_weights,
)

@@ -676,6 +680,7 @@ def set_info(
qid=None,
label_lower_bound=None,
label_upper_bound=None,
sensitive_feature=None,
feature_names: FeatNamesT = None,
feature_types: Optional[List[str]] = None,
feature_weights=None
@@ -687,6 +692,8 @@ def set_info(
self.set_label(label)
if weight is not None:
self.set_weight(weight)
if sensitive_feature is not None:
self.set_sensitive_feature(sensitive_feature)
if base_margin is not None:
self.set_base_margin(base_margin)
if group is not None:
@@ -836,6 +843,17 @@ def set_weight(self, weight) -> None:
from .data import dispatch_meta_backend
dispatch_meta_backend(self, weight, 'weight', 'float')

def set_sensitive_feature(self, sensitive_feature) -> None:
"""Set sensitive_feature of each instance.

Parameters
----------
sensitive_feature : array_like
Sensitive feature for each data point.
"""
from .data import dispatch_meta_backend
dispatch_meta_backend(self, sensitive_feature, 'sensitive_feature', 'float')

def set_base_margin(self, margin) -> None:
"""Set base margin of booster to start from.

@@ -882,7 +900,15 @@ def get_weight(self) -> np.ndarray:
"""
return self.get_float_info('weight')

def get_base_margin(self) -> np.ndarray:
def get_sensitive_feature(self):
"""Get the sensitive feature of the DMatrix.
Returns
-------
sensitive_feature : array
"""
return self.get_float_info('sensitive_feature')

def get_base_margin(self):
"""Get the base margin of the DMatrix.

Returns
@@ -1174,6 +1200,7 @@ def __init__( # pylint: disable=super-init-not-called
qid=None,
label_lower_bound=None,
label_upper_bound=None,
sensitive_feature=None,
feature_weights=None,
enable_categorical: bool = False,
):
@@ -1201,6 +1228,7 @@ def __init__( # pylint: disable=super-init-not-called
qid=qid,
label_lower_bound=label_lower_bound,
label_upper_bound=label_upper_bound,
sensitive_feature=sensitive_feature,
feature_weights=feature_weights,
feature_names=feature_names,
feature_types=feature_types,
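A short round-trip sketch for the new meta-info field on ``DMatrix``, assuming this branch is installed; like ``weight``, the values are dispatched through the float meta-info backend.

```python
import numpy as np
import xgboost as xgb

X = np.random.rand(16, 3)
y = np.random.randint(0, 2, size=16)
s = np.random.randint(0, 2, size=16)

# Either pass it at construction time ...
dtrain = xgb.DMatrix(X, label=y, sensitive_feature=s)

# ... or attach it afterwards, like the other per-instance meta info.
dtrain.set_sensitive_feature(s)

# Retrieved values come back as a float array, mirroring get_weight().
assert np.allclose(dtrain.get_sensitive_feature(), s)
```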
27 changes: 20 additions & 7 deletions python-package/xgboost/dask.py
@@ -316,6 +316,7 @@ def __init__(
qid: Optional[_DaskCollection] = None,
label_lower_bound: Optional[_DaskCollection] = None,
label_upper_bound: Optional[_DaskCollection] = None,
sensitive_feature: Optional[_DaskCollection] = None,
feature_weights: Optional[_DaskCollection] = None,
enable_categorical: bool = False,
) -> None:
@@ -358,6 +359,7 @@ def __init__(
feature_weights=feature_weights,
label_lower_bound=label_lower_bound,
label_upper_bound=label_upper_bound,
sensitive_feature=sensitive_feature,
)

def __await__(self) -> Generator:
@@ -374,6 +376,7 @@ async def _map_local_data(
feature_weights: Optional[_DaskCollection] = None,
label_lower_bound: Optional[_DaskCollection] = None,
label_upper_bound: Optional[_DaskCollection] = None,
sensitive_feature: Optional[_DaskCollection] = None,
) -> "DaskDMatrix":
"""Obtain references to local data."""

@@ -427,6 +430,7 @@ def flatten_meta(meta: OpDelayed) -> OpDelayed:
qid_parts = flatten_meta(qid)
ll_parts = flatten_meta(label_lower_bound)
lu_parts = flatten_meta(label_upper_bound)
sf_parts = flatten_meta(sensitive_feature)

parts: Dict[str, List[ddelayed.Delayed]] = {"data": X_parts}

@@ -437,12 +441,16 @@ def append_meta(m_parts: Optional[List[ddelayed.Delayed]], name: str) -> None:
)
parts[name] = m_parts

append_meta(y_parts, "label")
append_meta(w_parts, "weight")
append_meta(margin_parts, "base_margin")
append_meta(qid_parts, "qid")
append_meta(ll_parts, "label_lower_bound")
append_meta(lu_parts, "label_upper_bound")
for p, n in [
(y_parts, "label"),
(w_parts, "weight"),
(margin_parts, "base_margin"),
(qid_parts, "qid"),
(ll_parts, "label_lower_bound"),
(lu_parts, "label_upper_bound"),
(sf_parts, "sensitive_feature"),
]:
append_meta(p, n)
# At this point, `parts` looks like:
# [(x0, x1, ..), (y0, y1, ..), ..] in delayed form

@@ -570,7 +578,7 @@ def append(i: int, name: str) -> None:
append(i, "qid")
append(i, "label_lower_bound")
append(i, "label_upper_bound")

append(i, "sensitive_feature")
return result


@@ -586,6 +594,7 @@ def __init__(
qid: Optional[List[Any]] = None,
label_lower_bound: Optional[List[Any]] = None,
label_upper_bound: Optional[List[Any]] = None,
sensitive_feature: Optional[List[Any]] = None,
feature_names: FeatNamesT = None,
feature_types: Optional[Union[Any, List[Any]]] = None,
) -> None:
@@ -596,6 +605,7 @@ def __init__(
self._qid = qid
self._label_lower_bound = label_lower_bound
self._label_upper_bound = label_upper_bound
self._sensitive_feature = sensitive_feature
self._feature_names = feature_names
self._feature_types = feature_types

@@ -646,6 +656,7 @@ def next(self, input_data: Callable) -> int:
base_margin=self._get("_base_margin"),
label_lower_bound=self._get("_label_lower_bound"),
label_upper_bound=self._get("_label_upper_bound"),
sensitive_feature=self._get("_sensitive_feature"),
feature_names=feature_names,
feature_types=self._feature_types,
)
@@ -687,6 +698,7 @@ def __init__(
qid: Optional[_DaskCollection] = None,
label_lower_bound: Optional[_DaskCollection] = None,
label_upper_bound: Optional[_DaskCollection] = None,
sensitive_feature: Optional[_DaskCollection] = None,
feature_weights: Optional[_DaskCollection] = None,
enable_categorical: bool = False,
) -> None:
@@ -700,6 +712,7 @@ def __init__(
qid=qid,
label_lower_bound=label_lower_bound,
label_upper_bound=label_upper_bound,
sensitive_feature=sensitive_feature,
missing=missing,
silent=silent,
feature_weights=feature_weights,
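The same keyword threaded through the Dask interface; a sketch assuming a local cluster and this branch, with arbitrary chunking and parameter values.

```python
from dask import array as da
from dask.distributed import Client, LocalCluster

import xgboost as xgb

if __name__ == "__main__":
    with LocalCluster(n_workers=2) as cluster, Client(cluster) as client:
        X = da.random.random((1_000, 5), chunks=(250, 5))
        y = da.random.randint(0, 2, size=1_000, chunks=250)
        s = da.random.randint(0, 2, size=1_000, chunks=250).astype("float32")

        # sensitive_feature is partitioned and shipped alongside label/weight.
        dtrain = xgb.dask.DaskDMatrix(client, X, y, sensitive_feature=s)
        output = xgb.dask.train(
            client,
            {"objective": "binary:regularized", "fairness": 0.5},
            dtrain,
            num_boost_round=10,
        )
        booster = output["booster"]
        print(output["history"])
```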
10 changes: 10 additions & 0 deletions src/common/common.h
@@ -188,6 +188,16 @@ std::vector<Idx> ArgSort(Container const &array, Comp comp = std::less<V>{}) {
XGBOOST_PARALLEL_STABLE_SORT(result.begin(), result.end(), op);
return result;
}

struct OptionalWeights {
Span<float const> weights;
float dft{1.0f};

explicit OptionalWeights(Span<float const> w) : weights{w} {}
explicit OptionalWeights(float w) : dft{w} {}

XGBOOST_DEVICE float operator[](size_t i) const { return weights.empty() ? dft : weights[i]; }
};
} // namespace common
} // namespace xgboost
#endif // XGBOOST_COMMON_COMMON_H_
25 changes: 24 additions & 1 deletion src/common/linalg_op.cuh
@@ -1,15 +1,33 @@
/*!
* Copyright 2021 by XGBoost Contributors
* Copyright 2021-2022 by XGBoost Contributors
*/
#ifndef XGBOOST_COMMON_LINALG_OP_CUH_
#define XGBOOST_COMMON_LINALG_OP_CUH_

#include "xgboost/generic_parameters.h"
#include "device_helpers.cuh"
#include "linalg_op.h"
#include "xgboost/linalg.h"

namespace xgboost {
namespace linalg {
template <typename T, int32_t D, typename Fn>
void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr) {
static_assert(std::is_void<std::result_of_t<Fn(size_t, T&)>>::value,
"For function with return, use transform instead.");
if (t.Contiguous()) {
auto ptr = t.Values().data();
dh::LaunchN(t.Size(), s, [=] __device__(size_t i) mutable { fn(i, ptr[i]); });
} else {
dh::LaunchN(t.Size(), s, [=] __device__(size_t i) mutable {
T& v = detail::Apply(t, linalg::UnravelIndex(i, t.Shape()));
fn(i, v);
});
}
}

template <typename T, int32_t D, typename Fn>
void ElementWiseTransformDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr) {
if (t.Contiguous()) {
auto ptr = t.Values().data();
dh::LaunchN(t.Size(), s, [=] __device__(size_t i) { ptr[i] = fn(i, ptr[i]); });
@@ -20,6 +38,11 @@ void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s
});
}
}

template <typename T, int32_t D, typename Fn>
void ElementWiseKernel(GenericParameter const* ctx, linalg::TensorView<T, D> t, Fn&& fn) {
ctx->IsCPU() ? ElementWiseKernelHost(t, ctx->Threads(), fn) : ElementWiseKernelDevice(t, fn);
}
} // namespace linalg
} // namespace xgboost
#endif // XGBOOST_COMMON_LINALG_OP_CUH_