Skip to content

Commit

Permalink
Initial support for one hot categorical split.
Browse files Browse the repository at this point in the history
  • Loading branch information
trivialfis committed Aug 6, 2020
1 parent 8599f87 commit d8ac122
Show file tree
Hide file tree
Showing 57 changed files with 1,347 additions and 433 deletions.
3 changes: 2 additions & 1 deletion include/xgboost/base.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,8 @@ using bst_int = int32_t; // NOLINT
using bst_ulong = uint64_t; // NOLINT
/*! \brief float type, used for storing statistics */
using bst_float = float; // NOLINT

/*! \brief Categorical value type. */
using bst_cat_t = int32_t; // NOLINT
/*! \brief Type for data column (feature) index. */
using bst_feature_t = uint32_t; // NOLINT
/*! \brief Type for data row index.
Expand Down
9 changes: 2 additions & 7 deletions include/xgboost/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ enum class DataType : uint8_t {
};

enum class FeatureType : uint8_t {
kNumerical
kNumerical,
kCategorical
};

/*!
Expand Down Expand Up @@ -314,12 +315,6 @@ class SparsePage {
}
}

/*!
* \brief Push row block into the page.
* \param batch the row batch.
*/
void Push(const dmlc::RowBlock<uint32_t>& batch);

/**
* \brief Pushes external data batch onto this page
*
Expand Down
4 changes: 3 additions & 1 deletion include/xgboost/feature_map.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,9 @@ class FeatureMap {
if (!strcmp("q", tname)) return kQuantitive;
if (!strcmp("int", tname)) return kInteger;
if (!strcmp("float", tname)) return kFloat;
LOG(FATAL) << "unknown feature type, use i for indicator and q for quantity";
if (!strcmp("categorical", tname)) return kInteger;
LOG(FATAL) << "unknown feature type, use i for indicator, q for quantity "
"and categorical for categorical split.";
return kIndicator;
}
/*! \brief name of the feature */
Expand Down
19 changes: 19 additions & 0 deletions include/xgboost/span.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ namespace common {
"\tBlock: [%d, %d, %d], Thread: [%d, %d, %d]\n\n", \
__FILE__, __LINE__, __PRETTY_FUNCTION__, #cond, blockIdx.x, \
blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z); \
assert(false); \
asm("trap;"); \
} \
} while (0);
Expand All @@ -101,6 +102,18 @@ namespace common {
} while (0);
#endif // __CUDA_ARCH__

#if defined(__CUDA_ARCH__)
// Device-side bounds check: reports the offending values via device printf,
// then fails through assert().  Wrapped in do { } while (0) so the macro
// expands to exactly one statement — the previous bare-if form would bind to
// a following `else` and break inside unbraced if/else chains.
#define SPAN_LT(lhs, rhs)                                       \
  do {                                                          \
    if (!((lhs) < (rhs))) {                                     \
      printf("%lu < %lu failed\n", static_cast<size_t>(lhs),    \
             static_cast<size_t>(rhs));                         \
      assert(false);                                            \
    }                                                           \
  } while (0)
#else
// Host side: delegate to SPAN_CHECK, which already handles report-and-abort.
#define SPAN_LT(lhs, rhs) \
  SPAN_CHECK((lhs) < (rhs))
#endif  // defined(__CUDA_ARCH__)

namespace detail {
/*!
* By default, XGBoost uses uint32_t for indexing data. int64_t covers all
Expand Down Expand Up @@ -515,6 +528,7 @@ class Span {
}

XGBOOST_DEVICE reference operator[](index_type _idx) const {
SPAN_LT(_idx, size());
SPAN_CHECK(_idx < size());
return data()[_idx];
}
Expand Down Expand Up @@ -648,6 +662,11 @@ XGBOOST_DEVICE auto as_writable_bytes(Span<T, E> s) __span_noexcept -> // NOLIN
return {reinterpret_cast<byte*>(s.data()), s.size_bytes()};
}

/*!
 * \brief Construct a Span viewing the contents of a contiguous container
 *        (e.g. std::vector) without spelling out the Span type at the call
 *        site.  The element type T and the container template are deduced.
 *        NOTE(review): the container must satisfy Span's container
 *        constructor (contiguous data()/size()) — that constructor is not
 *        visible in this hunk, so confirm against the Span definition.
 */
template <typename T, template <class, class...> class Container, typename... Types,
          std::size_t Extent = dynamic_extent>
auto MakeSpan(Container<T, Types...> const &container) {
  return Span<T, Extent>(container);
}
} // namespace common
} // namespace xgboost

Expand Down
59 changes: 38 additions & 21 deletions include/xgboost/tree_model.h
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,8 @@ class RegTree : public Model {
param.num_deleted = 0;
nodes_.resize(param.num_nodes);
stats_.resize(param.num_nodes);
split_types_.resize(param.num_nodes, FeatureType::kNumerical);
split_categories_segments_.resize(param.num_nodes);
for (int i = 0; i < param.num_nodes; i ++) {
nodes_[i].SetLeaf(0.0f);
nodes_[i].SetParent(kInvalidNodeId);
Expand Down Expand Up @@ -377,30 +379,18 @@ class RegTree : public Model {
* \param leaf_right_child The right child index of leaf, by default kInvalidNodeId,
* some updaters use the right child index of leaf as a marker
*/
void ExpandNode(int nid, unsigned split_index, bst_float split_value,
void ExpandNode(bst_node_t nid, unsigned split_index, bst_float split_value,
bool default_left, bst_float base_weight,
bst_float left_leaf_weight, bst_float right_leaf_weight,
bst_float loss_change, float sum_hess, float left_sum,
float right_sum,
bst_node_t leaf_right_child = kInvalidNodeId) {
int pleft = this->AllocNode();
int pright = this->AllocNode();
auto &node = nodes_[nid];
CHECK(node.IsLeaf());
node.SetLeftChild(pleft);
node.SetRightChild(pright);
nodes_[node.LeftChild()].SetParent(nid, true);
nodes_[node.RightChild()].SetParent(nid, false);
node.SetSplit(split_index, split_value,
default_left);

nodes_[pleft].SetLeaf(left_leaf_weight, leaf_right_child);
nodes_[pright].SetLeaf(right_leaf_weight, leaf_right_child);

this->Stat(nid) = {loss_change, sum_hess, base_weight};
this->Stat(pleft) = {0.0f, left_sum, left_leaf_weight};
this->Stat(pright) = {0.0f, right_sum, right_leaf_weight};
}
bst_node_t leaf_right_child = kInvalidNodeId);

  /*!
   * \brief Expand leaf nid into an internal node carrying a categorical
   *        split — the categorical counterpart of ExpandNode.
   * \param split_cat Categories belonging to this split.  NOTE(review): the
   *        implementation lives in the .cc file and is not visible here;
   *        confirm which branch the listed categories are routed to and how
   *        default_left interacts with unseen categories.
   */
  void ExpandCategorical(bst_node_t nid, unsigned split_index,
                         common::Span<uint32_t> split_cat, bool default_left,
                         bst_float base_weight, bst_float left_leaf_weight,
                         bst_float right_leaf_weight, bst_float loss_change,
                         float sum_hess, float left_sum, float right_sum);

/*!
* \brief get current depth
Expand Down Expand Up @@ -553,6 +543,25 @@ class RegTree : public Model {
* \brief calculate the mean value for each node, required for feature contributions
*/
void FillNodeMeanValues();
  /*!
   * \brief Get split type for a node.
   * \param nidx Index of node.
   * \return The type of this split. For leaf node it's always kNumerical,
   *         since split_types_ is filled with kNumerical on resize.
   */
  FeatureType NodeSplitType(bst_node_t nidx) const {
    return split_types_.at(nidx);  // .at(): throws std::out_of_range for an invalid nidx
  }
  /*!
   * \brief Get split types for all nodes, indexed by node id.
   */
  std::vector<FeatureType> const &GetSplitTypes() const { return split_types_; }
  /*! \brief Flat, read-only storage of categories used by all categorical splits. */
  common::Span<uint32_t const> GetSplitCategories() const { return split_categories_; }
  /*! \brief Per-node {beg, size} segments locating each node's categories in
   *         the flat storage returned by GetSplitCategories(). */
  auto const& GetSplitCategoriesPtr() const { return split_categories_segments_; }

  /*! \brief An {offset, length} pair — presumably locating one node's
   *         categories inside the flat split_categories_ storage (one
   *         Segment is kept per node); confirm against the .cc usage. */
  struct Segment {
    size_t beg {0};   // start offset of this node's categories
    size_t size {0};  // number of categories for this node
  };

private:
// vector of nodes
Expand All @@ -562,9 +571,15 @@ class RegTree : public Model {
// stats of nodes
std::vector<RTreeNodeStat> stats_;
std::vector<bst_float> node_mean_values_;
std::vector<FeatureType> split_types_;

// Categories for each internal node.
std::vector<uint32_t> split_categories_;
std::vector<Segment> split_categories_segments_;

// allocate a new node,
// !!!!!! NOTE: may cause BUG here, nodes.resize
int AllocNode() {
bst_node_t AllocNode() {
if (param.num_deleted != 0) {
int nid = deleted_nodes_.back();
deleted_nodes_.pop_back();
Expand All @@ -577,6 +592,8 @@ class RegTree : public Model {
<< "number of nodes in the tree exceed 2^31";
nodes_.resize(param.num_nodes);
stats_.resize(param.num_nodes);
split_types_.resize(param.num_nodes, FeatureType::kNumerical);
split_categories_segments_.resize(param.num_nodes);
return nd;
}
// delete a tree node, keep the parent field to allow trace back
Expand Down
17 changes: 15 additions & 2 deletions python-package/xgboost/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,8 @@ def __init__(self, data, label=None, weight=None, base_margin=None,
silent=False,
feature_names=None,
feature_types=None,
nthread=None):
nthread=None,
enable_categorical=False):
"""Parameters
----------
data : os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/
Expand Down Expand Up @@ -417,6 +418,17 @@ def __init__(self, data, label=None, weight=None, base_margin=None,
Number of threads to use for loading data when parallelization is
applicable. If -1, uses maximum threads available on the system.
enable_categorical: boolean, optional
.. versionadded:: 1.3.0
Experimental support of specializing for categorical features. Do
not set to True unless you are interested in development.
Currently it's only available for `gpu_hist` tree method with 1 vs
rest (one hot) categorical split. Also, JSON serialization format,
`enable_experimental_json_serialization`, `gpu_predictor` and
pandas input are required.
"""
if isinstance(data, list):
raise TypeError('Input data can not be a list.')
Expand All @@ -435,7 +447,8 @@ def __init__(self, data, label=None, weight=None, base_margin=None,
data, missing=self.missing,
threads=self.nthread,
feature_names=feature_names,
feature_types=feature_types)
feature_types=feature_types,
enable_categorical=enable_categorical)
assert handle is not None
self.handle = handle

Expand Down
35 changes: 24 additions & 11 deletions python-package/xgboost/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,20 +168,24 @@ def _is_pandas_df(data):
}


def _transform_pandas_df(data, feature_names=None, feature_types=None,
def _transform_pandas_df(data, enable_categorical,
feature_names=None, feature_types=None,
meta=None, meta_type=None):
from pandas import MultiIndex, Int64Index
from pandas.api.types import is_sparse
from pandas.api.types import is_sparse, is_categorical

data_dtypes = data.dtypes
if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype)
if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype) or
(is_categorical(dtype) and enable_categorical)
for dtype in data_dtypes):
bad_fields = [
str(data.columns[i]) for i, dtype in enumerate(data_dtypes)
if dtype.name not in _pandas_dtype_mapper
]

msg = """DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in fields """
msg = """DataFrame.dtypes for data must be int, float, bool or categorical. When
categorical type is supplied, DMatrix parameter
`enable_categorical` must be set to `True`."""
raise ValueError(msg + ', '.join(bad_fields))

if feature_names is None and meta is None:
Expand All @@ -200,6 +204,8 @@ def _transform_pandas_df(data, feature_names=None, feature_types=None,
if is_sparse(dtype):
feature_types.append(_pandas_dtype_mapper[
dtype.subtype.name])
elif is_categorical(dtype) and enable_categorical:
feature_types.append('categorical')
else:
feature_types.append(_pandas_dtype_mapper[dtype.name])

Expand All @@ -209,14 +215,19 @@ def _transform_pandas_df(data, feature_names=None, feature_types=None,
meta=meta))

dtype = meta_type if meta_type else 'float'
data = data.values.astype(dtype)
try:
data = data.values.astype(dtype)
except ValueError as e:
raise ValueError('Data must be convertable to float, even ' +
'for categorical data.') from e

return data, feature_names, feature_types


def _from_pandas_df(data, enable_categorical, missing, nthread,
                    feature_names, feature_types):
    '''Build DMatrix internals from a pandas DataFrame.

    The frame is first flattened into a numpy array (deriving feature
    names/types from its columns, honouring ``enable_categorical``), then
    routed through the numpy-array construction path.
    '''
    transformed, feature_names, feature_types = _transform_pandas_df(
        data, enable_categorical, feature_names, feature_types)
    return _from_numpy_array(transformed, missing, nthread,
                             feature_names, feature_types)

Expand Down Expand Up @@ -484,7 +495,8 @@ def _has_array_protocol(data):


def dispatch_data_backend(data, missing, threads,
feature_names, feature_types):
feature_names, feature_types,
enable_categorical=False):
'''Dispatch data for DMatrix.'''
if _is_scipy_csr(data):
return _from_scipy_csr(data, missing, feature_names, feature_types)
Expand All @@ -500,7 +512,7 @@ def dispatch_data_backend(data, missing, threads,
if _is_tuple(data):
return _from_tuple(data, missing, feature_names, feature_types)
if _is_pandas_df(data):
return _from_pandas_df(data, missing, threads,
return _from_pandas_df(data, enable_categorical, missing, threads,
feature_names, feature_types)
if _is_pandas_series(data):
return _from_pandas_series(data, missing, threads, feature_names,
Expand Down Expand Up @@ -607,7 +619,8 @@ def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
_meta_from_numpy(data, name, dtype, handle)
return
if _is_pandas_df(data):
data, _, _ = _transform_pandas_df(data, meta=name, meta_type=dtype)
data, _, _ = _transform_pandas_df(data, False, meta=name,
meta_type=dtype)
_meta_from_numpy(data, name, dtype, handle)
return
if _is_pandas_series(data):
Expand Down
Loading

0 comments on commit d8ac122

Please sign in to comment.