Skip to content

Commit

Permalink
Initial support for one hot categorical split.
Browse files Browse the repository at this point in the history
  • Loading branch information
trivialfis committed Aug 6, 2020
1 parent 8599f87 commit d8ac122
Show file tree
Hide file tree
Showing 57 changed files with 1,347 additions and 433 deletions.
3 changes: 2 additions & 1 deletion include/xgboost/base.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,8 @@ using bst_int = int32_t; // NOLINT
using bst_ulong = uint64_t; // NOLINT
/*! \brief float type, used for storing statistics */
using bst_float = float; // NOLINT

/*! \brief Categorical value type. */
using bst_cat_t = int32_t; // NOLINT
/*! \brief Type for data column (feature) index. */
using bst_feature_t = uint32_t; // NOLINT
/*! \brief Type for data row index.
Expand Down
9 changes: 2 additions & 7 deletions include/xgboost/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ enum class DataType : uint8_t {
};

enum class FeatureType : uint8_t {
kNumerical
kNumerical,
kCategorical
};

/*!
Expand Down Expand Up @@ -314,12 +315,6 @@ class SparsePage {
}
}

/*!
* \brief Push row block into the page.
* \param batch the row batch.
*/
void Push(const dmlc::RowBlock<uint32_t>& batch);

/**
* \brief Pushes external data batch onto this page
*
Expand Down
4 changes: 3 additions & 1 deletion include/xgboost/feature_map.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,9 @@ class FeatureMap {
if (!strcmp("q", tname)) return kQuantitive;
if (!strcmp("int", tname)) return kInteger;
if (!strcmp("float", tname)) return kFloat;
LOG(FATAL) << "unknown feature type, use i for indicator and q for quantity";
if (!strcmp("categorical", tname)) return kInteger;
LOG(FATAL) << "unknown feature type, use i for indicator, q for quantity "
"and categorical for categorical split.";
return kIndicator;
}
/*! \brief name of the feature */
Expand Down
19 changes: 19 additions & 0 deletions include/xgboost/span.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ namespace common {
"\tBlock: [%d, %d, %d], Thread: [%d, %d, %d]\n\n", \
__FILE__, __LINE__, __PRETTY_FUNCTION__, #cond, blockIdx.x, \
blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z); \
assert(false); \
asm("trap;"); \
} \
} while (0);
Expand All @@ -101,6 +102,18 @@ namespace common {
} while (0);
#endif // __CUDA_ARCH__

#if defined(__CUDA_ARCH__)
// Device-side bounds check: reports the offending values via device printf,
// then fails through assert().  Wrapped in do { } while (0) so the macro
// expands to exactly one statement — the previous bare-if form would bind to
// a following `else` and break inside unbraced if/else chains.
#define SPAN_LT(lhs, rhs)                                       \
  do {                                                          \
    if (!((lhs) < (rhs))) {                                     \
      printf("%lu < %lu failed\n", static_cast<size_t>(lhs),    \
             static_cast<size_t>(rhs));                         \
      assert(false);                                            \
    }                                                           \
  } while (0)
#else
// Host side: delegate to SPAN_CHECK, which already handles report-and-abort.
#define SPAN_LT(lhs, rhs) \
  SPAN_CHECK((lhs) < (rhs))
#endif  // defined(__CUDA_ARCH__)

namespace detail {
/*!
* By default, XGBoost uses uint32_t for indexing data. int64_t covers all
Expand Down Expand Up @@ -515,6 +528,7 @@ class Span {
}

XGBOOST_DEVICE reference operator[](index_type _idx) const {
SPAN_LT(_idx, size());
SPAN_CHECK(_idx < size());
return data()[_idx];
}
Expand Down Expand Up @@ -648,6 +662,11 @@ XGBOOST_DEVICE auto as_writable_bytes(Span<T, E> s) __span_noexcept -> // NOLIN
return {reinterpret_cast<byte*>(s.data()), s.size_bytes()};
}

/*!
 * \brief Construct a Span viewing the contents of a contiguous container
 *        (e.g. std::vector) without spelling out the Span type at the call
 *        site.  The element type T and the container template are deduced.
 *        NOTE(review): the container must satisfy Span's container
 *        constructor (contiguous data()/size()) — that constructor is not
 *        visible in this hunk, so confirm against the Span definition.
 */
template <typename T, template <class, class...> class Container, typename... Types,
          std::size_t Extent = dynamic_extent>
auto MakeSpan(Container<T, Types...> const &container) {
  return Span<T, Extent>(container);
}
} // namespace common
} // namespace xgboost

Expand Down
59 changes: 38 additions & 21 deletions include/xgboost/tree_model.h
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,8 @@ class RegTree : public Model {
param.num_deleted = 0;
nodes_.resize(param.num_nodes);
stats_.resize(param.num_nodes);
split_types_.resize(param.num_nodes, FeatureType::kNumerical);
split_categories_segments_.resize(param.num_nodes);
for (int i = 0; i < param.num_nodes; i ++) {
nodes_[i].SetLeaf(0.0f);
nodes_[i].SetParent(kInvalidNodeId);
Expand Down Expand Up @@ -377,30 +379,18 @@ class RegTree : public Model {
* \param leaf_right_child The right child index of leaf, by default kInvalidNodeId,
* some updaters use the right child index of leaf as a marker
*/
void ExpandNode(int nid, unsigned split_index, bst_float split_value,
void ExpandNode(bst_node_t nid, unsigned split_index, bst_float split_value,
bool default_left, bst_float base_weight,
bst_float left_leaf_weight, bst_float right_leaf_weight,
bst_float loss_change, float sum_hess, float left_sum,
float right_sum,
bst_node_t leaf_right_child = kInvalidNodeId) {
int pleft = this->AllocNode();
int pright = this->AllocNode();
auto &node = nodes_[nid];
CHECK(node.IsLeaf());
node.SetLeftChild(pleft);
node.SetRightChild(pright);
nodes_[node.LeftChild()].SetParent(nid, true);
nodes_[node.RightChild()].SetParent(nid, false);
node.SetSplit(split_index, split_value,
default_left);

nodes_[pleft].SetLeaf(left_leaf_weight, leaf_right_child);
nodes_[pright].SetLeaf(right_leaf_weight, leaf_right_child);

this->Stat(nid) = {loss_change, sum_hess, base_weight};
this->Stat(pleft) = {0.0f, left_sum, left_leaf_weight};
this->Stat(pright) = {0.0f, right_sum, right_leaf_weight};
}
bst_node_t leaf_right_child = kInvalidNodeId);

  /*!
   * \brief Expand leaf nid into an internal node carrying a categorical
   *        split — the categorical counterpart of ExpandNode.
   * \param split_cat Categories belonging to this split.  NOTE(review): the
   *        implementation lives in the .cc file and is not visible here;
   *        confirm which branch the listed categories are routed to and how
   *        default_left interacts with unseen categories.
   */
  void ExpandCategorical(bst_node_t nid, unsigned split_index,
                         common::Span<uint32_t> split_cat, bool default_left,
                         bst_float base_weight, bst_float left_leaf_weight,
                         bst_float right_leaf_weight, bst_float loss_change,
                         float sum_hess, float left_sum, float right_sum);

/*!
* \brief get current depth
Expand Down Expand Up @@ -553,6 +543,25 @@ class RegTree : public Model {
* \brief calculate the mean value for each node, required for feature contributions
*/
void FillNodeMeanValues();
  /*!
   * \brief Get split type for a node.
   * \param nidx Index of node.
   * \return The type of this split. For leaf node it's always kNumerical,
   *         since split_types_ is filled with kNumerical on resize.
   */
  FeatureType NodeSplitType(bst_node_t nidx) const {
    return split_types_.at(nidx);  // .at(): throws std::out_of_range for an invalid nidx
  }
  /*!
   * \brief Get split types for all nodes, indexed by node id.
   */
  std::vector<FeatureType> const &GetSplitTypes() const { return split_types_; }
  /*! \brief Flat, read-only storage of categories used by all categorical splits. */
  common::Span<uint32_t const> GetSplitCategories() const { return split_categories_; }
  /*! \brief Per-node {beg, size} segments locating each node's categories in
   *         the flat storage returned by GetSplitCategories(). */
  auto const& GetSplitCategoriesPtr() const { return split_categories_segments_; }

  /*! \brief An {offset, length} pair — presumably locating one node's
   *         categories inside the flat split_categories_ storage (one
   *         Segment is kept per node); confirm against the .cc usage. */
  struct Segment {
    size_t beg {0};   // start offset of this node's categories
    size_t size {0};  // number of categories for this node
  };

private:
// vector of nodes
Expand All @@ -562,9 +571,15 @@ class RegTree : public Model {
// stats of nodes
std::vector<RTreeNodeStat> stats_;
std::vector<bst_float> node_mean_values_;
std::vector<FeatureType> split_types_;

// Categories for each internal node.
std::vector<uint32_t> split_categories_;
std::vector<Segment> split_categories_segments_;

// allocate a new node,
// !!!!!! NOTE: may cause BUG here, nodes.resize
int AllocNode() {
bst_node_t AllocNode() {
if (param.num_deleted != 0) {
int nid = deleted_nodes_.back();
deleted_nodes_.pop_back();
Expand All @@ -577,6 +592,8 @@ class RegTree : public Model {
<< "number of nodes in the tree exceed 2^31";
nodes_.resize(param.num_nodes);
stats_.resize(param.num_nodes);
split_types_.resize(param.num_nodes, FeatureType::kNumerical);
split_categories_segments_.resize(param.num_nodes);
return nd;
}
// delete a tree node, keep the parent field to allow trace back
Expand Down
17 changes: 15 additions & 2 deletions python-package/xgboost/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,8 @@ def __init__(self, data, label=None, weight=None, base_margin=None,
silent=False,
feature_names=None,
feature_types=None,
nthread=None):
nthread=None,
enable_categorical=False):
"""Parameters
----------
data : os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/
Expand Down Expand Up @@ -417,6 +418,17 @@ def __init__(self, data, label=None, weight=None, base_margin=None,
Number of threads to use for loading data when parallelization is
applicable. If -1, uses maximum threads available on the system.
enable_categorical: boolean, optional
.. versionadded:: 1.3.0
Experimental support of specializing for categorical features. Do
not set to True unless you are interested in development.
Currently it's only available for `gpu_hist` tree method with 1 vs
rest (one hot) categorical split. Also, JSON serialization format,
`enable_experimental_json_serialization`, `gpu_predictor` and
pandas input are required.
"""
if isinstance(data, list):
raise TypeError('Input data can not be a list.')
Expand All @@ -435,7 +447,8 @@ def __init__(self, data, label=None, weight=None, base_margin=None,
data, missing=self.missing,
threads=self.nthread,
feature_names=feature_names,
feature_types=feature_types)
feature_types=feature_types,
enable_categorical=enable_categorical)
assert handle is not None
self.handle = handle

Expand Down
35 changes: 24 additions & 11 deletions python-package/xgboost/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,20 +168,24 @@ def _is_pandas_df(data):
}


def _transform_pandas_df(data, feature_names=None, feature_types=None,
def _transform_pandas_df(data, enable_categorical,
feature_names=None, feature_types=None,
meta=None, meta_type=None):
from pandas import MultiIndex, Int64Index
from pandas.api.types import is_sparse
from pandas.api.types import is_sparse, is_categorical

data_dtypes = data.dtypes
if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype)
if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype) or
(is_categorical(dtype) and enable_categorical)
for dtype in data_dtypes):
bad_fields = [
str(data.columns[i]) for i, dtype in enumerate(data_dtypes)
if dtype.name not in _pandas_dtype_mapper
]

msg = """DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in fields """
msg = """DataFrame.dtypes for data must be int, float, bool or categorical. When
categorical type is supplied, DMatrix parameter
`enable_categorical` must be set to `True`."""
raise ValueError(msg + ', '.join(bad_fields))

if feature_names is None and meta is None:
Expand All @@ -200,6 +204,8 @@ def _transform_pandas_df(data, feature_names=None, feature_types=None,
if is_sparse(dtype):
feature_types.append(_pandas_dtype_mapper[
dtype.subtype.name])
elif is_categorical(dtype) and enable_categorical:
feature_types.append('categorical')
else:
feature_types.append(_pandas_dtype_mapper[dtype.name])

Expand All @@ -209,14 +215,19 @@ def _transform_pandas_df(data, feature_names=None, feature_types=None,
meta=meta))

dtype = meta_type if meta_type else 'float'
data = data.values.astype(dtype)
try:
data = data.values.astype(dtype)
except ValueError as e:
raise ValueError('Data must be convertable to float, even ' +
'for categorical data.') from e

return data, feature_names, feature_types


def _from_pandas_df(data, enable_categorical, missing, nthread,
                    feature_names, feature_types):
    '''Build DMatrix internals from a pandas DataFrame.

    The frame is first flattened into a numpy array (deriving feature
    names/types from its columns, honouring ``enable_categorical``), then
    routed through the numpy-array construction path.
    '''
    transformed, feature_names, feature_types = _transform_pandas_df(
        data, enable_categorical, feature_names, feature_types)
    return _from_numpy_array(transformed, missing, nthread,
                             feature_names, feature_types)

Expand Down Expand Up @@ -484,7 +495,8 @@ def _has_array_protocol(data):


def dispatch_data_backend(data, missing, threads,
feature_names, feature_types):
feature_names, feature_types,
enable_categorical=False):
'''Dispatch data for DMatrix.'''
if _is_scipy_csr(data):
return _from_scipy_csr(data, missing, feature_names, feature_types)
Expand All @@ -500,7 +512,7 @@ def dispatch_data_backend(data, missing, threads,
if _is_tuple(data):
return _from_tuple(data, missing, feature_names, feature_types)
if _is_pandas_df(data):
return _from_pandas_df(data, missing, threads,
return _from_pandas_df(data, enable_categorical, missing, threads,
feature_names, feature_types)
if _is_pandas_series(data):
return _from_pandas_series(data, missing, threads, feature_names,
Expand Down Expand Up @@ -607,7 +619,8 @@ def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
_meta_from_numpy(data, name, dtype, handle)
return
if _is_pandas_df(data):
data, _, _ = _transform_pandas_df(data, meta=name, meta_type=dtype)
data, _, _ = _transform_pandas_df(data, False, meta=name,
meta_type=dtype)
_meta_from_numpy(data, name, dtype, handle)
return
if _is_pandas_series(data):
Expand Down
Loading

0 comments on commit d8ac122

Please sign in to comment.