-
-
Notifications
You must be signed in to change notification settings - Fork 8.7k
/
categorical.h
100 lines (85 loc) · 3.2 KB
/
categorical.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
/*!
* Copyright 2020-2022 by XGBoost Contributors
* \file categorical.h
*/
#ifndef XGBOOST_COMMON_CATEGORICAL_H_
#define XGBOOST_COMMON_CATEGORICAL_H_
#include <limits>
#include "bitfield.h"
#include "xgboost/base.h"
#include "xgboost/data.h"
#include "xgboost/parameter.h"
#include "xgboost/span.h"
namespace xgboost {
namespace common {
// Bit field alias for categorical data. Presumably the mutable counterpart of
// KCatBitField -- NOTE(review): confirm against bitfield.h.
using CatBitField = LBitField32;
// Bit field alias that `Decision` constructs from a Span<uint32_t const> in order to
// query the categories of a split without modifying the underlying storage.
using KCatBitField = CLBitField32;
/*!
 * \brief Convert a value to the categorical index type.
 *
 * \tparam T Type of the input value; it must be convertible with static_cast.
 * \param v Value to convert.
 * \return v after a static_cast to bst_cat_t.
 */
template <typename T>
XGBOOST_DEVICE bst_cat_t AsCat(T const& v) {
  auto const as_cat = static_cast<bst_cat_t>(v);
  return as_cat;
}
/*!
 * \brief Test whether the feature at index fidx is categorical.
 *
 * \param ft   Feature types for all features. An empty span yields false for any index.
 * \param fidx Feature index, used to index into ft when ft is not empty.
 * \return true when ft is non-empty and ft[fidx] equals FeatureType::kCategorical.
 */
inline XGBOOST_DEVICE bool IsCat(Span<FeatureType const> ft, bst_feature_t fidx) {
  // No feature type information: nothing is treated as categorical.
  if (ft.empty()) {
    return false;
  }
  return ft[fidx] == FeatureType::kCategorical;
}
/*!
 * \brief Exclusive upper bound for categorical values.
 *
 * \return 16777217 - 1. `InvalidCat` statically asserts that this value survives a
 *         round trip through float while the next integer does not.
 */
constexpr inline bst_cat_t OutOfRangeCat() {
  // Spelled as "first integer that fails the float round trip, minus one" so the
  // relation to the asserts in `InvalidCat` stays visible.
  return bst_cat_t{16777217} - bst_cat_t{1};
}
/*!
 * \brief Check whether a float-encoded category lies outside the accepted range.
 *
 * \param cat Category value stored as float.
 * \return true when cat is negative or not less than OutOfRangeCat(). A NaN input
 *         fails both comparisons, so false is returned for it.
 */
inline XGBOOST_DEVICE bool InvalidCat(float cat) {
  constexpr auto kLimit = OutOfRangeCat();
  // The limit itself survives an integer -> float -> integer round trip ...
  static_assert(static_cast<bst_cat_t>(static_cast<float>(kLimit)) == kLimit, "");
  // ... whereas the next integer does not ...
  static_assert(static_cast<bst_cat_t>(static_cast<float>(kLimit + 1)) != kLimit + 1, "");
  // ... because it converts to the same float as the limit. Hence the `>=` below.
  static_assert(static_cast<float>(kLimit + 1) == kLimit, "");
  if (cat < 0) {
    return true;
  }
  return cat >= kLimit;
}
/*!
 * \brief Decide whether a sample goes to the left branch at a categorical split.
 *
 * For one hot split, go to left if it's NOT the matching category.
 *
 * \tparam validate When true, cat is first checked with InvalidCat and against the
 *                  size of the bit set; a rejected value follows dft_left.
 * \param cats     Storage viewed as the bit set of categories for the split.
 * \param cat      Category value of the sample, stored as float.
 * \param dft_left Result returned when validation is enabled and cat is rejected.
 * \return dft_left for a rejected value; true when the bit position of cat lies beyond
 *         cats or the bit for cat is not set; false otherwise.
 */
template <bool validate = true>
inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, float cat, bool dft_left) {
  KCatBitField const split_cats(cats);
  // FIXME: Size() is not accurate since it represents the size of bit set instead of
  // actual number of categories.
  bool const rejected = validate && (InvalidCat(cat) || cat >= split_cats.Size());
  if (XGBOOST_EXPECT(rejected, false)) {
    return dft_left;
  }

  auto const bit_pos = KCatBitField::ToBitPos(cat);
  // A category whose storage word lies past the end of the span cannot be a match, so
  // go left without touching the bit set; otherwise go left when the bit is unset.
  return bit_pos.int_pos >= cats.size() || !split_cats.Check(AsCat(cat));
}
/*!
 * \brief Report an invalid categorical value through LOG(FATAL).
 *
 * The message states the accepted range, including the numeric upper limit obtained
 * from OutOfRangeCat().
 */
inline void InvalidCategory() {
  // OutOfRangeCat() is exactly representable as float, while the values right after it
  // are rounded toward it. The validity check therefore uses `>=`, and inputs are
  // required to be strictly less than this last representable value.
  auto const limit = std::to_string(OutOfRangeCat());
  LOG(FATAL) << "Invalid categorical value detected. Categorical value should be non-negative, "
                "less than total number of categories in training data and less than "
             << limit;
}
/*!
 * \brief Assert with CHECK_GE that the largest category is consistent with the number
 *        of categories, i.e. that max_cat + 1 >= n_categories.
 *
 * \param max_cat      Largest category value, stored as float.
 * \param n_categories Total number of categories.
 */
inline void CheckMaxCat(float max_cat, size_t n_categories) {
  CHECK_GE(max_cat + 1, n_categories)
      << "Maximum category should not be lesser than the total number of categories.";
}
/*!
 * \brief Decide whether one-hot encoding is used for a categorical feature.
 *
 * \param n_cats            Number of categories.
 * \param max_cat_to_onehot Threshold the number of categories is compared against.
 * \return true when n_cats is strictly less than max_cat_to_onehot.
 */
XGBOOST_DEVICE inline bool UseOneHot(uint32_t n_cats, uint32_t max_cat_to_onehot) {
  return n_cats < max_cat_to_onehot;
}
/*!
 * \brief Unary predicate that tests whether a feature type is categorical.
 */
struct IsCatOp {
  /*!
   * \param ft Feature type to test.
   * \return true when ft equals FeatureType::kCategorical.
   *
   * Declared const because the predicate holds no state; this lets it be invoked
   * through const objects and references as well.
   */
  XGBOOST_DEVICE bool operator()(FeatureType ft) const {
    return ft == FeatureType::kCategorical;
  }
};
} // namespace common
} // namespace xgboost
#endif // XGBOOST_COMMON_CATEGORICAL_H_