2 changes: 2 additions & 0 deletions docs/CHANGELOG.asciidoc
@@ -58,6 +58,8 @@
analyses will require lower memory limits than before (See {ml-pull}1298[#1298].)
* Checkpoint state to allow efficient failover during coarse parameter search
for classification and regression. (See {ml-pull}1300[#1300].)
+* Improve data access patterns to speed up classification and regression.
+  (See {ml-pull}1312[#1312].)

=== Bug Fixes

8 changes: 4 additions & 4 deletions include/maths/CBoostedTreeImpl.h
@@ -251,7 +251,10 @@ class MATHS_EXPORT CBoostedTreeImpl final {
std::size_t featureBagSize() const;

//! Sample the features according to their categorical distribution.
-    TSizeVec featureBag() const;
+    void featureBag(TDoubleVec& probabilities, TSizeVec& features) const;
+
+    //! Get a column mask of the suitable regressor features.
+    void candidateRegressorFeatures(const TDoubleVec& probabilities, TSizeVec& features) const;

//! Refresh the predictions and loss function derivatives for the masked
//! rows in \p frame with predictions of \p tree.
@@ -265,9 +268,6 @@
//! Compute the mean of the loss function on the masked rows of \p frame.
double meanLoss(const core::CDataFrame& frame, const core::CPackedBitVector& rowMask) const;

-    //! Get a column mask of the suitable regressor features.
-    TSizeVec candidateRegressorFeatures() const;

//! Get the root node of \p tree.
static const CBoostedTreeNode& root(const TNodeVec& tree);

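The header change swaps return-by-value signatures for caller-supplied output parameters, so hot paths can reuse a vector's capacity instead of allocating a fresh vector on every call. A minimal self-contained sketch of the pattern (hypothetical names; plain std::vector standing in for the TDoubleVec/TSizeVec typedefs):

#include <cstddef>
#include <vector>

// The caller owns the buffer: clear() keeps the capacity acquired by earlier
// calls, so repeated invocations allocate at most once.
void candidateFeatures(const std::vector<double>& probabilities,
                       std::vector<std::size_t>& features) {
    features.clear();
    features.reserve(probabilities.size());
    for (std::size_t i = 0; i < probabilities.size(); ++i) {
        if (probabilities[i] > 0.0) {
            features.push_back(i);
        }
    }
}
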
62 changes: 34 additions & 28 deletions lib/maths/CBoostedTreeImpl.cc
@@ -50,7 +50,6 @@ namespace {
const double MINIMUM_SPLIT_REFRESH_INTERVAL{3.0};
const std::string HYPERPARAMETER_OPTIMIZATION_ROUND{"hyperparameter_optimization_round_"};
const std::string TRAIN_FINAL_FOREST{"train_final_forest"};
-const double MEMORY_USAGE_WORST_CASE_TO_AVERAGE{4.75};

//! \brief Record the memory used by a supplied object using the RAII idiom.
class CScopeRecordMemoryUsage {
@@ -658,7 +657,8 @@ CBoostedTreeImpl::TImmutableRadixSetVec
CBoostedTreeImpl::candidateSplits(const core::CDataFrame& frame,
const core::CPackedBitVector& trainingRowMask) const {

-    TSizeVec features{this->candidateRegressorFeatures()};
+    TSizeVec features;
+    this->candidateRegressorFeatures(m_FeatureSampleProbabilities, features);
LOG_TRACE(<< "candidate features = " << core::CContainerPrinter::print(features));

TSizeVec binaryFeatures(features);
@@ -750,11 +750,17 @@ CBoostedTreeImpl::trainTree(core::CDataFrame& frame,
TNodeVec tree(1);
tree.reserve(2 * maximumTreeSize + 1);

+    // Sampling transforms the probabilities. We use placeholders declared
+    // outside the loop adding nodes so we only allocate the vectors once.
+    TDoubleVec featureSampleProbabilities{m_FeatureSampleProbabilities};
+    TSizeVec featureBag;
+    this->featureBag(featureSampleProbabilities, featureBag);
+
    TLeafNodeStatisticsPtrQueue leaves(maximumTreeSize / 2 + 3);
    leaves.push_back(std::make_shared<CBoostedTreeLeafNodeStatistics>(
        0 /*root*/, m_ExtraColumns, m_Loss->numberParameters(), m_NumberThreads,
-        frame, *m_Encoder, m_Regularization, candidateSplits,
-        this->featureBag(), 0 /*depth*/, trainingRowMask));
+        frame, *m_Encoder, m_Regularization, candidateSplits, featureBag,
+        0 /*depth*/, trainingRowMask));

// We update local variables because the callback can be expensive if it
// requires accessing atomics.
@@ -807,11 +813,14 @@
tree[leaf->id()].split(splitFeature, splitValue, assignMissingToLeft,
leaf->gain(), leaf->curvature(), tree);

+            featureSampleProbabilities = m_FeatureSampleProbabilities;
+            this->featureBag(featureSampleProbabilities, featureBag);
+
            TLeafNodeStatisticsPtr leftChild;
            TLeafNodeStatisticsPtr rightChild;
            std::tie(leftChild, rightChild) = leaf->split(
                leftChildId, rightChildId, m_NumberThreads, frame, *m_Encoder,
-                m_Regularization, candidateSplits, this->featureBag(), tree[leaf->id()]);
+                m_Regularization, candidateSplits, featureBag, tree[leaf->id()]);

if (less(rightChild, leftChild)) {
std::swap(leftChild, rightChild);
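
Because CSampling::categoricalSampleWithoutReplacement consumes its probabilities argument destructively, trainTree resets the working copy from m_FeatureSampleProbabilities before each draw; assignment into an already-sized vector reuses its storage. A self-contained sketch of this hoisted-buffer pattern (hypothetical names, not the ml-cpp API):

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    // Stand-in for m_FeatureSampleProbabilities.
    const std::vector<double> memberProbabilities{0.25, 0.25, 0.5};

    // Hoisted outside the node-adding loop: allocated once, reused per split.
    std::vector<double> probabilities{memberProbabilities};
    std::vector<std::size_t> bag;

    for (int split = 0; split < 3; ++split) {
        probabilities = memberProbabilities; // reset the consumed values;
                                             // reuses the existing allocation
        bag.clear();                         // keeps capacity
        for (std::size_t i = 0; i < probabilities.size(); ++i) {
            if (probabilities[i] > 0.0) {
                bag.push_back(i); // stand-in for the categorical sampling step
            }
        }
        std::cout << "split " << split << ": bag size " << bag.size() << '\n';
    }
    return 0;
}
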
@@ -980,20 +989,32 @@ std::size_t CBoostedTreeImpl::featureBagSize() const {
std::ceil(m_FeatureBagFraction * static_cast<double>(this->numberFeatures())), 1.0));
}

-CBoostedTreeImpl::TSizeVec CBoostedTreeImpl::featureBag() const {
+void CBoostedTreeImpl::featureBag(TDoubleVec& probabilities, TSizeVec& bag) const {

    std::size_t size{this->featureBagSize()};

-    TSizeVec features{this->candidateRegressorFeatures()};
-    if (size >= features.size()) {
-        return features;
+    this->candidateRegressorFeatures(probabilities, bag);
+    if (size >= bag.size()) {
+        return;
    }

-    TSizeVec sample;
-    TDoubleVec probabilities(m_FeatureSampleProbabilities);
-    CSampling::categoricalSampleWithoutReplacement(m_Rng, probabilities, size, sample);
-
-    return sample;
+    CSampling::categoricalSampleWithoutReplacement(m_Rng, probabilities, size, bag);
+    std::sort(bag.begin(), bag.end());
+}
+
+void CBoostedTreeImpl::candidateRegressorFeatures(const TDoubleVec& probabilities,
+                                                  TSizeVec& features) const {
+    features.clear();
+    features.reserve(probabilities.size());
+    for (std::size_t i = 0; i < probabilities.size(); ++i) {
+        if (probabilities[i] > 0.0) {
+            features.push_back(i);
+        }
+    }
+}
+
+const CBoostedTreeNode& CBoostedTreeImpl::root(const TNodeVec& tree) {
+    return tree[0];
+}

void CBoostedTreeImpl::refreshPredictionsAndLossDerivatives(core::CDataFrame& frame,
@@ -1096,21 +1117,6 @@ double CBoostedTreeImpl::meanLoss(const core::CDataFrame& frame,
return CBasicStatistics::mean(loss);
}

-CBoostedTreeImpl::TSizeVec CBoostedTreeImpl::candidateRegressorFeatures() const {
-    TSizeVec result;
-    result.reserve(m_FeatureSampleProbabilities.size());
-    for (std::size_t i = 0; i < m_FeatureSampleProbabilities.size(); ++i) {
-        if (m_FeatureSampleProbabilities[i] > 0.0) {
-            result.push_back(i);
-        }
-    }
-    return result;
-}
-
-const CBoostedTreeNode& CBoostedTreeImpl::root(const TNodeVec& tree) {
-    return tree[0];
-}

CBoostedTreeNode& CBoostedTreeImpl::root(TNodeVec& tree) {
return tree[0];
}
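
The refactored featureBag first materialises every positive-probability feature into the bag and only samples when the bag budget is smaller than the candidate set; the sample is then sorted so downstream code sees ordered feature indices. A runnable sketch of that contract (assumed names; truncation replaces CSampling's weighted sampling here, so the real method would return a weighted random subset instead of the first candidates):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

void featureBagSketch(std::size_t size,
                      const std::vector<double>& probabilities,
                      std::vector<std::size_t>& bag) {
    bag.clear();
    for (std::size_t i = 0; i < probabilities.size(); ++i) {
        if (probabilities[i] > 0.0) {
            bag.push_back(i); // every candidate feature
        }
    }
    if (size >= bag.size()) {
        return; // the bag already holds every candidate
    }
    bag.resize(size);                  // placeholder for weighted sampling
    std::sort(bag.begin(), bag.end()); // the real method sorts its sample too
}

int main() {
    std::vector<std::size_t> bag;
    featureBagSketch(2, {0.4, 0.0, 0.3, 0.3}, bag);
    for (std::size_t f : bag) {
        std::cout << f << ' '; // prints "0 2" under the truncation stand-in
    }
    std::cout << '\n';
    return 0;
}
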
74 changes: 36 additions & 38 deletions lib/maths/CSampling.cc
@@ -153,25 +153,25 @@ void doCategoricalSampleWithReplacement(RNG& rng,
return;
}

-    std::size_t p = probabilities.size();
+    std::size_t m{probabilities.size()};

    // Construct the transform function.
-    for (std::size_t i = 1u; i < p; ++i) {
+    for (std::size_t i = 1; i < m; ++i) {
        probabilities[i] += probabilities[i - 1];
    }

-    if (probabilities[p - 1] == 0.0) {
-        doUniformSample(rng, std::size_t(0), p, n, result);
+    if (probabilities[m - 1] == 0.0) {
+        doUniformSample(rng, std::size_t(0), m, n, result);
    } else {
        result.reserve(n);
-        boost::random::uniform_real_distribution<> uniform(0.0, probabilities[p - 1]);
+        boost::random::uniform_real_distribution<> uniform(0.0, probabilities[m - 1]);
        for (std::size_t i = 0u; i < n; ++i) {
-            double uniform0X = uniform(rng);
+            double u0X{uniform(rng)};
            result.push_back(std::min(
                static_cast<std::size_t>(std::lower_bound(probabilities.begin(),
-                                                          probabilities.end(), uniform0X) -
+                                                          probabilities.end(), u0X) -
                                         probabilities.begin()),
-                probabilities.size() - 1));
+                m - 1));
        }
    }
}
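
The with-replacement sampler converts the probabilities into partial sums in place, draws a uniform variate on [0, total], and inverts the cumulative distribution with a binary search. A self-contained sketch of the same inverse-CDF technique using only the standard library (std::mt19937 substituted for the templated RNG; assumes a non-empty weight vector with positive total, whereas the library code falls back to uniform sampling in the degenerate case):

#include <algorithm>
#include <cstddef>
#include <random>
#include <vector>

// Draw one index distributed according to `weights` (need not be normalised).
std::size_t sampleCategorical(std::mt19937& rng, std::vector<double> weights) {
    for (std::size_t i = 1; i < weights.size(); ++i) {
        weights[i] += weights[i - 1]; // in-place partial sums (the "transform")
    }
    std::uniform_real_distribution<double> uniform{0.0, weights.back()};
    double u{uniform(rng)};
    // The first partial sum >= u identifies the sampled category; clamp to
    // guard against u landing exactly on the total.
    return std::min<std::size_t>(
        std::lower_bound(weights.begin(), weights.end(), u) - weights.begin(),
        weights.size() - 1);
}
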
@@ -191,45 +191,43 @@ void doCategoricalSampleWithoutReplacement(RNG& rng,
return;
}

-    std::size_t p = probabilities.size();
-    if (n >= p) {
-        result.assign(boost::counting_iterator<std::size_t>(0),
-                      boost::counting_iterator<std::size_t>(p));
+    std::size_t m{probabilities.size()};
+    result.assign(boost::counting_iterator<std::size_t>(0),
+                  boost::counting_iterator<std::size_t>(m));
+
+    if (n >= m) {
        return;
    }

    // Construct the transform function.
-    for (std::size_t i = 1u; i < p; ++i) {
+    for (std::size_t i = 1; i < m; ++i) {
        probabilities[i] += probabilities[i - 1];
    }

-    result.reserve(n);
-    TSizeVec indices(boost::counting_iterator<std::size_t>(0),
-                     boost::counting_iterator<std::size_t>(p));
-    TSizeVec s(1);
-
-    for (std::size_t i = 0u; i < n; ++i, --p) {
-        if (probabilities[p - 1] <= 0.0) {
-            doUniformSample(rng, std::size_t(0), indices.size(), 1, s);
-            result.push_back(indices[s[0]]);
+    for (std::size_t i = 0; i < n; ++i, --m) {
+        std::size_t s{0};
+        double x{probabilities[m - 1]};
+        if (x <= 0.0) {
+            s = doUniformSample(rng, std::size_t{0}, m);
        } else {
-            boost::random::uniform_real_distribution<> uniform(0.0, probabilities[p - 1]);
-            double uniform0X = uniform(rng);
-            s[0] = std::min(static_cast<std::size_t>(
-                                std::lower_bound(probabilities.begin(),
-                                                 probabilities.end(), uniform0X) -
-                                probabilities.begin()),
-                            probabilities.size() - 1);
-
-            result.push_back(indices[s[0]]);
-
-            double ps = probabilities[s[0]] - (s[0] == 0 ? 0.0 : probabilities[s[0] - 1]);
-            for (std::size_t j = s[0] + 1; j < p; ++j) {
-                probabilities[j - 1] = probabilities[j] - ps;
-            }
-            probabilities.pop_back();
+            boost::random::uniform_real_distribution<> uniform{0.0, x};
+            double u0X{uniform(rng)};
+            s = std::min(static_cast<std::size_t>(
+                             std::lower_bound(probabilities.begin(),
+                                              probabilities.begin() + m, u0X) -
+                             probabilities.begin()),
+                         m - 1);
        }
-        indices.erase(indices.begin() + s[0]);
+
+        double ps{probabilities[s] - (s == 0 ? 0.0 : probabilities[s - 1])};
+        for (std::size_t j = s + 1; j < m; ++j) {
+            probabilities[j - 1] = probabilities[j] - ps;
+            std::swap(result[j - 1], result[j]);
+        }
    }

+    // The sampled values are at the end of the vector.
+    result.erase(result.begin(), result.begin() + m);
}

//! Implementation of multivariate normal sampling.
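
The rewritten without-replacement sampler is where the data access pattern improves most. The old version kept a separate indices vector and paid for an O(m) erase plus a pop_back on every draw; the new version seeds result with all ids up front, restricts each draw to a shrinking active prefix [0, m) of the partial sums, and bubbles each sampled id into the tail with swaps, so a single erase at the end discards the unsampled prefix. A self-contained sketch of the same bookkeeping (std::mt19937 in place of the templated RNG; the degenerate zero-probability branch is omitted for brevity):

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <random>
#include <vector>

// Sample n distinct category ids, weighted by `probabilities` (consumed).
void sampleWithoutReplacement(std::mt19937& rng,
                              std::vector<double> probabilities,
                              std::size_t n,
                              std::vector<std::size_t>& result) {
    std::size_t m{probabilities.size()};
    result.resize(m);
    std::iota(result.begin(), result.end(), std::size_t{0});
    if (n >= m) {
        return;
    }
    for (std::size_t i = 1; i < m; ++i) {
        probabilities[i] += probabilities[i - 1]; // partial sums
    }
    for (std::size_t i = 0; i < n; ++i, --m) {
        // Invert the CDF restricted to the active prefix [0, m).
        std::uniform_real_distribution<double> uniform{0.0, probabilities[m - 1]};
        std::size_t s{std::min<std::size_t>(
            std::lower_bound(probabilities.begin(), probabilities.begin() + m,
                             uniform(rng)) -
                probabilities.begin(),
            m - 1)};
        // Delete category s from the distribution: subtract its mass from the
        // partial sums to its right while shifting them one slot left, and
        // bubble its id to the end of the active prefix in the same loop.
        double ps{probabilities[s] - (s == 0 ? 0.0 : probabilities[s - 1])};
        for (std::size_t j = s + 1; j < m; ++j) {
            probabilities[j - 1] = probabilities[j] - ps;
            std::swap(result[j - 1], result[j]);
        }
    }
    // The n sampled ids now occupy the tail; drop the unsampled prefix.
    result.erase(result.begin(), result.begin() + m);
}

Passing probabilities by value in the sketch keeps the caller's copy intact; the library instead takes a non-const reference and consumes it, which is exactly why trainTree above refreshes its working copy before every draw.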