Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ML] Fix override of maximum permitted trees for classification and regression #1185

Merged
merged 3 commits into from
May 4, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 2 additions & 5 deletions docs/CHANGELOG.asciidoc
Expand Up @@ -64,20 +64,17 @@
periods of time. (See {ml-pull}1158[#1158].)
* Break progress reporting of data frame analyses into multiple phases. (See {ml-pull}1179[#1179].)

== {es} version 7.8.0

=== Bug Fixes

* Trap and fail if insufficient features are supplied to data frame analyses. This
caused classification and regression to get stuck at zero progress analyzing.
(See {ml-pull}1160[#1160], issue: {issue}55593[#55593].)
* Make categorization respect the `model_memory_limit`. (See {ml-pull}1167[#1167],
issue: {ml-issue}1130[#1130].)

=== Bug Fixes

* Fix underlying cause for "Failed to calculate splitting significance" log errors.
(See {ml-pull}1157[#1157].)
* Respect user overrides for `max_trees` for classification and regression. (See
{ml-pull}1185[#1185].)

== {es} version 7.7.1

Expand Down
6 changes: 4 additions & 2 deletions lib/maths/CBoostedTreeFactory.cc
Expand Up @@ -717,7 +717,9 @@ void CBoostedTreeFactory::initializeUnsetEta(core::CDataFrame& frame) {
// Applies a candidate value of eta to the tree. The argument is passed through
// CTools::stableExp before being stored, so it is presumably log(eta) - confirm
// against the caller. Always returns true.
auto applyEta = [](CBoostedTreeImpl& tree, double eta) {
    tree.m_Eta = CTools::stableExp(eta);
    tree.m_EtaGrowthRatePerTree = 1.0 + tree.m_Eta / 2.0;
    // Only derive the maximum number of trees from eta when no override has
    // been supplied; assigning it unconditionally would discard the override.
    if (tree.m_MaximumNumberTreesOverride == boost::none) {
        tree.m_MaximumNumberTrees = computeMaximumNumberTrees(tree.m_Eta);
    }
    return true;
};

Expand All @@ -738,7 +740,7 @@ void CBoostedTreeFactory::initializeUnsetEta(core::CDataFrame& frame) {

if (intervalIsEmpty(m_LogEtaSearchInterval)) {
m_TreeImpl->m_EtaOverride = m_TreeImpl->m_Eta;
} else {
} else if (m_TreeImpl->m_MaximumNumberTreesOverride == boost::none) {
m_TreeImpl->m_MaximumNumberTrees =
computeMaximumNumberTrees(MIN_ETA_SCALE * m_TreeImpl->m_Eta);
}
Expand Down
78 changes: 76 additions & 2 deletions lib/maths/unittest/CBoostedTreeTest.cc
Expand Up @@ -4,8 +4,6 @@
* you may not use this file except in compliance with the Elastic License.
*/

#include <atomic>
#include <boost/test/tools/interface.hpp>
#include <core/CDataFrame.h>
#include <core/CJsonStatePersistInserter.h>
#include <core/CLogger.h>
Expand All @@ -28,6 +26,7 @@
#include <boost/test/unit_test.hpp>

#include <algorithm>
#include <atomic>
#include <fstream>
#include <functional>
#include <limits>
Expand Down Expand Up @@ -1470,6 +1469,81 @@ BOOST_AUTO_TEST_CASE(testMissingFeatures) {
}
}

BOOST_AUTO_TEST_CASE(testHyperparameterOverrides) {

    // Check that each hyperparameter fixed through the factory is the value
    // reported by bestHyperparameters() once training has finished.

    std::size_t numberRows{300};
    std::size_t numberCols{6};
    std::size_t frameCapacity{600};

    test::CRandomNumbers rng;

    // One vector of uniform samples on [0, 10] per column.
    TDoubleVecVec features(numberCols);
    for (auto& column : features) {
        rng.generateUniformSamples(0.0, 10.0, numberRows, column);
    }

    auto frame = core::makeMainStorageDataFrame(numberCols, frameCapacity).first;

    // The target is a linear function of columns 0 and 3; the noise vector is all zeros.
    auto target = [](const TRowRef& row) { return row[0] + 3.0 * row[3]; };

    fillDataFrame(numberRows, 0, numberCols, features,
                  TDoubleVec(numberRows, 0.0), target, *frame);

    CTestInstrumentation instrumentation;

    // Overrides: maximum number of trees, tree size penalty and leaf weight penalty.
    {
        auto model = maths::CBoostedTreeFactory::constructFromParameters(
                         1, std::make_unique<maths::boosted_tree::CMse>())
                         .analysisInstrumentation(instrumentation)
                         .maximumNumberTrees(10)
                         .treeSizePenaltyMultiplier(0.1)
                         .leafWeightPenaltyMultiplier(0.01)
                         .buildFor(*frame, numberCols - 1);

        model->train();

        auto&& best = model->bestHyperparameters();
        auto&& regularization = best.regularization();

        // We use a single leaf to centre the data so end up with limit + 1 trees.
        BOOST_REQUIRE_EQUAL(11, best.maximumNumberTrees());
        BOOST_REQUIRE_EQUAL(0.1, regularization.treeSizePenaltyMultiplier());
        BOOST_REQUIRE_EQUAL(0.01, regularization.leafWeightPenaltyMultiplier());
    }

    // Overrides: eta, soft tree depth limit and soft tree depth tolerance.
    {
        auto model = maths::CBoostedTreeFactory::constructFromParameters(
                         1, std::make_unique<maths::boosted_tree::CMse>())
                         .analysisInstrumentation(instrumentation)
                         .eta(0.2)
                         .softTreeDepthLimit(2.0)
                         .softTreeDepthTolerance(0.1)
                         .buildFor(*frame, numberCols - 1);

        model->train();

        auto&& best = model->bestHyperparameters();
        auto&& regularization = best.regularization();

        BOOST_REQUIRE_EQUAL(0.2, best.eta());
        BOOST_REQUIRE_EQUAL(2.0, regularization.softTreeDepthLimit());
        BOOST_REQUIRE_EQUAL(0.1, regularization.softTreeDepthTolerance());
    }

    // Overrides: depth penalty, feature bag fraction and downsample factor.
    {
        auto model = maths::CBoostedTreeFactory::constructFromParameters(
                         1, std::make_unique<maths::boosted_tree::CMse>())
                         .analysisInstrumentation(instrumentation)
                         .depthPenaltyMultiplier(1.0)
                         .featureBagFraction(0.4)
                         .downsampleFactor(0.6)
                         .buildFor(*frame, numberCols - 1);

        model->train();

        auto&& best = model->bestHyperparameters();

        BOOST_REQUIRE_EQUAL(1.0, best.regularization().depthPenaltyMultiplier());
        BOOST_REQUIRE_EQUAL(0.4, best.featureBagFraction());
        BOOST_REQUIRE_EQUAL(0.6, best.downsampleFactor());
    }
}

BOOST_AUTO_TEST_CASE(testPersistRestore) {

std::size_t rows{50};
Expand Down