Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@

* Reduce CPU scheduling priority of native analysis processes to favor the ES JVM
when CPU is constrained. (See {ml-pull}1109[#1109].)
* Take `training_percent` into account when estimating memory usage for classification and regression.
(See {ml-pull}1111[#1111].)

== {es} version 7.7.0

Expand Down
2 changes: 2 additions & 0 deletions include/api/CDataFrameTrainBoostedTreeRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ class API_EXPORT CDataFrameTrainBoostedTreeRunner : public CDataFrameAnalysisRun
static const std::string MAX_OPTIMIZATION_ROUNDS_PER_HYPERPARAMETER;
static const std::string BAYESIAN_OPTIMISATION_RESTARTS;
static const std::string NUM_TOP_FEATURE_IMPORTANCE_VALUES;
static const std::string TRAINING_PERCENT_FIELD_NAME;

//Output
static const std::string IS_TRAINING_FIELD_NAME;
Expand Down Expand Up @@ -115,6 +116,7 @@ class API_EXPORT CDataFrameTrainBoostedTreeRunner : public CDataFrameAnalysisRun

std::string m_DependentVariableFieldName;
std::string m_PredictionFieldName;
double m_TrainingPercent;
TBoostedTreeFactoryUPtr m_BoostedTreeFactory;
TBoostedTreeUPtr m_BoostedTree;
CDataFrameTrainBoostedTreeInstrumentation m_Instrumentation;
Expand Down
8 changes: 7 additions & 1 deletion lib/api/CDataFrameTrainBoostedTreeRunner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ const CDataFrameAnalysisConfigReader& CDataFrameTrainBoostedTreeRunner::paramete
CDataFrameAnalysisConfigReader::E_OptionalParameter);
theReader.addParameter(NUM_TOP_FEATURE_IMPORTANCE_VALUES,
CDataFrameAnalysisConfigReader::E_OptionalParameter);
theReader.addParameter(TRAINING_PERCENT_FIELD_NAME,
CDataFrameAnalysisConfigReader::E_OptionalParameter);
return theReader;
}()};
return PARAMETER_READER;
Expand All @@ -77,6 +79,7 @@ CDataFrameTrainBoostedTreeRunner::CDataFrameTrainBoostedTreeRunner(
m_PredictionFieldName = parameters[PREDICTION_FIELD_NAME].fallback(
m_DependentVariableFieldName + "_prediction");

m_TrainingPercent = parameters[TRAINING_PERCENT_FIELD_NAME].fallback(100.0) / 100.0;
std::size_t downsampleRowsPerFeature{
parameters[DOWNSAMPLE_ROWS_PER_FEATURE].fallback(std::size_t{0})};
double downsampleFactor{parameters[DOWNSAMPLE_FACTOR].fallback(-1.0)};
Expand Down Expand Up @@ -290,7 +293,9 @@ std::size_t CDataFrameTrainBoostedTreeRunner::estimateBookkeepingMemoryUsage(
std::size_t totalNumberRows,
std::size_t /*partitionNumberRows*/,
std::size_t numberColumns) const {
return m_BoostedTreeFactory->estimateMemoryUsage(totalNumberRows, numberColumns);
return m_BoostedTreeFactory->estimateMemoryUsage(
static_cast<std::size_t>(static_cast<double>(totalNumberRows) * m_TrainingPercent + 0.5),
numberColumns);
}

const CDataFrameAnalysisInstrumentation&
Expand All @@ -305,6 +310,7 @@ CDataFrameAnalysisInstrumentation& CDataFrameTrainBoostedTreeRunner::instrumenta
// clang-format off
const std::string CDataFrameTrainBoostedTreeRunner::DEPENDENT_VARIABLE_NAME{"dependent_variable"};
const std::string CDataFrameTrainBoostedTreeRunner::PREDICTION_FIELD_NAME{"prediction_field_name"};
const std::string CDataFrameTrainBoostedTreeRunner::TRAINING_PERCENT_FIELD_NAME{"training_percent"};
const std::string CDataFrameTrainBoostedTreeRunner::DOWNSAMPLE_ROWS_PER_FEATURE{"downsample_rows_per_feature"};
const std::string CDataFrameTrainBoostedTreeRunner::DOWNSAMPLE_FACTOR{"downsample_factor"};
const std::string CDataFrameTrainBoostedTreeRunner::ALPHA{"alpha"};
Expand Down
71 changes: 71 additions & 0 deletions lib/maths/unittest/CBoostedTreeTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include <algorithm>
#include <fstream>
#include <functional>
#include <limits>
#include <memory>
#include <streambuf>
#include <utility>
Expand Down Expand Up @@ -1221,6 +1222,76 @@ BOOST_AUTO_TEST_CASE(testEstimateMemoryUsedByTrain) {
}
}

BOOST_AUTO_TEST_CASE(testEstimateMemoryUsedByTrainWithTestRows) {

    // Test the estimate of the memory used training a model when some rows
    // are test rows (i.e. have a missing target). The estimate must remain
    // an upper bound on the actual memory high water mark and it should
    // decrease as the fraction of test rows increases.

    test::CRandomNumbers rng;

    std::size_t rows{1000};
    std::size_t cols{6};
    std::size_t capacity{600};
    std::int64_t previousEstimatedMemory{std::numeric_limits<std::int64_t>::max()};

    for (std::size_t test = 0; test < 3; ++test) {
        TDoubleVecVec x(cols - 1);
        // 100, 200 and 300 test rows on successive iterations.
        std::size_t numTestRows{(test + 1) * 100};
        for (std::size_t i = 0; i < cols - 1; ++i) {
            rng.generateUniformSamples(0.0, 10.0, rows, x[i]);
        }

        // The regression target is simply the sum of the regressors.
        auto target = [&](std::size_t i) {
            double result{0.0};
            for (std::size_t j = 0; j < cols - 1; ++j) {
                result += x[j][i];
            }
            return result;
        };

        auto frame = core::makeMainStorageDataFrame(cols, capacity).first;
        frame->categoricalColumns(TBoolVec{true, false, false, false, false, false});
        for (std::size_t i = 0; i < rows; ++i) {
            frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) {
                *(column++) = std::floor(x[0][i]);
                for (std::size_t j = 1; j < cols - 1; ++j, ++column) {
                    *column = x[j][i];
                }
                // Rows with a missing target value are treated as test rows.
                if (i < numTestRows) {
                    *column = core::CDataFrame::valueOfMissing();
                } else {
                    *column = target(i);
                }
            });
        }
        frame->finishWritingRows();

        double percentTrainingRows = 1.0 - static_cast<double>(numTestRows) /
                                               static_cast<double>(rows);

        // Estimate memory for the number of rows actually used for training.
        std::int64_t estimatedMemory(
            maths::CBoostedTreeFactory::constructFromParameters(
                1, std::make_unique<maths::boosted_tree::CMse>())
                .estimateMemoryUsage(static_cast<std::size_t>(
                                         static_cast<double>(rows) * percentTrainingRows),
                                     cols));

        CTestInstrumentation instrumentation;
        auto regression = maths::CBoostedTreeFactory::constructFromParameters(
                              1, std::make_unique<maths::boosted_tree::CMse>())
                              .analysisInstrumentation(instrumentation)
                              .buildFor(*frame, cols - 1);

        regression->train();

        LOG_DEBUG(<< "percent training rows = " << percentTrainingRows);
        LOG_DEBUG(<< "estimated memory usage = " << estimatedMemory);
        LOG_DEBUG(<< "high water mark = " << instrumentation.maxMemoryUsage());

        // The estimate must bound the actual usage...
        BOOST_TEST_REQUIRE(instrumentation.maxMemoryUsage() < estimatedMemory);
        // ...and must decrease as the test (non-training) fraction grows.
        BOOST_TEST_REQUIRE(previousEstimatedMemory > estimatedMemory);
        previousEstimatedMemory = estimatedMemory;
    }
}

BOOST_AUTO_TEST_CASE(testProgressMonitoring) {

// Test progress monitoring invariants.
Expand Down