Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[7.x][ML] Improve regression and classification QoR for small data sets #1992

Merged
merged 2 commits into from
Aug 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@

* Speed up training of regression and classification models on very large data sets.
(See {ml-pull}1941[#1941].)
* Improve regression and classification training accuracy for small data sets.
(See {ml-pull}1960[#1960].)

== {es} version 7.14.0

Expand Down
5 changes: 4 additions & 1 deletion include/maths/CBoostedTreeImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,9 @@ class MATHS_EXPORT CBoostedTreeImpl final {
//! Estimate test losses for the \p missing folds.
TMeanVarAccumulatorVec estimateMissingTestLosses(const TSizeVec& missing) const;

//! Get the minimum number of rows we require per feature.
std::size_t rowsPerFeature(std::size_t numberRows) const;

//! Get the number of features including category encoding.
std::size_t numberFeatures() const;

Expand Down Expand Up @@ -386,7 +389,7 @@ class MATHS_EXPORT CBoostedTreeImpl final {
TOptionalDouble m_EtaOverride;
TOptionalDouble m_EtaGrowthRatePerTreeOverride;
TOptionalSize m_NumberFoldsOverride;
TOptionalSize m_TrainFractionPerFoldOverride;
TOptionalDouble m_TrainFractionPerFoldOverride;
TOptionalSize m_MaximumNumberTreesOverride;
TOptionalDouble m_FeatureBagFractionOverride;
TOptionalStrDoublePrVec m_ClassificationWeightsOverride;
Expand Down
4 changes: 2 additions & 2 deletions lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -493,7 +493,7 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeRegressionTraining) {
<< "ms");

BOOST_TEST_REQUIRE(core::CProgramCounters::counter(
counter_t::E_DFTPMEstimatedPeakMemoryUsage) < 4500000);
counter_t::E_DFTPMEstimatedPeakMemoryUsage) < 6300000);
BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMPeakMemoryUsage) < 1910000);
BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMTimeToTrain) > 0);
BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMTimeToTrain) <= duration);
Expand Down Expand Up @@ -686,7 +686,7 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeClassifierTraining) {
<< "ms");

BOOST_TEST_REQUIRE(core::CProgramCounters::counter(
counter_t::E_DFTPMEstimatedPeakMemoryUsage) < 4500000);
counter_t::E_DFTPMEstimatedPeakMemoryUsage) < 6300000);
BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMPeakMemoryUsage) < 1910000);
BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMTimeToTrain) > 0);
BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMTimeToTrain) <= duration);
Expand Down
4 changes: 3 additions & 1 deletion lib/maths/CBoostedTreeFactory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -399,12 +399,14 @@ void CBoostedTreeFactory::selectFeaturesAndEncodeCategories(const core::CDataFra
TSizeVec regressors(frame.numberColumns() - this->numberExtraColumnsForTrain());
std::iota(regressors.begin(), regressors.end(), 0);
regressors.erase(regressors.begin() + m_TreeImpl->m_DependentVariable);
std::size_t numberTrainingRows{
static_cast<std::size_t>(m_TreeImpl->allTrainingRowsMask().manhattan())};
LOG_TRACE(<< "candidate regressors = " << core::CContainerPrinter::print(regressors));

m_TreeImpl->m_Encoder = std::make_unique<CDataFrameCategoryEncoder>(
CMakeDataFrameCategoryEncoder{m_TreeImpl->m_NumberThreads, frame,
m_TreeImpl->m_DependentVariable}
.minimumRowsPerFeature(m_TreeImpl->m_RowsPerFeature)
.minimumRowsPerFeature(m_TreeImpl->rowsPerFeature(numberTrainingRows))
.minimumFrequencyToOneHotEncode(m_MinimumFrequencyToOneHotEncode)
.rowMask(m_TreeImpl->allTrainingRowsMask())
.columnMask(std::move(regressors))
Expand Down
12 changes: 11 additions & 1 deletion lib/maths/CBoostedTreeImpl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,8 @@ std::size_t CBoostedTreeImpl::estimateMemoryUsage(std::size_t numberRows,
// A binary tree with n + 1 leaves has 2n + 1 nodes in total.
std::size_t maximumNumberLeaves{this->maximumTreeSize(numberRows) + 1};
std::size_t maximumNumberNodes{2 * maximumNumberLeaves - 1};
std::size_t maximumNumberFeatures{std::min(numberColumns - 1, numberRows / m_RowsPerFeature)};
std::size_t maximumNumberFeatures{
std::min(numberColumns - 1, numberRows / this->rowsPerFeature(numberRows))};
std::size_t forestMemoryUsage{
m_MaximumNumberTrees *
(sizeof(TNodeVec) + maximumNumberNodes * CBoostedTreeNode::estimateMemoryUsage(
Expand Down Expand Up @@ -1107,6 +1108,15 @@ CBoostedTreeImpl::estimateMissingTestLosses(const TSizeVec& missing) const {
return predictedTestLosses;
}

std::size_t CBoostedTreeImpl::rowsPerFeature(std::size_t numberRows) const {
    // For small data sets (fewer than 1k examples) we permit more features
    // than m_RowsPerFeature alone would imply: nuisance features carrying
    // little information about the target are removed anyway, so accuracy
    // is unaffected, and training is always fast on such small data sets.
    // Never return zero so callers can safely divide by this value.
    std::size_t capped{std::min(m_RowsPerFeature, numberRows / 20)};
    return capped > 0 ? capped : std::size_t{1};
}

std::size_t CBoostedTreeImpl::numberFeatures() const {
    // Total feature count after category encoding has been applied.
    const std::size_t encodedColumns{m_Encoder->numberEncodedColumns()};
    return encodedColumns;
}
Expand Down
32 changes: 16 additions & 16 deletions lib/maths/CDataFrameUtils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -140,15 +140,17 @@ classifierStratifiedCrossValidationRowSampler(std::size_t numberThreads,

TDoubleVec categoryFrequencies{CDataFrameUtils::categoryFrequencies(
numberThreads, frame, rowMask, {targetColumn})[targetColumn]};
LOG_TRACE(<< "category frequencies = "
<< core::CContainerPrinter::print(categoryFrequencies));

TSizeVec categoryCounts;
CSampling::weightedSample(desiredCount, categoryFrequencies, categoryCounts);
LOG_TRACE(<< "desired category counts per test fold = "
<< core::CContainerPrinter::print(categoryCounts));

auto sampler = std::make_unique<CStratifiedSampler>(categoryCounts.size());
for (std::size_t i = 0; i < categoryCounts.size(); ++i) {
sampler->addSampler(categoryCounts[i], rng);
for (auto categoryCount : categoryCounts) {
sampler->addSampler(categoryCount, rng);
}
sampler->samplerSelector([targetColumn](const TRowRef& row) mutable {
return static_cast<std::size_t>(row[targetColumn]);
Expand Down Expand Up @@ -523,43 +525,36 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads,
<< ", sample size = " << sampleSize);

TDoubleVec frequencies;

auto makeSampler = [&](std::size_t size) {
auto makeSampler = [&](std::size_t size, const core::CPackedBitVector& rowMask) {
TStratifiedSamplerUPtr result;
if (size > 0) {
if (frame.columnIsCategorical()[targetColumn]) {
std::tie(result, frequencies) = classifierStratifiedCrossValidationRowSampler(
numberThreads, frame, targetColumn, rng, size, allTrainingRowsMask);
numberThreads, frame, targetColumn, rng, size, rowMask);
} else {
result = regressionStratifiedCrossValiationRowSampler(
numberThreads, frame, targetColumn, rng, size,
numberBuckets, allTrainingRowsMask);
numberThreads, frame, targetColumn, rng, size, numberBuckets, rowMask);
}
}
return result;
};

auto excessSampler = makeSampler(excessSampleSize);
auto sampler = makeSampler(sampleSize);
if (sampler == nullptr) {
HANDLE_FATAL(<< "Internal error: failed to create train/test splits.");
return {TPackedBitVectorVec{}, TPackedBitVectorVec{}, TDoubleVec{}};
}
auto excessSampler = makeSampler(excessSampleSize, allTrainingRowsMask);

LOG_TRACE(<< "number training rows = " << allTrainingRowsMask.manhattan());

TPackedBitVectorVec testingRowMasks(numberFolds);

TSizeVec rowIndices;
auto sample = [&](const TStratifiedSamplerUPtr& sampler_,
const core::CPackedBitVector& candidateTestingRowsMask) {
const core::CPackedBitVector& rowMask) {
frame.readRows(1, 0, frame.numberRows(),
[&](const TRowItr& beginRows, const TRowItr& endRows) {
for (auto row = beginRows; row != endRows; ++row) {
sampler_->sample(*row);
}
},
&candidateTestingRowsMask);
&rowMask);
sampler_->finishSampling(rng, rowIndices);
std::sort(rowIndices.begin(), rowIndices.end());
LOG_TRACE(<< "# row indices = " << rowIndices.size());
Expand All @@ -569,7 +564,7 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads,
result.extend(false, row - result.size());
result.extend(true);
}
result.extend(false, allTrainingRowsMask.size() - result.size());
result.extend(false, rowMask.size() - result.size());
return result;
};

Expand All @@ -579,6 +574,11 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads,
testingRowMask = std::move(candidateTestingRowsMask);
candidateTestingRowsMask = core::CPackedBitVector{testingRowMask.size(), false};
} else {
auto sampler = makeSampler(sampleSize, candidateTestingRowsMask);
if (sampler == nullptr) {
HANDLE_FATAL(<< "Internal error: failed to create train/test splits.");
return {TPackedBitVectorVec{}, TPackedBitVectorVec{}, TDoubleVec{}};
}
testingRowMask = sample(sampler, candidateTestingRowsMask);
candidateTestingRowsMask ^= testingRowMask;
}
Expand Down
40 changes: 22 additions & 18 deletions lib/maths/unittest/CBoostedTreeTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -500,13 +500,13 @@ BOOST_AUTO_TEST_CASE(testPiecewiseConstant) {
0.0, modelBias[i][0],
8.0 * std::sqrt(noiseVariance / static_cast<double>(trainRows)));
// Good R^2...
BOOST_TEST_REQUIRE(modelRSquared[i][0] > 0.93);
BOOST_TEST_REQUIRE(modelRSquared[i][0] > 0.91);

meanModelRSquared.add(modelRSquared[i][0]);
}

LOG_DEBUG(<< "mean R^2 = " << maths::CBasicStatistics::mean(meanModelRSquared));
BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(meanModelRSquared) > 0.95);
BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(meanModelRSquared) > 0.94);
}

BOOST_AUTO_TEST_CASE(testLinear) {
Expand Down Expand Up @@ -627,7 +627,7 @@ BOOST_AUTO_TEST_CASE(testNonLinear) {
meanModelRSquared.add(modelRSquared[i][0]);
}
LOG_DEBUG(<< "mean R^2 = " << maths::CBasicStatistics::mean(meanModelRSquared));
BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(meanModelRSquared) > 0.98);
BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(meanModelRSquared) > 0.97);
}

BOOST_AUTO_TEST_CASE(testHuber) {
Expand Down Expand Up @@ -1850,24 +1850,28 @@ BOOST_AUTO_TEST_CASE(testProgressMonitoring) {
// We don't have accurate upfront estimate of the number of steps so we
// only get progress up to 80% or 90%. In non-test code we always pass 100%
// when the task is complete.
if (task.s_TenPercentProgressPoints.size() != 10 ||
task.s_TenPercentProgressPoints.front() != 0 ||
task.s_TenPercentProgressPoints.back() != 90) {
BOOST_REQUIRE_EQUAL("[0, 10, 20, 30, 40, 50, 60, 70, 80]",
core::CContainerPrinter::print(task.s_TenPercentProgressPoints));
}
BOOST_TEST_REQUIRE(task.s_TenPercentProgressPoints.size() >= 9);
BOOST_TEST_REQUIRE(
std::is_sorted(task.s_TenPercentProgressPoints.begin(),
task.s_TenPercentProgressPoints.end()));
BOOST_TEST_REQUIRE(task.s_TenPercentProgressPoints.front() == 0);
BOOST_TEST_REQUIRE(task.s_TenPercentProgressPoints.back() >= 80);
} else if (task.s_Name == maths::CBoostedTreeFactory::FINE_TUNING_PARAMETERS) {
BOOST_REQUIRE_EQUAL("[0, 10, 20, 30, 40, 50, 60, 70, 80, 90]",
core::CContainerPrinter::print(task.s_TenPercentProgressPoints));
BOOST_TEST_REQUIRE(task.s_TenPercentProgressPoints.size() >= 10);
BOOST_TEST_REQUIRE(
std::is_sorted(task.s_TenPercentProgressPoints.begin(),
task.s_TenPercentProgressPoints.end()));
BOOST_TEST_REQUIRE(task.s_TenPercentProgressPoints.front() == 0);
BOOST_TEST_REQUIRE(task.s_TenPercentProgressPoints.back() >= 90);
} else if (task.s_Name == maths::CBoostedTreeFactory::FINAL_TRAINING) {
// Progress might be 90% or 100% depending on whether the final
// progress update registered
if (task.s_TenPercentProgressPoints.size() != 11 ||
task.s_TenPercentProgressPoints.front() != 0 ||
task.s_TenPercentProgressPoints.back() != 100) {
BOOST_REQUIRE_EQUAL("[0, 10, 20, 30, 40, 50, 60, 70, 80, 90]",
core::CContainerPrinter::print(task.s_TenPercentProgressPoints));
}
// progress update registered.
BOOST_TEST_REQUIRE(task.s_TenPercentProgressPoints.size() >= 10);
BOOST_TEST_REQUIRE(
std::is_sorted(task.s_TenPercentProgressPoints.begin(),
task.s_TenPercentProgressPoints.end()));
BOOST_TEST_REQUIRE(task.s_TenPercentProgressPoints.front() == 0);
BOOST_TEST_REQUIRE(task.s_TenPercentProgressPoints.back() >= 90);
}
BOOST_TEST_REQUIRE(task.s_Monotonic);
}
Expand Down
35 changes: 35 additions & 0 deletions lib/maths/unittest/CDataFrameUtilsTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -649,6 +649,41 @@ BOOST_AUTO_TEST_CASE(testStratifiedCrossValidationRowMasks) {
}
}

BOOST_AUTO_TEST_CASE(testStratifiedCrossValidationRowMasksRareCategories) {

    // Here we test a case that the desired sample size for a specific class
    // is zero. In this case we should reassess the class frequencies for
    // the unsampled set and still get 5 splits with all classes represented
    // in at least one fold.

    std::size_t numberFolds{5};
    std::size_t numberBins{10};
    TDoubleVec categories{0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3,
                          3, 3, 3, 3, 3, 4, 5, 5, 6, 6, 6, 6};

    auto frame = core::makeMainStorageDataFrame(1).first;
    frame->categoricalColumns(TBoolVec{true});
    for (auto category : categories) {
        frame->writeRow([&](core::CDataFrame::TFloatVecItr column,
                            std::int32_t&) { *column = category; });
    }
    frame->finishWritingRows();

    maths::CPRNG::CXorOShiro128Plus rng;
    maths::CDataFrameUtils::TPackedBitVectorVec testingRowMasks;
    std::tie(std::ignore, testingRowMasks, std::ignore) =
        maths::CDataFrameUtils::stratifiedCrossValidationRowMasks(
            1, *frame, 0, rng, numberFolds, 1.0 - 1.0 / static_cast<double>(numberFolds),
            numberBins, core::CPackedBitVector{categories.size(), true});

    // Each of the 5 folds should hold exactly 5 test rows and together
    // the folds should cover all 25 rows exactly once (the masks are
    // disjoint, so XOR accumulates their union).
    core::CPackedBitVector allTestingRowsMask(categories.size(), false);
    for (const auto& testingRowMask : testingRowMasks) {
        allTestingRowsMask ^= testingRowMask;
        // BUG FIX: the original wrote BOOST_TEST_REQUIRE(5.0, ...), which
        // evaluates the constant 5.0 as the predicate (always true) and never
        // compares it with the mask cardinality. Use BOOST_REQUIRE_EQUAL.
        BOOST_REQUIRE_EQUAL(5.0, testingRowMask.manhattan());
    }
    BOOST_REQUIRE_EQUAL(25.0, allTestingRowsMask.manhattan());
}

BOOST_AUTO_TEST_CASE(testMicWithColumn) {

// Test we get the exact MICe value when the number of rows is less than
Expand Down