Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions lib/maths/CBoostedTreeFactory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ const std::size_t MAX_NUMBER_TREES{static_cast<std::size_t>(2.0 / MIN_ETA + 0.5)
// for progress monitoring because we don't know what value we'll choose in the
// line search. Assuming it is less than one avoids a large pause in progress if
// it is reduced in the line search.
const double LINE_SEARCH_ETA_MARGIN{0.5};
const double MAIN_LOOP_ETA_SCALE_FOR_PROGRESS{0.5};

double computeEta(std::size_t numberRegressors) {
// eta is the learning rate. There is a lot of empirical evidence that
Expand Down Expand Up @@ -315,7 +315,7 @@ void CBoostedTreeFactory::selectFeaturesAndEncodeCategories(const core::CDataFra
.minimumFrequencyToOneHotEncode(m_MinimumFrequencyToOneHotEncode)
.rowMask(m_TreeImpl->allTrainingRowsMask())
.columnMask(std::move(regressors)));
m_TreeImpl->m_TrainingProgress.increment(1);
m_TreeImpl->m_TrainingProgress.increment(100);
}

void CBoostedTreeFactory::determineFeatureDataTypes(const core::CDataFrame& frame) const {
Expand Down Expand Up @@ -741,7 +741,7 @@ void CBoostedTreeFactory::initializeUnsetEta(core::CDataFrame& frame) {

m_TreeImpl->m_TrainingProgress.incrementRange(
static_cast<int>(this->mainLoopNumberSteps(m_TreeImpl->m_Eta)) -
static_cast<int>(this->mainLoopNumberSteps(LINE_SEARCH_ETA_MARGIN * eta)));
static_cast<int>(this->mainLoopNumberSteps(MAIN_LOOP_ETA_SCALE_FOR_PROGRESS * eta)));
}
}

Expand Down Expand Up @@ -1162,7 +1162,7 @@ void CBoostedTreeFactory::initializeTrainingProgressMonitoring(const core::CData
//
// This comprises:
// - The cost of category encoding and feature selection which we count as
// one unit,
// one hundred units,
// - One unit for estimating the expected gain and sum curvature per node,
// - LINE_SEARCH_ITERATIONS * "maximum number trees" units per regularization
// parameter which isn't user defined,
Expand All @@ -1178,7 +1178,7 @@ void CBoostedTreeFactory::initializeTrainingProgressMonitoring(const core::CData
? *m_TreeImpl->m_EtaOverride
: computeEta(frame.numberColumns())};

std::size_t totalNumberSteps{2};
std::size_t totalNumberSteps{101};
std::size_t lineSearchMaximumNumberTrees{computeMaximumNumberTrees(eta)};
if (m_TreeImpl->m_RegularizationOverride.softTreeDepthLimit() == boost::none) {
totalNumberSteps += MAX_LINE_SEARCH_ITERATIONS * lineSearchMaximumNumberTrees;
Expand All @@ -1196,10 +1196,10 @@ void CBoostedTreeFactory::initializeTrainingProgressMonitoring(const core::CData
totalNumberSteps += MAX_LINE_SEARCH_ITERATIONS * lineSearchMaximumNumberTrees;
}
if (m_TreeImpl->m_EtaOverride == boost::none) {
totalNumberSteps += MAX_LINE_SEARCH_ITERATIONS * lineSearchMaximumNumberTrees *
computeMaximumNumberTrees(LINE_SEARCH_ETA_MARGIN * eta);
totalNumberSteps += MAX_LINE_SEARCH_ITERATIONS *
computeMaximumNumberTrees(MAIN_LOOP_ETA_SCALE_FOR_PROGRESS * eta);
}
totalNumberSteps += this->mainLoopNumberSteps(LINE_SEARCH_ETA_MARGIN * eta);
totalNumberSteps += this->mainLoopNumberSteps(MAIN_LOOP_ETA_SCALE_FOR_PROGRESS * eta);
LOG_TRACE(<< "total number steps = " << totalNumberSteps);
m_TreeImpl->m_TrainingProgress = core::CLoopProgress{
totalNumberSteps, m_TreeImpl->m_Instrumentation->progressCallback(), 1.0, 1024};
Expand Down
36 changes: 27 additions & 9 deletions lib/maths/unittest/CBoostedTreeTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -47,16 +47,30 @@ using TMemoryMappedFloatVector = maths::boosted_tree::CLoss::TMemoryMappedFloatV
namespace {

class CTestInstrumentation : public maths::CDataFrameAnalysisInstrumentationInterface {
public:
using TIntVec = std::vector<int>;

public:
CTestInstrumentation()
: m_TotalFractionalProgress{0}, m_MemoryUsage{0}, m_MaxMemoryUsage{0} {}

int progress() const { return m_TotalFractionalProgress.load(); }
int progress() const {
return (100 * m_TotalFractionalProgress.load()) / 65536;
}
TIntVec tenPercentProgressPoints() const {
return m_TenPercentProgressPoints;
}
std::int64_t maxMemoryUsage() const { return m_MaxMemoryUsage.load(); }

void updateProgress(double fractionalProgress) override {
m_TotalFractionalProgress.fetch_add(
static_cast<int>(65536.0 * fractionalProgress + 0.5));
int progress{m_TotalFractionalProgress.fetch_add(
static_cast<int>(65536.0 * fractionalProgress + 0.5))};
// This needn't be protected because progress is only written from one thread and
// the tests arrange that it is never read at the same time it is being written.
if (m_TenPercentProgressPoints.empty() ||
100 * progress > 65536 * (m_TenPercentProgressPoints.back() + 10)) {
m_TenPercentProgressPoints.push_back(10 * ((10 * progress) / 65536));
}
}

void updateMemoryUsage(std::int64_t delta) override {
Expand All @@ -73,6 +87,7 @@ class CTestInstrumentation : public maths::CDataFrameAnalysisInstrumentationInte

private:
std::atomic_int m_TotalFractionalProgress;
TIntVec m_TenPercentProgressPoints;
std::atomic<std::int64_t> m_MemoryUsage;
std::atomic<std::int64_t> m_MaxMemoryUsage;
};
Expand Down Expand Up @@ -1392,23 +1407,26 @@ BOOST_AUTO_TEST_CASE(testProgressMonitoring) {
finished.store(true);
}};

int lastTotalFractionalProgress{0};
int lastProgressReport{0};

bool monotonic{true};
std::size_t percentage{0};
int percentage{0};
while (finished.load() == false) {
if (instrumentation.progress() > lastProgressReport) {
if (instrumentation.progress() > percentage) {
LOG_DEBUG(<< percentage << "% complete");
percentage += 10;
lastProgressReport += 6554;
}
monotonic &= (instrumentation.progress() >= lastTotalFractionalProgress);
lastTotalFractionalProgress = instrumentation.progress();
monotonic &= (instrumentation.progress() >= lastProgressReport);
lastProgressReport = instrumentation.progress();
}
worker.join();

BOOST_TEST_REQUIRE(monotonic);
LOG_DEBUG(<< "progress points = "
<< core::CContainerPrinter::print(instrumentation.tenPercentProgressPoints()));
BOOST_REQUIRE_EQUAL("[0, 10, 20, 30, 40, 50, 60, 70, 80, 90]",
core::CContainerPrinter::print(
instrumentation.tenPercentProgressPoints()));

core::startDefaultAsyncExecutor();
}
Expand Down