diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc
index e01341dc0c..bf12646bd8 100644
--- a/lib/maths/CBoostedTreeFactory.cc
+++ b/lib/maths/CBoostedTreeFactory.cc
@@ -55,7 +55,7 @@ const std::size_t MAX_NUMBER_TREES{static_cast<std::size_t>(2.0 / MIN_ETA + 0.5)
 // for progress monitoring because we don't know what value we'll choose in the
 // line search. Assuming it is less than one avoids a large pause in progress if
 // it is reduced in the line search.
-const double LINE_SEARCH_ETA_MARGIN{0.5};
+const double MAIN_LOOP_ETA_SCALE_FOR_PROGRESS{0.5};
 
 double computeEta(std::size_t numberRegressors) {
     // eta is the learning rate. There is a lot of empirical evidence that
@@ -315,7 +315,7 @@ void CBoostedTreeFactory::selectFeaturesAndEncodeCategories(const core::CDataFra
             .minimumFrequencyToOneHotEncode(m_MinimumFrequencyToOneHotEncode)
             .rowMask(m_TreeImpl->allTrainingRowsMask())
             .columnMask(std::move(regressors)));
-    m_TreeImpl->m_TrainingProgress.increment(1);
+    m_TreeImpl->m_TrainingProgress.increment(100);
 }
 
 void CBoostedTreeFactory::determineFeatureDataTypes(const core::CDataFrame& frame) const {
@@ -741,7 +741,7 @@ void CBoostedTreeFactory::initializeUnsetEta(core::CDataFrame& frame) {
         m_TreeImpl->m_TrainingProgress.incrementRange(
             static_cast<std::ptrdiff_t>(this->mainLoopNumberSteps(m_TreeImpl->m_Eta)) -
-            static_cast<std::ptrdiff_t>(this->mainLoopNumberSteps(LINE_SEARCH_ETA_MARGIN * eta)));
+            static_cast<std::ptrdiff_t>(this->mainLoopNumberSteps(MAIN_LOOP_ETA_SCALE_FOR_PROGRESS * eta)));
     }
 }
 
@@ -1162,7 +1162,7 @@ void CBoostedTreeFactory::initializeTrainingProgressMonitoring(const core::CData
     //
     // This comprises:
     //  - The cost of category encoding and feature selection which we count as
-    //    one unit,
+    //    one hundred units,
     //  - One unit for estimating the expected gain and sum curvature per node,
     //  - LINE_SEARCH_ITERATIONS * "maximum number trees" units per regularization
     //    parameter which isn't user defined,
@@ -1178,7 +1178,7 @@ void CBoostedTreeFactory::initializeTrainingProgressMonitoring(const core::CData
     double eta{m_TreeImpl->m_EtaOverride != boost::none
                    ? *m_TreeImpl->m_EtaOverride
                    : computeEta(frame.numberColumns())};
-    std::size_t totalNumberSteps{2};
+    std::size_t totalNumberSteps{101};
     std::size_t lineSearchMaximumNumberTrees{computeMaximumNumberTrees(eta)};
     if (m_TreeImpl->m_RegularizationOverride.softTreeDepthLimit() == boost::none) {
         totalNumberSteps += MAX_LINE_SEARCH_ITERATIONS * lineSearchMaximumNumberTrees;
@@ -1196,10 +1196,10 @@ void CBoostedTreeFactory::initializeTrainingProgressMonitoring(const core::CData
         totalNumberSteps += MAX_LINE_SEARCH_ITERATIONS * lineSearchMaximumNumberTrees;
     }
     if (m_TreeImpl->m_EtaOverride == boost::none) {
-        totalNumberSteps += MAX_LINE_SEARCH_ITERATIONS * lineSearchMaximumNumberTrees *
-                            computeMaximumNumberTrees(LINE_SEARCH_ETA_MARGIN * eta);
+        totalNumberSteps += MAX_LINE_SEARCH_ITERATIONS *
+                            computeMaximumNumberTrees(MAIN_LOOP_ETA_SCALE_FOR_PROGRESS * eta);
     }
-    totalNumberSteps += this->mainLoopNumberSteps(LINE_SEARCH_ETA_MARGIN * eta);
+    totalNumberSteps += this->mainLoopNumberSteps(MAIN_LOOP_ETA_SCALE_FOR_PROGRESS * eta);
     LOG_TRACE(<< "total number steps = " << totalNumberSteps);
     m_TreeImpl->m_TrainingProgress = core::CLoopProgress{
         totalNumberSteps, m_TreeImpl->m_Instrumentation->progressCallback(), 1.0, 1024};
diff --git a/lib/maths/unittest/CBoostedTreeTest.cc b/lib/maths/unittest/CBoostedTreeTest.cc
index eafe61f1ff..6054273494 100644
--- a/lib/maths/unittest/CBoostedTreeTest.cc
+++ b/lib/maths/unittest/CBoostedTreeTest.cc
@@ -47,16 +47,30 @@ using TMemoryMappedFloatVector = maths::boosted_tree::CLoss::TMemoryMappedFloatV
 namespace {
 
 class CTestInstrumentation : public maths::CDataFrameAnalysisInstrumentationInterface {
+public:
+    using TIntVec = std::vector<int>;
+
 public:
     CTestInstrumentation()
         : m_TotalFractionalProgress{0}, m_MemoryUsage{0}, m_MaxMemoryUsage{0} {}
 
-    int progress() const { return m_TotalFractionalProgress.load(); }
+    int progress() const {
+        return (100 * m_TotalFractionalProgress.load()) / 65536;
+    }
+    TIntVec tenPercentProgressPoints() const {
+        return m_TenPercentProgressPoints;
+    }
 
     std::int64_t maxMemoryUsage() const { return m_MaxMemoryUsage.load(); }
 
     void updateProgress(double fractionalProgress) override {
-        m_TotalFractionalProgress.fetch_add(
-            static_cast<int>(65536.0 * fractionalProgress + 0.5));
+        int progress{m_TotalFractionalProgress.fetch_add(
+            static_cast<int>(65536.0 * fractionalProgress + 0.5))};
+        // This needn't be protected because progress is only written from one thread and
+        // the tests arrange that it is never read at the same time it is being written.
+        if (m_TenPercentProgressPoints.empty() ||
+            100 * progress > 65536 * (m_TenPercentProgressPoints.back() + 10)) {
+            m_TenPercentProgressPoints.push_back(10 * ((10 * progress) / 65536));
+        }
     }
 
     void updateMemoryUsage(std::int64_t delta) override {
@@ -73,6 +87,7 @@ class CTestInstrumentation : public maths::CDataFrameAnalysisInstrumentationInte
 
 private:
     std::atomic_int m_TotalFractionalProgress;
+    TIntVec m_TenPercentProgressPoints;
     std::atomic<std::int64_t> m_MemoryUsage;
     std::atomic<std::int64_t> m_MaxMemoryUsage;
 };
@@ -1392,23 +1407,26 @@ BOOST_AUTO_TEST_CASE(testProgressMonitoring) {
         finished.store(true);
     }};
 
-    int lastTotalFractionalProgress{0};
     int lastProgressReport{0};
     bool monotonic{true};
-    std::size_t percentage{0};
+    int percentage{0};
     while (finished.load() == false) {
-        if (instrumentation.progress() > lastProgressReport) {
+        if (instrumentation.progress() > percentage) {
            LOG_DEBUG(<< percentage << "% complete");
            percentage += 10;
-            lastProgressReport += 6554;
        }
-        monotonic &= (instrumentation.progress() >= lastTotalFractionalProgress);
-        lastTotalFractionalProgress = instrumentation.progress();
+        monotonic &= (instrumentation.progress() >= lastProgressReport);
+        lastProgressReport = instrumentation.progress();
     }
 
     worker.join();
 
     BOOST_TEST_REQUIRE(monotonic);
+    LOG_DEBUG(<< "progress points = "
+              << core::CContainerPrinter::print(instrumentation.tenPercentProgressPoints()));
+    BOOST_REQUIRE_EQUAL("[0, 10, 20, 30, 40, 50, 60, 70, 80, 90]",
+                        core::CContainerPrinter::print(
+                            instrumentation.tenPercentProgressPoints()));
 
     core::startDefaultAsyncExecutor();
 }
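For reference, the arithmetic behind the new ten percent progress checkpoints can be exercised in isolation. The sketch below is not part of the patch and its names (CProgressRecorder, the simulated update loop) are illustrative; it replays the same fixed point accumulation and bucketing that CTestInstrumentation now performs, so the expected checkpoint sequence 0, 10, ..., 90 can be seen outside the boosted tree test.

// Standalone sketch (not part of the patch): the fixed point progress bookkeeping
// the updated CTestInstrumentation relies on. Class and function names here are
// illustrative only.
#include <atomic>
#include <iostream>
#include <vector>

class CProgressRecorder {
public:
    // Report progress as a whole percentage of the accumulated fixed point
    // fraction, which is scaled by 65536 exactly as in the test instrumentation.
    int progress() const { return (100 * m_TotalFractionalProgress.load()) / 65536; }

    const std::vector<int>& tenPercentProgressPoints() const {
        return m_TenPercentProgressPoints;
    }

    // Mirrors CTestInstrumentation::updateProgress: accumulate the fractional
    // progress in fixed point and record each ten percent boundary crossed. The
    // vector isn't synchronised, so, as in the test, this must only be called
    // from one thread.
    void updateProgress(double fractionalProgress) {
        int progress{m_TotalFractionalProgress.fetch_add(
            static_cast<int>(65536.0 * fractionalProgress + 0.5))};
        if (m_TenPercentProgressPoints.empty() ||
            100 * progress > 65536 * (m_TenPercentProgressPoints.back() + 10)) {
            m_TenPercentProgressPoints.push_back(10 * ((10 * progress) / 65536));
        }
    }

private:
    std::atomic_int m_TotalFractionalProgress{0};
    std::vector<int> m_TenPercentProgressPoints;
};

int main() {
    CProgressRecorder recorder;
    // Simulate 100 equal progress updates which sum to 100%.
    for (int i = 0; i < 100; ++i) {
        recorder.updateProgress(0.01);
    }
    // Prints 99 here because the fixed point accumulation truncates.
    std::cout << "progress = " << recorder.progress() << "%\n";
    for (int point : recorder.tenPercentProgressPoints()) {
        std::cout << point << ' '; // expect: 0 10 20 30 40 50 60 70 80 90
    }
    std::cout << '\n';
    return 0;
}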