diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index 46c6177c94..11efbf6e1d 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -64,6 +64,8 @@ is no longer decreasing. (See {ml-pull}875[#875].) * Improve performance updating quantile estimates. (See {ml-pull}881[#881].) * Migrate to use Bayesian Optimisation for initial hyperparameter value line searches and stop early if the expected improvement is too small. (See {ml-pull}903[#903].) +* Stop cross-validation early if the predicted test loss has a small chance of being +smaller than for the best parameter values found so far. (See {ml-pull}915[#915].) === Bug Fixes * Fixes potential memory corruption when determining seasonality. (See {ml-pull}852[#852].) diff --git a/include/api/CDataFrameTrainBoostedTreeRunner.h b/include/api/CDataFrameTrainBoostedTreeRunner.h index 1991758821..1554df28c5 100644 --- a/include/api/CDataFrameTrainBoostedTreeRunner.h +++ b/include/api/CDataFrameTrainBoostedTreeRunner.h @@ -44,6 +44,7 @@ class API_EXPORT CDataFrameTrainBoostedTreeRunner : public CDataFrameAnalysisRun static const std::string MAXIMUM_NUMBER_TREES; static const std::string FEATURE_BAG_FRACTION; static const std::string NUMBER_FOLDS; + static const std::string STOP_CROSS_VALIDATION_EARLY; static const std::string NUMBER_ROUNDS_PER_HYPERPARAMETER; static const std::string BAYESIAN_OPTIMISATION_RESTARTS; static const std::string TOP_FEATURE_IMPORTANCE_VALUES; diff --git a/include/maths/CBoostedTreeFactory.h b/include/maths/CBoostedTreeFactory.h index 82cc0ff848..5b94682fd7 100644 --- a/include/maths/CBoostedTreeFactory.h +++ b/include/maths/CBoostedTreeFactory.h @@ -58,8 +58,10 @@ class MATHS_EXPORT CBoostedTreeFactory final { CBoostedTreeFactory& minimumFrequencyToOneHotEncode(double frequency); //! Set the number of folds to use for estimating the generalisation error. CBoostedTreeFactory& numberFolds(std::size_t numberFolds); - //! Stratify the cross validation we do for regression. + //! Stratify the cross-validation we do for regression. CBoostedTreeFactory& stratifyRegressionCrossValidation(bool stratify); + //! Stop cross-validation early if the test loss is not promising. + CBoostedTreeFactory& stopCrossValidationEarly(bool stopEarly); //! The number of rows per feature to sample in the initial downsample. CBoostedTreeFactory& initialDownsampleRowsPerFeature(double rowsPerFeature); //! Set the sum of leaf depth penalties multiplier. @@ -133,6 +135,9 @@ class MATHS_EXPORT CBoostedTreeFactory final { //! Compute the row masks for the missing values for each feature. void initializeMissingFeatureMasks(const core::CDataFrame& frame) const; + //! Set up the number of folds we'll use for cross-validation. + void initializeNumberFolds(core::CDataFrame& frame) const; + //! Set up cross validation. void initializeCrossValidation(core::CDataFrame& frame) const; @@ -187,7 +192,7 @@ class MATHS_EXPORT CBoostedTreeFactory final { void resumeRestoredTrainingProgressMonitoring(); //! The maximum number of trees to use in the hyperparameter optimisation loop. 
- std::size_t mainLoopMaximumNumberTrees() const; + std::size_t mainLoopMaximumNumberTrees(double eta) const; static void noopRecordProgress(double); static void noopRecordMemoryUsage(std::int64_t); diff --git a/include/maths/CBoostedTreeImpl.h b/include/maths/CBoostedTreeImpl.h index 856d139dd7..66330393e0 100644 --- a/include/maths/CBoostedTreeImpl.h +++ b/include/maths/CBoostedTreeImpl.h @@ -53,9 +53,11 @@ class MATHS_EXPORT CBoostedTreeImpl final { using TMeanAccumulator = CBasicStatistics::SSampleMean::TAccumulator; using TMeanVarAccumulator = CBasicStatistics::SSampleMeanVar::TAccumulator; using TMeanVarAccumulatorSizePr = std::pair; + using TMeanVarAccumulatorVec = std::vector; using TBayesinOptimizationUPtr = std::unique_ptr; using TNodeVec = CBoostedTree::TNodeVec; using TNodeVecVec = CBoostedTree::TNodeVecVec; + using TLossFunctionUPtr = CBoostedTree::TLossFunctionUPtr; using TProgressCallback = CBoostedTree::TProgressCallback; using TMemoryUsageCallback = CBoostedTree::TMemoryUsageCallback; using TTrainingStateCallback = CBoostedTree::TTrainingStateCallback; @@ -68,7 +70,7 @@ class MATHS_EXPORT CBoostedTreeImpl final { static const double MINIMUM_RELATIVE_GAIN_PER_SPLIT; public: - CBoostedTreeImpl(std::size_t numberThreads, CBoostedTree::TLossFunctionUPtr loss); + CBoostedTreeImpl(std::size_t numberThreads, TLossFunctionUPtr loss); ~CBoostedTreeImpl(); @@ -152,6 +154,8 @@ class MATHS_EXPORT CBoostedTreeImpl final { private: using TSizeDoublePr = std::pair; using TDoubleDoublePr = std::pair; + using TOptionalDoubleVec = std::vector; + using TOptionalDoubleVecVec = std::vector; using TOptionalSize = boost::optional; using TImmutableRadixSetVec = std::vector>; using TVector = CDenseVector; @@ -416,6 +420,9 @@ class MATHS_EXPORT CBoostedTreeImpl final { TDoubleDoublePr gainAndCurvatureAtPercentile(double percentile, const TNodeVecVec& forest) const; + //! Presize the collection to hold the per fold test errors. + void initializePerFoldTestLosses(); + //! Train the forest and compute loss moments on each fold. TMeanVarAccumulatorSizePr crossValidateForest(core::CDataFrame& frame, const TMemoryUsageCallback& recordMemoryUsage); @@ -447,6 +454,16 @@ class MATHS_EXPORT CBoostedTreeImpl final { const std::size_t maximumTreeSize, const TMemoryUsageCallback& recordMemoryUsage) const; + //! Compute the minimum mean test loss per fold for any round. + double minimumTestLoss() const; + + //! Estimate the loss we'll get including the missing folds. + TMeanVarAccumulator correctTestLossMoments(const TSizeVec& missing, + TMeanVarAccumulator lossMoments) const; + + //! Estimate test losses for the \p missing folds. + TMeanVarAccumulatorVec estimateMissingTestLosses(const TSizeVec& missing) const; + //! Get the number of features including category encoding. std::size_t numberFeatures() const; @@ -503,8 +520,7 @@ class MATHS_EXPORT CBoostedTreeImpl final { std::size_t maximumTreeSize(std::size_t numberRows) const; //! Restore \p loss function pointer from the \p traverser. - static bool restoreLoss(CBoostedTree::TLossFunctionUPtr& loss, - core::CStateRestoreTraverser& traverser); + static bool restoreLoss(TLossFunctionUPtr& loss, core::CStateRestoreTraverser& traverser); //! 
Record the training state using the \p recordTrainState callback function void recordState(const TTrainingStateCallback& recordTrainState) const; @@ -513,10 +529,12 @@ class MATHS_EXPORT CBoostedTreeImpl final { mutable CPRNG::CXorOShiro128Plus m_Rng; std::size_t m_NumberThreads; std::size_t m_DependentVariable = std::numeric_limits::max(); - CBoostedTree::TLossFunctionUPtr m_Loss; + TLossFunctionUPtr m_Loss; + bool m_StopCrossValidationEarly = true; TRegularizationOverride m_RegularizationOverride; TOptionalDouble m_DownsampleFactorOverride; TOptionalDouble m_EtaOverride; + TOptionalSize m_NumberFoldsOverride; TOptionalSize m_MaximumNumberTreesOverride; TOptionalDouble m_FeatureBagFractionOverride; TRegularization m_Regularization; @@ -537,6 +555,7 @@ class MATHS_EXPORT CBoostedTreeImpl final { TPackedBitVectorVec m_TrainingRowMasks; TPackedBitVectorVec m_TestingRowMasks; double m_BestForestTestLoss = INF; + TOptionalDoubleVecVec m_FoldRoundTestLosses; CBoostedTreeHyperparameters m_BestHyperparameters; TNodeVecVec m_BestForest; TBayesinOptimizationUPtr m_BayesianOptimization; diff --git a/include/maths/CTreeShapFeatureImportance.h b/include/maths/CTreeShapFeatureImportance.h index 9f9bffcbc2..64f56b6b47 100644 --- a/include/maths/CTreeShapFeatureImportance.h +++ b/include/maths/CTreeShapFeatureImportance.h @@ -63,8 +63,7 @@ class MATHS_EXPORT CTreeShapFeatureImportance { struct SPath { explicit SPath(std::size_t length) : s_FractionOnes(length), s_FractionZeros(length), - s_FeatureIndex(length, -1), s_Scale(length), s_NextIndex(0), - s_MaxLength(length) {} + s_FeatureIndex(length, -1), s_Scale(length), s_MaxLength(length) {} void extend(int featureIndex, double fractionZero, double fractionOne) { if (s_NextIndex < s_MaxLength) { @@ -81,7 +80,7 @@ class MATHS_EXPORT CTreeShapFeatureImportance { } void reduce(std::size_t pathIndex) { - for (std::size_t i = pathIndex; i < this->depth(); ++i) { + for (int i = static_cast(pathIndex); i < this->depth(); ++i) { s_FeatureIndex[i] = s_FeatureIndex[i + 1]; s_FractionZeros[i] = s_FractionZeros[i + 1]; s_FractionOnes[i] = s_FractionOnes[i + 1]; @@ -107,10 +106,10 @@ class MATHS_EXPORT CTreeShapFeatureImportance { double scale(std::size_t pathIndex) const { return s_Scale[pathIndex]; } //! Current depth in the tree - int depth() const { return static_cast(s_NextIndex) - 1; }; + int depth() const { return static_cast(s_NextIndex) - 1; } //! Get next index. - size_t nextIndex() const { return s_NextIndex; } + std::size_t nextIndex() const { return s_NextIndex; } //! Set next index. 
void nextIndex(std::size_t nextIndex) { s_NextIndex = nextIndex; } @@ -119,9 +118,8 @@ class MATHS_EXPORT CTreeShapFeatureImportance { TDoubleVec s_FractionZeros; TIntVec s_FeatureIndex; TDoubleVec s_Scale; - std::size_t s_NextIndex; - - std::size_t s_MaxLength; + std::size_t s_NextIndex = 0; + std::size_t s_MaxLength = 0; }; private: diff --git a/lib/api/CDataFrameTrainBoostedTreeRunner.cc b/lib/api/CDataFrameTrainBoostedTreeRunner.cc index f8e6526eb9..796ca497c5 100644 --- a/lib/api/CDataFrameTrainBoostedTreeRunner.cc +++ b/lib/api/CDataFrameTrainBoostedTreeRunner.cc @@ -51,6 +51,8 @@ const CDataFrameAnalysisConfigReader& CDataFrameTrainBoostedTreeRunner::paramete theReader.addParameter(FEATURE_BAG_FRACTION, CDataFrameAnalysisConfigReader::E_OptionalParameter); theReader.addParameter(NUMBER_FOLDS, CDataFrameAnalysisConfigReader::E_OptionalParameter); + theReader.addParameter(STOP_CROSS_VALIDATION_EARLY, + CDataFrameAnalysisConfigReader::E_OptionalParameter); theReader.addParameter(NUMBER_ROUNDS_PER_HYPERPARAMETER, CDataFrameAnalysisConfigReader::E_OptionalParameter); theReader.addParameter(BAYESIAN_OPTIMISATION_RESTARTS, @@ -82,6 +84,7 @@ CDataFrameTrainBoostedTreeRunner::CDataFrameTrainBoostedTreeRunner( parameters[NUMBER_ROUNDS_PER_HYPERPARAMETER].fallback(std::size_t{0})}; std::size_t bayesianOptimisationRestarts{ parameters[BAYESIAN_OPTIMISATION_RESTARTS].fallback(std::size_t{0})}; + bool stopCrossValidationEarly{parameters[STOP_CROSS_VALIDATION_EARLY].fallback(true)}; std::size_t topFeatureImportanceValues{ parameters[TOP_FEATURE_IMPORTANCE_VALUES].fallback(std::size_t{0})}; @@ -120,6 +123,7 @@ CDataFrameTrainBoostedTreeRunner::CDataFrameTrainBoostedTreeRunner( maths::CBoostedTreeFactory::constructFromParameters(this->spec().numberThreads())); (*m_BoostedTreeFactory) + .stopCrossValidationEarly(stopCrossValidationEarly) .progressCallback(this->progressRecorder()) .trainingStateCallback(this->statePersister()) .memoryUsageCallback(this->memoryMonitor(counter_t::E_DFTPMPeakMemoryUsage)); @@ -309,10 +313,10 @@ const std::string CDataFrameTrainBoostedTreeRunner::SOFT_TREE_DEPTH_TOLERANCE{"s const std::string CDataFrameTrainBoostedTreeRunner::MAXIMUM_NUMBER_TREES{"maximum_number_trees"}; const std::string CDataFrameTrainBoostedTreeRunner::FEATURE_BAG_FRACTION{"feature_bag_fraction"}; const std::string CDataFrameTrainBoostedTreeRunner::NUMBER_FOLDS{"number_folds"}; +const std::string CDataFrameTrainBoostedTreeRunner::STOP_CROSS_VALIDATION_EARLY{"stop_cross_validation_early"}; const std::string CDataFrameTrainBoostedTreeRunner::NUMBER_ROUNDS_PER_HYPERPARAMETER{"number_rounds_per_hyperparameter"}; const std::string CDataFrameTrainBoostedTreeRunner::BAYESIAN_OPTIMISATION_RESTARTS{"bayesian_optimisation_restarts"}; const std::string CDataFrameTrainBoostedTreeRunner::TOP_FEATURE_IMPORTANCE_VALUES{"top_feature_importance_values"}; - // clang-format on } } diff --git a/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc b/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc index eb23f0adca..c71bed0e66 100644 --- a/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc @@ -25,14 +25,13 @@ BOOST_AUTO_TEST_SUITE(CDataFrameAnalyzerFeatureImportanceTest) using namespace ml; namespace { -using TBoolVec = std::vector; -using TSizeVec = std::vector; -using TRowItr = core::CDataFrame::TRowItr; -using TRowRef = core::CDataFrame::TRowRef; -using TDataFrameUPtr = std::unique_ptr; using TDoubleVec = std::vector; using 
TStrVec = std::vector; -using TMeanVarAccumulator = ml::maths::CBasicStatistics::SSampleMeanVar::TAccumulator; +using TRowItr = core::CDataFrame::TRowItr; +using TRowRef = core::CDataFrame::TRowRef; +using TMeanAccumulator = maths::CBasicStatistics::SSampleMean::TAccumulator; +using TMeanAccumulatorVec = std::vector; +using TMeanVarAccumulator = maths::CBasicStatistics::SSampleMeanVar::TAccumulator; void setupLinearRegressionData(const TStrVec& fieldNames, TStrVec& fieldValues, @@ -228,18 +227,19 @@ BOOST_FIXTURE_TEST_CASE(testRunBoostedTreeRegressionFeatureImportanceAllShap, SF // randomly on [-10, 10]. BOOST_TEST_REQUIRE(c1Sum > c3Sum); BOOST_TEST_REQUIRE(c1Sum > c4Sum); - BOOST_REQUIRE_CLOSE(weights[1] / weights[2], c2Sum / c3Sum, 5.0); // ratio within 5% of ratio of coefficients + BOOST_REQUIRE_CLOSE(weights[1] / weights[2], c2Sum / c3Sum, 10.0); // ratio within 10% of ratio of coefficients BOOST_REQUIRE_CLOSE(c3Sum, c4Sum, 5.0); // c3 and c4 within 5% of each other // make sure the local approximation differs from the prediction always by the same bias (up to a numeric error) - BOOST_REQUIRE_SMALL(ml::maths::CBasicStatistics::variance(bias), 1e-6); + BOOST_REQUIRE_SMALL(maths::CBasicStatistics::variance(bias), 1e-6); } BOOST_FIXTURE_TEST_CASE(testRunBoostedTreeRegressionFeatureImportanceNoImportance, SFixture) { // Test that feature importance calculates low SHAP values if regressors have no weight. // We also add high noise variance. std::size_t topShapValues{4}; - auto results{runRegression(topShapValues, {10.0, 0.0, 0.0, 0.0}, 10.0)}; + auto results = runRegression(topShapValues, {10.0, 0.0, 0.0, 0.0}, 10.0); + TMeanAccumulator c2Mean, c3Mean, c4Mean; for (const auto& result : results.GetArray()) { if (result.HasMember("row_results")) { double c1{result["row_results"]["results"]["ml"][maths::CDataFrameRegressionModel::SHAP_PREFIX + "c1"] @@ -252,13 +252,20 @@ BOOST_FIXTURE_TEST_CASE(testRunBoostedTreeRegressionFeatureImportanceNoImportanc .GetDouble()}; double prediction{ result["row_results"]["results"]["ml"]["target_prediction"].GetDouble()}; - // c1 explain 97% of the prediction value, i.e. the difference from the prediction is less than 1%. - BOOST_REQUIRE_CLOSE(c1, prediction, 3.0); - BOOST_REQUIRE_SMALL(c2, 0.25); - BOOST_REQUIRE_SMALL(c3, 0.25); - BOOST_REQUIRE_SMALL(c4, 0.25); + // c1 explains 95% of the prediction value. 
+ BOOST_REQUIRE_CLOSE(c1, prediction, 5.0); + BOOST_REQUIRE_SMALL(c2, 2.0); + BOOST_REQUIRE_SMALL(c3, 2.0); + BOOST_REQUIRE_SMALL(c4, 2.0); + c2Mean.add(std::fabs(c2)); + c3Mean.add(std::fabs(c3)); + c4Mean.add(std::fabs(c4)); } } + + BOOST_REQUIRE_SMALL(maths::CBasicStatistics::mean(c2Mean), 0.1); + BOOST_REQUIRE_SMALL(maths::CBasicStatistics::mean(c3Mean), 0.1); + BOOST_REQUIRE_SMALL(maths::CBasicStatistics::mean(c4Mean), 0.1); } BOOST_FIXTURE_TEST_CASE(testRunBoostedTreeClassificationFeatureImportanceAllShap, SFixture) { @@ -314,7 +321,7 @@ BOOST_FIXTURE_TEST_CASE(testRunBoostedTreeClassificationFeatureImportanceAllShap BOOST_TEST_REQUIRE(c1Sum > c4Sum); BOOST_REQUIRE_CLOSE(c3Sum, c4Sum, 40.0); // c3 and c4 within 40% of each other // make sure the local approximation differs from the prediction always by the same bias (up to a numeric error) - BOOST_REQUIRE_SMALL(ml::maths::CBasicStatistics::variance(bias), 1e-6); + BOOST_REQUIRE_SMALL(maths::CBasicStatistics::variance(bias), 1e-6); } BOOST_FIXTURE_TEST_CASE(testRunBoostedTreeRegressionFeatureImportanceNoShap, SFixture) { diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index b6e311617e..fd6d916a31 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -46,6 +46,10 @@ const double MIN_DOWNSAMPLE_LINE_SEARCH_RANGE{2.0}; const double MAX_DOWNSAMPLE_LINE_SEARCH_RANGE{144.0}; const double MIN_DOWNSAMPLE_FACTOR_SCALE{0.3}; const double MAX_DOWNSAMPLE_FACTOR_SCALE{3.0}; +// This isn't a hard limit but we increase the number of default training folds +// if the initial downsample fraction would be larger than this. +const double MAX_DESIRED_INITIAL_DOWNSAMPLE_FRACTION{0.5}; +const double MAX_NUMBER_FOLDS{5.0}; const std::size_t MAX_NUMBER_TREES{static_cast(2.0 / MIN_ETA + 0.5)}; double computeEta(std::size_t numberRegressors) { @@ -93,11 +97,11 @@ CBoostedTreeFactory::buildFor(core::CDataFrame& frame, return nullptr; } - this->initializeTrainingProgressMonitoring(frame); - m_TreeImpl->m_DependentVariable = dependentVariable; m_TreeImpl->m_Loss = std::move(loss); + this->initializeNumberFolds(frame); + this->initializeTrainingProgressMonitoring(frame); this->initializeMissingFeatureMasks(frame); m_TreeImpl->m_NumberInputColumns = frame.numberColumns(); @@ -229,6 +233,58 @@ void CBoostedTreeFactory::initializeMissingFeatureMasks(const core::CDataFrame& } } +void CBoostedTreeFactory::initializeNumberFolds(core::CDataFrame& frame) const { + if (m_TreeImpl->m_NumberFoldsOverride == boost::none) { + auto result = frame.readRows( + m_NumberThreads, + core::bindRetrievableState( + [this](std::size_t& numberTrainingRows, TRowItr beginRows, TRowItr endRows) { + for (auto row = beginRows; row != endRows; ++row) { + double target{(*row)[m_TreeImpl->m_DependentVariable]}; + if (CDataFrameUtils::isMissing(target) == false) { + ++numberTrainingRows; + } + } + }, + std::size_t{0})); + std::size_t totalNumberTrainingRows{0}; + for (const auto& numberTrainingRows : result.first) { + totalNumberTrainingRows += numberTrainingRows.s_FunctionState; + } + LOG_TRACE(<< "total number training rows = " << totalNumberTrainingRows); + + // We want to choose the number of folds so we'll have enough training data + // after leaving out one fold. We choose the initial downsample size based + // on the same sort of criterion. So we require that leaving out one fold + // shouldn't mean that we have fewer rows than constant * desired downsample + // # rows if possible. 
We choose the constant to be two for no particularly + // good reason except that: + // 1. it isn't too large + // 2. it still means we'll have plenty of variation between random bags. + // + // In order to estimate this we use the number of input features as a proxy + // for the number of features we'll actually use after feature selection. + // + // So how does the following work: we'd like "c * f * # rows" training rows. + // For k folds we'll have "(1 - 1 / k) * # rows" training rows. So we want + // to find the smallest integer k s.t. c * f * # rows <= (1 - 1 / k) * # rows. + // This gives k = ceil(1 / (1 - c * f)). However, we also upper bound this + // by MAX_NUMBER_FOLDS. + + double initialDownsampleFraction{(m_InitialDownsampleRowsPerFeature * + static_cast(frame.numberColumns() - 1)) / + static_cast(totalNumberTrainingRows)}; + + m_TreeImpl->m_NumberFolds = static_cast( + std::ceil(1.0 / std::max(1.0 - initialDownsampleFraction / MAX_DESIRED_INITIAL_DOWNSAMPLE_FRACTION, + 1.0 / MAX_NUMBER_FOLDS))); + LOG_TRACE(<< "initial downsample fraction = " << initialDownsampleFraction + << " # folds = " << m_TreeImpl->m_NumberFolds); + } else { + m_TreeImpl->m_NumberFolds = *m_TreeImpl->m_NumberFoldsOverride; + } +} + void CBoostedTreeFactory::initializeCrossValidation(core::CDataFrame& frame) const { core::CPackedBitVector allTrainingRowsMask{m_TreeImpl->allTrainingRowsMask()}; @@ -346,10 +402,10 @@ void CBoostedTreeFactory::initializeHyperparameters(core::CDataFrame& frame) { } double numberFeatures{static_cast(m_TreeImpl->m_Encoder->numberEncodedColumns())}; - double downSampleFactor{m_InitialDownsampleRowsPerFeature * numberFeatures / + double downsampleFactor{m_InitialDownsampleRowsPerFeature * numberFeatures / m_TreeImpl->m_TrainingRowMasks[0].manhattan()}; m_TreeImpl->m_DownsampleFactor = m_TreeImpl->m_DownsampleFactorOverride.value_or( - CTools::truncate(downSampleFactor, 0.05, 0.5)); + CTools::truncate(downsampleFactor, 0.05, 0.5)); m_TreeImpl->m_Regularization .depthPenaltyMultiplier( @@ -922,7 +978,7 @@ CBoostedTreeFactory& CBoostedTreeFactory::numberFolds(std::size_t numberFolds) { LOG_WARN(<< "Must use at least two-folds for cross validation"); numberFolds = 2; } - m_TreeImpl->m_NumberFolds = numberFolds; + m_TreeImpl->m_NumberFoldsOverride = numberFolds; return *this; } @@ -931,6 +987,11 @@ CBoostedTreeFactory& CBoostedTreeFactory::stratifyRegressionCrossValidation(bool return *this; } +CBoostedTreeFactory& CBoostedTreeFactory::stopCrossValidationEarly(bool stopEarly) { + m_TreeImpl->m_StopCrossValidationEarly = stopEarly; + return *this; +} + CBoostedTreeFactory& CBoostedTreeFactory::initialDownsampleRowsPerFeature(double rowsPerFeature) { m_InitialDownsampleRowsPerFeature = rowsPerFeature; return *this; @@ -1063,7 +1124,9 @@ std::size_t CBoostedTreeFactory::estimateMemoryUsage(std::size_t numberRows, std::size_t numberColumns) const { std::size_t shapValuesExtraColumns = (m_TopShapValues > 0) ? numberRows * numberColumns * sizeof(CFloatStorage) : 0; - std::size_t maximumNumberTrees{this->mainLoopMaximumNumberTrees()}; + std::size_t maximumNumberTrees{this->mainLoopMaximumNumberTrees( + m_TreeImpl->m_EtaOverride != boost::none ? 
*m_TreeImpl->m_EtaOverride + : computeEta(numberColumns))}; std::swap(maximumNumberTrees, m_TreeImpl->m_MaximumNumberTrees); std::size_t result{m_TreeImpl->estimateMemoryUsage(numberRows, numberColumns) + shapValuesExtraColumns}; @@ -1112,7 +1175,7 @@ void CBoostedTreeFactory::initializeTrainingProgressMonitoring(const core::CData totalNumberSteps += MAX_LINE_SEARCH_ITERATIONS * lineSearchMaximumNumberTrees; } totalNumberSteps += (this->numberHyperparameterTuningRounds() + 1) * - this->mainLoopMaximumNumberTrees() * m_TreeImpl->m_NumberFolds; + this->mainLoopMaximumNumberTrees(eta) * m_TreeImpl->m_NumberFolds; LOG_TRACE(<< "total number steps = " << totalNumberSteps); m_TreeImpl->m_TrainingProgress = core::CLoopProgress{totalNumberSteps, m_RecordProgress, 1.0, 1024}; @@ -1123,9 +1186,9 @@ void CBoostedTreeFactory::resumeRestoredTrainingProgressMonitoring() { m_TreeImpl->m_TrainingProgress.resumeRestored(); } -std::size_t CBoostedTreeFactory::mainLoopMaximumNumberTrees() const { +std::size_t CBoostedTreeFactory::mainLoopMaximumNumberTrees(double eta) const { if (m_TreeImpl->m_MaximumNumberTreesOverride == boost::none) { - std::size_t maximumNumberTrees{computeMaximumNumberTrees(m_TreeImpl->m_Eta)}; + std::size_t maximumNumberTrees{computeMaximumNumberTrees(eta)}; maximumNumberTrees = scaleMaximumNumberTrees(maximumNumberTrees); return maximumNumberTrees; } diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 34fc38ff14..c296f17afe 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -28,6 +28,8 @@ using namespace boosted_tree; using namespace boosted_tree_detail; namespace { +using TStrVec = CBoostedTreeImpl::TStrVec; +using TMeanVarAccumulator = CBoostedTreeImpl::TMeanVarAccumulator; using TRowRef = core::CDataFrame::TRowRef; class CScopeRecordMemoryUsage { @@ -130,6 +132,11 @@ double readActual(const TRowRef& row, std::size_t dependentVariable) { return row[dependentVariable]; } +double lossAtNSigma(double n, const TMeanVarAccumulator& lossMoments) { + return CBasicStatistics::mean(lossMoments) + + n * std::sqrt(CBasicStatistics::variance(lossMoments)); +} + const std::size_t ASSIGN_MISSING_TO_LEFT{0}; const std::size_t ASSIGN_MISSING_TO_RIGHT{1}; const double SMALLEST_RELATIVE_CURVATURE{1e-20}; @@ -503,6 +510,8 @@ void CBoostedTreeImpl::train(core::CDataFrame& frame, // Hyperparameter optimisation loop. + this->initializePerFoldTestLosses(); + while (m_CurrentRound < m_NumberRounds) { LOG_TRACE(<< "Optimisation round = " << m_CurrentRound + 1); @@ -668,26 +677,69 @@ CBoostedTreeImpl::gainAndCurvatureAtPercentile(double percentile, return {gains[index], curvatures[index]}; } +void CBoostedTreeImpl::initializePerFoldTestLosses() { + m_FoldRoundTestLosses.resize(m_NumberFolds); + for (auto& losses : m_FoldRoundTestLosses) { + losses.resize(m_NumberRounds); + } +} + CBoostedTreeImpl::TMeanVarAccumulatorSizePr CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame, const TMemoryUsageCallback& recordMemoryUsage) { + + // We want to ensure we evaluate on equal proportions for each fold. + TSizeVec folds(m_NumberFolds); + std::iota(folds.begin(), folds.end(), 0); + CSampling::random_shuffle(m_Rng, folds.begin(), folds.end()); + + auto stopCrossValidationEarly = [&](TMeanVarAccumulator testLossMoments) { + // Always train on at least one fold and every fold for the first + // "number folds" rounds. Exit cross-validation early if it's clear + // that the test error is not close to the minimum test error. 
We use + // the estimated test error for each remaining fold at two standard + // deviations below the mean for this. + if (m_StopCrossValidationEarly && m_CurrentRound >= m_NumberFolds && + folds.size() < m_NumberFolds) { + for (const auto& testLoss : this->estimateMissingTestLosses(folds)) { + testLossMoments.add( + CBasicStatistics::mean(testLoss) - + 2.0 * std::sqrt(CBasicStatistics::maximumLikelihoodVariance(testLoss))); + } + return CBasicStatistics::mean(testLossMoments) > this->minimumTestLoss(); + } + return false; + }; + TMeanVarAccumulator lossMoments; - TDoubleVec numberTrees(m_NumberFolds); - for (std::size_t i = 0; i < m_NumberFolds; ++i) { + TDoubleVec numberTrees; + numberTrees.reserve(m_NumberFolds); + + while (folds.size() > 0 && stopCrossValidationEarly(lossMoments) == false) { + std::size_t fold{folds.back()}; + folds.pop_back(); TNodeVecVec forest; double loss; - std::tie(forest, loss) = - this->trainForest(frame, m_TrainingRowMasks[i], m_TestingRowMasks[i], - m_TrainingProgress, recordMemoryUsage); - LOG_TRACE(<< "fold = " << i << " forest size = " << forest.size() + std::tie(forest, loss) = this->trainForest( + frame, m_TrainingRowMasks[fold], m_TestingRowMasks[fold], + m_TrainingProgress, recordMemoryUsage); + LOG_TRACE(<< "fold = " << fold << " forest size = " << forest.size() << " test set loss = " << loss); lossMoments.add(loss); - numberTrees[i] = static_cast(forest.size()); + m_FoldRoundTestLosses[fold][m_CurrentRound] = loss; + numberTrees.push_back(static_cast(forest.size())); } + m_TrainingProgress.increment(m_MaximumNumberTrees * folds.size()); + LOG_TRACE(<< "skipped " << folds.size() << " folds"); + std::sort(numberTrees.begin(), numberTrees.end()); + std::size_t medianNumberTrees{ + static_cast(CBasicStatistics::median(numberTrees))}; + lossMoments = this->correctTestLossMoments(std::move(folds), lossMoments); LOG_TRACE(<< "test mean loss = " << CBasicStatistics::mean(lossMoments) << ", sigma = " << std::sqrt(CBasicStatistics::mean(lossMoments))); - return {lossMoments, static_cast(CBasicStatistics::median(numberTrees))}; + + return {lossMoments, medianNumberTrees}; } CBoostedTreeImpl::TNodeVec CBoostedTreeImpl::initializePredictionsAndLossDerivatives( @@ -992,6 +1044,131 @@ CBoostedTreeImpl::trainTree(core::CDataFrame& frame, return tree; } +double CBoostedTreeImpl::minimumTestLoss() const { + using TMinAccumulator = CBasicStatistics::SMin::TAccumulator; + TMinAccumulator minimumTestLoss; + for (std::size_t round = 0; round < m_CurrentRound - 1; ++round) { + TMeanVarAccumulator roundLossMoments; + for (std::size_t fold = 0; fold < m_NumberFolds; ++fold) { + if (m_FoldRoundTestLosses[fold][round] != boost::none) { + roundLossMoments.add(*m_FoldRoundTestLosses[fold][round]); + } + } + if (static_cast(CBasicStatistics::count(roundLossMoments)) == m_NumberFolds) { + minimumTestLoss.add(CBasicStatistics::mean(roundLossMoments)); + } + } + return minimumTestLoss[0]; +} + +TMeanVarAccumulator +CBoostedTreeImpl::correctTestLossMoments(const TSizeVec& missing, + TMeanVarAccumulator lossMoments) const { + if (missing.empty()) { + return lossMoments; + } + for (const auto& loss : this->estimateMissingTestLosses(missing)) { + lossMoments += loss; + } + return lossMoments; +} + +CBoostedTreeImpl::TMeanVarAccumulatorVec +CBoostedTreeImpl::estimateMissingTestLosses(const TSizeVec& missing) const { + + // We have a subset of folds for which we've computed test loss. 
We want to + // estimate the test loss we'll see for the remaining folds to decide if it + // is worthwhile to continue training with these parameters and to correct + // the loss value supplied to Bayesian Optimisation to account for the folds + // we haven't trained on. We tackle this problem as follows: + // 1. Find all previous rounds R which share at least one fold with the + // current round, i.e. one fold for which we've computed the actual + // loss for the current round parameters. + // 2. For each fold f_i for which we haven't estimated the loss in the + // current round fit an OLS model m_i to R to predict the loss of f_i. + // 3. Compute the predicted value for the test loss on each f_i given + // the test losses we've computed so far the current round using m_i. + // 4. Estimate the uncertainty from the variance of the residuals from + // fitting the model m_i to R. + // + // The feature vector we use is defined as: + // + // | calculated fold error 1 | + // | calculated fold error 2 | + // | ... | + // | 1{fold error 1 is present} | + // | 1{fold error 2 is present} | + // | ... | + // + // where the indices range over the folds for which we have errors in the + // current round. + + TSizeVec present(m_NumberFolds); + std::iota(present.begin(), present.end(), 0); + TSizeVec ordered{missing}; + std::sort(ordered.begin(), ordered.end()); + CSetTools::inplace_set_difference(present, ordered.begin(), ordered.end()); + LOG_TRACE(<< "present = " << core::CContainerPrinter::print(present)); + + // Get the current round feature vector. Fixed so computed outside the loop. + TVector x(2 * present.size()); + for (std::size_t col = 0; col < present.size(); ++col) { + x(col) = *m_FoldRoundTestLosses[present[col]][m_CurrentRound]; + x(present.size() + col) = 0.0; + } + + TMeanVarAccumulatorVec predictedTestLosses; + predictedTestLosses.reserve(missing.size()); + + for (std::size_t target : missing) { + // Extract the training mask. + TSizeVec trainingMask; + trainingMask.reserve(m_CurrentRound); + for (std::size_t round = 0; round < m_CurrentRound; ++round) { + if (m_FoldRoundTestLosses[target][round] && + std::find_if(present.begin(), present.end(), [&](std::size_t fold) { + return m_FoldRoundTestLosses[fold][round]; + }) != present.end()) { + trainingMask.push_back(round); + } + } + + // Fit the OLS regression. 
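// As a rough sketch of what this step amounts to (shown with Eigen types
// directly, whereas the code below goes through the maths library's dense
// matrix and vector wrappers), the fit for one untrained fold is an ordinary
// least squares solve followed by a residual variance estimate:
//
//   Eigen::MatrixXd A(rounds, 2 * present);  // losses of trained folds + missing indicators
//   Eigen::VectorXd b(rounds);               // the target fold's loss in those rounds
//   Eigen::VectorXd x(2 * present);          // the current round's feature vector
//   Eigen::VectorXd params = A.colPivHouseholderQr().solve(b);
//   double predictedLoss = params.dot(x);
//   Eigen::VectorXd residuals = b - A * params;
//   double variance = (residuals.array() - residuals.mean()).square().sum() /
//                     static_cast<double>(rounds);
//
// Here "rounds" stands for the usable previous rounds and "present" for the
// folds already trained this round. In other words, the prediction for a fold
// we haven't trained is a linear combination of the losses we have computed,
// and its uncertainty is the residual variance of that linear model over the
// previous rounds.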
+ CDenseMatrix A(trainingMask.size(), 2 * present.size()); + TVector b(trainingMask.size()); + for (std::size_t row = 0; row < trainingMask.size(); ++row) { + for (std::size_t col = 0; col < present.size(); ++col) { + if (m_FoldRoundTestLosses[present[col]][trainingMask[row]]) { + A(row, col) = *m_FoldRoundTestLosses[present[col]][trainingMask[row]]; + A(row, present.size() + col) = 0.0; + } else { + A(row, col) = 0.0; + A(row, present.size() + col) = 1.0; + } + } + b(row) = *m_FoldRoundTestLosses[target][trainingMask[row]]; + } + TVector params{A.colPivHouseholderQr().solve(b)}; + + TMeanVarAccumulator residualMoments; + for (int row = 0; row < A.rows(); ++row) { + residualMoments.add(b(row) - A.row(row) * params); + } + + double predictedTestLoss{params.transpose() * x}; + double predictedTestLossVariance{ + CBasicStatistics::maximumLikelihoodVariance(residualMoments)}; + LOG_TRACE(<< "prediction(x = " << x.transpose() << ", fold = " << target + << ") = (mean = " << predictedTestLoss + << ", variance = " << predictedTestLossVariance << ")"); + + predictedTestLosses.push_back(CBasicStatistics::momentsAccumulator( + 1.0, predictedTestLoss, predictedTestLossVariance)); + } + + return predictedTestLosses; +} + std::size_t CBoostedTreeImpl::numberFeatures() const { return m_Encoder->numberEncodedColumns(); } @@ -1237,8 +1414,7 @@ void CBoostedTreeImpl::captureBestHyperparameters(const TMeanVarAccumulator& los // We capture the parameters with the lowest error at one standard // deviation above the mean. If the mean error improvement is marginal // we prefer the solution with the least variation across the folds. - double loss{CBasicStatistics::mean(lossMoments) + - std::sqrt(CBasicStatistics::variance(lossMoments))}; + double loss{lossAtNSigma(1.0, lossMoments)}; if (loss < m_BestForestTestLoss) { m_BestForestTestLoss = loss; m_BestHyperparameters = CBoostedTreeHyperparameters{ @@ -1282,6 +1458,7 @@ const std::size_t CBoostedTreeImpl::PACKED_BIT_VECTOR_MAXIMUM_ROWS_PER_BYTE{256} namespace { const std::string VERSION_7_5_TAG{"7.5"}; const std::string VERSION_7_6_TAG{"7.6"}; +const TStrVec SUPPORTED_VERSIONS{VERSION_7_5_TAG, VERSION_7_6_TAG}; const std::string BAYESIAN_OPTIMIZATION_TAG{"bayesian_optimization"}; const std::string BEST_FOREST_TAG{"best_forest"}; @@ -1298,6 +1475,7 @@ const std::string FEATURE_BAG_FRACTION_OVERRIDE_TAG{"feature_bag_fraction_overri const std::string FEATURE_BAG_FRACTION_TAG{"feature_bag_fraction"}; const std::string FEATURE_DATA_TYPES_TAG{"feature_data_types"}; const std::string FEATURE_SAMPLE_PROBABILITIES_TAG{"feature_sample_probabilities"}; +const std::string FOLD_ROUND_TEST_LOSSES_TAG{"fold_round_test_losses"}; const std::string LOSS_TAG{"loss"}; const std::string MAXIMUM_ATTEMPTS_TO_ADD_TREE_TAG{"maximum_attempts_to_add_tree"}; const std::string MAXIMUM_NUMBER_TREES_OVERRIDE_TAG{"maximum_number_trees_override"}; @@ -1306,6 +1484,7 @@ const std::string MAXIMUM_OPTIMISATION_ROUNDS_PER_HYPERPARAMETER_TAG{ "maximum_optimisation_rounds_per_hyperparameter"}; const std::string MISSING_FEATURE_ROW_MASKS_TAG{"missing_feature_row_masks"}; const std::string NUMBER_FOLDS_TAG{"number_folds"}; +const std::string NUMBER_FOLDS_OVERRIDE_TAG{"number_folds_override"}; const std::string NUMBER_ROUNDS_TAG{"number_rounds"}; const std::string NUMBER_SPLITS_PER_FEATURE_TAG{"number_splits_per_feature"}; const std::string NUMBER_THREADS_TAG{"number_threads"}; @@ -1313,6 +1492,7 @@ const std::string RANDOM_NUMBER_GENERATOR_TAG{"random_number_generator"}; const std::string 
REGULARIZATION_TAG{"regularization"}; const std::string REGULARIZATION_OVERRIDE_TAG{"regularization_override"}; const std::string ROWS_PER_FEATURE_TAG{"rows_per_feature"}; +const std::string STOP_CROSS_VALIDATION_EARLY_TAG{"stop_cross_validation_early"}; const std::string TESTING_ROW_MASKS_TAG{"testing_row_masks"}; const std::string TRAINING_ROW_MASKS_TAG{"training_row_masks"}; const std::string TRAINING_PROGRESS_TAG{"training_progress"}; @@ -1356,6 +1536,7 @@ void CBoostedTreeImpl::acceptPersistInserter(core::CStatePersistInserter& insert core::CPersistUtils::persist(FEATURE_DATA_TYPES_TAG, m_FeatureDataTypes, inserter); core::CPersistUtils::persist(FEATURE_SAMPLE_PROBABILITIES_TAG, m_FeatureSampleProbabilities, inserter); + core::CPersistUtils::persist(FOLD_ROUND_TEST_LOSSES_TAG, m_FoldRoundTestLosses, inserter); core::CPersistUtils::persist(MAXIMUM_ATTEMPTS_TO_ADD_TREE_TAG, m_MaximumAttemptsToAddTree, inserter); core::CPersistUtils::persist(MAXIMUM_OPTIMISATION_ROUNDS_PER_HYPERPARAMETER_TAG, @@ -1363,6 +1544,7 @@ void CBoostedTreeImpl::acceptPersistInserter(core::CStatePersistInserter& insert core::CPersistUtils::persist(MISSING_FEATURE_ROW_MASKS_TAG, m_MissingFeatureRowMasks, inserter); core::CPersistUtils::persist(NUMBER_FOLDS_TAG, m_NumberFolds, inserter); + core::CPersistUtils::persist(NUMBER_FOLDS_OVERRIDE_TAG, m_NumberFoldsOverride, inserter); core::CPersistUtils::persist(NUMBER_ROUNDS_TAG, m_NumberRounds, inserter); core::CPersistUtils::persist(NUMBER_SPLITS_PER_FEATURE_TAG, m_NumberSplitsPerFeature, inserter); @@ -1372,6 +1554,8 @@ void CBoostedTreeImpl::acceptPersistInserter(core::CStatePersistInserter& insert m_RegularizationOverride, inserter); core::CPersistUtils::persist(REGULARIZATION_TAG, m_Regularization, inserter); core::CPersistUtils::persist(ROWS_PER_FEATURE_TAG, m_RowsPerFeature, inserter); + core::CPersistUtils::persist(STOP_CROSS_VALIDATION_EARLY_TAG, + m_StopCrossValidationEarly, inserter); core::CPersistUtils::persist(TESTING_ROW_MASKS_TAG, m_TestingRowMasks, inserter); core::CPersistUtils::persist(MAXIMUM_NUMBER_TREES_TAG, m_MaximumNumberTrees, inserter); core::CPersistUtils::persist(TRAINING_ROW_MASKS_TAG, m_TrainingRowMasks, inserter); @@ -1395,10 +1579,13 @@ bool CBoostedTreeImpl::acceptRestoreTraverser(core::CStateRestoreTraverser& trav m_DownsampleFactorOverride = 1.0; m_DownsampleFactor = 1.0; m_BestHyperparameters.downsampleFactor(1.0); + // We can't stop cross-validation early because we haven't gathered the + // per fold test losses. + m_StopCrossValidationEarly = false; } else if (traverser.name() != VERSION_7_6_TAG) { LOG_ERROR(<< "Input error: unsupported state serialization version. 
" - << "Currently supported versions: " << VERSION_7_5_TAG - << " and " << VERSION_7_6_TAG << "."); + << "Currently supported versions: " + << core::CContainerPrinter::print(SUPPORTED_VERSIONS) << "."); return false; } @@ -1444,6 +1631,9 @@ bool CBoostedTreeImpl::acceptRestoreTraverser(core::CStateRestoreTraverser& trav m_MissingFeatureRowMasks, traverser)) RESTORE(NUMBER_FOLDS_TAG, core::CPersistUtils::restore(NUMBER_FOLDS_TAG, m_NumberFolds, traverser)) + RESTORE(NUMBER_FOLDS_OVERRIDE_TAG, + core::CPersistUtils::restore(NUMBER_FOLDS_OVERRIDE_TAG, + m_NumberFoldsOverride, traverser)) RESTORE(NUMBER_ROUNDS_TAG, core::CPersistUtils::restore(NUMBER_ROUNDS_TAG, m_NumberRounds, traverser)) RESTORE(NUMBER_SPLITS_PER_FEATURE_TAG, @@ -1459,6 +1649,9 @@ bool CBoostedTreeImpl::acceptRestoreTraverser(core::CStateRestoreTraverser& trav m_RegularizationOverride, traverser)) RESTORE(ROWS_PER_FEATURE_TAG, core::CPersistUtils::restore(ROWS_PER_FEATURE_TAG, m_RowsPerFeature, traverser)) + RESTORE(STOP_CROSS_VALIDATION_EARLY_TAG, + core::CPersistUtils::restore(STOP_CROSS_VALIDATION_EARLY_TAG, + m_StopCrossValidationEarly, traverser)) RESTORE(TESTING_ROW_MASKS_TAG, core::CPersistUtils::restore(TESTING_ROW_MASKS_TAG, m_TestingRowMasks, traverser)) RESTORE(MAXIMUM_NUMBER_TREES_TAG, @@ -1514,6 +1707,7 @@ std::size_t CBoostedTreeImpl::memoryUsage() const { mem += core::CMemory::dynamicSize(m_MissingFeatureRowMasks); mem += core::CMemory::dynamicSize(m_TrainingRowMasks); mem += core::CMemory::dynamicSize(m_TestingRowMasks); + mem += core::CMemory::dynamicSize(m_FoldRoundTestLosses); mem += core::CMemory::dynamicSize(m_BestForest); mem += core::CMemory::dynamicSize(m_BayesianOptimization); return mem; diff --git a/lib/maths/CTreeShapFeatureImportance.cc b/lib/maths/CTreeShapFeatureImportance.cc index 10dd1e4168..88ea00d979 100644 --- a/lib/maths/CTreeShapFeatureImportance.cc +++ b/lib/maths/CTreeShapFeatureImportance.cc @@ -126,12 +126,10 @@ void CTreeShapFeatureImportance::shapRecursive(const TTree& tree, parentFractionOne, parentFeatureIndex); if (tree[nodeIndex].isLeaf()) { double leafValue = tree[nodeIndex].value(); - for (std::size_t i = 1; i <= splitPath.depth(); ++i) { + for (int i = 1; i <= splitPath.depth(); ++i) { double scale = CTreeShapFeatureImportance::sumUnwoundPath(splitPath, i); std::size_t inputColumnIndex{ - encoder - .encoding(static_cast(splitPath.featureIndex(i))) - .inputColumnIndex()}; + encoder.encoding(splitPath.featureIndex(i)).inputColumnIndex()}; // inputColumnIndex is read by seeing what the feature at position i is on the path to this leaf. // fractionOnes(i) is an indicator variable which tells us if we condition on this variable // do we visit this path from that node or not, fractionZeros(i) tells us what proportion of