[7.6][ML] Stop cross-validation early if the parameters have high predicted test loss (#931)

Backport #915.
tveasey committed Jan 13, 2020
1 parent 94d7534 commit c9043dd
Showing 10 changed files with 346 additions and 55 deletions.
2 changes: 2 additions & 0 deletions docs/CHANGELOG.asciidoc
@@ -57,6 +57,8 @@ is no longer decreasing. (See {ml-pull}875[#875].)
* Improve performance updating quantile estimates. (See {ml-pull}881[#881].)
* Migrate to use Bayesian Optimisation for initial hyperparameter value line searches and
stop early if the expected improvement is too small. (See {ml-pull}903[#903].)
* Stop cross-validation early if the predicted test loss has a small chance of being
smaller than for the best parameter values found so far. (See {ml-pull}915[#915].)

=== Bug Fixes
* Fixes potential memory corruption when determining seasonality. (See {ml-pull}852[#852].)
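The change can be pictured as a significance test on partially observed cross-validation losses: after training on a subset of folds, the candidate hyperparameters are abandoned if they are unlikely to beat the best result so far. The following is a minimal sketch of that idea, assuming per-fold test losses are roughly normal; it is not the actual implementation, which lives in CBoostedTreeImpl and is more careful about estimating the skipped folds' losses.

#include <algorithm>
#include <cmath>
#include <vector>

// Hypothetical sketch: decide whether the current hyperparameters are worth
// training on the remaining folds. Illustration only, not the real logic.
bool worthTrainingRemainingFolds(const std::vector<double>& foldTestLosses,
                                 double bestMeanTestLoss,
                                 double significance = 0.1) {
    double n{static_cast<double>(foldTestLosses.size())};
    double mean{0.0};
    for (double loss : foldTestLosses) {
        mean += loss / n;
    }
    double variance{0.0};
    for (double loss : foldTestLosses) {
        variance += (loss - mean) * (loss - mean) / std::max(n - 1.0, 1.0);
    }
    // P(true mean test loss < best so far) under a normal approximation of
    // the sample mean; stop early if this probability is too small.
    double z{(bestMeanTestLoss - mean) / std::sqrt(variance / n + 1e-12)};
    double probabilityBetter{0.5 * std::erfc(-z / std::sqrt(2.0))};
    return probabilityBetter >= significance;
}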
1 change: 1 addition & 0 deletions include/api/CDataFrameTrainBoostedTreeRunner.h
@@ -44,6 +44,7 @@ class API_EXPORT CDataFrameTrainBoostedTreeRunner : public CDataFrameAnalysisRun
static const std::string MAXIMUM_NUMBER_TREES;
static const std::string FEATURE_BAG_FRACTION;
static const std::string NUMBER_FOLDS;
static const std::string STOP_CROSS_VALIDATION_EARLY;
static const std::string NUMBER_ROUNDS_PER_HYPERPARAMETER;
static const std::string BAYESIAN_OPTIMISATION_RESTARTS;
static const std::string TOP_FEATURE_IMPORTANCE_VALUES;
9 changes: 7 additions & 2 deletions include/maths/CBoostedTreeFactory.h
@@ -58,8 +58,10 @@ class MATHS_EXPORT CBoostedTreeFactory final {
CBoostedTreeFactory& minimumFrequencyToOneHotEncode(double frequency);
//! Set the number of folds to use for estimating the generalisation error.
CBoostedTreeFactory& numberFolds(std::size_t numberFolds);
//! Stratify the cross validation we do for regression.
//! Stratify the cross-validation we do for regression.
CBoostedTreeFactory& stratifyRegressionCrossValidation(bool stratify);
//! Stop cross-validation early if the test loss is not promising.
CBoostedTreeFactory& stopCrossValidationEarly(bool stopEarly);
//! The number of rows per feature to sample in the initial downsample.
CBoostedTreeFactory& initialDownsampleRowsPerFeature(double rowsPerFeature);
//! Set the sum of leaf depth penalties multiplier.
@@ -133,6 +135,9 @@ class MATHS_EXPORT CBoostedTreeFactory final {
//! Compute the row masks for the missing values for each feature.
void initializeMissingFeatureMasks(const core::CDataFrame& frame) const;

//! Set up the number of folds we'll use for cross-validation.
void initializeNumberFolds(core::CDataFrame& frame) const;

//! Set up cross validation.
void initializeCrossValidation(core::CDataFrame& frame) const;

@@ -187,7 +192,7 @@
void resumeRestoredTrainingProgressMonitoring();

//! The maximum number of trees to use in the hyperparameter optimisation loop.
std::size_t mainLoopMaximumNumberTrees() const;
std::size_t mainLoopMaximumNumberTrees(double eta) const;

static void noopRecordProgress(double);
static void noopRecordMemoryUsage(std::int64_t);
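Taken together, the factory changes expose a single new public switch. A hypothetical usage sketch follows: constructFromParameters, the setter names, and the chaining style all appear in this diff, while the data frame setup and the buildFor call are assumptions about the surrounding API.

// Illustrative only: configure a factory with early stopping enabled.
auto factory = maths::CBoostedTreeFactory::constructFromParameters(numberThreads);
factory.numberFolds(5)
    .stratifyRegressionCrossValidation(true)
    .stopCrossValidationEarly(true); // new: abandon unpromising candidates early
// buildFor is assumed here for illustration; frame is the training data frame.
auto boostedTree = factory.buildFor(frame, dependentVariableColumn);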
27 changes: 23 additions & 4 deletions include/maths/CBoostedTreeImpl.h
@@ -53,9 +53,11 @@ class MATHS_EXPORT CBoostedTreeImpl final {
using TMeanAccumulator = CBasicStatistics::SSampleMean<double>::TAccumulator;
using TMeanVarAccumulator = CBasicStatistics::SSampleMeanVar<double>::TAccumulator;
using TMeanVarAccumulatorSizePr = std::pair<TMeanVarAccumulator, std::size_t>;
using TMeanVarAccumulatorVec = std::vector<TMeanVarAccumulator>;
using TBayesinOptimizationUPtr = std::unique_ptr<maths::CBayesianOptimisation>;
using TNodeVec = CBoostedTree::TNodeVec;
using TNodeVecVec = CBoostedTree::TNodeVecVec;
using TLossFunctionUPtr = CBoostedTree::TLossFunctionUPtr;
using TProgressCallback = CBoostedTree::TProgressCallback;
using TMemoryUsageCallback = CBoostedTree::TMemoryUsageCallback;
using TTrainingStateCallback = CBoostedTree::TTrainingStateCallback;
@@ -68,7 +70,7 @@ class MATHS_EXPORT CBoostedTreeImpl final {
static const double MINIMUM_RELATIVE_GAIN_PER_SPLIT;

public:
CBoostedTreeImpl(std::size_t numberThreads, CBoostedTree::TLossFunctionUPtr loss);
CBoostedTreeImpl(std::size_t numberThreads, TLossFunctionUPtr loss);

~CBoostedTreeImpl();

@@ -152,6 +154,8 @@
private:
using TSizeDoublePr = std::pair<std::size_t, double>;
using TDoubleDoublePr = std::pair<double, double>;
using TOptionalDoubleVec = std::vector<TOptionalDouble>;
using TOptionalDoubleVecVec = std::vector<TOptionalDoubleVec>;
using TOptionalSize = boost::optional<std::size_t>;
using TImmutableRadixSetVec = std::vector<core::CImmutableRadixSet<double>>;
using TVector = CDenseVector<double>;
@@ -416,6 +420,9 @@ class MATHS_EXPORT CBoostedTreeImpl final {
TDoubleDoublePr gainAndCurvatureAtPercentile(double percentile,
const TNodeVecVec& forest) const;

//! Presize the collection to hold the per fold test errors.
void initializePerFoldTestLosses();

//! Train the forest and compute loss moments on each fold.
TMeanVarAccumulatorSizePr crossValidateForest(core::CDataFrame& frame,
const TMemoryUsageCallback& recordMemoryUsage);
@@ -447,6 +454,16 @@
const std::size_t maximumTreeSize,
const TMemoryUsageCallback& recordMemoryUsage) const;

//! Compute the minimum mean test loss per fold for any round.
double minimumTestLoss() const;

//! Estimate the loss we'll get including the missing folds.
TMeanVarAccumulator correctTestLossMoments(const TSizeVec& missing,
TMeanVarAccumulator lossMoments) const;

//! Estimate test losses for the \p missing folds.
TMeanVarAccumulatorVec estimateMissingTestLosses(const TSizeVec& missing) const;

//! Get the number of features including category encoding.
std::size_t numberFeatures() const;

@@ -503,8 +520,7 @@ class MATHS_EXPORT CBoostedTreeImpl final {
std::size_t maximumTreeSize(std::size_t numberRows) const;

//! Restore \p loss function pointer from the \p traverser.
static bool restoreLoss(CBoostedTree::TLossFunctionUPtr& loss,
core::CStateRestoreTraverser& traverser);
static bool restoreLoss(TLossFunctionUPtr& loss, core::CStateRestoreTraverser& traverser);

//! Record the training state using the \p recordTrainState callback function
void recordState(const TTrainingStateCallback& recordTrainState) const;
@@ -513,10 +529,12 @@ class MATHS_EXPORT CBoostedTreeImpl final {
mutable CPRNG::CXorOShiro128Plus m_Rng;
std::size_t m_NumberThreads;
std::size_t m_DependentVariable = std::numeric_limits<std::size_t>::max();
CBoostedTree::TLossFunctionUPtr m_Loss;
TLossFunctionUPtr m_Loss;
bool m_StopCrossValidationEarly = true;
TRegularizationOverride m_RegularizationOverride;
TOptionalDouble m_DownsampleFactorOverride;
TOptionalDouble m_EtaOverride;
TOptionalSize m_NumberFoldsOverride;
TOptionalSize m_MaximumNumberTreesOverride;
TOptionalDouble m_FeatureBagFractionOverride;
TRegularization m_Regularization;
@@ -537,6 +555,7 @@ class MATHS_EXPORT CBoostedTreeImpl final {
TPackedBitVectorVec m_TrainingRowMasks;
TPackedBitVectorVec m_TestingRowMasks;
double m_BestForestTestLoss = INF;
TOptionalDoubleVecVec m_FoldRoundTestLosses;
CBoostedTreeHyperparameters m_BestHyperparameters;
TNodeVecVec m_BestForest;
TBayesinOptimizationUPtr m_BayesianOptimization;
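The new members and methods describe the bookkeeping behind early stopping: test losses are recorded per fold for every hyperparameter round, and when a round is cut short the skipped folds' losses are estimated rather than measured (see estimateMissingTestLosses and correctTestLossMoments above). A minimal illustration of the data structure, with std::optional standing in for boost::optional and a deliberately crude estimator:

#include <cstddef>
#include <optional>
#include <vector>

// Illustrative layout for m_FoldRoundTestLosses: one entry per fold holding
// the test loss for each round, or nullopt for rounds where the fold was
// skipped. The fold-major layout is an assumption based on the name.
using TOptionalDouble = std::optional<double>;
using TFoldRoundTestLosses = std::vector<std::vector<TOptionalDouble>>;

// Crude stand-in for estimateMissingTestLosses: predict a skipped fold's loss
// as its mean over the rounds where it was evaluated. The real method models
// relationships between the folds' losses instead.
double estimateMissingTestLoss(const TFoldRoundTestLosses& losses, std::size_t fold) {
    double sum{0.0};
    std::size_t count{0};
    for (const auto& loss : losses[fold]) {
        if (loss) {
            sum += *loss;
            ++count;
        }
    }
    return count > 0 ? sum / static_cast<double>(count) : 0.0;
}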
14 changes: 6 additions & 8 deletions include/maths/CTreeShapFeatureImportance.h
@@ -63,8 +63,7 @@ class MATHS_EXPORT CTreeShapFeatureImportance {
struct SPath {
explicit SPath(std::size_t length)
: s_FractionOnes(length), s_FractionZeros(length),
s_FeatureIndex(length, -1), s_Scale(length), s_NextIndex(0),
s_MaxLength(length) {}
s_FeatureIndex(length, -1), s_Scale(length), s_MaxLength(length) {}

void extend(int featureIndex, double fractionZero, double fractionOne) {
if (s_NextIndex < s_MaxLength) {
@@ -81,7 +80,7 @@
}

void reduce(std::size_t pathIndex) {
for (std::size_t i = pathIndex; i < this->depth(); ++i) {
for (int i = static_cast<int>(pathIndex); i < this->depth(); ++i) {
s_FeatureIndex[i] = s_FeatureIndex[i + 1];
s_FractionZeros[i] = s_FractionZeros[i + 1];
s_FractionOnes[i] = s_FractionOnes[i + 1];
@@ -107,10 +106,10 @@
double scale(std::size_t pathIndex) const { return s_Scale[pathIndex]; }

//! Current depth in the tree
int depth() const { return static_cast<int>(s_NextIndex) - 1; };
int depth() const { return static_cast<int>(s_NextIndex) - 1; }

//! Get next index.
size_t nextIndex() const { return s_NextIndex; }
std::size_t nextIndex() const { return s_NextIndex; }

//! Set next index.
void nextIndex(std::size_t nextIndex) { s_NextIndex = nextIndex; }
@@ -119,9 +118,8 @@
TDoubleVec s_FractionZeros;
TIntVec s_FeatureIndex;
TDoubleVec s_Scale;
std::size_t s_NextIndex;

std::size_t s_MaxLength;
std::size_t s_NextIndex = 0;
std::size_t s_MaxLength = 0;
};

private:
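One of the fixes above is subtle: SPath::reduce previously used a std::size_t loop index compared against the int returned by depth(), which can legitimately be -1 for an empty path. A self-contained demonstration of why the index had to become signed:

#include <cstddef>
#include <iostream>

// The signed/unsigned pitfall fixed in SPath::reduce: comparing an unsigned
// index with a negative int converts the int to a huge unsigned value.
int main() {
    int depth{-1};
    std::size_t i{0};
    std::cout << (i < static_cast<std::size_t>(depth)) << '\n'; // prints 1: -1 wraps to SIZE_MAX
    std::cout << (static_cast<int>(i) < depth) << '\n';         // prints 0: behaves as intended
}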
6 changes: 5 additions & 1 deletion lib/api/CDataFrameTrainBoostedTreeRunner.cc
@@ -51,6 +51,8 @@ const CDataFrameAnalysisConfigReader& CDataFrameTrainBoostedTreeRunner::paramete
theReader.addParameter(FEATURE_BAG_FRACTION,
CDataFrameAnalysisConfigReader::E_OptionalParameter);
theReader.addParameter(NUMBER_FOLDS, CDataFrameAnalysisConfigReader::E_OptionalParameter);
theReader.addParameter(STOP_CROSS_VALIDATION_EARLY,
CDataFrameAnalysisConfigReader::E_OptionalParameter);
theReader.addParameter(NUMBER_ROUNDS_PER_HYPERPARAMETER,
CDataFrameAnalysisConfigReader::E_OptionalParameter);
theReader.addParameter(BAYESIAN_OPTIMISATION_RESTARTS,
@@ -82,6 +84,7 @@ CDataFrameTrainBoostedTreeRunner::CDataFrameTrainBoostedTreeRunner(
parameters[NUMBER_ROUNDS_PER_HYPERPARAMETER].fallback(std::size_t{0})};
std::size_t bayesianOptimisationRestarts{
parameters[BAYESIAN_OPTIMISATION_RESTARTS].fallback(std::size_t{0})};
bool stopCrossValidationEarly{parameters[STOP_CROSS_VALIDATION_EARLY].fallback(true)};
std::size_t topFeatureImportanceValues{
parameters[TOP_FEATURE_IMPORTANCE_VALUES].fallback(std::size_t{0})};

@@ -120,6 +123,7 @@ CDataFrameTrainBoostedTreeRunner::CDataFrameTrainBoostedTreeRunner(
maths::CBoostedTreeFactory::constructFromParameters(this->spec().numberThreads()));

(*m_BoostedTreeFactory)
.stopCrossValidationEarly(stopCrossValidationEarly)
.progressCallback(this->progressRecorder())
.trainingStateCallback(this->statePersister())
.memoryUsageCallback(this->memoryMonitor(counter_t::E_DFTPMPeakMemoryUsage));
@@ -309,10 +313,10 @@ const std::string CDataFrameTrainBoostedTreeRunner::SOFT_TREE_DEPTH_TOLERANCE{"s
const std::string CDataFrameTrainBoostedTreeRunner::MAXIMUM_NUMBER_TREES{"maximum_number_trees"};
const std::string CDataFrameTrainBoostedTreeRunner::FEATURE_BAG_FRACTION{"feature_bag_fraction"};
const std::string CDataFrameTrainBoostedTreeRunner::NUMBER_FOLDS{"number_folds"};
const std::string CDataFrameTrainBoostedTreeRunner::STOP_CROSS_VALIDATION_EARLY{"stop_cross_validation_early"};
const std::string CDataFrameTrainBoostedTreeRunner::NUMBER_ROUNDS_PER_HYPERPARAMETER{"number_rounds_per_hyperparameter"};
const std::string CDataFrameTrainBoostedTreeRunner::BAYESIAN_OPTIMISATION_RESTARTS{"bayesian_optimisation_restarts"};
const std::string CDataFrameTrainBoostedTreeRunner::TOP_FEATURE_IMPORTANCE_VALUES{"top_feature_importance_values"};

// clang-format on
}
}
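The runner wires the new parameter through with a default of enabled, so early stopping is on unless explicitly disabled. A minimal sketch of that fallback pattern, with a plain map standing in for the CDataFrameAnalysisConfigReader parameter set:

#include <map>
#include <string>

// Sketch of the optional-parameter fallback used above: an absent parameter
// takes its default. The map is a stand-in for the real parameter reader.
bool readStopCrossValidationEarly(const std::map<std::string, bool>& parameters) {
    auto itr = parameters.find("stop_cross_validation_early");
    return itr != parameters.end() ? itr->second : true;
}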
37 changes: 22 additions & 15 deletions lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc
@@ -25,14 +25,13 @@ BOOST_AUTO_TEST_SUITE(CDataFrameAnalyzerFeatureImportanceTest)
using namespace ml;

namespace {
using TBoolVec = std::vector<bool>;
using TSizeVec = std::vector<std::size_t>;
using TRowItr = core::CDataFrame::TRowItr;
using TRowRef = core::CDataFrame::TRowRef;
using TDataFrameUPtr = std::unique_ptr<core::CDataFrame>;
using TDoubleVec = std::vector<double>;
using TStrVec = std::vector<std::string>;
using TMeanVarAccumulator = ml::maths::CBasicStatistics::SSampleMeanVar<double>::TAccumulator;
using TRowItr = core::CDataFrame::TRowItr;
using TRowRef = core::CDataFrame::TRowRef;
using TMeanAccumulator = maths::CBasicStatistics::SSampleMean<double>::TAccumulator;
using TMeanAccumulatorVec = std::vector<TMeanAccumulator>;
using TMeanVarAccumulator = maths::CBasicStatistics::SSampleMeanVar<double>::TAccumulator;

void setupLinearRegressionData(const TStrVec& fieldNames,
TStrVec& fieldValues,
@@ -228,18 +227,19 @@ BOOST_FIXTURE_TEST_CASE(testRunBoostedTreeRegressionFeatureImportanceAllShap, SF
// randomly on [-10, 10].
BOOST_TEST_REQUIRE(c1Sum > c3Sum);
BOOST_TEST_REQUIRE(c1Sum > c4Sum);
BOOST_REQUIRE_CLOSE(weights[1] / weights[2], c2Sum / c3Sum, 5.0); // ratio within 5% of ratio of coefficients
BOOST_REQUIRE_CLOSE(weights[1] / weights[2], c2Sum / c3Sum, 10.0); // ratio within 10% of ratio of coefficients
BOOST_REQUIRE_CLOSE(c3Sum, c4Sum, 5.0); // c3 and c4 within 5% of each other
// make sure the local approximation differs from the prediction always by the same bias (up to a numeric error)
BOOST_REQUIRE_SMALL(ml::maths::CBasicStatistics::variance(bias), 1e-6);
BOOST_REQUIRE_SMALL(maths::CBasicStatistics::variance(bias), 1e-6);
}

BOOST_FIXTURE_TEST_CASE(testRunBoostedTreeRegressionFeatureImportanceNoImportance, SFixture) {
// Test that feature importance calculates low SHAP values if regressors have no weight.
// We also add high noise variance.
std::size_t topShapValues{4};
auto results{runRegression(topShapValues, {10.0, 0.0, 0.0, 0.0}, 10.0)};
auto results = runRegression(topShapValues, {10.0, 0.0, 0.0, 0.0}, 10.0);

TMeanAccumulator c2Mean, c3Mean, c4Mean;
for (const auto& result : results.GetArray()) {
if (result.HasMember("row_results")) {
double c1{result["row_results"]["results"]["ml"][maths::CDataFrameRegressionModel::SHAP_PREFIX + "c1"]
@@ -252,13 +252,20 @@ BOOST_FIXTURE_TEST_CASE(testRunBoostedTreeRegressionFeatureImportanceNoImportanc
.GetDouble()};
double prediction{
result["row_results"]["results"]["ml"]["target_prediction"].GetDouble()};
// c1 explain 97% of the prediction value, i.e. the difference from the prediction is less than 1%.
BOOST_REQUIRE_CLOSE(c1, prediction, 3.0);
BOOST_REQUIRE_SMALL(c2, 0.25);
BOOST_REQUIRE_SMALL(c3, 0.25);
BOOST_REQUIRE_SMALL(c4, 0.25);
// c1 explains 95% of the prediction value.
BOOST_REQUIRE_CLOSE(c1, prediction, 5.0);
BOOST_REQUIRE_SMALL(c2, 2.0);
BOOST_REQUIRE_SMALL(c3, 2.0);
BOOST_REQUIRE_SMALL(c4, 2.0);
c2Mean.add(std::fabs(c2));
c3Mean.add(std::fabs(c3));
c4Mean.add(std::fabs(c4));
}
}

BOOST_REQUIRE_SMALL(maths::CBasicStatistics::mean(c2Mean), 0.1);
BOOST_REQUIRE_SMALL(maths::CBasicStatistics::mean(c3Mean), 0.1);
BOOST_REQUIRE_SMALL(maths::CBasicStatistics::mean(c4Mean), 0.1);
}

BOOST_FIXTURE_TEST_CASE(testRunBoostedTreeClassificationFeatureImportanceAllShap, SFixture) {
@@ -314,7 +321,7 @@ BOOST_FIXTURE_TEST_CASE(testRunBoostedTreeClassificationFeatureImportanceAllShap
BOOST_TEST_REQUIRE(c1Sum > c4Sum);
BOOST_REQUIRE_CLOSE(c3Sum, c4Sum, 40.0); // c3 and c4 within 40% of each other
// make sure the local approximation differs from the prediction always by the same bias (up to a numeric error)
BOOST_REQUIRE_SMALL(ml::maths::CBasicStatistics::variance(bias), 1e-6);
BOOST_REQUIRE_SMALL(maths::CBasicStatistics::variance(bias), 1e-6);
}

BOOST_FIXTURE_TEST_CASE(testRunBoostedTreeRegressionFeatureImportanceNoShap, SFixture) {
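The regression and classification tests both finish by checking the SHAP additivity property: each row's attributions should sum to the prediction up to a constant bias, so the variance of that bias across rows must be negligible. A self-contained sketch of the check; shapValues and predictions are hypothetical inputs.

#include <cstddef>
#include <vector>

// Additivity check mirroring the tests above: compute the variance of
// (prediction - sum of SHAP values) across rows.
// shapValues[i] holds row i's per-feature attributions.
double biasVariance(const std::vector<std::vector<double>>& shapValues,
                    const std::vector<double>& predictions) {
    std::vector<double> bias;
    for (std::size_t i = 0; i < predictions.size(); ++i) {
        double sum{0.0};
        for (double value : shapValues[i]) {
            sum += value;
        }
        bias.push_back(predictions[i] - sum);
    }
    double mean{0.0};
    for (double b : bias) {
        mean += b / static_cast<double>(bias.size());
    }
    double variance{0.0};
    for (double b : bias) {
        variance += (b - mean) * (b - mean) / static_cast<double>(bias.size());
    }
    return variance; // the tests assert this is below 1e-6
}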