[ML] Stop cross-validation early if the parameters have high predicted test loss #915

Merged
13 commits merged on Jan 10, 2020
@@ -251,7 +251,7 @@ BOOST_FIXTURE_TEST_CASE(testRunBoostedTreeRegressionFeatureImportanceNoImportanc
double c4{result["row_results"]["results"]["ml"][maths::CDataFrameRegressionModel::SHAP_PREFIX + "c4"]
.GetDouble()};
double prediction{
result["row_results"]["results"]["ml"]["c5_prediction"].GetDouble()};
result["row_results"]["results"]["ml"]["target_prediction"].GetDouble()};
// c1 explains 95% of the prediction value.
BOOST_REQUIRE_CLOSE(c1, prediction, 5.0);
BOOST_REQUIRE_SMALL(c2, 2.0);
49 changes: 32 additions & 17 deletions lib/maths/CBoostedTreeFactory.cc
@@ -46,7 +46,10 @@ const double MIN_DOWNSAMPLE_LINE_SEARCH_RANGE{2.0};
const double MAX_DOWNSAMPLE_LINE_SEARCH_RANGE{144.0};
const double MIN_DOWNSAMPLE_FACTOR_SCALE{0.3};
const double MAX_DOWNSAMPLE_FACTOR_SCALE{3.0};
const std::size_t MAX_NUMBER_FOLDS{5};
// This isn't a hard limit but we increase the number of default training folds
Contributor comment: I think something is wrong with this sentence.

// if the initial downsample fraction would be larger than this.
const double MAX_DESIRED_INITIAL_DOWNSAMPLE_FRACTION{0.5};
const double MAX_NUMBER_FOLDS{5.0};
const std::size_t MAX_NUMBER_TREES{static_cast<std::size_t>(2.0 / MIN_ETA + 0.5)};

double computeEta(std::size_t numberRegressors) {
@@ -250,20 +253,32 @@ void CBoostedTreeFactory::initializeNumberFolds(core::CDataFrame& frame) const {
}
LOG_TRACE(<< "total number training rows = " << totalNumberTrainingRows);

// We require at least twice the number of rows we'll sample in a bag per
// fold if possible. In order to estimate this we use the number of input
// features as a proxy for the number of features we'll actually use after
// feature selection.
double desiredTrainingFraction{(m_InitialDownsampleRowsPerFeature *
static_cast<double>(frame.numberColumns() - 1)) /
static_cast<double>(totalNumberTrainingRows)};
if (2.0 * desiredTrainingFraction >= 1.0 - 1.0 / static_cast<double>(MAX_NUMBER_FOLDS)) {
m_TreeImpl->m_NumberFolds = MAX_NUMBER_FOLDS;
} else {
m_TreeImpl->m_NumberFolds = static_cast<std::size_t>(
std::ceil(1.0 / (1.0 - 2.0 * desiredTrainingFraction)));
}
LOG_TRACE(<< "desired training fraction = " << desiredTrainingFraction
// We want to choose the number of folds so we'll have enough training data
// after leaving out one fold. We choose the initial downsample size based
// on the same sort of criterion. So we require that leaving out one fold
// shouldn't mean that we have fewer rows than constant * desired downsample
// # rows if possible. We choose the constant to be two for no particularly
// good reason except that:
// 1. it isn't too large
// 2. it still means we'll have plenty of variation between random bags.
//
// In order to estimate this we use the number of input features as a proxy
// for the number of features we'll actually use after feature selection.
//
// So how does the following work? We'd like "c * f * # rows" training rows.
// For k folds we'll have "(1 - 1 / k) * # rows" training rows. So we want
// to find the smallest integer k s.t. c * f * # rows <= (1 - 1 / k) * # rows.
// This gives k = ceil(1 / (1 - c * f)). However, we also upper bound this
// by MAX_NUMBER_FOLDS.
Comment on lines +268 to +272
Contributor comment: This is a very nice explanation!

double initialDownsampleFraction{(m_InitialDownsampleRowsPerFeature *
static_cast<double>(frame.numberColumns() - 1)) /
static_cast<double>(totalNumberTrainingRows)};

m_TreeImpl->m_NumberFolds = static_cast<std::size_t>(
std::ceil(1.0 / std::max(1.0 - initialDownsampleFraction / MAX_DESIRED_INITIAL_DOWNSAMPLE_FRACTION,
1.0 / MAX_NUMBER_FOLDS)));
LOG_TRACE(<< "initial downsample fraction = " << initialDownsampleFraction
<< " # folds = " << m_TreeImpl->m_NumberFolds);
} else {
m_TreeImpl->m_NumberFolds = *m_TreeImpl->m_NumberFoldsOverride;
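
To make the fold-count arithmetic above concrete, here is a small standalone sketch of the same calculation, k = ceil(1 / (1 - c * f)) capped at MAX_NUMBER_FOLDS. The numeric inputs (rows per feature, feature count, row count) are invented for illustration and are not the library's defaults.

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>

int main() {
    // Illustrative values only; not the library's defaults.
    double maxNumberFolds{5.0};
    double maxDesiredInitialDownsampleFraction{0.5}; // i.e. the constant c = 2
    double rowsPerFeature{200.0};
    double numberFeatures{4.0};
    double totalTrainingRows{4000.0};

    // f: fraction of the training data the initial downsample would use.
    double f{rowsPerFeature * numberFeatures / totalTrainingRows}; // 0.2

    // k = ceil(1 / (1 - c * f)), with the argument floored at 1 / maxNumberFolds
    // so that k never exceeds maxNumberFolds.
    auto k = static_cast<std::size_t>(std::ceil(
        1.0 / std::max(1.0 - f / maxDesiredInitialDownsampleFraction,
                       1.0 / maxNumberFolds)));

    std::cout << "number of folds = " << k << "\n"; // prints 2
}
```

With these numbers, c * f = 0.4, so leaving out one of k = 2 folds still keeps half the rows, which satisfies the "twice the desired downsample" requirement described in the comment.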
@@ -387,10 +402,10 @@ void CBoostedTreeFactory::initializeHyperparameters(core::CDataFrame& frame) {
}

double numberFeatures{static_cast<double>(m_TreeImpl->m_Encoder->numberEncodedColumns())};
double downSampleFactor{m_InitialDownsampleRowsPerFeature * numberFeatures /
double downsampleFactor{m_InitialDownsampleRowsPerFeature * numberFeatures /
m_TreeImpl->m_TrainingRowMasks[0].manhattan()};
m_TreeImpl->m_DownsampleFactor = m_TreeImpl->m_DownsampleFactorOverride.value_or(
CTools::truncate(downSampleFactor, 0.05, 0.5));
CTools::truncate(downsampleFactor, 0.05, 0.5));

m_TreeImpl->m_Regularization
.depthPenaltyMultiplier(
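For a rough sense of scale of the downsample factor computed above (the numbers are illustrative only, not the library's defaults): with 10 encoded features, 100 initial rows per feature and 50 000 rows in the training mask, downsampleFactor = 100 * 10 / 50000 = 0.02, which the truncation then lifts to the 0.05 floor.
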
35 changes: 26 additions & 9 deletions lib/maths/CBoostedTreeImpl.cc
@@ -1080,12 +1080,28 @@ CBoostedTreeImpl::estimateMissingTestLosses(const TSizeVec& missing) const {
// estimate the test loss we'll see for the remaining folds to decide if it
// is worthwhile to continue training with these parameters and to correct
// the loss value supplied to Bayesian Optimisation to account for the folds
// we haven't trained on. We achieve this by for each missing fold fitting an
// OLS to the data (x_i, loss(m_i)) where i ranges over the previous rounds
// and x_i is the i'th vector whose components comprise the losses for which
// we have values in the current round and indicators for whether they were
// missing in the i'th round. We only include a round if we've trained for at
// least one of the same folds in the current round.
// we haven't trained on. We tackle this problem as follows:
// 1. Find all previous rounds R which share at least one fold with the
// current round, i.e. one fold for which we've computed the actual
// loss for the current round's parameters.
// 2. For each fold f_i for which we haven't estimated the loss in this
// round, fit an OLS model m_i to R to predict the loss of f_i.
// 3. Compute l_i^, the predicted value for the test loss on each f_i, given
// the test losses we've computed so far this round, using m_i.
// 4. Estimate its uncertainty from the variance of the residuals from
// fitting the model m_i to R.
//
// The feature vector we use is defined as:
//
// | calculated fold error 1 |
// | calculated fold error 2 |
// | ... |
// | 1{fold error 1 is present} |
// | 1{fold error 2 is present} |
// | ... |
//
// where the indices range over the folds for which we have errors in the
// current round.

TSizeVec present(m_NumberFolds);
std::iota(present.begin(), present.end(), 0);
@@ -1094,7 +1110,7 @@
CSetTools::inplace_set_difference(present, ordered.begin(), ordered.end());
LOG_TRACE(<< "present = " << core::CContainerPrinter::print(present));

// Get the current round feature vector.
// Get the current round feature vector. This is fixed, so we compute it outside the loop.
TVector x(2 * present.size());
for (std::size_t col = 0; col < present.size(); ++col) {
x(col) = *m_FoldRoundTestLosses[present[col]][m_CurrentRound];
@@ -1142,11 +1158,12 @@ CBoostedTreeImpl::estimateMissingTestLosses(const TSizeVec& missing) const {
double predictedTestLoss{params.transpose() * x};
double predictedTestLossVariance{
CBasicStatistics::maximumLikelihoodVariance(residualMoments)};
predictedTestLosses.push_back(CBasicStatistics::momentsAccumulator(
1.0, predictedTestLoss, predictedTestLossVariance));
LOG_TRACE(<< "prediction(x = " << x.transpose() << ", fold = " << target
<< ") = (mean = " << predictedTestLoss
<< ", variance = " << predictedTestLossVariance << ")");

predictedTestLosses.push_back(CBasicStatistics::momentsAccumulator(
1.0, predictedTestLoss, predictedTestLossVariance));
}

return predictedTestLosses;
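
The missing-fold estimation above can be hard to picture from the diff alone, so here is a minimal self-contained sketch of the same idea using plain Eigen types and made-up losses. The real code uses the library's TVector type and CBasicStatistics moments accumulators; everything below (matrix sizes, values, solver choice) is an illustrative assumption.

```cpp
// Sketch: predict the test loss of a fold we skipped this round by regressing,
// over previous rounds, that fold's loss on (i) the losses of the folds we did
// compute this round and (ii) indicators for whether each of those losses was
// present in the previous round.
#include <Eigen/Dense>
#include <cmath>
#include <iostream>

int main() {
    // Six previous rounds; columns are [loss fold 0, loss fold 1,
    // 1{fold 0 present}, 1{fold 1 present}]. Values are made up.
    Eigen::MatrixXd X(6, 4);
    X << 0.52, 0.49, 1.0, 1.0,
         0.47, 0.45, 1.0, 1.0,
         0.60, 0.58, 1.0, 1.0,
         0.55, 0.00, 1.0, 0.0, // fold 1 was missing in that round
         0.50, 0.48, 1.0, 1.0,
         0.58, 0.00, 1.0, 0.0;
    // Losses of the fold we're missing this round, observed in those rounds.
    Eigen::VectorXd y(6);
    y << 0.50, 0.46, 0.59, 0.54, 0.49, 0.57;

    // Least-squares fit via a rank-revealing QR decomposition:
    // params minimises ||X * params - y||^2.
    Eigen::VectorXd params = X.colPivHouseholderQr().solve(y);

    // Current round feature vector: the losses we did compute plus presence
    // indicators, laid out as in the comment in the diff.
    Eigen::VectorXd x(4);
    x << 0.51, 0.48, 1.0, 1.0;
    double predictedTestLoss = params.dot(x);

    // Use the residual variance of the fit as a rough uncertainty.
    double variance = (X * params - y).squaredNorm() / 6.0;
    std::cout << predictedTestLoss << " +/- " << std::sqrt(variance) << "\n";
}
```

Including the presence indicators lets rounds that trained on different subsets of folds share one design matrix, which is why only previous rounds overlapping the current round's folds are used.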
6 changes: 2 additions & 4 deletions lib/maths/CTreeShapFeatureImportance.cc
@@ -126,12 +126,10 @@ void CTreeShapFeatureImportance::shapRecursive(const TTree& tree,
parentFractionOne, parentFeatureIndex);
if (tree[nodeIndex].isLeaf()) {
double leafValue = tree[nodeIndex].value();
for (std::size_t i = 1; i <= splitPath.depth(); ++i) {
for (int i = 1; i <= splitPath.depth(); ++i) {
double scale = CTreeShapFeatureImportance::sumUnwoundPath(splitPath, i);
std::size_t inputColumnIndex{
encoder
.encoding(static_cast<std::size_t>(splitPath.featureIndex(i)))
.inputColumnIndex()};
encoder.encoding(splitPath.featureIndex(i)).inputColumnIndex()};
// inputColumnIndex is obtained by looking up which feature sits at position i on the path to this leaf.
// fractionOnes(i) is an indicator variable which tells us whether, if we condition on this variable,
// we visit this path from that node or not; fractionZeros(i) tells us what proportion of