From 2adce69651b88b8bd033617b44fe071975bc0232 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Mon, 22 Feb 2021 10:14:56 +0000 Subject: [PATCH 01/35] WIP --- include/maths/CBoostedTreeFactory.h | 2 - include/maths/CLowess.h | 89 +++++++ include/maths/CLowessDetail.h | 337 +++++++++++++++++++++++++ include/maths/CMixtureDistribution.h | 1 + include/maths/CSolvers.h | 11 +- lib/maths/CBoostedTreeFactory.cc | 173 +++---------- lib/maths/unittest/CLowessTest.cc | 262 +++++++++++++++++++ lib/maths/unittest/COneOfNPriorTest.cc | 1 + lib/maths/unittest/CSolversTest.cc | 1 + lib/maths/unittest/Makefile | 1 + lib/maths/unittest/TestUtils.cc | 1 + 11 files changed, 740 insertions(+), 139 deletions(-) create mode 100644 include/maths/CLowess.h create mode 100644 include/maths/CLowessDetail.h create mode 100644 lib/maths/unittest/CLowessTest.cc diff --git a/include/maths/CBoostedTreeFactory.h b/include/maths/CBoostedTreeFactory.h index 53fc2982bb..02564e594f 100644 --- a/include/maths/CBoostedTreeFactory.h +++ b/include/maths/CBoostedTreeFactory.h @@ -215,8 +215,6 @@ class MATHS_EXPORT CBoostedTreeFactory final { const TApplyParameter& applyParameterStep, double intervalLeftEnd, double intervalRightEnd, - double returnedIntervalLeftEndOffset, - double returnedIntervalRightEndOffset, const TAdjustTestLoss& adjustTestLoss = noopAdjustTestLoss) const; //! Initialize the state for hyperparameter optimisation. diff --git a/include/maths/CLowess.h b/include/maths/CLowess.h new file mode 100644 index 0000000000..c440f497f8 --- /dev/null +++ b/include/maths/CLowess.h @@ -0,0 +1,89 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#ifndef INCLUDED_ml_maths_CLowess_h +#define INCLUDED_ml_maths_CLowess_h + +#include + +#include + +#include +#include + +namespace ml { +namespace maths { + +//! \brief LOWESS regression using order N polynomial. +//! +//! DESCRIPTION:\n +//! For more details see https://en.wikipedia.org/wiki/Local_regression. +template +class CLowess { +public: + using TDoubleDoublePr = std::pair; + using TDoubleDoublePrVec = std::vector; + using TPolynomial = CLeastSquaresOnlineRegression; + +public: + //! Fit a polynomial LOWESS model to \p data choosing the weight function to + //! maximize the likelihood of \p numberFolds hold out sets. + //! + //! \param[in] data The training data. + //! \param[in] numberFolds The number of folds to use in cross-validation to + // compute the best weight function from the family exp(-k |xi - xj|). + void fit(TDoubleDoublePrVec data, std::size_t numberFolds); + + //! Predict the value at \p x. + //! + //! \note Defined as zero if no data have been fit. + double predict(double x) const; + + //! Compute the minimum of the function on the training data interval. + //! + //! \note Defined as (0,0) if no data have been fit. + TDoubleDoublePr minimum() const; + + //! Get an estimate of residual variance at the observed values. + //! + //! \note Defined as zero if no data have been fit. + double residualVariance() const; + + //! Compute the sublevel set of \p f containing \p xmin. + //! + //! \param[in] xmin The argument of the minimum of the interpolated function. + //! \param[in] fmin The value of the minimum of the function. + //! \param[in] f The value of the function for which to compute the sublevel set. + //! \note \p f should be greater than fmin. + //! 
\note Defined as (0,0) if no data have been fit. + TDoubleDoublePr sublevelSet(double xmin, double fmin, double f) const; + + //! Get how far we are prepared to extrapolate as the interval we will search + //! in the minimum and sublevelSet functions. + TDoubleDoublePr extrapolationInterval() const; + +private: + using TDoubleVec = std::vector; + using TSizeVec = std::vector; + using TSizeVecVec = std::vector; + using TSizeVecCItr = TSizeVec::const_iterator; + using TMeanVarAccumulator = CBasicStatistics::SSampleMeanVar::TAccumulator; + +private: + void setupMasks(std::size_t numberFolds, TSizeVecVec& trainingMasks, TSizeVecVec& testingMasks) const; + double likelihood(TSizeVecVec& trainingMasks, TSizeVecVec& testingMasks, double k) const; + TPolynomial fit(TSizeVecCItr beginMask, TSizeVecCItr endMask, double k, double x) const; + double weight(double k, double x1, double x2) const; + +private: + TDoubleDoublePrVec m_Data; + TSizeVec m_Mask; + double m_K = 0.0; +}; +} +} + +#endif // INCLUDED_ml_maths_CLowess_h diff --git a/include/maths/CLowessDetail.h b/include/maths/CLowessDetail.h new file mode 100644 index 0000000000..1c6e5be6ed --- /dev/null +++ b/include/maths/CLowessDetail.h @@ -0,0 +1,337 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#ifndef INCLUDED_ml_maths_CLowessDetail_h +#define INCLUDED_ml_maths_CLowessDetail_h + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace ml { +namespace maths { + +template +void CLowess::fit(TDoubleDoublePrVec data, std::size_t numberFolds) { + + m_K = 0.0; + m_Data = std::move(data); + std::sort(m_Data.begin(), m_Data.end(), COrderings::SFirstLess{}); + + if (m_Data.size() < 4) { + return; + } + + // We use exponential decay in the weights and cross-validated maximum likelihood + // to choose the decay constant. Formally, we are fitting + // + // f(x | p^*) = poly(x | p^*(x)) + // + // p^*(x) = argmin_p{ sum_i{ w_i (Y_i - poly(X_i | p))^2 } } + // + // where w = exp(-k (x - X_i)) and (X, Y) are the data to fit. We determine k by + // solving + // + // k^* = argmin_k{ sum_{Yi in H}{ L(Yi | f(x | p^*)) } } + // + // where H is a hold out set and we assume Y_i ~ N(poly(X_i | p^*), sigma) with + // sigma estimated from the training data prediction residuals. 
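+    //
+    // As a purely illustrative usage sketch (not prescriptive), a caller passes
+    // the raw (x, y) pairs and a fold count and then reads values off the fitted
+    // curve, e.g.
+    //
+    //   maths::CLowess<2> lowess;
+    //   lowess.fit({{0.0, 1.0}, {1.0, 0.4}, {2.0, 0.1}, {3.0, 0.3}, {4.0, 0.9}}, 2);
+    //   double xmin, fmin;
+    //   std::tie(xmin, fmin) = lowess.minimum();
+    //   double f15{lowess.predict(1.5)};
+    //
+    // Note that larger k localises the fit (the weights decay faster with
+    // distance) while k -> 0 tends towards a single global order N polynomial fit.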
+ + m_Mask.resize(m_Data.size()); + std::iota(m_Mask.begin(), m_Mask.end(), 0); + + TSizeVecVec trainingMasks; + TSizeVecVec testingMasks; + this->setupMasks(numberFolds, trainingMasks, testingMasks); + + TDoubleVec K(17); + double range{m_Data.back().first - m_Data.front().first}; + for (std::size_t i = 0; i < K.size(); ++i) { + K[i] = 2.0 * static_cast(i) / range; + } + LOG_TRACE(<< "range = " << range << ", K = " << core::CContainerPrinter::print(K)); + + double kmax; + double likelihoodMax; + CSolvers::globalMaximize(K, + [&](double k) { + return this->likelihood(trainingMasks, testingMasks, k); + }, + kmax, likelihoodMax); + LOG_TRACE(<< "kmax = " << kmax << " likelihood(kmax) = " << likelihoodMax); + + m_K = kmax; +} + +template +double CLowess::predict(double x) const { + if (m_Data.empty()) { + return 0.0; + } + auto poly = this->fit(m_Mask.begin(), m_Mask.end(), m_K, x); + return poly.predict(x); +} + +template +typename CLowess::TDoubleDoublePr CLowess::minimum() const { + + if (m_Data.empty()) { + return {0.0, 0.0}; + } + + // There is no guaranty the function is convex so we need a global method. + // We choose something simple: + // 1. Find (local) minimum near a data point. + // 2. Search around here for the true local minimum. + // + // All in all this has complexity O(2 |data| function evaluations). + + TDoubleVec X; + + double xa, xb; + std::tie(xa, xb) = this->extrapolationInterval(); + + // Coarse. + X.reserve(m_Data.size() + 2); + X.push_back(xa); + for (std::size_t i = 0; i < m_Data.size(); ++i) { + X.push_back(m_Data[i].first); + } + X.push_back(xb); + double xmin, fmin; + CSolvers::globalMinimize(X, [&](double x) { return this->predict(x); }, xmin, fmin); + + // Refine. + double range{(xb - xa) / static_cast(X.size())}; + xa = std::max(xa, xmin - 0.5 * range); + xb = std::min(xb, xmin + 0.5 * range); + double dx{2.0 * (xb - xa) / static_cast(X.size())}; + X.clear(); + for (double x = xa; x < xb; x += dx) { + X.push_back(x); + } + double xcand, fcand; + CSolvers::globalMinimize(X, [&](double x) { return this->predict(x); }, xcand, fcand); + + if (fcand < fmin) { + xmin = xcand; + fmin = fcand; + } + + return {xmin, fmin}; +} + +template +double CLowess::residualVariance() const { + + if (m_Data.empty()) { + return 0.0; + } + + TMeanVarAccumulator moments; + + std::size_t n{m_Data.size()}; + + TSizeVec mask(n); + std::iota(mask.begin(), mask.end(), 1); + for (std::size_t i = 0; i < n; ++i) { + double xi, yi; + std::tie(xi, yi) = m_Data[i]; + auto poly = this->fit(mask.begin(), mask.begin() + n - 1, m_K, xi); + moments.add(yi - poly.predict(xi)); + mask[i] = i; + } + + return CBasicStatistics::variance(moments); +} + +template +typename CLowess::TDoubleDoublePr +CLowess::sublevelSet(double xmin, double fmin, double f) const { + + if (m_Data.empty()) { + return {0.0, 0.0}; + } + if (f <= fmin) { + return {xmin, xmin}; + } + + auto solve = [&](double n, double stop) { + double fx{fmin}; + for (double i = 1.0; i <= n; i += 1.0) { + double xlast{((i - 1.0) * stop + (n - i + 1.0) * xmin) / n}; + double x{(i * stop + (n - i) * xmin) / n}; + double flast{fx}; + fx = this->predict(x); + if (fx > f) { + return CTools::linearlyInterpolate(flast, fx, xlast, x, f); + } + } + return stop; + }; + + double xa, xb; + std::tie(xa, xb) = this->extrapolationInterval(); + double alpha{(xmin - xa) / (xb - xa)}; + double beta{1.0 - alpha}; + LOG_TRACE(<< "alpha = " << alpha << ", beta = " << beta); + + return {solve(std::ceil(alpha * 40.0), xa), + solve(std::ceil((1.0 - alpha) * 40.0), xb)}; 
+} + +template +typename CLowess::TDoubleDoublePr CLowess::extrapolationInterval() const { + double xa{m_Data.front().first}; + double xb{m_Data.back().first}; + xa -= std::min(0.1 * (xb - xa), 0.5 / m_K); + xb += std::min(0.1 * (xb - xa), 0.5 / m_K); + return {xa, xb}; +} + +template +void CLowess::setupMasks(std::size_t numberFolds, + TSizeVecVec& trainingMasks, + TSizeVecVec& testingMasks) const { + + numberFolds = CTools::truncate(numberFolds, std::size_t{2}, m_Data.size()); + + trainingMasks.resize(numberFolds); + testingMasks.resize(numberFolds); + + if (numberFolds == m_Data.size()) { + // Leave-out-one cross-validation. + trainingMasks[0].resize(m_Data.size() - 1); + std::iota(trainingMasks[0].begin(), trainingMasks[0].end(), 1); + testingMasks[0].push_back(0); + for (std::size_t i = 1; i < numberFolds; ++i) { + trainingMasks[i] = trainingMasks[0]; + trainingMasks[i][i - 1] = 0; + std::sort(trainingMasks[i].begin(), trainingMasks[i].end()); + testingMasks[i].push_back(i); + } + } else { + // K-fold cross-validation. + CPRNG::CXorOShiro128Plus rng; + TSizeVec all(m_Data.size()); + TSizeVec remaining; + TSizeVec sample; + TDoubleVec probabilities; + + std::iota(all.begin(), all.end(), 0); + remaining = all; + + for (std::size_t i = 0; i < numberFolds; ++i) { + std::size_t n{std::min((m_Data.size() + numberFolds - 1) / numberFolds, + remaining.size())}; + probabilities.assign(remaining.size(), 1.0); + CSampling::categoricalSampleWithoutReplacement(rng, probabilities, n, sample); + + testingMasks[i].reserve(sample.size()); + for (auto j : sample) { + testingMasks[i].push_back(remaining[j]); + } + std::sort(testingMasks[i].begin(), testingMasks[i].end()); + + trainingMasks[i].reserve(all.size() - testingMasks[i].size()); + std::set_difference(all.begin(), all.end(), testingMasks[i].begin(), + testingMasks[i].end(), + std::back_inserter(trainingMasks[i])); + + CSetTools::inplace_set_difference(remaining, testingMasks[i].begin(), + testingMasks[i].end()); + rng.discard(100000); + } + } + + LOG_TRACE(<< "training masks = " << core::CContainerPrinter::print(trainingMasks)); + LOG_TRACE(<< "testing masks = " << core::CContainerPrinter::print(testingMasks)); +} + +template +double CLowess::likelihood(TSizeVecVec& trainingMasks, TSizeVecVec& testingMasks, double k) const { + + double result{0.0}; + + CNormalMeanPrecConjugate::TDouble1Vec samples; + CNormalMeanPrecConjugate::TDoubleWeightsAry1Vec weights; + + for (std::size_t i = 0; i < trainingMasks.size(); ++i) { + + CNormalMeanPrecConjugate residuals{ + CNormalMeanPrecConjugate::nonInformativePrior(maths_t::E_ContinuousData)}; + + std::size_t last{trainingMasks[i].size() - 1}; + + for (auto& j : trainingMasks[i]) { + double xj, yj; + std::tie(xj, yj) = m_Data[j]; + std::swap(j, trainingMasks[i][last]); + auto poly = this->fit(trainingMasks[i].cbegin(), + trainingMasks[i].cbegin() + last, k, xj); + std::swap(j, trainingMasks[i][last]); + residuals.addSamples({yj - poly.predict(xj)}, maths_t::CUnitWeights::SINGLE_UNIT); + } + LOG_TRACE(<< "residual distribution = " << residuals.print()); + + samples.clear(); + samples.reserve(testingMasks[i].size()); + for (auto j : testingMasks[i]) { + double xj, yj; + std::tie(xj, yj) = m_Data[j]; + auto poly = this->fit(trainingMasks[i].cbegin(), + trainingMasks[i].cend(), k, xj); + samples.push_back(yj - poly.predict(xj)); + } + weights.assign(testingMasks[i].size(), maths_t::CUnitWeights::UNIT); + LOG_TRACE(<< "samples = " << samples); + + double likelihood; + residuals.jointLogMarginalLikelihood(samples, 
weights, likelihood); + result += likelihood; + } + LOG_TRACE(<< "k = " << k << ", likelihood = " << result); + + return result; +} + +template +typename CLowess::TPolynomial +CLowess::fit(TSizeVecCItr beginMask, TSizeVecCItr endMask, double k, double x) const { + TPolynomial poly; + for (auto i = beginMask; i != endMask; ++i) { + double xi, yi; + std::tie(xi, yi) = m_Data[*i]; + poly.add(xi, yi, this->weight(k, xi, x)); + } + return poly; +} + +template +double CLowess::weight(double k, double x1, double x2) const { + return std::exp(-k * std::fabs(x2 - x1)); +} +} +} + +#endif // INCLUDED_ml_maths_CLowessDetail_h diff --git a/include/maths/CMixtureDistribution.h b/include/maths/CMixtureDistribution.h index 69ac0c4721..6a7e4d29b2 100644 --- a/include/maths/CMixtureDistribution.h +++ b/include/maths/CMixtureDistribution.h @@ -10,6 +10,7 @@ #include #include +#include #include #include #include diff --git a/include/maths/CSolvers.h b/include/maths/CSolvers.h index 8f1b5bba54..57ab149c88 100644 --- a/include/maths/CSolvers.h +++ b/include/maths/CSolvers.h @@ -11,7 +11,6 @@ #include #include -#include #include #include #include @@ -860,8 +859,8 @@ class MATHS_EXPORT CSolvers { //! \param[out] fx Set to the value of f at \p x. template static bool globalMaximize(const T& p, const F& f, double& x, double& fx) { - CCompositeFunctions::CMinus f_(f); - bool result = globalMinimize(p, f_, x, fx); + auto minusF = [&](double x_) { return -f(x_); }; + bool result{globalMinimize(p, minusF, x, fx)}; fx = -fx; return result; } @@ -923,7 +922,7 @@ class MATHS_EXPORT CSolvers { // [a, x] and [b, r] bracket the sublevel set end points. - CCompositeFunctions::CMinusConstant f_(f, fc); + auto fMinusFc = [=](double x_) { return f(x_) - fc; }; LOG_TRACE(<< "a = " << a << ", x = " << x << ", b = " << b); LOG_TRACE(<< "f_(a) = " << fa - fc << ", f_(x) = " << fx - fc @@ -935,7 +934,7 @@ class MATHS_EXPORT CSolvers { try { std::size_t n = maxIterations; - solve(a, x, fa - fc, fx - fc, f_, n, equal, result.first); + solve(a, x, fa - fc, fx - fc, fMinusFc, n, equal, result.first); LOG_TRACE(<< "iterations = " << n); } catch (const std::exception& e) { LOG_ERROR(<< "Failed to find left end point: " << e.what()); @@ -944,7 +943,7 @@ class MATHS_EXPORT CSolvers { try { std::size_t n = maxIterations; - solve(x, b, fx - fc, fb - fc, f_, n, equal, result.second); + solve(x, b, fx - fc, fb - fc, fMinusFc, n, equal, result.second); LOG_TRACE(<< "iterations = " << n); } catch (std::exception& e) { LOG_ERROR(<< "Failed to find right end point: " << e.what()); diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index c2477c3f4a..8037ff4f6e 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -18,8 +18,8 @@ #include #include #include -#include -#include +#include +#include #include #include @@ -56,13 +56,11 @@ const double MIN_DOWNSAMPLE_FACTOR{1e-3}; const double MIN_INITIAL_DOWNSAMPLE_FACTOR{0.05}; const double MAX_INITIAL_DOWNSAMPLE_FACTOR{0.5}; const double MIN_DOWNSAMPLE_FACTOR_SCALE{0.3}; -const double MAX_DOWNSAMPLE_FACTOR_SCALE{3.0}; // This isn't a hard limit but we increase the number of default training folds // if the initial downsample fraction would be larger than this. const double MAX_DESIRED_INITIAL_DOWNSAMPLE_FRACTION{0.5}; const double MAX_NUMBER_FOLDS{5.0}; const std::size_t MAX_NUMBER_TREES{static_cast(2.0 / MIN_ETA + 0.5)}; -const double EPS{0.01}; double computeEta(std::size_t numberRegressors) { // eta is the learning rate. 
There is a lot of empirical evidence that @@ -106,6 +104,9 @@ CBoostedTreeFactory::buildFor(core::CDataFrame& frame, std::size_t dependentVari ? this->skipProgressMonitoringFeatureSelection() : this->startProgressMonitoringFeatureSelection(); + // Find the maximum number of rows at which the selected tree depth does not change significantly. + // Need to call hyperparameter set up first. + skipIfAfter(CBoostedTreeImpl::E_NotInitialized, [&] { this->initializeCrossValidation(frame); }); skipIfAfter(CBoostedTreeImpl::E_NotInitialized, @@ -124,6 +125,8 @@ CBoostedTreeFactory::buildFor(core::CDataFrame& frame, std::size_t dependentVari this->initializeHyperparameterOptimisation(); } + LOG_INFO(<< "number threads = " << m_NumberThreads); + auto treeImpl = std::make_unique(m_NumberThreads, m_TreeImpl->m_Loss->clone()); std::swap(m_TreeImpl, treeImpl); @@ -551,7 +554,6 @@ void CBoostedTreeFactory::initializeUnsetRegularizationHyperparameters(core::CDa double minSoftDepthLimit{MIN_SOFT_DEPTH_LIMIT}; double maxSoftDepthLimit{MIN_SOFT_DEPTH_LIMIT + log2MaxTreeSize}; double meanSoftDepthLimit{(minSoftDepthLimit + maxSoftDepthLimit) / 2.0}; - double mainLoopSearchInterval{log2MaxTreeSize / 2.0}; LOG_TRACE(<< "mean soft depth limit = " << meanSoftDepthLimit); auto applySoftDepthLimit = [](CBoostedTreeImpl& tree, double softDepthLimit) { @@ -562,9 +564,7 @@ void CBoostedTreeFactory::initializeUnsetRegularizationHyperparameters(core::CDa TVector fallback{{minSoftDepthLimit, meanSoftDepthLimit, maxSoftDepthLimit}}; m_SoftDepthLimitSearchInterval = this->testLossLineSearch(frame, applySoftDepthLimit, - minSoftDepthLimit, maxSoftDepthLimit, - -mainLoopSearchInterval / 2.0, - mainLoopSearchInterval / 2.0) + minSoftDepthLimit, maxSoftDepthLimit) .value_or(fallback); m_SoftDepthLimitSearchInterval = max(m_SoftDepthLimitSearchInterval, TVector{1.0}); @@ -597,7 +597,6 @@ void CBoostedTreeFactory::initializeUnsetRegularizationHyperparameters(core::CDa logMaxDepthPenaltyMultiplier - CTools::stableLog(searchIntervalSize)}; double meanLogDepthPenaltyMultiplier{ (logMinDepthPenaltyMultiplier + logMaxDepthPenaltyMultiplier) / 2.0}; - double mainLoopSearchInterval{CTools::stableLog(searchIntervalSize) / 2.0}; LOG_TRACE(<< "mean log depth penalty multiplier = " << meanLogDepthPenaltyMultiplier); @@ -616,9 +615,7 @@ void CBoostedTreeFactory::initializeUnsetRegularizationHyperparameters(core::CDa m_LogDepthPenaltyMultiplierSearchInterval = this->testLossLineSearch(frame, applyDepthPenaltyMultiplier, logMinDepthPenaltyMultiplier, - logMaxDepthPenaltyMultiplier, - -mainLoopSearchInterval / 2.0, - mainLoopSearchInterval / 2.0) + logMaxDepthPenaltyMultiplier) .value_or(fallback); LOG_TRACE(<< "log depth penalty multiplier search interval = [" << m_LogDepthPenaltyMultiplierSearchInterval.toDelimited() @@ -651,7 +648,6 @@ void CBoostedTreeFactory::initializeUnsetRegularizationHyperparameters(core::CDa logMaxTreeSizePenaltyMultiplier - CTools::stableLog(searchIntervalSize)}; double meanLogTreeSizePenaltyMultiplier{ (logMinTreeSizePenaltyMultiplier + logMaxTreeSizePenaltyMultiplier) / 2.0}; - double mainLoopSearchInterval{0.5 * CTools::stableLog(searchIntervalSize)}; LOG_TRACE(<< "mean log tree size penalty multiplier = " << meanLogTreeSizePenaltyMultiplier); @@ -670,9 +666,7 @@ void CBoostedTreeFactory::initializeUnsetRegularizationHyperparameters(core::CDa m_LogTreeSizePenaltyMultiplierSearchInterval = this->testLossLineSearch(frame, applyTreeSizePenaltyMultiplier, logMinTreeSizePenaltyMultiplier, - 
logMaxTreeSizePenaltyMultiplier, - -mainLoopSearchInterval / 2.0, - mainLoopSearchInterval / 2.0) + logMaxTreeSizePenaltyMultiplier) .value_or(fallback); LOG_TRACE(<< "log tree size penalty multiplier search interval = [" << m_LogTreeSizePenaltyMultiplierSearchInterval.toDelimited() @@ -706,7 +700,6 @@ void CBoostedTreeFactory::initializeUnsetRegularizationHyperparameters(core::CDa CTools::stableLog(searchIntervalSize)}; double meanLogLeafWeightPenaltyMultiplier{ (logMinLeafWeightPenaltyMultiplier + logMaxLeafWeightPenaltyMultiplier) / 2.0}; - double mainLoopSearchInterval{0.5 * CTools::stableLog(searchIntervalSize)}; LOG_TRACE(<< "mean log leaf weight penalty multiplier = " << meanLogLeafWeightPenaltyMultiplier); @@ -725,9 +718,7 @@ void CBoostedTreeFactory::initializeUnsetRegularizationHyperparameters(core::CDa m_LogLeafWeightPenaltyMultiplierSearchInterval = this->testLossLineSearch(frame, applyLeafWeightPenaltyMultiplier, logMinLeafWeightPenaltyMultiplier, - logMaxLeafWeightPenaltyMultiplier, - -mainLoopSearchInterval / 2.0, - mainLoopSearchInterval / 2.0) + logMaxLeafWeightPenaltyMultiplier) .value_or(fallback); LOG_TRACE(<< "log leaf weight penalty multiplier search interval = [" << m_LogLeafWeightPenaltyMultiplierSearchInterval.toDelimited() @@ -818,7 +809,7 @@ void CBoostedTreeFactory::initializeUnsetDownsampleFactor(core::CDataFrame& fram double minTestLoss, double testLoss) { return testLoss + CTools::linearlyInterpolate( logMinDownsampleFactor, logMaxDownsampleFactor, - 0.0, EPS * minTestLoss, logDownsampleFactor); + 0.0, 0.01 * minTestLoss, logDownsampleFactor); }; TVector fallback; @@ -827,11 +818,9 @@ void CBoostedTreeFactory::initializeUnsetDownsampleFactor(core::CDataFrame& fram fallback(MAX_PARAMETER_INDEX) = logMaxDownsampleFactor; m_LogDownsampleFactorSearchInterval = - this->testLossLineSearch( - frame, applyDownsampleFactor, - logMinDownsampleFactor, logMaxDownsampleFactor, - CTools::stableLog(MIN_DOWNSAMPLE_FACTOR_SCALE), - CTools::stableLog(MAX_DOWNSAMPLE_FACTOR_SCALE), adjustTestLoss) + this->testLossLineSearch(frame, applyDownsampleFactor, + logMinDownsampleFactor, + logMaxDownsampleFactor, adjustTestLoss) .value_or(fallback); // Truncate the log(factor) to be less than or equal to log(1.0) and the @@ -870,7 +859,6 @@ void CBoostedTreeFactory::initializeUnsetFeatureBagFraction(core::CDataFrame& fr 2.0 * m_TreeImpl->m_FeatureBagFraction, MAX_FEATURE_BAG_FRACTION))}; double logMinFeatureBagFraction{logMaxFeatureBagFraction - CTools::stableLog(searchIntervalSize)}; - double mainLoopSearchInterval{CTools::stableLog(0.2 * searchIntervalSize)}; auto applyFeatureBagFraction = [&](CBoostedTreeImpl& tree, double logFeatureBagFraction) { @@ -888,7 +876,7 @@ void CBoostedTreeFactory::initializeUnsetFeatureBagFraction(core::CDataFrame& fr double minTestLoss, double testLoss) { return testLoss + CTools::linearlyInterpolate( logMinFeatureBagFraction, logMaxFeatureBagFraction, - 0.0, EPS * minTestLoss, logFeatureBagFraction); + 0.0, 0.01 * minTestLoss, logFeatureBagFraction); }; TVector fallback; @@ -896,10 +884,9 @@ void CBoostedTreeFactory::initializeUnsetFeatureBagFraction(core::CDataFrame& fr fallback(BEST_PARAMETER_INDEX) = logMaxFeatureBagFraction; fallback(MAX_PARAMETER_INDEX) = logMaxFeatureBagFraction; m_LogFeatureBagFractionInterval = - this->testLossLineSearch( - frame, applyFeatureBagFraction, logMinFeatureBagFraction, - logMaxFeatureBagFraction, -mainLoopSearchInterval / 2.0, - mainLoopSearchInterval / 2.0, adjustTestLoss) + this->testLossLineSearch(frame, 
applyFeatureBagFraction, + logMinFeatureBagFraction, + logMaxFeatureBagFraction, adjustTestLoss) .value_or(fallback); // Truncate the log(fraction) to be less than or equal to log(MAX_FEATURE_BAG_FRACTION). @@ -931,7 +918,6 @@ void CBoostedTreeFactory::initializeUnsetEta(core::CDataFrame& frame) { m_TreeImpl->m_Eta)}; double logMinEta{logMaxEta - CTools::stableLog(searchIntervalSize)}; double meanLogEta{(logMaxEta + logMinEta) / 2.0}; - double mainLoopSearchInterval{CTools::stableLog(0.2 * searchIntervalSize)}; LOG_TRACE(<< "mean log eta = " << meanLogEta); auto applyEta = [](CBoostedTreeImpl& tree, double eta) { @@ -951,9 +937,7 @@ void CBoostedTreeFactory::initializeUnsetEta(core::CDataFrame& frame) { fallback(MAX_PARAMETER_INDEX) = logMaxEta; m_LogEtaSearchInterval = - this->testLossLineSearch(frame, applyEta, logMinEta, logMaxEta, - -mainLoopSearchInterval / 2.0, - mainLoopSearchInterval / 2.0) + this->testLossLineSearch(frame, applyEta, logMinEta, logMaxEta) .value_or(fallback); m_LogEtaSearchInterval = min(m_LogEtaSearchInterval, TVector{0.0}); LOG_TRACE(<< "log eta search interval = [" @@ -1006,8 +990,6 @@ CBoostedTreeFactory::testLossLineSearch(core::CDataFrame& frame, const TApplyParameter& applyParameter, double intervalLeftEnd, double intervalRightEnd, - double returnedIntervalLeftEndOffset, - double returnedIntervalRightEndOffset, const TAdjustTestLoss& adjustTestLoss_) const { // This has the following steps: @@ -1024,14 +1006,13 @@ CBoostedTreeFactory::testLossLineSearch(core::CDataFrame& frame, // the returned interval if we can determine there is a low chance of // missing the best solution by doing so. - using TMeanVarAccumulator = CBasicStatistics::SSampleMeanVar::TAccumulator; using TMinAccumulator = CBasicStatistics::SMin::TAccumulator; TMinAccumulator minTestLoss; TDoubleDoublePrVec testLosses; testLosses.reserve(MAX_LINE_SEARCH_ITERATIONS); // Ensure we choose one value based on expected improvement. - std::size_t minNumberTestLosses{5}; + std::size_t minNumberTestLosses{6}; for (auto parameter : {intervalLeftEnd, (2.0 * intervalLeftEnd + intervalRightEnd) / 3.0, @@ -1101,97 +1082,27 @@ CBoostedTreeFactory::testLossLineSearch(core::CDataFrame& frame, } std::sort(testLosses.begin(), testLosses.end()); - LOG_TRACE(<< "test losses = " << core::CContainerPrinter::print(testLosses)); - - // Find the smallest test losses and the corresponding parameter interval. 
- auto minimumTestLosses = CBasicStatistics::orderStatisticsAccumulator( - minNumberTestLosses - 1, COrderings::SSecondLess{}); - minimumTestLosses.add(testLosses); - double minGoodParameter{std::min_element(minimumTestLosses.begin(), - minimumTestLosses.end(), COrderings::SFirstLess{}) - ->first}; - double maxGoodParameter{std::max_element(minimumTestLosses.begin(), - minimumTestLosses.end(), COrderings::SFirstLess{}) - ->first}; - auto beginGoodParameterLosses = - std::find_if(testLosses.begin(), testLosses.end(), - [minGoodParameter](const TDoubleDoublePr& loss) { - return loss.first == minGoodParameter; - }); - auto endGoodParameterLosses = - std::find_if(testLosses.begin(), testLosses.end(), - [maxGoodParameter](const TDoubleDoublePr& loss) { - return loss.first == maxGoodParameter; - }) + - 1; - LOG_TRACE(<< "good parameter range = [" << minGoodParameter << "," - << maxGoodParameter << "]"); - - CLeastSquaresOnlineRegression<2, double> leastSquaresQuadraticTestLoss; - for (auto loss = beginGoodParameterLosses; loss != endGoodParameterLosses; ++loss) { - leastSquaresQuadraticTestLoss.add(loss->first, loss->second); - } - CLeastSquaresOnlineRegression<2, double>::TArray params; - if (leastSquaresQuadraticTestLoss.parameters(params) == false) { - return TOptionalVector{}; - } - - double gradient{params[1]}; - double curvature{params[2]}; - LOG_TRACE(<< "[intercept, slope, curvature] = " - << core::CContainerPrinter::print(params)); - - // Find the minimizer of the least squares quadratic fit to the test loss - // in the search interval. (Note step size is negative.) - double stationaryPoint{-(gradient == curvature ? 0.5 : gradient / 2.0 / curvature)}; - double bestParameter{[&] { - if (curvature < 0.0) { - // Stationary point is a maximum so use furthest point in interval. - double distanceToLeftEndpoint{std::fabs(minGoodParameter - stationaryPoint)}; - double distanceToRightEndpoint{std::fabs(maxGoodParameter - stationaryPoint)}; - return distanceToLeftEndpoint > distanceToRightEndpoint ? minGoodParameter - : maxGoodParameter; - } - // Stationary point is a minimum so use nearest point in the interval. - return CTools::truncate(stationaryPoint, minGoodParameter, maxGoodParameter); - }()}; - LOG_TRACE(<< "best parameter = " << bestParameter); - - TVector interval{{returnedIntervalLeftEndOffset, 0.0, returnedIntervalRightEndOffset}}; - if (minGoodParameter > intervalLeftEnd) { - interval(MIN_PARAMETER_INDEX) = std::max(minGoodParameter - bestParameter, - interval(MIN_PARAMETER_INDEX)); - } - if (maxGoodParameter < intervalRightEnd) { - interval(MAX_PARAMETER_INDEX) = std::min(maxGoodParameter - bestParameter, - interval(MAX_PARAMETER_INDEX)); - } - if (curvature > 0.0) { - // Find a short interval with a high probability of containing the optimal - // regularisation parameter if we found a minimum. In particular, we solve - // curvature * (x - best)^2 = 3 sigma where sigma is the standard deviation - // of the test loss residuals to get the interval endpoints. We don't - // extrapolate the loss function outside the line segment we searched so - // don't truncate if an endpoint lies outside the searched interval. 
- TMeanVarAccumulator residualMoments; - for (auto loss = beginGoodParameterLosses; loss != endGoodParameterLosses; ++loss) { - residualMoments.add(loss->second - - leastSquaresQuadraticTestLoss.predict(loss->first)); - } - double sigma{std::sqrt(CBasicStatistics::variance(residualMoments))}; - double threeSigmaInterval{std::sqrt(3.0 * sigma / curvature)}; - if (bestParameter - threeSigmaInterval >= minGoodParameter) { - interval(MIN_PARAMETER_INDEX) = - std::max(-threeSigmaInterval, returnedIntervalLeftEndOffset); - } - if (bestParameter + threeSigmaInterval <= maxGoodParameter) { - interval(MAX_PARAMETER_INDEX) = - std::min(threeSigmaInterval, returnedIntervalRightEndOffset); - } - } - interval += TVector{bestParameter}; - - return TOptionalVector{interval}; + LOG_INFO(<< "test losses = " << core::CContainerPrinter::print(testLosses)); + + CLowess<2> lowess; + lowess.fit(std::move(testLosses), testLosses.size()); + + double bestParameter, bestParameterTestLoss; + std::tie(bestParameter, bestParameterTestLoss) = lowess.minimum(); + LOG_INFO(<< "best parameter = " << bestParameter << ", test loss = " << bestParameterTestLoss); + + double width{(intervalRightEnd - intervalLeftEnd) / static_cast(MAX_LINE_SEARCH_ITERATIONS)}; + intervalLeftEnd = bestParameter - width; + intervalRightEnd = bestParameter + width; + LOG_INFO(<< "interval = [" << intervalLeftEnd << "," << intervalRightEnd << "]"); + //double residualVariance{lowess.residualVariance()}; + //std::tie(intervalLeftEnd, intervalRightEnd) = + // lowess.sublevelSet(bestParameter, bestParameterTestLoss, + // bestParameterTestLoss + std::sqrt(residualVariance)); + //LOG_INFO(<< "residual variance = " << residualVariance << " interval = [" + // << intervalLeftEnd << "," << intervalRightEnd << "]"); + + return TVector{{intervalLeftEnd, bestParameter, intervalRightEnd}}; } CBoostedTreeFactory CBoostedTreeFactory::constructFromParameters(std::size_t numberThreads, diff --git a/lib/maths/unittest/CLowessTest.cc b/lib/maths/unittest/CLowessTest.cc new file mode 100644 index 0000000000..cd3073949b --- /dev/null +++ b/lib/maths/unittest/CLowessTest.cc @@ -0,0 +1,262 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#include + +#include +#include +#include + +#include +#include + +#include +#include + +#include + +BOOST_AUTO_TEST_SUITE(CLowessTest) + +using namespace ml; + +using TDoubleVec = std::vector; +using TDoubleVecVec = std::vector; +using TMeanVarAccumulator = maths::CBasicStatistics::SSampleMeanVar::TAccumulator; + +BOOST_AUTO_TEST_CASE(testInvariants) { + + // Test invariants are satisfied on random input. + + // We check: + // 1. Minimum is a local minimum. + // 2. The sublevel set contains the minimum. + // 3. The minimum is within 10% of the training data interval. + // 4. The ends of the sublevel set is within 10% of the training data interval. + // 5. The variance is greater than or equal to the variance of the residuals at + // the training data. 
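+    // (For reference: the sublevel set of the fit at level f is {x : prediction(x) <= f};
+    // sublevelSet returns an interval [xa, xb] around xmin approximating the
+    // component of that set which contains the minimum.)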
+ + test::CRandomNumbers rng; + + TDoubleVec scale; + TDoubleVec offset; + TDoubleVec noise; + maths::CLowess<2>::TDoubleDoublePrVec data; + + std::function trends[]{ + [&](double x) { + return scale[0] * std::sin(boost::math::double_constants::two_pi / + 20.0 * (x + offset[0])); + }, + [&](double x) { return scale[0] * x / 10.0; }, + [&](double x) { + return scale[0] * (x - offset[0]) * (x - offset[0]) / 100.0; + } + }; + + for (std::size_t i = 0; i < 100; ++i) { + + for (const auto& trend : trends) { + rng.generateUniformSamples(0.0, 10.0, 1, scale); + rng.generateUniformSamples(0.0, 20.0, 1, offset); + rng.generateNormalSamples(0.0, 4.0, 20, noise); + + data.clear(); + for (std::size_t j = 0; j < noise.size(); ++j) { + double x{static_cast(j)}; + data.emplace_back(x, trend(x) + noise[j]); + } + + maths::CLowess<2> lowess; + lowess.fit(data, 5); + + double xea, xeb; + std::tie(xea, xeb) = lowess.extrapolationInterval(); + + double xmin, fmin; + std::tie(xmin, fmin) = lowess.minimum(); + BOOST_REQUIRE_EQUAL(fmin, lowess.predict(xmin)); + BOOST_TEST_REQUIRE(fmin <= lowess.predict(std::max(xmin - 0.1, xea))); + BOOST_TEST_REQUIRE(fmin <= lowess.predict(std::min(xmin + 0.1, xeb))); + + double xa, xb; + std::tie(xa, xb) = lowess.sublevelSet(xmin, fmin, fmin + 0.1); + BOOST_TEST_REQUIRE(xa <= xmin); + BOOST_TEST_REQUIRE(xb >= xmin); + + BOOST_TEST_REQUIRE(xmin >= xea); + BOOST_TEST_REQUIRE(xmin <= xeb); + BOOST_TEST_REQUIRE(xa >= xea); + BOOST_TEST_REQUIRE(xb <= xeb); + BOOST_TEST_REQUIRE(xa >= xea); + BOOST_TEST_REQUIRE(xb <= xeb); + + TMeanVarAccumulator residualMoments; + for (const auto& x : data) { + residualMoments.add(x.second - lowess.predict(x.first)); + } + BOOST_TEST_REQUIRE(maths::CBasicStatistics::variance(residualMoments) < + lowess.residualVariance()); + } + } +} + +BOOST_AUTO_TEST_CASE(testSmooth) { + + // Test the prediction errors on a smooth function. + + test::CRandomNumbers rng; + + auto trend = [](double x) { + return 8.0 * std::sin(boost::math::double_constants::two_pi / 20.0 * x); + }; + + maths::CLowess<2>::TDoubleDoublePrVec data; + for (std::size_t i = 0; i < 20; ++i) { + double x{static_cast(i)}; + data.emplace_back(x, trend(x)); + } + + maths::CLowess<2> lowess; + lowess.fit(data, 5); + + TMeanVarAccumulator errorMoments; + for (std::size_t i = 0; i < 20; ++i) { + double x{static_cast(i)}; + errorMoments.add(std::fabs(lowess.predict(x) - trend(x))); + } + LOG_DEBUG(<< "mean error = " << maths::CBasicStatistics::mean(errorMoments)); + + BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(errorMoments) < 0.1); +} + +BOOST_AUTO_TEST_CASE(testSmoothPlusNoise) { + + // Test the prediction errors on a smooth function plus noise. 
+ + test::CRandomNumbers rng; + + TDoubleVec noise; + rng.generateNormalSamples(0.0, 4.0, 20, noise); + + auto trend = [](double x) { + return 8.0 * std::sin(boost::math::double_constants::two_pi / 20.0 * x); + }; + + maths::CLowess<2>::TDoubleDoublePrVec data; + for (std::size_t i = 0; i < noise.size(); ++i) { + double x{static_cast(i)}; + data.emplace_back(x, trend(x) + noise[i]); + } + + maths::CLowess<2> lowess; + lowess.fit(data, 5); + + TMeanVarAccumulator errorMoments; + for (std::size_t i = 0; i < 20; ++i) { + double x{static_cast(i)}; + errorMoments.add(std::fabs(lowess.predict(x) - trend(x))); + } + LOG_DEBUG(<< "mean error = " << maths::CBasicStatistics::mean(errorMoments)); + + BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(errorMoments) < 0.8); + BOOST_TEST_REQUIRE(std::fabs(std::sqrt(lowess.residualVariance()) - 2.0) < 0.6); +} + +BOOST_AUTO_TEST_CASE(testMinimum) { + + // Check that the minimum and the predicted value at the minimum is close to + // what we'd expect. + + test::CRandomNumbers rng; + + auto trend = [](double x) { + return 8.0 * std::sin(boost::math::double_constants::two_pi / 20.0 * x); + }; + + maths::CLowess<2>::TDoubleDoublePrVec data; + for (std::size_t i = 0; i < 20; ++i) { + double x{static_cast(i)}; + data.emplace_back(x, trend(x)); + } + + maths::CLowess<2> lowess; + lowess.fit(data, 5); + + double x, fx; + std::tie(x, fx) = lowess.minimum(); + + // Expect minimum at ((3 / 2) * pi) / (2 pi / 20) = 15 and a value of around -8.0; + + LOG_DEBUG(<< "xmin = " << x << ", f(xmin) = " << fx); + BOOST_REQUIRE_CLOSE(15.0, x, 1.0); // 1% + BOOST_REQUIRE_CLOSE(-8.0, fx, 5.0); // 5% +} + +BOOST_AUTO_TEST_CASE(testTrainingLossCurves) { + + // Test minimization of some training loss curves from boosted tree hyperparameter + // line searches for: + // 1. Miniboone + // 2. Car-parts + // 3. 
Boston + + using TDoubleDoublePrVecVec = std::vector::TDoubleDoublePrVec>; + + // clang-format off + TDoubleDoublePrVecVec curves{ + {{2.0, 0.1767327}, {6.080264, 0.1659147}, {9.615924, 0.1607294}, {10.16053, 0.1614871}, {14.24079, 0.1633198}}, + {{-2.561376, 0.1672884}, {-1.085517, 0.1647196}, {0.3903422, 0.1639279}, {1.474411, 0.1662013}, {1.866201, 0.1628465}}, + {{-2.561376, 0.162188}, {-1.085517, 0.1600827}, {-0.5958557, 0.1598617}, {0.3903422, 0.1642588}, {1.866201, 0.1778405}}, + {{-1.600108, 0.1588888}, {0.342874, 0.1574784}, {2.285856, 0.1569175}, {3.825301, 0.1527161}, {4.228838, 0.1555854}}, + {{-4.969813, 0.5935475}, {-3.313209, 0.2387051}, {-1.656604, 0.1552702}, {-0.7187975, 0.1507938}, {0, 0.1494794}}, + {{-2.302585, 0.1651654}, {-1.609438, 0.1712131}, {-0.9162907, 0.1550724}, {-0.4452244, 0.1491943}, {-0.2231436, 0.1489314}}, + {{2.0, 0.01361971}, {5.811543, 0.002268836}, {6.648845, 0.001762906}, {6.731061, 0.001930386}, {8.76648, 0.001210521}, + {9.58383, 0.002405683}, {9.623085, 0.002132054}, {9.787585, 0.002502508}, {10.42778, 0.001915853}, {13.43463, 0.001321818}}, + {{1.71972, 0.003296972}, {3.890569, 0.002917327}, {3.939936, 0.00103488}, {3.97139, 0.003646344}, {6.022504, 0.002943863}, + {6.061419, 0.001830975}, {7.801588, 0.003221994}, {7.930129, 0.003912988}, {8.232269, 0.004673212}}, + {{1.71972, 0.003408918}, {2.043608, 0.003519984}, {3.890569, 0.01988785}, {6.061419, 0.0764257}, {8.232269, 0.1406254}}, + {{-0.05942321, 0.003394985}, {0.6689442, 0.003665651}, {1.924394, 0.004942474}, {3.908212, 0.006659611}, {5.892029, 0.0157031}}, + {{-4.969813, 0.1798482}, {-3.313209, 0.1798566}, {-1.656604, 0.01256459}, {-1.154333, 0.004852421}, {-0.8191196, 0.003527397}, + {-0.2381196, 0.001983409}, {0, 0.002551422}}, + {{-2.302585, 0.001822712}, {-1.609438, 0.00345773}, {-0.9162907, 0.003139631}, {-0.2855592, 0.003175851}, {-0.2231436, 0.002630656}}, + {{-3.800451, 0.002890249}, {-2.801874, 0.002432233}, {-2.446324, 0.002333384}, {-2.291018, 0.001627785}, {-2.190441, 0.001669799}, + {-1.999605, 0.002137923}, {-1.803296, 0.001832592}, {-1.628174, 0.003295475}, {-0.8946376, 0.001722856}, {-0.804719, 0.001301327}}, + {{2.0, 10.71672}, {4.827566, 9.507881}, {4.830618, 8.36871}, {7.661235, 9.822492}, {10.49185, 10.09627}}, + {{-5.991457, 9.803939}, {-2.538955, 9.975635}, {0.9135475, 9.298096}, {3.543894, 8.223675}, {4.36605, 8.962077}}, + {{-5.991457, 9.35017}, {-3.357034, 9.962562}, {-2.538955, 9.027685}, {-1.97598, 8.668243}, {0.9135475, 10.19129}, {4.36605, 11.89721}}, + {{0.6931472, 9.422628}, {1.610698, 9.089348}, {1.691725, 8.93955}, {2.158699, 10.18192}, {2.545694, 9.212234}, {2.690302, 9.148424}, + {2.943044, 10.4056}, {3.688879, 11.13337}}, + {{-1.279388, 11.9904}, {-0.8885609, 9.800607}, {-0.6476757, 8.581057}, {-0.5692195, 7.907454}, {-0.4977335, 8.514873}, + {-0.1069061, 9.885219}}, + {{-3.800451, 8.317797}, {-3.738576, 8.053429}, {-3.403612, 8.338234}, {-2.801874, 8.890816}, {-2.333564, 8.705093}, + {-2.208987, 10.69139}, {-1.803296, 9.234116}, {-1.002829, 10.67219}, {-0.9090844, 12.46085}, {-0.804719, 13.98731}}}; + // clang-format on + + for (const auto& curve : curves) { + maths::CLowess<2> lowess; + lowess.fit(curve, curve.size()); + double xmin, fmin; + std::tie(xmin, fmin) = lowess.minimum(); + double variance{lowess.residualVariance()}; + + double xa, xb; + double ftarget{fmin + std::sqrt(variance)}; + std::tie(xa, xb) = lowess.sublevelSet(xmin, fmin, ftarget); + + if (xa <= curve.front().first) { + BOOST_TEST_REQUIRE(lowess.predict(xa) <= 1.01 * ftarget); + } 
else { + BOOST_REQUIRE_CLOSE(lowess.predict(xa), ftarget, 1.0); // 1.0% + } + if (xb >= curve.back().first) { + BOOST_TEST_REQUIRE(lowess.predict(xb) <= 1.01 * ftarget); + } else { + BOOST_REQUIRE_CLOSE(lowess.predict(xb), ftarget, 1.0); // 1.0% + } + } +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/lib/maths/unittest/COneOfNPriorTest.cc b/lib/maths/unittest/COneOfNPriorTest.cc index 7cfedbfe4d..8be97e715e 100644 --- a/lib/maths/unittest/COneOfNPriorTest.cc +++ b/lib/maths/unittest/COneOfNPriorTest.cc @@ -10,6 +10,7 @@ #include #include +#include #include #include #include diff --git a/lib/maths/unittest/CSolversTest.cc b/lib/maths/unittest/CSolversTest.cc index 671376bf45..058a66897c 100644 --- a/lib/maths/unittest/CSolversTest.cc +++ b/lib/maths/unittest/CSolversTest.cc @@ -8,6 +8,7 @@ #include #include +#include #include #include diff --git a/lib/maths/unittest/Makefile b/lib/maths/unittest/Makefile index 0ff2bd9b53..4209ad3eb7 100644 --- a/lib/maths/unittest/Makefile +++ b/lib/maths/unittest/Makefile @@ -58,6 +58,7 @@ SRCS=\ CLinearAlgebraTest.cc \ CLogNormalMeanPrecConjugateTest.cc \ CLogTDistributionTest.cc \ + CLowessTest.cc \ CMathsFuncsTest.cc \ CMathsMemoryTest.cc \ CMicTest.cc \ diff --git a/lib/maths/unittest/TestUtils.cc b/lib/maths/unittest/TestUtils.cc index 43394d1d32..6b4036dfea 100644 --- a/lib/maths/unittest/TestUtils.cc +++ b/lib/maths/unittest/TestUtils.cc @@ -6,6 +6,7 @@ #include "TestUtils.h" +#include #include #include #include From b2af7143daf445ea0aa3166f6a92b24f9df37053 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Tue, 29 Jun 2021 13:07:14 +0100 Subject: [PATCH 02/35] Restrict the maximum number of rows used during hyperparameter tuning to avoid runtime blowup --- include/maths/CBoostedTreeFactory.h | 3 ++ include/maths/CBoostedTreeImpl.h | 5 ++- include/maths/CDataFrameUtils.h | 10 +++-- lib/maths/CBoostedTreeFactory.cc | 49 ++++++++++++++------ lib/maths/CBoostedTreeImpl.cc | 42 +++++++++-------- lib/maths/CDataFrameUtils.cc | 70 ++++++++++++++++------------- 6 files changed, 110 insertions(+), 69 deletions(-) diff --git a/include/maths/CBoostedTreeFactory.h b/include/maths/CBoostedTreeFactory.h index 02564e594f..74eeae688f 100644 --- a/include/maths/CBoostedTreeFactory.h +++ b/include/maths/CBoostedTreeFactory.h @@ -80,6 +80,8 @@ class MATHS_EXPORT CBoostedTreeFactory final { CBoostedTreeFactory& minimumFrequencyToOneHotEncode(double frequency); //! Set the number of folds to use for estimating the generalisation error. CBoostedTreeFactory& numberFolds(std::size_t numberFolds); + //! Set the maximum number of rows to use for training when tuning hyperparameters. + CBoostedTreeFactory& maximumNumberTrainRows(std::size_t rows); //! Stratify the cross-validation we do for regression. CBoostedTreeFactory& stratifyRegressionCrossValidation(bool stratify); //! Stop cross-validation early if the test loss is not promising. @@ -275,6 +277,7 @@ class MATHS_EXPORT CBoostedTreeFactory final { TOptionalSize m_BayesianOptimisationRestarts; bool m_StratifyRegressionCrossValidation = true; double m_InitialDownsampleRowsPerFeature = 200.0; + std::size_t m_MaximumNumberOfTrainRows = 1000000; double m_GainPerNode1stPercentile = 0.0; double m_GainPerNode50thPercentile = 0.0; double m_GainPerNode90thPercentile = 0.0; diff --git a/include/maths/CBoostedTreeImpl.h b/include/maths/CBoostedTreeImpl.h index fad8343531..ad19e896c7 100644 --- a/include/maths/CBoostedTreeImpl.h +++ b/include/maths/CBoostedTreeImpl.h @@ -320,6 +320,9 @@ class MATHS_EXPORT CBoostedTreeImpl final { //! 
Check invariants which are assumed to hold in order to train on \p frame. void checkTrainInvariants(const core::CDataFrame& frame) const; + //! Get the count of train/validation folds. + std::size_t numberFolds() const; + //! Get the number of hyperparameters to tune. std::size_t numberHyperparametersToTune() const; @@ -380,7 +383,7 @@ class MATHS_EXPORT CBoostedTreeImpl final { double m_DownsampleFactor = 0.5; double m_Eta = 0.1; double m_EtaGrowthRatePerTree = 1.05; - std::size_t m_NumberFolds = 4; + double m_FractionalFolds = 4.0; std::size_t m_MaximumNumberTrees = 20; std::size_t m_MaximumAttemptsToAddTree = 3; std::size_t m_NumberSplitsPerFeature = 75; diff --git a/include/maths/CDataFrameUtils.h b/include/maths/CDataFrameUtils.h index 581cd74f30..411a680455 100644 --- a/include/maths/CDataFrameUtils.h +++ b/include/maths/CDataFrameUtils.h @@ -280,9 +280,11 @@ class MATHS_EXPORT CDataFrameUtils : private core::CNonInstantiatable { //! \param[in] frame The data frame for which to compute the row masks. //! \param[in] targetColumn The index of the column to predict. //! \param[in] rng The random number generator to use. - //! \param[in] numberFolds The number of folds to use. - //! \param[in] numberBuckets The number of buckets to use when stratifying by - //! target quantiles for regression. + //! \param[in] numberFolds The number of folds to use. If this is less than + //! two, there will be two train masks, but their size will be less than 50% + //! of the data. + //! \param[in] numberBuckets The number of buckets to use when stratifying + //! by target quantiles for regression. //! \param[in] allTrainingRowsMask A mask of the candidate training rows. //! \warning This fails if the target is not categorical. static std::tuple @@ -290,7 +292,7 @@ class MATHS_EXPORT CDataFrameUtils : private core::CNonInstantiatable { const core::CDataFrame& frame, std::size_t targetColumn, CPRNG::CXorOShiro128Plus rng, - std::size_t numberFolds, + double numberFolds, std::size_t numberBuckets, const core::CPackedBitVector& allTrainingRowsMask); diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index 1d1a6305b5..8218489619 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -125,8 +125,6 @@ CBoostedTreeFactory::buildFor(core::CDataFrame& frame, std::size_t dependentVari this->initializeHyperparameterOptimisation(); } - LOG_INFO(<< "number threads = " << m_NumberThreads); - auto treeImpl = std::make_unique(m_NumberThreads, m_TreeImpl->m_Loss->clone()); std::swap(m_TreeImpl, treeImpl); @@ -334,18 +332,35 @@ void CBoostedTreeFactory::initializeNumberFolds(core::CDataFrame& frame) const { // to find the smallest integer k s.t. c * f * # rows <= (1 - 1 / k) * # rows. // This gives k = ceil(1 / (1 - c * f)). However, we also upper bound this // by MAX_NUMBER_FOLDS. + // + // In addition, we want to constrain the maximum amount of training data we'll + // use during hyperparameter search to avoid very long run times. To do this + // we set the number of folds to be less than two. We define the size of the + // training data set to be (k - 1) / k * # rows, with k the number of folds. + // If k < 2 this means we end up selecting less than half the data for training. + // To meet the constraint on the maximum number of rows M we must choose k + // which satisfies M >= (k - 1) / k * # rows. 
This is trivially satisfied for + // # rows less than M and, given we also constrain the maximum number of folds, + // we only care if # rows > MAX_NUMBER_FOLDS * M / (MAX_NUMBER_FOLDS - 1). double initialDownsampleFraction{(m_InitialDownsampleRowsPerFeature * static_cast(frame.numberColumns() - 1)) / static_cast(totalNumberTrainingRows)}; - - m_TreeImpl->m_NumberFolds = static_cast( + double minimumTrainingDataConstraintNumberFolds{ std::ceil(1.0 / std::max(1.0 - initialDownsampleFraction / MAX_DESIRED_INITIAL_DOWNSAMPLE_FRACTION, - 1.0 / MAX_NUMBER_FOLDS))); + 1.0 / MAX_NUMBER_FOLDS))}; + double maximumTrainingDataConstraintNumberFolds{ + 1.0 / (1.0 - static_cast(m_MaximumNumberOfTrainRows) / + std::max(static_cast(frame.numberRows()), + MAX_NUMBER_FOLDS / (MAX_NUMBER_FOLDS - 1.0) * + static_cast(m_MaximumNumberOfTrainRows)))}; + + m_TreeImpl->m_FractionalFolds = std::min(minimumTrainingDataConstraintNumberFolds, + maximumTrainingDataConstraintNumberFolds); LOG_TRACE(<< "initial downsample fraction = " << initialDownsampleFraction - << " # folds = " << m_TreeImpl->m_NumberFolds); + << " # folds = " << m_TreeImpl->m_FractionalFolds); } else { - m_TreeImpl->m_NumberFolds = *m_TreeImpl->m_NumberFoldsOverride; + m_TreeImpl->m_FractionalFolds = static_cast(*m_TreeImpl->m_NumberFoldsOverride); } } @@ -378,7 +393,7 @@ void CBoostedTreeFactory::initializeCrossValidation(core::CDataFrame& frame) con std::tie(m_TreeImpl->m_TrainingRowMasks, m_TreeImpl->m_TestingRowMasks, std::ignore) = CDataFrameUtils::stratifiedCrossValidationRowMasks( m_TreeImpl->m_NumberThreads, frame, dependentVariable, m_TreeImpl->m_Rng, - m_TreeImpl->m_NumberFolds, numberBuckets, allTrainingRowsMask); + m_TreeImpl->m_FractionalFolds, numberBuckets, allTrainingRowsMask); } void CBoostedTreeFactory::selectFeaturesAndEncodeCategories(const core::CDataFrame& frame) const { @@ -813,8 +828,7 @@ void CBoostedTreeFactory::initializeUnsetDownsampleFactor(core::CDataFrame& fram fallback(MAX_PARAMETER_INDEX) = logMaxDownsampleFactor; m_LogDownsampleFactorSearchInterval = - this->testLossLineSearch(frame, applyDownsampleFactor, - logMinDownsampleFactor, + this->testLossLineSearch(frame, applyDownsampleFactor, logMinDownsampleFactor, logMaxDownsampleFactor, adjustTestLoss) .value_or(fallback); @@ -869,9 +883,10 @@ void CBoostedTreeFactory::initializeUnsetFeatureBagFraction(core::CDataFrame& fr // larger than the minimum. 
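+    // Illustrative worked example of the adjustment below: if minTestLoss were
+    // 0.10 then a candidate at logMaxFeatureBagFraction is penalised by
+    // 0.01 * 0.10 = 0.001, one at logMinFeatureBagFraction is not penalised at
+    // all, and points in between are penalised linearly in the log feature bag
+    // fraction.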
auto adjustTestLoss = [=](double logFeatureBagFraction, double minTestLoss, double testLoss) { - return testLoss + CTools::linearlyInterpolate( - logMinFeatureBagFraction, logMaxFeatureBagFraction, - 0.0, 0.01 * minTestLoss, logFeatureBagFraction); + return testLoss + + CTools::linearlyInterpolate( + logMinFeatureBagFraction, logMaxFeatureBagFraction, + 0.0, 0.01 * minTestLoss, logFeatureBagFraction); }; TVector fallback; @@ -1086,7 +1101,8 @@ CBoostedTreeFactory::testLossLineSearch(core::CDataFrame& frame, std::tie(bestParameter, bestParameterTestLoss) = lowess.minimum(); LOG_INFO(<< "best parameter = " << bestParameter << ", test loss = " << bestParameterTestLoss); - double width{(intervalRightEnd - intervalLeftEnd) / static_cast(MAX_LINE_SEARCH_ITERATIONS)}; + double width{(intervalRightEnd - intervalLeftEnd) / + static_cast(MAX_LINE_SEARCH_ITERATIONS)}; intervalLeftEnd = bestParameter - width; intervalRightEnd = bestParameter + width; LOG_INFO(<< "interval = [" << intervalLeftEnd << "," << intervalRightEnd << "]"); @@ -1164,6 +1180,11 @@ CBoostedTreeFactory& CBoostedTreeFactory::numberFolds(std::size_t numberFolds) { return *this; } +CBoostedTreeFactory& CBoostedTreeFactory::maximumNumberTrainRows(std::size_t rows) { + m_MaximumNumberOfTrainRows = rows; + return *this; +} + CBoostedTreeFactory& CBoostedTreeFactory::stratifyRegressionCrossValidation(bool stratify) { m_StratifyRegressionCrossValidation = stratify; return *this; diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index fe133c6a11..07d9b27ec5 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -334,7 +334,7 @@ std::size_t CBoostedTreeImpl::estimateMemoryUsage(std::size_t numberRows, m_MaximumNumberTrees * (sizeof(TNodeVec) + maximumNumberNodes * CBoostedTreeNode::estimateMemoryUsage( m_Loss->numberParameters()))}; - std::size_t foldRoundLossMemoryUsage{m_NumberFolds * m_NumberRounds * + std::size_t foldRoundLossMemoryUsage{this->numberFolds() * m_NumberRounds * sizeof(TOptionalDouble)}; std::size_t hyperparametersMemoryUsage{numberColumns * sizeof(double)}; std::size_t tunableHyperparametersMemoryUsage{ @@ -367,8 +367,7 @@ std::size_t CBoostedTreeImpl::estimateMemoryUsage(std::size_t numberRows, // we get a constant 8 / 64. std::size_t missingFeatureMaskMemoryUsage{8 * numberColumns * numberRows / 64}; std::size_t trainTestMaskMemoryUsage{ - 2 * static_cast(std::ceil(std::log2(static_cast(m_NumberFolds)))) * - numberRows}; + 2 * static_cast(std::ceil(std::log2(m_FractionalFolds))) * numberRows}; std::size_t bayesianOptimisationMemoryUsage{CBayesianOptimisation::estimateMemoryUsage( this->numberHyperparametersToTune(), m_NumberRounds)}; std::size_t worstCaseMemoryUsage{ @@ -436,7 +435,7 @@ CBoostedTreeImpl::gainAndCurvatureAtPercentile(double percentile, } void CBoostedTreeImpl::initializePerFoldTestLosses() { - m_FoldRoundTestLosses.resize(m_NumberFolds); + m_FoldRoundTestLosses.resize(this->numberFolds()); for (auto& losses : m_FoldRoundTestLosses) { losses.resize(m_NumberRounds); } @@ -523,7 +522,7 @@ CBoostedTreeImpl::TMeanVarAccumulatorSizeDoubleTuple CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame) { // We want to ensure we evaluate on equal proportions for each fold. 
- TSizeVec folds(m_NumberFolds); + TSizeVec folds(this->numberFolds()); std::iota(folds.begin(), folds.end(), 0); CSampling::random_shuffle(m_Rng, folds.begin(), folds.end()); @@ -533,8 +532,8 @@ CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame) { // that the test error is not close to the minimum test error. We use // the estimated test error for each remaining fold at two standard // deviations below the mean for this. - if (m_StopCrossValidationEarly && m_CurrentRound >= m_NumberFolds && - folds.size() < m_NumberFolds) { + if (m_StopCrossValidationEarly && m_CurrentRound >= this->numberFolds() && + folds.size() < this->numberFolds()) { for (const auto& testLoss : this->estimateMissingTestLosses(folds)) { testLossMoments.add( CBasicStatistics::mean(testLoss) - @@ -547,7 +546,7 @@ CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame) { TMeanVarAccumulator lossMoments; TDoubleVec numberTrees; - numberTrees.reserve(m_NumberFolds); + numberTrees.reserve(this->numberFolds()); TMeanAccumulator meanForestSizeAccumulator; while (folds.size() > 0 && stopCrossValidationEarly(lossMoments) == false) { @@ -973,12 +972,13 @@ double CBoostedTreeImpl::minimumTestLoss() const { TMinAccumulator minimumTestLoss; for (std::size_t round = 0; round < m_CurrentRound - 1; ++round) { TMeanVarAccumulator roundLossMoments; - for (std::size_t fold = 0; fold < m_NumberFolds; ++fold) { + for (std::size_t fold = 0; fold < this->numberFolds(); ++fold) { if (m_FoldRoundTestLosses[fold][round] != boost::none) { roundLossMoments.add(*m_FoldRoundTestLosses[fold][round]); } } - if (static_cast(CBasicStatistics::count(roundLossMoments)) == m_NumberFolds) { + if (static_cast(CBasicStatistics::count(roundLossMoments)) == + this->numberFolds()) { minimumTestLoss.add(CBasicStatistics::mean(roundLossMoments)); } } @@ -1027,7 +1027,7 @@ CBoostedTreeImpl::estimateMissingTestLosses(const TSizeVec& missing) const { // where the indices range over the folds for which we have errors in the // current round. - TSizeVec present(m_NumberFolds); + TSizeVec present(this->numberFolds()); std::iota(present.begin(), present.end(), 0); TSizeVec ordered{missing}; std::sort(ordered.begin(), ordered.end()); @@ -1478,6 +1478,10 @@ void CBoostedTreeImpl::scaleRegularizers(double scale) { } } +std::size_t CBoostedTreeImpl::numberFolds() const { + return static_cast(std::ceil(m_FractionalFolds)); +} + std::size_t CBoostedTreeImpl::numberHyperparametersToTune() const { return m_RegularizationOverride.countNotSet() + (m_DownsampleFactorOverride != boost::none ? 
0 : 1) + @@ -1500,7 +1504,7 @@ void CBoostedTreeImpl::recordHyperparameters() { m_Instrumentation->hyperparameters().s_Eta = m_Eta; m_Instrumentation->hyperparameters().s_ClassAssignmentObjective = m_ClassAssignmentObjective; m_Instrumentation->hyperparameters().s_DownsampleFactor = m_DownsampleFactor; - m_Instrumentation->hyperparameters().s_NumFolds = m_NumberFolds; + m_Instrumentation->hyperparameters().s_NumFolds = m_FractionalFolds; m_Instrumentation->hyperparameters().s_MaxTrees = m_MaximumNumberTrees; m_Instrumentation->hyperparameters().s_FeatureBagFraction = m_FeatureBagFraction; m_Instrumentation->hyperparameters().s_EtaGrowthRatePerTree = m_EtaGrowthRatePerTree; @@ -1583,13 +1587,13 @@ void CBoostedTreeImpl::startProgressMonitoringFineTuneHyperparameters() { m_Instrumentation->startNewProgressMonitoredTask(CBoostedTreeFactory::FINE_TUNING_PARAMETERS); - std::size_t totalNumberSteps{m_NumberRounds * m_MaximumNumberTrees * m_NumberFolds}; + std::size_t totalNumberSteps{m_NumberRounds * m_MaximumNumberTrees * this->numberFolds()}; LOG_TRACE(<< "main loop total number steps = " << totalNumberSteps); m_TrainingProgress = core::CLoopProgress{ totalNumberSteps, m_Instrumentation->progressCallback(), 1.0, 1024}; // Make sure progress starts where it left off. - m_TrainingProgress.increment(m_CurrentRound * m_MaximumNumberTrees * m_NumberFolds); + m_TrainingProgress.increment(m_CurrentRound * m_MaximumNumberTrees * this->numberFolds()); } void CBoostedTreeImpl::startProgressMonitoringFinalTrain() { @@ -1629,6 +1633,7 @@ const std::string FEATURE_BAG_FRACTION_TAG{"feature_bag_fraction"}; const std::string FEATURE_DATA_TYPES_TAG{"feature_data_types"}; const std::string FEATURE_SAMPLE_PROBABILITIES_TAG{"feature_sample_probabilities"}; const std::string FOLD_ROUND_TEST_LOSSES_TAG{"fold_round_test_losses"}; +const std::string FRACTIONAL_FOLDS_TAG{"number_folds"}; const std::string INITIALIZATION_STAGE_TAG{"initialization_progress"}; const std::string LOSS_TAG{"loss"}; const std::string LOSS_NAME_TAG{"loss_name"}; @@ -1640,7 +1645,6 @@ const std::string MAXIMUM_OPTIMISATION_ROUNDS_PER_HYPERPARAMETER_TAG{ const std::string MEAN_FOREST_SIZE_ACCUMULATOR_TAG{"mean_forest_size"}; const std::string MEAN_LOSS_ACCUMULATOR_TAG{"mean_loss"}; const std::string MISSING_FEATURE_ROW_MASKS_TAG{"missing_feature_row_masks"}; -const std::string NUMBER_FOLDS_TAG{"number_folds"}; const std::string NUMBER_FOLDS_OVERRIDE_TAG{"number_folds_override"}; const std::string NUMBER_ROUNDS_TAG{"number_rounds"}; const std::string NUMBER_SPLITS_PER_FEATURE_TAG{"number_splits_per_feature"}; @@ -1704,6 +1708,7 @@ void CBoostedTreeImpl::acceptPersistInserter(core::CStatePersistInserter& insert core::CPersistUtils::persist(FEATURE_SAMPLE_PROBABILITIES_TAG, m_FeatureSampleProbabilities, inserter); core::CPersistUtils::persist(FOLD_ROUND_TEST_LOSSES_TAG, m_FoldRoundTestLosses, inserter); + core::CPersistUtils::persist(FRACTIONAL_FOLDS_TAG, m_FractionalFolds, inserter); core::CPersistUtils::persist(INITIALIZATION_STAGE_TAG, static_cast(m_InitializationStage), inserter); if (m_Loss != nullptr) { @@ -1723,7 +1728,6 @@ void CBoostedTreeImpl::acceptPersistInserter(core::CStatePersistInserter& insert core::CPersistUtils::persist(MEAN_LOSS_ACCUMULATOR_TAG, m_MeanLossAccumulator, inserter); core::CPersistUtils::persist(MISSING_FEATURE_ROW_MASKS_TAG, m_MissingFeatureRowMasks, inserter); - core::CPersistUtils::persist(NUMBER_FOLDS_TAG, m_NumberFolds, inserter); core::CPersistUtils::persist(NUMBER_FOLDS_OVERRIDE_TAG, m_NumberFoldsOverride, 
inserter); core::CPersistUtils::persist(NUMBER_ROUNDS_TAG, m_NumberRounds, inserter); core::CPersistUtils::persist(NUMBER_SPLITS_PER_FEATURE_TAG, @@ -1820,6 +1824,8 @@ bool CBoostedTreeImpl::acceptRestoreTraverser(core::CStateRestoreTraverser& trav RESTORE(FOLD_ROUND_TEST_LOSSES_TAG, core::CPersistUtils::restore(FOLD_ROUND_TEST_LOSSES_TAG, m_FoldRoundTestLosses, traverser)) + RESTORE(FRACTIONAL_FOLDS_TAG, + core::CPersistUtils::restore(FRACTIONAL_FOLDS_TAG, m_FractionalFolds, traverser)) RESTORE(INITIALIZATION_STAGE_TAG, core::CPersistUtils::restore(INITIALIZATION_STAGE_TAG, initializationStage, traverser)) @@ -1846,8 +1852,6 @@ bool CBoostedTreeImpl::acceptRestoreTraverser(core::CStateRestoreTraverser& trav RESTORE(MISSING_FEATURE_ROW_MASKS_TAG, core::CPersistUtils::restore(MISSING_FEATURE_ROW_MASKS_TAG, m_MissingFeatureRowMasks, traverser)) - RESTORE(NUMBER_FOLDS_TAG, - core::CPersistUtils::restore(NUMBER_FOLDS_TAG, m_NumberFolds, traverser)) RESTORE(NUMBER_FOLDS_OVERRIDE_TAG, core::CPersistUtils::restore(NUMBER_FOLDS_OVERRIDE_TAG, m_NumberFoldsOverride, traverser)) @@ -1909,7 +1913,7 @@ void CBoostedTreeImpl::checkRestoredInvariants() const { VIOLATES_INVARIANT(m_TunableHyperparameters.size(), ==, samples.size()); } if (m_FoldRoundTestLosses.size() > 0) { - VIOLATES_INVARIANT(m_FoldRoundTestLosses.size(), ==, m_NumberFolds); + VIOLATES_INVARIANT(m_FoldRoundTestLosses.size(), ==, this->numberFolds()); for (const auto& losses : m_FoldRoundTestLosses) { VIOLATES_INVARIANT(losses.size(), ==, m_NumberRounds); } diff --git a/lib/maths/CDataFrameUtils.cc b/lib/maths/CDataFrameUtils.cc index 26759110fd..fed947f074 100644 --- a/lib/maths/CDataFrameUtils.cc +++ b/lib/maths/CDataFrameUtils.cc @@ -494,66 +494,74 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads, const core::CDataFrame& frame, std::size_t targetColumn, CPRNG::CXorOShiro128Plus rng, - std::size_t numberFolds, + double numberFolds, std::size_t numberBuckets, const core::CPackedBitVector& allTrainingRowsMask) { TDoubleVec frequencies; TStratifiedSamplerPtr sampler; - double numberTrainingRows{allTrainingRowsMask.manhattan()}; - if (numberTrainingRows < 2.0) { + double numberRows{allTrainingRowsMask.manhattan()}; + if (numberRows < std::max(numberFolds, 2.0)) { HANDLE_FATAL(<< "Input error: unsufficient training data provided."); return {{}, {}, {}}; } - std::size_t desiredCount{ - (static_cast(numberTrainingRows) + numberFolds / 2) / numberFolds}; + // We sample the smaller of the test/train sets in the loop. 
+ std::size_t numberTrainingRows{static_cast( + 1.0 - (numberFolds - 1.0) / numberFolds * numberRows + 0.5)}; + std::size_t numberTestingRows{static_cast(numberRows) - numberTrainingRows}; + std::size_t sampleSize{std::min(numberTrainingRows, numberTestingRows)}; if (frame.columnIsCategorical()[targetColumn]) { std::tie(sampler, frequencies) = classifierStratifiedCrossValidationRowSampler( - numberThreads, frame, targetColumn, rng, desiredCount, allTrainingRowsMask); + numberThreads, frame, targetColumn, rng, sampleSize, allTrainingRowsMask); } else { sampler = regressionStratifiedCrossValiationRowSampler( - numberThreads, frame, targetColumn, rng, desiredCount, - numberBuckets, allTrainingRowsMask); + numberThreads, frame, targetColumn, rng, sampleSize, numberBuckets, + allTrainingRowsMask); } LOG_TRACE(<< "number training rows = " << allTrainingRowsMask.manhattan()); - TPackedBitVectorVec testingRowMasks(numberFolds); + TPackedBitVectorVec testingRowMasks(static_cast(std::ceil(numberFolds))); TSizeVec rowIndices; core::CPackedBitVector candidateTestingRowsMask{allTrainingRowsMask}; - for (std::size_t fold = 0; fold < numberFolds - 1; ++fold) { - frame.readRows(1, 0, frame.numberRows(), - [&](const TRowItr& beginRows, const TRowItr& endRows) { - for (auto row = beginRows; row != endRows; ++row) { - sampler->sample(*row); - } - }, - &candidateTestingRowsMask); - sampler->finishSampling(rng, rowIndices); - std::sort(rowIndices.begin(), rowIndices.end()); - LOG_TRACE(<< "# row indices = " << rowIndices.size()); - - for (auto row : rowIndices) { - testingRowMasks[fold].extend(false, row - testingRowMasks[fold].size()); - testingRowMasks[fold].extend(true); + for (std::size_t fold = 0; fold < testingRowMasks.size(); ++fold) { + if (candidateTestingRowsMask.manhattan() < + static_cast(sampleSize - numberFolds)) { + frame.readRows(1, 0, frame.numberRows(), + [&](const TRowItr& beginRows, const TRowItr& endRows) { + for (auto row = beginRows; row != endRows; ++row) { + sampler->sample(*row); + } + }, + &candidateTestingRowsMask); + sampler->finishSampling(rng, rowIndices); + std::sort(rowIndices.begin(), rowIndices.end()); + LOG_TRACE(<< "# row indices = " << rowIndices.size()); + + for (auto row : rowIndices) { + testingRowMasks[fold].extend(false, row - testingRowMasks[fold].size()); + testingRowMasks[fold].extend(true); + } + testingRowMasks[fold].extend(false, allTrainingRowsMask.size() - + testingRowMasks[fold].size()); + } else { + testingRowMasks[fold] = candidateTestingRowsMask; } - testingRowMasks[fold].extend(false, allTrainingRowsMask.size() - - testingRowMasks[fold].size()); // We exclusive or here to remove the rows we've selected for the current - //test fold. This is equivalent to samplng without replacement + // test/train fold. This is equivalent to sampling without replacement. candidateTestingRowsMask ^= testingRowMasks[fold]; } - // Everything which is left. 
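A simplified sketch of the fold construction idea, using plain index sets and std::sample as stand-ins for core::CPackedBitVector and the stratified sampler (both stand-ins are assumptions made for illustration; the shipped code also stratifies by the target and extends bit masks rather than erasing from a set). Each fold samples the smaller of its train/test partition from the rows still available and then removes those rows, so later folds effectively sample without replacement, which is the role played by the exclusive or above.

// Sketch only, not the shipped implementation.
#include <algorithm>
#include <iterator>
#include <random>
#include <set>
#include <vector>

std::vector<std::set<std::size_t>>
makeTestFoldIndexSets(std::set<std::size_t> candidates, // all candidate training rows
                      std::size_t numberFolds,
                      std::size_t sampleSize, // min(#train, #test) rows per fold
                      std::mt19937& rng) {
    std::vector<std::set<std::size_t>> folds(numberFolds);
    for (auto& fold : folds) {
        if (candidates.size() < sampleSize + numberFolds) {
            fold = candidates; // too few rows remain: take everything which is left
        } else {
            std::sample(candidates.begin(), candidates.end(),
                        std::inserter(fold, fold.end()), sampleSize, rng);
        }
        // Dropping the sampled rows plays the role of the exclusive or with
        // the fold mask: subsequent folds draw only from what remains.
        for (auto row : fold) {
            candidates.erase(row);
        }
    }
    return folds;
}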
- testingRowMasks.back() = std::move(candidateTestingRowsMask); - LOG_TRACE(<< "# remaining rows = " << testingRowMasks.back().manhattan()); - TPackedBitVectorVec trainingRowMasks{complementRowMasks(testingRowMasks, allTrainingRowsMask)}; + if (numberTrainingRows < numberTestingRows) { + std::swap(trainingRowMasks, testingRowMasks); + } + return {std::move(trainingRowMasks), std::move(testingRowMasks), std::move(frequencies)}; } From 26070c4de30cad30e2fab43e3c5a3c94bb0608f6 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 1 Jul 2021 11:38:38 +0100 Subject: [PATCH 03/35] Allow one to disable fine tuning entirely for fast mode --- lib/api/CDataFrameTrainBoostedTreeRunner.cc | 10 +++++-- lib/maths/CBoostedTreeFactory.cc | 30 ++++++++++----------- 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/lib/api/CDataFrameTrainBoostedTreeRunner.cc b/lib/api/CDataFrameTrainBoostedTreeRunner.cc index 0e29efe898..4acbbecbe6 100644 --- a/lib/api/CDataFrameTrainBoostedTreeRunner.cc +++ b/lib/api/CDataFrameTrainBoostedTreeRunner.cc @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -29,6 +30,10 @@ namespace ml { namespace api { +namespace { +const std::size_t UNSET_NUMBER_ROUNDS_PER_HYPERPARAMETER{ + std::numeric_limits::max()}; +} const CDataFrameAnalysisConfigReader& CDataFrameTrainBoostedTreeRunner::parameterReader() { static const CDataFrameAnalysisConfigReader PARAMETER_READER{[] { @@ -96,7 +101,8 @@ CDataFrameTrainBoostedTreeRunner::CDataFrameTrainBoostedTreeRunner( std::size_t maxTrees{parameters[MAX_TREES].fallback(std::size_t{0})}; std::size_t numberFolds{parameters[NUM_FOLDS].fallback(std::size_t{0})}; std::size_t numberRoundsPerHyperparameter{ - parameters[MAX_OPTIMIZATION_ROUNDS_PER_HYPERPARAMETER].fallback(std::size_t{0})}; + parameters[MAX_OPTIMIZATION_ROUNDS_PER_HYPERPARAMETER].fallback( + UNSET_NUMBER_ROUNDS_PER_HYPERPARAMETER)}; std::size_t bayesianOptimisationRestarts{ parameters[BAYESIAN_OPTIMISATION_RESTARTS].fallback(std::size_t{0})}; bool stopCrossValidationEarly{parameters[STOP_CROSS_VALIDATION_EARLY].fallback(true)}; @@ -192,7 +198,7 @@ CDataFrameTrainBoostedTreeRunner::CDataFrameTrainBoostedTreeRunner( if (numberFolds > 1) { m_BoostedTreeFactory->numberFolds(numberFolds); } - if (numberRoundsPerHyperparameter > 0) { + if (numberRoundsPerHyperparameter != UNSET_NUMBER_ROUNDS_PER_HYPERPARAMETER) { m_BoostedTreeFactory->maximumOptimisationRoundsPerHyperparameter(numberRoundsPerHyperparameter); } if (bayesianOptimisationRestarts > 0) { diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index 8218489619..ec093d66fe 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -359,8 +359,6 @@ void CBoostedTreeFactory::initializeNumberFolds(core::CDataFrame& frame) const { maximumTrainingDataConstraintNumberFolds); LOG_TRACE(<< "initial downsample fraction = " << initialDownsampleFraction << " # folds = " << m_TreeImpl->m_FractionalFolds); - } else { - m_TreeImpl->m_FractionalFolds = static_cast(*m_TreeImpl->m_NumberFoldsOverride); } } @@ -467,30 +465,20 @@ void CBoostedTreeFactory::initializeHyperparameters(core::CDataFrame& frame) { } void CBoostedTreeFactory::initializeHyperparametersSetup(core::CDataFrame& frame) { - if (m_TreeImpl->m_EtaOverride != boost::none) { - m_TreeImpl->m_Eta = *(m_TreeImpl->m_EtaOverride); - } else { + if (m_TreeImpl->m_EtaOverride == boost::none) { m_TreeImpl->m_Eta = computeEta(frame.numberColumns() - this->numberExtraColumnsForTrain()); 
m_TreeImpl->m_EtaGrowthRatePerTree = 1.0 + m_TreeImpl->m_Eta / 2.0; } - if (m_TreeImpl->m_EtaGrowthRatePerTreeOverride != boost::none) { - m_TreeImpl->m_EtaGrowthRatePerTree = *(m_TreeImpl->m_EtaGrowthRatePerTreeOverride); - } - - if (m_TreeImpl->m_MaximumNumberTreesOverride != boost::none) { - m_TreeImpl->m_MaximumNumberTrees = *(m_TreeImpl->m_MaximumNumberTreesOverride); - } else { + if (m_TreeImpl->m_MaximumNumberTreesOverride == boost::none) { // This needs to be tied to the learn rate to avoid bias. m_TreeImpl->m_MaximumNumberTrees = computeMaximumNumberTrees(m_TreeImpl->m_Eta); } double numberFeatures{static_cast(m_TreeImpl->m_Encoder->numberEncodedColumns())}; - if (m_TreeImpl->m_FeatureBagFractionOverride != boost::none) { - m_TreeImpl->m_FeatureBagFraction = *(m_TreeImpl->m_FeatureBagFractionOverride); - } else { + if (m_TreeImpl->m_FeatureBagFractionOverride == boost::none) { m_TreeImpl->m_FeatureBagFraction = std::min(m_TreeImpl->m_FeatureBagFraction, m_TreeImpl->m_TrainingRowMasks[0].manhattan() / @@ -1159,6 +1147,7 @@ CBoostedTreeFactory::classAssignmentObjective(CBoostedTree::EClassAssignmentObje CBoostedTreeFactory& CBoostedTreeFactory::classificationWeights(TStrDoublePrVec weights) { m_TreeImpl->m_ClassificationWeightsOverride = std::move(weights); + m_TreeImpl->m_ClassificationWeights = *m_TreeImpl->m_ClassificationWeightsOverride; return *this; } @@ -1177,6 +1166,7 @@ CBoostedTreeFactory& CBoostedTreeFactory::numberFolds(std::size_t numberFolds) { numberFolds = 2; } m_TreeImpl->m_NumberFoldsOverride = numberFolds; + m_TreeImpl->m_FractionalFolds = static_cast(numberFolds); return *this; } @@ -1209,6 +1199,7 @@ CBoostedTreeFactory& CBoostedTreeFactory::downsampleFactor(double factor) { factor = 1.0; } m_TreeImpl->m_DownsampleFactorOverride = factor; + m_TreeImpl->m_DownsampleFactor = factor; return *this; } @@ -1218,6 +1209,7 @@ CBoostedTreeFactory& CBoostedTreeFactory::depthPenaltyMultiplier(double depthPen depthPenaltyMultiplier = 0.0; } m_TreeImpl->m_RegularizationOverride.depthPenaltyMultiplier(depthPenaltyMultiplier); + m_TreeImpl->m_Regularization.depthPenaltyMultiplier(depthPenaltyMultiplier); return *this; } @@ -1227,6 +1219,7 @@ CBoostedTreeFactory& CBoostedTreeFactory::treeSizePenaltyMultiplier(double treeS treeSizePenaltyMultiplier = 0.0; } m_TreeImpl->m_RegularizationOverride.treeSizePenaltyMultiplier(treeSizePenaltyMultiplier); + m_TreeImpl->m_Regularization.treeSizePenaltyMultiplier(treeSizePenaltyMultiplier); return *this; } @@ -1236,6 +1229,7 @@ CBoostedTreeFactory& CBoostedTreeFactory::leafWeightPenaltyMultiplier(double lea leafWeightPenaltyMultiplier = 0.0; } m_TreeImpl->m_RegularizationOverride.leafWeightPenaltyMultiplier(leafWeightPenaltyMultiplier); + m_TreeImpl->m_Regularization.leafWeightPenaltyMultiplier(leafWeightPenaltyMultiplier); return *this; } @@ -1245,6 +1239,7 @@ CBoostedTreeFactory& CBoostedTreeFactory::softTreeDepthLimit(double softTreeDept softTreeDepthLimit = MIN_SOFT_DEPTH_LIMIT; } m_TreeImpl->m_RegularizationOverride.softTreeDepthLimit(softTreeDepthLimit); + m_TreeImpl->m_Regularization.softTreeDepthLimit(softTreeDepthLimit); return *this; } @@ -1254,6 +1249,7 @@ CBoostedTreeFactory& CBoostedTreeFactory::softTreeDepthTolerance(double softTree softTreeDepthTolerance = 0.01; } m_TreeImpl->m_RegularizationOverride.softTreeDepthTolerance(softTreeDepthTolerance); + m_TreeImpl->m_Regularization.softTreeDepthTolerance(softTreeDepthTolerance); return *this; } @@ -1268,6 +1264,7 @@ CBoostedTreeFactory& CBoostedTreeFactory::eta(double eta) { eta 
= 1.0; } m_TreeImpl->m_EtaOverride = eta; + m_TreeImpl->m_Eta = eta; return *this; } @@ -1278,6 +1275,7 @@ CBoostedTreeFactory& CBoostedTreeFactory::etaGrowthRatePerTree(double etaGrowthR etaGrowthRatePerTree = std::max(etaGrowthRatePerTree, MIN_ETA); } m_TreeImpl->m_EtaGrowthRatePerTreeOverride = etaGrowthRatePerTree; + m_TreeImpl->m_EtaGrowthRatePerTree = etaGrowthRatePerTree; return *this; } @@ -1292,6 +1290,7 @@ CBoostedTreeFactory& CBoostedTreeFactory::maximumNumberTrees(std::size_t maximum maximumNumberTrees = std::min(maximumNumberTrees, MAX_NUMBER_TREES); } m_TreeImpl->m_MaximumNumberTreesOverride = maximumNumberTrees; + m_TreeImpl->m_MaximumNumberTrees = maximumNumberTrees; return *this; } @@ -1302,6 +1301,7 @@ CBoostedTreeFactory& CBoostedTreeFactory::featureBagFraction(double featureBagFr featureBagFraction = CTools::truncate(featureBagFraction, 0.0, 1.0); } m_TreeImpl->m_FeatureBagFractionOverride = featureBagFraction; + m_TreeImpl->m_FeatureBagFraction = featureBagFraction; return *this; } From 81d3ffd3a7626380c1a6c6b351c21d308d789b6a Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 1 Jul 2021 17:51:56 +0100 Subject: [PATCH 04/35] Uncouple training fraction parameter from the number of folds --- .../api/CDataFrameTrainBoostedTreeRunner.h | 1 + include/maths/CBoostedTreeFactory.h | 4 +- include/maths/CBoostedTreeImpl.h | 7 +- ...ataFrameAnalysisInstrumentationInterface.h | 29 ++--- include/maths/CDataFrameUtils.h | 8 +- lib/api/CDataFrameAnalysisInstrumentation.cc | 4 + lib/api/CDataFrameTrainBoostedTreeRunner.cc | 12 +- lib/maths/CBoostedTreeFactory.cc | 104 +++++++++--------- lib/maths/CBoostedTreeImpl.cc | 57 ++++++---- lib/maths/CDataFrameUtils.cc | 28 ++--- lib/maths/unittest/CBoostedTreeTest.cc | 64 +++++++++++ lib/maths/unittest/CDataFrameUtilsTest.cc | 8 +- 12 files changed, 207 insertions(+), 119 deletions(-) diff --git a/include/api/CDataFrameTrainBoostedTreeRunner.h b/include/api/CDataFrameTrainBoostedTreeRunner.h index 7fe4ff7100..5dc3c87f21 100644 --- a/include/api/CDataFrameTrainBoostedTreeRunner.h +++ b/include/api/CDataFrameTrainBoostedTreeRunner.h @@ -48,6 +48,7 @@ class API_EXPORT CDataFrameTrainBoostedTreeRunner : public CDataFrameAnalysisRun static const std::string MAX_TREES; static const std::string FEATURE_BAG_FRACTION; static const std::string NUM_FOLDS; + static const std::string TRAIN_FRACTION_PER_FOLD; static const std::string STOP_CROSS_VALIDATION_EARLY; static const std::string MAX_OPTIMIZATION_ROUNDS_PER_HYPERPARAMETER; static const std::string BAYESIAN_OPTIMISATION_RESTARTS; diff --git a/include/maths/CBoostedTreeFactory.h b/include/maths/CBoostedTreeFactory.h index 74eeae688f..ca06cdf0f2 100644 --- a/include/maths/CBoostedTreeFactory.h +++ b/include/maths/CBoostedTreeFactory.h @@ -80,6 +80,8 @@ class MATHS_EXPORT CBoostedTreeFactory final { CBoostedTreeFactory& minimumFrequencyToOneHotEncode(double frequency); //! Set the number of folds to use for estimating the generalisation error. CBoostedTreeFactory& numberFolds(std::size_t numberFolds); + //! Set the fraction fold data to use for training. + CBoostedTreeFactory& trainFractionPerFold(double fraction); //! Set the maximum number of rows to use for training when tuning hyperparameters. CBoostedTreeFactory& maximumNumberTrainRows(std::size_t rows); //! Stratify the cross-validation we do for regression. 
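A minimal usage sketch of the new knob, assuming an already populated data frame frame, a target column index targetColumn and a helper makeLoss() returning the loss object (all three are illustrative placeholders, not names introduced by this patch; the call chain itself follows the declarations above and the unit test added later in this series):

// Sketch only: frame, targetColumn and makeLoss() are placeholders.
auto regression = maths::CBoostedTreeFactory::constructFromParameters(1, makeLoss())
                      .numberFolds(4)             // optional: override the heuristic choice
                      .trainFractionPerFold(0.05) // optional: train on ~5% of the rows per fold
                      .buildFor(*frame, targetColumn);
regression->train();
regression->predict();

The same setting is exposed end to end as the train_fraction_per_fold analysis parameter, which CDataFrameTrainBoostedTreeRunner forwards to the factory whenever it is greater than zero.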
@@ -277,7 +279,7 @@ class MATHS_EXPORT CBoostedTreeFactory final { TOptionalSize m_BayesianOptimisationRestarts; bool m_StratifyRegressionCrossValidation = true; double m_InitialDownsampleRowsPerFeature = 200.0; - std::size_t m_MaximumNumberOfTrainRows = 1000000; + std::size_t m_MaximumNumberOfTrainRows = 750000; double m_GainPerNode1stPercentile = 0.0; double m_GainPerNode50thPercentile = 0.0; double m_GainPerNode90thPercentile = 0.0; diff --git a/include/maths/CBoostedTreeImpl.h b/include/maths/CBoostedTreeImpl.h index ad19e896c7..a2d948aa33 100644 --- a/include/maths/CBoostedTreeImpl.h +++ b/include/maths/CBoostedTreeImpl.h @@ -320,9 +320,6 @@ class MATHS_EXPORT CBoostedTreeImpl final { //! Check invariants which are assumed to hold in order to train on \p frame. void checkTrainInvariants(const core::CDataFrame& frame) const; - //! Get the count of train/validation folds. - std::size_t numberFolds() const; - //! Get the number of hyperparameters to tune. std::size_t numberHyperparametersToTune() const; @@ -375,6 +372,7 @@ class MATHS_EXPORT CBoostedTreeImpl final { TOptionalDouble m_EtaOverride; TOptionalDouble m_EtaGrowthRatePerTreeOverride; TOptionalSize m_NumberFoldsOverride; + TOptionalSize m_TrainFractionPerFoldOverride; TOptionalSize m_MaximumNumberTreesOverride; TOptionalDouble m_FeatureBagFractionOverride; TOptionalStrDoublePrVec m_ClassificationWeightsOverride; @@ -383,7 +381,8 @@ class MATHS_EXPORT CBoostedTreeImpl final { double m_DownsampleFactor = 0.5; double m_Eta = 0.1; double m_EtaGrowthRatePerTree = 1.05; - double m_FractionalFolds = 4.0; + std::size_t m_NumberFolds = 4; + double m_TrainFractionPerFold = 0.75; std::size_t m_MaximumNumberTrees = 20; std::size_t m_MaximumAttemptsToAddTree = 3; std::size_t m_NumberSplitsPerFeature = 75; diff --git a/include/maths/CDataFrameAnalysisInstrumentationInterface.h b/include/maths/CDataFrameAnalysisInstrumentationInterface.h index 23a876dd84..f6b35916b0 100644 --- a/include/maths/CDataFrameAnalysisInstrumentationInterface.h +++ b/include/maths/CDataFrameAnalysisInstrumentationInterface.h @@ -103,25 +103,26 @@ class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationInterface s_SoftTreeDepthLimit{softTreeDepthLimit}, s_SoftTreeDepthTolerance{softTreeDepthTolerance}, s_TreeSizePenaltyMultiplier{treeSizePenaltyMultiplier}, s_LeafWeightPenaltyMultiplier{leafWeightPenaltyMultiplier} {} - double s_DepthPenaltyMultiplier = -1.0; - double s_SoftTreeDepthLimit = -1.0; - double s_SoftTreeDepthTolerance = -1.0; - double s_TreeSizePenaltyMultiplier = -1.0; - double s_LeafWeightPenaltyMultiplier = -1.0; + double s_DepthPenaltyMultiplier{-1.0}; + double s_SoftTreeDepthLimit{-1.0}; + double s_SoftTreeDepthTolerance{-1.0}; + double s_TreeSizePenaltyMultiplier{-1.0}; + double s_LeafWeightPenaltyMultiplier{-1.0}; }; struct SHyperparameters { - double s_Eta = -1.0; + double s_Eta{-1.0}; CBoostedTree::EClassAssignmentObjective s_ClassAssignmentObjective = CBoostedTree::E_MinimumRecall; SRegularization s_Regularization; - double s_DownsampleFactor = -1.0; - std::size_t s_NumFolds = 0; - std::size_t s_MaxTrees = 0; - double s_FeatureBagFraction = -1.0; - double s_EtaGrowthRatePerTree = -1.0; - std::size_t s_MaxAttemptsToAddTree = 0; - std::size_t s_NumSplitsPerFeature = 0; - std::size_t s_MaxOptimizationRoundsPerHyperparameter = 0; + double s_DownsampleFactor{-1.0}; + std::size_t s_NumFolds{0}; + double s_TrainFractionPerFold{0.0}; + std::size_t s_MaxTrees{0}; + double s_FeatureBagFraction{-1.0}; + double s_EtaGrowthRatePerTree{-1.0}; + std::size_t 
s_MaxAttemptsToAddTree{0}; + std::size_t s_NumSplitsPerFeature{0}; + std::size_t s_MaxOptimizationRoundsPerHyperparameter{0}; }; using TDoubleVec = std::vector; diff --git a/include/maths/CDataFrameUtils.h b/include/maths/CDataFrameUtils.h index 411a680455..0e08f1dca0 100644 --- a/include/maths/CDataFrameUtils.h +++ b/include/maths/CDataFrameUtils.h @@ -280,9 +280,8 @@ class MATHS_EXPORT CDataFrameUtils : private core::CNonInstantiatable { //! \param[in] frame The data frame for which to compute the row masks. //! \param[in] targetColumn The index of the column to predict. //! \param[in] rng The random number generator to use. - //! \param[in] numberFolds The number of folds to use. If this is less than - //! two, there will be two train masks, but their size will be less than 50% - //! of the data. + //! \param[in] numberFolds The number of folds to use. + //! \param[in] trainFractionPerFold The fraction of train data to use per fold. //! \param[in] numberBuckets The number of buckets to use when stratifying //! by target quantiles for regression. //! \param[in] allTrainingRowsMask A mask of the candidate training rows. @@ -292,7 +291,8 @@ class MATHS_EXPORT CDataFrameUtils : private core::CNonInstantiatable { const core::CDataFrame& frame, std::size_t targetColumn, CPRNG::CXorOShiro128Plus rng, - double numberFolds, + std::size_t numberFolds, + double trainFractionPerFold, std::size_t numberBuckets, const core::CPackedBitVector& allTrainingRowsMask); diff --git a/lib/api/CDataFrameAnalysisInstrumentation.cc b/lib/api/CDataFrameAnalysisInstrumentation.cc index ec1c18e9f5..5d059cea08 100644 --- a/lib/api/CDataFrameAnalysisInstrumentation.cc +++ b/lib/api/CDataFrameAnalysisInstrumentation.cc @@ -483,6 +483,10 @@ void CDataFrameTrainBoostedTreeInstrumentation::writeHyperparameters(rapidjson:: rapidjson::Value(static_cast(this->m_Hyperparameters.s_NumFolds)) .Move(), parentObject); + writer->addMember( + CDataFrameTrainBoostedTreeRunner::TRAIN_FRACTION_PER_FOLD, + rapidjson::Value(this->m_Hyperparameters.s_TrainFractionPerFold).Move(), + parentObject); writer->addMember( CDataFrameTrainBoostedTreeRunner::MAX_TREES, rapidjson::Value(static_cast(this->m_Hyperparameters.s_MaxTrees)) diff --git a/lib/api/CDataFrameTrainBoostedTreeRunner.cc b/lib/api/CDataFrameTrainBoostedTreeRunner.cc index 4acbbecbe6..b10932a5b2 100644 --- a/lib/api/CDataFrameTrainBoostedTreeRunner.cc +++ b/lib/api/CDataFrameTrainBoostedTreeRunner.cc @@ -14,20 +14,21 @@ #include #include -#include #include #include #include #include +#include #include #include #include #include -#include #include +#include + namespace ml { namespace api { namespace { @@ -60,6 +61,8 @@ const CDataFrameAnalysisConfigReader& CDataFrameTrainBoostedTreeRunner::paramete theReader.addParameter(FEATURE_BAG_FRACTION, CDataFrameAnalysisConfigReader::E_OptionalParameter); theReader.addParameter(NUM_FOLDS, CDataFrameAnalysisConfigReader::E_OptionalParameter); + theReader.addParameter(TRAIN_FRACTION_PER_FOLD, + CDataFrameAnalysisConfigReader::E_OptionalParameter); theReader.addParameter(STOP_CROSS_VALIDATION_EARLY, CDataFrameAnalysisConfigReader::E_OptionalParameter); theReader.addParameter(MAX_OPTIMIZATION_ROUNDS_PER_HYPERPARAMETER, @@ -100,6 +103,7 @@ CDataFrameTrainBoostedTreeRunner::CDataFrameTrainBoostedTreeRunner( std::size_t maxTrees{parameters[MAX_TREES].fallback(std::size_t{0})}; std::size_t numberFolds{parameters[NUM_FOLDS].fallback(std::size_t{0})}; + double trainFractionPerFold{parameters[TRAIN_FRACTION_PER_FOLD].fallback(-1.0)}; std::size_t 
numberRoundsPerHyperparameter{ parameters[MAX_OPTIMIZATION_ROUNDS_PER_HYPERPARAMETER].fallback( UNSET_NUMBER_ROUNDS_PER_HYPERPARAMETER)}; @@ -198,6 +202,9 @@ CDataFrameTrainBoostedTreeRunner::CDataFrameTrainBoostedTreeRunner( if (numberFolds > 1) { m_BoostedTreeFactory->numberFolds(numberFolds); } + if (trainFractionPerFold > 0.0) { + m_BoostedTreeFactory->trainFractionPerFold(trainFractionPerFold); + } if (numberRoundsPerHyperparameter != UNSET_NUMBER_ROUNDS_PER_HYPERPARAMETER) { m_BoostedTreeFactory->maximumOptimisationRoundsPerHyperparameter(numberRoundsPerHyperparameter); } @@ -394,6 +401,7 @@ const std::string CDataFrameTrainBoostedTreeRunner::SOFT_TREE_DEPTH_TOLERANCE{"s const std::string CDataFrameTrainBoostedTreeRunner::MAX_TREES{"max_trees"}; const std::string CDataFrameTrainBoostedTreeRunner::FEATURE_BAG_FRACTION{"feature_bag_fraction"}; const std::string CDataFrameTrainBoostedTreeRunner::NUM_FOLDS{"num_folds"}; +const std::string CDataFrameTrainBoostedTreeRunner::TRAIN_FRACTION_PER_FOLD{"train_fraction_per_fold"}; const std::string CDataFrameTrainBoostedTreeRunner::STOP_CROSS_VALIDATION_EARLY{"stop_cross_validation_early"}; const std::string CDataFrameTrainBoostedTreeRunner::MAX_OPTIMIZATION_ROUNDS_PER_HYPERPARAMETER{"max_optimization_rounds_per_hyperparameter"}; const std::string CDataFrameTrainBoostedTreeRunner::BAYESIAN_OPTIMISATION_RESTARTS{"bayesian_optimisation_restarts"}; diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index ec093d66fe..821991d496 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -104,9 +104,6 @@ CBoostedTreeFactory::buildFor(core::CDataFrame& frame, std::size_t dependentVari ? this->skipProgressMonitoringFeatureSelection() : this->startProgressMonitoringFeatureSelection(); - // Find the maximum number of rows at which the selected tree depth does not change significantly. - // Need to call hyperparameter set up first. 
- skipIfAfter(CBoostedTreeImpl::E_NotInitialized, [&] { this->initializeCrossValidation(frame); }); skipIfAfter(CBoostedTreeImpl::E_NotInitialized, @@ -295,25 +292,25 @@ void CBoostedTreeFactory::initializeMissingFeatureMasks(const core::CDataFrame& void CBoostedTreeFactory::initializeNumberFolds(core::CDataFrame& frame) const { - if (m_TreeImpl->m_NumberFoldsOverride == boost::none) { - auto result = frame.readRows( - m_NumberThreads, - core::bindRetrievableState( - [this](std::size_t& numberTrainingRows, - const TRowItr& beginRows, const TRowItr& endRows) { - for (auto row = beginRows; row != endRows; ++row) { - double target{(*row)[m_TreeImpl->m_DependentVariable]}; - if (CDataFrameUtils::isMissing(target) == false) { - ++numberTrainingRows; - } + auto result = frame.readRows( + m_NumberThreads, + core::bindRetrievableState( + [this](std::size_t& numberTrainingRows, const TRowItr& beginRows, const TRowItr& endRows) { + for (auto row = beginRows; row != endRows; ++row) { + double target{(*row)[m_TreeImpl->m_DependentVariable]}; + if (CDataFrameUtils::isMissing(target) == false) { + ++numberTrainingRows; } - }, - std::size_t{0})); - std::size_t totalNumberTrainingRows{0}; - for (const auto& numberTrainingRows : result.first) { - totalNumberTrainingRows += numberTrainingRows.s_FunctionState; - } - LOG_TRACE(<< "total number training rows = " << totalNumberTrainingRows); + } + }, + std::size_t{0})); + std::size_t totalNumberTrainingRows{0}; + for (const auto& numberTrainingRows : result.first) { + totalNumberTrainingRows += numberTrainingRows.s_FunctionState; + } + LOG_TRACE(<< "total number training rows = " << totalNumberTrainingRows); + + if (m_TreeImpl->m_NumberFoldsOverride == boost::none) { // We want to choose the number of folds so we'll have enough training data // after leaving out one fold. We choose the initial downsample size based @@ -335,31 +332,25 @@ void CBoostedTreeFactory::initializeNumberFolds(core::CDataFrame& frame) const { // // In addition, we want to constrain the maximum amount of training data we'll // use during hyperparameter search to avoid very long run times. To do this - // we set the number of folds to be less than two. We define the size of the - // training data set to be (k - 1) / k * # rows, with k the number of folds. - // If k < 2 this means we end up selecting less than half the data for training. - // To meet the constraint on the maximum number of rows M we must choose k - // which satisfies M >= (k - 1) / k * # rows. This is trivially satisfied for - // # rows less than M and, given we also constrain the maximum number of folds, - // we only care if # rows > MAX_NUMBER_FOLDS * M / (MAX_NUMBER_FOLDS - 1). + // we use less than the implied 1 - 1/k : 1/k train : test split when it results + // in more train rows than the defined maximum. 
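As a sketch, the heuristic described in the comment above amounts to the two small formulas below (MAX_DESIRED_INITIAL_DOWNSAMPLE_FRACTION and MAX_NUMBER_FOLDS are constants defined elsewhere in CBoostedTreeFactory.cc, so they are taken as parameters here rather than guessed at):

// Sketch of the fold-count and train-fraction heuristic.
#include <algorithm>
#include <cmath>
#include <cstddef>

std::size_t chooseNumberFolds(double initialDownsampleFraction,
                              double maxDesiredInitialDownsampleFraction,
                              double maxNumberFolds) {
    // Enough folds that leaving one fold out still supports the desired
    // initial downsample fraction, capped at maxNumberFolds.
    return static_cast<std::size_t>(std::ceil(
        1.0 / std::max(1.0 - initialDownsampleFraction / maxDesiredInitialDownsampleFraction,
                       1.0 / maxNumberFolds)));
}

double chooseTrainFractionPerFold(std::size_t numberFolds,
                                  double maximumNumberTrainRows,
                                  double totalNumberTrainingRows) {
    // Use the implied (k - 1) / k train fraction unless that would exceed the
    // cap on the number of rows used while tuning hyperparameters.
    return std::min(1.0 - 1.0 / static_cast<double>(numberFolds),
                    maximumNumberTrainRows / totalNumberTrainingRows);
}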
double initialDownsampleFraction{(m_InitialDownsampleRowsPerFeature * static_cast(frame.numberColumns() - 1)) / static_cast(totalNumberTrainingRows)}; - double minimumTrainingDataConstraintNumberFolds{ + LOG_TRACE(<< "initial downsample fraction = " << initialDownsampleFraction); + m_TreeImpl->m_NumberFolds = static_cast( std::ceil(1.0 / std::max(1.0 - initialDownsampleFraction / MAX_DESIRED_INITIAL_DOWNSAMPLE_FRACTION, - 1.0 / MAX_NUMBER_FOLDS))}; - double maximumTrainingDataConstraintNumberFolds{ - 1.0 / (1.0 - static_cast(m_MaximumNumberOfTrainRows) / - std::max(static_cast(frame.numberRows()), - MAX_NUMBER_FOLDS / (MAX_NUMBER_FOLDS - 1.0) * - static_cast(m_MaximumNumberOfTrainRows)))}; - - m_TreeImpl->m_FractionalFolds = std::min(minimumTrainingDataConstraintNumberFolds, - maximumTrainingDataConstraintNumberFolds); - LOG_TRACE(<< "initial downsample fraction = " << initialDownsampleFraction - << " # folds = " << m_TreeImpl->m_FractionalFolds); + 1.0 / MAX_NUMBER_FOLDS))); + } + if (m_TreeImpl->m_TrainFractionPerFoldOverride == boost::none) { + m_TreeImpl->m_TrainFractionPerFold = + std::min(1.0 - 1.0 / static_cast(m_TreeImpl->m_NumberFolds), + static_cast(m_MaximumNumberOfTrainRows) / + static_cast(totalNumberTrainingRows)); } + LOG_TRACE(<< "# folds = " << m_TreeImpl->m_NumberFolds + << ", train fraction per fold = " << m_TreeImpl->m_TrainFractionPerFold); } void CBoostedTreeFactory::resizeDataFrame(core::CDataFrame& frame) const { @@ -390,8 +381,9 @@ void CBoostedTreeFactory::initializeCrossValidation(core::CDataFrame& frame) con std::size_t numberBuckets(m_StratifyRegressionCrossValidation ? 10 : 1); std::tie(m_TreeImpl->m_TrainingRowMasks, m_TreeImpl->m_TestingRowMasks, std::ignore) = CDataFrameUtils::stratifiedCrossValidationRowMasks( - m_TreeImpl->m_NumberThreads, frame, dependentVariable, m_TreeImpl->m_Rng, - m_TreeImpl->m_FractionalFolds, numberBuckets, allTrainingRowsMask); + m_TreeImpl->m_NumberThreads, frame, dependentVariable, + m_TreeImpl->m_Rng, m_TreeImpl->m_NumberFolds, + m_TreeImpl->m_TrainFractionPerFold, numberBuckets, allTrainingRowsMask); } void CBoostedTreeFactory::selectFeaturesAndEncodeCategories(const core::CDataFrame& frame) const { @@ -1080,26 +1072,21 @@ CBoostedTreeFactory::testLossLineSearch(core::CDataFrame& frame, } std::sort(testLosses.begin(), testLosses.end()); - LOG_INFO(<< "test losses = " << core::CContainerPrinter::print(testLosses)); + LOG_TRACE(<< "test losses = " << core::CContainerPrinter::print(testLosses)); CLowess<2> lowess; lowess.fit(std::move(testLosses), testLosses.size()); - double bestParameter, bestParameterTestLoss; + double bestParameter; + double bestParameterTestLoss; std::tie(bestParameter, bestParameterTestLoss) = lowess.minimum(); - LOG_INFO(<< "best parameter = " << bestParameter << ", test loss = " << bestParameterTestLoss); + LOG_TRACE(<< "best parameter = " << bestParameter << ", test loss = " << bestParameterTestLoss); double width{(intervalRightEnd - intervalLeftEnd) / static_cast(MAX_LINE_SEARCH_ITERATIONS)}; intervalLeftEnd = bestParameter - width; intervalRightEnd = bestParameter + width; - LOG_INFO(<< "interval = [" << intervalLeftEnd << "," << intervalRightEnd << "]"); - //double residualVariance{lowess.residualVariance()}; - //std::tie(intervalLeftEnd, intervalRightEnd) = - // lowess.sublevelSet(bestParameter, bestParameterTestLoss, - // bestParameterTestLoss + std::sqrt(residualVariance)); - //LOG_INFO(<< "residual variance = " << residualVariance << " interval = [" - // << intervalLeftEnd << "," << 
intervalRightEnd << "]"); + LOG_TRACE(<< "interval = [" << intervalLeftEnd << "," << intervalRightEnd << "]"); return TVector{{intervalLeftEnd, bestParameter, intervalRightEnd}}; } @@ -1147,7 +1134,6 @@ CBoostedTreeFactory::classAssignmentObjective(CBoostedTree::EClassAssignmentObje CBoostedTreeFactory& CBoostedTreeFactory::classificationWeights(TStrDoublePrVec weights) { m_TreeImpl->m_ClassificationWeightsOverride = std::move(weights); - m_TreeImpl->m_ClassificationWeights = *m_TreeImpl->m_ClassificationWeightsOverride; return *this; } @@ -1166,7 +1152,17 @@ CBoostedTreeFactory& CBoostedTreeFactory::numberFolds(std::size_t numberFolds) { numberFolds = 2; } m_TreeImpl->m_NumberFoldsOverride = numberFolds; - m_TreeImpl->m_FractionalFolds = static_cast(numberFolds); + m_TreeImpl->m_NumberFolds = numberFolds; + return *this; +} + +CBoostedTreeFactory& CBoostedTreeFactory::trainFractionPerFold(double fraction) { + if (fraction <= 0.0 || fraction >= 1.0) { + LOG_WARN(<< "Training data fraction " << fraction << " per fold out of range"); + } else { + m_TreeImpl->m_TrainFractionPerFoldOverride = fraction; + m_TreeImpl->m_TrainFractionPerFold = fraction; + } return *this; } diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 07d9b27ec5..93d2560434 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -334,7 +334,7 @@ std::size_t CBoostedTreeImpl::estimateMemoryUsage(std::size_t numberRows, m_MaximumNumberTrees * (sizeof(TNodeVec) + maximumNumberNodes * CBoostedTreeNode::estimateMemoryUsage( m_Loss->numberParameters()))}; - std::size_t foldRoundLossMemoryUsage{this->numberFolds() * m_NumberRounds * + std::size_t foldRoundLossMemoryUsage{m_NumberFolds * m_NumberRounds * sizeof(TOptionalDouble)}; std::size_t hyperparametersMemoryUsage{numberColumns * sizeof(double)}; std::size_t tunableHyperparametersMemoryUsage{ @@ -367,7 +367,9 @@ std::size_t CBoostedTreeImpl::estimateMemoryUsage(std::size_t numberRows, // we get a constant 8 / 64. std::size_t missingFeatureMaskMemoryUsage{8 * numberColumns * numberRows / 64}; std::size_t trainTestMaskMemoryUsage{ - 2 * static_cast(std::ceil(std::log2(m_FractionalFolds))) * numberRows}; + 2 * m_NumberFolds * + static_cast(std::ceil( + std::min(m_TrainFractionPerFold, 1.0 - m_TrainFractionPerFold) * numberRows))}; std::size_t bayesianOptimisationMemoryUsage{CBayesianOptimisation::estimateMemoryUsage( this->numberHyperparametersToTune(), m_NumberRounds)}; std::size_t worstCaseMemoryUsage{ @@ -435,7 +437,7 @@ CBoostedTreeImpl::gainAndCurvatureAtPercentile(double percentile, } void CBoostedTreeImpl::initializePerFoldTestLosses() { - m_FoldRoundTestLosses.resize(this->numberFolds()); + m_FoldRoundTestLosses.resize(m_NumberFolds); for (auto& losses : m_FoldRoundTestLosses) { losses.resize(m_NumberRounds); } @@ -522,7 +524,7 @@ CBoostedTreeImpl::TMeanVarAccumulatorSizeDoubleTuple CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame) { // We want to ensure we evaluate on equal proportions for each fold. - TSizeVec folds(this->numberFolds()); + TSizeVec folds(m_NumberFolds); std::iota(folds.begin(), folds.end(), 0); CSampling::random_shuffle(m_Rng, folds.begin(), folds.end()); @@ -532,8 +534,8 @@ CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame) { // that the test error is not close to the minimum test error. We use // the estimated test error for each remaining fold at two standard // deviations below the mean for this. 
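A minimal sketch of the "two standard deviations below the mean" adjustment the comment describes, assuming the TMeanVarAccumulator alias and CBasicStatistics helpers used throughout this patch (the stopping decision itself is made in stopCrossValidationEarly, which this series does not change):

// Sketch: optimistic estimate of a missing fold's test loss, added to the
// loss moments before deciding whether to stop cross-validation early.
// Assumes <cmath> and maths/CBasicStatistics.h are available.
double optimisticTestLossEstimate(const TMeanVarAccumulator& estimatedTestLoss) {
    return CBasicStatistics::mean(estimatedTestLoss) -
           2.0 * std::sqrt(CBasicStatistics::variance(estimatedTestLoss));
}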
- if (m_StopCrossValidationEarly && m_CurrentRound >= this->numberFolds() && - folds.size() < this->numberFolds()) { + if (m_StopCrossValidationEarly && m_CurrentRound >= m_NumberFolds && + folds.size() < m_NumberFolds) { for (const auto& testLoss : this->estimateMissingTestLosses(folds)) { testLossMoments.add( CBasicStatistics::mean(testLoss) - @@ -546,7 +548,7 @@ CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame) { TMeanVarAccumulator lossMoments; TDoubleVec numberTrees; - numberTrees.reserve(this->numberFolds()); + numberTrees.reserve(m_NumberFolds); TMeanAccumulator meanForestSizeAccumulator; while (folds.size() > 0 && stopCrossValidationEarly(lossMoments) == false) { @@ -972,13 +974,12 @@ double CBoostedTreeImpl::minimumTestLoss() const { TMinAccumulator minimumTestLoss; for (std::size_t round = 0; round < m_CurrentRound - 1; ++round) { TMeanVarAccumulator roundLossMoments; - for (std::size_t fold = 0; fold < this->numberFolds(); ++fold) { + for (std::size_t fold = 0; fold < m_NumberFolds; ++fold) { if (m_FoldRoundTestLosses[fold][round] != boost::none) { roundLossMoments.add(*m_FoldRoundTestLosses[fold][round]); } } - if (static_cast(CBasicStatistics::count(roundLossMoments)) == - this->numberFolds()) { + if (static_cast(CBasicStatistics::count(roundLossMoments)) == m_NumberFolds) { minimumTestLoss.add(CBasicStatistics::mean(roundLossMoments)); } } @@ -1027,7 +1028,7 @@ CBoostedTreeImpl::estimateMissingTestLosses(const TSizeVec& missing) const { // where the indices range over the folds for which we have errors in the // current round. - TSizeVec present(this->numberFolds()); + TSizeVec present(m_NumberFolds); std::iota(present.begin(), present.end(), 0); TSizeVec ordered{missing}; std::sort(ordered.begin(), ordered.end()); @@ -1478,10 +1479,6 @@ void CBoostedTreeImpl::scaleRegularizers(double scale) { } } -std::size_t CBoostedTreeImpl::numberFolds() const { - return static_cast(std::ceil(m_FractionalFolds)); -} - std::size_t CBoostedTreeImpl::numberHyperparametersToTune() const { return m_RegularizationOverride.countNotSet() + (m_DownsampleFactorOverride != boost::none ? 0 : 1) + @@ -1504,7 +1501,8 @@ void CBoostedTreeImpl::recordHyperparameters() { m_Instrumentation->hyperparameters().s_Eta = m_Eta; m_Instrumentation->hyperparameters().s_ClassAssignmentObjective = m_ClassAssignmentObjective; m_Instrumentation->hyperparameters().s_DownsampleFactor = m_DownsampleFactor; - m_Instrumentation->hyperparameters().s_NumFolds = m_FractionalFolds; + m_Instrumentation->hyperparameters().s_NumFolds = m_NumberFolds; + m_Instrumentation->hyperparameters().s_TrainFractionPerFold = m_TrainFractionPerFold; m_Instrumentation->hyperparameters().s_MaxTrees = m_MaximumNumberTrees; m_Instrumentation->hyperparameters().s_FeatureBagFraction = m_FeatureBagFraction; m_Instrumentation->hyperparameters().s_EtaGrowthRatePerTree = m_EtaGrowthRatePerTree; @@ -1587,13 +1585,13 @@ void CBoostedTreeImpl::startProgressMonitoringFineTuneHyperparameters() { m_Instrumentation->startNewProgressMonitoredTask(CBoostedTreeFactory::FINE_TUNING_PARAMETERS); - std::size_t totalNumberSteps{m_NumberRounds * m_MaximumNumberTrees * this->numberFolds()}; + std::size_t totalNumberSteps{m_NumberRounds * m_MaximumNumberTrees * m_NumberFolds}; LOG_TRACE(<< "main loop total number steps = " << totalNumberSteps); m_TrainingProgress = core::CLoopProgress{ totalNumberSteps, m_Instrumentation->progressCallback(), 1.0, 1024}; // Make sure progress starts where it left off. 
- m_TrainingProgress.increment(m_CurrentRound * m_MaximumNumberTrees * this->numberFolds()); + m_TrainingProgress.increment(m_CurrentRound * m_MaximumNumberTrees * m_NumberFolds); } void CBoostedTreeImpl::startProgressMonitoringFinalTrain() { @@ -1633,7 +1631,6 @@ const std::string FEATURE_BAG_FRACTION_TAG{"feature_bag_fraction"}; const std::string FEATURE_DATA_TYPES_TAG{"feature_data_types"}; const std::string FEATURE_SAMPLE_PROBABILITIES_TAG{"feature_sample_probabilities"}; const std::string FOLD_ROUND_TEST_LOSSES_TAG{"fold_round_test_losses"}; -const std::string FRACTIONAL_FOLDS_TAG{"number_folds"}; const std::string INITIALIZATION_STAGE_TAG{"initialization_progress"}; const std::string LOSS_TAG{"loss"}; const std::string LOSS_NAME_TAG{"loss_name"}; @@ -1646,6 +1643,7 @@ const std::string MEAN_FOREST_SIZE_ACCUMULATOR_TAG{"mean_forest_size"}; const std::string MEAN_LOSS_ACCUMULATOR_TAG{"mean_loss"}; const std::string MISSING_FEATURE_ROW_MASKS_TAG{"missing_feature_row_masks"}; const std::string NUMBER_FOLDS_OVERRIDE_TAG{"number_folds_override"}; +const std::string NUMBER_FOLDS_TAG{"number_folds"}; const std::string NUMBER_ROUNDS_TAG{"number_rounds"}; const std::string NUMBER_SPLITS_PER_FEATURE_TAG{"number_splits_per_feature"}; const std::string NUMBER_THREADS_TAG{"number_threads"}; @@ -1656,6 +1654,8 @@ const std::string ROWS_PER_FEATURE_TAG{"rows_per_feature"}; const std::string STOP_CROSS_VALIDATION_EARLY_TAG{"stop_cross_validation_eraly"}; const std::string TESTING_ROW_MASKS_TAG{"testing_row_masks"}; const std::string TRAINING_ROW_MASKS_TAG{"training_row_masks"}; +const std::string TRAIN_FRACTION_PER_FOLD_TAG{"train_fraction_per_folds"}; +const std::string TRAIN_FRACTION_PER_FOLD_OVERRIDE_TAG{"train_fraction_per_folds_override"}; const std::string NUMBER_TOP_SHAP_VALUES_TAG{"top_shap_values"}; const std::string STOP_HYPERPARAMETER_OPTIMIZATION_EARLY_TAG{"stop_hyperparameter_optimization_early"}; } @@ -1708,7 +1708,6 @@ void CBoostedTreeImpl::acceptPersistInserter(core::CStatePersistInserter& insert core::CPersistUtils::persist(FEATURE_SAMPLE_PROBABILITIES_TAG, m_FeatureSampleProbabilities, inserter); core::CPersistUtils::persist(FOLD_ROUND_TEST_LOSSES_TAG, m_FoldRoundTestLosses, inserter); - core::CPersistUtils::persist(FRACTIONAL_FOLDS_TAG, m_FractionalFolds, inserter); core::CPersistUtils::persist(INITIALIZATION_STAGE_TAG, static_cast(m_InitializationStage), inserter); if (m_Loss != nullptr) { @@ -1729,6 +1728,7 @@ void CBoostedTreeImpl::acceptPersistInserter(core::CStatePersistInserter& insert core::CPersistUtils::persist(MISSING_FEATURE_ROW_MASKS_TAG, m_MissingFeatureRowMasks, inserter); core::CPersistUtils::persist(NUMBER_FOLDS_OVERRIDE_TAG, m_NumberFoldsOverride, inserter); + core::CPersistUtils::persist(NUMBER_FOLDS_TAG, m_NumberFolds, inserter); core::CPersistUtils::persist(NUMBER_ROUNDS_TAG, m_NumberRounds, inserter); core::CPersistUtils::persist(NUMBER_SPLITS_PER_FEATURE_TAG, m_NumberSplitsPerFeature, inserter); @@ -1743,6 +1743,9 @@ void CBoostedTreeImpl::acceptPersistInserter(core::CStatePersistInserter& insert m_StopCrossValidationEarly, inserter); core::CPersistUtils::persist(TESTING_ROW_MASKS_TAG, m_TestingRowMasks, inserter); core::CPersistUtils::persist(TRAINING_ROW_MASKS_TAG, m_TrainingRowMasks, inserter); + core::CPersistUtils::persist(TRAIN_FRACTION_PER_FOLD_OVERRIDE_TAG, + m_TrainFractionPerFoldOverride, inserter); + core::CPersistUtils::persist(TRAIN_FRACTION_PER_FOLD_TAG, m_TrainFractionPerFold, inserter); 
core::CPersistUtils::persist(STOP_HYPERPARAMETER_OPTIMIZATION_EARLY_TAG, m_StopHyperparameterOptimizationEarly, inserter); // m_TunableHyperparameters is not persisted explicitly, it is restored from overriden hyperparameters @@ -1824,8 +1827,6 @@ bool CBoostedTreeImpl::acceptRestoreTraverser(core::CStateRestoreTraverser& trav RESTORE(FOLD_ROUND_TEST_LOSSES_TAG, core::CPersistUtils::restore(FOLD_ROUND_TEST_LOSSES_TAG, m_FoldRoundTestLosses, traverser)) - RESTORE(FRACTIONAL_FOLDS_TAG, - core::CPersistUtils::restore(FRACTIONAL_FOLDS_TAG, m_FractionalFolds, traverser)) RESTORE(INITIALIZATION_STAGE_TAG, core::CPersistUtils::restore(INITIALIZATION_STAGE_TAG, initializationStage, traverser)) @@ -1855,6 +1856,8 @@ bool CBoostedTreeImpl::acceptRestoreTraverser(core::CStateRestoreTraverser& trav RESTORE(NUMBER_FOLDS_OVERRIDE_TAG, core::CPersistUtils::restore(NUMBER_FOLDS_OVERRIDE_TAG, m_NumberFoldsOverride, traverser)) + RESTORE(NUMBER_FOLDS_TAG, + core::CPersistUtils::restore(NUMBER_FOLDS_TAG, m_NumberFolds, traverser)) RESTORE(NUMBER_ROUNDS_TAG, core::CPersistUtils::restore(NUMBER_ROUNDS_TAG, m_NumberRounds, traverser)) RESTORE(NUMBER_SPLITS_PER_FEATURE_TAG, @@ -1880,6 +1883,12 @@ bool CBoostedTreeImpl::acceptRestoreTraverser(core::CStateRestoreTraverser& trav core::CPersistUtils::restore(TESTING_ROW_MASKS_TAG, m_TestingRowMasks, traverser)) RESTORE(TRAINING_ROW_MASKS_TAG, core::CPersistUtils::restore(TRAINING_ROW_MASKS_TAG, m_TrainingRowMasks, traverser)) + RESTORE(TRAIN_FRACTION_PER_FOLD_OVERRIDE_TAG, + core::CPersistUtils::restore(TRAIN_FRACTION_PER_FOLD_OVERRIDE_TAG, + m_TrainFractionPerFoldOverride, traverser)) + RESTORE(TRAIN_FRACTION_PER_FOLD_TAG, + core::CPersistUtils::restore(TRAIN_FRACTION_PER_FOLD_TAG, + m_TrainFractionPerFold, traverser)) RESTORE(STOP_HYPERPARAMETER_OPTIMIZATION_EARLY_TAG, core::CPersistUtils::restore(STOP_HYPERPARAMETER_OPTIMIZATION_EARLY_TAG, m_StopHyperparameterOptimizationEarly, traverser)) @@ -1913,7 +1922,7 @@ void CBoostedTreeImpl::checkRestoredInvariants() const { VIOLATES_INVARIANT(m_TunableHyperparameters.size(), ==, samples.size()); } if (m_FoldRoundTestLosses.size() > 0) { - VIOLATES_INVARIANT(m_FoldRoundTestLosses.size(), ==, this->numberFolds()); + VIOLATES_INVARIANT(m_FoldRoundTestLosses.size(), ==, m_NumberFolds); for (const auto& losses : m_FoldRoundTestLosses) { VIOLATES_INVARIANT(losses.size(), ==, m_NumberRounds); } @@ -2023,7 +2032,7 @@ CBoostedTreeImpl::hyperparameterImportance() const { double hyperparameterValue; SHyperparameterImportance::EType hyperparameterType{ boosted_tree_detail::SHyperparameterImportance::E_Double}; - switch (i) { + switch (static_cast(i)) { case E_Alpha: hyperparameterValue = m_Regularization.depthPenaltyMultiplier(); break; diff --git a/lib/maths/CDataFrameUtils.cc b/lib/maths/CDataFrameUtils.cc index fed947f074..4b5cc3cd94 100644 --- a/lib/maths/CDataFrameUtils.cc +++ b/lib/maths/CDataFrameUtils.cc @@ -494,23 +494,24 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads, const core::CDataFrame& frame, std::size_t targetColumn, CPRNG::CXorOShiro128Plus rng, - double numberFolds, + std::size_t numberFolds, + double trainFractionPerFold, std::size_t numberBuckets, const core::CPackedBitVector& allTrainingRowsMask) { TDoubleVec frequencies; TStratifiedSamplerPtr sampler; - double numberRows{allTrainingRowsMask.manhattan()}; - if (numberRows < std::max(numberFolds, 2.0)) { + double numberTrainingRows{allTrainingRowsMask.manhattan()}; + if (static_cast(numberTrainingRows) < numberFolds) { 
HANDLE_FATAL(<< "Input error: unsufficient training data provided."); return {{}, {}, {}}; } - // We sample the smaller of the test/train sets in the loop. - std::size_t numberTrainingRows{static_cast( - 1.0 - (numberFolds - 1.0) / numberFolds * numberRows + 0.5)}; - std::size_t numberTestingRows{static_cast(numberRows) - numberTrainingRows}; - std::size_t sampleSize{std::min(numberTrainingRows, numberTestingRows)}; + // We sample the smaller of the test or train set in the loop. + std::size_t sampleSize{static_cast( + std::min(trainFractionPerFold, 1.0 - trainFractionPerFold) * numberTrainingRows + 0.5)}; + double minimumSizeToSample{static_cast(sampleSize + numberFolds)}; + LOG_TRACE(<< "sample size = " << sampleSize); if (frame.columnIsCategorical()[targetColumn]) { std::tie(sampler, frequencies) = classifierStratifiedCrossValidationRowSampler( @@ -523,13 +524,14 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads, LOG_TRACE(<< "number training rows = " << allTrainingRowsMask.manhattan()); - TPackedBitVectorVec testingRowMasks(static_cast(std::ceil(numberFolds))); + TPackedBitVectorVec testingRowMasks(numberFolds); TSizeVec rowIndices; core::CPackedBitVector candidateTestingRowsMask{allTrainingRowsMask}; for (std::size_t fold = 0; fold < testingRowMasks.size(); ++fold) { - if (candidateTestingRowsMask.manhattan() < - static_cast(sampleSize - numberFolds)) { + if (candidateTestingRowsMask.manhattan() < minimumSizeToSample) { + testingRowMasks[fold] = candidateTestingRowsMask; + } else { frame.readRows(1, 0, frame.numberRows(), [&](const TRowItr& beginRows, const TRowItr& endRows) { for (auto row = beginRows; row != endRows; ++row) { @@ -547,8 +549,6 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads, } testingRowMasks[fold].extend(false, allTrainingRowsMask.size() - testingRowMasks[fold].size()); - } else { - testingRowMasks[fold] = candidateTestingRowsMask; } // We exclusive or here to remove the rows we've selected for the current @@ -558,7 +558,7 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads, TPackedBitVectorVec trainingRowMasks{complementRowMasks(testingRowMasks, allTrainingRowsMask)}; - if (numberTrainingRows < numberTestingRows) { + if (trainFractionPerFold < 0.5) { std::swap(trainingRowMasks, testingRowMasks); } diff --git a/lib/maths/unittest/CBoostedTreeTest.cc b/lib/maths/unittest/CBoostedTreeTest.cc index a05128368f..77bdbc19e8 100644 --- a/lib/maths/unittest/CBoostedTreeTest.cc +++ b/lib/maths/unittest/CBoostedTreeTest.cc @@ -692,6 +692,70 @@ BOOST_AUTO_TEST_CASE(testMsle) { // TODO #1744 test quality of MSLE on data with log-normal errors. } +BOOST_AUTO_TEST_CASE(testLowTrainFractionPerFold) { + + // Test regression using a very low train fraction per fold. This should + // run in seconds, but we don't assert on the runtime because we don't + // run CI on bare metal, and produce a good quality solution because the + // final train is still on the full training set. 
+ + test::CRandomNumbers rng; + double noiseVariance{100.0}; + std::size_t trainRows{10000}; + std::size_t testRows{200}; + std::size_t rows{trainRows + testRows}; + std::size_t cols{6}; + + auto target = [&] { + TDoubleVec m; + TDoubleVec s; + rng.generateUniformSamples(0.0, 10.0, cols - 1, m); + rng.generateUniformSamples(-10.0, 10.0, cols - 1, s); + return [=](const TRowRef& row) { + double result{0.0}; + for (std::size_t i = 0; i < cols - 1; ++i) { + result += m[i] + s[i] * row[i]; + } + return result; + }; + }(); + + auto frame = core::makeMainStorageDataFrame(cols, rows).first; + + TDoubleVecVec x(cols - 1); + for (std::size_t i = 0; i < cols - 1; ++i) { + rng.generateUniformSamples(0.0, 10.0, rows, x[i]); + } + + TDoubleVec noise; + rng.generateNormalSamples(0.0, noiseVariance, rows, noise); + + fillDataFrame(trainRows, testRows, cols, x, noise, target, *frame); + + auto regression = maths::CBoostedTreeFactory::constructFromParameters( + 1, std::make_unique()) + .trainFractionPerFold(0.05) + .buildFor(*frame, cols - 1); + + core::CStopWatch timer{true}; + regression->train(); + regression->predict(); + LOG_DEBUG(<< "train duration " << timer.stop() << "ms"); + + double bias; + double rSquared; + std::tie(bias, rSquared) = computeEvaluationMetrics( + *frame, trainRows, rows, + [&](const TRowRef& row_) { return regression->readPrediction(row_)[0]; }, + target, noiseVariance / static_cast(rows)); + + // Unbiased... + BOOST_REQUIRE_CLOSE_ABSOLUTE( + 0.0, bias, 4.0 * std::sqrt(noiseVariance / static_cast(trainRows))); + // Good R^2... + BOOST_TEST_REQUIRE(rSquared > 0.98); +} + BOOST_AUTO_TEST_CASE(testThreading) { // Test we get the same results whether we run with multiple threads or not. diff --git a/lib/maths/unittest/CDataFrameUtilsTest.cc b/lib/maths/unittest/CDataFrameUtilsTest.cc index 6dc7c8b09d..07394afae1 100644 --- a/lib/maths/unittest/CDataFrameUtilsTest.cc +++ b/lib/maths/unittest/CDataFrameUtilsTest.cc @@ -507,7 +507,9 @@ BOOST_AUTO_TEST_CASE(testStratifiedCrossValidationRowMasks) { maths::CDataFrameUtils::TPackedBitVectorVec testingRowMasks; std::tie(trainingRowMasks, testingRowMasks, std::ignore) = maths::CDataFrameUtils::stratifiedCrossValidationRowMasks( - 1, *frame, 0, rng, numberFolds[0], numberBins, allTrainingRowsMask); + 1, *frame, 0, rng, numberFolds[0], + 1.0 - 1.0 / static_cast(numberFolds[0]), numberBins, + allTrainingRowsMask); BOOST_REQUIRE_EQUAL(numberFolds[0], trainingRowMasks.size()); BOOST_REQUIRE_EQUAL(numberFolds[0], testingRowMasks.size()); @@ -564,7 +566,9 @@ BOOST_AUTO_TEST_CASE(testStratifiedCrossValidationRowMasks) { maths::CDataFrameUtils::TPackedBitVectorVec testingRowMasks; std::tie(std::ignore, testingRowMasks, std::ignore) = maths::CDataFrameUtils::stratifiedCrossValidationRowMasks( - 1, *frame, 0, rng, numberFolds[0], numberBins, allTrainingRowsMask); + 1, *frame, 0, rng, numberFolds[0], + 1.0 - 1.0 / static_cast(numberFolds[0]), numberBins, + allTrainingRowsMask); TDoubleVecVec targetDecile(numberFolds[0], TDoubleVec(numberBins)); From 04248eeb94700f76044f99bcb25124700f31d340 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Fri, 2 Jul 2021 10:37:36 +0100 Subject: [PATCH 05/35] Adjust the validation loss variance estimate to remove affects of sampling bias --- include/maths/CBayesianOptimisation.h | 9 ++++++-- include/maths/CBoostedTreeFactory.h | 2 +- include/maths/CBoostedTreeImpl.h | 3 +++ lib/maths/CBayesianOptimisation.cc | 14 +++++++++---- lib/maths/CBoostedTreeImpl.cc | 29 ++++++++++++++++++++++++-- 
lib/maths/unittest/CBoostedTreeTest.cc | 5 ++--- 6 files changed, 50 insertions(+), 12 deletions(-) diff --git a/include/maths/CBayesianOptimisation.h b/include/maths/CBayesianOptimisation.h index 36f9bc891a..bbec6489f9 100644 --- a/include/maths/CBayesianOptimisation.h +++ b/include/maths/CBayesianOptimisation.h @@ -72,6 +72,10 @@ class MATHS_EXPORT CBayesianOptimisation { //! variance in the error in \p fx w.r.t. the true value is \p vx. void add(TVector x, double fx, double vx); + //! Any portion of the variance of the function error which is explained and + //! shouldn't be included in the kernel. + void explainedErrorVariance(double vx); + //! Get the bounding box (in the function domain) in which we're minimizing. std::pair boundingBox() const; @@ -170,8 +174,9 @@ class MATHS_EXPORT CBayesianOptimisation { private: CPRNG::CXorOShiro128Plus m_Rng; std::size_t m_Restarts; - double m_RangeShift = 0.0; - double m_RangeScale = 1.0; + double m_RangeShift{0.0}; + double m_RangeScale{1.0}; + double m_ExplainedErrorVariance{0.0}; TVector m_MinBoundary; TVector m_MaxBoundary; TVectorDoublePrVec m_FunctionMeanValues; diff --git a/include/maths/CBoostedTreeFactory.h b/include/maths/CBoostedTreeFactory.h index ca06cdf0f2..4c18a61354 100644 --- a/include/maths/CBoostedTreeFactory.h +++ b/include/maths/CBoostedTreeFactory.h @@ -279,7 +279,7 @@ class MATHS_EXPORT CBoostedTreeFactory final { TOptionalSize m_BayesianOptimisationRestarts; bool m_StratifyRegressionCrossValidation = true; double m_InitialDownsampleRowsPerFeature = 200.0; - std::size_t m_MaximumNumberOfTrainRows = 750000; + std::size_t m_MaximumNumberOfTrainRows = 500000; double m_GainPerNode1stPercentile = 0.0; double m_GainPerNode50thPercentile = 0.0; double m_GainPerNode90thPercentile = 0.0; diff --git a/include/maths/CBoostedTreeImpl.h b/include/maths/CBoostedTreeImpl.h index a2d948aa33..e102529607 100644 --- a/include/maths/CBoostedTreeImpl.h +++ b/include/maths/CBoostedTreeImpl.h @@ -290,6 +290,9 @@ class MATHS_EXPORT CBoostedTreeImpl final { //! Compute the mean of the loss function on the masked rows of \p frame. double meanLoss(const core::CDataFrame& frame, const core::CPackedBitVector& rowMask) const; + //! Compute the overall variance of the error we see between folds. + double betweenFoldTestLossVariance() const; + //! Get the root node of \p tree. 
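A condensed sketch of how these two additions fit together, assuming the TMeanAccumulator/TMeanVarAccumulator aliases used throughout this patch and ignoring the boost::optional wrapping of per-round losses: each fold's mean test loss is averaged over the rounds so far, the variance of those per-fold means is reported as explained error variance, and the Bayesian optimisation subtracts it (never more than 99% of the total) from the error variance its Gaussian Process kernel sees.

// Sketch only; the shipped implementations are in the .cc diffs below.
double betweenFoldTestLossVarianceSketch(const std::vector<std::vector<double>>& foldRoundTestLosses) {
    TMeanVarAccumulator acrossFolds;
    for (const auto& losses : foldRoundTestLosses) {
        TMeanAccumulator meanForFold;
        meanForFold.add(losses); // mean test loss of one fold over the rounds so far
        acrossFolds.add(CBasicStatistics::mean(meanForFold));
    }
    return CBasicStatistics::maximumLikelihoodVariance(acrossFolds);
}

double effectiveErrorVariance(double meanErrorVariance, double explainedErrorVariance) {
    // Remove the explained part but keep at least 1% of the total.
    return meanErrorVariance - std::min(explainedErrorVariance, 0.99 * meanErrorVariance);
}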
static const CBoostedTreeNode& root(const TNodeVec& tree); diff --git a/lib/maths/CBayesianOptimisation.cc b/lib/maths/CBayesianOptimisation.cc index 2d6705be10..457249c03d 100644 --- a/lib/maths/CBayesianOptimisation.cc +++ b/lib/maths/CBayesianOptimisation.cc @@ -30,8 +30,9 @@ namespace ml { namespace maths { namespace { -const std::string VERSION_7_5_TAG{"7.5"}; +using TMeanAccumulator = CBasicStatistics::SSampleMean::TAccumulator; +const std::string VERSION_7_5_TAG{"7.5"}; const std::string MIN_BOUNDARY_TAG{"min_boundary"}; const std::string MAX_BOUNDARY_TAG{"max_boundary"}; const std::string ERROR_VARIANCES_TAG{"error_variances"}; @@ -106,6 +107,10 @@ void CBayesianOptimisation::add(TVector x, double fx, double vx) { m_ErrorVariances.push_back(CTools::pow2(m_RangeScale) * vx); } +void CBayesianOptimisation::explainedErrorVariance(double vx) { + m_ExplainedErrorVariance = CTools::pow2(m_RangeScale) * vx; +} + std::pair CBayesianOptimisation::boundingBox() const { return {m_MinBoundary, m_MaxBoundary}; @@ -114,7 +119,6 @@ CBayesianOptimisation::boundingBox() const { std::pair CBayesianOptimisation::maximumExpectedImprovement() { - using TMeanAccumulator = CBasicStatistics::SSampleMean::TAccumulator; using TMinAccumulator = CBasicStatistics::COrderStatisticsHeap>; @@ -583,6 +587,7 @@ void CBayesianOptimisation::precondition() { for (auto& variance : m_ErrorVariances) { variance /= CTools::pow2(m_RangeScale); } + m_ExplainedErrorVariance /= CTools::pow2(m_RangeScale); TMeanVarAccumulator rangeMoments; for (const auto& value : m_FunctionMeanValues) { @@ -599,6 +604,7 @@ void CBayesianOptimisation::precondition() { for (auto& variance : m_ErrorVariances) { variance *= CTools::pow2(m_RangeScale); } + m_ExplainedErrorVariance *= CTools::pow2(m_RangeScale); } CBayesianOptimisation::TVector CBayesianOptimisation::function() const { @@ -610,10 +616,10 @@ CBayesianOptimisation::TVector CBayesianOptimisation::function() const { } double CBayesianOptimisation::meanErrorVariance() const { - using TMeanAccumulator = CBasicStatistics::SSampleMean::TAccumulator; TMeanAccumulator variance; variance.add(m_ErrorVariances); - return CBasicStatistics::mean(variance); + return CBasicStatistics::mean(variance) - + std::min(m_ExplainedErrorVariance, 0.99 * CBasicStatistics::mean(variance)); } CBayesianOptimisation::TMatrix CBayesianOptimisation::dKerneld(const TVector& a, int k) const { diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 93d2560434..0340b4f47f 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -1278,6 +1279,20 @@ double CBoostedTreeImpl::meanLoss(const core::CDataFrame& frame, return CBasicStatistics::mean(loss); } +double CBoostedTreeImpl::betweenFoldTestLossVariance() const { + TMeanVarAccumulator result; + for (const auto& testLosses : m_FoldRoundTestLosses) { + TMeanAccumulator meanTestLoss; + for (std::size_t i = 0; i <= m_CurrentRound; ++i) { + if (testLosses[i] != boost::none) { + meanTestLoss.add(*testLosses[i]); + } + } + result.add(CBasicStatistics::mean(meanTestLoss)); + } + return CBasicStatistics::maximumLikelihoodVariance(result); +} + CBoostedTreeNode& CBoostedTreeImpl::root(TNodeVec& tree) { return tree[0]; } @@ -1358,13 +1373,23 @@ bool CBoostedTreeImpl::selectNextHyperparameters(const TMeanVarAccumulator& loss double meanLoss{CBasicStatistics::mean(lossMoments)}; double lossVariance{CBasicStatistics::variance(lossMoments)}; - LOG_TRACE(<< "round 
= " << m_CurrentRound << " loss = " << meanLoss << " variance = " - << lossVariance << ": regularization = " << m_Regularization.print() + LOG_TRACE(<< "round = " << m_CurrentRound << ", loss = " << meanLoss + << ", total variance = " << lossVariance + << ", explained variance = " << this->betweenFoldTestLossVariance()); + LOG_TRACE(<< "regularization = " << m_Regularization.print() << ", downsample factor = " << m_DownsampleFactor << ", eta = " << m_Eta << ", eta growth rate per tree = " << m_EtaGrowthRatePerTree << ", feature bag fraction = " << m_FeatureBagFraction); bopt.add(parameters, meanLoss, lossVariance); + // One fold might have examples which are harder to predict on average than + // another fold, particularly if the sample size is small. What we really care + // about is the variation between fold loss values after accounting for any + // systematic effect due to sampling. Running for multiple rounds allows us + // to estimate this effect and we remove it when characterising the uncertainty + // in the loss values in the Gaussian Process. + bopt.explainedErrorVariance(this->betweenFoldTestLossVariance()); + if (m_CurrentRound < m_HyperparameterSamples.size()) { std::copy(m_HyperparameterSamples[m_CurrentRound].begin(), m_HyperparameterSamples[m_CurrentRound].end(), parameters.data()); diff --git a/lib/maths/unittest/CBoostedTreeTest.cc b/lib/maths/unittest/CBoostedTreeTest.cc index 77bdbc19e8..077f16329a 100644 --- a/lib/maths/unittest/CBoostedTreeTest.cc +++ b/lib/maths/unittest/CBoostedTreeTest.cc @@ -704,7 +704,7 @@ BOOST_AUTO_TEST_CASE(testLowTrainFractionPerFold) { std::size_t trainRows{10000}; std::size_t testRows{200}; std::size_t rows{trainRows + testRows}; - std::size_t cols{6}; + std::size_t cols{8}; auto target = [&] { TDoubleVec m; @@ -720,8 +720,6 @@ BOOST_AUTO_TEST_CASE(testLowTrainFractionPerFold) { }; }(); - auto frame = core::makeMainStorageDataFrame(cols, rows).first; - TDoubleVecVec x(cols - 1); for (std::size_t i = 0; i < cols - 1; ++i) { rng.generateUniformSamples(0.0, 10.0, rows, x[i]); @@ -730,6 +728,7 @@ BOOST_AUTO_TEST_CASE(testLowTrainFractionPerFold) { TDoubleVec noise; rng.generateNormalSamples(0.0, noiseVariance, rows, noise); + auto frame = core::makeMainStorageDataFrame(cols, rows).first; fillDataFrame(trainRows, testRows, cols, x, noise, target, *frame); auto regression = maths::CBoostedTreeFactory::constructFromParameters( From f72dd4cdf189d5564b1c9f17cb6423c6c5dbd35e Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Fri, 2 Jul 2021 10:58:13 +0100 Subject: [PATCH 06/35] Formatting --- lib/maths/unittest/CLowessTest.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/maths/unittest/CLowessTest.cc b/lib/maths/unittest/CLowessTest.cc index cd3073949b..7c3a44634b 100644 --- a/lib/maths/unittest/CLowessTest.cc +++ b/lib/maths/unittest/CLowessTest.cc @@ -53,8 +53,7 @@ BOOST_AUTO_TEST_CASE(testInvariants) { [&](double x) { return scale[0] * x / 10.0; }, [&](double x) { return scale[0] * (x - offset[0]) * (x - offset[0]) / 100.0; - } - }; + }}; for (std::size_t i = 0; i < 100; ++i) { From fc0a3bca50c4c43b31fb47dba13e06dc290c53ff Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Fri, 2 Jul 2021 12:05:18 +0100 Subject: [PATCH 07/35] Docs --- docs/CHANGELOG.asciidoc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index 173a2ab48e..a414fc85c8 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -47,6 +47,13 @@ * Ensure bucket `event_count` is calculated for jobs 
with 1 second bucket spans. (See {ml-pull}1908[#1908].) +== {es} version 7.15.0 + +=== Enhancements + +* Speed up training of regression and classification models on very large data sets. + (See {ml-pull}1941[#1941].) + == {es} version 7.14.0 === Enhancements From 78f6e379f3ec0c7adbe7839650cbee02e6088893 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Fri, 2 Jul 2021 13:35:06 +0100 Subject: [PATCH 08/35] Avoid infinite loop --- lib/maths/CBoostedTreeImpl.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 0340b4f47f..7cc581cd0b 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -716,6 +716,9 @@ CBoostedTreeImpl::downsample(const core::CPackedBitVector& trainingRowMask) cons // curvatures for each tree we train. The sampling scheme should minimize // the correlation with previous trees for fixed sample size so randomly // sampling without replacement is appropriate. + if (trainingRowMask.manhattan() == 0.0) { + return trainingRowMask; + } core::CPackedBitVector result; do { result = core::CPackedBitVector{}; From caa7c8241485b5c93d8d025026da1f183202cdfd Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Fri, 2 Jul 2021 15:48:27 +0100 Subject: [PATCH 09/35] Correct handling of eta growth rate per tree --- lib/maths/CBoostedTreeFactory.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index 821991d496..b52706c6b2 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -460,6 +460,8 @@ void CBoostedTreeFactory::initializeHyperparametersSetup(core::CDataFrame& frame if (m_TreeImpl->m_EtaOverride == boost::none) { m_TreeImpl->m_Eta = computeEta(frame.numberColumns() - this->numberExtraColumnsForTrain()); + } + if (m_TreeImpl->m_EtaGrowthRatePerTreeOverride == boost::none) { m_TreeImpl->m_EtaGrowthRatePerTree = 1.0 + m_TreeImpl->m_Eta / 2.0; } From b46c76ed2abb4988b03a166c7b0ee9a5f6bf5e4a Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Fri, 2 Jul 2021 15:53:39 +0100 Subject: [PATCH 10/35] Correct edge case test --- lib/maths/unittest/CBoostedTreeTest.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/maths/unittest/CBoostedTreeTest.cc b/lib/maths/unittest/CBoostedTreeTest.cc index 077f16329a..2e62a00020 100644 --- a/lib/maths/unittest/CBoostedTreeTest.cc +++ b/lib/maths/unittest/CBoostedTreeTest.cc @@ -431,8 +431,8 @@ BOOST_AUTO_TEST_CASE(testEdgeCases) { auto frame = core::makeMainStorageDataFrame(cols).first; - fillDataFrame(2, 0, 2, {{1.0}, {1.0}}, {0.0, 0.0}, - [](const TRowRef&) { return 1.0; }, *frame); + fillDataFrame(5, 0, 2, {{1.0}, {1.0}, {1.0}, {1.0}, {1.0}}, + {0.0, 0.0, 0.0, 0.0, 0.0}, [](const TRowRef&) { return 1.0; }, *frame); try { auto regression = maths::CBoostedTreeFactory::constructFromParameters( From 7318193fa66ef6a87c54dc721003f7c1fb7876f5 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Fri, 2 Jul 2021 16:08:11 +0100 Subject: [PATCH 11/35] Test threshold --- lib/maths/unittest/CBoostedTreeTest.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/maths/unittest/CBoostedTreeTest.cc b/lib/maths/unittest/CBoostedTreeTest.cc index 2e62a00020..e1c13a0abd 100644 --- a/lib/maths/unittest/CBoostedTreeTest.cc +++ b/lib/maths/unittest/CBoostedTreeTest.cc @@ -1417,7 +1417,7 @@ BOOST_AUTO_TEST_CASE(testBinomialLogisticRegression) { LOG_DEBUG(<< "log relative error = " << maths::CBasicStatistics::mean(logRelativeError)); - 
BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(logRelativeError) < 0.70); + BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(logRelativeError) < 0.71); meanLogRelativeError.add(maths::CBasicStatistics::mean(logRelativeError)); } From e55ea4175f4b3991cec9a064697c2cb9328053e5 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Mon, 5 Jul 2021 10:51:09 +0100 Subject: [PATCH 12/35] Handle the case we can't sample train/test folds without replacement and unit test --- lib/maths/CDataFrameUtils.cc | 101 +++++++++++++--------- lib/maths/unittest/CDataFrameUtilsTest.cc | 37 +++++++- 2 files changed, 94 insertions(+), 44 deletions(-) diff --git a/lib/maths/CDataFrameUtils.cc b/lib/maths/CDataFrameUtils.cc index 4b5cc3cd94..10a43a9b53 100644 --- a/lib/maths/CDataFrameUtils.cc +++ b/lib/maths/CDataFrameUtils.cc @@ -122,10 +122,10 @@ class CStratifiedSampler { TRowSamplerVec m_Samplers; TSamplerSelector m_Selector; }; -using TStratifiedSamplerPtr = std::unique_ptr; +using TStratifiedSamplerUPtr = std::unique_ptr; //! Get a classifier stratified row sampler for cross fold validation. -std::pair +std::pair classifierStratifiedCrossValidationRowSampler(std::size_t numberThreads, const core::CDataFrame& frame, std::size_t targetColumn, @@ -153,7 +153,7 @@ classifierStratifiedCrossValidationRowSampler(std::size_t numberThreads, } //! Get a regression stratified row sampler for cross fold validation. -TStratifiedSamplerPtr +TStratifiedSamplerUPtr regressionStratifiedCrossValiationRowSampler(std::size_t numberThreads, const core::CDataFrame& frame, std::size_t targetColumn, @@ -498,8 +498,6 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads, double trainFractionPerFold, std::size_t numberBuckets, const core::CPackedBitVector& allTrainingRowsMask) { - TDoubleVec frequencies; - TStratifiedSamplerPtr sampler; double numberTrainingRows{allTrainingRowsMask.manhattan()}; if (static_cast(numberTrainingRows) < numberFolds) { @@ -507,53 +505,72 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads, return {{}, {}, {}}; } + double sampleFraction{std::min(trainFractionPerFold, 1.0 - trainFractionPerFold)}; + double excessSampleFraction{ + std::max(sampleFraction - 1.0 / static_cast(numberFolds), 0.0)}; + // We sample the smaller of the test or train set in the loop. 
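+    // For example, with numberFolds = 3 and trainFractionPerFold = 0.4 (one of
+    // the cases covered by the unit test) sampleFraction = min(0.4, 0.6) = 0.4
+    // and excessSampleFraction = 0.4 - 1/3 ~= 0.067, so the mask sampled for
+    // each fold below comprises roughly 0.33 * numberTrainingRows rows which
+    // are disjoint across folds plus roughly 0.067 * numberTrainingRows rows
+    // which may be shared between folds.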
+ std::size_t excessSampleSize{static_cast( + std::ceil(excessSampleFraction * numberTrainingRows))}; std::size_t sampleSize{static_cast( - std::min(trainFractionPerFold, 1.0 - trainFractionPerFold) * numberTrainingRows + 0.5)}; - double minimumSizeToSample{static_cast(sampleSize + numberFolds)}; - LOG_TRACE(<< "sample size = " << sampleSize); + (sampleFraction - excessSampleFraction) * numberTrainingRows)}; + LOG_TRACE(<< "excess sample size = " << excessSampleSize + << ", sample size = " << sampleSize); - if (frame.columnIsCategorical()[targetColumn]) { - std::tie(sampler, frequencies) = classifierStratifiedCrossValidationRowSampler( - numberThreads, frame, targetColumn, rng, sampleSize, allTrainingRowsMask); - } else { - sampler = regressionStratifiedCrossValiationRowSampler( - numberThreads, frame, targetColumn, rng, sampleSize, numberBuckets, - allTrainingRowsMask); - } + TDoubleVec frequencies; + + auto makeSampler = [&](std::size_t size) { + TStratifiedSamplerUPtr result; + if (size > 0) { + if (frame.columnIsCategorical()[targetColumn]) { + std::tie(result, frequencies) = classifierStratifiedCrossValidationRowSampler( + numberThreads, frame, targetColumn, rng, size, allTrainingRowsMask); + } else { + result = regressionStratifiedCrossValiationRowSampler( + numberThreads, frame, targetColumn, rng, size, + numberBuckets, allTrainingRowsMask); + } + } + return result; + }; + + auto excessSampler = makeSampler(excessSampleSize); + auto sampler = makeSampler(sampleSize); LOG_TRACE(<< "number training rows = " << allTrainingRowsMask.manhattan()); TPackedBitVectorVec testingRowMasks(numberFolds); TSizeVec rowIndices; - core::CPackedBitVector candidateTestingRowsMask{allTrainingRowsMask}; - for (std::size_t fold = 0; fold < testingRowMasks.size(); ++fold) { - if (candidateTestingRowsMask.manhattan() < minimumSizeToSample) { - testingRowMasks[fold] = candidateTestingRowsMask; - } else { - frame.readRows(1, 0, frame.numberRows(), - [&](const TRowItr& beginRows, const TRowItr& endRows) { - for (auto row = beginRows; row != endRows; ++row) { - sampler->sample(*row); - } - }, - &candidateTestingRowsMask); - sampler->finishSampling(rng, rowIndices); - std::sort(rowIndices.begin(), rowIndices.end()); - LOG_TRACE(<< "# row indices = " << rowIndices.size()); - - for (auto row : rowIndices) { - testingRowMasks[fold].extend(false, row - testingRowMasks[fold].size()); - testingRowMasks[fold].extend(true); - } - testingRowMasks[fold].extend(false, allTrainingRowsMask.size() - - testingRowMasks[fold].size()); + auto sample = [&](const TStratifiedSamplerUPtr& sampler_, + const core::CPackedBitVector& candidateTestingRowsMask) { + frame.readRows(1, 0, frame.numberRows(), + [&](const TRowItr& beginRows, const TRowItr& endRows) { + for (auto row = beginRows; row != endRows; ++row) { + sampler_->sample(*row); + } + }, + &candidateTestingRowsMask); + sampler_->finishSampling(rng, rowIndices); + std::sort(rowIndices.begin(), rowIndices.end()); + LOG_TRACE(<< "# row indices = " << rowIndices.size()); + + core::CPackedBitVector result; + for (auto row : rowIndices) { + result.extend(false, row - result.size()); + result.extend(true); } + result.extend(false, allTrainingRowsMask.size() - result.size()); + return result; + }; - // We exclusive or here to remove the rows we've selected for the current - // test/train fold. This is equivalent to sampling without replacement. 
- candidateTestingRowsMask ^= testingRowMasks[fold]; + core::CPackedBitVector candidateTestingRowsMask{allTrainingRowsMask}; + for (auto& testingRowMask : testingRowMasks) { + testingRowMask = sample(sampler, candidateTestingRowsMask); + candidateTestingRowsMask ^= testingRowMask; + if (excessSampler != nullptr) { + testingRowMask |= sample(excessSampler, allTrainingRowsMask ^ testingRowMask); + } } TPackedBitVectorVec trainingRowMasks{complementRowMasks(testingRowMasks, allTrainingRowsMask)}; @@ -1091,7 +1108,7 @@ CDataFrameUtils::maximizeMinimumRecallForMulticlass(std::size_t numberThreads, // No need to sample if were going to use every row we've been given. if (numberSamples < static_cast(rowMask.manhattan())) { - TStratifiedSamplerPtr sampler; + TStratifiedSamplerUPtr sampler; std::tie(sampler, std::ignore) = classifierStratifiedCrossValidationRowSampler( numberThreads, frame, targetColumn, rng, numberSamples, rowMask); diff --git a/lib/maths/unittest/CDataFrameUtilsTest.cc b/lib/maths/unittest/CDataFrameUtilsTest.cc index 07394afae1..748a5aedc4 100644 --- a/lib/maths/unittest/CDataFrameUtilsTest.cc +++ b/lib/maths/unittest/CDataFrameUtilsTest.cc @@ -467,8 +467,10 @@ BOOST_AUTO_TEST_CASE(testStratifiedCrossValidationRowMasks) { // 2) The test masks are disjoint for each fold, // 3) The train and test masks are disjoint for a given fold, // 4) They're all subsets of the initial mask supplied, - // 5) The number of examples in each category per fold is proportional to - // their overall frequency. + // 5) The number of examples in each category per fold is proportional + // to their overall frequency. + // 6) Test we get the correct size masks if we are using more or less + // training data than implied by k-fold cross-validation. using TDoubleDoubleUMap = boost::unordered_map; @@ -609,6 +611,37 @@ BOOST_AUTO_TEST_CASE(testStratifiedCrossValidationRowMasks) { BOOST_TEST_REQUIRE(maths::CBasicStatistics::variance(testTargetDecileMoments) < 0.02); } } + + for (auto fraction : {0.1, 0.4}) { + TDoubleVec categories; + testRng.generateNormalSamples(0.0, 3.0, numberRows, categories); + + auto frame = core::makeMainStorageDataFrame(numberCols).first; + frame->categoricalColumns(TBoolVec{true}); + for (std::size_t i = 0; i < numberRows; ++i) { + frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) { + *column = std::floor(std::fabs(categories[i])); + }); + } + frame->finishWritingRows(); + + core::CPackedBitVector allTrainingRowsMask{numberRows, true}; + + maths::CDataFrameUtils::TPackedBitVectorVec trainingRowMasks; + maths::CDataFrameUtils::TPackedBitVectorVec testingRowMasks; + std::tie(trainingRowMasks, testingRowMasks, std::ignore) = + maths::CDataFrameUtils::stratifiedCrossValidationRowMasks( + 1, *frame, 0, rng, 3, fraction, numberBins, allTrainingRowsMask); + + BOOST_REQUIRE_EQUAL(trainingRowMasks.size(), testingRowMasks.size()); + for (std::size_t i = 0; i < trainingRowMasks.size(); ++i) { + BOOST_REQUIRE_EQUAL( + numberRows, static_cast( + (trainingRowMasks[i] | testingRowMasks[i]).manhattan())); + BOOST_REQUIRE_EQUAL(fraction, trainingRowMasks[i].manhattan() / + static_cast(numberRows)); + } + } } BOOST_AUTO_TEST_CASE(testMicWithColumn) { From dd002c338e873de41e7e618e879764474dd4f659 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Wed, 7 Jul 2021 10:17:02 +0100 Subject: [PATCH 13/35] Handle edge case creating train/test splits with very little data --- lib/maths/CDataFrameUtils.cc | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git 
a/lib/maths/CDataFrameUtils.cc b/lib/maths/CDataFrameUtils.cc index 10a43a9b53..dd074eacc3 100644 --- a/lib/maths/CDataFrameUtils.cc +++ b/lib/maths/CDataFrameUtils.cc @@ -501,7 +501,7 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads, double numberTrainingRows{allTrainingRowsMask.manhattan()}; if (static_cast(numberTrainingRows) < numberFolds) { - HANDLE_FATAL(<< "Input error: unsufficient training data provided."); + HANDLE_FATAL(<< "Input error: insufficient training data provided."); return {{}, {}, {}}; } @@ -512,8 +512,8 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads, // We sample the smaller of the test or train set in the loop. std::size_t excessSampleSize{static_cast( std::ceil(excessSampleFraction * numberTrainingRows))}; - std::size_t sampleSize{static_cast( - (sampleFraction - excessSampleFraction) * numberTrainingRows)}; + std::size_t sampleSize{static_cast(std::max( + (1.0 + 1e-8) * (sampleFraction - excessSampleFraction) * numberTrainingRows, 1.0))}; LOG_TRACE(<< "excess sample size = " << excessSampleSize << ", sample size = " << sampleSize); @@ -536,6 +536,10 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads, auto excessSampler = makeSampler(excessSampleSize); auto sampler = makeSampler(sampleSize); + if (sampler == nullptr) { + HANDLE_FATAL(<< "Internal error: failed to create train/test splits."); + return {{}, {}, {}}; + } LOG_TRACE(<< "number training rows = " << allTrainingRowsMask.manhattan()); From 37d469085955abea89ed6da3631557d873930721 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Wed, 7 Jul 2021 13:00:49 +0100 Subject: [PATCH 14/35] Slightly relax tests to pass on all platforms --- ...CDataFrameAnalyzerFeatureImportanceTest.cc | 48 +++++++++++++------ lib/maths/unittest/CBoostedTreeTest.cc | 4 +- 2 files changed, 36 insertions(+), 16 deletions(-) diff --git a/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc b/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc index adc367a4f7..d22a808c88 100644 --- a/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc @@ -544,9 +544,14 @@ BOOST_FIXTURE_TEST_CASE(testRegressionFeatureImportanceAllShap, SFixture) { TMeanAccumulator c2TotalShapExpected; TMeanAccumulator c3TotalShapExpected; TMeanAccumulator c4TotalShapExpected; - double c1Sum{0.0}, c2Sum{0.0}, c3Sum{0.0}, c4Sum{0.0}; - double c1TotalShapActual{0.0}, c2TotalShapActual{0.0}, - c3TotalShapActual{0.0}, c4TotalShapActual{0.0}; + double c1Sum{0.0}; + double c2Sum{0.0}; + double c3Sum{0.0}; + double c4Sum{0.0}; + double c1TotalShapActual{0.0}; + double c2TotalShapActual{0.0}; + double c3TotalShapActual{0.0}; + double c4TotalShapActual{0.0}; bool hasTotalFeatureImportance{false}; double baseline{readBaselineValue(results)}; for (const auto& result : results.GetArray()) { @@ -581,9 +586,12 @@ BOOST_FIXTURE_TEST_CASE(testRegressionFeatureImportanceAllShap, SFixture) { } } - // since target is generated using the linear model - // 50 c1 + 150 c2 + 50 c3 - 50 c4, with c1 categorical {-10,10} - // we expect c2 > c1 > c3 \approx c4 + // Since the target is generated using the linear model + // + // 50 c1 + 150 c2 + 50 c3 - 50 c4, with c1 categorical {-10,10} + // + // we expect c2 > c1 > c3 \approx c4. + BOOST_TEST_REQUIRE(c2Sum > c1Sum); // since c1 is categorical -10 or 10, it's influence is generally higher than that of c3 and c4 which are sampled // randomly on [-10, 10]. 
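+    // Concretely, |50 c1| is always 500 while |50 c3| and |50 c4| are uniform
+    // on [0, 500] with mean 250, so on average c1's contribution is roughly
+    // twice theirs.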
@@ -642,15 +650,20 @@ BOOST_FIXTURE_TEST_CASE(testClassificationFeatureImportanceAllShap, SFixture) { // values are indeed a local approximation of the predicted log-odds. std::size_t topShapValues{4}; - auto resultsPair{runBinaryClassification(topShapValues, {0.5, -0.7, 0.2, -0.2})}; + auto resultsPair{runBinaryClassification(topShapValues, {0.5, -0.7, 0.3, -0.3})}; auto results{std::move(resultsPair.first)}; TMeanAccumulator c1TotalShapExpected; TMeanAccumulator c2TotalShapExpected; TMeanAccumulator c3TotalShapExpected; TMeanAccumulator c4TotalShapExpected; - double c1Sum{0.0}, c2Sum{0.0}, c3Sum{0.0}, c4Sum{0.0}; - double c1TotalShapActual[2], c2TotalShapActual[2], c3TotalShapActual[2], - c4TotalShapActual[2]; + double c1Sum{0.0}; + double c2Sum{0.0}; + double c3Sum{0.0}; + double c4Sum{0.0}; + double c1TotalShapActual[2]; + double c2TotalShapActual[2]; + double c3TotalShapActual[2]; + double c4TotalShapActual[2]; bool hasTotalFeatureImportance{false}; double baselineFoo{readBaselineValue(results, "foo")}; double baselineBar{readBaselineValue(results, "bar")}; @@ -698,13 +711,20 @@ BOOST_FIXTURE_TEST_CASE(testClassificationFeatureImportanceAllShap, SFixture) { } } - // since the target using a linear model - // 0.5 c1 + 0.7 c2 + 0.25 c3 - 0.25 c4 - // to generate the log odds we expect c2 > c1 > c3 \approx c4 + // Since the target is using the linear model + // + // 0.5 c1 - 0.7 c2 + 0.2 c3 - 0.2 c4 + // + // to generate the log odds we expect c2 > c1 > c3 \approx c4. + + LOG_DEBUG(<< "c1Sum = " << c1Sum << ", c2Sum = " << c2Sum + << ", c3Sum = " << c3Sum << ", c4Sum = " << c4Sum); + BOOST_TEST_REQUIRE(c2Sum > c1Sum); BOOST_TEST_REQUIRE(c1Sum > c3Sum); BOOST_TEST_REQUIRE(c1Sum > c4Sum); - BOOST_REQUIRE_CLOSE(c3Sum, c4Sum, 40.0); // c3 and c4 within 40% of each other + BOOST_REQUIRE_CLOSE(c3Sum, c4Sum, 20.0); // c3 and c4 within 20% of each other + BOOST_TEST_REQUIRE(hasTotalFeatureImportance); for (std::size_t i = 0; i < classes.size(); ++i) { if (c1TotalShapActual[i] == 0 || c2TotalShapActual[i] == 0 || diff --git a/lib/maths/unittest/CBoostedTreeTest.cc b/lib/maths/unittest/CBoostedTreeTest.cc index e1c13a0abd..3bc9167602 100644 --- a/lib/maths/unittest/CBoostedTreeTest.cc +++ b/lib/maths/unittest/CBoostedTreeTest.cc @@ -548,7 +548,7 @@ BOOST_AUTO_TEST_CASE(testLinear) { // Unbiased... BOOST_REQUIRE_CLOSE_ABSOLUTE( 0.0, modelBias[i][0], - 4.0 * std::sqrt(noiseVariance / static_cast(trainRows))); + 6.0 * std::sqrt(noiseVariance / static_cast(trainRows))); // Good R^2... BOOST_TEST_REQUIRE(modelRSquared[i][0] > 0.97); @@ -1417,7 +1417,7 @@ BOOST_AUTO_TEST_CASE(testBinomialLogisticRegression) { LOG_DEBUG(<< "log relative error = " << maths::CBasicStatistics::mean(logRelativeError)); - BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(logRelativeError) < 0.71); + BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(logRelativeError) < 0.77); meanLogRelativeError.add(maths::CBasicStatistics::mean(logRelativeError)); } From 28f22f43582dbb52b22b70e0ac252f642d383588 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 14:33:48 +0100 Subject: [PATCH 15/35] Review comments --- include/maths/CLowess.h | 17 ++++-------- include/maths/CLowessDetail.h | 35 ----------------------- lib/maths/unittest/CLowessTest.cc | 46 ++++++++++--------------------- 3 files changed, 20 insertions(+), 78 deletions(-) diff --git a/include/maths/CLowess.h b/include/maths/CLowess.h index c440f497f8..16691d029b 100644 --- a/include/maths/CLowess.h +++ b/include/maths/CLowess.h @@ -34,7 +34,8 @@ class CLowess { //! 
//! \param[in] data The training data. //! \param[in] numberFolds The number of folds to use in cross-validation to - // compute the best weight function from the family exp(-k |xi - xj|). + //! compute the best weight function from the family exp(-k |xi - xj|) with + //! k a free parameter which determines the amount of smoothing to use. void fit(TDoubleDoublePrVec data, std::size_t numberFolds); //! Predict the value at \p x. @@ -52,15 +53,6 @@ class CLowess { //! \note Defined as zero if no data have been fit. double residualVariance() const; - //! Compute the sublevel set of \p f containing \p xmin. - //! - //! \param[in] xmin The argument of the minimum of the interpolated function. - //! \param[in] fmin The value of the minimum of the function. - //! \param[in] f The value of the function for which to compute the sublevel set. - //! \note \p f should be greater than fmin. - //! \note Defined as (0,0) if no data have been fit. - TDoubleDoublePr sublevelSet(double xmin, double fmin, double f) const; - //! Get how far we are prepared to extrapolate as the interval we will search //! in the minimum and sublevelSet functions. TDoubleDoublePr extrapolationInterval() const; @@ -81,7 +73,10 @@ class CLowess { private: TDoubleDoublePrVec m_Data; TSizeVec m_Mask; - double m_K = 0.0; + //! The weight to assign to data points when fitting polynomial at x is given + //! by exp(-k |xi - xj|). This can therefore be thought of as the inverse of + //! the amount of smoothing. + double m_K{0.0}; }; } } diff --git a/include/maths/CLowessDetail.h b/include/maths/CLowessDetail.h index 1c6e5be6ed..c9cd96e9bb 100644 --- a/include/maths/CLowessDetail.h +++ b/include/maths/CLowessDetail.h @@ -165,41 +165,6 @@ double CLowess::residualVariance() const { return CBasicStatistics::variance(moments); } -template -typename CLowess::TDoubleDoublePr -CLowess::sublevelSet(double xmin, double fmin, double f) const { - - if (m_Data.empty()) { - return {0.0, 0.0}; - } - if (f <= fmin) { - return {xmin, xmin}; - } - - auto solve = [&](double n, double stop) { - double fx{fmin}; - for (double i = 1.0; i <= n; i += 1.0) { - double xlast{((i - 1.0) * stop + (n - i + 1.0) * xmin) / n}; - double x{(i * stop + (n - i) * xmin) / n}; - double flast{fx}; - fx = this->predict(x); - if (fx > f) { - return CTools::linearlyInterpolate(flast, fx, xlast, x, f); - } - } - return stop; - }; - - double xa, xb; - std::tie(xa, xb) = this->extrapolationInterval(); - double alpha{(xmin - xa) / (xb - xa)}; - double beta{1.0 - alpha}; - LOG_TRACE(<< "alpha = " << alpha << ", beta = " << beta); - - return {solve(std::ceil(alpha * 40.0), xa), - solve(std::ceil((1.0 - alpha) * 40.0), xb)}; -} - template typename CLowess::TDoubleDoublePr CLowess::extrapolationInterval() const { double xa{m_Data.front().first}; diff --git a/lib/maths/unittest/CLowessTest.cc b/lib/maths/unittest/CLowessTest.cc index 7c3a44634b..61c4bec0fc 100644 --- a/lib/maths/unittest/CLowessTest.cc +++ b/lib/maths/unittest/CLowessTest.cc @@ -80,18 +80,6 @@ BOOST_AUTO_TEST_CASE(testInvariants) { BOOST_TEST_REQUIRE(fmin <= lowess.predict(std::max(xmin - 0.1, xea))); BOOST_TEST_REQUIRE(fmin <= lowess.predict(std::min(xmin + 0.1, xeb))); - double xa, xb; - std::tie(xa, xb) = lowess.sublevelSet(xmin, fmin, fmin + 0.1); - BOOST_TEST_REQUIRE(xa <= xmin); - BOOST_TEST_REQUIRE(xb >= xmin); - - BOOST_TEST_REQUIRE(xmin >= xea); - BOOST_TEST_REQUIRE(xmin <= xeb); - BOOST_TEST_REQUIRE(xa >= xea); - BOOST_TEST_REQUIRE(xb <= xeb); - BOOST_TEST_REQUIRE(xa >= xea); - BOOST_TEST_REQUIRE(xb <= xeb); 
- TMeanVarAccumulator residualMoments; for (const auto& x : data) { residualMoments.add(x.second - lowess.predict(x.first)); @@ -202,6 +190,7 @@ BOOST_AUTO_TEST_CASE(testTrainingLossCurves) { // 2. Car-parts // 3. Boston + using TMeanAccumulator = maths::CBasicStatistics::SSampleMean::TAccumulator; using TDoubleDoublePrVecVec = std::vector::TDoubleDoublePrVec>; // clang-format off @@ -232,30 +221,23 @@ BOOST_AUTO_TEST_CASE(testTrainingLossCurves) { {-0.1069061, 9.885219}}, {{-3.800451, 8.317797}, {-3.738576, 8.053429}, {-3.403612, 8.338234}, {-2.801874, 8.890816}, {-2.333564, 8.705093}, {-2.208987, 10.69139}, {-1.803296, 9.234116}, {-1.002829, 10.67219}, {-0.9090844, 12.46085}, {-0.804719, 13.98731}}}; + + // Check against judged minimum for each curve. + TDoubleVec preferredXmin{9.5, 1.7, -0.64, 3.6, -0.1, -0.25, 11.0, 5.0, 2.0, -0.1, -0.2, -2.3, -0.8, 5.5, 3.2, -2.3, 2.0, -0.57, -3.6}; // clang-format on - for (const auto& curve : curves) { + TMeanAccumulator meanRelativeError; + for (std::size_t i = 0; i < curves.size(); ++i) { maths::CLowess<2> lowess; - lowess.fit(curve, curve.size()); - double xmin, fmin; - std::tie(xmin, fmin) = lowess.minimum(); - double variance{lowess.residualVariance()}; - - double xa, xb; - double ftarget{fmin + std::sqrt(variance)}; - std::tie(xa, xb) = lowess.sublevelSet(xmin, fmin, ftarget); - - if (xa <= curve.front().first) { - BOOST_TEST_REQUIRE(lowess.predict(xa) <= 1.01 * ftarget); - } else { - BOOST_REQUIRE_CLOSE(lowess.predict(xa), ftarget, 1.0); // 1.0% - } - if (xb >= curve.back().first) { - BOOST_TEST_REQUIRE(lowess.predict(xb) <= 1.01 * ftarget); - } else { - BOOST_REQUIRE_CLOSE(lowess.predict(xb), ftarget, 1.0); // 1.0% - } + lowess.fit(curves[i], curves[i].size()); + double xmin; + std::tie(xmin, std::ignore) = lowess.minimum(); + + meanRelativeError.add(std::fabs(xmin - preferredXmin[i]) / + std::fabs(preferredXmin[i])); } + + BOOST_REQUIRE_CLOSE_ABSOLUTE(0.0, maths::CBasicStatistics::mean(meanRelativeError), 0.25); } BOOST_AUTO_TEST_SUITE_END() From 4f3e3f9e2e547eb9758297500a31029e6910eac1 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 14:34:56 +0100 Subject: [PATCH 16/35] Review comments --- include/maths/CLowess.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/maths/CLowess.h b/include/maths/CLowess.h index 16691d029b..598e8d46f9 100644 --- a/include/maths/CLowess.h +++ b/include/maths/CLowess.h @@ -7,9 +7,8 @@ #ifndef INCLUDED_ml_maths_CLowess_h #define INCLUDED_ml_maths_CLowess_h -#include - #include +#include #include #include From 5d4edbae395d0d3cb18c48c0ae0dc7857dd9c124 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 14:43:02 +0100 Subject: [PATCH 17/35] Explain p. --- include/maths/CLowessDetail.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/maths/CLowessDetail.h b/include/maths/CLowessDetail.h index c9cd96e9bb..64b9e3ce14 100644 --- a/include/maths/CLowessDetail.h +++ b/include/maths/CLowessDetail.h @@ -49,8 +49,10 @@ void CLowess::fit(TDoubleDoublePrVec data, std::size_t numberFolds) { // // p^*(x) = argmin_p{ sum_i{ w_i (Y_i - poly(X_i | p))^2 } } // - // where w = exp(-k (x - X_i)) and (X, Y) are the data to fit. We determine k by - // solving + // where w = exp(-k (x - X_i)), (X, Y) are the data to fit and p is the vector of + // parameters for the polynomial function, i.e. the coefficients p_0 + p_1 x + ... 
+ // + // We determine k by solving // // k^* = argmin_k{ sum_{Yi in H}{ L(Yi | f(x | p^*)) } } // From 5748ce1cbb0aebd9ce8593160957ae30218d7161 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 14:49:04 +0100 Subject: [PATCH 18/35] Explain poly --- include/maths/CLowessDetail.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/maths/CLowessDetail.h b/include/maths/CLowessDetail.h index 64b9e3ce14..270c57596c 100644 --- a/include/maths/CLowessDetail.h +++ b/include/maths/CLowessDetail.h @@ -47,10 +47,12 @@ void CLowess::fit(TDoubleDoublePrVec data, std::size_t numberFolds) { // // f(x | p^*) = poly(x | p^*(x)) // - // p^*(x) = argmin_p{ sum_i{ w_i (Y_i - poly(X_i | p))^2 } } + // p^*(x) = argmin_p{ sum_i{ w_i (Y_i - poly(X_i | p))^2 } } (2) // - // where w = exp(-k (x - X_i)), (X, Y) are the data to fit and p is the vector of - // parameters for the polynomial function, i.e. the coefficients p_0 + p_1 x + ... + // where w = exp(-k (x - X_i)), (X, Y) are the data to fit and p is the vector + // of parameters for the polynomial function poly(. | p), i.e. the coefficients + // p_0 + p_1 x + p_2 x^2 ... (which are determined by minimizing the weighted + // least square prediction errors as in (2)). // // We determine k by solving // From c252b2475bb2d78e3c3da179f5b5936c2e244c39 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 14:58:12 +0100 Subject: [PATCH 19/35] Add explanation of mechanics of fit --- include/maths/CLowessDetail.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/maths/CLowessDetail.h b/include/maths/CLowessDetail.h index 270c57596c..3c6ecbad83 100644 --- a/include/maths/CLowessDetail.h +++ b/include/maths/CLowessDetail.h @@ -68,6 +68,12 @@ void CLowess::fit(TDoubleDoublePrVec data, std::size_t numberFolds) { TSizeVecVec testingMasks; this->setupMasks(numberFolds, trainingMasks, testingMasks); + // Here, we line search different values of m_K. We aim to cover the case we have + // a lot of smoothing, m_K is 0, to the case m_K is large compared to the data + // range so most points have very low weight and don't constrain the polynomial + // parameters. We finish up by polishing up the minimum on the best candidate + // interval using Brent's method. See CSolvers::globalMaximize for details. + TDoubleVec K(17); double range{m_Data.back().first - m_Data.front().first}; for (std::size_t i = 0; i < K.size(); ++i) { From 9a7feea7af5dbe58ade1c40434396be07a8375ab Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 15:02:28 +0100 Subject: [PATCH 20/35] Make k dependency clear --- include/maths/CLowessDetail.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/maths/CLowessDetail.h b/include/maths/CLowessDetail.h index 3c6ecbad83..d10b083f46 100644 --- a/include/maths/CLowessDetail.h +++ b/include/maths/CLowessDetail.h @@ -56,10 +56,11 @@ void CLowess::fit(TDoubleDoublePrVec data, std::size_t numberFolds) { // // We determine k by solving // - // k^* = argmin_k{ sum_{Yi in H}{ L(Yi | f(x | p^*)) } } + // k^* = argmin_k{ sum_{Yi in H}{ L(Yi | f(x | p^*(k))) } } // - // where H is a hold out set and we assume Y_i ~ N(poly(X_i | p^*), sigma) with - // sigma estimated from the training data prediction residuals. + // where H is a hold out set and we assume Y_i ~ N(poly(X_i | p^*(k)), sigma) + // with sigma estimated from the training data prediction residuals to compute + // the likelihood function L(Yi | f(x | p^*(k))). 
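+    //
+    // In practice, for each fold we fit a normal distribution to the training
+    // data's leave-one-out prediction residuals and score the held out points
+    // by their joint log marginal likelihood under that distribution (see the
+    // likelihood member function below).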
m_Mask.resize(m_Data.size()); std::iota(m_Mask.begin(), m_Mask.end(), 0); From 5b1a0183573b3c1663f0ea9a2ffe9fd5a32f42dd Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 15:05:35 +0100 Subject: [PATCH 21/35] Document test interface --- include/maths/CLowess.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/maths/CLowess.h b/include/maths/CLowess.h index 598e8d46f9..b79b1a3c11 100644 --- a/include/maths/CLowess.h +++ b/include/maths/CLowess.h @@ -47,6 +47,8 @@ class CLowess { //! \note Defined as (0,0) if no data have been fit. TDoubleDoublePr minimum() const; + //! \name Test Functions + //@{ //! Get an estimate of residual variance at the observed values. //! //! \note Defined as zero if no data have been fit. @@ -55,6 +57,7 @@ class CLowess { //! Get how far we are prepared to extrapolate as the interval we will search //! in the minimum and sublevelSet functions. TDoubleDoublePr extrapolationInterval() const; + //@} private: using TDoubleVec = std::vector; From 93d3264aeef8aefd8ec79fc28161b7089d106a4d Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 16:04:15 +0100 Subject: [PATCH 22/35] Names, explanation and coding style guideline fixes --- include/maths/CLowessDetail.h | 45 ++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/include/maths/CLowessDetail.h b/include/maths/CLowessDetail.h index d10b083f46..80d914126a 100644 --- a/include/maths/CLowessDetail.h +++ b/include/maths/CLowessDetail.h @@ -50,7 +50,7 @@ void CLowess::fit(TDoubleDoublePrVec data, std::size_t numberFolds) { // p^*(x) = argmin_p{ sum_i{ w_i (Y_i - poly(X_i | p))^2 } } (2) // // where w = exp(-k (x - X_i)), (X, Y) are the data to fit and p is the vector - // of parameters for the polynomial function poly(. | p), i.e. the coefficients + // of parameters for the polynomial function poly(. | p), i.e. the coefficients // p_0 + p_1 x + p_2 x^2 ... (which are determined by minimizing the weighted // least square prediction errors as in (2)). // @@ -119,17 +119,19 @@ typename CLowess::TDoubleDoublePr CLowess::minimum() const { TDoubleVec X; - double xa, xb; + double xa; + double xb; std::tie(xa, xb) = this->extrapolationInterval(); // Coarse. X.reserve(m_Data.size() + 2); X.push_back(xa); - for (std::size_t i = 0; i < m_Data.size(); ++i) { - X.push_back(m_Data[i].first); + for (const auto& xi : m_Data) { + X.push_back(xi.first); } X.push_back(xb); - double xmin, fmin; + double xmin; + double fmin; CSolvers::globalMinimize(X, [&](double x) { return this->predict(x); }, xmin, fmin); // Refine. 
@@ -141,7 +143,8 @@ typename CLowess::TDoubleDoublePr CLowess::minimum() const { for (double x = xa; x < xb; x += dx) { X.push_back(x); } - double xcand, fcand; + double xcand; + double fcand; CSolvers::globalMinimize(X, [&](double x) { return this->predict(x); }, xcand, fcand); if (fcand < fmin) { @@ -166,11 +169,12 @@ double CLowess::residualVariance() const { TSizeVec mask(n); std::iota(mask.begin(), mask.end(), 1); for (std::size_t i = 0; i < n; ++i) { - double xi, yi; + double xi; + double yi; std::tie(xi, yi) = m_Data[i]; auto poly = this->fit(mask.begin(), mask.begin() + n - 1, m_K, xi); moments.add(yi - poly.predict(xi)); - mask[i] = i; + mask[i] -= 1; } return CBasicStatistics::variance(moments); @@ -249,7 +253,7 @@ double CLowess::likelihood(TSizeVecVec& trainingMasks, TSizeVecVec& testingMa double result{0.0}; - CNormalMeanPrecConjugate::TDouble1Vec samples; + CNormalMeanPrecConjugate::TDouble1Vec testResiduals; CNormalMeanPrecConjugate::TDoubleWeightsAry1Vec weights; for (std::size_t i = 0; i < trainingMasks.size(); ++i) { @@ -260,8 +264,13 @@ double CLowess::likelihood(TSizeVecVec& trainingMasks, TSizeVecVec& testingMa std::size_t last{trainingMasks[i].size() - 1}; for (auto& j : trainingMasks[i]) { - double xj, yj; + double xj; + double yj; std::tie(xj, yj) = m_Data[j]; + // Here we wish to leave out the j'th fold training mask. Since this + // is a vector we do this efficiently by temporarily swaping to the + // back of the collection so we can pass the masks as a contiguous + // range. std::swap(j, trainingMasks[i][last]); auto poly = this->fit(trainingMasks[i].cbegin(), trainingMasks[i].cbegin() + last, k, xj); @@ -270,20 +279,21 @@ double CLowess::likelihood(TSizeVecVec& trainingMasks, TSizeVecVec& testingMa } LOG_TRACE(<< "residual distribution = " << residuals.print()); - samples.clear(); - samples.reserve(testingMasks[i].size()); + testResiduals.clear(); + testResiduals.reserve(testingMasks[i].size()); for (auto j : testingMasks[i]) { - double xj, yj; + double xj; + double yj; std::tie(xj, yj) = m_Data[j]; auto poly = this->fit(trainingMasks[i].cbegin(), trainingMasks[i].cend(), k, xj); - samples.push_back(yj - poly.predict(xj)); + testResiduals.push_back(yj - poly.predict(xj)); } weights.assign(testingMasks[i].size(), maths_t::CUnitWeights::UNIT); - LOG_TRACE(<< "samples = " << samples); + LOG_TRACE(<< "test residuals = " << testResiduals); double likelihood; - residuals.jointLogMarginalLikelihood(samples, weights, likelihood); + residuals.jointLogMarginalLikelihood(testResiduals, weights, likelihood); result += likelihood; } LOG_TRACE(<< "k = " << k << ", likelihood = " << result); @@ -296,7 +306,8 @@ typename CLowess::TPolynomial CLowess::fit(TSizeVecCItr beginMask, TSizeVecCItr endMask, double k, double x) const { TPolynomial poly; for (auto i = beginMask; i != endMask; ++i) { - double xi, yi; + double xi; + double yi; std::tie(xi, yi) = m_Data[*i]; poly.add(xi, yi, this->weight(k, xi, x)); } From ae45379852e4c1642efdcddc08b322fe222494e8 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 16:14:47 +0100 Subject: [PATCH 23/35] Explicit capture --- include/maths/CSolvers.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/maths/CSolvers.h b/include/maths/CSolvers.h index 8d55b162dd..ebe8efbb1d 100644 --- a/include/maths/CSolvers.h +++ b/include/maths/CSolvers.h @@ -859,7 +859,7 @@ class MATHS_EXPORT CSolvers { //! \param[out] fx Set to the value of f at \p x. 
template static bool globalMaximize(const T& p, const F& f, double& x, double& fx) { - auto minusF = [&](double x_) { return -f(x_); }; + auto minusF = [&f](double x_) { return -f(x_); }; bool result{globalMinimize(p, minusF, x, fx)}; fx = -fx; return result; From efdadc0ffdda2d7604ecec687c86b988d341dfba Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 16:17:12 +0100 Subject: [PATCH 24/35] Typo --- lib/maths/CBoostedTreeFactory.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index b52706c6b2..2ff19ccc80 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -332,8 +332,8 @@ void CBoostedTreeFactory::initializeNumberFolds(core::CDataFrame& frame) const { // // In addition, we want to constrain the maximum amount of training data we'll // use during hyperparameter search to avoid very long run times. To do this - // we use less than the implied 1 - 1/k : 1/k train : test split when it results - // in more train rows than the defined maximum. + // we use less than the implied 1 - 1/k : 1/k for the train : test split when + // it results in more train rows than the defined maximum. double initialDownsampleFraction{(m_InitialDownsampleRowsPerFeature * static_cast(frame.numberColumns() - 1)) / From 59c9addbe294616de9aefaaa53264a667f894e56 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 16:23:53 +0100 Subject: [PATCH 25/35] Capture by reference --- include/maths/CSolvers.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/maths/CSolvers.h b/include/maths/CSolvers.h index ebe8efbb1d..4fd45acaf2 100644 --- a/include/maths/CSolvers.h +++ b/include/maths/CSolvers.h @@ -909,7 +909,8 @@ class MATHS_EXPORT CSolvers { std::swap(fa, fb); } - double x, fx; + double x; + double fx; { std::size_t n = maxIterations; minimize(a, b, fa, fb, f, 0.0, n, fc, x, fx); @@ -922,7 +923,7 @@ class MATHS_EXPORT CSolvers { // [a, x] and [b, r] bracket the sublevel set end points. 
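+        // Capture f by reference: the lambda only needs to call it, so there is
+        // no reason to copy the function object, which could be expensive for a
+        // stateful functor; fc is a double and cheap to capture by value.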
- auto fMinusFc = [=](double x_) { return f(x_) - fc; }; + auto fMinusFc = [&f, fc](double x_) { return f(x_) - fc; }; LOG_TRACE(<< "a = " << a << ", x = " << x << ", b = " << b); LOG_TRACE(<< "f_(a) = " << fa - fc << ", f_(x) = " << fx - fc From 74c27f924e91559d1edc0db7e809b61f181b2b98 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 16:25:58 +0100 Subject: [PATCH 26/35] Rename --- lib/api/CDataFrameTrainBoostedTreeRunner.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/api/CDataFrameTrainBoostedTreeRunner.cc b/lib/api/CDataFrameTrainBoostedTreeRunner.cc index b10932a5b2..0d6a62cac8 100644 --- a/lib/api/CDataFrameTrainBoostedTreeRunner.cc +++ b/lib/api/CDataFrameTrainBoostedTreeRunner.cc @@ -32,7 +32,7 @@ namespace ml { namespace api { namespace { -const std::size_t UNSET_NUMBER_ROUNDS_PER_HYPERPARAMETER{ +const std::size_t NUMBER_ROUNDS_PER_HYPERPARAMETER_IS_UNSET{ std::numeric_limits::max()}; } @@ -106,7 +106,7 @@ CDataFrameTrainBoostedTreeRunner::CDataFrameTrainBoostedTreeRunner( double trainFractionPerFold{parameters[TRAIN_FRACTION_PER_FOLD].fallback(-1.0)}; std::size_t numberRoundsPerHyperparameter{ parameters[MAX_OPTIMIZATION_ROUNDS_PER_HYPERPARAMETER].fallback( - UNSET_NUMBER_ROUNDS_PER_HYPERPARAMETER)}; + NUMBER_ROUNDS_PER_HYPERPARAMETER_IS_UNSET)}; std::size_t bayesianOptimisationRestarts{ parameters[BAYESIAN_OPTIMISATION_RESTARTS].fallback(std::size_t{0})}; bool stopCrossValidationEarly{parameters[STOP_CROSS_VALIDATION_EARLY].fallback(true)}; @@ -205,7 +205,7 @@ CDataFrameTrainBoostedTreeRunner::CDataFrameTrainBoostedTreeRunner( if (trainFractionPerFold > 0.0) { m_BoostedTreeFactory->trainFractionPerFold(trainFractionPerFold); } - if (numberRoundsPerHyperparameter != UNSET_NUMBER_ROUNDS_PER_HYPERPARAMETER) { + if (numberRoundsPerHyperparameter != NUMBER_ROUNDS_PER_HYPERPARAMETER_IS_UNSET) { m_BoostedTreeFactory->maximumOptimisationRoundsPerHyperparameter(numberRoundsPerHyperparameter); } if (bayesianOptimisationRestarts > 0) { From ca1d910ec35041c75e9340fd37675a0b290fef81 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 16:37:56 +0100 Subject: [PATCH 27/35] Update comment to reflect the current behaviour --- include/maths/CBoostedTreeFactory.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/maths/CBoostedTreeFactory.h b/include/maths/CBoostedTreeFactory.h index 4c18a61354..ec19a654d3 100644 --- a/include/maths/CBoostedTreeFactory.h +++ b/include/maths/CBoostedTreeFactory.h @@ -209,9 +209,10 @@ class MATHS_EXPORT CBoostedTreeFactory final { TDoubleDoublePrVec estimateTreeGainAndCurvature(core::CDataFrame& frame, const TDoubleVec& percentiles) const; - //! Perform a line search for the test loss w.r.t. a single regularization - //! hyperparameter and apply Newton's method to find the minimum. The plan - //! is to find a value near where the model starts to overfit. + //! Perform a line search for the test loss w.r.t. a single hyperparameter. + //! At the end we use a smooth curve fit through all test loss values (using + //! LOWESS regression) and use this to get a best estimate of where the true + //! minimum occurs. //! //! \return The interval to search during the main hyperparameter optimisation //! loop or null if this couldn't be found. 
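A minimal sketch of the post-processing described in the comment above, using the CLowess<2> interface added earlier in this series: the (hyperparameter value, test loss) pairs collected by the line search are smoothed and the location of the minimum of the smoothed curve is used as the best estimate of the true minimum. The helper name smoothedTestLossMinimum and the choice of one cross-validation fold per point are illustrative assumptions, not the exact wiring inside CBoostedTreeFactory.

#include <maths/CLowess.h>
#include <maths/CLowessDetail.h>

#include <cstddef>
#include <utility>

using TLowess = ml::maths::CLowess<2>;

// Smooth the (hyperparameter value, test loss) pairs from a line search and
// return the (argmin, value) of the smoothed curve; (0, 0) if fewer than four
// points were supplied.
TLowess::TDoubleDoublePr smoothedTestLossMinimum(TLowess::TDoubleDoublePrVec testLosses) {
    std::size_t numberFolds{testLosses.size()};
    TLowess lowess;
    // One fold per point, as the training loss curve unit test does.
    lowess.fit(std::move(testLosses), numberFolds);
    return lowess.minimum();
}
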
From 40eae57f73d43f8845f49c4c784ebb2ebe90af24 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 17:31:44 +0100 Subject: [PATCH 28/35] Name variable for readability --- lib/maths/unittest/CLowessTest.cc | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/lib/maths/unittest/CLowessTest.cc b/lib/maths/unittest/CLowessTest.cc index 61c4bec0fc..f59227aee4 100644 --- a/lib/maths/unittest/CLowessTest.cc +++ b/lib/maths/unittest/CLowessTest.cc @@ -40,6 +40,8 @@ BOOST_AUTO_TEST_CASE(testInvariants) { test::CRandomNumbers rng; + std::size_t numberFolds{5}; + TDoubleVec scale; TDoubleVec offset; TDoubleVec noise; @@ -69,12 +71,14 @@ BOOST_AUTO_TEST_CASE(testInvariants) { } maths::CLowess<2> lowess; - lowess.fit(data, 5); + lowess.fit(data, numberFolds); - double xea, xeb; + double xea; + double xeb; std::tie(xea, xeb) = lowess.extrapolationInterval(); - double xmin, fmin; + double xmin; + double fmin; std::tie(xmin, fmin) = lowess.minimum(); BOOST_REQUIRE_EQUAL(fmin, lowess.predict(xmin)); BOOST_TEST_REQUIRE(fmin <= lowess.predict(std::max(xmin - 0.1, xea))); @@ -96,6 +100,8 @@ BOOST_AUTO_TEST_CASE(testSmooth) { test::CRandomNumbers rng; + std::size_t numberFolds{5}; + auto trend = [](double x) { return 8.0 * std::sin(boost::math::double_constants::two_pi / 20.0 * x); }; @@ -107,7 +113,7 @@ BOOST_AUTO_TEST_CASE(testSmooth) { } maths::CLowess<2> lowess; - lowess.fit(data, 5); + lowess.fit(data, numberFolds); TMeanVarAccumulator errorMoments; for (std::size_t i = 0; i < 20; ++i) { @@ -125,6 +131,8 @@ BOOST_AUTO_TEST_CASE(testSmoothPlusNoise) { test::CRandomNumbers rng; + std::size_t numberFolds{5}; + TDoubleVec noise; rng.generateNormalSamples(0.0, 4.0, 20, noise); @@ -139,7 +147,7 @@ BOOST_AUTO_TEST_CASE(testSmoothPlusNoise) { } maths::CLowess<2> lowess; - lowess.fit(data, 5); + lowess.fit(data, numberFolds); TMeanVarAccumulator errorMoments; for (std::size_t i = 0; i < 20; ++i) { @@ -159,6 +167,8 @@ BOOST_AUTO_TEST_CASE(testMinimum) { test::CRandomNumbers rng; + std::size_t numberFolds{5}; + auto trend = [](double x) { return 8.0 * std::sin(boost::math::double_constants::two_pi / 20.0 * x); }; @@ -170,9 +180,10 @@ BOOST_AUTO_TEST_CASE(testMinimum) { } maths::CLowess<2> lowess; - lowess.fit(data, 5); + lowess.fit(data, numberFolds); - double x, fx; + double x; + double fx; std::tie(x, fx) = lowess.minimum(); // Expect minimum at ((3 / 2) * pi) / (2 pi / 20) = 15 and a value of around -8.0; From 92de10f0bf5efc0473f325dc080d3b8dc02fadc8 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 17:33:23 +0100 Subject: [PATCH 29/35] Typedef --- lib/maths/unittest/CLowessTest.cc | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lib/maths/unittest/CLowessTest.cc b/lib/maths/unittest/CLowessTest.cc index f59227aee4..57a2a4acbd 100644 --- a/lib/maths/unittest/CLowessTest.cc +++ b/lib/maths/unittest/CLowessTest.cc @@ -24,6 +24,7 @@ using namespace ml; using TDoubleVec = std::vector; using TDoubleVecVec = std::vector; +using TDoubleDoublePrVec = maths::CLowess<2>::TDoubleDoublePrVec; using TMeanVarAccumulator = maths::CBasicStatistics::SSampleMeanVar::TAccumulator; BOOST_AUTO_TEST_CASE(testInvariants) { @@ -45,7 +46,7 @@ BOOST_AUTO_TEST_CASE(testInvariants) { TDoubleVec scale; TDoubleVec offset; TDoubleVec noise; - maths::CLowess<2>::TDoubleDoublePrVec data; + TDoubleDoublePrVec data; std::function trends[]{ [&](double x) { @@ -106,7 +107,7 @@ BOOST_AUTO_TEST_CASE(testSmooth) { return 8.0 * 
std::sin(boost::math::double_constants::two_pi / 20.0 * x); }; - maths::CLowess<2>::TDoubleDoublePrVec data; + TDoubleDoublePrVec data; for (std::size_t i = 0; i < 20; ++i) { double x{static_cast(i)}; data.emplace_back(x, trend(x)); @@ -140,7 +141,7 @@ BOOST_AUTO_TEST_CASE(testSmoothPlusNoise) { return 8.0 * std::sin(boost::math::double_constants::two_pi / 20.0 * x); }; - maths::CLowess<2>::TDoubleDoublePrVec data; + TDoubleDoublePrVec data; for (std::size_t i = 0; i < noise.size(); ++i) { double x{static_cast(i)}; data.emplace_back(x, trend(x) + noise[i]); @@ -173,7 +174,7 @@ BOOST_AUTO_TEST_CASE(testMinimum) { return 8.0 * std::sin(boost::math::double_constants::two_pi / 20.0 * x); }; - maths::CLowess<2>::TDoubleDoublePrVec data; + TDoubleDoublePrVec data; for (std::size_t i = 0; i < 20; ++i) { double x{static_cast(i)}; data.emplace_back(x, trend(x)); @@ -202,7 +203,7 @@ BOOST_AUTO_TEST_CASE(testTrainingLossCurves) { // 3. Boston using TMeanAccumulator = maths::CBasicStatistics::SSampleMean::TAccumulator; - using TDoubleDoublePrVecVec = std::vector::TDoubleDoublePrVec>; + using TDoubleDoublePrVecVec = std::vector; // clang-format off TDoubleDoublePrVecVec curves{ From d0be22f7f510fc797548d0f994b0ce62e5d3bedc Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 17:41:09 +0100 Subject: [PATCH 30/35] Define small constant used to prefer fast training if test error is similar --- lib/maths/CBoostedTreeFactory.cc | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index 2ff19ccc80..5b6822e47a 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -39,6 +39,7 @@ const std::size_t BEST_PARAMETER_INDEX{1}; const std::size_t MAX_PARAMETER_INDEX{2}; const std::size_t MAX_LINE_SEARCH_ITERATIONS{10}; const double LINE_SEARCH_MINIMUM_RELATIVE_EI_TO_CONTINUE{0.01}; +const double SMALL_RELATIVE_TEST_LOSS_INCREASE{0.01}; const double MIN_ROWS_PER_FEATURE{20.0}; const double MIN_SOFT_DEPTH_LIMIT{2.0}; const double MIN_SOFT_DEPTH_LIMIT_TOLERANCE{0.05}; @@ -801,7 +802,8 @@ void CBoostedTreeFactory::initializeUnsetDownsampleFactor(core::CDataFrame& fram double minTestLoss, double testLoss) { return testLoss + CTools::linearlyInterpolate( logMinDownsampleFactor, logMaxDownsampleFactor, - 0.0, 0.01 * minTestLoss, logDownsampleFactor); + 0.0, SMALL_RELATIVE_TEST_LOSS_INCREASE * minTestLoss, + logDownsampleFactor); }; TVector fallback; @@ -865,10 +867,10 @@ void CBoostedTreeFactory::initializeUnsetFeatureBagFraction(core::CDataFrame& fr // larger than the minimum. 
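+    // For example, at the maximum feature bag fraction the penalty is
+    // SMALL_RELATIVE_TEST_LOSS_INCREASE * minTestLoss, i.e. 1% of the minimum
+    // test loss, so a larger fraction is only chosen if it improves the test
+    // loss by more than roughly this amount.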
auto adjustTestLoss = [=](double logFeatureBagFraction, double minTestLoss, double testLoss) { - return testLoss + - CTools::linearlyInterpolate( - logMinFeatureBagFraction, logMaxFeatureBagFraction, - 0.0, 0.01 * minTestLoss, logFeatureBagFraction); + return testLoss + CTools::linearlyInterpolate( + logMinFeatureBagFraction, logMaxFeatureBagFraction, + 0.0, SMALL_RELATIVE_TEST_LOSS_INCREASE * minTestLoss, + logFeatureBagFraction); }; TVector fallback; From a380b204ca8d3ebf6d5c12dbd472ecd608c9ae38 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Fri, 9 Jul 2021 09:49:48 +0100 Subject: [PATCH 31/35] We should record the fraction and number of training rows in the model meta data --- include/api/CInferenceModelMetadata.h | 15 ++++++++--- include/maths/CBoostedTree.h | 8 +++++- include/maths/CBoostedTreeFactory.h | 18 ++++++------- include/maths/CBoostedTreeImpl.h | 12 ++++++--- ...ataFrameAnalysisInstrumentationInterface.h | 2 +- include/maths/CDataFramePredictiveModel.h | 6 +++++ include/maths/CLowess.h | 2 +- lib/api/CDataFrameAnalysisInstrumentation.cc | 9 ++++--- ...taFrameTrainBoostedTreeClassifierRunner.cc | 6 +++-- ...taFrameTrainBoostedTreeRegressionRunner.cc | 5 ++-- lib/api/CInferenceModelMetadata.cc | 25 +++++++++++++++++++ lib/maths/CBoostedTree.cc | 8 ++++++ lib/maths/CBoostedTreeImpl.cc | 21 +++++++++++++--- 13 files changed, 107 insertions(+), 30 deletions(-) diff --git a/include/api/CInferenceModelMetadata.h b/include/api/CInferenceModelMetadata.h index 099a374096..d2f124431c 100644 --- a/include/api/CInferenceModelMetadata.h +++ b/include/api/CInferenceModelMetadata.h @@ -40,8 +40,10 @@ class API_EXPORT CInferenceModelMetadata { static const std::string JSON_MEAN_MAGNITUDE_TAG; static const std::string JSON_MIN_TAG; static const std::string JSON_MODEL_METADATA_TAG; + static const std::string JSON_NUM_TRAINING_ROWS_TAG; static const std::string JSON_RELATIVE_IMPORTANCE_TAG; static const std::string JSON_TOTAL_FEATURE_IMPORTANCE_TAG; + static const std::string JSON_TRAIN_PARAMETERS_TAG; public: using TVector = maths::CDenseVector; @@ -64,6 +66,10 @@ class API_EXPORT CInferenceModelMetadata { //! to the baseline value). void featureImportanceBaseline(TVector&& baseline); void hyperparameterImportance(const maths::CBoostedTree::THyperparameterImportanceVec& hyperparameterImportance); + //! Set the number of rows used to train the model. + void numberTrainingRows(std::size_t numberRows); + //! Set the fraction of data per fold used for training when tuning hyperparameters. 
+ void trainFractionPerFold(double fraction); private: struct SHyperparameterImportance { @@ -86,8 +92,9 @@ class API_EXPORT CInferenceModelMetadata { private: void writeTotalFeatureImportance(TRapidJsonWriter& writer) const; - void writeHyperparameterImportance(TRapidJsonWriter& writer) const; void writeFeatureImportanceBaseline(TRapidJsonWriter& writer) const; + void writeHyperparameterImportance(TRapidJsonWriter& writer) const; + void writeTrainParameters(TRapidJsonWriter& writer) const; private: TSizeMeanAccumulatorUMap m_TotalShapValuesMean; @@ -95,11 +102,13 @@ class API_EXPORT CInferenceModelMetadata { TOptionalVector m_ShapBaseline; TStrVec m_ColumnNames; TStrVec m_ClassValues; - TPredictionFieldTypeResolverWriter m_PredictionFieldTypeResolverWriter = + TPredictionFieldTypeResolverWriter m_PredictionFieldTypeResolverWriter{ [](const std::string& value, TRapidJsonWriter& writer) { writer.String(value); - }; + }}; THyperparametersVec m_HyperparameterImportance; + std::size_t m_NumberTrainingRows{0}; + double m_TrainFractionPerFold{0.0}; }; } } diff --git a/include/maths/CBoostedTree.h b/include/maths/CBoostedTree.h index 028c92cbde..a1a3898f4c 100644 --- a/include/maths/CBoostedTree.h +++ b/include/maths/CBoostedTree.h @@ -208,7 +208,7 @@ class MATHS_EXPORT CBoostedTree final : public CDataFramePredictiveModel { class MATHS_EXPORT CVisitor : public CDataFrameCategoryEncoder::CVisitor, public CBoostedTreeNode::CVisitor { public: - virtual ~CVisitor() = default; + ~CVisitor() override = default; virtual void addTree() = 0; virtual void addClassificationWeights(TDoubleVec weights) = 0; virtual void addLossFunction(const TLossFunction& lossFunction) = 0; @@ -236,6 +236,12 @@ class MATHS_EXPORT CBoostedTree final : public CDataFramePredictiveModel { //! Get the vector of hyperparameter importances. THyperparameterImportanceVec hyperparameterImportance() const; + //! Get the number of rows used to train the model. + std::size_t numberTrainingRows() const override; + + //! Get the fraction of data per fold used for training when tuning hyperparameters. + double trainFractionPerFold() const override; + //! Get the column containing the dependent variable. 
std::size_t columnHoldingDependentVariable() const override; diff --git a/include/maths/CBoostedTreeFactory.h b/include/maths/CBoostedTreeFactory.h index ec19a654d3..537c6eb933 100644 --- a/include/maths/CBoostedTreeFactory.h +++ b/include/maths/CBoostedTreeFactory.h @@ -278,14 +278,14 @@ class MATHS_EXPORT CBoostedTreeFactory final { private: TOptionalDouble m_MinimumFrequencyToOneHotEncode; TOptionalSize m_BayesianOptimisationRestarts; - bool m_StratifyRegressionCrossValidation = true; - double m_InitialDownsampleRowsPerFeature = 200.0; - std::size_t m_MaximumNumberOfTrainRows = 500000; - double m_GainPerNode1stPercentile = 0.0; - double m_GainPerNode50thPercentile = 0.0; - double m_GainPerNode90thPercentile = 0.0; - double m_TotalCurvaturePerNode1stPercentile = 0.0; - double m_TotalCurvaturePerNode90thPercentile = 0.0; + bool m_StratifyRegressionCrossValidation{true}; + double m_InitialDownsampleRowsPerFeature{200.0}; + std::size_t m_MaximumNumberOfTrainRows{500000}; + double m_GainPerNode1stPercentile{0.0}; + double m_GainPerNode50thPercentile{0.0}; + double m_GainPerNode90thPercentile{0.0}; + double m_TotalCurvaturePerNode1stPercentile{0.0}; + double m_TotalCurvaturePerNode90thPercentile{0.0}; std::size_t m_NumberThreads; TBoostedTreeImplUPtr m_TreeImpl; TVector m_LogDownsampleFactorSearchInterval; @@ -295,7 +295,7 @@ class MATHS_EXPORT CBoostedTreeFactory final { TVector m_LogLeafWeightPenaltyMultiplierSearchInterval; TVector m_SoftDepthLimitSearchInterval; TVector m_LogEtaSearchInterval; - TTrainingStateCallback m_RecordTrainingState = noopRecordTrainingState; + TTrainingStateCallback m_RecordTrainingState{noopRecordTrainingState}; }; } } diff --git a/include/maths/CBoostedTreeImpl.h b/include/maths/CBoostedTreeImpl.h index e102529607..b76c996b0d 100644 --- a/include/maths/CBoostedTreeImpl.h +++ b/include/maths/CBoostedTreeImpl.h @@ -150,6 +150,13 @@ class MATHS_EXPORT CBoostedTreeImpl final { //! \return The best hyperparameters for validation error found so far. const CBoostedTreeHyperparameters& bestHyperparameters() const; + //! \return The fraction of data we use for train per fold when tuning hyperparameters. + double trainFractionPerFold() const; + + //! \return The full training set data mask, i.e. all rows which aren't missing + //! the dependent variable. + core::CPackedBitVector allTrainingRowsMask() const; + //!\ name Test Only //@{ //! The name of the object holding the best hyperaparameters in the state document. @@ -203,9 +210,8 @@ class MATHS_EXPORT CBoostedTreeImpl final { //! Check if we can train a model. bool canTrain() const; - //! Get the full training set data mask, i.e. all rows which aren't missing - //! the dependent variable. - core::CPackedBitVector allTrainingRowsMask() const; + //! Get the mean number of training examples which are used in each fold. + double meanNumberTrainingRowsPerFold() const; //! Compute the \p percentile percentile gain per split and the sum of row //! curvatures per internal node of \p forest. 
diff --git a/include/maths/CDataFrameAnalysisInstrumentationInterface.h b/include/maths/CDataFrameAnalysisInstrumentationInterface.h index f6b35916b0..bd2685d24a 100644 --- a/include/maths/CDataFrameAnalysisInstrumentationInterface.h +++ b/include/maths/CDataFrameAnalysisInstrumentationInterface.h @@ -116,7 +116,7 @@ class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationInterface SRegularization s_Regularization; double s_DownsampleFactor{-1.0}; std::size_t s_NumFolds{0}; - double s_TrainFractionPerFold{0.0}; + double s_NumTrainingRows{0}; std::size_t s_MaxTrees{0}; double s_FeatureBagFraction{-1.0}; double s_EtaGrowthRatePerTree{-1.0}; diff --git a/include/maths/CDataFramePredictiveModel.h b/include/maths/CDataFramePredictiveModel.h index 57ac26cb31..df22fe6d0a 100644 --- a/include/maths/CDataFramePredictiveModel.h +++ b/include/maths/CDataFramePredictiveModel.h @@ -61,6 +61,12 @@ class MATHS_EXPORT CDataFramePredictiveModel { //! \warning Will return a nullptr if a trained model isn't available. virtual CTreeShapFeatureImportance* shap() const = 0; + //! Get the number of rows used to train the model. + virtual std::size_t numberTrainingRows() const = 0; + + //! Get the fraction of data per fold used for training when tuning hyperparameters. + virtual double trainFractionPerFold() const = 0; + //! Get the column containing the dependent variable. virtual std::size_t columnHoldingDependentVariable() const = 0; diff --git a/include/maths/CLowess.h b/include/maths/CLowess.h index b79b1a3c11..07cf5de50c 100644 --- a/include/maths/CLowess.h +++ b/include/maths/CLowess.h @@ -47,7 +47,7 @@ class CLowess { //! \note Defined as (0,0) if no data have been fit. TDoubleDoublePr minimum() const; - //! \name Test Functions + //! \name Test Only //@{ //! Get an estimate of residual variance at the observed values. //! diff --git a/lib/api/CDataFrameAnalysisInstrumentation.cc b/lib/api/CDataFrameAnalysisInstrumentation.cc index 5d059cea08..9beb495ee3 100644 --- a/lib/api/CDataFrameAnalysisInstrumentation.cc +++ b/lib/api/CDataFrameAnalysisInstrumentation.cc @@ -483,10 +483,11 @@ void CDataFrameTrainBoostedTreeInstrumentation::writeHyperparameters(rapidjson:: rapidjson::Value(static_cast(this->m_Hyperparameters.s_NumFolds)) .Move(), parentObject); - writer->addMember( - CDataFrameTrainBoostedTreeRunner::TRAIN_FRACTION_PER_FOLD, - rapidjson::Value(this->m_Hyperparameters.s_TrainFractionPerFold).Move(), - parentObject); + // TODO enable with Java changes. 
+ //writer->addMember( + // CDataFrameTrainBoostedTreeRunner::TRAIN_FRACTION_PER_FOLD, + // rapidjson::Value(this->m_Hyperparameters.s_TrainFractionPerFold).Move(), + // parentObject); writer->addMember( CDataFrameTrainBoostedTreeRunner::MAX_TREES, rapidjson::Value(static_cast(this->m_Hyperparameters.s_MaxTrees)) diff --git a/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc b/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc index bdf46cb515..8831ab5d7a 100644 --- a/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc +++ b/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc @@ -322,12 +322,14 @@ CDataFrameTrainBoostedTreeClassifierRunner::inferenceModelDefinition( CDataFrameAnalysisRunner::TOptionalInferenceModelMetadata CDataFrameTrainBoostedTreeClassifierRunner::inferenceModelMetadata() const { - const auto& featureImportance = this->boostedTree().shap(); - if (featureImportance) { + auto* featureImportance = this->boostedTree().shap(); + if (featureImportance != nullptr) { m_InferenceModelMetadata.featureImportanceBaseline(featureImportance->baseline()); } m_InferenceModelMetadata.hyperparameterImportance( this->boostedTree().hyperparameterImportance()); + m_InferenceModelMetadata.numberTrainingRows(this->boostedTree().numberTrainingRows()); + m_InferenceModelMetadata.trainFractionPerFold(this->boostedTree().trainFractionPerFold()); return m_InferenceModelMetadata; } diff --git a/lib/api/CDataFrameTrainBoostedTreeRegressionRunner.cc b/lib/api/CDataFrameTrainBoostedTreeRegressionRunner.cc index 04613276b5..37e563bddf 100644 --- a/lib/api/CDataFrameTrainBoostedTreeRegressionRunner.cc +++ b/lib/api/CDataFrameTrainBoostedTreeRegressionRunner.cc @@ -156,12 +156,13 @@ CDataFrameTrainBoostedTreeRegressionRunner::inferenceModelDefinition( CDataFrameAnalysisRunner::TOptionalInferenceModelMetadata CDataFrameTrainBoostedTreeRegressionRunner::inferenceModelMetadata() const { - const auto& featureImportance = this->boostedTree().shap(); - if (featureImportance) { + auto* featureImportance = this->boostedTree().shap(); + if (featureImportance != nullptr) { m_InferenceModelMetadata.featureImportanceBaseline(featureImportance->baseline()); } m_InferenceModelMetadata.hyperparameterImportance( this->boostedTree().hyperparameterImportance()); + m_InferenceModelMetadata.trainFractionPerFold(this->boostedTree().trainFractionPerFold()); return m_InferenceModelMetadata; } diff --git a/lib/api/CInferenceModelMetadata.cc b/lib/api/CInferenceModelMetadata.cc index c7a8af2a50..2d0948445e 100644 --- a/lib/api/CInferenceModelMetadata.cc +++ b/lib/api/CInferenceModelMetadata.cc @@ -19,6 +19,7 @@ void CInferenceModelMetadata::write(TRapidJsonWriter& writer) const { this->writeTotalFeatureImportance(writer); this->writeFeatureImportanceBaseline(writer); this->writeHyperparameterImportance(writer); + this->writeTrainParameters(writer); } void CInferenceModelMetadata::writeTotalFeatureImportance(TRapidJsonWriter& writer) const { @@ -171,6 +172,20 @@ void CInferenceModelMetadata::writeHyperparameterImportance(TRapidJsonWriter& wr writer.EndArray(); } +void CInferenceModelMetadata::writeTrainParameters(TRapidJsonWriter& writer) const { + // TODO enable with Java changes. + // Only write out if it has been set. 
+ //if (m_TrainingFractionPerFold > 0.0) { + // writer.Key(JSON_TRAIN_PARAMETERS_TAG); + // writer.StartObject(); + // writer.Key(JSON_NUM_TRAINING_ROWS_TAG); + // writer.Uint64(m_NumberRowsUsedForTrain); + // writer.Key(CDataFrameTrainBoostedTreeRunner::TRAIN_FRACTION_PER_FOLD); + // writer.Double(m_TrainingFractionPerFold); + // writer.EndObject(); + //} +} + const std::string& CInferenceModelMetadata::typeString() { return JSON_MODEL_METADATA_TAG; } @@ -260,6 +275,14 @@ void CInferenceModelMetadata::hyperparameterImportance( }); } +void CInferenceModelMetadata::numberTrainingRows(std::size_t numberRows) { + m_NumberTrainingRows = numberRows; +} + +void CInferenceModelMetadata::trainFractionPerFold(double fraction) { + m_TrainFractionPerFold = fraction; +} + // clang-format off const std::string CInferenceModelMetadata::JSON_ABSOLUTE_IMPORTANCE_TAG{"absolute_importance"}; const std::string CInferenceModelMetadata::JSON_BASELINE_TAG{"baseline"}; @@ -276,8 +299,10 @@ const std::string CInferenceModelMetadata::JSON_MAX_TAG{"max"}; const std::string CInferenceModelMetadata::JSON_MEAN_MAGNITUDE_TAG{"mean_magnitude"}; const std::string CInferenceModelMetadata::JSON_MIN_TAG{"min"}; const std::string CInferenceModelMetadata::JSON_MODEL_METADATA_TAG{"model_metadata"}; +const std::string CInferenceModelMetadata::JSON_NUM_TRAINING_ROWS_TAG{"num_training_rows"}; const std::string CInferenceModelMetadata::JSON_RELATIVE_IMPORTANCE_TAG{"relative_importance"}; const std::string CInferenceModelMetadata::JSON_TOTAL_FEATURE_IMPORTANCE_TAG{"total_feature_importance"}; +const std::string CInferenceModelMetadata::JSON_TRAIN_PARAMETERS_TAG{"train_parameters"}; // clang-format on } } diff --git a/lib/maths/CBoostedTree.cc b/lib/maths/CBoostedTree.cc index 2af5a5ab07..aa67fad1d1 100644 --- a/lib/maths/CBoostedTree.cc +++ b/lib/maths/CBoostedTree.cc @@ -167,6 +167,14 @@ CBoostedTree::THyperparameterImportanceVec CBoostedTree::hyperparameterImportanc return m_Impl->hyperparameterImportance(); } +std::size_t CBoostedTree::numberTrainingRows() const { + return static_cast(m_Impl->allTrainingRowsMask().manhattan()); +} + +double CBoostedTree::trainFractionPerFold() const { + return m_Impl->trainFractionPerFold(); +} + std::size_t CBoostedTree::columnHoldingDependentVariable() const { return m_Impl->columnHoldingDependentVariable(); } diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 7cc581cd0b..921d1df5cf 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -262,7 +262,7 @@ void CBoostedTreeImpl::train(core::CDataFrame& frame, this->restoreBestHyperparameters(); this->scaleRegularizers(allTrainingRowsMask.manhattan() / - m_TrainingRowMasks[0].manhattan()); + this->meanNumberTrainingRowsPerFold()); this->startProgressMonitoringFinalTrain(); // reinitialize random number generator for reproducible results // TODO #1866 introduce accept randomize_seed configuration parameter @@ -404,8 +404,12 @@ bool CBoostedTreeImpl::canTrain() const { m_FeatureSampleProbabilities.end(), 0.0) > 0.0; } -core::CPackedBitVector CBoostedTreeImpl::allTrainingRowsMask() const { - return ~m_MissingFeatureRowMasks[m_DependentVariable]; +double CBoostedTreeImpl::meanNumberTrainingRowsPerFold() const { + TMeanAccumulator result; + for (const auto& mask : m_TrainingRowMasks) { + result.add(mask.manhattan()); + } + return CBasicStatistics::mean(result); } CBoostedTreeImpl::TDoubleDoublePr @@ -1530,7 +1534,8 @@ void CBoostedTreeImpl::recordHyperparameters() { 
m_Instrumentation->hyperparameters().s_ClassAssignmentObjective = m_ClassAssignmentObjective; m_Instrumentation->hyperparameters().s_DownsampleFactor = m_DownsampleFactor; m_Instrumentation->hyperparameters().s_NumFolds = m_NumberFolds; - m_Instrumentation->hyperparameters().s_TrainFractionPerFold = m_TrainFractionPerFold; + m_Instrumentation->hyperparameters().s_NumTrainingRows = + this->meanNumberTrainingRowsPerFold(); m_Instrumentation->hyperparameters().s_MaxTrees = m_MaximumNumberTrees; m_Instrumentation->hyperparameters().s_FeatureBagFraction = m_FeatureBagFraction; m_Instrumentation->hyperparameters().s_EtaGrowthRatePerTree = m_EtaGrowthRatePerTree; @@ -2128,6 +2133,14 @@ const CBoostedTreeImpl::TVector& CBoostedTreeImpl::classificationWeights() const return m_ClassificationWeights; } +double CBoostedTreeImpl::trainFractionPerFold() const { + return m_TrainFractionPerFold; +} + +core::CPackedBitVector CBoostedTreeImpl::allTrainingRowsMask() const { + return ~m_MissingFeatureRowMasks[m_DependentVariable]; +} + const double CBoostedTreeImpl::MINIMUM_RELATIVE_GAIN_PER_SPLIT{1e-7}; } } From 06460e666852a71b8dd27b581b378646fdc9b181 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Fri, 9 Jul 2021 11:59:19 +0100 Subject: [PATCH 32/35] Handle case we don't need to sample for last fold --- include/maths/CLowessDetail.h | 4 ++-- lib/maths/CDataFrameUtils.cc | 9 +++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/include/maths/CLowessDetail.h b/include/maths/CLowessDetail.h index 80d914126a..30cd67d0ff 100644 --- a/include/maths/CLowessDetail.h +++ b/include/maths/CLowessDetail.h @@ -47,12 +47,12 @@ void CLowess::fit(TDoubleDoublePrVec data, std::size_t numberFolds) { // // f(x | p^*) = poly(x | p^*(x)) // - // p^*(x) = argmin_p{ sum_i{ w_i (Y_i - poly(X_i | p))^2 } } (2) + // p^*(x) = argmin_p{ sum_i{ w_i (Y_i - poly(X_i | p))^2 } } (1) // // where w = exp(-k (x - X_i)), (X, Y) are the data to fit and p is the vector // of parameters for the polynomial function poly(. | p), i.e. the coefficients // p_0 + p_1 x + p_2 x^2 ... (which are determined by minimizing the weighted - // least square prediction errors as in (2)). + // least square prediction errors as in (1)). 
// // We determine k by solving // diff --git a/lib/maths/CDataFrameUtils.cc b/lib/maths/CDataFrameUtils.cc index dd074eacc3..203ce3ef78 100644 --- a/lib/maths/CDataFrameUtils.cc +++ b/lib/maths/CDataFrameUtils.cc @@ -570,8 +570,13 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads, core::CPackedBitVector candidateTestingRowsMask{allTrainingRowsMask}; for (auto& testingRowMask : testingRowMasks) { - testingRowMask = sample(sampler, candidateTestingRowsMask); - candidateTestingRowsMask ^= testingRowMask; + if (static_cast(candidateTestingRowsMask.manhattan()) <= sampleSize) { + testingRowMask = std::move(candidateTestingRowsMask); + candidateTestingRowsMask = core::CPackedBitVector{testingRowMask.size(), false}; + } else { + testingRowMask = sample(sampler, candidateTestingRowsMask); + candidateTestingRowsMask ^= testingRowMask; + } if (excessSampler != nullptr) { testingRowMask |= sample(excessSampler, allTrainingRowsMask ^ testingRowMask); } From ad037ec87029264a23fabab82505798bc9cba05c Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Fri, 9 Jul 2021 15:02:35 +0100 Subject: [PATCH 33/35] Add an explanation of variance treatment in BO --- lib/maths/CBayesianOptimisation.cc | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/lib/maths/CBayesianOptimisation.cc b/lib/maths/CBayesianOptimisation.cc index 457249c03d..b4296d125d 100644 --- a/lib/maths/CBayesianOptimisation.cc +++ b/lib/maths/CBayesianOptimisation.cc @@ -616,6 +616,35 @@ CBayesianOptimisation::TVector CBayesianOptimisation::function() const { } double CBayesianOptimisation::meanErrorVariance() const { + + // So what are we doing here? When we supply function values we also supply their + // error variance. Typically these might be the mean test loss function across + // folds and their variance for a particular choice of hyperparameters. Sticking + // with this example, the variance allows us to estimate the error w.r.t. the + // true generalisation error due to finite sample size. We can think of the source + // of this variance as being due to two effects: one which shifts the loss values + // in each fold (this might be due to some folds simply having more hard examples) + // and another which permutes the order of loss values. A shift in the loss function + // is not something we wish to capture in the GP: it shouldn't materially affect + // where to choose points to test since any sensible optimisation strategy should + // only care about the difference in loss between points, which is unaffected by a + // shift. More formally, if we assume the shift and permutation errors are independent + // we have for losses l_i, mean loss per fold m_i and mean loss for a given set of + // hyperparameters m that the variance is + // + // sum_i{ (l_i - m)^2 } = sum_i{ (l_i - m_i + m_i - m)^2 } + // = sum_i{ (l_i - m_i)^2 } + sum_i{ (m_i - m)^2 } + // = "permutation variance" + "shift variance" (1) + // + // with the cross-term expected to be small by independence. (Note, the independence + // assumption is reasonable if one assumes that the shift is due to mismatch in hard + // examples since the we choose folds independently at random.) We can estimate the + // shift variance by looking at mean loss over all distinct hyperparameter settings + // and we assume it is supplied as the parameter m_ExplainedErrorVariance. It should + // also be smaller than the variance by construction although for numerical stability + // we prevent the difference becoming too small. 
As discussed, here we wish return + // the permutation variance which we get by rearranging (1). + TMeanAccumulator variance; variance.add(m_ErrorVariances); return CBasicStatistics::mean(variance) - From e58ed73a77373d6db59663cfa73f5680a8f32280 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Fri, 9 Jul 2021 15:27:48 +0100 Subject: [PATCH 34/35] Comments --- lib/maths/unittest/CLowessTest.cc | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/lib/maths/unittest/CLowessTest.cc b/lib/maths/unittest/CLowessTest.cc index 57a2a4acbd..7c1343a918 100644 --- a/lib/maths/unittest/CLowessTest.cc +++ b/lib/maths/unittest/CLowessTest.cc @@ -31,14 +31,6 @@ BOOST_AUTO_TEST_CASE(testInvariants) { // Test invariants are satisfied on random input. - // We check: - // 1. Minimum is a local minimum. - // 2. The sublevel set contains the minimum. - // 3. The minimum is within 10% of the training data interval. - // 4. The ends of the sublevel set is within 10% of the training data interval. - // 5. The variance is greater than or equal to the variance of the residuals at - // the training data. - test::CRandomNumbers rng; std::size_t numberFolds{5}; @@ -58,6 +50,8 @@ BOOST_AUTO_TEST_CASE(testInvariants) { return scale[0] * (x - offset[0]) * (x - offset[0]) / 100.0; }}; + // We check... + for (std::size_t i = 0; i < 100; ++i) { for (const auto& trend : trends) { @@ -78,6 +72,7 @@ BOOST_AUTO_TEST_CASE(testInvariants) { double xeb; std::tie(xea, xeb) = lowess.extrapolationInterval(); + // 1. The minimum is a local minimum. double xmin; double fmin; std::tie(xmin, fmin) = lowess.minimum(); @@ -85,6 +80,12 @@ BOOST_AUTO_TEST_CASE(testInvariants) { BOOST_TEST_REQUIRE(fmin <= lowess.predict(std::max(xmin - 0.1, xea))); BOOST_TEST_REQUIRE(fmin <= lowess.predict(std::min(xmin + 0.1, xeb))); + // 2. The minimum is within the maximum extrapolation interval. + BOOST_TEST_REQUIRE(xmin >= xea); + BOOST_TEST_REQUIRE(xmin <= xeb); + + // 3. The variance is greater than the variance of the residual at the + // training data. TMeanVarAccumulator residualMoments; for (const auto& x : data) { residualMoments.add(x.second - lowess.predict(x.first)); From e0a61bfe43abdac2c4a82e97e29301c2a412398c Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Mon, 12 Jul 2021 11:43:33 +0100 Subject: [PATCH 35/35] Move fraction of training data into its own section in instrumentation --- .../api/CDataFrameAnalysisInstrumentation.h | 26 ++++++++++-------- ...ataFrameAnalysisInstrumentationInterface.h | 19 ++++++------- lib/api/CDataFrameAnalysisInstrumentation.cc | 27 ++++++++++++++----- lib/maths/CBoostedTreeImpl.cc | 3 +-- 4 files changed, 47 insertions(+), 28 deletions(-) diff --git a/include/api/CDataFrameAnalysisInstrumentation.h b/include/api/CDataFrameAnalysisInstrumentation.h index e5c7fefd22..e0247c5e80 100644 --- a/include/api/CDataFrameAnalysisInstrumentation.h +++ b/include/api/CDataFrameAnalysisInstrumentation.h @@ -184,17 +184,19 @@ class API_EXPORT CDataFrameTrainBoostedTreeInstrumentation final CDataFrameTrainBoostedTreeInstrumentation(const std::string& jobId, std::size_t memoryLimit) : CDataFrameAnalysisInstrumentation(jobId, memoryLimit) {} - //! Supervised learning job \p type, can be E_Regression or E_Classification. + //! Set the supervised learning job \p type, can be E_Regression or E_Classification. void type(EStatsType type) override; - //! Current \p iteration number. + //! Set the current \p iteration number. void iteration(std::size_t iteration) override; - //! Run time of the iteration. 
+ //! Set the run time of the current iteration. void iterationTime(std::uint64_t delta) override; - //! Type of the validation loss result, e.g. "mse". + //! Set the type of the validation loss result, e.g. "mse". void lossType(const std::string& lossType) override; - //! List of \p lossValues of validation error for the given \p fold. + //! Set the validation loss values for \p fold for each forest size to \p lossValues. void lossValues(std::size_t fold, TDoubleVec&& lossValues) override; - //! \return Structure contains hyperparameters. + //! Set the fraction of data used for training per fold. + void trainingFractionPerFold(double fraction) override; + //! \return A writable object containing the training hyperparameters. SHyperparameters& hyperparameters() override { return m_Hyperparameters; } protected: @@ -206,19 +208,21 @@ class API_EXPORT CDataFrameTrainBoostedTreeInstrumentation final private: void writeAnalysisStats(std::int64_t timestamp) override; + void writeMetaData(rapidjson::Value& parentObject); void writeHyperparameters(rapidjson::Value& parentObject); void writeValidationLoss(rapidjson::Value& parentObject); void writeTimingStats(rapidjson::Value& parentObject); void reset(); private: - EStatsType m_Type = E_Regression; - std::size_t m_Iteration = 0; - std::uint64_t m_IterationTime = 0; - std::uint64_t m_ElapsedTime = 0; - bool m_AnalysisStatsInitialized = false; + EStatsType m_Type{E_Regression}; + std::size_t m_Iteration{0}; + std::uint64_t m_IterationTime{0}; + std::uint64_t m_ElapsedTime{0}; + bool m_AnalysisStatsInitialized{false}; std::string m_LossType; TLossVec m_LossValues; + double m_TrainingFractionPerFold{0.0}; SHyperparameters m_Hyperparameters; }; } diff --git a/include/maths/CDataFrameAnalysisInstrumentationInterface.h b/include/maths/CDataFrameAnalysisInstrumentationInterface.h index bd2685d24a..294648237f 100644 --- a/include/maths/CDataFrameAnalysisInstrumentationInterface.h +++ b/include/maths/CDataFrameAnalysisInstrumentationInterface.h @@ -33,7 +33,7 @@ class MATHS_EXPORT CDataFrameAnalysisInstrumentationInterface { //! Adds \p delta to the memory usage statistics. virtual void updateMemoryUsage(std::int64_t delta) = 0; - //! Start progress monitoring for \p phase. + //! Start progress monitoring of \p task. //! //! \note This resets the current progress to zero. virtual void startNewProgressMonitoredTask(const std::string& task) = 0; @@ -116,7 +116,6 @@ class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationInterface SRegularization s_Regularization; double s_DownsampleFactor{-1.0}; std::size_t s_NumFolds{0}; - double s_NumTrainingRows{0}; std::size_t s_MaxTrees{0}; double s_FeatureBagFraction{-1.0}; double s_EtaGrowthRatePerTree{-1.0}; @@ -127,18 +126,19 @@ class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationInterface using TDoubleVec = std::vector; public: - virtual ~CDataFrameTrainBoostedTreeInstrumentationInterface() = default; - //! Supervised learning job \p type, can be E_Regression or E_Classification. + //! Set the supervised learning job \p type, can be E_Regression or E_Classification. virtual void type(EStatsType type) = 0; - //! Current \p iteration number. + //! Set the current \p iteration number. virtual void iteration(std::size_t iteration) = 0; - //! Run time of the iteration. + //! Set the run time of the current iteration. virtual void iterationTime(std::uint64_t delta) = 0; - //! Type of the validation loss result, e.g. "mse". + //! Set the type of the validation loss result, e.g. "mse". 
virtual void lossType(const std::string& lossType) = 0; - //! List of \p lossValues of validation error for the given \p fold. + //! Set the validation loss values for \p fold for each forest size to \p lossValues. virtual void lossValues(std::size_t fold, TDoubleVec&& lossValues) = 0; - //! \return Structure contains hyperparameters. + //! Set the fraction of data used for training per fold. + virtual void trainingFractionPerFold(double fraction) = 0; + //! \return A writable object containing the training hyperparameters. virtual SHyperparameters& hyperparameters() = 0; }; @@ -168,6 +168,7 @@ class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationStub void iterationTime(std::uint64_t /* delta */) override {} void lossType(const std::string& /* lossType */) override {} void lossValues(std::size_t /* fold */, TDoubleVec&& /* lossValues */) override {} + void trainingFractionPerFold(double /* fraction */) override {} SHyperparameters& hyperparameters() override { return m_Hyperparameters; } private: diff --git a/lib/api/CDataFrameAnalysisInstrumentation.cc b/lib/api/CDataFrameAnalysisInstrumentation.cc index 9beb495ee3..c552ad4ebb 100644 --- a/lib/api/CDataFrameAnalysisInstrumentation.cc +++ b/lib/api/CDataFrameAnalysisInstrumentation.cc @@ -46,6 +46,7 @@ const std::string MEMORY_STATUS_HARD_LIMIT_TAG{"hard_limit"}; const std::string MEMORY_STATUS_OK_TAG{"ok"}; const std::string MEMORY_STATUS_TAG{"status"}; const std::string MEMORY_TYPE_TAG{"analytics_memory_usage"}; +const std::string META_DATA_TAG{"meta_data"}; const std::string OUTLIER_DETECTION_STATS{"outlier_detection_stats"}; const std::string PARAMETERS_TAG{"parameters"}; const std::string PEAK_MEMORY_USAGE_TAG{"peak_usage_bytes"}; @@ -387,7 +388,11 @@ void CDataFrameTrainBoostedTreeInstrumentation::lossType(const std::string& loss void CDataFrameTrainBoostedTreeInstrumentation::lossValues(std::size_t fold, TDoubleVec&& lossValues) { - m_LossValues.emplace_back(std::move(fold), std::move(lossValues)); + m_LossValues.emplace_back(fold, std::move(lossValues)); +} + +void CDataFrameTrainBoostedTreeInstrumentation::trainingFractionPerFold(double fraction) { + m_TrainingFractionPerFold = fraction; } void CDataFrameTrainBoostedTreeInstrumentation::writeAnalysisStats(std::int64_t timestamp) { @@ -424,6 +429,12 @@ void CDataFrameTrainBoostedTreeInstrumentation::writeAnalysisStats(std::int64_t writer->Key(TIMING_STATS_TAG); writer->write(timingStatsObject); + // TODO enable with Java changes. + //rapidjson::Value metaDataObject{writer->makeObject()}; + //this->writeMetaData(metaDataObject); + //writer->Key(META_DATA_TAG); + //writer->write(metaDataObject); + writer->EndObject(); } this->reset(); @@ -434,6 +445,14 @@ void CDataFrameTrainBoostedTreeInstrumentation::reset() { m_LossValues.clear(); } +void CDataFrameTrainBoostedTreeInstrumentation::writeMetaData(rapidjson::Value& parentObject) { + auto* writer = this->writer(); + if (writer != nullptr) { + writer->addMember(CDataFrameTrainBoostedTreeRunner::TRAIN_FRACTION_PER_FOLD, + rapidjson::Value(m_TrainingFractionPerFold).Move(), parentObject); + } +} + void CDataFrameTrainBoostedTreeInstrumentation::writeHyperparameters(rapidjson::Value& parentObject) { auto* writer = this->writer(); @@ -483,11 +502,6 @@ void CDataFrameTrainBoostedTreeInstrumentation::writeHyperparameters(rapidjson:: rapidjson::Value(static_cast(this->m_Hyperparameters.s_NumFolds)) .Move(), parentObject); - // TODO enable with Java changes. 
- //writer->addMember( - // CDataFrameTrainBoostedTreeRunner::TRAIN_FRACTION_PER_FOLD, - // rapidjson::Value(this->m_Hyperparameters.s_TrainFractionPerFold).Move(), - // parentObject); writer->addMember( CDataFrameTrainBoostedTreeRunner::MAX_TREES, rapidjson::Value(static_cast(this->m_Hyperparameters.s_MaxTrees)) @@ -539,6 +553,7 @@ void CDataFrameTrainBoostedTreeInstrumentation::writeValidationLoss(rapidjson::V writer->addMember(VALIDATION_FOLD_VALUES_TAG, lossValuesArray, parentObject); } } + void CDataFrameTrainBoostedTreeInstrumentation::writeTimingStats(rapidjson::Value& parentObject) { auto* writer = this->writer(); if (writer != nullptr) { diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 921d1df5cf..c53594ff91 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -1530,12 +1530,11 @@ std::size_t CBoostedTreeImpl::maximumTreeSize(std::size_t numberRows) const { } void CBoostedTreeImpl::recordHyperparameters() { + m_Instrumentation->trainingFractionPerFold(m_TrainFractionPerFold); m_Instrumentation->hyperparameters().s_Eta = m_Eta; m_Instrumentation->hyperparameters().s_ClassAssignmentObjective = m_ClassAssignmentObjective; m_Instrumentation->hyperparameters().s_DownsampleFactor = m_DownsampleFactor; m_Instrumentation->hyperparameters().s_NumFolds = m_NumberFolds; - m_Instrumentation->hyperparameters().s_NumTrainingRows = - this->meanNumberTrainingRowsPerFold(); m_Instrumentation->hyperparameters().s_MaxTrees = m_MaximumNumberTrees; m_Instrumentation->hyperparameters().s_FeatureBagFraction = m_FeatureBagFraction; m_Instrumentation->hyperparameters().s_EtaGrowthRatePerTree = m_EtaGrowthRatePerTree;
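Once the Java-side changes referenced by the TODOs land, the analysis stats document is expected to gain a small meta_data object (META_DATA_TAG) alongside the existing hyperparameters, validation loss and timing stats sections, carrying the train fraction as a property of the run rather than as a tuned hyperparameter. Below is a minimal rapidjson sketch of that intended shape; it assumes the runner's TRAIN_FRACTION_PER_FOLD constant maps to the JSON key "train_fraction_per_fold" and uses 0.85 purely as an example value, so it is not the ml-cpp writer code.

    // Sketch of the expected meta_data output only; not the ml-cpp writer code.
    // The key "train_fraction_per_fold" and the value 0.85 are assumptions.
    #include <iostream>

    #include <rapidjson/stringbuffer.h>
    #include <rapidjson/writer.h>

    int main() {
        rapidjson::StringBuffer buffer;
        rapidjson::Writer<rapidjson::StringBuffer> writer{buffer};

        writer.StartObject();
        writer.Key("meta_data"); // META_DATA_TAG
        writer.StartObject();
        writer.Key("train_fraction_per_fold"); // assumed value of TRAIN_FRACTION_PER_FOLD
        writer.Double(0.85);                   // m_TrainingFractionPerFold
        writer.EndObject();
        writer.EndObject();

        std::cout << buffer.GetString() << '\n'; // {"meta_data":{"train_fraction_per_fold":0.85}}
        return 0;
    }

Keeping the fraction out of the hyperparameters object matches the commit's intent of reporting it in its own section of the instrumentation output.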