[ML] Improve hyperparameter tuning performance #1941

Merged Jul 12, 2021
37 commits
2adce69
WIP
tveasey Feb 22, 2021
4f5d558
Merge branch 'master' into select-data-size
tveasey Jun 24, 2021
b2af714
Restrict the maximum number of rows used during hyperparameter tuning…
tveasey Jun 29, 2021
26070c4
Allow one to disable fine tuning entirely for fast mode
tveasey Jul 1, 2021
81d3ffd
Uncouple training fraction parameter from the number of folds
tveasey Jul 1, 2021
04248ee
Adjust the validation loss variance estimate to remove effects of sam…
tveasey Jul 2, 2021
f72dd4c
Formatting
tveasey Jul 2, 2021
fc0a3bc
Docs
tveasey Jul 2, 2021
78f6e37
Avoid infinite loop
tveasey Jul 2, 2021
caa7c82
Correct handling of eta growth rate per tree
tveasey Jul 2, 2021
b46c76e
Correct edge case test
tveasey Jul 2, 2021
7318193
Test threshold
tveasey Jul 2, 2021
e55ea41
Handle the case we can't sample train/test folds without replacement …
tveasey Jul 5, 2021
dd002c3
Handle edge case creating train/test splits with very little data
tveasey Jul 7, 2021
37d4690
Slightly relax tests to pass on all platforms
tveasey Jul 7, 2021
28f22f4
Review comments
tveasey Jul 8, 2021
4f3e3f9
Review comments
tveasey Jul 8, 2021
5d4edba
Explain p.
tveasey Jul 8, 2021
5748ce1
Explain poly
tveasey Jul 8, 2021
c252b24
Add explanation of mechanics of fit
tveasey Jul 8, 2021
9a7feea
Make k dependency clear
tveasey Jul 8, 2021
5b1a018
Document test interface
tveasey Jul 8, 2021
93d3264
Names, explanation and coding style guideline fixes
tveasey Jul 8, 2021
ae45379
Explicit capture
tveasey Jul 8, 2021
efdadc0
Typo
tveasey Jul 8, 2021
59c9add
Capture by reference
tveasey Jul 8, 2021
74c27f9
Rename
tveasey Jul 8, 2021
ca1d910
Update comment to reflect the current behaviour
tveasey Jul 8, 2021
40eae57
Name variable for readability
tveasey Jul 8, 2021
92de10f
Typedef
tveasey Jul 8, 2021
d0be22f
Define small constant used to prefer fast training if test error is s…
tveasey Jul 8, 2021
a380b20
We should record the fraction and number of training rows in the mode…
tveasey Jul 9, 2021
06460e6
Handle case we don't need to sample for last fold
tveasey Jul 9, 2021
8e98de0
Merge branch 'master' into select-data-size
tveasey Jul 9, 2021
ad037ec
Add an explanation of variance treatment in BO
tveasey Jul 9, 2021
e58ed73
Comments
tveasey Jul 9, 2021
e0a61bf
Move fraction of training data into its own section in instrumentation
tveasey Jul 12, 2021
7 changes: 7 additions & 0 deletions docs/CHANGELOG.asciidoc
@@ -47,6 +47,13 @@
* Ensure bucket `event_count` is calculated for jobs with 1 second bucket spans.
(See {ml-pull}1908[#1908].)

== {es} version 7.15.0

=== Enhancements

* Speed up training of regression and classification models on very large data sets.
(See {ml-pull}1941[#1941].)

== {es} version 7.14.0

=== Enhancements
1 change: 1 addition & 0 deletions include/api/CDataFrameTrainBoostedTreeRunner.h
@@ -48,6 +48,7 @@ class API_EXPORT CDataFrameTrainBoostedTreeRunner : public CDataFrameAnalysisRun
static const std::string MAX_TREES;
static const std::string FEATURE_BAG_FRACTION;
static const std::string NUM_FOLDS;
static const std::string TRAIN_FRACTION_PER_FOLD;
static const std::string STOP_CROSS_VALIDATION_EARLY;
static const std::string MAX_OPTIMIZATION_ROUNDS_PER_HYPERPARAMETER;
static const std::string BAYESIAN_OPTIMISATION_RESTARTS;
15 changes: 12 additions & 3 deletions include/api/CInferenceModelMetadata.h
@@ -40,8 +40,10 @@ class API_EXPORT CInferenceModelMetadata {
static const std::string JSON_MEAN_MAGNITUDE_TAG;
static const std::string JSON_MIN_TAG;
static const std::string JSON_MODEL_METADATA_TAG;
static const std::string JSON_NUM_TRAINING_ROWS_TAG;
static const std::string JSON_RELATIVE_IMPORTANCE_TAG;
static const std::string JSON_TOTAL_FEATURE_IMPORTANCE_TAG;
static const std::string JSON_TRAIN_PARAMETERS_TAG;

public:
using TVector = maths::CDenseVector<double>;
@@ -64,6 +66,10 @@ class API_EXPORT CInferenceModelMetadata {
//! to the baseline value).
void featureImportanceBaseline(TVector&& baseline);
void hyperparameterImportance(const maths::CBoostedTree::THyperparameterImportanceVec& hyperparameterImportance);
//! Set the number of rows used to train the model.
void numberTrainingRows(std::size_t numberRows);
//! Set the fraction of data per fold used for training when tuning hyperparameters.
void trainFractionPerFold(double fraction);

private:
struct SHyperparameterImportance {
@@ -86,20 +92,23 @@

private:
void writeTotalFeatureImportance(TRapidJsonWriter& writer) const;
void writeHyperparameterImportance(TRapidJsonWriter& writer) const;
void writeFeatureImportanceBaseline(TRapidJsonWriter& writer) const;
void writeHyperparameterImportance(TRapidJsonWriter& writer) const;
void writeTrainParameters(TRapidJsonWriter& writer) const;

private:
TSizeMeanAccumulatorUMap m_TotalShapValuesMean;
TSizeMinMaxAccumulatorUMap m_TotalShapValuesMinMax;
TOptionalVector m_ShapBaseline;
TStrVec m_ColumnNames;
TStrVec m_ClassValues;
TPredictionFieldTypeResolverWriter m_PredictionFieldTypeResolverWriter =
TPredictionFieldTypeResolverWriter m_PredictionFieldTypeResolverWriter{
[](const std::string& value, TRapidJsonWriter& writer) {
writer.String(value);
};
}};
THyperparametersVec m_HyperparameterImportance;
std::size_t m_NumberTrainingRows{0};
double m_TrainFractionPerFold{0.0};
};
}
}
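The two new setters feed the train parameters object which writeTrainParameters adds to the model metadata. A minimal sketch of the write side; the values are invented for illustration, and the JSON field names in the comment are guesses derived from the new JSON_*_TAG constants:

```cpp
#include <api/CInferenceModelMetadata.h>

// Sketch: populate the new training statistics on the metadata object.
void addTrainParameters(ml::api::CInferenceModelMetadata& metadata) {
    metadata.numberTrainingRows(117000); // rows used to train the final model
    metadata.trainFractionPerFold(0.75); // fraction of fold data used for train
}
// writeTrainParameters would then emit something shaped like
//   "train_parameters": {"num_training_rows": 117000, ...}
// where the exact field names are assumptions based on
// JSON_NUM_TRAINING_ROWS_TAG and JSON_TRAIN_PARAMETERS_TAG.
```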
9 changes: 7 additions & 2 deletions include/maths/CBayesianOptimisation.h
@@ -72,6 +72,10 @@ class MATHS_EXPORT CBayesianOptimisation {
//! variance in the error in \p fx w.r.t. the true value is \p vx.
void add(TVector x, double fx, double vx);

//! Set any portion of the variance of the function error which is explained
//! and so shouldn't be included in the kernel.
void explainedErrorVariance(double vx);

//! Get the bounding box (in the function domain) in which we're minimizing.
std::pair<TVector, TVector> boundingBox() const;

@@ -170,8 +174,9 @@
private:
CPRNG::CXorOShiro128Plus m_Rng;
std::size_t m_Restarts;
double m_RangeShift = 0.0;
double m_RangeScale = 1.0;
double m_RangeShift{0.0};
double m_RangeScale{1.0};
double m_ExplainedErrorVariance{0.0};
TVector m_MinBoundary;
TVector m_MaxBoundary;
TVectorDoublePrVec m_FunctionMeanValues;
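explainedErrorVariance lets the caller declare the part of the observed error variance which has a known cause, so the Gaussian process kernel only has to model the residual noise. A minimal sketch of the idea, not the library's implementation:

```cpp
#include <algorithm>

// The noise variance the kernel should see is the measured variance minus
// the explained portion, clamped at zero so an over-generous estimate of
// the explained variance can't produce a negative noise term.
double kernelNoiseVariance(double vx, double explainedErrorVariance) {
    return std::max(vx - explainedErrorVariance, 0.0);
}
```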
8 changes: 7 additions & 1 deletion include/maths/CBoostedTree.h
@@ -208,7 +208,7 @@ class MATHS_EXPORT CBoostedTree final : public CDataFramePredictiveModel {
class MATHS_EXPORT CVisitor : public CDataFrameCategoryEncoder::CVisitor,
public CBoostedTreeNode::CVisitor {
public:
virtual ~CVisitor() = default;
~CVisitor() override = default;
virtual void addTree() = 0;
virtual void addClassificationWeights(TDoubleVec weights) = 0;
virtual void addLossFunction(const TLossFunction& lossFunction) = 0;
@@ -236,6 +236,12 @@ class MATHS_EXPORT CBoostedTree final : public CDataFramePredictiveModel {
//! Get the vector of hyperparameter importances.
THyperparameterImportanceVec hyperparameterImportance() const;

//! Get the number of rows used to train the model.
std::size_t numberTrainingRows() const override;

//! Get the fraction of data per fold used for training when tuning hyperparameters.
double trainFractionPerFold() const override;

//! Get the column containing the dependent variable.
std::size_t columnHoldingDependentVariable() const override;

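These accessors are the read side of the new metadata fields above. A hedged sketch of the bridge from a trained model to the metadata writer; the surrounding train/build wiring is omitted:

```cpp
#include <api/CInferenceModelMetadata.h>
#include <maths/CBoostedTree.h>

// Copy the new training statistics from a trained model into the inference
// model metadata; all four methods used here appear in this PR.
void recordTrainParameters(const ml::maths::CBoostedTree& model,
                           ml::api::CInferenceModelMetadata& metadata) {
    metadata.numberTrainingRows(model.numberTrainingRows());
    metadata.trainFractionPerFold(model.trainFractionPerFold());
}
```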
30 changes: 17 additions & 13 deletions include/maths/CBoostedTreeFactory.h
@@ -80,6 +80,10 @@ class MATHS_EXPORT CBoostedTreeFactory final {
CBoostedTreeFactory& minimumFrequencyToOneHotEncode(double frequency);
//! Set the number of folds to use for estimating the generalisation error.
CBoostedTreeFactory& numberFolds(std::size_t numberFolds);
//! Set the fraction of data per fold to use for training.
CBoostedTreeFactory& trainFractionPerFold(double fraction);
//! Set the maximum number of rows to use for training when tuning hyperparameters.
CBoostedTreeFactory& maximumNumberTrainRows(std::size_t rows);
//! Stratify the cross-validation we do for regression.
CBoostedTreeFactory& stratifyRegressionCrossValidation(bool stratify);
//! Stop cross-validation early if the test loss is not promising.
@@ -205,18 +209,17 @@
TDoubleDoublePrVec estimateTreeGainAndCurvature(core::CDataFrame& frame,
const TDoubleVec& percentiles) const;

//! Perform a line search for the test loss w.r.t. a single regularization
//! hyperparameter and apply Newton's method to find the minimum. The plan
//! is to find a value near where the model starts to overfit.
//! Perform a line search for the test loss w.r.t. a single hyperparameter.
//! At the end we fit a smooth curve through the test loss values (using
//! LOWESS regression) and use it to estimate where the true minimum occurs.
//!
//! \return The interval to search during the main hyperparameter optimisation
//! loop or null if this couldn't be found.
TOptionalVector testLossLineSearch(core::CDataFrame& frame,
const TApplyParameter& applyParameterStep,
double intervalLeftEnd,
double intervalRightEnd,
double returnedIntervalLeftEndOffset,
double returnedIntervalRightEndOffset,
const TAdjustTestLoss& adjustTestLoss = noopAdjustTestLoss) const;

//! Initialize the state for hyperparameter optimisation.
@@ -275,13 +278,14 @@
private:
TOptionalDouble m_MinimumFrequencyToOneHotEncode;
TOptionalSize m_BayesianOptimisationRestarts;
bool m_StratifyRegressionCrossValidation = true;
double m_InitialDownsampleRowsPerFeature = 200.0;
double m_GainPerNode1stPercentile = 0.0;
double m_GainPerNode50thPercentile = 0.0;
double m_GainPerNode90thPercentile = 0.0;
double m_TotalCurvaturePerNode1stPercentile = 0.0;
double m_TotalCurvaturePerNode90thPercentile = 0.0;
bool m_StratifyRegressionCrossValidation{true};
double m_InitialDownsampleRowsPerFeature{200.0};
std::size_t m_MaximumNumberOfTrainRows{500000};
double m_GainPerNode1stPercentile{0.0};
double m_GainPerNode50thPercentile{0.0};
double m_GainPerNode90thPercentile{0.0};
double m_TotalCurvaturePerNode1stPercentile{0.0};
double m_TotalCurvaturePerNode90thPercentile{0.0};
std::size_t m_NumberThreads;
TBoostedTreeImplUPtr m_TreeImpl;
TVector m_LogDownsampleFactorSearchInterval;
Expand All @@ -291,7 +295,7 @@ class MATHS_EXPORT CBoostedTreeFactory final {
TVector m_LogLeafWeightPenaltyMultiplierSearchInterval;
TVector m_SoftDepthLimitSearchInterval;
TVector m_LogEtaSearchInterval;
TTrainingStateCallback m_RecordTrainingState = noopRecordTrainingState;
TTrainingStateCallback m_RecordTrainingState{noopRecordTrainingState};
};
}
}
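The new factory methods slot into the existing fluent interface. A hypothetical wiring; constructFromParameters and the TLossFunctionUPtr argument are assumptions, not shown in this diff:

```cpp
#include <maths/CBoostedTreeFactory.h>

// Sketch of configuring the new knobs; the entry point is assumed.
auto makeTuningFactory(std::size_t numberThreads,
                       ml::maths::CBoostedTreeFactory::TLossFunctionUPtr loss) {
    auto factory = ml::maths::CBoostedTreeFactory::constructFromParameters(
        numberThreads, std::move(loss));
    factory.numberFolds(4)
        .trainFractionPerFold(0.75)      // new: train size decoupled from fold count
        .maximumNumberTrainRows(500000); // new: cap on rows used while tuning
    return factory;
}
```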
17 changes: 14 additions & 3 deletions include/maths/CBoostedTreeImpl.h
@@ -150,6 +150,13 @@ class MATHS_EXPORT CBoostedTreeImpl final {
//! \return The best hyperparameters for validation error found so far.
const CBoostedTreeHyperparameters& bestHyperparameters() const;

//! \return The fraction of data we use for train per fold when tuning hyperparameters.
double trainFractionPerFold() const;

//! \return The full training set data mask, i.e. all rows which aren't missing
//! the dependent variable.
core::CPackedBitVector allTrainingRowsMask() const;

//! \name Test Only
//@{
//! The name of the object holding the best hyperparameters in the state document.
@@ -203,9 +210,8 @@
//! Check if we can train a model.
bool canTrain() const;

//! Get the full training set data mask, i.e. all rows which aren't missing
//! the dependent variable.
core::CPackedBitVector allTrainingRowsMask() const;
//! Get the mean number of training examples which are used in each fold.
double meanNumberTrainingRowsPerFold() const;

//! Compute the \p percentile percentile gain per split and the sum of row
//! curvatures per internal node of \p forest.
@@ -290,6 +296,9 @@
//! Compute the mean of the loss function on the masked rows of \p frame.
double meanLoss(const core::CDataFrame& frame, const core::CPackedBitVector& rowMask) const;

//! Compute the overall variance of the error we see between folds.
double betweenFoldTestLossVariance() const;

//! Get the root node of \p tree.
static const CBoostedTreeNode& root(const TNodeVec& tree);

@@ -372,6 +381,7 @@
TOptionalDouble m_EtaOverride;
TOptionalDouble m_EtaGrowthRatePerTreeOverride;
TOptionalSize m_NumberFoldsOverride;
TOptionalSize m_TrainFractionPerFoldOverride;
TOptionalSize m_MaximumNumberTreesOverride;
TOptionalDouble m_FeatureBagFractionOverride;
TOptionalStrDoublePrVec m_ClassificationWeightsOverride;
@@ -381,6 +391,7 @@
double m_Eta = 0.1;
double m_EtaGrowthRatePerTree = 1.05;
std::size_t m_NumberFolds = 4;
double m_TrainFractionPerFold = 0.75;
std::size_t m_MaximumNumberTrees = 20;
std::size_t m_MaximumAttemptsToAddTree = 3;
std::size_t m_NumberSplitsPerFeature = 75;
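betweenFoldTestLossVariance pairs with the Bayesian optimisation change: the spread of test losses between folds is variance with a known cause, so under that reading it can be handed to explainedErrorVariance rather than inflating the kernel noise. A sketch of the computation (the member function works from internal fold statistics, not a plain vector):

```cpp
#include <vector>

// Unbiased sample variance of the per-fold mean test losses; illustrative only.
double betweenFoldTestLossVariance(const std::vector<double>& foldLosses) {
    if (foldLosses.size() < 2) {
        return 0.0;
    }
    double n = static_cast<double>(foldLosses.size());
    double mean = 0.0;
    for (double loss : foldLosses) {
        mean += loss;
    }
    mean /= n;
    double variance = 0.0;
    for (double loss : foldLosses) {
        variance += (loss - mean) * (loss - mean);
    }
    return variance / (n - 1.0);
}
```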
29 changes: 15 additions & 14 deletions include/maths/CDataFrameAnalysisInstrumentationInterface.h
@@ -103,25 +103,26 @@ class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationInterface
s_SoftTreeDepthLimit{softTreeDepthLimit}, s_SoftTreeDepthTolerance{softTreeDepthTolerance},
s_TreeSizePenaltyMultiplier{treeSizePenaltyMultiplier},
s_LeafWeightPenaltyMultiplier{leafWeightPenaltyMultiplier} {}
double s_DepthPenaltyMultiplier = -1.0;
double s_SoftTreeDepthLimit = -1.0;
double s_SoftTreeDepthTolerance = -1.0;
double s_TreeSizePenaltyMultiplier = -1.0;
double s_LeafWeightPenaltyMultiplier = -1.0;
double s_DepthPenaltyMultiplier{-1.0};
double s_SoftTreeDepthLimit{-1.0};
double s_SoftTreeDepthTolerance{-1.0};
double s_TreeSizePenaltyMultiplier{-1.0};
double s_LeafWeightPenaltyMultiplier{-1.0};
};
struct SHyperparameters {
double s_Eta = -1.0;
double s_Eta{-1.0};
CBoostedTree::EClassAssignmentObjective s_ClassAssignmentObjective =
CBoostedTree::E_MinimumRecall;
SRegularization s_Regularization;
double s_DownsampleFactor = -1.0;
std::size_t s_NumFolds = 0;
std::size_t s_MaxTrees = 0;
double s_FeatureBagFraction = -1.0;
double s_EtaGrowthRatePerTree = -1.0;
std::size_t s_MaxAttemptsToAddTree = 0;
std::size_t s_NumSplitsPerFeature = 0;
std::size_t s_MaxOptimizationRoundsPerHyperparameter = 0;
double s_DownsampleFactor{-1.0};
std::size_t s_NumFolds{0};
double s_NumTrainingRows{0};
std::size_t s_MaxTrees{0};
double s_FeatureBagFraction{-1.0};
double s_EtaGrowthRatePerTree{-1.0};
std::size_t s_MaxAttemptsToAddTree{0};
std::size_t s_NumSplitsPerFeature{0};
std::size_t s_MaxOptimizationRoundsPerHyperparameter{0};
};
using TDoubleVec = std::vector<double>;

6 changes: 6 additions & 0 deletions include/maths/CDataFramePredictiveModel.h
@@ -61,6 +61,12 @@ class MATHS_EXPORT CDataFramePredictiveModel {
//! \warning Will return a nullptr if a trained model isn't available.
virtual CTreeShapFeatureImportance* shap() const = 0;

//! Get the number of rows used to train the model.
virtual std::size_t numberTrainingRows() const = 0;

//! Get the fraction of data per fold used for training when tuning hyperparameters.
virtual double trainFractionPerFold() const = 0;

//! Get the column containing the dependent variable.
virtual std::size_t columnHoldingDependentVariable() const = 0;

6 changes: 4 additions & 2 deletions include/maths/CDataFrameUtils.h
@@ -281,8 +281,9 @@ class MATHS_EXPORT CDataFrameUtils : private core::CNonInstantiatable {
//! \param[in] targetColumn The index of the column to predict.
//! \param[in] rng The random number generator to use.
//! \param[in] numberFolds The number of folds to use.
//! \param[in] numberBuckets The number of buckets to use when stratifying by
//! target quantiles for regression.
//! \param[in] trainFractionPerFold The fraction of train data to use per fold.
//! \param[in] numberBuckets The number of buckets to use when stratifying
//! by target quantiles for regression.
//! \param[in] allTrainingRowsMask A mask of the candidate training rows.
//! \warning This fails if the target is not categorical.
static std::tuple<TPackedBitVectorVec, TPackedBitVectorVec, TDoubleVec>
@@ -291,6 +292,7 @@
std::size_t targetColumn,
CPRNG::CXorOShiro128Plus rng,
std::size_t numberFolds,
double trainFractionPerFold,
std::size_t numberBuckets,
const core::CPackedBitVector& allTrainingRowsMask);

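With trainFractionPerFold decoupled from numberFolds, each fold samples roughly a fraction f of the candidate rows for training instead of taking a fixed (k - 1)/k slice. A simplified, unstratified sketch of a single fold; this is a hypothetical helper, not the CDataFrameUtils implementation:

```cpp
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <random>
#include <utility>
#include <vector>

// Returns {train row indices, test row indices} for one fold. Sampling is
// without replacement, so the two sets are disjoint.
std::pair<std::vector<std::size_t>, std::vector<std::size_t>>
sampleFold(std::size_t numberRows, double trainFractionPerFold, std::mt19937& rng) {
    std::vector<std::size_t> rows(numberRows);
    std::iota(rows.begin(), rows.end(), 0);
    std::shuffle(rows.begin(), rows.end(), rng);
    auto split = static_cast<std::size_t>(
        trainFractionPerFold * static_cast<double>(numberRows));
    std::vector<std::size_t> train(rows.begin(), rows.begin() + split);
    std::vector<std::size_t> test(rows.begin() + split, rows.end());
    return {std::move(train), std::move(test)};
}
```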
86 changes: 86 additions & 0 deletions include/maths/CLowess.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/

#ifndef INCLUDED_ml_maths_CLowess_h
#define INCLUDED_ml_maths_CLowess_h

#include <maths/CBasicStatistics.h>
#include <maths/CLeastSquaresOnlineRegression.h>

#include <utility>
#include <vector>

namespace ml {
namespace maths {

//! \brief LOWESS regression using order N polynomial.
//!
//! DESCRIPTION:\n
//! For more details see https://en.wikipedia.org/wiki/Local_regression.
template<std::size_t N>
class CLowess {
public:
using TDoubleDoublePr = std::pair<double, double>;
using TDoubleDoublePrVec = std::vector<TDoubleDoublePr>;
using TPolynomial = CLeastSquaresOnlineRegression<N>;

public:
//! Fit a polynomial LOWESS model to \p data choosing the weight function to
//! maximize the likelihood of \p numberFolds hold out sets.
//!
//! \param[in] data The training data.
//! \param[in] numberFolds The number of folds to use in cross-validation to
//! compute the best weight function from the family exp(-k |xi - xj|) with
//! k a free parameter which determines the amount of smoothing to use.
void fit(TDoubleDoublePrVec data, std::size_t numberFolds);

//! Predict the value at \p x.
//!
//! \note Defined as zero if no data have been fit.
double predict(double x) const;

//! Compute the minimum of the function on the training data interval.
//!
//! \note Defined as (0,0) if no data have been fit.
TDoubleDoublePr minimum() const;

//! \name Test Only
//@{
//! Get an estimate of residual variance at the observed values.
//!
//! \note Defined as zero if no data have been fit.
double residualVariance() const;

//! Get how far we are prepared to extrapolate as the interval we will search
//! in the minimum and sublevelSet functions.
TDoubleDoublePr extrapolationInterval() const;
//@}

private:
using TDoubleVec = std::vector<double>;
using TSizeVec = std::vector<std::size_t>;
using TSizeVecVec = std::vector<TSizeVec>;
using TSizeVecCItr = TSizeVec::const_iterator;
using TMeanVarAccumulator = CBasicStatistics::SSampleMeanVar<double>::TAccumulator;

private:
void setupMasks(std::size_t numberFolds, TSizeVecVec& trainingMasks, TSizeVecVec& testingMasks) const;
double likelihood(TSizeVecVec& trainingMasks, TSizeVecVec& testingMasks, double k) const;
TPolynomial fit(TSizeVecCItr beginMask, TSizeVecCItr endMask, double k, double x) const;
double weight(double k, double x1, double x2) const;

private:
TDoubleDoublePrVec m_Data;
TSizeVec m_Mask;
//! The weight to assign to data points when fitting polynomial at x is given
//! by exp(-k |xi - xj|). This can therefore be thought of as the inverse of
//! the amount of smoothing.
double m_K{0.0};
};
}
}

#endif // INCLUDED_ml_maths_CLowess_h
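A hypothetical usage of the new class, following the interface above; the data points are made up, and CLowess<2> means local quadratic fits:

```cpp
#include <maths/CLowess.h>

#include <utility>

// Sketch: fit a LOWESS curve to a handful of (x, test loss) points and read
// off the estimated minimum of the smooth fit.
double smoothedMinimum() {
    ml::maths::CLowess<2> lowess;
    ml::maths::CLowess<2>::TDoubleDoublePrVec data{
        {0.0, 1.2}, {0.25, 0.9}, {0.5, 0.7}, {0.75, 0.8}, {1.0, 1.1}};
    lowess.fit(std::move(data), 5 /*numberFolds*/);
    auto [xAtMin, yAtMin] = lowess.minimum();
    return yAtMin;
}
```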