[ML] Improve hyperparameter tuning performance #1941

Merged
merged 37 commits into from
Jul 12, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
2adce69 WIP (tveasey, Feb 22, 2021)
4f5d558 Merge branch 'master' into select-data-size (tveasey, Jun 24, 2021)
b2af714 Restrict the maximum number of rows used during hyperparameter tuning… (tveasey, Jun 29, 2021)
26070c4 Allow one to disable fine tuning entirely for fast mode (tveasey, Jul 1, 2021)
81d3ffd Uncouple training fraction parameter from the number of folds (tveasey, Jul 1, 2021)
04248ee Adjust the validation loss variance estimate to remove affects of sam… (tveasey, Jul 2, 2021)
f72dd4c Formatting (tveasey, Jul 2, 2021)
fc0a3bc Docs (tveasey, Jul 2, 2021)
78f6e37 Avoid infinite loop (tveasey, Jul 2, 2021)
caa7c82 Correct handling of eta growth rate per tree (tveasey, Jul 2, 2021)
b46c76e Correct edge case test (tveasey, Jul 2, 2021)
7318193 Test threshold (tveasey, Jul 2, 2021)
e55ea41 Handle the case we can't sample train/test folds without replacement … (tveasey, Jul 5, 2021)
dd002c3 Handle edge case creating train/test splits with very little data (tveasey, Jul 7, 2021)
37d4690 Slightly relax tests to pass on all platforms (tveasey, Jul 7, 2021)
28f22f4 Review comments (tveasey, Jul 8, 2021)
4f3e3f9 Review comments (tveasey, Jul 8, 2021)
5d4edba Explain p. (tveasey, Jul 8, 2021)
5748ce1 Explain poly (tveasey, Jul 8, 2021)
c252b24 Add explanation of mechanics of fit (tveasey, Jul 8, 2021)
9a7feea Make k dependency clear (tveasey, Jul 8, 2021)
5b1a018 Document test interface (tveasey, Jul 8, 2021)
93d3264 Names, explanation and coding style guideline fixes (tveasey, Jul 8, 2021)
ae45379 Explicit capture (tveasey, Jul 8, 2021)
efdadc0 Typo (tveasey, Jul 8, 2021)
59c9add Capture by reference (tveasey, Jul 8, 2021)
74c27f9 Rename (tveasey, Jul 8, 2021)
ca1d910 Update comment to reflect the current behaviour (tveasey, Jul 8, 2021)
40eae57 Name variable for readability (tveasey, Jul 8, 2021)
92de10f Typedef (tveasey, Jul 8, 2021)
d0be22f Define small constant used to prefer fast training if test error is s… (tveasey, Jul 8, 2021)
a380b20 We should record the fraction and number of training rows in the mode… (tveasey, Jul 9, 2021)
06460e6 Handle case we don't need to sample for last fold (tveasey, Jul 9, 2021)
8e98de0 Merge branch 'master' into select-data-size (tveasey, Jul 9, 2021)
ad037ec Add an explanation of variance treatment in BO (tveasey, Jul 9, 2021)
e58ed73 Comments (tveasey, Jul 9, 2021)
e0a61bf Move fraction of training data into its own section in instrumentation (tveasey, Jul 12, 2021)
docs/CHANGELOG.asciidoc (7 additions, 0 deletions)

@@ -47,6 +47,13 @@
 * Ensure bucket `event_count` is calculated for jobs with 1 second bucket spans.
 (See {ml-pull}1908[#1908].)
 
+== {es} version 7.15.0
+
+=== Enhancements
+
+* Speed up training of regression and classification models on very large data sets.
+(See {ml-pull}1941[#1941].)
+
 == {es} version 7.14.0
 
 === Enhancements
include/api/CDataFrameAnalysisInstrumentation.h (15 additions, 11 deletions)

@@ -184,17 +184,19 @@ class API_EXPORT CDataFrameTrainBoostedTreeInstrumentation final
     CDataFrameTrainBoostedTreeInstrumentation(const std::string& jobId, std::size_t memoryLimit)
         : CDataFrameAnalysisInstrumentation(jobId, memoryLimit) {}
 
-    //! Supervised learning job \p type, can be E_Regression or E_Classification.
+    //! Set the supervised learning job \p type, can be E_Regression or E_Classification.
     void type(EStatsType type) override;
-    //! Current \p iteration number.
+    //! Set the current \p iteration number.
     void iteration(std::size_t iteration) override;
-    //! Run time of the iteration.
+    //! Set the run time of the current iteration.
     void iterationTime(std::uint64_t delta) override;
-    //! Type of the validation loss result, e.g. "mse".
+    //! Set the type of the validation loss result, e.g. "mse".
     void lossType(const std::string& lossType) override;
-    //! List of \p lossValues of validation error for the given \p fold.
+    //! Set the validation loss values for \p fold for each forest size to \p lossValues.
     void lossValues(std::size_t fold, TDoubleVec&& lossValues) override;
-    //! \return Structure contains hyperparameters.
+    //! Set the fraction of data used for training per fold.
+    void trainingFractionPerFold(double fraction) override;
+    //! \return A writable object containing the training hyperparameters.
     SHyperparameters& hyperparameters() override { return m_Hyperparameters; }
 
 protected:
@@ -206,19 +208,21 @@
 
 private:
     void writeAnalysisStats(std::int64_t timestamp) override;
+    void writeMetaData(rapidjson::Value& parentObject);
     void writeHyperparameters(rapidjson::Value& parentObject);
     void writeValidationLoss(rapidjson::Value& parentObject);
     void writeTimingStats(rapidjson::Value& parentObject);
     void reset();
 
 private:
-    EStatsType m_Type = E_Regression;
-    std::size_t m_Iteration = 0;
-    std::uint64_t m_IterationTime = 0;
-    std::uint64_t m_ElapsedTime = 0;
-    bool m_AnalysisStatsInitialized = false;
+    EStatsType m_Type{E_Regression};
+    std::size_t m_Iteration{0};
+    std::uint64_t m_IterationTime{0};
+    std::uint64_t m_ElapsedTime{0};
+    bool m_AnalysisStatsInitialized{false};
     std::string m_LossType;
     TLossVec m_LossValues;
+    double m_TrainingFractionPerFold{0.0};
     SHyperparameters m_Hyperparameters;
 };
 }
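The instrumentation now records the training fraction alongside the per-fold validation losses. A minimal call-site sketch of the extended reporting contract follows; the method names come from the header above, everything else is an assumption for illustration:

```cpp
#include <cstddef>
#include <utility>
#include <vector>

using TDoubleVec = std::vector<double>;

// Hypothetical call site inside a training loop: report the realised train
// fraction once, then per-fold validation losses as they become available.
template<typename INSTRUMENTATION, typename COMPUTE_LOSSES>
void reportCrossValidation(INSTRUMENTATION& instrumentation,
                           std::size_t numberFolds,
                           double trainFractionPerFold,
                           COMPUTE_LOSSES computeValidationLosses) {
    instrumentation.lossType("mse");
    instrumentation.trainingFractionPerFold(trainFractionPerFold);
    for (std::size_t fold = 0; fold < numberFolds; ++fold) {
        // One loss value per forest size grown on this fold (assumed helper).
        TDoubleVec lossByForestSize = computeValidationLosses(fold);
        instrumentation.lossValues(fold, std::move(lossByForestSize));
    }
}
```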
include/api/CDataFrameTrainBoostedTreeRunner.h (1 addition, 0 deletions)

@@ -48,6 +48,7 @@ class API_EXPORT CDataFrameTrainBoostedTreeRunner : public CDataFrameAnalysisRun
     static const std::string MAX_TREES;
     static const std::string FEATURE_BAG_FRACTION;
     static const std::string NUM_FOLDS;
+    static const std::string TRAIN_FRACTION_PER_FOLD;
     static const std::string STOP_CROSS_VALIDATION_EARLY;
     static const std::string MAX_OPTIMIZATION_ROUNDS_PER_HYPERPARAMETER;
     static const std::string BAYESIAN_OPTIMISATION_RESTARTS;
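The runner presumably also defines the parameter's wire name in the corresponding .cc file, along these lines; the string value is a guess derived from the constant's name, not taken from this diff:

```cpp
// Assumed definition in CDataFrameTrainBoostedTreeRunner.cc.
const std::string CDataFrameTrainBoostedTreeRunner::TRAIN_FRACTION_PER_FOLD{"train_fraction_per_fold"};
```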
include/api/CInferenceModelMetadata.h (12 additions, 3 deletions)

@@ -40,8 +40,10 @@ class API_EXPORT CInferenceModelMetadata {
     static const std::string JSON_MEAN_MAGNITUDE_TAG;
     static const std::string JSON_MIN_TAG;
     static const std::string JSON_MODEL_METADATA_TAG;
+    static const std::string JSON_NUM_TRAINING_ROWS_TAG;
     static const std::string JSON_RELATIVE_IMPORTANCE_TAG;
     static const std::string JSON_TOTAL_FEATURE_IMPORTANCE_TAG;
+    static const std::string JSON_TRAIN_PARAMETERS_TAG;
 
 public:
     using TVector = maths::CDenseVector<double>;
@@ -64,6 +66,10 @@
     //! to the baseline value).
     void featureImportanceBaseline(TVector&& baseline);
     void hyperparameterImportance(const maths::CBoostedTree::THyperparameterImportanceVec& hyperparameterImportance);
+    //! Set the number of rows used to train the model.
+    void numberTrainingRows(std::size_t numberRows);
+    //! Set the fraction of data per fold used for training when tuning hyperparameters.
+    void trainFractionPerFold(double fraction);
 
 private:
     struct SHyperparameterImportance {
@@ -86,20 +92,23 @@
 
 private:
     void writeTotalFeatureImportance(TRapidJsonWriter& writer) const;
-    void writeHyperparameterImportance(TRapidJsonWriter& writer) const;
     void writeFeatureImportanceBaseline(TRapidJsonWriter& writer) const;
+    void writeHyperparameterImportance(TRapidJsonWriter& writer) const;
+    void writeTrainParameters(TRapidJsonWriter& writer) const;
 
 private:
     TSizeMeanAccumulatorUMap m_TotalShapValuesMean;
     TSizeMinMaxAccumulatorUMap m_TotalShapValuesMinMax;
     TOptionalVector m_ShapBaseline;
     TStrVec m_ColumnNames;
     TStrVec m_ClassValues;
-    TPredictionFieldTypeResolverWriter m_PredictionFieldTypeResolverWriter =
+    TPredictionFieldTypeResolverWriter m_PredictionFieldTypeResolverWriter{
         [](const std::string& value, TRapidJsonWriter& writer) {
             writer.String(value);
-        };
+        }};
     THyperparametersVec m_HyperparameterImportance;
+    std::size_t m_NumberTrainingRows{0};
+    double m_TrainFractionPerFold{0.0};
 };
 }
 }
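A sketch of what writeTrainParameters might emit for the new metadata section; the field names mirror the new JSON_*_TAG constants, but the exact schema is an assumption:

```cpp
#include <rapidjson/stringbuffer.h>
#include <rapidjson/writer.h>

#include <cstddef>

// Illustrative output, assuming snake_case tags:
//   "train_parameters": {"num_training_rows": 100000, "train_fraction_per_fold": 0.75}
void writeTrainParametersSketch(rapidjson::Writer<rapidjson::StringBuffer>& writer,
                                std::size_t numberTrainingRows,
                                double trainFractionPerFold) {
    writer.Key("train_parameters");
    writer.StartObject();
    writer.Key("num_training_rows");
    writer.Uint64(numberTrainingRows);
    writer.Key("train_fraction_per_fold");
    writer.Double(trainFractionPerFold);
    writer.EndObject();
}
```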
include/maths/CBayesianOptimisation.h (7 additions, 2 deletions)

@@ -72,6 +72,10 @@ class MATHS_EXPORT CBayesianOptimisation {
     //! variance in the error in \p fx w.r.t. the true value is \p vx.
     void add(TVector x, double fx, double vx);
 
+    //! Any portion of the variance of the function error which is explained and
+    //! shouldn't be included in the kernel.
+    void explainedErrorVariance(double vx);
+
     //! Get the bounding box (in the function domain) in which we're minimizing.
     std::pair<TVector, TVector> boundingBox() const;
 
@@ -170,8 +174,9 @@
 private:
     CPRNG::CXorOShiro128Plus m_Rng;
     std::size_t m_Restarts;
-    double m_RangeShift = 0.0;
-    double m_RangeScale = 1.0;
+    double m_RangeShift{0.0};
+    double m_RangeScale{1.0};
+    double m_ExplainedErrorVariance{0.0};
     TVector m_MinBoundary;
     TVector m_MaxBoundary;
     TVectorDoublePrVec m_FunctionMeanValues;
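This hook connects with the commit "Adjust the validation loss variance estimate to remove affects of sam…": part of the spread in measured test losses is explained by the random train/test split rather than by the hyperparameters, so it shouldn't be treated as Gaussian process noise when modelling the loss surface. A toy sketch of the idea, assuming the explained portion is simply subtracted from each observation's error variance before it reaches the kernel (the real treatment is documented in the commit "Add an explanation of variance treatment in BO" and may differ):

```cpp
#include <algorithm>
#include <utility>
#include <vector>

// Toy model of the new behaviour, not the actual ml-cpp implementation.
class CToyBayesianOptimisation {
public:
    using TVector = std::vector<double>;

    //! Record an evaluation of the objective at \p x with error variance \p vx.
    void add(TVector x, double fx, double vx) {
        m_FunctionMeanValues.push_back({std::move(x), fx, vx});
    }

    //! Declare the portion of the error variance which is already explained
    //! (e.g. by the train/test split) and shouldn't count as kernel noise.
    void explainedErrorVariance(double vx) { m_ExplainedErrorVariance = vx; }

    //! The noise variance the kernel would see for an observation whose
    //! raw error variance is \p vx.
    double kernelNoiseVariance(double vx) const {
        return std::max(vx - m_ExplainedErrorVariance, 0.0);
    }

private:
    struct SValue {
        TVector s_X;
        double s_Fx;
        double s_Vx;
    };
    std::vector<SValue> m_FunctionMeanValues;
    double m_ExplainedErrorVariance{0.0};
};
```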
include/maths/CBoostedTree.h (7 additions, 1 deletion)

@@ -208,7 +208,7 @@ class MATHS_EXPORT CBoostedTree final : public CDataFramePredictiveModel {
     class MATHS_EXPORT CVisitor : public CDataFrameCategoryEncoder::CVisitor,
                                   public CBoostedTreeNode::CVisitor {
     public:
-        virtual ~CVisitor() = default;
+        ~CVisitor() override = default;
         virtual void addTree() = 0;
         virtual void addClassificationWeights(TDoubleVec weights) = 0;
         virtual void addLossFunction(const TLossFunction& lossFunction) = 0;
@@ -236,6 +236,12 @@
     //! Get the vector of hyperparameter importances.
     THyperparameterImportanceVec hyperparameterImportance() const;
 
+    //! Get the number of rows used to train the model.
+    std::size_t numberTrainingRows() const override;
+
+    //! Get the fraction of data per fold used for training when tuning hyperparameters.
+    double trainFractionPerFold() const override;
+
     //! Get the column containing the dependent variable.
     std::size_t columnHoldingDependentVariable() const override;
 
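These accessors give downstream consumers, such as the inference model metadata above, a way to read back how much data was actually used. A hypothetical piece of plumbing; only the accessor and setter names come from this PR, the helper itself is illustrative:

```cpp
#include <api/CInferenceModelMetadata.h>
#include <maths/CBoostedTree.h>

// Copy the new training statistics from a trained model into the metadata.
void recordTrainParameters(const ml::maths::CBoostedTree& tree,
                           ml::api::CInferenceModelMetadata& metadata) {
    metadata.numberTrainingRows(tree.numberTrainingRows());
    metadata.trainFractionPerFold(tree.trainFractionPerFold());
}
```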
include/maths/CBoostedTreeFactory.h (17 additions, 13 deletions)

@@ -80,6 +80,10 @@ class MATHS_EXPORT CBoostedTreeFactory final {
     CBoostedTreeFactory& minimumFrequencyToOneHotEncode(double frequency);
     //! Set the number of folds to use for estimating the generalisation error.
     CBoostedTreeFactory& numberFolds(std::size_t numberFolds);
+    //! Set the fraction fold data to use for training.
+    CBoostedTreeFactory& trainFractionPerFold(double fraction);
+    //! Set the maximum number of rows to use for training when tuning hyperparameters.
+    CBoostedTreeFactory& maximumNumberTrainRows(std::size_t rows);
     //! Stratify the cross-validation we do for regression.
     CBoostedTreeFactory& stratifyRegressionCrossValidation(bool stratify);
     //! Stop cross-validation early if the test loss is not promising.
@@ -205,18 +209,17 @@
     TDoubleDoublePrVec estimateTreeGainAndCurvature(core::CDataFrame& frame,
                                                     const TDoubleVec& percentiles) const;
 
-    //! Perform a line search for the test loss w.r.t. a single regularization
-    //! hyperparameter and apply Newton's method to find the minimum. The plan
-    //! is to find a value near where the model starts to overfit.
+    //! Perform a line search for the test loss w.r.t. a single hyperparameter.
+    //! At the end we use a smooth curve fit through all test loss values (using
+    //! LOWESS regression) and use this to get a best estimate of where the true
+    //! minimum occurs.
     //!
     //! \return The interval to search during the main hyperparameter optimisation
     //! loop or null if this couldn't be found.
     TOptionalVector testLossLineSearch(core::CDataFrame& frame,
                                        const TApplyParameter& applyParameterStep,
                                        double intervalLeftEnd,
                                        double intervalRightEnd,
-                                       double returnedIntervalLeftEndOffset,
-                                       double returnedIntervalRightEndOffset,
                                        const TAdjustTestLoss& adjustTestLoss = noopAdjustTestLoss) const;
 
     //! Initialize the state for hyperparameter optimisation.
@@ -275,13 +278,14 @@
 private:
     TOptionalDouble m_MinimumFrequencyToOneHotEncode;
     TOptionalSize m_BayesianOptimisationRestarts;
-    bool m_StratifyRegressionCrossValidation = true;
-    double m_InitialDownsampleRowsPerFeature = 200.0;
-    double m_GainPerNode1stPercentile = 0.0;
-    double m_GainPerNode50thPercentile = 0.0;
-    double m_GainPerNode90thPercentile = 0.0;
-    double m_TotalCurvaturePerNode1stPercentile = 0.0;
-    double m_TotalCurvaturePerNode90thPercentile = 0.0;
+    bool m_StratifyRegressionCrossValidation{true};
+    double m_InitialDownsampleRowsPerFeature{200.0};
+    std::size_t m_MaximumNumberOfTrainRows{500000};
+    double m_GainPerNode1stPercentile{0.0};
+    double m_GainPerNode50thPercentile{0.0};
+    double m_GainPerNode90thPercentile{0.0};
+    double m_TotalCurvaturePerNode1stPercentile{0.0};
+    double m_TotalCurvaturePerNode90thPercentile{0.0};
     std::size_t m_NumberThreads;
     TBoostedTreeImplUPtr m_TreeImpl;
     TVector m_LogDownsampleFactorSearchInterval;
@@ -291,7 +295,7 @@
     TVector m_LogLeafWeightPenaltyMultiplierSearchInterval;
     TVector m_SoftDepthLimitSearchInterval;
     TVector m_LogEtaSearchInterval;
-    TTrainingStateCallback m_RecordTrainingState = noopRecordTrainingState;
+    TTrainingStateCallback m_RecordTrainingState{noopRecordTrainingState};
 };
 }
 }
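Together the new setters let callers decouple the per-fold train fraction from the fold count and cap the rows used while tuning. A hypothetical usage sketch; constructFromParameters and buildFor follow the factory's existing pattern but aren't part of this diff, and the literal values simply echo the new defaults:

```cpp
#include <core/CDataFrame.h>
#include <maths/CBoostedTreeFactory.h>
#include <maths/CBoostedTreeLoss.h>

#include <cstddef>
#include <memory>

std::unique_ptr<ml::maths::CBoostedTree>
trainRegression(ml::core::CDataFrame& frame, std::size_t dependentVariableColumn) {
    std::size_t numberThreads{4};
    return ml::maths::CBoostedTreeFactory::constructFromParameters(
               numberThreads, std::make_unique<ml::maths::boosted_tree::CMse>())
        .numberFolds(4)
        .trainFractionPerFold(0.75)      // no longer forced to (k - 1) / k
        .maximumNumberTrainRows(500000)  // cap the rows used while tuning
        .buildFor(frame, dependentVariableColumn);
}
```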
include/maths/CBoostedTreeImpl.h (14 additions, 3 deletions)

@@ -150,6 +150,13 @@ class MATHS_EXPORT CBoostedTreeImpl final {
     //! \return The best hyperparameters for validation error found so far.
     const CBoostedTreeHyperparameters& bestHyperparameters() const;
 
+    //! \return The fraction of data we use for train per fold when tuning hyperparameters.
+    double trainFractionPerFold() const;
+
+    //! \return The full training set data mask, i.e. all rows which aren't missing
+    //! the dependent variable.
+    core::CPackedBitVector allTrainingRowsMask() const;
+
     //!\ name Test Only
     //@{
     //! The name of the object holding the best hyperaparameters in the state document.
@@ -203,9 +210,8 @@
     //! Check if we can train a model.
     bool canTrain() const;
 
-    //! Get the full training set data mask, i.e. all rows which aren't missing
-    //! the dependent variable.
-    core::CPackedBitVector allTrainingRowsMask() const;
+    //! Get the mean number of training examples which are used in each fold.
+    double meanNumberTrainingRowsPerFold() const;
 
     //! Compute the \p percentile percentile gain per split and the sum of row
     //! curvatures per internal node of \p forest.
@@ -290,6 +296,9 @@
     //! Compute the mean of the loss function on the masked rows of \p frame.
     double meanLoss(const core::CDataFrame& frame, const core::CPackedBitVector& rowMask) const;
 
+    //! Compute the overall variance of the error we see between folds.
+    double betweenFoldTestLossVariance() const;
+
     //! Get the root node of \p tree.
     static const CBoostedTreeNode& root(const TNodeVec& tree);
 
@@ -372,6 +381,7 @@
     TOptionalDouble m_EtaOverride;
     TOptionalDouble m_EtaGrowthRatePerTreeOverride;
     TOptionalSize m_NumberFoldsOverride;
+    TOptionalSize m_TrainFractionPerFoldOverride;
     TOptionalSize m_MaximumNumberTreesOverride;
     TOptionalDouble m_FeatureBagFractionOverride;
     TOptionalStrDoublePrVec m_ClassificationWeightsOverride;
@@ -381,6 +391,7 @@
     double m_Eta = 0.1;
     double m_EtaGrowthRatePerTree = 1.05;
     std::size_t m_NumberFolds = 4;
+    double m_TrainFractionPerFold = 0.75;
    std::size_t m_MaximumNumberTrees = 20;
     std::size_t m_MaximumAttemptsToAddTree = 3;
     std::size_t m_NumberSplitsPerFeature = 75;
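Note the default of 0.75 preserves the old effective behaviour: with the default 4 folds, classic cross-validation trained on (4 - 1) / 4 = 0.75 of the data. betweenFoldTestLossVariance plausibly supplies the "explained" variance handed to CBayesianOptimisation::explainedErrorVariance. A minimal sketch of one natural reading of that quantity, as the sample variance of the per-fold mean test losses; the actual computation may weight losses differently:

```cpp
#include <vector>

// Variance of test loss attributable to the random fold assignment rather
// than to the candidate hyperparameters (illustrative, not the ml-cpp code).
double betweenFoldTestLossVariance(const std::vector<double>& foldMeanTestLosses) {
    double n{static_cast<double>(foldMeanTestLosses.size())};
    if (n < 2.0) {
        return 0.0;
    }
    double mean{0.0};
    for (double loss : foldMeanTestLosses) {
        mean += loss / n;
    }
    double variance{0.0};
    for (double loss : foldMeanTestLosses) {
        variance += (loss - mean) * (loss - mean) / (n - 1.0);
    }
    return variance;
}
```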
include/maths/CDataFrameAnalysisInstrumentationInterface.h (24 additions, 22 deletions)

@@ -33,7 +33,7 @@ class MATHS_EXPORT CDataFrameAnalysisInstrumentationInterface {
     //! Adds \p delta to the memory usage statistics.
     virtual void updateMemoryUsage(std::int64_t delta) = 0;
 
-    //! Start progress monitoring for \p phase.
+    //! Start progress monitoring of \p task.
     //!
     //! \note This resets the current progress to zero.
     virtual void startNewProgressMonitoredTask(const std::string& task) = 0;
@@ -103,41 +103,42 @@ class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationInterface
           s_SoftTreeDepthLimit{softTreeDepthLimit}, s_SoftTreeDepthTolerance{softTreeDepthTolerance},
           s_TreeSizePenaltyMultiplier{treeSizePenaltyMultiplier},
           s_LeafWeightPenaltyMultiplier{leafWeightPenaltyMultiplier} {}
-        double s_DepthPenaltyMultiplier = -1.0;
-        double s_SoftTreeDepthLimit = -1.0;
-        double s_SoftTreeDepthTolerance = -1.0;
-        double s_TreeSizePenaltyMultiplier = -1.0;
-        double s_LeafWeightPenaltyMultiplier = -1.0;
+        double s_DepthPenaltyMultiplier{-1.0};
+        double s_SoftTreeDepthLimit{-1.0};
+        double s_SoftTreeDepthTolerance{-1.0};
+        double s_TreeSizePenaltyMultiplier{-1.0};
+        double s_LeafWeightPenaltyMultiplier{-1.0};
     };
     struct SHyperparameters {
-        double s_Eta = -1.0;
+        double s_Eta{-1.0};
         CBoostedTree::EClassAssignmentObjective s_ClassAssignmentObjective =
            CBoostedTree::E_MinimumRecall;
         SRegularization s_Regularization;
-        double s_DownsampleFactor = -1.0;
-        std::size_t s_NumFolds = 0;
-        std::size_t s_MaxTrees = 0;
-        double s_FeatureBagFraction = -1.0;
-        double s_EtaGrowthRatePerTree = -1.0;
-        std::size_t s_MaxAttemptsToAddTree = 0;
-        std::size_t s_NumSplitsPerFeature = 0;
-        std::size_t s_MaxOptimizationRoundsPerHyperparameter = 0;
+        double s_DownsampleFactor{-1.0};
+        std::size_t s_NumFolds{0};
+        std::size_t s_MaxTrees{0};
+        double s_FeatureBagFraction{-1.0};
+        double s_EtaGrowthRatePerTree{-1.0};
+        std::size_t s_MaxAttemptsToAddTree{0};
+        std::size_t s_NumSplitsPerFeature{0};
+        std::size_t s_MaxOptimizationRoundsPerHyperparameter{0};
     };
     using TDoubleVec = std::vector<double>;
 
 public:
     virtual ~CDataFrameTrainBoostedTreeInstrumentationInterface() = default;
-    //! Supervised learning job \p type, can be E_Regression or E_Classification.
+    //! Set the supervised learning job \p type, can be E_Regression or E_Classification.
     virtual void type(EStatsType type) = 0;
-    //! Current \p iteration number.
+    //! Set the current \p iteration number.
     virtual void iteration(std::size_t iteration) = 0;
-    //! Run time of the iteration.
+    //! Set the run time of the current iteration.
     virtual void iterationTime(std::uint64_t delta) = 0;
-    //! Type of the validation loss result, e.g. "mse".
+    //! Set the type of the validation loss result, e.g. "mse".
     virtual void lossType(const std::string& lossType) = 0;
-    //! List of \p lossValues of validation error for the given \p fold.
+    //! Set the validation loss values for \p fold for each forest size to \p lossValues.
     virtual void lossValues(std::size_t fold, TDoubleVec&& lossValues) = 0;
-    //! \return Structure contains hyperparameters.
+    //! Set the fraction of data used for training per fold.
+    virtual void trainingFractionPerFold(double fraction) = 0;
+    //! \return A writable object containing the training hyperparameters.
     virtual SHyperparameters& hyperparameters() = 0;
 };
 
@@ -167,6 +168,7 @@
     void iterationTime(std::uint64_t /* delta */) override {}
     void lossType(const std::string& /* lossType */) override {}
     void lossValues(std::size_t /* fold */, TDoubleVec&& /* lossValues */) override {}
+    void trainingFractionPerFold(double /* fraction */) override {}
     SHyperparameters& hyperparameters() override { return m_Hyperparameters; }
 
 private:
include/maths/CDataFramePredictiveModel.h (6 additions, 0 deletions)

@@ -61,6 +61,12 @@ class MATHS_EXPORT CDataFramePredictiveModel {
     //! \warning Will return a nullptr if a trained model isn't available.
     virtual CTreeShapFeatureImportance* shap() const = 0;
 
+    //! Get the number of rows used to train the model.
+    virtual std::size_t numberTrainingRows() const = 0;
+
+    //! Get the fraction of data per fold used for training when tuning hyperparameters.
+    virtual double trainFractionPerFold() const = 0;
+
     //! Get the column containing the dependent variable.
     virtual std::size_t columnHoldingDependentVariable() const = 0;
 
include/maths/CDataFrameUtils.h (4 additions, 2 deletions)

@@ -281,8 +281,9 @@ class MATHS_EXPORT CDataFrameUtils : private core::CNonInstantiatable {
     //! \param[in] targetColumn The index of the column to predict.
     //! \param[in] rng The random number generator to use.
     //! \param[in] numberFolds The number of folds to use.
-    //! \param[in] numberBuckets The number of buckets to use when stratifying by
-    //! target quantiles for regression.
+    //! \param[in] trainFractionPerFold The fraction of train data to use per fold.
+    //! \param[in] numberBuckets The number of buckets to use when stratifying
+    //! by target quantiles for regression.
     //! \param[in] allTrainingRowsMask A mask of the candidate training rows.
     //! \warning This fails if the target is not categorical.
     static std::tuple<TPackedBitVectorVec, TPackedBitVectorVec, TDoubleVec>
@@ -291,6 +292,7 @@
         std::size_t targetColumn,
         CPRNG::CXorOShiro128Plus rng,
         std::size_t numberFolds,
+        double trainFractionPerFold,
         std::size_t numberBuckets,
         const core::CPackedBitVector& allTrainingRowsMask);
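Decoupling trainFractionPerFold from numberFolds changes how these masks can be built: classic k-fold partitioning forces the train fraction to (k - 1) / k, whereas independently sampling each fold's train set supports any fraction, at the cost that folds may share rows (hence the commit handling the case where folds can't be sampled without replacement). A simplified sketch of such a sampler; the real routine also stratifies by the target, samples exact counts, and returns packed bit vectors:

```cpp
#include <cstddef>
#include <random>
#include <utility>
#include <vector>

using TBoolVec = std::vector<bool>;
using TBoolVecPr = std::pair<TBoolVec, TBoolVec>; // {train mask, test mask}

// For each fold, independently mark roughly trainFractionPerFold of the rows
// as training and test on the remainder (illustrative only).
std::vector<TBoolVecPr> crossValidationMasks(std::size_t numberRows,
                                             std::size_t numberFolds,
                                             double trainFractionPerFold,
                                             std::mt19937& rng) {
    std::vector<TBoolVecPr> folds;
    std::bernoulli_distribution isTrain{trainFractionPerFold};
    for (std::size_t fold = 0; fold < numberFolds; ++fold) {
        TBoolVec train(numberRows);
        TBoolVec test(numberRows);
        for (std::size_t row = 0; row < numberRows; ++row) {
            train[row] = isTrain(rng);
            test[row] = !train[row];
        }
        folds.emplace_back(std::move(train), std::move(test));
    }
    return folds;
}
```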