[ML] Improve hyperparameter tuning performance #1941

Merged · 37 commits from select-data-size into master on Jul 12, 2021

Changes from 23 commits

Commits (37)
2adce69
WIP
tveasey Feb 22, 2021
4f5d558
Merge branch 'master' into select-data-size
tveasey Jun 24, 2021
b2af714
Restrict the maximum number of rows used during hyperparameter tuning…
tveasey Jun 29, 2021
26070c4
Allow one to disable fine tuning entirely for fast mode
tveasey Jul 1, 2021
81d3ffd
Uncouple training fraction parameter from the number of folds
tveasey Jul 1, 2021
04248ee
Adjust the validation loss variance estimate to remove affects of sam…
tveasey Jul 2, 2021
f72dd4c
Formatting
tveasey Jul 2, 2021
fc0a3bc
Docs
tveasey Jul 2, 2021
78f6e37
Avoid infinite loop
tveasey Jul 2, 2021
caa7c82
Correct handling of eta growth rate per tree
tveasey Jul 2, 2021
b46c76e
Correct edge case test
tveasey Jul 2, 2021
7318193
Test threshold
tveasey Jul 2, 2021
e55ea41
Handle the case we can't sample train/test folds without replacement …
tveasey Jul 5, 2021
dd002c3
Handle edge case creating train/test splits with very little data
tveasey Jul 7, 2021
37d4690
Slightly relax tests to pass on all platforms
tveasey Jul 7, 2021
28f22f4
Review comments
tveasey Jul 8, 2021
4f3e3f9
Review comments
tveasey Jul 8, 2021
5d4edba
Explain p.
tveasey Jul 8, 2021
5748ce1
Explain poly
tveasey Jul 8, 2021
c252b24
Add explanation of mechanics of fit
tveasey Jul 8, 2021
9a7feea
Make k dependency clear
tveasey Jul 8, 2021
5b1a018
Document test interface
tveasey Jul 8, 2021
93d3264
Names, explanation and coding style guideline fixes
tveasey Jul 8, 2021
ae45379
Explicit capture
tveasey Jul 8, 2021
efdadc0
Typo
tveasey Jul 8, 2021
59c9add
Capture by reference
tveasey Jul 8, 2021
74c27f9
Rename
tveasey Jul 8, 2021
ca1d910
Update comment to reflect the current behaviour
tveasey Jul 8, 2021
40eae57
Name variable for readability
tveasey Jul 8, 2021
92de10f
Typedef
tveasey Jul 8, 2021
d0be22f
Define small constant used to prefer fast training if test error is s…
tveasey Jul 8, 2021
a380b20
We should record the fraction and number of training rows in the mode…
tveasey Jul 9, 2021
06460e6
Handle case we don't need to sample for last fold
tveasey Jul 9, 2021
8e98de0
Merge branch 'master' into select-data-size
tveasey Jul 9, 2021
ad037ec
Add an explanation of variance treatment in BO
tveasey Jul 9, 2021
e58ed73
Comments
tveasey Jul 9, 2021
e0a61bf
Move fraction of training data into its own section in instrumentation
tveasey Jul 12, 2021
15 changes: 12 additions & 3 deletions include/api/CInferenceModelMetadata.h
@@ -40,8 +40,10 @@ class API_EXPORT CInferenceModelMetadata {
static const std::string JSON_MEAN_MAGNITUDE_TAG;
static const std::string JSON_MIN_TAG;
static const std::string JSON_MODEL_METADATA_TAG;
+static const std::string JSON_NUM_TRAINING_ROWS_TAG;
static const std::string JSON_RELATIVE_IMPORTANCE_TAG;
static const std::string JSON_TOTAL_FEATURE_IMPORTANCE_TAG;
+static const std::string JSON_TRAIN_PARAMETERS_TAG;

public:
using TVector = maths::CDenseVector<double>;
@@ -64,6 +66,10 @@ class API_EXPORT CInferenceModelMetadata {
//! to the baseline value).
void featureImportanceBaseline(TVector&& baseline);
void hyperparameterImportance(const maths::CBoostedTree::THyperparameterImportanceVec& hyperparameterImportance);
+//! Set the number of rows used to train the model.
+void numberTrainingRows(std::size_t numberRows);
+//! Set the fraction of data per fold used for training when tuning hyperparameters.
+void trainFractionPerFold(double fraction);

private:
struct SHyperparameterImportance {
@@ -86,20 +92,23 @@ class API_EXPORT CInferenceModelMetadata {

private:
void writeTotalFeatureImportance(TRapidJsonWriter& writer) const;
-void writeHyperparameterImportance(TRapidJsonWriter& writer) const;
void writeFeatureImportanceBaseline(TRapidJsonWriter& writer) const;
+void writeHyperparameterImportance(TRapidJsonWriter& writer) const;
+void writeTrainParameters(TRapidJsonWriter& writer) const;

private:
TSizeMeanAccumulatorUMap m_TotalShapValuesMean;
TSizeMinMaxAccumulatorUMap m_TotalShapValuesMinMax;
TOptionalVector m_ShapBaseline;
TStrVec m_ColumnNames;
TStrVec m_ClassValues;
-TPredictionFieldTypeResolverWriter m_PredictionFieldTypeResolverWriter =
+TPredictionFieldTypeResolverWriter m_PredictionFieldTypeResolverWriter{
[](const std::string& value, TRapidJsonWriter& writer) {
writer.String(value);
-};
+}};
THyperparametersVec m_HyperparameterImportance;
+std::size_t m_NumberTrainingRows{0};
+double m_TrainFractionPerFold{0.0};
};
}
}
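For context, here is a minimal sketch of how writeTrainParameters might serialise these new fields, assuming a RapidJSON-style writer; the tag string values, the object layout, and the "train_fraction_per_fold" key are illustrative assumptions, not taken from this diff:

void CInferenceModelMetadata::writeTrainParameters(TRapidJsonWriter& writer) const {
    // Group the training data statistics under one object (layout assumed).
    writer.Key(JSON_TRAIN_PARAMETERS_TAG.c_str());
    writer.StartObject();
    writer.Key(JSON_NUM_TRAINING_ROWS_TAG.c_str());
    writer.Uint64(m_NumberTrainingRows);
    writer.Key("train_fraction_per_fold"); // hypothetical tag name
    writer.Double(m_TrainFractionPerFold);
    writer.EndObject();
}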
8 changes: 7 additions & 1 deletion include/maths/CBoostedTree.h
@@ -208,7 +208,7 @@ class MATHS_EXPORT CBoostedTree final : public CDataFramePredictiveModel {
class MATHS_EXPORT CVisitor : public CDataFrameCategoryEncoder::CVisitor,
public CBoostedTreeNode::CVisitor {
public:
-virtual ~CVisitor() = default;
+~CVisitor() override = default;
virtual void addTree() = 0;
virtual void addClassificationWeights(TDoubleVec weights) = 0;
virtual void addLossFunction(const TLossFunction& lossFunction) = 0;
@@ -236,6 +236,12 @@ class MATHS_EXPORT CBoostedTree final : public CDataFramePredictiveModel {
//! Get the vector of hyperparameter importances.
THyperparameterImportanceVec hyperparameterImportance() const;

+//! Get the number of rows used to train the model.
+std::size_t numberTrainingRows() const override;
+
+//! Get the fraction of data per fold used for training when tuning hyperparameters.
+double trainFractionPerFold() const override;

//! Get the column containing the dependent variable.
std::size_t columnHoldingDependentVariable() const override;

25 changes: 13 additions & 12 deletions include/maths/CBoostedTreeFactory.h
@@ -209,9 +209,10 @@ class MATHS_EXPORT CBoostedTreeFactory final {
TDoubleDoublePrVec estimateTreeGainAndCurvature(core::CDataFrame& frame,
const TDoubleVec& percentiles) const;

-//! Perform a line search for the test loss w.r.t. a single regularization
-//! hyperparameter and apply Newton's method to find the minimum. The plan
-//! is to find a value near where the model starts to overfit.
+//! Perform a line search for the test loss w.r.t. a single hyperparameter.
+//! At the end we use a smooth curve fit through all test loss values (using
+//! LOWESS regression) and use this to get a best estimate of where the true
+//! minimum occurs.
//!
//! \return The interval to search during the main hyperparameter optimisation
//! loop or null if this couldn't be found.
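To make the mechanics concrete, here is a minimal sketch of such a line search built on the CLowess interface declared later in this PR (see include/maths/CLowess.h below); the candidate grid, the testLossAt callback, the number of folds, and the margin constant are hypothetical, and the real factory code may differ:

#include <maths/CLowess.h>

#include <functional>
#include <utility>
#include <vector>

// Sketch: fit a smooth curve through (hyperparameter value, test loss) pairs
// and return the interval to hand to the main optimisation loop.
std::pair<double, double>
lineSearchInterval(const std::vector<double>& candidates,
                   const std::function<double(double)>& testLossAt,
                   double margin) {
    ml::maths::CLowess::TDoubleDoublePrVec losses;
    for (double x : candidates) {
        losses.emplace_back(x, testLossAt(x));
    }
    ml::maths::CLowess lowess;
    lowess.fit(std::move(losses), 5 /*numberFolds*/);
    auto [xmin, fmin] = lowess.minimum();
    // Any value whose smoothed loss is within margin of the minimum is
    // plausibly optimal, so search that whole interval.
    return lowess.sublevelSet(xmin, fmin, fmin + margin);
}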
@@ -277,14 +278,14 @@
private:
TOptionalDouble m_MinimumFrequencyToOneHotEncode;
TOptionalSize m_BayesianOptimisationRestarts;
-bool m_StratifyRegressionCrossValidation = true;
-double m_InitialDownsampleRowsPerFeature = 200.0;
-std::size_t m_MaximumNumberOfTrainRows = 500000;
-double m_GainPerNode1stPercentile = 0.0;
-double m_GainPerNode50thPercentile = 0.0;
-double m_GainPerNode90thPercentile = 0.0;
-double m_TotalCurvaturePerNode1stPercentile = 0.0;
-double m_TotalCurvaturePerNode90thPercentile = 0.0;
+bool m_StratifyRegressionCrossValidation{true};
+double m_InitialDownsampleRowsPerFeature{200.0};
+std::size_t m_MaximumNumberOfTrainRows{500000};
+double m_GainPerNode1stPercentile{0.0};
+double m_GainPerNode50thPercentile{0.0};
+double m_GainPerNode90thPercentile{0.0};
+double m_TotalCurvaturePerNode1stPercentile{0.0};
+double m_TotalCurvaturePerNode90thPercentile{0.0};
std::size_t m_NumberThreads;
TBoostedTreeImplUPtr m_TreeImpl;
TVector m_LogDownsampleFactorSearchInterval;
@@ -294,7 +295,7 @@
TVector m_LogLeafWeightPenaltyMultiplierSearchInterval;
TVector m_SoftDepthLimitSearchInterval;
TVector m_LogEtaSearchInterval;
-TTrainingStateCallback m_RecordTrainingState = noopRecordTrainingState;
+TTrainingStateCallback m_RecordTrainingState{noopRecordTrainingState};
};
}
}
12 changes: 9 additions & 3 deletions include/maths/CBoostedTreeImpl.h
@@ -150,6 +150,13 @@ class MATHS_EXPORT CBoostedTreeImpl final {
//! \return The best hyperparameters for validation error found so far.
const CBoostedTreeHyperparameters& bestHyperparameters() const;

+//! \return The fraction of data we use for training per fold when tuning hyperparameters.
+double trainFractionPerFold() const;
+
+//! \return The full training set data mask, i.e. all rows which aren't missing
+//! the dependent variable.
+core::CPackedBitVector allTrainingRowsMask() const;

//! \name Test Only
//@{
//! The name of the object holding the best hyperparameters in the state document.
@@ -203,9 +210,8 @@
//! Check if we can train a model.
bool canTrain() const;

-//! Get the full training set data mask, i.e. all rows which aren't missing
-//! the dependent variable.
-core::CPackedBitVector allTrainingRowsMask() const;
+//! Get the mean number of training examples which are used in each fold.
+double meanNumberTrainingRowsPerFold() const;

//! Compute the \p percentile percentile gain per split and the sum of row
//! curvatures per internal node of \p forest.
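The decoupling matters because the train fraction, not the fold count, now determines how much data each fold trains on. A sketch of the likely relationship, assuming manhattan() returns the number of set bits in the mask:

#include <core/CPackedBitVector.h>

// Sketch: with train fraction p per fold, each fold trains on roughly p * N
// of the N rows which have the dependent variable, independently of the
// number of folds (previously the fraction was implied by the fold count,
// e.g. (k - 1) / k for k folds).
double meanNumberTrainingRowsPerFold(double trainFractionPerFold,
                                     const ml::core::CPackedBitVector& allTrainingRowsMask) {
    return trainFractionPerFold * allTrainingRowsMask.manhattan();
}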
2 changes: 1 addition & 1 deletion include/maths/CDataFrameAnalysisInstrumentationInterface.h
@@ -116,7 +116,7 @@ class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationInterface
SRegularization s_Regularization;
double s_DownsampleFactor{-1.0};
std::size_t s_NumFolds{0};
-double s_TrainFractionPerFold{0.0};
+double s_NumTrainingRows{0};
std::size_t s_MaxTrees{0};
double s_FeatureBagFraction{-1.0};
double s_EtaGrowthRatePerTree{-1.0};
6 changes: 6 additions & 0 deletions include/maths/CDataFramePredictiveModel.h
@@ -61,6 +61,12 @@ class MATHS_EXPORT CDataFramePredictiveModel {
//! \warning Will return a nullptr if a trained model isn't available.
virtual CTreeShapFeatureImportance* shap() const = 0;

+//! Get the number of rows used to train the model.
+virtual std::size_t numberTrainingRows() const = 0;
+
+//! Get the fraction of data per fold used for training when tuning hyperparameters.
+virtual double trainFractionPerFold() const = 0;

//! Get the column containing the dependent variable.
virtual std::size_t columnHoldingDependentVariable() const = 0;

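A small usage sketch for these accessors, for example to log data usage after training; apart from the two accessors added above, everything here is hypothetical:

#include <maths/CDataFramePredictiveModel.h>

#include <iostream>

// Sketch: report how much data a trained model actually used.
void reportTrainingDataUsage(const ml::maths::CDataFramePredictiveModel& model) {
    std::cout << "trained on " << model.numberTrainingRows() << " rows, using "
              << 100.0 * model.trainFractionPerFold()
              << "% of the data per fold for hyperparameter tuning\n";
}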
23 changes: 10 additions & 13 deletions include/maths/CLowess.h
@@ -7,9 +7,8 @@
#ifndef INCLUDED_ml_maths_CLowess_h
#define INCLUDED_ml_maths_CLowess_h

-#include <maths/CLeastSquaresOnlineRegression.h>
-
#include <maths/CBasicStatistics.h>
+#include <maths/CLeastSquaresOnlineRegression.h>

#include <utility>
#include <vector>
@@ -34,7 +33,8 @@ class CLowess {
//!
//! \param[in] data The training data.
//! \param[in] numberFolds The number of folds to use in cross-validation to
-// compute the best weight function from the family exp(-k |xi - xj|).
+//! compute the best weight function from the family exp(-k |xi - xj|) with
+//! k a free parameter which determines the amount of smoothing to use.
void fit(TDoubleDoublePrVec data, std::size_t numberFolds);
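For intuition, a minimal sketch of the weight computation this family implies; this is illustrative only and not the class's actual internals:

#include <cmath>
#include <utility>
#include <vector>

// Sketch: weights for a local polynomial fit at x. Each point (xi, yi)
// contributes with weight exp(-k * |xi - x|), so larger k localises the fit
// (less smoothing) while k -> 0 tends to an ordinary global least squares fit.
std::vector<double> lowessWeights(double x, double k,
                                  const std::vector<std::pair<double, double>>& data) {
    std::vector<double> weights;
    weights.reserve(data.size());
    for (const auto& point : data) {
        weights.push_back(std::exp(-k * std::abs(point.first - x)));
    }
    return weights;
}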

//! Predict the value at \p x.
@@ -47,23 +47,17 @@
//! \note Defined as (0,0) if no data have been fit.
TDoubleDoublePr minimum() const;

+//! \name Test Only
+//@{
//! Get an estimate of residual variance at the observed values.
//!
//! \note Defined as zero if no data have been fit.
double residualVariance() const;
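As an illustration, a sketch of one way such an estimate could be computed, assuming a predict(double) accessor matching the doc comment above; the real implementation may apply bias corrections:

#include <maths/CLowess.h>

#include <utility>
#include <vector>

// Sketch: mean squared residual of the observed losses about the smooth fit,
// plausibly what the variance treatment in Bayesian optimisation consumes.
double estimateResidualVariance(const ml::maths::CLowess& lowess,
                                const std::vector<std::pair<double, double>>& data) {
    if (data.empty()) {
        return 0.0; // defined as zero if no data have been fit
    }
    double sum{0.0};
    for (const auto& point : data) {
        double residual{point.second - lowess.predict(point.first)};
        sum += residual * residual;
    }
    return sum / static_cast<double>(data.size());
}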

//! Compute the sublevel set of \p f containing \p xmin.
//!
//! \param[in] xmin The argument of the minimum of the interpolated function.
//! \param[in] fmin The value of the minimum of the function.
//! \param[in] f The value of the function for which to compute the sublevel set.
//! \note \p f should be greater than fmin.
//! \note Defined as (0,0) if no data have been fit.
TDoubleDoublePr sublevelSet(double xmin, double fmin, double f) const;

//! Get how far we are prepared to extrapolate as the interval we will search
//! in the minimum and sublevelSet functions.
TDoubleDoublePr extrapolationInterval() const;
+//@}

private:
using TDoubleVec = std::vector<double>;
Expand All @@ -81,7 +75,10 @@ class CLowess {
private:
TDoubleDoublePrVec m_Data;
TSizeVec m_Mask;
-double m_K = 0.0;
+//! The weight to assign to data points when fitting the polynomial at x is
+//! given by exp(-k |xi - xj|). This can therefore be thought of as the inverse
+//! of the amount of smoothing.
+double m_K{0.0};
};
}
}