Skip to content

Commit

Permalink
[ML] Early stopping in the line searches to compute initial regulariser values (#903)
Browse files Browse the repository at this point in the history
…er values (#903)
  • Loading branch information
tveasey committed Dec 17, 2019
1 parent e37684b commit a96c501
Show file tree
Hide file tree
Showing 14 changed files with 413 additions and 286 deletions.
2 changes: 2 additions & 0 deletions docs/CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ is no longer decreasing. (See {ml-pull}875[#875].)
* Emit `prediction_field_name` in ml results using the type provided as
`prediction_field_type` parameter. (See {ml-pull}877[#877].)
* Improve performance updating quantile estimates. (See {ml-pull}881[#881].)
* Migrate to use Bayesian Optimisation for initial hyperparameter value line searches and
stop early if the expected improvement is too small. (See {ml-pull}903[#903].)

=== Bug Fixes
* Fixes potential memory corruption when determining seasonality. (See {ml-pull}852[#852].)
Expand Down
8 changes: 0 additions & 8 deletions include/api/CDataFrameTrainBoostedTreeRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,14 +81,6 @@ class API_EXPORT CDataFrameTrainBoostedTreeRunner : public CDataFrameAnalysisRun
//! The boosted tree factory.
maths::CBoostedTreeFactory& boostedTreeFactory();

//! Factory for the largest SHAP value accumulator.
template<typename LESS>
maths::CBasicStatistics::COrderStatisticsHeap<std::size_t, LESS>
makeLargestShapAccumulator(std::size_t n, LESS less) const {
return maths::CBasicStatistics::COrderStatisticsHeap<std::size_t, LESS>{
n, std::size_t{}, less};
}

private:
using TBoostedTreeFactoryUPtr = std::unique_ptr<maths::CBoostedTreeFactory>;
using TDataSearcherUPtr = CDataFrameAnalysisSpecification::TDataSearcherUPtr;
Expand Down
12 changes: 12 additions & 0 deletions include/maths/CBasicStatistics.h
Original file line number Diff line number Diff line change
Expand Up @@ -1205,6 +1205,12 @@ class MATHS_EXPORT CBasicStatistics {
}
};

//! Make a stack based order statistics accumulator which orders with \p less.
//!
//! \tparam T The type of the statistics accumulated.
//! \tparam N The (compile time) number of order statistics maintained.
template<typename T, std::size_t N, typename LESS>
static COrderStatisticsStack<T, N, LESS> orderStatisticsAccumulator(LESS less) {
    // Deduce LESS from the argument so callers only spell out T and N.
    using TAccumulator = COrderStatisticsStack<T, N, LESS>;
    return TAccumulator{less};
}

//! \brief A heap based accumulator class for order statistics.
//!
//! DESCRIPTION:\n
Expand Down Expand Up @@ -1298,6 +1304,12 @@ class MATHS_EXPORT CBasicStatistics {
}
};

//! Make a heap based order statistics accumulator holding \p n statistics,
//! ordered with \p less.
//!
//! \tparam T The type of the statistics accumulated.
template<typename T, typename LESS>
static COrderStatisticsHeap<T, LESS> orderStatisticsAccumulator(std::size_t n, LESS less) {
    // Slots are seeded with a value-initialized T; NOTE(review): presumably
    // the heap's initial fill value — confirm against COrderStatisticsHeap.
    using TAccumulator = COrderStatisticsHeap<T, LESS>;
    return TAccumulator{n, T{}, less};
}

//! \name Accumulator Typedefs
//@{
//! Accumulator object to compute the sample maximum.
Expand Down
5 changes: 4 additions & 1 deletion include/maths/CBayesianOptimisation.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
#include <maths/CPRNG.h>
#include <maths/ImportExport.h>

#include <boost/optional.hpp>

#include <functional>
#include <utility>
#include <vector>
Expand Down Expand Up @@ -52,6 +54,7 @@ class MATHS_EXPORT CBayesianOptimisation {
public:
using TDoubleDoublePr = std::pair<double, double>;
using TDoubleDoublePrVec = std::vector<TDoubleDoublePr>;
using TOptionalDouble = boost::optional<double>;
using TVector = CDenseVector<double>;
using TLikelihoodFunc = std::function<double(const TVector&)>;
using TLikelihoodGradientFunc = std::function<TVector(const TVector&)>;
Expand All @@ -74,7 +77,7 @@ class MATHS_EXPORT CBayesianOptimisation {

//! Compute the location which maximizes the expected improvement given the
//! function evaluations added so far.
TVector maximumExpectedImprovement();
std::pair<TVector, TOptionalDouble> maximumExpectedImprovement();

//! Persist by passing information to \p inserter.
void acceptPersistInserter(core::CStatePersistInserter& inserter) const;
Expand Down
12 changes: 6 additions & 6 deletions include/maths/CBoostedTreeFactory.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class CBoostedTreeImpl;
//! Factory for CBoostedTree objects.
class MATHS_EXPORT CBoostedTreeFactory final {
public:
using TVector = CVectorNx1<double, 3>;
using TBoostedTreeUPtr = std::unique_ptr<CBoostedTree>;
using TProgressCallback = CBoostedTree::TProgressCallback;
using TMemoryUsageCallback = CBoostedTree::TMemoryUsageCallback;
Expand Down Expand Up @@ -121,12 +122,10 @@ class MATHS_EXPORT CBoostedTreeFactory final {
using TDoubleDoublePrVec = std::vector<TDoubleDoublePr>;
using TOptionalDouble = boost::optional<double>;
using TOptionalSize = boost::optional<std::size_t>;
using TVector = CVectorNx1<double, 3>;
using TOptionalVector = boost::optional<TVector>;
using TPackedBitVectorVec = std::vector<core::CPackedBitVector>;
using TBoostedTreeImplUPtr = std::unique_ptr<CBoostedTreeImpl>;
using TApplyRegularizerStep =
std::function<bool(CBoostedTreeImpl&, double, std::size_t)>;
using TApplyRegularizer = std::function<bool(CBoostedTreeImpl&, double)>;

private:
CBoostedTreeFactory(std::size_t numberThreads);
Expand Down Expand Up @@ -169,10 +168,11 @@ class MATHS_EXPORT CBoostedTreeFactory final {
//! \return The interval to search during the main hyperparameter optimisation
//! loop or null if this couldn't be found.
TOptionalVector testLossLineSearch(core::CDataFrame& frame,
const TApplyRegularizerStep& applyRegularizerStep,
const TApplyRegularizer& applyRegularizerStep,
double intervalLeftEnd,
double intervalRightEnd,
double returnedIntervalLeftEndOffset,
double returnedIntervalRightEndOffset,
double stepSize) const;
double returnedIntervalRightEndOffset) const;

//! Initialize the state for hyperparameter optimisation.
void initializeHyperparameterOptimisation() const;
Expand Down
9 changes: 5 additions & 4 deletions lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -158,10 +158,11 @@ void CDataFrameTrainBoostedTreeClassifierRunner::writeOneRow(
}

if (this->topShapValues() > 0) {
auto largestShapValues = this->makeLargestShapAccumulator(
this->topShapValues(), [&row](std::size_t lhs, std::size_t rhs) {
return std::fabs(row[lhs]) > std::fabs(row[rhs]);
});
auto largestShapValues =
maths::CBasicStatistics::orderStatisticsAccumulator<std::size_t>(
this->topShapValues(), [&row](std::size_t lhs, std::size_t rhs) {
return std::fabs(row[lhs]) > std::fabs(row[rhs]);
});
for (auto col : this->boostedTree().columnsHoldingShapValues()) {
largestShapValues.add(col);
}
Expand Down
9 changes: 5 additions & 4 deletions lib/api/CDataFrameTrainBoostedTreeRegressionRunner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -84,10 +84,11 @@ void CDataFrameTrainBoostedTreeRegressionRunner::writeOneRow(
writer.Key(IS_TRAINING_FIELD_NAME);
writer.Bool(maths::CDataFrameUtils::isMissing(row[columnHoldingDependentVariable]) == false);
if (this->topShapValues() > 0) {
auto largestShapValues = this->makeLargestShapAccumulator(
this->topShapValues(), [&row](std::size_t lhs, std::size_t rhs) {
return std::fabs(row[lhs]) > std::fabs(row[rhs]);
});
auto largestShapValues =
maths::CBasicStatistics::orderStatisticsAccumulator<std::size_t>(
this->topShapValues(), [&row](std::size_t lhs, std::size_t rhs) {
return std::fabs(row[lhs]) > std::fabs(row[rhs]);
});
for (auto col : this->boostedTree().columnsHoldingShapValues()) {
largestShapValues.add(col);
}
Expand Down
Loading

0 comments on commit a96c501

Please sign in to comment.