[ML] Early stopping in the line searches to compute initial regulariser values #903

Merged 12 commits on Dec 17, 2019
docs/CHANGELOG.asciidoc (2 additions & 0 deletions)
@@ -57,6 +57,8 @@ is no longer decreasing. (See {ml-pull}875[#875].)
 * Emit `prediction_field_name` in ml results using the type provided as
 `prediction_field_type` parameter. (See {ml-pull}877[#877].)
 * Improve performance updating quantile estimates. (See {ml-pull}881[#881].)
+* Migrate to use Bayesian Optimisation for initial hyperparameter value line searches and
+stop early if the expected improvement is too small. (See {ml-pull}903[#903].)
 
 === Bug Fixes
 * Fixes potential memory corruption when determining seasonality. (See {ml-pull}852[#852].)
include/api/CDataFrameTrainBoostedTreeRunner.h (0 additions & 8 deletions)
@@ -81,14 +81,6 @@ class API_EXPORT CDataFrameTrainBoostedTreeRunner : public CDataFrameAnalysisRun
     //! The boosted tree factory.
     maths::CBoostedTreeFactory& boostedTreeFactory();
 
-    //! Factory for the largest SHAP value accumulator.
-    template<typename LESS>
-    maths::CBasicStatistics::COrderStatisticsHeap<std::size_t, LESS>
-    makeLargestShapAccumulator(std::size_t n, LESS less) const {
-        return maths::CBasicStatistics::COrderStatisticsHeap<std::size_t, LESS>{
-            n, std::size_t{}, less};
-    }
-
 private:
     using TBoostedTreeFactoryUPtr = std::unique_ptr<maths::CBoostedTreeFactory>;
     using TDataSearcherUPtr = CDataFrameAnalysisSpecification::TDataSearcherUPtr;
include/maths/CBasicStatistics.h (12 additions & 0 deletions)
@@ -1205,6 +1205,12 @@
         }
     };
 
+    //! Make a stack based order statistics accumulator from \p less.
+    template<typename T, std::size_t N, typename LESS>
+    static COrderStatisticsStack<T, N, LESS> orderStatisticsAccumulator(LESS less) {
+        return COrderStatisticsStack<T, N, LESS>{less};
+    }
+
     //! \brief A heap based accumulator class for order statistics.
     //!
     //! DESCRIPTION:\n
@@ -1298,6 +1304,12 @@
         }
     };
 
+    //! Make a heap based order statistics accumulator from \p less.
+    template<typename T, typename LESS>
+    static COrderStatisticsHeap<T, LESS> orderStatisticsAccumulator(std::size_t n, LESS less) {
+        return COrderStatisticsHeap<T, LESS>{n, T{}, less};
+    }
+
     //! \name Accumulator Typedefs
     //@{
     //! Accumulator object to compute the sample maximum.
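As a quick illustration of the new heap factory, here is a hedged usage sketch. The function name and values are hypothetical; the comparator convention matches the SHAP call sites later in this diff, where passing a "greater than" comparator as LESS makes the accumulator retain the largest elements:

```cpp
#include <maths/CBasicStatistics.h>

#include <cstddef>

// A sketch: collect the three largest values seen. With a "greater than"
// comparator the order statistics heap keeps the top of the ordering.
void example() {
    auto top3 = maths::CBasicStatistics::orderStatisticsAccumulator<std::size_t>(
        3, [](std::size_t lhs, std::size_t rhs) { return lhs > rhs; });
    for (auto value : {5, 1, 9, 3, 7}) {
        top3.add(static_cast<std::size_t>(value));
    }
    // top3 now holds 9, 7 and 5.
}
```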
include/maths/CBayesianOptimisation.h (4 additions & 1 deletion)
@@ -16,6 +16,8 @@
 #include <maths/CPRNG.h>
 #include <maths/ImportExport.h>
 
+#include <boost/optional.hpp>
+
 #include <functional>
 #include <utility>
 #include <vector>
@@ -52,6 +54,7 @@
 public:
     using TDoubleDoublePr = std::pair<double, double>;
     using TDoubleDoublePrVec = std::vector<TDoubleDoublePr>;
+    using TOptionalDouble = boost::optional<double>;
     using TVector = CDenseVector<double>;
     using TLikelihoodFunc = std::function<double(const TVector&)>;
     using TLikelihoodGradientFunc = std::function<TVector(const TVector&)>;
@@ -74,7 +77,7 @@
 
     //! Compute the location which maximizes the expected improvement given the
     //! function evaluations added so far.
-    TVector maximumExpectedImprovement();
+    std::pair<TVector, TOptionalDouble> maximumExpectedImprovement();
 
     //! Persist by passing information to \p inserter.
     void acceptPersistInserter(core::CStatePersistInserter& inserter) const;
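To show how the new return type enables early stopping, a rough caller-side sketch follows. The iteration cap, the 1% threshold and the add(x, f(x), variance) call are illustrative assumptions, not this PR's exact logic:

```cpp
#include <maths/CBayesianOptimisation.h>

#include <algorithm>
#include <cstddef>
#include <functional>
#include <tuple>

// Hypothetical probe loop: stop once the expected improvement predicted by
// the Gaussian process falls below a small fraction of the best seen so far.
void lineSearch(maths::CBayesianOptimisation& bopt,
                const std::function<double(const maths::CBayesianOptimisation::TVector&)>& evaluateLoss) {
    using TVector = maths::CBayesianOptimisation::TVector;
    using TOptionalDouble = maths::CBayesianOptimisation::TOptionalDouble;

    double bestExpectedImprovement{0.0};
    for (std::size_t i = 0; i < 20; ++i) {
        TVector x;
        TOptionalDouble expectedImprovement;
        std::tie(x, expectedImprovement) = bopt.maximumExpectedImprovement();
        if (expectedImprovement != boost::none) {
            bestExpectedImprovement =
                std::max(bestExpectedImprovement, *expectedImprovement);
            // Further probes are unlikely to pay off: stop early.
            if (*expectedImprovement < 0.01 * bestExpectedImprovement) {
                break;
            }
        }
        // add(x, f(x), variance) registering a new evaluation is assumed here.
        bopt.add(x, evaluateLoss(x), 0.0);
    }
}
```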
include/maths/CBoostedTreeFactory.h (6 additions & 6 deletions)
@@ -32,6 +32,7 @@ class CBoostedTreeImpl;
 //! Factory for CBoostedTree objects.
 class MATHS_EXPORT CBoostedTreeFactory final {
 public:
+    using TVector = CVectorNx1<double, 3>;
     using TBoostedTreeUPtr = std::unique_ptr<CBoostedTree>;
     using TProgressCallback = CBoostedTree::TProgressCallback;
     using TMemoryUsageCallback = CBoostedTree::TMemoryUsageCallback;
@@ -121,12 +122,10 @@
     using TDoubleDoublePrVec = std::vector<TDoubleDoublePr>;
     using TOptionalDouble = boost::optional<double>;
     using TOptionalSize = boost::optional<std::size_t>;
-    using TVector = CVectorNx1<double, 3>;
     using TOptionalVector = boost::optional<TVector>;
     using TPackedBitVectorVec = std::vector<core::CPackedBitVector>;
     using TBoostedTreeImplUPtr = std::unique_ptr<CBoostedTreeImpl>;
-    using TApplyRegularizerStep =
-        std::function<bool(CBoostedTreeImpl&, double, std::size_t)>;
+    using TApplyRegularizer = std::function<bool(CBoostedTreeImpl&, double)>;
 
 private:
     CBoostedTreeFactory(std::size_t numberThreads);
@@ -169,10 +168,11 @@
     //! \return The interval to search during the main hyperparameter optimisation
     //! loop or null if this couldn't be found.
     TOptionalVector testLossLineSearch(core::CDataFrame& frame,
-                                       const TApplyRegularizerStep& applyRegularizerStep,
+                                       const TApplyRegularizer& applyRegularizerStep,
                                        double intervalLeftEnd,
                                        double intervalRightEnd,
                                        double returnedIntervalLeftEndOffset,
-                                       double returnedIntervalRightEndOffset,
-                                       double stepSize) const;
+                                       double returnedIntervalRightEndOffset) const;
 
     //! Initialize the state for hyperparameter optimisation.
     void initializeHyperparameterOptimisation() const;
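For context on the signature change, a minimal sketch of the new callback shape follows. probeCandidate and its body are hypothetical, since CBoostedTreeImpl's internals are not part of this diff; the point is that the optimiser now proposes each candidate regulariser value directly, so the step index the old TApplyRegularizerStep signature carried is no longer needed:

```cpp
#include <functional>

namespace maths { class CBoostedTreeImpl; }

using TApplyRegularizer = std::function<bool(maths::CBoostedTreeImpl&, double)>;

// The caller supplies a callback which applies the candidate regulariser
// value to the tree implementation and reports whether the search can go on.
bool probeCandidate(maths::CBoostedTreeImpl& tree,
                    const TApplyRegularizer& applyRegularizer,
                    double logRegularizer) {
    // Returns false when the candidate is rejected, which ends the search.
    return applyRegularizer(tree, logRegularizer);
}
```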
lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc (5 additions & 4 deletions)
@@ -158,10 +158,11 @@ void CDataFrameTrainBoostedTreeClassifierRunner::writeOneRow(
     }
 
     if (this->topShapValues() > 0) {
-        auto largestShapValues = this->makeLargestShapAccumulator(
-            this->topShapValues(), [&row](std::size_t lhs, std::size_t rhs) {
-                return std::fabs(row[lhs]) > std::fabs(row[rhs]);
-            });
+        auto largestShapValues =
+            maths::CBasicStatistics::orderStatisticsAccumulator<std::size_t>(
+                this->topShapValues(), [&row](std::size_t lhs, std::size_t rhs) {
+                    return std::fabs(row[lhs]) > std::fabs(row[rhs]);
+                });
         for (auto col : this->boostedTree().columnsHoldingShapValues()) {
             largestShapValues.add(col);
         }
lib/api/CDataFrameTrainBoostedTreeRegressionRunner.cc (5 additions & 4 deletions)
@@ -84,10 +84,11 @@ void CDataFrameTrainBoostedTreeRegressionRunner::writeOneRow(
     writer.Key(IS_TRAINING_FIELD_NAME);
     writer.Bool(maths::CDataFrameUtils::isMissing(row[columnHoldingDependentVariable]) == false);
     if (this->topShapValues() > 0) {
-        auto largestShapValues = this->makeLargestShapAccumulator(
-            this->topShapValues(), [&row](std::size_t lhs, std::size_t rhs) {
-                return std::fabs(row[lhs]) > std::fabs(row[rhs]);
-            });
+        auto largestShapValues =
+            maths::CBasicStatistics::orderStatisticsAccumulator<std::size_t>(
+                this->topShapValues(), [&row](std::size_t lhs, std::size_t rhs) {
+                    return std::fabs(row[lhs]) > std::fabs(row[rhs]);
+                });
         for (auto col : this->boostedTree().columnsHoldingShapValues()) {
             largestShapValues.add(col);
         }