diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index 9c8f936e55..23b67e1d94 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -42,6 +42,8 @@ classification and regression models. (See {ml-pull}2251[#2251].) * Fix possible source of "x = NaN, distribution = class boost::math::normal_distribution<..." log errors training classification and regression models. (See {ml-pull}2249[#2249].) +* Fix some bugs affecting the decision to stop optimising hyperparameters for training + classification and regression models. (See {ml-pull}2259[#2259].) == {es} version 8.2.0 diff --git a/include/maths/common/CBayesianOptimisation.h b/include/maths/common/CBayesianOptimisation.h index 81bf6f25c2..f4719bfda3 100644 --- a/include/maths/common/CBayesianOptimisation.h +++ b/include/maths/common/CBayesianOptimisation.h @@ -125,9 +125,6 @@ class MATHS_COMMON_EXPORT CBayesianOptimisation { //! of the total variance. TDoubleDoublePrVec anovaMainEffects() const; - //! Set kernel \p parameters explicitly. - void kernelParameters(const TVector& parameters); - //! Get the memory used by this object. std::size_t memoryUsage() const; @@ -142,6 +139,9 @@ class MATHS_COMMON_EXPORT CBayesianOptimisation { //! \name Test Interface //@{ + //! Set kernel \p parameters explicitly. + void kernelParameters(const TVector& parameters); + //! Get minus the data likelihood and its gradient as a function of the kernel //! hyperparameters. std::pair<TLikelihoodFunc, TLikelihoodGradientFunc> minusLikelihoodAndGradient() const; @@ -189,8 +189,9 @@ class MATHS_COMMON_EXPORT CBayesianOptimisation { TVectorDoublePr kernelCovariates(const TVector& a, const TVector& x, double vx) const; double kernel(const TVector& a, const TVector& x, const TVector& y) const; TVector kinvf() const; - TVector transformTo01(const TVector& x) const; double dissimilarity(const TVector& x) const; + TVector to01(TVector x) const; + TVector from01(TVector x) const; void checkRestoredInvariants() const; private: diff --git a/lib/maths/analytics/unittest/CBoostedTreeTest.cc b/lib/maths/analytics/unittest/CBoostedTreeTest.cc index 60cd3ae7bd..8d3ec34b08 100644 --- a/lib/maths/analytics/unittest/CBoostedTreeTest.cc +++ b/lib/maths/analytics/unittest/CBoostedTreeTest.cc @@ -569,7 +569,7 @@ BOOST_AUTO_TEST_CASE(testLinear) { 0.0, modelBias[i][0], 6.0 * std::sqrt(noiseVariance / static_cast<double>(trainRows))); // Good R^2...
- BOOST_TEST_REQUIRE(modelRSquared[i][0] > 0.97); + BOOST_TEST_REQUIRE(modelRSquared[i][0] > 0.94); meanModelRSquared.add(modelRSquared[i][0]); } @@ -879,7 +879,7 @@ BOOST_AUTO_TEST_CASE(testLowCardinalityFeatures) { target, noiseVariance / static_cast<double>(rows)); LOG_DEBUG(<< "bias = " << bias << ", rSquared = " << rSquared); - BOOST_TEST_REQUIRE(rSquared > 0.96); + BOOST_TEST_REQUIRE(rSquared > 0.95); } BOOST_AUTO_TEST_CASE(testLowTrainFractionPerFold) { @@ -1189,7 +1189,7 @@ BOOST_AUTO_TEST_CASE(testCategoricalRegressors) { LOG_DEBUG(<< "bias = " << modelBias); LOG_DEBUG(<< " R^2 = " << modelRSquared); BOOST_REQUIRE_CLOSE_ABSOLUTE(0.0, modelBias, 0.1); - BOOST_TEST_REQUIRE(modelRSquared > 0.98); + BOOST_TEST_REQUIRE(modelRSquared > 0.97); } BOOST_AUTO_TEST_CASE(testFeatureBags) { diff --git a/lib/maths/common/CBayesianOptimisation.cc b/lib/maths/common/CBayesianOptimisation.cc index fc6411853d..577dafe1d6 100644 --- a/lib/maths/common/CBayesianOptimisation.cc +++ b/lib/maths/common/CBayesianOptimisation.cc @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -30,6 +31,7 @@ #include #include +#include #include #include @@ -61,6 +63,16 @@ const std::string RNG_TAG{"rng"}; // narrow deep valley that the Gaussian Process hasn't sampled. const double MINIMUM_KERNEL_SCALE_FOR_EXPECTATION_MAXIMISATION{1e-8}; +//! Affine transform \p scale * (\p fx - \p shift). +double toScaled(double shift, double scale, double fx) { + return scale * (fx - shift); +} + +//! Affine transform \p shift + \p fx / \p scale. +double fromScaled(double shift, double scale, double fx) { + return shift + fx / scale; +} + //! A version of the normal c.d.f. which is stable across our target platforms. double stableNormCdf(double z) { return CTools::stable(CTools::safeCdf(boost::math::normal{0.0, 1.0}, z)); } @@ -114,9 +126,32 @@ CBayesianOptimisation::CBayesianOptimisation(core::CStateRestoreTraverser& trave } void CBayesianOptimisation::add(TVector x, double fx, double vx) { - m_FunctionMeanValues.emplace_back(x.cwiseQuotient(m_MaxBoundary - m_MinBoundary), - m_RangeScale * (fx - m_RangeShift)); - m_ErrorVariances.push_back(CTools::pow2(m_RangeScale) * vx); + if (CMathsFuncs::isFinite(fx) == false || CMathsFuncs::isFinite(vx) == false) { + LOG_ERROR(<< "Discarding point (" << x.transpose() << "," << fx << "," << vx << ")"); + return; + } + + x = this->to01(std::move(x)); + fx = toScaled(m_RangeShift, m_RangeScale, fx); + vx = CTools::pow2(m_RangeScale) * vx; + + std::size_t duplicate(std::find_if(m_FunctionMeanValues.begin(), + m_FunctionMeanValues.end(), + [&](const auto& value) { + return (x - value.first).norm() == 0.0; + }) - + m_FunctionMeanValues.begin()); + if (duplicate < m_FunctionMeanValues.size()) { + auto& f = m_FunctionMeanValues[duplicate].second; + auto& v = m_ErrorVariances[duplicate]; + auto moments = CBasicStatistics::momentsAccumulator(1.0, f, v) + + CBasicStatistics::momentsAccumulator(1.0, fx, vx); + f = CBasicStatistics::mean(moments); + v = CBasicStatistics::maximumLikelihoodVariance(moments); + } else { + m_FunctionMeanValues.emplace_back(std::move(x), fx); + m_ErrorVariances.push_back(vx); + } } void CBayesianOptimisation::explainedErrorVariance(double vx) { @@ -146,8 +181,8 @@ CBayesianOptimisation::maximumExpectedImprovement(double negligibleExpectedImpro TDoubleVec interpolates; CSampling::uniformSample(m_Rng, 0.0, 1.0, 10 * m_Restarts * interpolate.size(), interpolates); - TVector a{m_MinBoundary.cwiseQuotient(m_MaxBoundary - m_MinBoundary)}; - TVector
b{m_MaxBoundary.cwiseQuotient(m_MaxBoundary - m_MinBoundary)}; + TVector a{TVector::Zero(m_MinBoundary.size())}; + TVector b{TVector::Ones(m_MaxBoundary.size())}; TVector x; TMeanAccumulator rho_; TMinAccumulator probes{m_Restarts}; @@ -217,7 +252,7 @@ CBayesianOptimisation::maximumExpectedImprovement(double negligibleExpectedImpro expectedImprovement = fmax / m_RangeScale; } - xmax = xmax.cwiseProduct(m_MaxBoundary - m_MinBoundary); + xmax = this->from01(std::move(xmax)); LOG_TRACE(<< "best = " << xmax.transpose() << " EI(best) = " << expectedImprovement); return {std::move(xmax), expectedImprovement}; @@ -230,9 +265,8 @@ double CBayesianOptimisation::evaluate(const TVector& input) const { double CBayesianOptimisation::evaluate(const TVector& Kinvf, const TVector& input) const { TVector Kxn; std::tie(Kxn, std::ignore) = this->kernelCovariates( - m_KernelParameters, input.cwiseQuotient(m_MaxBoundary - m_MinBoundary), - this->meanErrorVariance()); - return Kxn.transpose() * Kinvf; + m_KernelParameters, this->to01(input), this->meanErrorVariance()); + return fromScaled(m_RangeShift, m_RangeScale, Kxn.transpose() * Kinvf); } double CBayesianOptimisation::evaluate1D(const TVector& Kinvf, double input, int dimension) const { @@ -250,20 +284,29 @@ double CBayesianOptimisation::evaluate1D(const TVector& Kinvf, double input, int input = (input - m_MinBoundary(dimension)) / (m_MaxBoundary(dimension) - m_MinBoundary(dimension)); for (std::size_t i = 0; i < m_FunctionMeanValues.size(); ++i) { - TVector x{this->transformTo01(m_FunctionMeanValues[i].first)}; + const TVector& x{m_FunctionMeanValues[i].first}; sum += Kinvf(static_cast(i)) * CTools::stableExp(-(CTools::pow2(m_KernelParameters[dimension + 1]) + MINIMUM_KERNEL_COORDINATE_DISTANCE_SCALE) * CTools::pow2(input - x(dimension))) * prodXt(x, dimension); } - - // We rewrite theta_0^2 sum - f_0 as (theta + f) * (theta - f) where - // theta = theta_0 sum^(1/2) and f = f_0^(1/2) because it has better - // numerics. - double theta{m_KernelParameters(0) * std::sqrt(sum)}; - double f{std::sqrt(this->anovaConstantFactor(Kinvf))}; - return (theta + f) * (theta - f); + double f2{this->anovaConstantFactor(Kinvf)}; + + // We only get cancellation if the signs are the same (and we need also + // to take the square root of both sum and f2 for which they need to be + // positive). + if (std::signbit(sum) == std::signbit(f2)) { + // We rewrite theta_0^2 sum - f_0 as (theta + f) * (theta - f) where + // theta = theta_0 sum^(1/2) and f = f_0^(1/2) because it has better + // numerics. 
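// A minimal standalone sketch of the sign-aware factoring used in this branch: when
// sum and f2 share a sign, theta_0^2 * sum - f2 is evaluated as (theta + f) * (theta - f)
// with theta = theta_0 * sqrt(|sum|) and f = sqrt(|f2|), and the sign is restored with
// copysign. The function name and the values in main are illustrative only, not part
// of the library.
#include <cmath>
#include <iostream>

double signAwareDifference(double theta0, double sum, double f2) {
    if (std::signbit(sum) == std::signbit(f2)) {
        // Factored form: avoids squaring theta0 and then subtracting two large,
        // nearly equal terms when theta0^2 * |sum| is close to |f2|.
        double theta{theta0 * std::sqrt(std::fabs(sum))};
        double f{std::sqrt(std::fabs(f2))};
        return std::copysign(1.0, sum) * (theta + f) * (theta - f);
    }
    // Different signs: no cancellation, so the direct form is fine.
    return theta0 * theta0 * sum - f2;
}

int main() {
    // Example where theta0^2 * sum and f2 agree to many digits.
    std::cout << signAwareDifference(1.0e3, 1.0 + 1.0e-10, 1.0e6) << '\n';
    return 0;
}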
+ double theta{m_KernelParameters(0) * std::sqrt(std::fabs(sum))}; + double f{std::sqrt(std::fabs(f2))}; + return fromScaled(m_RangeShift, m_RangeScale, + std::copysign(1.0, sum) * (theta + f) * (theta - f)); + } + return fromScaled(m_RangeShift, m_RangeScale, + CTools::pow2(m_KernelParameters(0)) * sum - f2); } double CBayesianOptimisation::evaluate1D(double input, int dimension) const { @@ -279,7 +322,7 @@ double CBayesianOptimisation::anovaConstantFactor(const TVector& Kinvf) const { double sum{0.0}; for (std::size_t i = 0; i < m_FunctionMeanValues.size(); ++i) { double prod{1.0}; - TVector x{this->transformTo01(m_FunctionMeanValues[i].first)}; + const TVector& x{m_FunctionMeanValues[i].first}; for (int d = 0; d < x.size(); ++d) { prod *= integrate1dKernel(m_KernelParameters(d + 1), x(d)); } @@ -294,8 +337,8 @@ double CBayesianOptimisation::anovaConstantFactor() const { double CBayesianOptimisation::anovaTotalVariance(const TVector& Kinvf) const { auto prodIj = [&Kinvf, this](std::size_t i, std::size_t j) -> double { - TVector xi{this->transformTo01(m_FunctionMeanValues[i].first)}; - TVector xj{this->transformTo01(m_FunctionMeanValues[j].first)}; + const TVector& xi{m_FunctionMeanValues[i].first}; + const TVector& xj{m_FunctionMeanValues[j].first}; double prod{1.0}; for (int d = 0; d < xi.size(); ++d) { prod *= integrate1dKernelProduct(m_KernelParameters[d + 1], xi(d), xj(d)); @@ -311,17 +354,22 @@ double CBayesianOptimisation::anovaTotalVariance(const TVector& Kinvf) const { } } - // We rewrite theta_0^4 sum - f_0^2 as (theta^2 + f_0) * (theta^2 - f_0) - // where theta^2 = theta_0^2 sum^(1/2) because it has better numerics. - double theta2{CTools::pow2(m_KernelParameters(0)) * std::sqrt(sum)}; + double theta2{CTools::pow2(m_KernelParameters(0))}; double f0{this->anovaConstantFactor(Kinvf)}; - double variance{(theta2 + f0) * (theta2 - f0)}; - return std::max(0.0, variance); + double scale2{CTools::pow2(m_RangeScale)}; + if (sum > 0.0) { + // We rewrite theta_0^4 sum - f_0^2 as (theta^2 + f_0) * (theta^2 - f_0) + // where theta^2 = theta_0^2 sum^(1/2) because it has better numerics. 
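// To see the identity used here: with theta^2 = theta_0^2 * sqrt(sum) we have
// (theta^2)^2 = theta_0^4 * sum, so (theta^2 + f_0) * (theta^2 - f_0) equals
// theta_0^4 * sum - f_0^2 without forming the two large terms separately first.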
+ theta2 *= std::sqrt(sum); + double variance{(theta2 + f0) * (theta2 - f0)}; + return std::max(0.0, variance / scale2); + } + return std::max(0.0, (theta2 * theta2 * sum - f0 * f0) / scale2); } double CBayesianOptimisation::anovaTotalCoefficientOfVariation() { this->precondition(); - return std::sqrt(this->anovaTotalVariance()) * m_RangeScale / m_RangeShift; + return std::sqrt(this->anovaTotalVariance()) / m_RangeShift; } double CBayesianOptimisation::anovaTotalVariance() const { @@ -341,9 +389,9 @@ double CBayesianOptimisation::anovaMainEffect(const TVector& Kinvf, int dimensio double sum1{0.0}; double sum2{0.0}; for (std::size_t i = 0; i < m_FunctionMeanValues.size(); ++i) { - TVector xi{this->transformTo01(m_FunctionMeanValues[i].first)}; + const TVector& xi{m_FunctionMeanValues[i].first}; for (std::size_t j = 0; j < m_FunctionMeanValues.size(); ++j) { - TVector xj{this->transformTo01(m_FunctionMeanValues[j].first)}; + const TVector& xj{m_FunctionMeanValues[j].first}; sum1 += Kinvf(static_cast(i)) * Kinvf(static_cast(j)) * prodXt(xi, dimension) * prodXt(xj, dimension) * integrate1dKernelProduct(m_KernelParameters(dimension + 1), @@ -353,12 +401,11 @@ double CBayesianOptimisation::anovaMainEffect(const TVector& Kinvf, int dimensio integrate1dKernel(m_KernelParameters(dimension + 1), xi(dimension)) * prodXt(xi, dimension); } + double scale2{CTools::pow2(m_RangeScale)}; double theta02{CTools::pow2(m_KernelParameters(0))}; - double theta04{CTools::pow2(theta02)}; double f0{this->anovaConstantFactor()}; double f02{CTools::pow2(f0)}; - double scale{std::max(1.0, theta04)}; // prevent cancellation errors - return scale * (theta04 * sum1 / scale - 2 * theta02 * sum2 * f0 / scale + f02 / scale); + return (theta02 * (theta02 * sum1 - 2.0 * f0 * sum2) + f02) / scale2; } double CBayesianOptimisation::anovaMainEffect(int dimension) const { @@ -375,7 +422,7 @@ CBayesianOptimisation::TDoubleDoublePrVec CBayesianOptimisation::anovaMainEffect mainEffects.reserve(static_cast(m_MinBoundary.size())); TVector Kinvf{this->kinvf()}; double f0{this->anovaConstantFactor(Kinvf)}; - double totalVariance(this->anovaTotalVariance(Kinvf)); + double totalVariance{this->anovaTotalVariance(Kinvf)}; for (int i = 0; i < m_MinBoundary.size(); ++i) { double effect{this->anovaMainEffect(Kinvf, i)}; mainEffects.emplace_back(effect, effect / totalVariance); @@ -389,6 +436,8 @@ CBayesianOptimisation::TDoubleDoublePrVec CBayesianOptimisation::anovaMainEffect void CBayesianOptimisation::kernelParameters(const TVector& parameters) { if (m_KernelParameters.size() == parameters.size()) { m_KernelParameters = parameters; + m_RangeShift = 0.0; + m_RangeScale = 1.0; } } @@ -399,40 +448,45 @@ CBayesianOptimisation::minusLikelihoodAndGradient() const { double v{this->meanErrorVariance()}; TVector ones; TVector gradient; - Eigen::LDLT Kldl; + Eigen::ColPivHouseholderQR Kqr; TMatrix K; TVector Kinvf; TMatrix Kinv; TMatrix dKdai; + double eps{1e-4}; + + // We need to be careful when we compute the kernel decomposition. Basically, + // if the kernel matrix is singular to working precision then if the function + // value vector projection onto the null-space has non-zero length the likelihood + // function is effectively -infinity. This follow from the fact that although + // log(1 / lambda_i) -> +infinity, -1/2 sum_i{ ||f_i||^2 / lambda_i } -> -infinity + // faster for all ||f_i|| > 0 and lambda_i sufficiently small. Here {lambda_i} + // denote the Eigenvalues of the nullspace. 
We use a rank revealing decomposition + // and compute the likelihood on the row space. auto minusLogLikelihood = [=](const TVector& a) mutable -> double { - K = this->kernel(a, v); - Kldl.compute(K); - Kinvf.noalias() = f; - Kldl.solveInPlace(Kinvf); - - // We can only determine values up to eps * "max diagonal". If the diagonal - // has a zero it blows up the determinant term. In practice, we know the - // kernel can't be singular by construction so we perturb the diagonal by - // the numerical error in such a way as to recover a non-singular matrix. - // (Note that the solve routine deals with the zero for us.) - double eps{std::numeric_limits::epsilon() * Kldl.vectorD().maxCoeff()}; - return 0.5 * (f.transpose() * Kinvf + - Kldl.vectorD().cwiseMax(eps).array().log().sum()); + K = this->kernel(a, v + eps); + Kqr.compute(K); + Kinvf.noalias() = Kqr.solve(f); + // Note that Kqr.logAbsDeterminant() = -infinity if K is singular. + double logAbsDet{0.0}; + for (int i = 0; i < Kqr.rank(); ++i) { + logAbsDet += std::log(std::fabs(Kqr.matrixR()(i, i))); + } + logAbsDet = CTools::stable(logAbsDet); + return 0.5 * (f.transpose() * Kinvf + logAbsDet); }; auto minusLogLikelihoodGradient = [=](const TVector& a) mutable -> TVector { - K = this->kernel(a, v); - Kldl.compute(K); + K = this->kernel(a, v + eps); + Kqr.compute(K); - Kinvf.noalias() = f; - Kldl.solveInPlace(Kinvf); + Kinvf.noalias() = Kqr.solve(f); ones = TVector::Ones(f.size()); - Kinv.noalias() = TMatrix::Identity(f.size(), f.size()); - Kldl.solveInPlace(Kinv); + Kinv.noalias() = Kqr.solve(TMatrix::Identity(f.size(), f.size())); - K.diagonal() -= v * ones; + K.diagonal() -= (v + eps) * ones; gradient = TVector::Zero(a.size()); for (int i = 0; i < Kinvf.size(); ++i) { @@ -553,46 +607,57 @@ CBayesianOptimisation::minusExpectedImprovementAndGradient() const { const CBayesianOptimisation::TVector& CBayesianOptimisation::maximumLikelihoodKernel() { - // Use random restarts of L-BFGS to find maximum likelihood parameters. + if (m_FunctionMeanValues.size() < 2) { + return m_KernelParameters; + } - this->precondition(); + using TDoubleVecVec = std::vector; - std::size_t n(m_KernelParameters.size()); - - // We restart optimization with initial guess on different scales for global probing. - TDoubleVec scales; - scales.reserve(10 * (m_Restarts - 1) * n); - CSampling::uniformSample(m_Rng, CTools::stableLog(0.2), CTools::stableLog(5.0), - 10 * (m_Restarts - 1) * n, scales); + this->precondition(); TLikelihoodFunc l; TLikelihoodGradientFunc g; std::tie(l, g) = this->minusLikelihoodAndGradient(); + CLbfgs lbfgs{10}; + + double lmax{l(m_KernelParameters)}; + TVector amax{m_KernelParameters}; + + // Try the current values first. + double la; + TVector a; + std::tie(a, la) = lbfgs.minimize(l, g, m_KernelParameters, 1e-8, 75); + if (COrderings::lexicographical_compare(la, a.norm(), lmax, amax.norm())) { + lmax = la; + amax = a; + } + TMinAccumulator probes{m_Restarts - 1}; - TVector scale{n}; - for (std::size_t i = 0; i < scales.size(); /**/) { - TVector a{m_KernelParameters}; - for (std::size_t j = 0; j < n; ++i, ++j) { - scale(j) = CTools::stableExp(scales[i]); + // We restart optimization with scales of the current values for global probing. 
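// A standalone sketch of the restart strategy described in the comment above: each
// restart multiplies every coordinate of the current kernel parameters by a factor in
// [0.2, 2.0], obtained by mapping a point of a space-filling sequence in [0, 1]^n
// through linear interpolation in log space. Plain uniform random numbers stand in
// for the Sobol sequence here, and all names are illustrative.
#include <cmath>
#include <cstddef>
#include <iostream>
#include <random>
#include <vector>

std::vector<double> scaledRestart(const std::vector<double>& current,
                                  const std::vector<double>& unitPoint) {
    std::vector<double> restart(current.size());
    for (std::size_t j = 0; j < current.size(); ++j) {
        // Interpolate log(0.2) -> log(2.0) as unitPoint[j] runs over [0, 1].
        double logScale{std::log(0.2) + unitPoint[j] * (std::log(2.0) - std::log(0.2))};
        restart[j] = current[j] * std::exp(logScale);
    }
    return restart;
}

int main() {
    std::mt19937 rng{42};
    std::uniform_real_distribution<double> u01{0.0, 1.0};
    std::vector<double> kernelParameters{1.0, 0.5, 0.5, 0.5};
    for (int restart = 0; restart < 3; ++restart) {
        std::vector<double> unitPoint(kernelParameters.size());
        for (auto& x : unitPoint) {
            x = u01(rng);
        }
        for (double a : scaledRestart(kernelParameters, unitPoint)) {
            std::cout << a << ' ';
        }
        std::cout << '\n';
    }
    return 0;
}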
+ std::size_t n(m_KernelParameters.size()); + TDoubleVecVec scales; + scales.reserve(10 * (m_Restarts - 1)); + CSampling::sobolSequenceSample(n, 10 * (m_Restarts - 1), scales); + + for (const auto& scale : scales) { + a.noalias() = m_KernelParameters; + for (std::size_t j = 0; j < n; ++j) { + a(j) *= CTools::stableExp(CTools::linearlyInterpolate( + 0.0, 1.0, std::log(0.2), std::log(2.0), scale[j])); + } + la = l(a); + if (COrderings::lexicographical_compare(la, a.norm(), lmax, amax.norm())) { + lmax = la; + amax = a; } - a.array() *= scale.array(); - double la{l(a)}; probes.add({la, std::move(a)}); } - CLbfgs lbfgs{10}; - - double lmax; - TVector amax; - std::tie(amax, lmax) = lbfgs.minimize(l, g, m_KernelParameters, 1e-8, 75); - - double la; - TVector a; for (auto& a0 : probes) { std::tie(a, la) = lbfgs.minimize(l, g, std::move(a0.second), 1e-8, 75); - if (COrderings::lexicographical_compare(la, a, lmax, amax)) { + if (COrderings::lexicographical_compare(la, a.norm(), lmax, amax.norm())) { lmax = la; amax = std::move(a); } @@ -615,7 +680,7 @@ void CBayesianOptimisation::precondition() { // for different loss functions. for (auto& value : m_FunctionMeanValues) { - value.second = m_RangeShift + value.second / m_RangeScale; + value.second = fromScaled(m_RangeShift, m_RangeScale, value.second); } for (auto& variance : m_ErrorVariances) { variance /= CTools::pow2(m_RangeScale); @@ -633,7 +698,7 @@ void CBayesianOptimisation::precondition() { : 1.0 / std::sqrt(CBasicStatistics::variance(valueMoments)); for (auto& value : m_FunctionMeanValues) { - value.second = m_RangeScale * (value.second - m_RangeShift); + value.second = toScaled(m_RangeShift, m_RangeScale, value.second); } for (auto& variance : m_ErrorVariances) { variance *= CTools::pow2(m_RangeScale); @@ -737,10 +802,6 @@ CBayesianOptimisation::TVector CBayesianOptimisation::kinvf() const { return Kinvf; } -CBayesianOptimisation::TVector CBayesianOptimisation::transformTo01(const TVector& x) const { - return x - m_MinBoundary.cwiseQuotient(m_MaxBoundary - m_MinBoundary); -} - double CBayesianOptimisation::dissimilarity(const TVector& x) const { // This is used as a fallback when GP is very unsure we can actually make progress, // i.e. EI is miniscule. In this case we fallback to a different strategy to break @@ -761,6 +822,17 @@ double CBayesianOptimisation::dissimilarity(const TVector& x) const { return sum / static_cast(m_FunctionMeanValues.size()) + min; } +CBayesianOptimisation::TVector CBayesianOptimisation::to01(TVector x) const { + // Self assign so operations are performed inplace. + x = (x - m_MinBoundary).cwiseQuotient(m_MaxBoundary - m_MinBoundary); + return x; +} + +CBayesianOptimisation::TVector CBayesianOptimisation::from01(TVector x) const { + x = m_MinBoundary + x.cwiseProduct(m_MaxBoundary - m_MinBoundary); + return x; +} + void CBayesianOptimisation::acceptPersistInserter(core::CStatePersistInserter& inserter) const { try { core::CPersistUtils::persist(VERSION_7_5_TAG, "", inserter); diff --git a/lib/maths/common/unittest/CBayesianOptimisationTest.cc b/lib/maths/common/unittest/CBayesianOptimisationTest.cc index ff9ef01e77..4c1c93cf14 100644 --- a/lib/maths/common/unittest/CBayesianOptimisationTest.cc +++ b/lib/maths/common/unittest/CBayesianOptimisationTest.cc @@ -9,6 +9,7 @@ * limitation. 
*/ +#include #include #include @@ -23,6 +24,7 @@ #include +#include #include BOOST_AUTO_TEST_SUITE(CBayesianOptimisationTest) @@ -33,6 +35,15 @@ namespace { using TDoubleVec = std::vector; using TDoubleVecVec = std::vector; using TVector = maths::common::CDenseVector; +using TVectorVec = std::vector; +using TMeanAccumulator = maths::common::CBasicStatistics::SSampleMean::TAccumulator; +struct SFunctionParams { + double s_Xl; + double s_Xu; + double s_F0; + double s_Scale; +}; +using TFunctionParamsVec = std::vector; TVector vector(TDoubleVec components) { TVector result(components.size()); @@ -176,16 +187,14 @@ BOOST_AUTO_TEST_CASE(testMaximumLikelihoodKernel) { for (std::size_t test = 0; test < 50; ++test) { maths::common::CBayesianOptimisation bopt{ - {{-10.0, 10.0}, {-10.0, 10.0}, {-10.0, 10.0}, {-10.0, 10.0}}}; - - double scale{5.0 / std::sqrt(static_cast(test + 1))}; + {{0.0, 10.0}, {0.0, 10.0}, {0.0, 10.0}, {0.0, 10.0}}}; for (std::size_t i = 0; i < 10; ++i) { rng.generateUniformSamples(-10.0, 10.0, 4, coordinates); rng.generateNormalSamples(0.0, 2.0, 1, noise); TVector x{vector(coordinates)}; - double fx{scale * x.squaredNorm() + noise[0]}; - bopt.add(x, fx, 1.0); + double fx{x.squaredNorm() + noise[0]}; + bopt.add(x, fx, 0.1); } TVector parameters{bopt.maximumLikelihoodKernel()}; @@ -202,9 +211,9 @@ BOOST_AUTO_TEST_CASE(testMaximumLikelihoodKernel) { TVector eps{parameters.size()}; eps.setZero(); for (std::size_t i = 0; i < 4; ++i) { - eps(i) = 0.1; + eps(i) = 0.01; double minusLPlusEps{l(parameters + eps)}; - eps(i) = -0.1; + eps(i) = -0.01; double minusLMinusEps{l(parameters + eps)}; eps(i) = 0.0; BOOST_TEST_REQUIRE(minusML < minusLPlusEps); @@ -268,8 +277,6 @@ BOOST_AUTO_TEST_CASE(testMaximumExpectedImprovement) { // We check the value of the function we find after fixed number of iterations // vs a random search baseline. - using TMeanAccumulator = maths::common::CBasicStatistics::SSampleMean::TAccumulator; - test::CRandomNumbers rng; TDoubleVec centreCoordinates; TDoubleVec coordinateScales; @@ -359,14 +366,130 @@ BOOST_AUTO_TEST_CASE(testMaximumExpectedImprovement) { << 100.0 * maths::common::CBasicStatistics::mean(meanImprovementRs)); BOOST_TEST_REQUIRE(wins > static_cast(0.95 * 50)); // 95% better BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(meanImprovementBopt) > - 1.6 * maths::common::CBasicStatistics::mean(meanImprovementRs)); // 60% mean improvement + 1.5 * maths::common::CBasicStatistics::mean(meanImprovementRs)); // 50% mean improvement +} + +BOOST_AUTO_TEST_CASE(testKernelInvariants) { + + // Test that the kernel parameters we estimate do not change when: + // 1. Changing the function domain, + // 2. Changing the function level, + // 3. Linearly scaling the function. 
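// A small standalone sketch of why these invariants can hold: observations are mapped
// to the unit cube, and function values are standardised with shift = mean and
// scale = 1 / standard deviation, so applying f -> a * f + b or stretching the domain
// leaves the values the Gaussian process actually fits unchanged. The helper below is
// illustrative only and assumes that shift/scale convention.
#include <cmath>
#include <iostream>
#include <vector>

std::vector<double> standardise(std::vector<double> values) {
    double mean{0.0};
    for (double v : values) {
        mean += v;
    }
    mean /= static_cast<double>(values.size());
    double variance{0.0};
    for (double v : values) {
        variance += (v - mean) * (v - mean);
    }
    variance /= static_cast<double>(values.size());
    double scale{variance > 0.0 ? 1.0 / std::sqrt(variance) : 1.0};
    for (double& v : values) {
        v = scale * (v - mean);
    }
    return values;
}

int main() {
    std::vector<double> f{1.0, 2.0, 4.0, 8.0};
    std::vector<double> g{f};
    for (double& v : g) {
        v = 2.0 * v + 10.0; // Shift the level and linearly scale.
    }
    // Both loops print the same standardised values.
    for (double v : standardise(f)) {
        std::cout << v << ' ';
    }
    std::cout << '\n';
    for (double v : standardise(g)) {
        std::cout << v << ' ';
    }
    std::cout << '\n';
    return 0;
}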
+ + TFunctionParamsVec tests{{0.0, 100.0, 0.0, 1.0}, + {0.0, 1000.0, 0.0, 1.0}, + {0.0, 100.0, 10.0, 1.0}, + {0.0, 100.0, 0.0, 2.0}}; + + TVectorVec kernelParameters; + + for (const auto& test : tests) { + + test::CRandomNumbers rng; + + std::size_t dimension{2}; + double xl{test.s_Xl}; + double xu{test.s_Xu}; + double f0{test.s_F0}; + double scale{test.s_Scale}; + + TDoubleVec coords; + rng.generateUniformSamples(xl, xu, dimension * 20, coords); + + maths::common::CBayesianOptimisation::TDoubleDoublePrVec bb; + for (std::size_t i = 0; i < dimension; ++i) { + bb.emplace_back(xl, xu); + } + + maths::common::CBayesianOptimisation bopt{bb}; + for (std::size_t i = 0; i < 10; ++i) { + TVector x{dimension}; + for (std::size_t j = 0; j < dimension; ++j) { + x(j) = coords[i * dimension + j]; + } + bopt.maximumLikelihoodKernel(); + bopt.add(x, scale * x.norm() + f0, scale * scale * (xu - xl) * (xu - xl) * 0.0001); + } + + kernelParameters.push_back(bopt.maximumLikelihoodKernel()); + } + + for (std::size_t i = 1; i < kernelParameters.size(); ++i) { + BOOST_TEST_REQUIRE((kernelParameters[i] - kernelParameters[0]).norm() < 1e-6); + } +} + +BOOST_AUTO_TEST_CASE(testForSingularKernel) { + + // Explore some edge cases where the kernel can go singular. + + // Test that decreasing additive variance. In this case the maximum likelihood + // can decide it is a good idea to force a singular kernel matrix if we don't + // compute the likelihood carefully. We should see the kernel parameters smoothly + // converge towards the case the additive variance is zero as we reduce it. + TVectorVec kernels; + std::size_t dimension{3}; + std::size_t n{30}; + double xl{-100.0}; + double xu{100.0}; + for (auto v : {0.1, 0.01, 0.001, 0.00001, 0.0}) { + test::CRandomNumbers rng; + TDoubleVec coords; + rng.generateUniformSamples(xl, xu, dimension * n, coords); + + maths::common::CBayesianOptimisation::TDoubleDoublePrVec bb; + for (std::size_t i = 0; i < dimension; ++i) { + bb.emplace_back(xl, xu); + } + + maths::common::CBayesianOptimisation bopt{bb}; + for (std::size_t i = 0; i < n; ++i) { + TVector x{dimension}; + for (std::size_t j = 0; j < dimension; ++j) { + x(j) = coords[i * dimension + j]; + } + bopt.maximumLikelihoodKernel(); + bopt.add(x, x.norm(), v); + } + + auto kernel = bopt.maximumLikelihoodKernel(); + LOG_DEBUG(<< "kernel = " << kernel.transpose()); + kernels.push_back(kernel); + } + + double lastNorm{std::numeric_limits::max()}; + for (std::size_t i = 0; i + 1 < kernels.size(); ++i) { + double norm{(kernels[kernels.size() - 1] - kernels[i]).norm()}; + BOOST_TEST_REQUIRE(norm < lastNorm); + BOOST_TEST_REQUIRE(norm / kernels[i].norm() < 0.05); + lastNorm = norm; + } + + // Adding a duplicate point would create a singular kernel if we didn't explicitly + // deduplicate. + maths::common::CBayesianOptimisation::TDoubleDoublePrVec bb{{0.0, 1.0}}; + maths::common::CBayesianOptimisation bopt{bb}; + for (auto x : {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9}) { + bopt.add(x * TVector::Ones(1), 1.0 + x, 0.0); + bopt.maximumLikelihoodKernel(); + } + auto kernelBefore = bopt.maximumLikelihoodKernel(); + LOG_DEBUG(<< "kernel before duplicate = " << kernelBefore.transpose()); + + bopt.add(0.1 * TVector::Ones(1), 1.0 + 0.1, 0.0); + auto kernelAfter = bopt.maximumLikelihoodKernel(); + LOG_DEBUG(<< "kernel after duplicate = " << kernelAfter.transpose()); + + // Check that the decay rate is not significantly changed. 
+ BOOST_TEST_REQUIRE(std::fabs(kernelAfter(1) - kernelBefore(1)) < + 0.001 * std::fabs(kernelBefore(1))); } BOOST_AUTO_TEST_CASE(testPersistRestore) { // 1d { - TDoubleVec minBoundary{0.}; - TDoubleVec maxBoundary{10.}; + TDoubleVec minBoundary{0.0}; + TDoubleVec maxBoundary{10.0}; // empty { std::vector parameterFunctionValues{}; @@ -375,8 +498,8 @@ BOOST_AUTO_TEST_CASE(testPersistRestore) { // with data { std::vector parameterFunctionValues{ - {5., 1., 0.2}, - {7., 1., 0.2}, + {5.0, 1.0, 0.2}, + {7.0, 1.0, 0.2}, }; testPersistRestoreIsIdempotent(minBoundary, maxBoundary, parameterFunctionValues); } @@ -384,8 +507,8 @@ BOOST_AUTO_TEST_CASE(testPersistRestore) { // 2d { - TDoubleVec minBoundary{0., -1.}; - TDoubleVec maxBoundary{10., 1.}; + TDoubleVec minBoundary{0.0, -1.0}; + TDoubleVec maxBoundary{10.0, 1.0}; // empty { std::vector parameterFunctionValues{}; @@ -394,8 +517,8 @@ BOOST_AUTO_TEST_CASE(testPersistRestore) { // with data { std::vector parameterFunctionValues{ - {5., 0., 1., 0.2}, - {7., 0., 1., 0.2}, + {5.0, 0.0, 1.0, 0.2}, + {7.0, 0.0, 1.0, 0.2}, }; testPersistRestoreIsIdempotent(minBoundary, maxBoundary, parameterFunctionValues); } @@ -433,7 +556,6 @@ BOOST_AUTO_TEST_CASE(testEvaluate) { } BOOST_AUTO_TEST_CASE(testEvaluate1D) { - using TMeanAccumulator = maths::common::CBasicStatistics::SSampleMean::TAccumulator; test::CRandomNumbers rng; std::size_t dim{2}; std::size_t mcSamples{100}; @@ -460,8 +582,6 @@ BOOST_AUTO_TEST_CASE(testEvaluate1D) { } BOOST_AUTO_TEST_CASE(testAnovaConstantFactor) { - using TMeanAccumulator = maths::common::CBasicStatistics::SSampleMean::TAccumulator; - std::size_t dim{2}; std::size_t mcSamples{1000}; TDoubleVecVec testSamples; @@ -484,7 +604,6 @@ BOOST_AUTO_TEST_CASE(testAnovaConstantFactor) { } BOOST_AUTO_TEST_CASE(testAnovaTotalVariance) { - using TMeanAccumulator = maths::common::CBasicStatistics::SSampleMean::TAccumulator; std::size_t dim{2}; std::size_t mcSamples{1000}; TDoubleVecVec testSamples; @@ -508,7 +627,6 @@ BOOST_AUTO_TEST_CASE(testAnovaTotalVariance) { } BOOST_AUTO_TEST_CASE(testAnovaMainEffect) { - using TMeanAccumulator = maths::common::CBasicStatistics::SSampleMean::TAccumulator; std::size_t dim{2}; std::size_t mcSamples{1000}; TDoubleVecVec testSamples; @@ -533,4 +651,84 @@ BOOST_AUTO_TEST_CASE(testAnovaMainEffect) { verify(0.2, 0.8); } +BOOST_AUTO_TEST_CASE(testAnovaInvariants) { + + // Test that the various parts of ANOVA change as we expect when: + // 1. Changing the function level, + // 2. Linearly scaling the function. 
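// A standalone sketch of the behaviour checked here: if the objective is transformed
// as f -> scale * f + f0 then its mean shifts and scales, its variance picks up a
// factor scale^2, and the coefficient of variation sqrt(variance) / mean is unchanged
// by pure scaling but shrinks when a positive level f0 is added to a positive-mean
// objective. Names and numbers are illustrative.
#include <cmath>
#include <iostream>

struct SSummary {
    double s_Mean;
    double s_Variance;
    double coefficientOfVariation() const { return std::sqrt(s_Variance) / s_Mean; }
};

SSummary transformed(const SSummary& base, double scale, double f0) {
    // E[scale * f + f0] = scale * E[f] + f0 and var[scale * f + f0] = scale^2 * var[f].
    return {scale * base.s_Mean + f0, scale * scale * base.s_Variance};
}

int main() {
    SSummary base{50.0, 100.0};
    SSummary shifted{transformed(base, 1.0, 10.0)};
    SSummary scaled{transformed(base, 2.0, 0.0)};
    std::cout << "base CoV    = " << base.coefficientOfVariation() << '\n';
    std::cout << "shifted CoV = " << shifted.coefficientOfVariation() << '\n'; // smaller than base
    std::cout << "scaled CoV  = " << scaled.coefficientOfVariation() << '\n';  // equal to base
    return 0;
}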
+ + TFunctionParamsVec tests{ + {0.0, 100.0, 0.0, 1.0}, {0.0, 100.0, 10.0, 1.0}, {0.0, 100.0, 0.0, 2.0}}; + + TDoubleVec evaluateResults; + TDoubleVecVec evaluate1DResults; + TDoubleVec totalVarianceResults; + TDoubleVec totalCoefficientOfVariationResults; + + for (const auto& test : tests) { + + test::CRandomNumbers rng; + + std::size_t dimension{2}; + double xl{test.s_Xl}; + double xu{test.s_Xu}; + double f0{test.s_F0}; + double scale{test.s_Scale}; + + TDoubleVec coords; + rng.generateUniformSamples(xl, xu, dimension * 20, coords); + + maths::common::CBayesianOptimisation::TDoubleDoublePrVec bb; + for (std::size_t i = 0; i < dimension; ++i) { + bb.emplace_back(xl, xu); + } + + maths::common::CBayesianOptimisation bopt{bb}; + for (std::size_t i = 0; i < 10; ++i) { + TVector x{dimension}; + for (std::size_t j = 0; j < dimension; ++j) { + x(j) = coords[i * dimension + j]; + } + bopt.maximumLikelihoodKernel(); + bopt.add(x, scale * x.norm() + f0, scale * scale * (xu - xl) * (xu - xl) * 0.001); + } + + TVector probe{dimension}; + rng.generateUniformSamples(xl, xu, dimension, coords); + for (std::size_t i = 0; i < dimension; ++i) { + probe(i) = coords[i]; + } + evaluateResults.push_back(bopt.evaluate(probe)); + evaluate1DResults.emplace_back(); + for (std::size_t i = 0; i < dimension; ++i) { + evaluate1DResults.back().push_back( + bopt.evaluate1D(probe[i], static_cast(i))); + } + totalVarianceResults.push_back(bopt.anovaTotalVariance()); + totalCoefficientOfVariationResults.push_back(bopt.anovaTotalCoefficientOfVariation()); + } + + LOG_DEBUG(<< "evaluate = " << core::CContainerPrinter::print(evaluateResults)); + LOG_DEBUG(<< "evaluate1D = " << core::CContainerPrinter::print(evaluate1DResults)); + LOG_DEBUG(<< "totalVariance = " << core::CContainerPrinter::print(totalVarianceResults)); + LOG_DEBUG(<< "totalCoefficientOfVariationResults = " + << core::CContainerPrinter::print(totalCoefficientOfVariationResults)); + + for (std::size_t i = 1; i < tests.size(); ++i) { + double f0{tests[i].s_F0}; + double scale{tests[i].s_Scale}; + BOOST_REQUIRE_CLOSE(evaluateResults[i], scale * evaluateResults[0] + f0, 1e-3); + for (std::size_t j = 0; j < evaluate1DResults[i].size(); ++j) { + BOOST_REQUIRE_CLOSE(evaluate1DResults[i][j], + scale * evaluate1DResults[0][j] + f0, 1e-3); + } + BOOST_REQUIRE_CLOSE(totalVarianceResults[i], + scale * scale * totalVarianceResults[0], 1e-3); + } + BOOST_TEST_REQUIRE(totalCoefficientOfVariationResults[1] < + totalCoefficientOfVariationResults[0]); + BOOST_REQUIRE_CLOSE(totalCoefficientOfVariationResults[2], + totalCoefficientOfVariationResults[0], 1e-3); +} + BOOST_AUTO_TEST_SUITE_END()
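// A standalone sketch of the duplicate handling exercised by testForSingularKernel
// above: rather than adding a repeated point, which would make the kernel matrix
// singular, the stored mean and variance for that point are merged with the new
// observation by pooling first and second moments. The tiny accumulator below is
// illustrative, not the library's CBasicStatistics implementation.
#include <iostream>

struct SMoments {
    double s_Count;
    double s_Mean;
    double s_Variance;
};

// Merge two (count, mean, maximum likelihood variance) summaries.
SMoments merge(const SMoments& lhs, const SMoments& rhs) {
    double count{lhs.s_Count + rhs.s_Count};
    double mean{(lhs.s_Count * lhs.s_Mean + rhs.s_Count * rhs.s_Mean) / count};
    double variance{(lhs.s_Count * (lhs.s_Variance + (lhs.s_Mean - mean) * (lhs.s_Mean - mean)) +
                     rhs.s_Count * (rhs.s_Variance + (rhs.s_Mean - mean) * (rhs.s_Mean - mean))) /
                    count};
    return {count, mean, variance};
}

int main() {
    // An existing observation at some point x and a repeat observation at the same x.
    SMoments stored{1.0, 1.1, 0.0};
    SMoments repeat{1.0, 1.3, 0.0};
    SMoments merged{merge(stored, repeat)};
    std::cout << "mean = " << merged.s_Mean << ", variance = " << merged.s_Variance << '\n';
    return 0;
}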