diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index daff481f91..0b331d7ee0 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -41,6 +41,7 @@ * Add instrumentation to report statistics related to data frame analytics jobs, i.e. progress, memory usage, etc. (See {ml-pull}906[#906].) +* Multiclass classification. (See {ml-pull}1037[#1037].) === Enhancements diff --git a/include/maths/CBoostedTreeLoss.h b/include/maths/CBoostedTreeLoss.h index 20637d5b0e..788d867f21 100644 --- a/include/maths/CBoostedTreeLoss.h +++ b/include/maths/CBoostedTreeLoss.h @@ -8,8 +8,10 @@ #define INCLUDED_ml_maths_CBoostedTreeLoss_h #include +#include #include #include +#include #include #include @@ -66,9 +68,26 @@ class MATHS_EXPORT CArgMinMseImpl final : public CArgMinLossImpl { //! \brief Finds the value to add to a set of predicted log-odds which minimises //! regularised cross entropy loss w.r.t. the actual categories. -class MATHS_EXPORT CArgMinLogisticImpl final : public CArgMinLossImpl { +//! +//! DESCRIPTION:\n +//! We want to find the weight which minimizes the log-loss, i.e. which satisfies +//!
+//!   \f$\displaystyle arg\min_w{ \lambda w^2 -\sum_i{ a_i \log(S(p_i + w)) + (1 - a_i) \log(1 - S(p_i + w)) } }\f$
+//! 
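For reference, here is a minimal standalone sketch of evaluating this objective for a candidate weight. It is an illustration only: it assumes plain std::vector<double> inputs and a hand-rolled logistic in place of the project's CTools::logisticFunction, and the helper name binomialObjective is hypothetical.

#include <cmath>
#include <cstddef>
#include <vector>

// Regularised binomial log-loss as a function of the candidate leaf weight w.
// predictions holds the current log-odds p_i and actuals the labels a_i in {0, 1}.
double binomialObjective(double w, double lambda,
                         const std::vector<double>& predictions,
                         const std::vector<double>& actuals) {
    auto logistic = [](double x) { return 1.0 / (1.0 + std::exp(-x)); };
    double loss{lambda * w * w};
    for (std::size_t i = 0; i < predictions.size(); ++i) {
        double s{logistic(predictions[i] + w)};
        loss -= actuals[i] * std::log(s) + (1.0 - actuals[i]) * std::log(1.0 - s);
    }
    return loss;
}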
+//! +//! Rather than working with this function directly we bucket the predictions `p_i` +//! in a first pass over the data and compute the weight which minimizes the approximate +//! function +//!
+//! \f$\displaystyle arg\min_w{ \lambda w^2 -\sum_{B}{ c_{1,B} \log(S(\bar{p}_B + w)) + c_{0,B} \log(1 - S(\bar{p}_B + w)) } }\f$
+//! 
+//! +//! Here, \f$B\f$ ranges over the buckets, \f$\bar{p}_B\f$ denotes the B'th bucket +//! centre and \f$c_{0,B}\f$ and \f$c_{1,B}\f$ denote the counts of actual classes +//! 0 and 1, respectively, in the bucket \f$B\f$. +class MATHS_EXPORT CArgMinBinomialLogisticLossImpl final : public CArgMinLossImpl { public: - CArgMinLogisticImpl(double lambda); + CArgMinBinomialLogisticLossImpl(double lambda); std::unique_ptr clone() const override; bool nextPass() override; void add(const TMemoryMappedFloatVector& prediction, double actual, double weight = 1.0) override; @@ -80,11 +99,13 @@ class MATHS_EXPORT CArgMinLogisticImpl final : public CArgMinLossImpl { using TDoubleVector2x1 = CVectorNx1; using TDoubleVector2x1Vec = std::vector; +private: + static constexpr std::size_t NUMBER_BUCKETS = 128; + private: std::size_t bucket(double prediction) const { double bucket{(prediction - m_PredictionMinMax.min()) / this->bucketWidth()}; - return std::min(static_cast(bucket), - m_BucketCategoryCounts.size() - 1); + return std::min(static_cast(bucket), m_BucketsClassCounts.size() - 1); } double bucketCentre(std::size_t bucket) const { @@ -95,15 +116,74 @@ class MATHS_EXPORT CArgMinLogisticImpl final : public CArgMinLossImpl { double bucketWidth() const { return m_PredictionMinMax.initialized() ? m_PredictionMinMax.range() / - static_cast(m_BucketCategoryCounts.size()) + static_cast(m_BucketsClassCounts.size()) : 0.0; } private: std::size_t m_CurrentPass = 0; TMinMaxAccumulator m_PredictionMinMax; - TDoubleVector2x1 m_CategoryCounts; - TDoubleVector2x1Vec m_BucketCategoryCounts; + TDoubleVector2x1 m_ClassCounts; + TDoubleVector2x1Vec m_BucketsClassCounts; +}; + +//! \brief Finds the value to add to a set of predicted multinomial logit which +//! minimises regularised cross entropy loss w.r.t. the actual classes. +//! +//! DESCRIPTION:\n +//! We want to find the weight which minimizes the log-loss, i.e. which satisfies +//!
+//!   \f$\displaystyle arg\min_w{ \lambda \|w\|^2 -\sum_i{ \log([softmax(p_i + w)]_{a_i}) } }\f$
+//! 
+//! +//! Here, \f$a_i\f$ is the index of the i'th example's true class. Rather than +//! working with this function directly we approximate it by the means and counts +//! of predictions in a partition of the original data, i.e. we compute the weight +//! which satisfies +//!
+//! \f$\displaystyle arg\min_w{ \lambda \|w\|^2 -\sum_{P}{ \sum_{a_i}{ c_{a_i, P} \log([softmax(\bar{p}_P + w)]_{a_i}) } } }\f$
+//! 
+//! +//! Here, \f$P\f$ ranges over the subsets of the partition, \f$\bar{p}_P\f$ denotes +//! the mean of the predictions in the P'th subset and \f$c_{a_i, P}\f$ denote the +//! counts of each classes \f$\{a_i\}\f$ in the subset \f$P\f$. We compute this +//! partition by k-means. +class MATHS_EXPORT CArgMinMultinomialLogisticLossImpl final : public CArgMinLossImpl { +public: + using TObjective = std::function; + using TObjectiveGradient = std::function; + +public: + CArgMinMultinomialLogisticLossImpl(std::size_t numberClasses, + double lambda, + const CPRNG::CXorOShiro128Plus& rng); + std::unique_ptr clone() const override; + bool nextPass() override; + void add(const TMemoryMappedFloatVector& prediction, double actual, double weight = 1.0) override; + void merge(const CArgMinLossImpl& other) override; + TDoubleVector value() const override; + + // Exposed for unit testing. + TObjective objective() const; + TObjectiveGradient objectiveGradient() const; + +private: + using TDoubleVectorVec = std::vector; + using TKMeans = CKMeansOnline; + +private: + static constexpr std::size_t NUMBER_CENTRES = 128; + static constexpr std::size_t NUMBER_RESTARTS = 5; + +private: + std::size_t m_NumberClasses = 0; + std::size_t m_CurrentPass = 0; + mutable CPRNG::CXorOShiro128Plus m_Rng; + TDoubleVector m_ClassCounts; + TDoubleVector m_DoublePrediction; + TKMeans m_PredictionSketch; + TDoubleVectorVec m_Centres; + TDoubleVectorVec m_CentresClassCounts; }; } @@ -185,7 +265,8 @@ class MATHS_EXPORT CLoss { //! Transforms a prediction from the forest to the target space. virtual TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const = 0; //! Get an object which computes the leaf value that minimises loss. - virtual CArgMinLoss minimizer(double lambda) const = 0; + virtual CArgMinLoss minimizer(double lambda, + const CPRNG::CXorOShiro128Plus& rng) const = 0; //! Get the name of the loss function virtual const std::string& name() const = 0; @@ -214,7 +295,7 @@ class MATHS_EXPORT CMse final : public CLoss { double weight = 1.0) const override; bool isCurvatureConstant() const override; TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const override; - CArgMinLoss minimizer(double lambda) const override; + CArgMinLoss minimizer(double lambda, const CPRNG::CXorOShiro128Plus& rng) const override; const std::string& name() const override; }; @@ -227,11 +308,47 @@ class MATHS_EXPORT CMse final : public CLoss { //! //! where \f$a_i\f$ denotes the actual class of the i'th example, \f$p\f$ is the //! prediction and \f$S(\cdot)\f$ denotes the logistic function. -class MATHS_EXPORT CBinomialLogistic final : public CLoss { +class MATHS_EXPORT CBinomialLogisticLoss final : public CLoss { +public: + static const std::string NAME; + +public: + std::unique_ptr clone() const override; + std::size_t numberParameters() const override; + double value(const TMemoryMappedFloatVector& prediction, + double actual, + double weight = 1.0) const override; + void gradient(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight = 1.0) const override; + void curvature(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight = 1.0) const override; + bool isCurvatureConstant() const override; + TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const override; + CArgMinLoss minimizer(double lambda, const CPRNG::CXorOShiro128Plus& rng) const override; + const std::string& name() const override; +}; + +//! 
\brief Implements loss for multinomial logistic regression. +//! +//! DESCRIPTION:\n +//! This targets the cross-entropy loss using the forest to predict the class +//! probabilities via the softmax function: +//!
+//!   \f$\displaystyle l_i(p) = -\sum_j a_{ij} \log([\sigma(p)]_j)\f$
+//! 
+//! where \f$a_i\f$ denotes the actual class of the i'th example, \f$p\f$ denotes +//! the vector valued prediction and \f$\sigma(p)\$ is the softmax function, i.e. +//! \f$[\sigma(p)]_j = \frac{e^{p_i}}{\sum_k e^{p_k}}\f$. +class MATHS_EXPORT CMultinomialLogisticLoss final : public CLoss { public: static const std::string NAME; public: + CMultinomialLogisticLoss(std::size_t numberClasses); std::unique_ptr clone() const override; std::size_t numberParameters() const override; double value(const TMemoryMappedFloatVector& prediction, @@ -247,8 +364,11 @@ class MATHS_EXPORT CBinomialLogistic final : public CLoss { double weight = 1.0) const override; bool isCurvatureConstant() const override; TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const override; - CArgMinLoss minimizer(double lambda) const override; + CArgMinLoss minimizer(double lambda, const CPRNG::CXorOShiro128Plus& rng) const override; const std::string& name() const override; + +private: + std::size_t m_NumberClasses; }; } } diff --git a/include/maths/CKMeans.h b/include/maths/CKMeans.h index aa1c8d3d9e..4ea8914448 100644 --- a/include/maths/CKMeans.h +++ b/include/maths/CKMeans.h @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -125,7 +126,7 @@ class CKMeans { const TPointVec& points() const { return m_Points; } //! Get the cluster checksum. - uint64_t checksum() const { return m_Checksum; } + std::uint64_t checksum() const { return m_Checksum; } private: //! The centroid of the points in this cluster. @@ -133,7 +134,7 @@ class CKMeans { //! The points in the cluster. TPointVec m_Points; //! A checksum for the points in the cluster. - uint64_t m_Checksum; + std::uint64_t m_Checksum; }; using TClusterVec = std::vector; @@ -183,8 +184,9 @@ class CKMeans { if (m_Centres.empty()) { return true; } + TMeanAccumulatorVec newCentres; for (std::size_t i = 0u; i < maxIterations; ++i) { - if (!this->updateCentres()) { + if (!this->updateCentres(newCentres)) { return true; } } @@ -481,19 +483,19 @@ class CKMeans { private: //! Single iteration of Lloyd's algorithm to update \p centres. - bool updateCentres() { - const TCoordinate precision = TCoordinate(5) * - std::numeric_limits::epsilon(); - TMeanAccumulatorVec newCentres(m_Centres.size(), - TMeanAccumulator(las::zero(m_Centres[0]))); + bool updateCentres(TMeanAccumulatorVec& newCentres) { + const TCoordinate precision{TCoordinate(5) * + std::numeric_limits::epsilon()}; + newCentres.assign(m_Centres.size(), TMeanAccumulator(las::zero(m_Centres[0]))); CCentroidComputer computer(m_Centres, newCentres); m_Points.preorderDepthFirst(computer); bool changed = false; + POINT newCentre; for (std::size_t i = 0u; i < newCentres.size(); ++i) { - POINT newCentre(CBasicStatistics::mean(newCentres[i])); + newCentre = CBasicStatistics::mean(newCentres[i]); if (las::distance(m_Centres[i], newCentre) > precision * las::norm(m_Centres[i])) { - m_Centres[i] = newCentre; + las::swap(m_Centres[i], newCentre); changed = true; } } diff --git a/include/maths/CKMeansOnline.h b/include/maths/CKMeansOnline.h index 83dd2ce589..8b314470a5 100644 --- a/include/maths/CKMeansOnline.h +++ b/include/maths/CKMeansOnline.h @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -36,8 +37,7 @@ namespace ml { namespace maths { -//! \brief Computes k-means of a set of points online using \f$O(k)\f$ -//! memory. +//! \brief Computes k-means of a set of points online using \f$O(k)\f$ memory. //! //! DESCRIPTION:\n //! 
This is a sketch data structure of points and their spherical variance @@ -65,39 +65,25 @@ class CKMeansOnline { using TKMeansOnlineVec = std::vector; protected: - //! \brief Checks if a cluster should be deleted based on its count. - class CShouldDelete { - public: - CShouldDelete(double minimumCategoryCount) - : m_MinimumCategoryCount(minimumCategoryCount) {} - - template - bool operator()(const CLUSTER& cluster) const { - return CBasicStatistics::count(cluster.first) < m_MinimumCategoryCount; - } - - private: - double m_MinimumCategoryCount; - }; - using TFloatPoint = typename SFloatingPoint::Type; using TFloatCoordinate = typename SCoordinate::Type; using TFloatPointDoublePr = std::pair; using TFloatPointDoublePrVec = std::vector; - using TFloatMeanAccumulator = typename CBasicStatistics::SSampleMean::TAccumulator; - using TFloatMeanAccumulatorDoublePr = std::pair; - using TFloatMeanAccumulatorDoublePrVec = std::vector; - using TDoubleMeanAccumulator = + using TFloatPointMeanAccumulator = + typename CBasicStatistics::SSampleMean::TAccumulator; + using TFloatPointMeanAccumulatorDoublePr = std::pair; + using TFloatPointMeanAccumulatorDoublePrVec = std::vector; + using TDoublePointMeanAccumulator = typename CBasicStatistics::SSampleMean::TAccumulator; - using TDoubleMeanVarAccumulator = + using TDoublePointMeanVarAccumulator = typename CBasicStatistics::SSampleMeanVar::TAccumulator; -protected: +public: //! The minimum permitted size for the clusterer. static const std::size_t MINIMUM_SPACE; //! The maximum allowed size of the points buffer. - static const std::size_t MAXIMUM_BUFFER_SIZE; + static const std::size_t BUFFER_SIZE; //! The number of times to seed the clustering in reduce. static const std::size_t NUMBER_SEEDS; @@ -106,55 +92,73 @@ class CKMeansOnline { static const std::size_t MAX_ITERATIONS; static const core::TPersistenceTag K_TAG; + static const core::TPersistenceTag BUFFER_SIZE_TAG; + static const core::TPersistenceTag NUMBER_SEEDS_TAG; + static const core::TPersistenceTag MAX_ITERATIONS_TAG; static const core::TPersistenceTag CLUSTERS_TAG; static const core::TPersistenceTag POINTS_TAG; static const core::TPersistenceTag RNG_TAG; public: - //! \param[in] k The maximum space in numbers of clusters. - //! A cluster comprises one float point vector, one count and - //! a double holding the spherical variance. - //! \param[in] decayRate The rate at which we data ages out - //! of the clusterer. - //! \param[in] minimumCategoryCount The minimum permitted count - //! for a cluster. - //! \note This will store as much information about the points - //! subject to this constraint so will generally hold \p k - //! clusters. - CKMeansOnline(std::size_t k, double decayRate = 0.0, double minimumCategoryCount = MINIMUM_CATEGORY_COUNT) - : m_K(std::max(k, MINIMUM_SPACE)), m_DecayRate(decayRate), - m_MinimumCategoryCount(minimumCategoryCount) { - m_Clusters.reserve(m_K + MAXIMUM_BUFFER_SIZE + 1u); - m_PointsBuffer.reserve(MAXIMUM_BUFFER_SIZE); + //! \param[in] k The numbers of clusters to maintain. A cluster comprises + //! one float point vector, one count and a double holding the spherical + //! variance. + //! \param[in] decayRate The rate to age old data out of the clusters. + //! \param[in] minClusterSize The minimum permitted number of points in + //! a cluster. + //! \param[in] bufferSize The number of points to buffer before reclustering. + //! \param[in] numberSeeds The number of seeds to use when reclustering. + //! 
\param[in] maxIterations The maximum number of iterations to use when + //! reclustering. + CKMeansOnline(std::size_t k, + double decayRate = 0.0, + double minClusterSize = MINIMUM_CATEGORY_COUNT, + std::size_t bufferSize = BUFFER_SIZE, + std::size_t numberSeeds = NUMBER_SEEDS, + std::size_t maxIterations = MAX_ITERATIONS) + : m_K{std::max(k, MINIMUM_SPACE)}, m_BufferSize{bufferSize}, m_NumberSeeds{numberSeeds}, + m_MaxIterations{maxIterations}, m_DecayRate{decayRate}, m_MinClusterSize{minClusterSize} { + m_Clusters.reserve(m_K + m_BufferSize + 1); } - //! Construct a new classifier with the specified space limit - //! \p space and categories \p categories. + //! Construct with \p clusters. CKMeansOnline(std::size_t k, double decayRate, - double minimumCategoryCount, - TFloatMeanAccumulatorDoublePrVec& clusters) - : m_K(std::max(k, MINIMUM_SPACE)), m_DecayRate(decayRate), - m_MinimumCategoryCount(minimumCategoryCount) { + double minClusterSize, + TFloatPointMeanAccumulatorDoublePrVec& clusters) + : CKMeansOnline{k, decayRate, minClusterSize} { m_Clusters.swap(clusters); - m_Clusters.reserve(m_K + MAXIMUM_BUFFER_SIZE + 1u); - m_PointsBuffer.reserve(MAXIMUM_BUFFER_SIZE); + m_Clusters.reserve(m_K + m_BufferSize + 1); } //! Create from part of a state document. bool acceptRestoreTraverser(const SDistributionRestoreParams& params, core::CStateRestoreTraverser& traverser) { m_DecayRate = params.s_DecayRate; - m_MinimumCategoryCount = params.s_MinimumCategoryCount; + m_MinClusterSize = params.s_MinimumCategoryCount; + TFloatPointDoublePrVec points; do { - const std::string name = traverser.name(); - RESTORE(RNG_TAG, m_Rng.fromString(traverser.value())); + const std::string& name{traverser.name()}; + RESTORE(RNG_TAG, m_Rng.fromString(traverser.value())) + RESTORE(K_TAG, core::CPersistUtils::restore(K_TAG, m_K, traverser)) + RESTORE(BUFFER_SIZE_TAG, + core::CPersistUtils::restore(BUFFER_SIZE_TAG, m_BufferSize, traverser)) + RESTORE(NUMBER_SEEDS_TAG, + core::CPersistUtils::restore(NUMBER_SEEDS_TAG, m_NumberSeeds, traverser)) + RESTORE(MAX_ITERATIONS_TAG, + core::CPersistUtils::restore(MAX_ITERATIONS_TAG, m_MaxIterations, traverser)) RESTORE(K_TAG, core::CPersistUtils::restore(K_TAG, m_K, traverser)) RESTORE(CLUSTERS_TAG, core::CPersistUtils::restore(CLUSTERS_TAG, m_Clusters, traverser)) - RESTORE(POINTS_TAG, core::CPersistUtils::restore(POINTS_TAG, m_PointsBuffer, traverser)) + RESTORE(POINTS_TAG, core::CPersistUtils::restore(POINTS_TAG, points, traverser)) } while (traverser.next()); + + for (const auto& point : points) { + m_Clusters.emplace_back( + CBasicStatistics::momentsAccumulator(point.second, point.first), 0.0); + } + return true; } @@ -162,33 +166,35 @@ class CKMeansOnline { void acceptPersistInserter(core::CStatePersistInserter& inserter) const { inserter.insertValue(RNG_TAG, m_Rng.toString()); core::CPersistUtils::persist(K_TAG, m_K, inserter); + core::CPersistUtils::persist(BUFFER_SIZE_TAG, m_BufferSize, inserter); + core::CPersistUtils::persist(NUMBER_SEEDS_TAG, m_NumberSeeds, inserter); + core::CPersistUtils::persist(MAX_ITERATIONS_TAG, m_MaxIterations, inserter); core::CPersistUtils::persist(CLUSTERS_TAG, m_Clusters, inserter); - core::CPersistUtils::persist(POINTS_TAG, m_PointsBuffer, inserter); } //! Efficiently swap the contents of this and \p other. 
void swap(CKMeansOnline& other) { std::swap(m_Rng, other.m_Rng); std::swap(m_K, other.m_K); + std::swap(m_BufferSize, other.m_BufferSize); + std::swap(m_NumberSeeds, other.m_NumberSeeds); + std::swap(m_MaxIterations, other.m_MaxIterations); std::swap(m_DecayRate, other.m_DecayRate); - std::swap(m_MinimumCategoryCount, other.m_MinimumCategoryCount); + std::swap(m_MinClusterSize, other.m_MinClusterSize); m_Clusters.swap(other.m_Clusters); - m_PointsBuffer.swap(other.m_PointsBuffer); } //! Get the total number of clusters. - std::size_t size() const { - return std::min(m_Clusters.size() + m_PointsBuffer.size(), m_K); - } + std::size_t size() const { return std::min(m_Clusters.size(), m_K); } //! Get the clusters being maintained. void clusters(TSphericalClusterVec& result) const { result.clear(); result.reserve(m_Clusters.size()); for (std::size_t i = 0u; i < m_Clusters.size(); ++i) { - const TFloatPoint& m = CBasicStatistics::mean(m_Clusters[i].first); - double n = CBasicStatistics::count(m_Clusters[i].first); - double v = m_Clusters[i].second; + const TFloatPoint& m{CBasicStatistics::mean(m_Clusters[i].first)}; + double n{CBasicStatistics::count(m_Clusters[i].first)}; + double v{m_Clusters[i].second}; result.emplace_back(m, SCountAndVariance(n, v)); } } @@ -214,7 +220,7 @@ class CKMeansOnline { TSphericalClusterVec clusters; this->clusters(clusters); - return kmeans(m_Rng, clusters, k, result); + return kmeans(m_Rng, clusters, k, result, m_NumberSeeds, m_MaxIterations); } //! Get our best estimate of the \p k means clustering of @@ -226,8 +232,12 @@ class CKMeansOnline { //! \param[out] result Filled in with the \p k means clustering //! of \p clusters. template - static bool - kmeans(RNG& rng, TSphericalClusterVec& clusters, std::size_t k, TSphericalClusterVecVec& result) { + static bool kmeans(RNG& rng, + TSphericalClusterVec& clusters, + std::size_t k, + TSphericalClusterVecVec& result, + std::size_t numberSeeds = NUMBER_SEEDS, + std::size_t maxIterations = MAX_ITERATIONS) { result.clear(); if (k == 0) { @@ -254,18 +264,23 @@ class CKMeansOnline { CKMeans kmeans; kmeans.setPoints(clusters); - CBasicStatistics::COrderStatisticsStack minCost; + CBasicStatistics::SMin::TAccumulator minCost; TSphericalClusterVec centres; TSphericalClusterVecVec candidates; - for (std::size_t i = 0u; i < NUMBER_SEEDS; ++i) { + for (std::size_t i = 0u; i < numberSeeds; ++i) { CKMeansPlusPlusInitialization seedCentres(rng); seedCentres.run(clusters, k, centres); kmeans.setCentres(centres); - kmeans.run(MAX_ITERATIONS); + kmeans.run(maxIterations); kmeans.clusters(candidates); + candidates.erase(std::remove_if(candidates.begin(), candidates.end(), + [](TSphericalClusterVec& cluster) { + return cluster.empty(); + }), + candidates.end()); CSphericalGaussianInfoCriterion criterion; criterion.add(candidates); - double cost = criterion.calculate(); + double cost{criterion.calculate()}; if (minCost.add(cost)) { result.swap(candidates); } @@ -284,20 +299,19 @@ class CKMeansOnline { //! \p split if it is a valid partition and cleared otherwise. 
bool split(const TSizeVecVec& split, TKMeansOnlineVec& result) { result.clear(); - this->reduce(); if (!this->checkSplit(split)) { return false; } result.reserve(split.size()); - TFloatMeanAccumulatorDoublePrVec clusters; + TFloatPointMeanAccumulatorDoublePrVec clusters; for (std::size_t i = 0u; i < split.size(); ++i) { clusters.clear(); clusters.reserve(split[i].size()); for (std::size_t j = 0u; j < split[i].size(); ++j) { clusters.push_back(m_Clusters[split[i][j]]); } - result.emplace_back(m_K, m_DecayRate, m_MinimumCategoryCount, clusters); + result.emplace_back(m_K, m_DecayRate, m_MinClusterSize, clusters); } return true; @@ -308,11 +322,8 @@ class CKMeansOnline { //! \param[in] x A point to add to the clusterer. //! \param[in] count The count weight of this point. void add(const TDoublePoint& x, double count = 1.0) { - if (m_PointsBuffer.size() < MAXIMUM_BUFFER_SIZE) { - m_PointsBuffer.emplace_back(x, count); - } else { - m_Clusters.push_back(TFloatMeanAccumulatorDoublePr()); - CKMeansOnline::add(x, count, m_Clusters.back()); + m_Clusters.emplace_back(CBasicStatistics::momentsAccumulator(count, x), 0.0); + if (m_Clusters.size() > m_K + m_BufferSize) { this->reduce(); } } @@ -323,18 +334,13 @@ class CKMeansOnline { void merge(const CKMeansOnline& other) { LOG_TRACE(<< "Merge"); - for (std::size_t i = 0u; i < other.m_PointsBuffer.size(); ++i) { - m_Clusters.push_back(TFloatMeanAccumulatorDoublePr()); - CKMeansOnline::add(other.m_PointsBuffer[i].first, - other.m_PointsBuffer[i].second, m_Clusters.back()); - } m_Clusters.insert(m_Clusters.end(), other.m_Clusters.begin(), other.m_Clusters.end()); this->reduce(); // Reclaim memory from the vector buffer. - TFloatMeanAccumulatorDoublePrVec categories(m_Clusters); + TFloatPointMeanAccumulatorDoublePrVec categories(m_Clusters); m_Clusters.swap(categories); } @@ -348,7 +354,7 @@ class CKMeansOnline { return; } - double alpha = std::exp(-m_DecayRate * time); + double alpha{std::exp(-m_DecayRate * time)}; LOG_TRACE(<< "alpha = " << alpha); this->age(alpha); @@ -365,14 +371,17 @@ class CKMeansOnline { // Prune any dead categories: we're not interested in // maintaining categories with low counts. m_Clusters.erase(std::remove_if(m_Clusters.begin(), m_Clusters.end(), - CShouldDelete(m_MinimumCategoryCount)), + [this](const auto& cluster) { + return CBasicStatistics::count( + cluster.first) < m_MinClusterSize; + }), m_Clusters.end()); LOG_TRACE(<< "clusters = " << core::CContainerPrinter::print(m_Clusters)); } - //! Get the current points buffer. - bool buffering() const { return m_PointsBuffer.size() > 0; } + //! Check if there are points in the buffer. + bool buffering() const { return m_Clusters.size() > m_K; } //! Get \p n samples of the distribution corresponding to the //! categories we are maintaining. @@ -388,17 +397,15 @@ class CKMeansOnline { using TDoubleVec = std::vector; using TDoubleSizePr = std::pair; - static const double ALMOST_ONE = 0.99999; - // See, for example, Effective C++ item 3. 
const_cast(this)->reduce(); LOG_TRACE(<< "categories = " << core::CContainerPrinter::print(m_Clusters)); TDoubleVec counts; counts.reserve(m_Clusters.size()); - double Z = 0.0; + double Z{0.0}; for (std::size_t i = 0u; i < m_Clusters.size(); ++i) { - double ni = CBasicStatistics::count(m_Clusters[i].first); + double ni{CBasicStatistics::count(m_Clusters[i].first)}; counts.push_back(ni); Z += ni; } @@ -412,35 +419,35 @@ class CKMeansOnline { result.reserve(2 * numberSamples); TDoubleVec weights; - TDoublePointVec categorySamples; + TDoublePointVec clusterSamples; for (std::size_t i = 0u; i < m_Clusters.size(); ++i) { - double ni = counts[i]; + double ni{counts[i]}; - categorySamples.clear(); - TDoublePoint m = CBasicStatistics::mean(m_Clusters[i].first); + clusterSamples.clear(); + TDoublePoint m{CBasicStatistics::mean(m_Clusters[i].first)}; if (m_Clusters[i].second == 0.0) { - categorySamples.push_back(m); + clusterSamples.push_back(m); } else { - std::size_t ni_ = static_cast(std::ceil(ni)); + std::size_t ni_{static_cast(std::ceil(ni))}; TDoublePoint v(m_Clusters[i].second); - sampleGaussian(ni_, m, v.asDiagonal(), categorySamples); + sampleGaussian(ni_, m, v.asDiagonal(), clusterSamples); } - ni /= static_cast(categorySamples.size()); + ni /= static_cast(clusterSamples.size()); - result.insert(result.end(), categorySamples.begin(), categorySamples.end()); - weights.insert(weights.end(), categorySamples.size(), ni); + result.insert(result.end(), clusterSamples.begin(), clusterSamples.end()); + weights.insert(weights.end(), clusterSamples.size(), ni); } LOG_TRACE(<< "samples = " << core::CContainerPrinter::print(result)); LOG_TRACE(<< "weights = " << core::CContainerPrinter::print(weights)); - TDoublePointVec final; - final.reserve(static_cast( + TDoublePointVec finalSamples; + finalSamples.reserve(static_cast( std::ceil(std::accumulate(weights.begin(), weights.end(), 0.0)))); - TDoubleMeanAccumulator sample; + TDoublePointMeanAccumulator sample{las::zero(result[0])}; for (;;) { - CBasicStatistics::COrderStatisticsStack nearest; - const TDoublePoint& sample_ = CBasicStatistics::mean(sample); + CBasicStatistics::SMin::TAccumulator nearest; + const TDoublePoint& sample_{CBasicStatistics::mean(sample)}; for (std::size_t j = 0u; j < result.size(); ++j) { if (weights[j] > 0.0) { nearest.add({las::distance(result[j], sample_), j}); @@ -450,20 +457,20 @@ class CKMeansOnline { break; } - std::size_t j = nearest[0].second; - const TDoublePoint& xj = result[j]; + std::size_t j{nearest[0].second}; + const TDoublePoint& xj{result[j]}; do { - double nj = std::min(1.0 - CBasicStatistics::count(sample), weights[j]); + double nj{std::min(1.0 - CBasicStatistics::count(sample), weights[j])}; sample.add(xj, nj); weights[j] -= nj; if (CBasicStatistics::count(sample) > ALMOST_ONE) { - final.push_back(CBasicStatistics::mean(sample)); - sample = TDoubleMeanAccumulator(); + finalSamples.push_back(CBasicStatistics::mean(sample)); + sample = TDoublePointMeanAccumulator{las::zero(result[0])}; } } while (weights[j] > 0.0); } - result.swap(final); + result = std::move(finalSamples); LOG_TRACE(<< "# samples = " << result.size()); LOG_TRACE(<< "samples = " << core::CContainerPrinter::print(result)); } @@ -474,25 +481,21 @@ class CKMeansOnline { } //! Get a checksum for this object. 
- uint64_t checksum(uint64_t seed = 0) const { + std::uint64_t checksum(std::uint64_t seed = 0) const { seed = CChecksum::calculate(seed, m_K); seed = CChecksum::calculate(seed, m_DecayRate); - seed = CChecksum::calculate(seed, m_Clusters); - return CChecksum::calculate(seed, m_PointsBuffer); + return CChecksum::calculate(seed, m_Clusters); } //! Get the memory used by this component void debugMemoryUsage(const core::CMemoryUsage::TMemoryUsagePtr& mem) const { mem->setName("CKMeansOnline"); core::CMemoryDebug::dynamicSize("m_Clusters", m_Clusters, mem); - core::CMemoryDebug::dynamicSize("m_PointsBuffer", m_PointsBuffer, mem); } //! Get the memory used by this component std::size_t memoryUsage() const { - std::size_t mem = core::CMemory::dynamicSize(m_Clusters); - mem += core::CMemory::dynamicSize(m_PointsBuffer); - return mem; + return core::CMemory::dynamicSize(m_Clusters); } protected: @@ -519,12 +522,7 @@ class CKMeansOnline { //! Reduce the number of clusters to m_K by k-means clustering. void reduce() { - // Add all the points as new spherical clusters and reduce. - for (const auto& point : m_PointsBuffer) { - m_Clusters.push_back(TFloatMeanAccumulatorDoublePr()); - CKMeansOnline::add(point.first, point.second, m_Clusters.back()); - } - m_PointsBuffer.clear(); + deduplicate(m_Clusters); if (m_Clusters.size() < m_K) { return; @@ -533,21 +531,19 @@ class CKMeansOnline { LOG_TRACE(<< "clusters = " << core::CContainerPrinter::print(m_Clusters)); LOG_TRACE(<< "# clusters = " << m_Clusters.size()); - TSphericalClusterVecVec kclusters; - { - TSphericalClusterVec clusters; - this->clusters(clusters); - kmeans(m_Rng, clusters, m_K, kclusters); - } + TSphericalClusterVecVec newClusters; + TSphericalClusterVec oldClusters; + this->clusters(oldClusters); + kmeans(m_Rng, oldClusters, m_K, newClusters, m_NumberSeeds, m_MaxIterations); - m_Clusters.resize(kclusters.size()); - for (std::size_t i = 0u; i < kclusters.size(); ++i) { - TDoubleMeanVarAccumulator cluster; - for (const auto& point : kclusters[i]) { + m_Clusters.resize(newClusters.size()); + for (std::size_t i = 0u; i < newClusters.size(); ++i) { + TDoublePointMeanVarAccumulator cluster{las::zero(oldClusters[0])}; + for (const auto& point : newClusters[i]) { cluster.add(point); } - double n = CBasicStatistics::count(cluster); - const TDoublePoint& m = CBasicStatistics::mean(cluster); + double n{CBasicStatistics::count(cluster)}; + const TDoublePoint& m{CBasicStatistics::mean(cluster)}; m_Clusters[i].first = CBasicStatistics::momentsAccumulator( TFloatCoordinate(n), TFloatPoint(m)); m_Clusters[i].second = variance(cluster); @@ -557,28 +553,41 @@ class CKMeansOnline { LOG_TRACE(<< "# reduced clusters = " << m_Clusters.size()); } - //! Add \p count copies of \p mx to the cluster \p cluster. - static void add(const TDoublePoint& mx, double count, TFloatMeanAccumulatorDoublePr& cluster) { - double nx = count; - TDoublePoint vx(0.0); - double nc = CBasicStatistics::count(cluster.first); - TDoublePoint mc = CBasicStatistics::mean(cluster.first); - TDoublePoint vc(cluster.second); - TDoubleMeanVarAccumulator moments = - CBasicStatistics::momentsAccumulator(nc, mc, vc) + - CBasicStatistics::momentsAccumulator(nx, mx, vx); - TFloatCoordinate ncx = CBasicStatistics::count(moments); - TFloatPoint mcx = CBasicStatistics::mean(moments); - cluster.first = CBasicStatistics::momentsAccumulator(ncx, mcx); - cluster.second = variance(moments); + //! Remove any duplicates in \p points. + //! + //! 
\note We assume \p points is small so the bruteforce approach is fast. + static void deduplicate(TFloatPointMeanAccumulatorDoublePrVec& clusters) { + if (clusters.size() > 1) { + std::stable_sort(clusters.begin(), clusters.end(), + [](const auto& lhs, const auto& rhs) { + return CBasicStatistics::mean(lhs.first) < + CBasicStatistics::mean(rhs.first); + }); + auto back = clusters.begin(); + for (auto i = back + 1; i != clusters.end(); ++i) { + if (CBasicStatistics::mean(back->first) == CBasicStatistics::mean(i->first)) { + back->first += i->first; + double n[]{CBasicStatistics::count(back->first), + CBasicStatistics::count(i->first)}; + back->second = (n[0] * back->second + n[1] * i->second) / + (n[0] + n[1]); + } else if (++back != i) { + *back = std::move(*i); + } + } + clusters.erase(back + 1, clusters.end()); + } } //! Get the spherically symmetric variance from \p moments. - static double variance(const TDoubleMeanVarAccumulator& moments) { - const TDoublePoint& v = CBasicStatistics::maximumLikelihoodVariance(moments); - return v.L1() / static_cast(v.dimension()); + static double variance(const TDoublePointMeanVarAccumulator& moments) { + const TDoublePoint& v{CBasicStatistics::maximumLikelihoodVariance(moments)}; + return las::L1(v) / static_cast(las::dimension(v)); } +private: + static constexpr double ALMOST_ONE = 0.99999; + private: //! The random number generator. CPRNG::CXorOShiro128Plus m_Rng; @@ -586,24 +595,29 @@ class CKMeansOnline { //! The number of clusters to maintain. std::size_t m_K; + //! The number of points to buffer before clustering. + std::size_t m_BufferSize; + + //! The number of seeds to use when reclustering. + std::size_t m_NumberSeeds; + + //! The number of iterations of k-means to use when reclustering. + std::size_t m_MaxIterations; + //! The rate at which the categories lose information. double m_DecayRate; - //! The minimum permitted count for a cluster. - double m_MinimumCategoryCount; + //! The minimum permitted number of points in a cluster. + double m_MinClusterSize; //! The clusters we are maintaining. - TFloatMeanAccumulatorDoublePrVec m_Clusters; - - //! A buffer of the points added while the space constraint - //! is satisfied. 
- TFloatPointDoublePrVec m_PointsBuffer; + TFloatPointMeanAccumulatorDoublePrVec m_Clusters; }; template const std::size_t CKMeansOnline::MINIMUM_SPACE = 4u; template -const std::size_t CKMeansOnline::MAXIMUM_BUFFER_SIZE = 6u; +const std::size_t CKMeansOnline::BUFFER_SIZE = 6u; template const std::size_t CKMeansOnline::NUMBER_SEEDS = 5u; template @@ -617,6 +631,12 @@ template const core::TPersistenceTag CKMeansOnline::POINTS_TAG("c", "points"); template const core::TPersistenceTag CKMeansOnline::RNG_TAG("d", "rng"); +template +const core::TPersistenceTag CKMeansOnline::BUFFER_SIZE_TAG("e", "buffer_size"); +template +const core::TPersistenceTag CKMeansOnline::NUMBER_SEEDS_TAG("f", "number_seeds"); +template +const core::TPersistenceTag CKMeansOnline::MAX_ITERATIONS_TAG("g", "max_iterations"); } } diff --git a/include/maths/CLinearAlgebraEigen.h b/include/maths/CLinearAlgebraEigen.h index 22aca2b029..7af9a97219 100644 --- a/include/maths/CLinearAlgebraEigen.h +++ b/include/maths/CLinearAlgebraEigen.h @@ -25,6 +25,7 @@ #include #include +#include #include namespace Eigen { @@ -43,7 +44,7 @@ bool operator<(const SparseMatrix& lhs, LESS_OR_GREATER(lhs.cols(), rhs.cols()) for (STORAGE_INDEX i = 0; i < lhs.rows(); ++i) { for (STORAGE_INDEX j = 0; j < lhs.cols(); ++j) { - LESS_OR_GREATER(lhs.coeff(i, j), rhs.coeff(i, j)) + LESS_OR_GREATER(lhs(i, j), rhs(i, j)) } } return false; @@ -55,7 +56,7 @@ bool operator<(const SparseVector& lhs, const SparseVector& rhs) { LESS_OR_GREATER(lhs.size(), rhs.size()) for (STORAGE_INDEX i = 0; i < lhs.size(); ++i) { - LESS_OR_GREATER(lhs.coeff(i), rhs(i)) + LESS_OR_GREATER(lhs(i), rhs(i)) } return false; } @@ -66,12 +67,8 @@ bool operator<(const Matrix& lh const Matrix& rhs) { LESS_OR_GREATER(lhs.rows(), rhs.rows()) LESS_OR_GREATER(lhs.cols(), rhs.cols()) - for (decltype(lhs.rows()) i = 0; i < lhs.rows(); ++i) { - for (decltype(lhs.cols()) j = 0; j < lhs.cols(); ++j) { - LESS_OR_GREATER(lhs.coeff(i, j), rhs.coeff(i, j)) - } - } - return false; + return std::lexicographical_compare(lhs.data(), lhs.data() + lhs.size(), + rhs.data(), rhs.data() + rhs.size()); } //! Less than on an Eigen memory mapped matrix. @@ -80,12 +77,8 @@ bool operator<(const Map& lhs, const Map& rhs) { LESS_OR_GREATER(lhs.rows(), rhs.rows()) LESS_OR_GREATER(lhs.cols(), rhs.cols()) - for (decltype(lhs.rows()) i = 0; i < lhs.rows(); ++i) { - for (decltype(lhs.cols()) j = 0; j < lhs.cols(); ++j) { - LESS_OR_GREATER(lhs.coeff(i, j), rhs.coeff(i, j)) - } - } - return false; + return std::lexicographical_compare(lhs.data(), lhs.data() + lhs.size(), + rhs.data(), rhs.data() + rhs.size()); } #undef LESS_OR_GREATER @@ -302,7 +295,7 @@ class CDenseVector : public Eigen::Matrix { return result; } -private: + //! Convert from a std::vector. static CDenseVector fromStdVector(const std::vector& vector) { CDenseVector result(vector.size()); for (std::size_t i = 0; i < vector.size(); ++i) { diff --git a/include/maths/CLinearAlgebraShims.h b/include/maths/CLinearAlgebraShims.h index 914d128abf..cab1e3b8b1 100644 --- a/include/maths/CLinearAlgebraShims.h +++ b/include/maths/CLinearAlgebraShims.h @@ -20,6 +20,32 @@ namespace ml { namespace maths { namespace las { +//! Swap two vectors or matrices efficiently. +template +void swap(T& lhs, T& rhs) { + lhs.swap(rhs); +} + +//! Swap two stack vectors. +template +void swap(CVectorNx1& lhs, CVectorNx1& rhs) { + std::swap(lhs, rhs); +} + +//! Swap two annotated vectors and their annotations. 
+template +void swap(CAnnotatedVector& lhs, + CAnnotatedVector& rhs) { + swap(static_cast(lhs), static_cast(rhs)); + std::swap(lhs.annotation(), rhs.annotation()); +} + +//! Swap two stack matrices. +template +void swap(CSymmetricMatrixNxN& lhs, CSymmetricMatrixNxN& rhs) { + std::swap(lhs, rhs); +} + //! Get the dimension of one of our internal vectors. template std::size_t dimension(const VECTOR& x) { diff --git a/include/maths/CTools.h b/include/maths/CTools.h index d51f760564..0e0ed7c03b 100644 --- a/include/maths/CTools.h +++ b/include/maths/CTools.h @@ -12,6 +12,7 @@ #include #include +#include #include #include @@ -683,6 +684,31 @@ class MATHS_EXPORT CTools : private core::CNonInstantiatable { return sigmoid(std::exp(std::copysign(1.0, sign) * (x - x0) / width)); } + //! Compute the softmax from the multinomial logit values \p logit. + //! + //! i.e. \f$[\sigma(z)]_i = \frac{exp(z_i)}{\sum_j exp(z_j)}\f$. + //! + //! \tparam COLLECTION Is assumed to be a collection type, i.e. it + //! must support iterator based access. + template + static COLLECTION softmax(COLLECTION z) { + COLLECTION probabilities{std::move(z)}; + double Z{0.0}; + double zmax{*std::max_element(z.begin(), z.end())}; + for (auto& pi : probabilities) { + pi = std::exp(pi - zmax); + Z += pi; + } + for (auto& pi : probabilities) { + pi /= Z; + } + return probabilities; + } + + //! Specialize the softmax for our dense vector type. + template + static CDenseVector softmax(CDenseVector z); + //! Linearly interpolate a function on the interval [\p a, \p b]. static double linearlyInterpolate(double a, double b, double fa, double fb, double x); diff --git a/include/maths/CToolsDetail.h b/include/maths/CToolsDetail.h index 1224ee8a89..050d86822d 100644 --- a/include/maths/CToolsDetail.h +++ b/include/maths/CToolsDetail.h @@ -300,6 +300,13 @@ void CTools::spread(double a, double b, double separation, T& points) { LOG_TRACE(<< "# iterations = " << iteration << " # points = " << n + 1); } + +template +CDenseVector CTools::softmax(CDenseVector z) { + double zmax{z.maxCoeff()}; + z = (z.array() - zmax).exp(); + return z / z.template lpNorm<1>(); +} } } diff --git a/include/maths/CXMeansOnline.h b/include/maths/CXMeansOnline.h index 54337669f5..a409da6440 100644 --- a/include/maths/CXMeansOnline.h +++ b/include/maths/CXMeansOnline.h @@ -391,7 +391,7 @@ class CXMeansOnline : public CClusterer> { TKMeansOnline::kmeans(rng, node, 2, candidate); LOG_TRACE(<< "candidate = " << core::CContainerPrinter::print(candidate)); - if (candidate.size() != 2) { + if (candidate.size() > 2) { LOG_ERROR(<< "Expected 2-split: " << core::CContainerPrinter::print(candidate)); break; diff --git a/include/maths/Constants.h b/include/maths/Constants.h index 26b155459d..f15efe6941 100644 --- a/include/maths/Constants.h +++ b/include/maths/Constants.h @@ -25,62 +25,62 @@ namespace maths { //! the sample mean. However, it means we are insensitive anomalous //! deviations in data whose variation is significantly smaller than //! this minimum value. -const double MINIMUM_COEFFICIENT_OF_VARIATION{1e-4}; +constexpr double MINIMUM_COEFFICIENT_OF_VARIATION{1e-4}; //! The largest probability for which an event is considered anomalous //! enough to be worthwhile showing a user. -const double LARGEST_SIGNIFICANT_PROBABILITY{0.05}; +constexpr double LARGEST_SIGNIFICANT_PROBABILITY{0.05}; //! The largest probability that it is deemed significantly anomalous. -const double SMALL_PROBABILITY{1e-4}; +constexpr double SMALL_PROBABILITY{1e-4}; //! 
The largest probability that it is deemed extremely anomalous. //! Probabilities smaller than this are only weakly discriminated //! in the sense that they are given the correct order, but fairly //! similar score. -const double MINUSCULE_PROBABILITY{1e-50}; +constexpr double MINUSCULE_PROBABILITY{1e-50}; //! The margin between the smallest value and the support left end //! to use for the gamma distribution. -const double GAMMA_OFFSET_MARGIN{0.1}; +constexpr double GAMMA_OFFSET_MARGIN{0.1}; //! The margin between the smallest value and the support left end //! to use for the log-normal distribution. -const double LOG_NORMAL_OFFSET_MARGIN{1.0}; +constexpr double LOG_NORMAL_OFFSET_MARGIN{1.0}; //! The minimum amount by which a trend decomposition component can //! reduce the prediction error variance and still be worthwhile //! modeling. We have different thresholds because we have inductive //! bias for particular types of components. -const double COMPONENT_SIGNIFICANT_VARIANCE_REDUCTION[]{0.6, 0.4}; +constexpr double COMPONENT_SIGNIFICANT_VARIANCE_REDUCTION[]{0.6, 0.4}; //! The minimum repeated amplitude of a seasonal component, as a //! multiple of error standard deviation, to be worthwhile modeling. //! We have different thresholds because we have inductive bias for //! particular types of components. -const double SEASONAL_SIGNIFICANT_AMPLITUDE[]{1.0, 2.0}; +constexpr double SEASONAL_SIGNIFICANT_AMPLITUDE[]{1.0, 2.0}; //! The minimum autocorrelation of a seasonal component to be //! worthwhile modeling. We have different thresholds because we //! have inductive bias for particular types of components. -const double SEASONAL_SIGNIFICANT_AUTOCORRELATION[]{0.5, 0.6}; +constexpr double SEASONAL_SIGNIFICANT_AUTOCORRELATION[]{0.5, 0.6}; //! The fraction of values which are treated as outliers when testing //! for and initializing a seasonal component. -const double SEASONAL_OUTLIER_FRACTION{0.1}; +constexpr double SEASONAL_OUTLIER_FRACTION{0.1}; //! The minimum multiplier of the mean inlier fraction difference //! (from a periodic pattern) to constitute an outlier when testing //! for and initializing a seasonal component. -const double SEASONAL_OUTLIER_DIFFERENCE_THRESHOLD{3.0}; +constexpr double SEASONAL_OUTLIER_DIFFERENCE_THRESHOLD{3.0}; //! The weight to assign outliers when testing for and initializing //! a seasonal component. -const double SEASONAL_OUTLIER_WEIGHT{0.1}; +constexpr double SEASONAL_OUTLIER_WEIGHT{0.1}; //! The significance of a test statistic to choose to model //! a trend decomposition component. -const double COMPONENT_STATISTICALLY_SIGNIFICANT{0.001}; +constexpr double COMPONENT_STATISTICALLY_SIGNIFICANT{0.001}; //! The log of COMPONENT_STATISTICALLY_SIGNIFICANT. const double LOG_COMPONENT_STATISTICALLY_SIGNIFICANCE{ @@ -88,17 +88,17 @@ const double LOG_COMPONENT_STATISTICALLY_SIGNIFICANCE{ //! The default number of regression models used in periodic and //! calendar cyclic components of the trend decomposition. -const std::size_t COMPONENT_SIZE{36u}; +constexpr std::size_t COMPONENT_SIZE{36u}; //! The minimum variance scale for which the likelihood function //! can be accurately adjusted. For smaller scales errors are //! introduced for some priors. -const double MINIMUM_ACCURATE_VARIANCE_SCALE{0.5}; +constexpr double MINIMUM_ACCURATE_VARIANCE_SCALE{0.5}; //! The maximum variance scale for which the likelihood function //! can be accurately adjusted. For larger scales errors are //! introduced for some priors. 
-const double MAXIMUM_ACCURATE_VARIANCE_SCALE{2.0}; +constexpr double MAXIMUM_ACCURATE_VARIANCE_SCALE{2.0}; //! The confidence interval to use for the seasonal trend and //! variation. We detrend to the nearest point in the confidence @@ -106,16 +106,16 @@ const double MAXIMUM_ACCURATE_VARIANCE_SCALE{2.0}; //! scaling the likelihood function so that we don't get transient //! anomalies after detecting a periodic trend (when the trend //! can be in significant error). -const double DEFAULT_SEASONAL_CONFIDENCE_INTERVAL{50.0}; +constexpr double DEFAULT_SEASONAL_CONFIDENCE_INTERVAL{50.0}; //! The minimum fractional count of points in a cluster. -const double MINIMUM_CLUSTER_SPLIT_FRACTION{0.0}; +constexpr double MINIMUM_CLUSTER_SPLIT_FRACTION{0.0}; //! The default minimum count of points in a cluster. -const double MINIMUM_CLUSTER_SPLIT_COUNT{24.0}; +constexpr double MINIMUM_CLUSTER_SPLIT_COUNT{24.0}; //! The minimum count of a category in the sketch to cluster. -const double MINIMUM_CATEGORY_COUNT{0.5}; +constexpr double MINIMUM_CATEGORY_COUNT{0.5}; } } diff --git a/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc b/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc index 59892c311e..8777a9e439 100644 --- a/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc +++ b/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc @@ -77,7 +77,8 @@ CDataFrameTrainBoostedTreeClassifierRunner::CDataFrameTrainBoostedTreeClassifier const CDataFrameAnalysisSpecification& spec, const CDataFrameAnalysisParameters& parameters) : CDataFrameTrainBoostedTreeRunner{ - spec, parameters, std::make_unique()} { + spec, parameters, + std::make_unique()} { m_NumTopClasses = parameters[NUM_TOP_CLASSES].fallback(std::size_t{0}); m_PredictionFieldType = diff --git a/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc b/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc index 72d0e28be2..7b702b36e2 100644 --- a/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc @@ -236,7 +236,7 @@ void addPredictionTestData(EPredictionType type, if (type == E_Regression) { loss = std::make_unique(); } else { - loss = std::make_unique(); + loss = std::make_unique(); } maths::CBoostedTreeFactory treeFactory{ diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index f5819c335f..bfa25e413d 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -357,7 +357,8 @@ void CBoostedTreeImpl::initializePerFoldTestLosses() { } void CBoostedTreeImpl::computeProbabilityAtWhichToAssignClassOne(const core::CDataFrame& frame) { - if (m_Loss->name() == boosted_tree::CBinomialLogistic::NAME) { + // TODO generalize for multi-class. 
+ if (m_Loss->name() == boosted_tree::CBinomialLogisticLoss::NAME) { switch (m_ClassAssignmentObjective) { case CBoostedTree::E_Accuracy: break; @@ -911,7 +912,8 @@ void CBoostedTreeImpl::refreshPredictionsAndLossDerivatives(core::CDataFrame& fr using TArgMinLossVec = std::vector; TArgMinLossVec leafValues( - tree.size(), m_Loss->minimizer(m_Regularization.leafWeightPenaltyMultiplier())); + tree.size(), + m_Loss->minimizer(m_Regularization.leafWeightPenaltyMultiplier(), m_Rng)); auto nextPass = [&] { bool done{true}; for (const auto& value : leafValues) { diff --git a/lib/maths/CBoostedTreeLoss.cc b/lib/maths/CBoostedTreeLoss.cc index 1d7196efaa..45a967b8f8 100644 --- a/lib/maths/CBoostedTreeLoss.cc +++ b/lib/maths/CBoostedTreeLoss.cc @@ -6,8 +6,16 @@ #include +#include +#include +#include +#include +#include #include #include +#include + +#include namespace ml { namespace maths { @@ -31,6 +39,16 @@ double logLogistic(double logOdds) { } return CTools::fastLog(CTools::logisticFunction(logOdds)); } + +template +CDenseVector logSoftmax(CDenseVector z) { + // Version which handles overflow and underflow when taking exponentials. + double zmax{z.maxCoeff()}; + z = z - zmax * CDenseVector::Ones(z.size()); + double Z{z.array().exp().matrix().template lpNorm<1>()}; + z = z - std::log(Z) * CDenseVector::Ones(z.size()); + return std::move(z); +} } namespace boosted_tree_detail { @@ -82,31 +100,31 @@ CArgMinMseImpl::TDoubleVector CArgMinMseImpl::value() const { return result; } -CArgMinLogisticImpl::CArgMinLogisticImpl(double lambda) - : CArgMinLossImpl{lambda}, m_CategoryCounts{0}, - m_BucketCategoryCounts(128, TDoubleVector2x1{0.0}) { +CArgMinBinomialLogisticLossImpl::CArgMinBinomialLogisticLossImpl(double lambda) + : CArgMinLossImpl{lambda}, m_ClassCounts{0}, + m_BucketsClassCounts(NUMBER_BUCKETS, TDoubleVector2x1{0.0}) { } -std::unique_ptr CArgMinLogisticImpl::clone() const { - return std::make_unique(*this); +std::unique_ptr CArgMinBinomialLogisticLossImpl::clone() const { + return std::make_unique(*this); } -bool CArgMinLogisticImpl::nextPass() { +bool CArgMinBinomialLogisticLossImpl::nextPass() { m_CurrentPass += this->bucketWidth() > 0.0 ? 
1 : 2; return m_CurrentPass < 2; } -void CArgMinLogisticImpl::add(const TMemoryMappedFloatVector& prediction, - double actual, - double weight) { +void CArgMinBinomialLogisticLossImpl::add(const TMemoryMappedFloatVector& prediction, + double actual, + double weight) { switch (m_CurrentPass) { case 0: { m_PredictionMinMax.add(prediction(0)); - m_CategoryCounts(static_cast(actual)) += weight; + m_ClassCounts(static_cast(actual)) += weight; break; } case 1: { - auto& count = m_BucketCategoryCounts[this->bucket(prediction(0))]; + auto& count = m_BucketsClassCounts[this->bucket(prediction(0))]; count(static_cast(actual)) += weight; break; } @@ -115,17 +133,17 @@ void CArgMinLogisticImpl::add(const TMemoryMappedFloatVector& prediction, } } -void CArgMinLogisticImpl::merge(const CArgMinLossImpl& other) { - const auto* logistic = dynamic_cast(&other); +void CArgMinBinomialLogisticLossImpl::merge(const CArgMinLossImpl& other) { + const auto* logistic = dynamic_cast(&other); if (logistic != nullptr) { switch (m_CurrentPass) { case 0: m_PredictionMinMax += logistic->m_PredictionMinMax; - m_CategoryCounts += logistic->m_CategoryCounts; + m_ClassCounts += logistic->m_ClassCounts; break; case 1: - for (std::size_t i = 0; i < m_BucketCategoryCounts.size(); ++i) { - m_BucketCategoryCounts[i] += logistic->m_BucketCategoryCounts[i]; + for (std::size_t i = 0; i < m_BucketsClassCounts.size(); ++i) { + m_BucketsClassCounts[i] += logistic->m_BucketsClassCounts[i]; } break; default: @@ -134,7 +152,7 @@ void CArgMinLogisticImpl::merge(const CArgMinLossImpl& other) { } } -CArgMinLogisticImpl::TDoubleVector CArgMinLogisticImpl::value() const { +CArgMinBinomialLogisticLossImpl::TDoubleVector CArgMinBinomialLogisticLossImpl::value() const { std::function objective; double minWeight; @@ -151,16 +169,16 @@ CArgMinLogisticImpl::TDoubleVector CArgMinLogisticImpl::value() const { : 0.0}; objective = [prediction, this](double weight) { double logOdds{prediction + weight}; - double c0{m_CategoryCounts(0)}; - double c1{m_CategoryCounts(1)}; + double c0{m_ClassCounts(0)}; + double c1{m_ClassCounts(1)}; return this->lambda() * CTools::pow2(weight) - c0 * logOneMinusLogistic(logOdds) - c1 * logLogistic(logOdds); }; // Weight shrinkage means the optimal weight will be somewhere between // the logit of the empirical probability and zero. 
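        // The ridge penalty lambda * w^2 pulls the solution from the unregularised
        // optimum towards zero, so the minimum can be bracketed using the empirical
        // log-odds. Adding 1.0 to each class count is a Laplace-style correction
        // which keeps the empirical log-odds finite when one class is absent.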
- double c0{m_CategoryCounts(0) + 1.0}; - double c1{m_CategoryCounts(1) + 1.0}; + double c0{m_ClassCounts(0) + 1.0}; + double c1{m_ClassCounts(1) + 1.0}; double empiricalProbabilityC1{c1 / (c0 + c1)}; double empiricalLogOddsC1{ std::log(empiricalProbabilityC1 / (1.0 - empiricalProbabilityC1))}; @@ -170,10 +188,10 @@ CArgMinLogisticImpl::TDoubleVector CArgMinLogisticImpl::value() const { } else { objective = [this](double weight) { double loss{0.0}; - for (std::size_t i = 0; i < m_BucketCategoryCounts.size(); ++i) { + for (std::size_t i = 0; i < m_BucketsClassCounts.size(); ++i) { double logOdds{this->bucketCentre(i) + weight}; - double c0{m_BucketCategoryCounts[i](0)}; - double c1{m_BucketCategoryCounts[i](1)}; + double c0{m_BucketsClassCounts[i](0)}; + double c1{m_BucketsClassCounts[i](1)}; loss -= c0 * logOneMinusLogistic(logOdds) + c1 * logLogistic(logOdds); } return loss + this->lambda() * CTools::pow2(weight); @@ -205,6 +223,216 @@ CArgMinLogisticImpl::TDoubleVector CArgMinLogisticImpl::value() const { result(0) = minimum; return result; } + +CArgMinMultinomialLogisticLossImpl::CArgMinMultinomialLogisticLossImpl(std::size_t numberClasses, + double lambda, + const CPRNG::CXorOShiro128Plus& rng) + : CArgMinLossImpl{lambda}, m_NumberClasses{numberClasses}, m_Rng{rng}, + m_ClassCounts{TDoubleVector::Zero(numberClasses)}, + m_PredictionSketch{NUMBER_CENTRES / 2, // The size of the partition + 0.0, // The rate at which information is aged out (irrelevant) + 0.0, // The minimum permitted cluster size (irrelevant) + NUMBER_CENTRES / 2, // The buffer size + 1, // The number of seeds for k-means to try + 2} { // The number of iterations to use in k-means +} + +std::unique_ptr CArgMinMultinomialLogisticLossImpl::clone() const { + return std::make_unique(*this); +} + +bool CArgMinMultinomialLogisticLossImpl::nextPass() { + + using TMeanAccumulator = CBasicStatistics::SSampleMean::TAccumulator; + + if (m_CurrentPass++ == 0) { + TKMeans::TSphericalClusterVecVec clusters; + if (m_PredictionSketch.kmeans(NUMBER_CENTRES / 2, clusters) == false) { + m_Centres.push_back(TDoubleVector::Zero(m_NumberClasses)); + ++m_CurrentPass; + } else { + // Extract the k-centres. + m_Centres.reserve(clusters.size()); + for (const auto& cluster : clusters) { + TMeanAccumulator centre{TDoubleVector::Zero(m_NumberClasses)}; + for (const auto& point : cluster) { + centre.add(point); + } + m_Centres.push_back(CBasicStatistics::mean(centre)); + } + std::stable_sort(m_Centres.begin(), m_Centres.end()); + m_Centres.erase(std::unique(m_Centres.begin(), m_Centres.end()), + m_Centres.end()); + LOG_TRACE(<< "# centres = " << m_Centres.size()); + m_CurrentPass += m_Centres.size() == 1 ? 1 : 0; + m_CentresClassCounts.resize(m_Centres.size(), + TDoubleVector::Zero(m_NumberClasses)); + } + + // Reclaim the memory used by k-means. + m_PredictionSketch = TKMeans{0}; + } + + LOG_TRACE(<< "current pass = " << m_CurrentPass); + + return m_CurrentPass < 2; +} + +void CArgMinMultinomialLogisticLossImpl::add(const TMemoryMappedFloatVector& prediction, + double actual, + double weight) { + + using TMinAccumulator = CBasicStatistics::SMin>::TAccumulator; + + switch (m_CurrentPass) { + case 0: { + // We have a member variable to avoid allocating a tempory each time. 
+ m_DoublePrediction = prediction; + m_PredictionSketch.add(m_DoublePrediction, weight); + m_ClassCounts(static_cast(actual)) += weight; + break; + } + case 1: { + TMinAccumulator nearest; + for (std::size_t i = 0; i < m_Centres.size(); ++i) { + nearest.add({(m_Centres[i] - prediction).squaredNorm(), i}); + } + auto& count = m_CentresClassCounts[nearest[0].second]; + count(static_cast(actual)) += weight; + break; + } + default: + break; + } +} + +void CArgMinMultinomialLogisticLossImpl::merge(const CArgMinLossImpl& other) { + const auto* logistic = dynamic_cast(&other); + if (logistic != nullptr) { + switch (m_CurrentPass) { + case 0: + m_PredictionSketch.merge(logistic->m_PredictionSketch); + m_ClassCounts += logistic->m_ClassCounts; + break; + case 1: + for (std::size_t i = 0; i < m_CentresClassCounts.size(); ++i) { + m_CentresClassCounts[i] += logistic->m_CentresClassCounts[i]; + } + break; + default: + break; + } + } +} + +CArgMinMultinomialLogisticLossImpl::TDoubleVector +CArgMinMultinomialLogisticLossImpl::value() const { + + using TMinAccumulator = CBasicStatistics::SMin::TAccumulator; + + TDoubleVector weightBoundingBox[2]; + weightBoundingBox[0] = std::numeric_limits::max() * + TDoubleVector::Ones(m_NumberClasses); + weightBoundingBox[1] = -weightBoundingBox[0]; + + if (m_Centres.size() == 1) { + + // Weight shrinkage means the optimal weight will be somewhere between + // the logit of the empirical probability and zero. + TDoubleVector empiricalProbabilities{m_ClassCounts.array() + 0.1}; + empiricalProbabilities = empiricalProbabilities / + empiricalProbabilities.lpNorm<1>(); + TDoubleVector empiricalLogOdds{ + empiricalProbabilities.array().log().matrix() - m_Centres[0]}; + weightBoundingBox[0] = weightBoundingBox[0].array().min(0.0); + weightBoundingBox[1] = weightBoundingBox[1].array().max(0.0); + weightBoundingBox[0] = weightBoundingBox[0].array().min(empiricalLogOdds.array()); + weightBoundingBox[1] = weightBoundingBox[1].array().max(empiricalLogOdds.array()); + + } else { + + for (const auto& centre : m_Centres) { + weightBoundingBox[0] = weightBoundingBox[0].array().min(-centre.array()); + weightBoundingBox[1] = weightBoundingBox[1].array().max(-centre.array()); + } + } + LOG_TRACE(<< "bounding box blc = " << weightBoundingBox[0].transpose()); + LOG_TRACE(<< "bounding box trc = " << weightBoundingBox[1].transpose()); + + // Optimize via LBFGS with multiple restarts. 
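    // Each restart draws a starting point uniformly at random from the bounding
    // box computed above, runs a limited memory BFGS minimisation of the
    // approximate objective and keeps the weight achieving the smallest loss.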
+ + TMinAccumulator minLoss; + TDoubleVector result; + + TDoubleVector x0(m_NumberClasses); + TObjective objective{this->objective()}; + TObjectiveGradient objectiveGradient{this->objectiveGradient()}; + for (std::size_t i = 0; i < NUMBER_RESTARTS; ++i) { + for (int j = 0; j < x0.size(); ++j) { + double alpha{CSampling::uniformSample(m_Rng, 0.0, 1.0)}; + x0(j) = weightBoundingBox[0](j) + + alpha * (weightBoundingBox[1](j) - weightBoundingBox[0](j)); + } + LOG_TRACE(<< "x0 = " << x0.transpose()); + + double loss; + CLbfgs lgbfs{5}; + std::tie(x0, loss) = lgbfs.minimize(objective, objectiveGradient, std::move(x0)); + if (minLoss.add(loss)) { + result = x0; + } + LOG_TRACE(<< "loss = " << loss << " weight for loss = " << x0.transpose()); + } + LOG_TRACE(<< "minimum loss = " << minLoss << " weight* = " << result.transpose()); + + return result; +} + +CArgMinMultinomialLogisticLossImpl::TObjective +CArgMinMultinomialLogisticLossImpl::objective() const { + TDoubleVector logProbabilities; + double lambda{this->lambda()}; + if (m_Centres.size() == 1) { + return [logProbabilities, lambda, this](const TDoubleVector& weight) mutable { + logProbabilities = m_Centres[0] + weight; + logProbabilities = logSoftmax(std::move(logProbabilities)); + return lambda * weight.squaredNorm() - m_ClassCounts.transpose() * logProbabilities; + }; + } + return [logProbabilities, lambda, this](const TDoubleVector& weight) mutable { + double loss{0.0}; + for (std::size_t i = 0; i < m_CentresClassCounts.size(); ++i) { + logProbabilities = m_Centres[i] + weight; + logProbabilities = logSoftmax(std::move(logProbabilities)); + loss -= m_CentresClassCounts[i].transpose() * logProbabilities; + } + return loss + lambda * weight.squaredNorm(); + }; +} + +CArgMinMultinomialLogisticLossImpl::TObjectiveGradient +CArgMinMultinomialLogisticLossImpl::objectiveGradient() const { + TDoubleVector probabilities; + double lambda{this->lambda()}; + if (m_Centres.size() == 1) { + return [probabilities, lambda, this](const TDoubleVector& weight) mutable { + probabilities = m_Centres[0] + weight; + probabilities = CTools::softmax(std::move(probabilities)); + return TDoubleVector{2.0 * lambda * weight - + (m_ClassCounts - m_ClassCounts.array().sum() * probabilities)}; + }; + } + return [probabilities, lambda, this](const TDoubleVector& weight) mutable -> TDoubleVector { + TDoubleVector lossGradient{TDoubleVector::Zero(m_NumberClasses)}; + for (std::size_t i = 0; i < m_CentresClassCounts.size(); ++i) { + probabilities = m_Centres[i] + weight; + probabilities = CTools::softmax(std::move(probabilities)); + lossGradient -= m_CentresClassCounts[i] - + m_CentresClassCounts[i].array().sum() * probabilities; + } + return TDoubleVector{2.0 * lambda * weight + lossGradient}; + }; +} } namespace boosted_tree { @@ -277,7 +505,7 @@ CMse::TDoubleVector CMse::transform(const TMemoryMappedFloatVector& prediction) return TDoubleVector{prediction}; } -CArgMinLoss CMse::minimizer(double lambda) const { +CArgMinLoss CMse::minimizer(double lambda, const CPRNG::CXorOShiro128Plus&) const { return this->makeMinimizer(CArgMinMseImpl{lambda}); } @@ -287,25 +515,25 @@ const std::string& CMse::name() const { const std::string CMse::NAME{"mse"}; -std::unique_ptr CBinomialLogistic::clone() const { - return std::make_unique(*this); +std::unique_ptr CBinomialLogisticLoss::clone() const { + return std::make_unique(*this); } -std::size_t CBinomialLogistic::numberParameters() const { +std::size_t CBinomialLogisticLoss::numberParameters() const { return 1; } -double 
CBinomialLogistic::value(const TMemoryMappedFloatVector& prediction, - double actual, - double weight) const { +double CBinomialLogisticLoss::value(const TMemoryMappedFloatVector& prediction, + double actual, + double weight) const { return -weight * ((1.0 - actual) * logOneMinusLogistic(prediction(0)) + actual * logLogistic(prediction(0))); } -void CBinomialLogistic::gradient(const TMemoryMappedFloatVector& prediction, - double actual, - TWriter writer, - double weight) const { +void CBinomialLogisticLoss::gradient(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight) const { if (prediction(0) > -LOG_EPSILON && actual == 1.0) { writer(0, -weight * std::exp(-prediction(0))); } else { @@ -313,10 +541,10 @@ void CBinomialLogistic::gradient(const TMemoryMappedFloatVector& prediction, } } -void CBinomialLogistic::curvature(const TMemoryMappedFloatVector& prediction, - double /*actual*/, - TWriter writer, - double weight) const { +void CBinomialLogisticLoss::curvature(const TMemoryMappedFloatVector& prediction, + double /*actual*/, + TWriter writer, + double weight) const { if (prediction(0) > -LOG_EPSILON) { writer(0, weight * std::exp(-prediction(0))); } else { @@ -325,26 +553,133 @@ void CBinomialLogistic::curvature(const TMemoryMappedFloatVector& prediction, } } -bool CBinomialLogistic::isCurvatureConstant() const { +bool CBinomialLogisticLoss::isCurvatureConstant() const { return false; } -CBinomialLogistic::TDoubleVector -CBinomialLogistic::transform(const TMemoryMappedFloatVector& prediction) const { +CBinomialLogisticLoss::TDoubleVector +CBinomialLogisticLoss::transform(const TMemoryMappedFloatVector& prediction) const { TDoubleVector result{prediction}; result(0) = CTools::logisticFunction(result(0)); return result; } -CArgMinLoss CBinomialLogistic::minimizer(double lambda) const { - return this->makeMinimizer(CArgMinLogisticImpl{lambda}); +CArgMinLoss CBinomialLogisticLoss::minimizer(double lambda, + const CPRNG::CXorOShiro128Plus&) const { + return this->makeMinimizer(CArgMinBinomialLogisticLossImpl{lambda}); +} + +const std::string& CBinomialLogisticLoss::name() const { + return NAME; +} + +const std::string CBinomialLogisticLoss::NAME{"binomial_logistic"}; + +CMultinomialLogisticLoss::CMultinomialLogisticLoss(std::size_t numberClasses) + : m_NumberClasses{numberClasses} { +} + +std::unique_ptr CMultinomialLogisticLoss::clone() const { + return std::make_unique(m_NumberClasses); +} + +std::size_t CMultinomialLogisticLoss::numberParameters() const { + return m_NumberClasses; +} + +double CMultinomialLogisticLoss::value(const TMemoryMappedFloatVector& predictions, + double actual, + double weight) const { + double zmax{predictions.maxCoeff()}; + double logZ{0.0}; + for (int i = 0; i < predictions.size(); ++i) { + logZ += std::exp(predictions(i) - zmax); + } + logZ = zmax + CTools::fastLog(logZ); + + // i.e. -log(z(actual)) + return weight * (logZ - predictions(static_cast(actual))); +} + +void CMultinomialLogisticLoss::gradient(const TMemoryMappedFloatVector& predictions, + double actual, + TWriter writer, + double weight) const { + + // We prefer an implementation which avoids any memory allocations. 
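+    //
+    // The normalising constant is computed with the log-sum-exp trick, i.e.
+    //   log(sum_j{exp(p_j)}) = max_k{p_k} + log(sum_j{exp(p_j - max_k{p_k})})
+    // which avoids overflowing the exponentials, and the gradient of the loss
+    // w.r.t. the i'th logit is then softmax(p)_i - 1{i == actual}.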
+ + double zmax{predictions.maxCoeff()}; + double logZ{0.0}; + for (int i = 0; i < predictions.size(); ++i) { + logZ += std::exp(predictions(i) - zmax); + } + logZ = zmax + CTools::fastLog(logZ); + + for (int i = 0; i < predictions.size(); ++i) { + if (i == static_cast(actual)) { + if (predictions(i) - logZ > -LOG_EPSILON) { + writer(i, -weight * std::exp(-(predictions(i) - logZ))); + } else { + writer(i, weight * (std::exp(predictions(i) - logZ) - 1.0)); + } + } else { + writer(i, weight * std::exp(predictions(i) - logZ)); + } + } +} + +void CMultinomialLogisticLoss::curvature(const TMemoryMappedFloatVector& predictions, + double /*actual*/, + TWriter writer, + double weight) const { + + // Return the lower triangle of the Hessian column major. + + // We prefer an implementation which avoids any memory allocations. + + double zmax{predictions.maxCoeff()}; + double logZ{0.0}; + for (int i = 0; i < predictions.size(); ++i) { + logZ += std::exp(predictions(i) - zmax); + } + logZ = zmax + CTools::fastLog(logZ); + + for (std::size_t i = 0, k = 0; i < m_NumberClasses; ++i) { + if (predictions(i) - logZ > -LOG_EPSILON) { + writer(i, weight * std::exp(-(predictions(i) - logZ))); + } else { + double probability{std::exp(predictions(i) - logZ)}; + writer(i, weight * weight * probability * (1.0 - probability)); + } + for (std::size_t j = i + 1; j < m_NumberClasses; ++j, ++k) { + double probabilities[]{std::exp(predictions(i) - logZ), + std::exp(predictions(j) - logZ)}; + writer(k, -weight * probabilities[0] * probabilities[1]); + } + } +} + +bool CMultinomialLogisticLoss::isCurvatureConstant() const { + return false; +} + +CMultinomialLogisticLoss::TDoubleVector +CMultinomialLogisticLoss::transform(const TMemoryMappedFloatVector& prediction) const { + TDoubleVector result{prediction}; + return CTools::softmax(std::move(result)); +} + +CArgMinLoss CMultinomialLogisticLoss::minimizer(double lambda, + const CPRNG::CXorOShiro128Plus& rng) const { + return this->makeMinimizer( + CArgMinMultinomialLogisticLossImpl{m_NumberClasses, lambda, rng}); } -const std::string& CBinomialLogistic::name() const { +const std::string& CMultinomialLogisticLoss::name() const { return NAME; } -const std::string CBinomialLogistic::NAME{"binomial_logistic"}; +const std::string CMultinomialLogisticLoss::NAME{"multinomial_logistic"}; } } } diff --git a/lib/maths/unittest/CBoostedTreeLossTest.cc b/lib/maths/unittest/CBoostedTreeLossTest.cc new file mode 100644 index 0000000000..c1e03da96c --- /dev/null +++ b/lib/maths/unittest/CBoostedTreeLossTest.cc @@ -0,0 +1,678 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +#include "core/CContainerPrinter.h" +#include "maths/CLbfgs.h" +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include + +BOOST_AUTO_TEST_SUITE(CBoostedTreeLossTest) + +using namespace ml; +using TDoubleVec = std::vector; +using TDoubleVecVec = std::vector; +using TDoubleVector = maths::boosted_tree::CLoss::TDoubleVector; +using TDoubleVectorVec = std::vector; +using TMemoryMappedFloatVector = maths::boosted_tree::CLoss::TMemoryMappedFloatVector; +using maths::boosted_tree::CBinomialLogisticLoss; +using maths::boosted_tree::CMultinomialLogisticLoss; +using maths::boosted_tree_detail::CArgMinBinomialLogisticLossImpl; +using maths::boosted_tree_detail::CArgMinMultinomialLogisticLossImpl; + +namespace { +void minimizeGridSearch(std::function objective, + double scale, + int d, + TDoubleVector& x, + double& min, + TDoubleVector& argmin) { + if (d == x.size()) { + double value{objective(x)}; + if (value < min) { + min = value; + argmin = x; + } + return; + } + for (double xd : {-0.5, -0.4, -0.3, -0.2, -0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5}) { + x(d) = scale * xd; + minimizeGridSearch(objective, scale, d + 1, x, min, argmin); + } +} +} + +BOOST_AUTO_TEST_CASE(testBinomialLogisticMinimizerEdgeCases) { + + // All predictions equal and zero. + { + CArgMinBinomialLogisticLossImpl argmin{0.0}; + maths::CFloatStorage storage[]{0.0}; + TMemoryMappedFloatVector prediction{storage, 1}; + argmin.add(prediction, 0.0); + argmin.add(prediction, 1.0); + argmin.add(prediction, 1.0); + argmin.add(prediction, 0.0); + BOOST_REQUIRE_EQUAL(false, argmin.nextPass()); + BOOST_REQUIRE_EQUAL(0.0, argmin.value()[0]); + } + + // All predictions are equal and 0.5. + { + test::CRandomNumbers rng; + + TDoubleVec labels; + rng.generateUniformSamples(0.0, 1.0, 1000, labels); + for (auto& label : labels) { + label = std::floor(label + 0.3); + } + + CArgMinBinomialLogisticLossImpl argmin{0.0}; + std::size_t numberPasses{0}; + std::size_t counts[]{0, 0}; + + do { + ++numberPasses; + for (std::size_t i = 0; i < labels.size(); ++i) { + maths::CFloatStorage storage[]{0.5}; + TMemoryMappedFloatVector prediction{storage, 1}; + argmin.add(prediction, labels[i]); + ++counts[static_cast(labels[i])]; + } + } while (argmin.nextPass()); + + double p{static_cast(counts[1]) / 1000.0}; + double expected{std::log(p / (1.0 - p)) - 0.5}; + double actual{argmin.value()[0]}; + + BOOST_REQUIRE_EQUAL(std::size_t{1}, numberPasses); + BOOST_REQUIRE_CLOSE_ABSOLUTE(expected, actual, 0.01 * std::fabs(expected)); + } + + // Test underflow of probabilities. + { + CArgMinBinomialLogisticLossImpl argmin{0.0}; + + TDoubleVec predictions{-500.0, -30.0, -15.0, -400.0}; + TDoubleVec labels{1.0, 1.0, 0.0, 1.0}; + do { + for (std::size_t i = 0; i < predictions.size(); ++i) { + maths::CFloatStorage storage[]{predictions[i]}; + TMemoryMappedFloatVector prediction{storage, 1}; + argmin.add(prediction, labels[i]); + } + } while (argmin.nextPass()); + + double minimizer{argmin.value()[0]}; + + // Check we're at the minimum. 
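+        // We do this by comparing the total loss at the returned weight with
+        // the total loss at offsets of +/-10 from it and checking that neither
+        // perturbation decreases the loss.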
+ CBinomialLogisticLoss loss; + TDoubleVec losses; + for (double eps : {-10.0, 0.0, 10.0}) { + double lossAtEps{0.0}; + for (std::size_t i = 0; i < predictions.size(); ++i) { + maths::CFloatStorage storage[]{predictions[i] + minimizer + eps}; + TMemoryMappedFloatVector probe{storage, 1}; + lossAtEps += loss.value(probe, labels[i]); + } + losses.push_back(lossAtEps); + } + BOOST_TEST_REQUIRE(losses[0] >= losses[1]); + BOOST_TEST_REQUIRE(losses[2] >= losses[1]); + } +} + +BOOST_AUTO_TEST_CASE(testBinomialLogisticMinimizerRandom) { + + // Test that we a good approximation of the additive term for the log-odds + // which minimises the exact cross entropy objective. + + test::CRandomNumbers rng; + + TDoubleVec labels; + TDoubleVec predictions; + + for (auto lambda : {0.0, 10.0}) { + + LOG_DEBUG(<< "lambda = " << lambda); + + // The exact objective. + auto exactObjective = [&](double weight) { + double loss{0.0}; + for (std::size_t i = 0; i < labels.size(); ++i) { + double p{maths::CTools::logisticFunction(predictions[i] + weight)}; + loss -= (1.0 - labels[i]) * maths::CTools::fastLog(1.0 - p) + + labels[i] * maths::CTools::fastLog(p); + } + return loss + lambda * maths::CTools::pow2(weight); + }; + + // This loop is fuzzing the predicted log-odds and testing we get consistently + // good estimates of the true minimizer. + for (std::size_t t = 0; t < 10; ++t) { + + double min{std::numeric_limits::max()}; + double max{-min}; + + rng.generateUniformSamples(0.0, 1.0, 1000, labels); + for (auto& label : labels) { + label = std::floor(label + 0.5); + } + predictions.clear(); + for (const auto& label : labels) { + TDoubleVec weight; + rng.generateNormalSamples(label, 2.0, 1, weight); + predictions.push_back(weight[0]); + min = std::min(min, weight[0]); + max = std::max(max, weight[0]); + } + + double expected; + double objectiveAtExpected; + std::size_t maxIterations{20}; + maths::CSolvers::minimize(-max, -min, exactObjective(-max), + exactObjective(-min), exactObjective, 1e-3, + maxIterations, expected, objectiveAtExpected); + LOG_DEBUG(<< "expected = " << expected + << " objective(expected) = " << objectiveAtExpected); + + CArgMinBinomialLogisticLossImpl argmin{lambda}; + CArgMinBinomialLogisticLossImpl argminPartition[2]{{lambda}, {lambda}}; + auto nextPass = [&] { + bool done{argmin.nextPass() == false}; + done &= (argminPartition[0].nextPass() == false); + done &= (argminPartition[1].nextPass() == false); + return done == false; + }; + + do { + for (std::size_t i = 0; i < labels.size() / 2; ++i) { + maths::CFloatStorage storage[]{predictions[i]}; + TMemoryMappedFloatVector prediction{storage, 1}; + argmin.add(prediction, labels[i]); + argminPartition[0].add(prediction, labels[i]); + } + for (std::size_t i = labels.size() / 2; i < labels.size(); ++i) { + maths::CFloatStorage storage[]{predictions[i]}; + TMemoryMappedFloatVector prediction{storage, 1}; + argmin.add(prediction, labels[i]); + argminPartition[1].add(prediction, labels[i]); + } + argminPartition[0].merge(argminPartition[1]); + argminPartition[1] = argminPartition[0]; + } while (nextPass()); + + double actual{argmin.value()(0)}; + double actualPartition{argminPartition[0].value()(0)}; + double objectiveAtActual{exactObjective(actual)}; + LOG_DEBUG(<< "actual = " << actual << " objective(actual) = " << objectiveAtActual); + + // We should be within 1% for the value and 0.001% for the objective + // at the value. 
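+            // The partitioned minimizers see the same data split in two and
+            // are merged after every pass so they should produce exactly the
+            // same weight as the single minimizer.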
+ BOOST_REQUIRE_EQUAL(actual, actualPartition); + BOOST_REQUIRE_CLOSE_ABSOLUTE(expected, actual, 0.01 * std::fabs(expected)); + BOOST_REQUIRE_CLOSE_ABSOLUTE(objectiveAtExpected, objectiveAtActual, + 1e-5 * objectiveAtExpected); + } + } +} + +BOOST_AUTO_TEST_CASE(testBinomialLogisticLossForUnderflow) { + + // Test the behaviour of value, gradient and curvature of the logistic loss in + // the vicinity the point at which we switch to using Taylor expansion of the + // logistic function is as expected. + + double eps{100.0 * std::numeric_limits::epsilon()}; + + CBinomialLogisticLoss loss; + + // Losses should be very nearly linear function of log-odds when they're large. + { + maths::CFloatStorage storage[]{1.0 - std::log(eps), 1.0 + std::log(eps)}; + TMemoryMappedFloatVector predictions[]{{&storage[0], 1}, {&storage[1], 1}}; + TDoubleVec previousLoss{loss.value(predictions[0], 0.0), + loss.value(predictions[1], 1.0)}; + LOG_DEBUG(<< core::CContainerPrinter::print(previousLoss)); + for (double scale : {0.75, 0.5, 0.25, 0.0, -0.25, -0.5, -0.75, -1.0}) { + storage[0] = scale - std::log(eps); + storage[1] = scale + std::log(eps); + TDoubleVec currentLoss{loss.value(predictions[0], 0.0), + loss.value(predictions[1], 1.0)}; + BOOST_REQUIRE_CLOSE_ABSOLUTE(0.25, previousLoss[0] - currentLoss[0], 0.005); + BOOST_REQUIRE_CLOSE_ABSOLUTE(-0.25, previousLoss[1] - currentLoss[1], 0.005); + previousLoss = currentLoss; + } + } + + // The gradient and curvature should be proportional to the exponential of the + // log-odds when they're small. + { + auto readDerivatives = [&](double prediction, TDoubleVec& gradients, + TDoubleVec& curvatures) { + maths::CFloatStorage storage[]{prediction + std::log(eps), + prediction - std::log(eps)}; + TMemoryMappedFloatVector predictions[]{{&storage[0], 1}, {&storage[1], 1}}; + loss.gradient(predictions[0], 0.0, [&](std::size_t, double value) { + gradients[0] = value; + }); + loss.gradient(predictions[1], 1.0, [&](std::size_t, double value) { + gradients[1] = value; + }); + loss.curvature(predictions[0], 0.0, [&](std::size_t, double value) { + curvatures[0] = value; + }); + loss.curvature(predictions[1], 1.0, [&](std::size_t, double value) { + curvatures[1] = value; + }); + }; + + TDoubleVec previousGradient(2); + TDoubleVec previousCurvature(2); + readDerivatives(1.0, previousGradient, previousCurvature); + + for (double scale : {0.75, 0.5, 0.25, 0.0, -0.25, -0.5, -0.75, -1.0}) { + TDoubleVec currentGradient(2); + TDoubleVec currentCurvature(2); + readDerivatives(scale, currentGradient, currentCurvature); + BOOST_REQUIRE_CLOSE_ABSOLUTE( + std::exp(0.25), previousGradient[0] / currentGradient[0], 0.01); + BOOST_REQUIRE_CLOSE_ABSOLUTE( + std::exp(-0.25), previousGradient[1] / currentGradient[1], 0.01); + BOOST_REQUIRE_CLOSE_ABSOLUTE( + std::exp(0.25), previousCurvature[0] / currentCurvature[0], 0.01); + BOOST_REQUIRE_CLOSE_ABSOLUTE( + std::exp(-0.25), previousCurvature[1] / currentCurvature[1], 0.01); + previousGradient = currentGradient; + previousCurvature = currentCurvature; + } + } +} + +BOOST_AUTO_TEST_CASE(testMultinomialLogisticGradient) { + + // Test that the gradient function is close to the numerical derivative + // of the objective. 
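+    //
+    // We compare the analytic gradient with a central finite difference
+    // estimate of the objective, i.e. component-wise
+    //   df/dw_i ~ (f(w + eps e_i) - f(w - eps e_i)) / (2 eps)
+    // at a collection of random probe points.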
+ + maths::CPRNG::CXorOShiro128Plus rng; + test::CRandomNumbers testRng; + + for (std::size_t t = 0; t < 10; ++t) { + + CArgMinMultinomialLogisticLossImpl argmin{3, 0.1 * static_cast(t + 1), rng}; + + TDoubleVec labels; + testRng.generateUniformSamples(0.0, 3.0, 20, labels); + + TDoubleVec predictions; + if (t % 2 == 0) { + predictions.resize(3 * labels.size(), 0.0); + } else { + testRng.generateUniformSamples(-1.0, 1.0, 3 * labels.size(), predictions); + } + + do { + for (std::size_t i = 0; i < labels.size(); ++i) { + maths::CFloatStorage storage[]{predictions[3 * i + 0], + predictions[3 * i + 1], + predictions[3 * i + 2]}; + TMemoryMappedFloatVector prediction{storage, 3}; + argmin.add(prediction, std::floor(labels[i])); + } + } while (argmin.nextPass()); + + auto objective = argmin.objective(); + auto objectiveGradient = argmin.objectiveGradient(); + + double eps{1e-3}; + TDoubleVec probes; + testRng.generateUniformSamples(-1.0, 1.0, 30, probes); + for (std::size_t i = 0; i < probes.size(); i += 3) { + TDoubleVector probe{3}; + probe(0) = probes[i]; + probe(1) = probes[i + 1]; + probe(2) = probes[i + 2]; + + TDoubleVector expectedGradient{3}; + for (std::size_t j = 0; j < 3; ++j) { + TDoubleVector shift{TDoubleVector::Zero(3)}; + shift(j) = eps; + expectedGradient(j) = + (objective(probe + shift) - objective(probe - shift)) / (2.0 * eps); + } + TDoubleVector actualGradient{objectiveGradient(probe)}; + + BOOST_REQUIRE_SMALL((expectedGradient - actualGradient).norm(), eps); + } + } +} + +BOOST_AUTO_TEST_CASE(testMultinomialLogisticMinimizerEdgeCases) { + + maths::CPRNG::CXorOShiro128Plus rng; + test::CRandomNumbers testRng; + + // All predictions equal and zero. + { + CArgMinMultinomialLogisticLossImpl argmin{3, 0.0, rng}; + + maths::CFloatStorage storage[]{0.0, 0.0, 0.0}; + TMemoryMappedFloatVector prediction{storage, 3}; + argmin.add(prediction, 0.0); + argmin.add(prediction, 1.0); + argmin.add(prediction, 1.0); + argmin.add(prediction, 0.0); + argmin.add(prediction, 2.0); + argmin.add(prediction, 1.0); + BOOST_REQUIRE_EQUAL(false, argmin.nextPass()); + + TDoubleVector expectedProbabilities{3}; + expectedProbabilities(0) = 2.0 / 6.0; + expectedProbabilities(1) = 3.0 / 6.0; + expectedProbabilities(2) = 1.0 / 6.0; + TDoubleVector actualProbabilities{maths::CTools::softmax(argmin.value())}; + + BOOST_REQUIRE_SMALL((actualProbabilities - expectedProbabilities).norm(), 1e-3); + } + + // All predictions are equal and 0.5. 
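+    //
+    // With identical predictions and no regularisation the minimizer should
+    // shift the logits so that softmax(prediction + weight) matches the
+    // empirical class frequencies.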
+ for (std::size_t t = 0; t < 10; ++t) { + + TDoubleVec labels; + testRng.generateUniformSamples(0.0, 2.0, 20, labels); + for (auto& label : labels) { + label = std::floor(label + 0.3); + } + + CArgMinMultinomialLogisticLossImpl argmin{3, 0.0, rng}; + + std::size_t numberPasses{0}; + std::size_t counts[]{0, 0, 0}; + maths::CFloatStorage storage[]{0.5, 0.5, 0.5}; + TMemoryMappedFloatVector prediction{storage, 3}; + + do { + ++numberPasses; + for (const auto& label : labels) { + argmin.add(prediction, label); + ++counts[static_cast(label)]; + } + } while (argmin.nextPass()); + + BOOST_REQUIRE_EQUAL(std::size_t{1}, numberPasses); + + TDoubleVector expectedProbabilities{3}; + expectedProbabilities(0) = static_cast(counts[0]) / 20.0; + expectedProbabilities(1) = static_cast(counts[1]) / 20.0; + expectedProbabilities(2) = static_cast(counts[2]) / 20.0; + TDoubleVector actualLogit{prediction + argmin.value()}; + TDoubleVector actualProbabilities{maths::CTools::softmax(actualLogit)}; + + BOOST_REQUIRE_SMALL((actualProbabilities - expectedProbabilities).norm(), 0.001); + } + + // Test underflow of probabilities. + LOG_DEBUG(<< "Test underflow"); + { + CArgMinMultinomialLogisticLossImpl argmin{3, 0.0, rng}; + + TDoubleVecVec predictions{{-230.0, -200.0, -200.0}, + {-30.0, -10.0, -20.0}, + {-15.0, -50.0, -30.0}, + {-400.0, -350.0, -300.0}}; + TDoubleVec labels{1.0, 1.0, 0.0, 2.0}; + do { + for (std::size_t i = 0; i < predictions.size(); ++i) { + maths::CFloatStorage storage[]{predictions[i][0], predictions[i][1], + predictions[i][2]}; + TMemoryMappedFloatVector prediction{storage, 3}; + argmin.add(prediction, labels[i]); + } + } while (argmin.nextPass()); + + TDoubleVector minimizer{argmin.value()}; + + // Check we're at a minimum. + CMultinomialLogisticLoss loss{3}; + for (std::size_t i = 0; i < 3; ++i) { + TDoubleVec losses; + for (double eps : {-30.0, 0.0, 30.0}) { + double lossAtEps{0.0}; + for (std::size_t j = 0; j < predictions.size(); ++j) { + maths::CFloatStorage storage[]{predictions[j][0] + minimizer(0), + predictions[j][1] + minimizer(1), + predictions[j][2] + minimizer(2)}; + storage[i] += eps; + TMemoryMappedFloatVector probe{storage, 3}; + lossAtEps += loss.value(probe, labels[j]); + } + losses.push_back(lossAtEps); + } + LOG_DEBUG(<< core::CContainerPrinter::print(losses)); + BOOST_TEST_REQUIRE(losses[0] >= losses[1]); + BOOST_TEST_REQUIRE(losses[2] >= losses[1]); + } + } + + // All labels equal. + { + CArgMinMultinomialLogisticLossImpl argmin{3, 0.0, rng}; + + maths::CFloatStorage storage[]{0.0, 0.0, 0.0}; + TMemoryMappedFloatVector prediction{storage, 3}; + TDoubleVec labels{1.0, 1.0, 1.0, 1.0}; + + do { + for (const auto& label : labels) { + argmin.add(prediction, label); + } + } while (argmin.nextPass()); + + TDoubleVector minimizer{argmin.value()}; + + double totalLoss{0.0}; + CMultinomialLogisticLoss loss{3}; + for (const auto& label : labels) { + maths::CFloatStorage probeStorage[]{prediction(0) + minimizer(0), + prediction(1) + minimizer(1), + prediction(2) + minimizer(2)}; + TMemoryMappedFloatVector probe{probeStorage, 3}; + totalLoss += loss.value(probe, label); + } + + BOOST_REQUIRE_SMALL(totalLoss, 1e-3); + } +} + +BOOST_AUTO_TEST_CASE(testMultinomialLogisticMinimizerRandom) { + + // Test that we have a good approximation of the additive term for the log-odds + // which minimises the exact cross entropy objective. 
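+    //
+    // We use a brute force grid search over candidate weights as the baseline
+    // and require that the objective at the weight we find is no more than 1%
+    // larger than the smallest value found by the grid search.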
+ + maths::CPRNG::CXorOShiro128Plus rng; + test::CRandomNumbers testRng; + + TDoubleVec labels; + TDoubleVectorVec predictions; + + double scales[]{6.0, 1.0}; + double lambdas[]{0.0, 10.0}; + + for (auto i : {0, 1}) { + + double lambda{lambdas[i]}; + + LOG_DEBUG(<< "lambda = " << lambda); + + // The exact objective. + auto exactObjective = [&](const TDoubleVector& weight) { + double loss{0.0}; + for (std::size_t j = 0; j < labels.size(); ++j) { + TDoubleVector probabilities{predictions[j] + weight}; + probabilities = maths::CTools::softmax(std::move(probabilities)); + loss -= maths::CTools::fastLog(probabilities(static_cast(labels[j]))); + } + return loss + lambda * weight.squaredNorm(); + }; + + // This loop is fuzzing the predicted log-odds and testing we get consistently + // good estimates of the true minimizer. + + double sumObjectiveGridSearch{0.0}; + double sumObjectiveAtActual{0.0}; + + for (std::size_t t = 0; t < 10; ++t) { + + testRng.generateUniformSamples(0.0, 2.0, 500, labels); + for (auto& label : labels) { + label = std::floor(label + 0.5); + } + + predictions.clear(); + for (const auto& label : labels) { + TDoubleVec prediction; + testRng.generateNormalSamples(0.0, 2.0, 3, prediction); + prediction[static_cast(label)] += 1.0; + predictions.push_back(TDoubleVector::fromStdVector(prediction)); + } + + TDoubleVector weight{3}; + double objectiveGridSearch{std::numeric_limits::max()}; + TDoubleVector argminGridSearch; + minimizeGridSearch(exactObjective, scales[i], 0, weight, + objectiveGridSearch, argminGridSearch); + LOG_DEBUG(<< "argmin grid search = " << argminGridSearch.transpose() + << ", min objective grid search = " << objectiveGridSearch); + + std::size_t numberPasses{0}; + maths::CFloatStorage storage[]{0.0, 0.0, 0.0}; + TMemoryMappedFloatVector prediction{storage, 3}; + + CArgMinMultinomialLogisticLossImpl argmin{3, lambda, rng}; + + do { + ++numberPasses; + for (std::size_t j = 0; j < labels.size(); ++j) { + storage[0] = predictions[j](0); + storage[1] = predictions[j](1); + storage[2] = predictions[j](2); + argmin.add(prediction, labels[j]); + } + } while (argmin.nextPass()); + + BOOST_REQUIRE_EQUAL(std::size_t{2}, numberPasses); + + TDoubleVector actual{argmin.value()}; + double objectiveAtActual{exactObjective(actual)}; + LOG_DEBUG(<< "actual = " << actual.transpose() + << ", objective(actual) = " << objectiveAtActual); + + BOOST_TEST_REQUIRE(objectiveAtActual < 1.01 * objectiveGridSearch); + + sumObjectiveGridSearch += objectiveGridSearch; + sumObjectiveAtActual += objectiveAtActual; + } + + LOG_DEBUG(<< "sum min objective grid search = " << sumObjectiveGridSearch); + LOG_DEBUG(<< "sum objective(actual) = " << sumObjectiveAtActual); + BOOST_TEST_REQUIRE(sumObjectiveAtActual < sumObjectiveGridSearch); + } +} + +BOOST_AUTO_TEST_CASE(testMultinomialLogisticLossForUnderflow) { + + // Test the behaviour of value, gradient and Hessian of the logistic loss in + // the regime where the probabilities underflow. + + using TFloatVec = std::vector; + + double eps{std::numeric_limits::epsilon()}; + + auto logits = [](double x, TFloatVec& result) { result.assign({0.0, x}); }; + + CMultinomialLogisticLoss loss{2}; + + // Losses should be very nearly linear function of log-odds when they're large. 
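+    //
+    // In this regime the loss is approximately the gap between the largest
+    // logit and the actual class's logit, so stepping the scale by 0.25 should
+    // change each loss by very nearly 0.25.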
+ { + TFloatVec storage[2]; + logits(1.0 - std::log(eps), storage[0]); + logits(1.0 + std::log(eps), storage[1]); + + TMemoryMappedFloatVector predictions[]{{&storage[0][0], 2}, {&storage[1][0], 2}}; + TDoubleVec previousLoss{loss.value(predictions[0], 0.0), + loss.value(predictions[1], 1.0)}; + + for (double scale : {0.75, 0.5, 0.25, 0.0, -0.25, -0.5, -0.75, -1.0}) { + logits(scale - std::log(eps), storage[0]); + logits(scale + std::log(eps), storage[1]); + TDoubleVec currentLoss{loss.value(predictions[0], 0.0), + loss.value(predictions[1], 1.0)}; + BOOST_REQUIRE_CLOSE_ABSOLUTE(0.25, previousLoss[0] - currentLoss[0], 0.005); + BOOST_REQUIRE_CLOSE_ABSOLUTE(-0.25, previousLoss[1] - currentLoss[1], 0.005); + previousLoss = currentLoss; + } + } + + // The gradient and curvature should be proportional to the exponential of the + // log-odds when they're small. + /* TODO { + auto readDerivatives = [&](double prediction, TDoubleVecVec& gradients, + TDoubleVecVec& curvatures) { + TFloatVec storage[2]; + logits(prediction + std::log(eps), storage[0]); + logits(prediction - std::log(eps), storage[1]); + TMemoryMappedFloatVector predictions[]{{&storage[0][0], 2}, + {&storage[1][0], 2}}; + loss.gradient(predictions[0], 0.0, [&](std::size_t i, double value) { + gradients[0][i] = value; + }); + loss.gradient(predictions[1], 1.0, [&](std::size_t i, double value) { + gradients[1][i] = value; + }); + loss.curvature(predictions[0], 0.0, [&](std::size_t i, double value) { + curvatures[0][i] = value; + }); + loss.curvature(predictions[1], 1.0, [&](std::size_t i, double value) { + curvatures[1][i] = value; + }); + }; + + TDoubleVecVec previousGradient(2, TDoubleVec(2)); + TDoubleVecVec previousCurvature(2, TDoubleVec(3)); + readDerivatives(1.0, previousGradient, previousCurvature); + + for (double scale : {0.75, 0.5, 0.25, 0.0, -0.25, -0.5, -0.75, -1.0}) { + TDoubleVecVec currentGradient(2, TDoubleVec(2)); + TDoubleVecVec currentCurvature(2, TDoubleVec(3)); + readDerivatives(scale, currentGradient, currentCurvature); + BOOST_REQUIRE_CLOSE_ABSOLUTE( + std::exp(0.25), previousGradient[0][1] / currentGradient[0][1], 0.01); + BOOST_REQUIRE_CLOSE_ABSOLUTE( + std::exp(-0.25), previousGradient[1][1] / currentGradient[1][1], 0.01); + BOOST_REQUIRE_CLOSE_ABSOLUTE( + std::exp(0.25), previousCurvature[0][2] / currentCurvature[0][1], 0.01); + BOOST_REQUIRE_CLOSE_ABSOLUTE( + std::exp(-0.25), previousCurvature[1][2] / currentCurvature[1][1], 0.01); + previousGradient = currentGradient; + previousCurvature = currentCurvature; + } + }*/ +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/lib/maths/unittest/CBoostedTreeTest.cc b/lib/maths/unittest/CBoostedTreeTest.cc index 2b10bf0cb7..84d7e88832 100644 --- a/lib/maths/unittest/CBoostedTreeTest.cc +++ b/lib/maths/unittest/CBoostedTreeTest.cc @@ -13,7 +13,6 @@ #include #include #include -#include #include #include @@ -23,11 +22,13 @@ #include #include +#include #include #include #include #include #include +#include BOOST_AUTO_TEST_SUITE(CBoostedTreeTest) @@ -43,7 +44,6 @@ using TRowRef = core::CDataFrame::TRowRef; using TRowItr = core::CDataFrame::TRowItr; using TMeanAccumulator = maths::CBasicStatistics::SSampleMean::TAccumulator; using TMeanVarAccumulator = maths::CBasicStatistics::SSampleMeanVar::TAccumulator; -using TMemoryMappedFloatVector = maths::boosted_tree::CLoss::TMemoryMappedFloatVector; namespace { @@ -885,259 +885,6 @@ BOOST_AUTO_TEST_CASE(testDepthBasedRegularization) { } } -BOOST_AUTO_TEST_CASE(testLogisticMinimizerEdgeCases) { - - using 
maths::boosted_tree_detail::CArgMinLogisticImpl; - - // All predictions equal and zero. - { - CArgMinLogisticImpl argmin{0.0}; - maths::CFloatStorage storage[]{0.0}; - TMemoryMappedFloatVector prediction{storage, 1}; - argmin.add(prediction, 0.0); - argmin.add(prediction, 1.0); - argmin.add(prediction, 1.0); - argmin.add(prediction, 0.0); - argmin.nextPass(); - BOOST_REQUIRE_EQUAL(0.0, argmin.value()[0]); - } - - // All predictions are equal. - { - test::CRandomNumbers rng; - - TDoubleVec labels; - TDoubleVec weights; - rng.generateUniformSamples(0.0, 1.0, 1000, labels); - for (auto& label : labels) { - label = std::floor(label + 0.3); - } - weights.resize(labels.size(), 0.0); - - CArgMinLogisticImpl argmin{0.0}; - std::size_t numberPasses{0}; - std::size_t counts[2]{0, 0}; - - do { - ++numberPasses; - for (std::size_t i = 0; i < labels.size(); ++i) { - maths::CFloatStorage storage[]{weights[i]}; - TMemoryMappedFloatVector prediction{storage, 1}; - argmin.add(prediction, labels[i]); - ++counts[static_cast(labels[i])]; - } - } while (argmin.nextPass()); - - double p{static_cast(counts[1]) / 1000.0}; - double expected{std::log(p / (1.0 - p))}; - double actual{argmin.value()[0]}; - - BOOST_REQUIRE_EQUAL(std::size_t{1}, numberPasses); - BOOST_REQUIRE_CLOSE_ABSOLUTE(expected, actual, 0.01 * std::fabs(expected)); - } - - // Test underflow of probabilities. - { - CArgMinLogisticImpl argmin{0.0}; - - TDoubleVec predictions{-500.0, -30.0, -15.0, -400.0}; - TDoubleVec actuals{1.0, 1.0, 0.0, 1.0}; - do { - for (std::size_t i = 0; i < predictions.size(); ++i) { - maths::CFloatStorage storage[]{predictions[i]}; - TMemoryMappedFloatVector prediction{storage, 1}; - argmin.add(prediction, actuals[i]); - } - } while (argmin.nextPass()); - - double minimizer{argmin.value()[0]}; - - // Check we're at the minimum. - maths::boosted_tree::CBinomialLogistic loss; - TDoubleVec losses; - for (double eps : {-10.0, 0.0, 10.0}) { - double lossAtEps{0.0}; - for (std::size_t i = 0; i < predictions.size(); ++i) { - maths::CFloatStorage storage[]{predictions[i] + minimizer + eps}; - TMemoryMappedFloatVector probe{storage, 1}; - lossAtEps += loss.value(probe, actuals[i]); - } - losses.push_back(lossAtEps); - } - BOOST_TEST_REQUIRE(losses[0] >= losses[1]); - BOOST_TEST_REQUIRE(losses[2] >= losses[1]); - } -} - -BOOST_AUTO_TEST_CASE(testLogisticMinimizerRandom) { - - // Test that we a good approximation of the additive term for the log-odds - // which minimises the cross entropy objective. - - using maths::boosted_tree_detail::CArgMinLogisticImpl; - - test::CRandomNumbers rng; - - TDoubleVec labels; - TDoubleVec weights; - - for (auto lambda : {0.0, 10.0}) { - - LOG_DEBUG(<< "lambda = " << lambda); - - // The true objective. - auto objective = [&](double weight) { - double loss{0.0}; - for (std::size_t i = 0; i < labels.size(); ++i) { - double p{maths::CTools::logisticFunction(weights[i] + weight)}; - loss -= (1.0 - labels[i]) * maths::CTools::fastLog(1.0 - p) + - labels[i] * maths::CTools::fastLog(p); - } - return loss + lambda * maths::CTools::pow2(weight); - }; - - // This loop is fuzzing the predicted log-odds and testing we get consistently - // good estimates of the true minimizer. 
- for (std::size_t t = 0; t < 10; ++t) { - - double min{std::numeric_limits::max()}; - double max{-min}; - - rng.generateUniformSamples(0.0, 1.0, 1000, labels); - for (auto& label : labels) { - label = std::floor(label + 0.5); - } - weights.clear(); - for (const auto& label : labels) { - TDoubleVec weight; - rng.generateNormalSamples(label, 2.0, 1, weight); - weights.push_back(weight[0]); - min = std::min(min, weight[0]); - max = std::max(max, weight[0]); - } - - double expected; - double objectiveAtExpected; - std::size_t maxIterations{20}; - maths::CSolvers::minimize(-max, -min, objective(-max), objective(-min), - objective, 1e-3, maxIterations, expected, - objectiveAtExpected); - LOG_DEBUG(<< "expected = " << expected - << " objective at expected = " << objectiveAtExpected); - - CArgMinLogisticImpl argmin{lambda}; - CArgMinLogisticImpl argminPartition[2]{{lambda}, {lambda}}; - auto nextPass = [&] { - bool done{argmin.nextPass() == false}; - done &= (argminPartition[0].nextPass() == false); - done &= (argminPartition[1].nextPass() == false); - return done == false; - }; - - do { - for (std::size_t i = 0; i < labels.size() / 2; ++i) { - maths::CFloatStorage storage[]{weights[i]}; - TMemoryMappedFloatVector prediction{storage, 1}; - argmin.add(prediction, labels[i]); - argminPartition[0].add(prediction, labels[i]); - } - for (std::size_t i = labels.size() / 2; i < labels.size(); ++i) { - maths::CFloatStorage storage[]{weights[i]}; - TMemoryMappedFloatVector prediction{storage, 1}; - argmin.add(prediction, labels[i]); - argminPartition[1].add(prediction, labels[i]); - } - argminPartition[0].merge(argminPartition[1]); - argminPartition[1] = argminPartition[0]; - } while (nextPass()); - - double actual{argmin.value()(0)}; - double actualPartition{argminPartition[0].value()(0)}; - LOG_DEBUG(<< "actual = " << actual - << " objective at actual = " << objective(actual)); - - // We should be within 1% for the value and 0.001% for the objective - // at the value. - BOOST_REQUIRE_EQUAL(actual, actualPartition); - BOOST_REQUIRE_CLOSE_ABSOLUTE(expected, actual, 0.01 * std::fabs(expected)); - BOOST_REQUIRE_CLOSE_ABSOLUTE(objectiveAtExpected, objective(actual), - 1e-5 * objectiveAtExpected); - } - } -} - -BOOST_AUTO_TEST_CASE(testLogisticLossForUnderflow) { - - // Test the behaviour of value, gradient and curvature of the logistic loss in - // the vicinity the point at which we switch to using Taylor expansion of the - // logistic function is as expected. - - double eps{100.0 * std::numeric_limits::epsilon()}; - - maths::boosted_tree::CBinomialLogistic loss; - - // Losses should be very nearly linear function of log-odds when they're large. - { - maths::CFloatStorage predictions[]{1.0 - std::log(eps), 1.0 + std::log(eps)}; - TMemoryMappedFloatVector prediction0{&predictions[0], 1}; - TMemoryMappedFloatVector prediction1{&predictions[1], 1}; - TDoubleVec lastLoss{loss.value(prediction0, 0.0), loss.value(prediction1, 1.0)}; - for (double scale : {0.75, 0.5, 0.25, 0.0, -0.25, -0.5, -0.75, -1.0}) { - predictions[0] = scale - std::log(eps); - predictions[1] = scale + std::log(eps); - TDoubleVec currentLoss{loss.value(prediction0, 0.0), - loss.value(prediction1, 1.0)}; - BOOST_REQUIRE_CLOSE_ABSOLUTE(0.25, lastLoss[0] - currentLoss[0], 0.005); - BOOST_REQUIRE_CLOSE_ABSOLUTE(-0.25, lastLoss[1] - currentLoss[1], 0.005); - lastLoss = currentLoss; - } - } - - // The gradient and curvature should be proportional to the exponential of the - // log-odds when they're small. 
- { - auto readDerivatives = [&](double prediction, TDoubleVec& gradients, - TDoubleVec& curvatures) { - maths::CFloatStorage predictions[]{prediction + std::log(eps), - prediction - std::log(eps)}; - TMemoryMappedFloatVector prediction0{&predictions[0], 1}; - TMemoryMappedFloatVector prediction1{&predictions[1], 1}; - loss.gradient(prediction0, 0.0, [&](std::size_t, double value) { - gradients[0] = value; - }); - loss.gradient(prediction1, 1.0, [&](std::size_t, double value) { - gradients[1] = value; - }); - loss.curvature(prediction0, 0.0, [&](std::size_t, double value) { - curvatures[0] = value; - }); - loss.curvature(prediction1, 1.0, [&](std::size_t, double value) { - curvatures[1] = value; - }); - }; - - TDoubleVec lastGradient(2); - TDoubleVec lastCurvature(2); - readDerivatives(1.0, lastGradient, lastCurvature); - - for (double scale : {0.75, 0.5, 0.25, 0.0, -0.25, -0.5, -0.75, -1.0}) { - TDoubleVec currentGradient(2); - TDoubleVec currentCurvature(2); - readDerivatives(scale, currentGradient, currentCurvature); - BOOST_REQUIRE_CLOSE_ABSOLUTE(std::exp(0.25), - lastGradient[0] / currentGradient[0], 0.01); - BOOST_REQUIRE_CLOSE_ABSOLUTE(std::exp(-0.25), - lastGradient[1] / currentGradient[1], 0.01); - BOOST_REQUIRE_CLOSE_ABSOLUTE( - std::exp(0.25), lastCurvature[0] / currentCurvature[0], 0.01); - BOOST_REQUIRE_CLOSE_ABSOLUTE( - std::exp(-0.25), lastCurvature[1] / currentCurvature[1], 0.01); - lastGradient = currentGradient; - lastCurvature = currentCurvature; - } - } -} - BOOST_AUTO_TEST_CASE(testLogisticRegression) { // The idea of this test is to create a random linear relationship between @@ -1191,9 +938,10 @@ BOOST_AUTO_TEST_CASE(testLogisticRegression) { fillDataFrame(trainRows, rows - trainRows, cols, {false, false, false, true}, x, TDoubleVec(rows, 0.0), target, *frame); - auto regression = maths::CBoostedTreeFactory::constructFromParameters( - 1, std::make_unique()) - .buildFor(*frame, cols - 1); + auto regression = + maths::CBoostedTreeFactory::constructFromParameters( + 1, std::make_unique()) + .buildFor(*frame, cols - 1); regression->train(); regression->predict(); @@ -1263,7 +1011,7 @@ BOOST_AUTO_TEST_CASE(testImbalancedClasses) { frame->finishWritingRows(); auto regression = maths::CBoostedTreeFactory::constructFromParameters( - 1, std::make_unique()) + 1, std::make_unique()) .buildFor(*frame, cols - 1); regression->train(); diff --git a/lib/maths/unittest/CDataFrameUtilsTest.cc b/lib/maths/unittest/CDataFrameUtilsTest.cc index 2394334220..cb89e921bb 100644 --- a/lib/maths/unittest/CDataFrameUtilsTest.cc +++ b/lib/maths/unittest/CDataFrameUtilsTest.cc @@ -975,7 +975,7 @@ BOOST_AUTO_TEST_CASE(testMeanValueOfTargetForCategoriesWithMissing) { } } rng.generateUniformSamples(0.0, 1.0, 1, u01); - if (u01[i] < 0.9) { + if (u01[0] < 0.9) { for (std::size_t j = 0; j + 1 < cols; ++j) { if (maths::CDataFrameUtils::isMissing(values[j][i]) == false) { values[cols - 1][i] += values[j][i]; diff --git a/lib/maths/unittest/CKMeansOnlineTest.cc b/lib/maths/unittest/CKMeansOnlineTest.cc index 56d7179235..0fd144e31e 100644 --- a/lib/maths/unittest/CKMeansOnlineTest.cc +++ b/lib/maths/unittest/CKMeansOnlineTest.cc @@ -4,6 +4,7 @@ * you may not use this file except in compliance with the Elastic License. 
*/ +#include "core/CContainerPrinter.h" #include #include #include @@ -13,6 +14,7 @@ #include #include #include +#include #include #include @@ -32,6 +34,7 @@ using TMeanVarAccumulator = maths::CBasicStatistics::SSampleMeanVar::TAc using TVector2 = maths::CVectorNx1; using TVector2Vec = std::vector; using TVector2VecVec = std::vector; +using TFloatVector2 = maths::CVectorNx1; using TMean2Accumulator = maths::CBasicStatistics::SSampleMean::TAccumulator; using TMeanVar2Accumulator = maths::CBasicStatistics::SSampleMeanVar::TAccumulator; using TVector5 = maths::CVectorNx1; @@ -39,22 +42,43 @@ using TVector5Vec = std::vector; using TMeanVar5Accumulator = maths::CBasicStatistics::SSampleMeanVar::TAccumulator; template -class CKMeansOnlineTestForTest : public maths::CKMeansOnline { +class CKMeansOnlineForTest : public maths::CKMeansOnline { public: using TSphericalClusterVec = typename maths::CKMeansOnline::TSphericalClusterVec; - using TDoubleMeanVarAccumulator = typename maths::CKMeansOnline::TDoubleMeanVarAccumulator; - using TFloatMeanAccumulatorDoublePr = - typename maths::CKMeansOnline::TFloatMeanAccumulatorDoublePr; + using TFloatCoordinate = typename maths::CKMeansOnline::TFloatCoordinate; + using TFloatPoint = typename maths::CKMeansOnline::TFloatPoint; + using TFloatPointMeanAccumulatorDoublePr = + typename maths::CKMeansOnline::TFloatPointMeanAccumulatorDoublePr; + using TFloatPointMeanAccumulatorDoublePrVec = + typename maths::CKMeansOnline::TFloatPointMeanAccumulatorDoublePrVec; + using TDoublePointMeanVarAccumulator = + typename maths::CKMeansOnline::TDoublePointMeanVarAccumulator; + using TDoublePoint = typename maths::CKMeansOnline::TDoublePoint; public: - CKMeansOnlineTestForTest(std::size_t k, double decayRate = 0.0) + CKMeansOnlineForTest(std::size_t k, double decayRate = 0.0) : maths::CKMeansOnline(k, decayRate) {} - static void add(const POINT& x, double count, TFloatMeanAccumulatorDoublePr& cluster) { - maths::CKMeansOnline::add(x, count, cluster); + static void deduplicate(TFloatPointMeanAccumulatorDoublePrVec& clusters) { + maths::CKMeansOnline::deduplicate(clusters); } - static double variance(const TDoubleMeanVarAccumulator& moments) { + static void add(const POINT& mx, double count, TFloatPointMeanAccumulatorDoublePr& cluster) { + double nx{count}; + TDoublePoint vx{maths::las::zero(mx)}; + double nc{maths::CBasicStatistics::count(cluster.first)}; + TDoublePoint mc{maths::CBasicStatistics::mean(cluster.first)}; + TDoublePoint vc{cluster.second * maths::las::ones(mx)}; + TDoublePointMeanVarAccumulator moments{ + maths::CBasicStatistics::momentsAccumulator(nc, mc, vc) + + maths::CBasicStatistics::momentsAccumulator(nx, mx, vx)}; + TFloatCoordinate ncx{maths::CBasicStatistics::count(moments)}; + TFloatPoint mcx{maths::CBasicStatistics::mean(moments)}; + cluster.first = maths::CBasicStatistics::momentsAccumulator(ncx, mcx); + cluster.second = variance(moments); + } + + static double variance(const TDoublePointMeanVarAccumulator& moments) { return maths::CKMeansOnline::variance(moments); } }; @@ -93,13 +117,13 @@ BOOST_AUTO_TEST_CASE(testVariance) { expected.add(coordinates[i] - maths::CBasicStatistics::mean(actual)(i % 5)); } - LOG_DEBUG(<< "actual = " << CKMeansOnlineTestForTest::variance(actual)); + LOG_DEBUG(<< "actual = " << CKMeansOnlineForTest::variance(actual)); LOG_DEBUG(<< "expected = " << maths::CBasicStatistics::maximumLikelihoodVariance(expected)); BOOST_REQUIRE_CLOSE_ABSOLUTE( maths::CBasicStatistics::maximumLikelihoodVariance(expected), - 
CKMeansOnlineTestForTest::variance(actual), + CKMeansOnlineForTest::variance(actual), 1e-10 * maths::CBasicStatistics::maximumLikelihoodVariance(expected)); } } @@ -128,7 +152,7 @@ BOOST_AUTO_TEST_CASE(testAdd) { TMean2AccumulatorDoublePr actual; TMeanVar2Accumulator expected; for (std::size_t i = 0u; i < points.size(); ++i) { - CKMeansOnlineTestForTest::add(points[i], counts[i], actual); + CKMeansOnlineForTest::add(points[i], counts[i], actual); expected.add(points[i], counts[i]); } @@ -152,6 +176,93 @@ BOOST_AUTO_TEST_CASE(testAdd) { } } +BOOST_AUTO_TEST_CASE(testDeduplicate) { + // Test we behaviour: + // - If all points are duplicates + // - If no points are duplicates + // - For random permutation of duplicates + + CKMeansOnlineForTest::TFloatPointMeanAccumulatorDoublePrVec points; + + points.emplace_back(maths::CBasicStatistics::momentsAccumulator( + maths::CFloatStorage{1.0}, TFloatVector2{0.0}), + 0.0); + points.emplace_back(maths::CBasicStatistics::momentsAccumulator( + maths::CFloatStorage{2.0}, TFloatVector2{0.0}), + 0.0); + points.emplace_back(maths::CBasicStatistics::momentsAccumulator( + maths::CFloatStorage{1.0}, TFloatVector2{0.0}), + 0.0); + CKMeansOnlineForTest::deduplicate(points); + BOOST_REQUIRE_EQUAL(1, points.size()); + BOOST_REQUIRE_EQUAL(TFloatVector2{0.0}, + maths::CBasicStatistics::mean(points[0].first)); + BOOST_REQUIRE_EQUAL(4.0, maths::CBasicStatistics::count(points[0].first)); + BOOST_REQUIRE_EQUAL(0.0, points[0].second); + points.clear(); + + points.emplace_back(maths::CBasicStatistics::momentsAccumulator( + maths::CFloatStorage{1.0}, TFloatVector2{0.0}), + 0.0); + points.emplace_back(maths::CBasicStatistics::momentsAccumulator( + maths::CFloatStorage{2.0}, TFloatVector2{1.0}), + 0.0); + points.emplace_back(maths::CBasicStatistics::momentsAccumulator( + maths::CFloatStorage{1.0}, TFloatVector2{2.0}), + 0.0); + CKMeansOnlineForTest::deduplicate(points); + BOOST_REQUIRE_EQUAL(3, points.size()); + BOOST_REQUIRE_EQUAL(TFloatVector2{0.0}, + maths::CBasicStatistics::mean(points[0].first)); + BOOST_REQUIRE_EQUAL(1.0, maths::CBasicStatistics::count(points[0].first)); + BOOST_REQUIRE_EQUAL(0.0, points[0].second); + BOOST_REQUIRE_EQUAL(TFloatVector2{1.0}, + maths::CBasicStatistics::mean(points[1].first)); + BOOST_REQUIRE_EQUAL(2.0, maths::CBasicStatistics::count(points[1].first)); + BOOST_REQUIRE_EQUAL(0.0, points[1].second); + BOOST_REQUIRE_EQUAL(TFloatVector2{2.0}, + maths::CBasicStatistics::mean(points[2].first)); + BOOST_REQUIRE_EQUAL(1.0, maths::CBasicStatistics::count(points[2].first)); + BOOST_REQUIRE_EQUAL(0.0, points[2].second); + points.clear(); + + test::CRandomNumbers rng; + for (std::size_t t = 1; t <= 100; ++t) { + for (std::size_t i = 0; i < 5; ++i) { + points.emplace_back(maths::CBasicStatistics::momentsAccumulator( + maths::CFloatStorage{static_cast(i)}, + TFloatVector2{0.0}), + 0.0); + } + for (std::size_t i = 0; i < 7; ++i) { + points.emplace_back(maths::CBasicStatistics::momentsAccumulator( + maths::CFloatStorage{1.0}, TFloatVector2{1.0}), + 0.0); + } + for (std::size_t i = 0; i < 3; ++i) { + points.emplace_back(maths::CBasicStatistics::momentsAccumulator( + maths::CFloatStorage{2.0}, TFloatVector2{2.0}), + 0.0); + } + rng.random_shuffle(points.begin(), points.end()); + + CKMeansOnlineForTest::deduplicate(points); + + BOOST_REQUIRE_EQUAL(3, points.size()); + BOOST_REQUIRE_EQUAL(TFloatVector2{0.0}, + maths::CBasicStatistics::mean(points[0].first)); + BOOST_REQUIRE_EQUAL(10.0, maths::CBasicStatistics::count(points[0].first)); + 
BOOST_REQUIRE_EQUAL(TFloatVector2{1.0}, + maths::CBasicStatistics::mean(points[1].first)); + BOOST_REQUIRE_EQUAL(7.0, maths::CBasicStatistics::count(points[1].first)); + BOOST_REQUIRE_EQUAL(TFloatVector2{2.0}, + maths::CBasicStatistics::mean(points[2].first)); + BOOST_REQUIRE_EQUAL(6.0, maths::CBasicStatistics::count(points[2].first)); + + points.clear(); + } +} + BOOST_AUTO_TEST_CASE(testReduce) { // Test some invariants: // - Number of clusters should be no more than k after @@ -186,8 +297,8 @@ BOOST_AUTO_TEST_CASE(testReduce) { kmeans.add(points[i], counts[i]); expected.add(points[i], counts[i]); - if ((i + 1) % 7 == 0) { - CKMeansOnlineTestForTest::TSphericalClusterVec clusters; + if (((i - 10) + 1) % 7 == 0) { + CKMeansOnlineForTest::TSphericalClusterVec clusters; kmeans.clusters(clusters); BOOST_TEST_REQUIRE(clusters.size() <= 10); @@ -362,7 +473,7 @@ BOOST_AUTO_TEST_CASE(testSplit) { TVector2Vec points; for (std::size_t i = 0u; i < 2; ++i) { TDoubleVec coordinates; - rng.generateNormalSamples(m[i], v[i], 350, coordinates); + rng.generateNormalSamples(m[i], v[i], 352, coordinates); for (std::size_t j = 0u; j < coordinates.size(); j += 2) { double c[]{coordinates[j + 0], coordinates[j + 1]}; points.push_back(TVector2(c)); @@ -370,10 +481,18 @@ BOOST_AUTO_TEST_CASE(testSplit) { } maths::CKMeansOnline kmeansOnline(30); - for (std::size_t i = 0u; i < points.size(); ++i) { + const std::size_t BUFFERING{0}; + const std::size_t NOT_BUFFERING{1}; + std::size_t counts[2]{0, 0}; + for (std::size_t i = 0; i < 30; ++i) { + kmeansOnline.add(points[i]); + } + for (std::size_t i = 30; i < points.size(); ++i) { kmeansOnline.add(points[i]); + ++counts[kmeansOnline.buffering() ? BUFFERING : NOT_BUFFERING]; } - BOOST_TEST_REQUIRE(!kmeansOnline.buffering()); + BOOST_REQUIRE_EQUAL(counts[BUFFERING], maths::CKMeansOnline::BUFFER_SIZE * + counts[NOT_BUFFERING]); std::size_t one[]{0, 2, 7, 18, 19, 22}; std::size_t two[]{3, 4, 5, 6, 10, 11, 23, 24}; @@ -443,7 +562,7 @@ BOOST_AUTO_TEST_CASE(testMerge) { TMeanVar2Accumulator expected; for (std::size_t i = 0u; i < 2; ++i) { - CKMeansOnlineTestForTest::TSphericalClusterVec clusters; + CKMeansOnlineForTest::TSphericalClusterVec clusters; kmeans[i].clusters(clusters); for (std::size_t j = 0u; j < clusters.size(); ++j) { expected.add(clusters[j]); @@ -453,7 +572,7 @@ BOOST_AUTO_TEST_CASE(testMerge) { kmeans[0].merge(kmeans[1]); TMeanVar2Accumulator actual; - CKMeansOnlineTestForTest::TSphericalClusterVec clusters; + CKMeansOnlineForTest::TSphericalClusterVec clusters; kmeans[0].clusters(clusters); for (std::size_t j = 0u; j < clusters.size(); ++j) { actual.add(clusters[j]); @@ -498,7 +617,7 @@ BOOST_AUTO_TEST_CASE(testPropagateForwardsByTime) { kmeans.add(points[i]); } - CKMeansOnlineTestForTest::TSphericalClusterVec clusters; + CKMeansOnlineForTest::TSphericalClusterVec clusters; kmeans.clusters(clusters); LOG_DEBUG(<< "clusters before = " << core::CContainerPrinter::print(clusters)); diff --git a/lib/maths/unittest/CMultivariateMultimodalPriorTest.cc b/lib/maths/unittest/CMultivariateMultimodalPriorTest.cc index 66aa7c8415..92f405cc9e 100644 --- a/lib/maths/unittest/CMultivariateMultimodalPriorTest.cc +++ b/lib/maths/unittest/CMultivariateMultimodalPriorTest.cc @@ -511,7 +511,7 @@ BOOST_AUTO_TEST_CASE(testSplitAndMerge) { LOG_DEBUG(<< "mean meanError = " << maths::CBasicStatistics::mean(meanMeanError)); LOG_DEBUG(<< "mean covError = " << maths::CBasicStatistics::mean(meanCovError)); BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(meanMeanError) < 0.013); - 
BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(meanCovError) < 0.030); + BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(meanCovError) < 0.044); } } diff --git a/lib/maths/unittest/CXMeansOnlineTest.cc b/lib/maths/unittest/CXMeansOnlineTest.cc index e879883a75..bd7da22a5d 100644 --- a/lib/maths/unittest/CXMeansOnlineTest.cc +++ b/lib/maths/unittest/CXMeansOnlineTest.cc @@ -312,8 +312,8 @@ BOOST_AUTO_TEST_CASE(testClusteringVanilla) { } LOG_DEBUG(<< "mean error = " << meanError[0]); LOG_DEBUG(<< "covariance error = " << covError[0]); - BOOST_TEST_REQUIRE(meanError[0] < 0.034); - BOOST_TEST_REQUIRE(covError[0] < 0.39); + BOOST_TEST_REQUIRE(meanError[0] < 0.045); + BOOST_TEST_REQUIRE(covError[0] < 0.36); meanMeanError.add(meanError[0]); meanCovError.add(covError[0]); } @@ -653,6 +653,9 @@ BOOST_AUTO_TEST_CASE(testLargeHistory) { } BOOST_AUTO_TEST_CASE(testLatLongData) { + // Check that the log likelihood of the data in the lat_long.csv + // is significantly increased by clustering. + using TTimeDoubleVecPr = std::pair; using TTimeDoubleVecPrVec = std::vector; using TMeanAccumulator = maths::CBasicStatistics::SSampleMean::TAccumulator; @@ -714,8 +717,8 @@ BOOST_AUTO_TEST_CASE(testLatLongData) { LOG_DEBUG(<< "gaussian log(L) = " << maths::CBasicStatistics::mean(LLR)); LOG_DEBUG(<< "clustered log(L) = " << maths::CBasicStatistics::mean(LLC)); - BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(LLC) < - 0.6 * maths::CBasicStatistics::mean(LLR)); + BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(LLC) > + 0.4 * maths::CBasicStatistics::mean(LLR)); } BOOST_AUTO_TEST_CASE(testPersist) { diff --git a/lib/maths/unittest/Makefile b/lib/maths/unittest/Makefile index a6dfab62a6..e985f1fc6c 100644 --- a/lib/maths/unittest/Makefile +++ b/lib/maths/unittest/Makefile @@ -25,6 +25,7 @@ SRCS=\ CBayesianOptimisationTest.cc \ CBjkstUniqueValuesTest.cc \ CBoostedTreeLeafNodeStatisticsTest.cc \ + CBoostedTreeLossTest.cc \ CBoostedTreeTest.cc \ CBootstrapClustererTest.cc \ CBoundingBoxTest.cc \