diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index daff481f91..0b331d7ee0 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -41,6 +41,7 @@ * Add instrumentation to report statistics related to data frame analytics jobs, i.e. progress, memory usage, etc. (See {ml-pull}906[#906].) +* Multiclass classification. (See {ml-pull}1037[#1037].) === Enhancements diff --git a/include/maths/CBoostedTreeLoss.h b/include/maths/CBoostedTreeLoss.h index 20637d5b0e..788d867f21 100644 --- a/include/maths/CBoostedTreeLoss.h +++ b/include/maths/CBoostedTreeLoss.h @@ -8,8 +8,10 @@ #define INCLUDED_ml_maths_CBoostedTreeLoss_h #include +#include #include #include +#include #include #include @@ -66,9 +68,26 @@ class MATHS_EXPORT CArgMinMseImpl final : public CArgMinLossImpl { //! \brief Finds the value to add to a set of predicted log-odds which minimises //! regularised cross entropy loss w.r.t. the actual categories. -class MATHS_EXPORT CArgMinLogisticImpl final : public CArgMinLossImpl { +//! +//! DESCRIPTION:\n +//! We want to find the weight which minimizes the log-loss, i.e. which satisfies +//!
+//!   \f$\displaystyle arg\min_w{ \lambda w^2 -\sum_i{ a_i \log(S(p_i + w)) + (1 - a_i) \log(1 - S(p_i + w)) } }\f$
+//! 
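For reference, here is a minimal standalone sketch of evaluating this objective for a candidate weight. It is an illustration only: it assumes plain std::vector<double> inputs and a hand-rolled logistic in place of the project's CTools::logisticFunction, and the helper name binomialObjective is hypothetical.

#include <cmath>
#include <cstddef>
#include <vector>

// Regularised binomial log-loss as a function of the candidate leaf weight w.
// predictions holds the current log-odds p_i and actuals the labels a_i in {0, 1}.
double binomialObjective(double w, double lambda,
                         const std::vector<double>& predictions,
                         const std::vector<double>& actuals) {
    auto logistic = [](double x) { return 1.0 / (1.0 + std::exp(-x)); };
    double loss{lambda * w * w};
    for (std::size_t i = 0; i < predictions.size(); ++i) {
        double s{logistic(predictions[i] + w)};
        loss -= actuals[i] * std::log(s) + (1.0 - actuals[i]) * std::log(1.0 - s);
    }
    return loss;
}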
+//! +//! Rather than working with this function directly we bucket the predictions `p_i` +//! in a first pass over the data and compute the weight which minimizes the approximate +//! function +//!
+//! \f$\displaystyle arg\min_w{ \lambda w^2 -\sum_{B}{ c_{1,B} \log(S(\bar{p}_B + w)) + c_{0,B} \log(1 - S(\bar{p}_B + w)) } }\f$
+//! 
+//! +//! Here, \f$B\f$ ranges over the buckets, \f$\bar{p}_B\f$ denotes the B'th bucket +//! centre and \f$c_{0,B}\f$ and \f$c_{1,B}\f$ denote the counts of actual classes +//! 0 and 1, respectively, in the bucket \f$B\f$. +class MATHS_EXPORT CArgMinBinomialLogisticLossImpl final : public CArgMinLossImpl { public: - CArgMinLogisticImpl(double lambda); + CArgMinBinomialLogisticLossImpl(double lambda); std::unique_ptr clone() const override; bool nextPass() override; void add(const TMemoryMappedFloatVector& prediction, double actual, double weight = 1.0) override; @@ -80,11 +99,13 @@ class MATHS_EXPORT CArgMinLogisticImpl final : public CArgMinLossImpl { using TDoubleVector2x1 = CVectorNx1; using TDoubleVector2x1Vec = std::vector; +private: + static constexpr std::size_t NUMBER_BUCKETS = 128; + private: std::size_t bucket(double prediction) const { double bucket{(prediction - m_PredictionMinMax.min()) / this->bucketWidth()}; - return std::min(static_cast(bucket), - m_BucketCategoryCounts.size() - 1); + return std::min(static_cast(bucket), m_BucketsClassCounts.size() - 1); } double bucketCentre(std::size_t bucket) const { @@ -95,15 +116,74 @@ class MATHS_EXPORT CArgMinLogisticImpl final : public CArgMinLossImpl { double bucketWidth() const { return m_PredictionMinMax.initialized() ? m_PredictionMinMax.range() / - static_cast(m_BucketCategoryCounts.size()) + static_cast(m_BucketsClassCounts.size()) : 0.0; } private: std::size_t m_CurrentPass = 0; TMinMaxAccumulator m_PredictionMinMax; - TDoubleVector2x1 m_CategoryCounts; - TDoubleVector2x1Vec m_BucketCategoryCounts; + TDoubleVector2x1 m_ClassCounts; + TDoubleVector2x1Vec m_BucketsClassCounts; +}; + +//! \brief Finds the value to add to a set of predicted multinomial logit which +//! minimises regularised cross entropy loss w.r.t. the actual classes. +//! +//! DESCRIPTION:\n +//! We want to find the weight which minimizes the log-loss, i.e. which satisfies +//!
+//!   \f$\displaystyle arg\min_w{ \lambda \|w\|^2 -\sum_i{ \log([softmax(p_i + w)]_{a_i}) } }\f$
+//! 
+//! +//! Here, \f$a_i\f$ is the index of the i'th example's true class. Rather than +//! working with this function directly we approximate it by the means and counts +//! of predictions in a partition of the original data, i.e. we compute the weight +//! which satisfies +//!
+//! \f$\displaystyle arg\min_w{ \lambda \|w\|^2 -\sum_{P}{ \sum_{a_i}{ c_{a_i, P} \log([softmax(\bar{p}_P + w)]_{a_i}) } } }\f$
+//! 
+//! +//! Here, \f$P\f$ ranges over the subsets of the partition, \f$\bar{p}_P\f$ denotes +//! the mean of the predictions in the P'th subset and \f$c_{a_i, P}\f$ denote the +//! counts of each classes \f$\{a_i\}\f$ in the subset \f$P\f$. We compute this +//! partition by k-means. +class MATHS_EXPORT CArgMinMultinomialLogisticLossImpl final : public CArgMinLossImpl { +public: + using TObjective = std::function; + using TObjectiveGradient = std::function; + +public: + CArgMinMultinomialLogisticLossImpl(std::size_t numberClasses, + double lambda, + const CPRNG::CXorOShiro128Plus& rng); + std::unique_ptr clone() const override; + bool nextPass() override; + void add(const TMemoryMappedFloatVector& prediction, double actual, double weight = 1.0) override; + void merge(const CArgMinLossImpl& other) override; + TDoubleVector value() const override; + + // Exposed for unit testing. + TObjective objective() const; + TObjectiveGradient objectiveGradient() const; + +private: + using TDoubleVectorVec = std::vector; + using TKMeans = CKMeansOnline; + +private: + static constexpr std::size_t NUMBER_CENTRES = 128; + static constexpr std::size_t NUMBER_RESTARTS = 5; + +private: + std::size_t m_NumberClasses = 0; + std::size_t m_CurrentPass = 0; + mutable CPRNG::CXorOShiro128Plus m_Rng; + TDoubleVector m_ClassCounts; + TDoubleVector m_DoublePrediction; + TKMeans m_PredictionSketch; + TDoubleVectorVec m_Centres; + TDoubleVectorVec m_CentresClassCounts; }; } @@ -185,7 +265,8 @@ class MATHS_EXPORT CLoss { //! Transforms a prediction from the forest to the target space. virtual TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const = 0; //! Get an object which computes the leaf value that minimises loss. - virtual CArgMinLoss minimizer(double lambda) const = 0; + virtual CArgMinLoss minimizer(double lambda, + const CPRNG::CXorOShiro128Plus& rng) const = 0; //! Get the name of the loss function virtual const std::string& name() const = 0; @@ -214,7 +295,7 @@ class MATHS_EXPORT CMse final : public CLoss { double weight = 1.0) const override; bool isCurvatureConstant() const override; TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const override; - CArgMinLoss minimizer(double lambda) const override; + CArgMinLoss minimizer(double lambda, const CPRNG::CXorOShiro128Plus& rng) const override; const std::string& name() const override; }; @@ -227,11 +308,47 @@ class MATHS_EXPORT CMse final : public CLoss { //! //! where \f$a_i\f$ denotes the actual class of the i'th example, \f$p\f$ is the //! prediction and \f$S(\cdot)\f$ denotes the logistic function. -class MATHS_EXPORT CBinomialLogistic final : public CLoss { +class MATHS_EXPORT CBinomialLogisticLoss final : public CLoss { +public: + static const std::string NAME; + +public: + std::unique_ptr clone() const override; + std::size_t numberParameters() const override; + double value(const TMemoryMappedFloatVector& prediction, + double actual, + double weight = 1.0) const override; + void gradient(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight = 1.0) const override; + void curvature(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight = 1.0) const override; + bool isCurvatureConstant() const override; + TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const override; + CArgMinLoss minimizer(double lambda, const CPRNG::CXorOShiro128Plus& rng) const override; + const std::string& name() const override; +}; + +//! 
\brief Implements loss for multinomial logistic regression. +//! +//! DESCRIPTION:\n +//! This targets the cross-entropy loss using the forest to predict the class +//! probabilities via the softmax function: +//!
+//!   \f$\displaystyle l_i(p) = -\sum_j a_{ij} \log([\sigma(p)]_j)\f$
+//! 
+//! where \f$a_i\f$ denotes the actual class of the i'th example, \f$p\f$ denotes +//! the vector valued prediction and \f$\sigma(p)\$ is the softmax function, i.e. +//! \f$[\sigma(p)]_j = \frac{e^{p_i}}{\sum_k e^{p_k}}\f$. +class MATHS_EXPORT CMultinomialLogisticLoss final : public CLoss { public: static const std::string NAME; public: + CMultinomialLogisticLoss(std::size_t numberClasses); std::unique_ptr clone() const override; std::size_t numberParameters() const override; double value(const TMemoryMappedFloatVector& prediction, @@ -247,8 +364,11 @@ class MATHS_EXPORT CBinomialLogistic final : public CLoss { double weight = 1.0) const override; bool isCurvatureConstant() const override; TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const override; - CArgMinLoss minimizer(double lambda) const override; + CArgMinLoss minimizer(double lambda, const CPRNG::CXorOShiro128Plus& rng) const override; const std::string& name() const override; + +private: + std::size_t m_NumberClasses; }; } } diff --git a/include/maths/CKMeans.h b/include/maths/CKMeans.h index aa1c8d3d9e..4ea8914448 100644 --- a/include/maths/CKMeans.h +++ b/include/maths/CKMeans.h @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -125,7 +126,7 @@ class CKMeans { const TPointVec& points() const { return m_Points; } //! Get the cluster checksum. - uint64_t checksum() const { return m_Checksum; } + std::uint64_t checksum() const { return m_Checksum; } private: //! The centroid of the points in this cluster. @@ -133,7 +134,7 @@ class CKMeans { //! The points in the cluster. TPointVec m_Points; //! A checksum for the points in the cluster. - uint64_t m_Checksum; + std::uint64_t m_Checksum; }; using TClusterVec = std::vector; @@ -183,8 +184,9 @@ class CKMeans { if (m_Centres.empty()) { return true; } + TMeanAccumulatorVec newCentres; for (std::size_t i = 0u; i < maxIterations; ++i) { - if (!this->updateCentres()) { + if (!this->updateCentres(newCentres)) { return true; } } @@ -481,19 +483,19 @@ class CKMeans { private: //! Single iteration of Lloyd's algorithm to update \p centres. - bool updateCentres() { - const TCoordinate precision = TCoordinate(5) * - std::numeric_limits::epsilon(); - TMeanAccumulatorVec newCentres(m_Centres.size(), - TMeanAccumulator(las::zero(m_Centres[0]))); + bool updateCentres(TMeanAccumulatorVec& newCentres) { + const TCoordinate precision{TCoordinate(5) * + std::numeric_limits::epsilon()}; + newCentres.assign(m_Centres.size(), TMeanAccumulator(las::zero(m_Centres[0]))); CCentroidComputer computer(m_Centres, newCentres); m_Points.preorderDepthFirst(computer); bool changed = false; + POINT newCentre; for (std::size_t i = 0u; i < newCentres.size(); ++i) { - POINT newCentre(CBasicStatistics::mean(newCentres[i])); + newCentre = CBasicStatistics::mean(newCentres[i]); if (las::distance(m_Centres[i], newCentre) > precision * las::norm(m_Centres[i])) { - m_Centres[i] = newCentre; + las::swap(m_Centres[i], newCentre); changed = true; } } diff --git a/include/maths/CKMeansOnline.h b/include/maths/CKMeansOnline.h index 83dd2ce589..8b314470a5 100644 --- a/include/maths/CKMeansOnline.h +++ b/include/maths/CKMeansOnline.h @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -36,8 +37,7 @@ namespace ml { namespace maths { -//! \brief Computes k-means of a set of points online using \f$O(k)\f$ -//! memory. +//! \brief Computes k-means of a set of points online using \f$O(k)\f$ memory. //! //! DESCRIPTION:\n //! 
This is a sketch data structure of points and their spherical variance @@ -65,39 +65,25 @@ class CKMeansOnline { using TKMeansOnlineVec = std::vector; protected: - //! \brief Checks if a cluster should be deleted based on its count. - class CShouldDelete { - public: - CShouldDelete(double minimumCategoryCount) - : m_MinimumCategoryCount(minimumCategoryCount) {} - - template - bool operator()(const CLUSTER& cluster) const { - return CBasicStatistics::count(cluster.first) < m_MinimumCategoryCount; - } - - private: - double m_MinimumCategoryCount; - }; - using TFloatPoint = typename SFloatingPoint::Type; using TFloatCoordinate = typename SCoordinate::Type; using TFloatPointDoublePr = std::pair; using TFloatPointDoublePrVec = std::vector; - using TFloatMeanAccumulator = typename CBasicStatistics::SSampleMean::TAccumulator; - using TFloatMeanAccumulatorDoublePr = std::pair; - using TFloatMeanAccumulatorDoublePrVec = std::vector; - using TDoubleMeanAccumulator = + using TFloatPointMeanAccumulator = + typename CBasicStatistics::SSampleMean::TAccumulator; + using TFloatPointMeanAccumulatorDoublePr = std::pair; + using TFloatPointMeanAccumulatorDoublePrVec = std::vector; + using TDoublePointMeanAccumulator = typename CBasicStatistics::SSampleMean::TAccumulator; - using TDoubleMeanVarAccumulator = + using TDoublePointMeanVarAccumulator = typename CBasicStatistics::SSampleMeanVar::TAccumulator; -protected: +public: //! The minimum permitted size for the clusterer. static const std::size_t MINIMUM_SPACE; //! The maximum allowed size of the points buffer. - static const std::size_t MAXIMUM_BUFFER_SIZE; + static const std::size_t BUFFER_SIZE; //! The number of times to seed the clustering in reduce. static const std::size_t NUMBER_SEEDS; @@ -106,55 +92,73 @@ class CKMeansOnline { static const std::size_t MAX_ITERATIONS; static const core::TPersistenceTag K_TAG; + static const core::TPersistenceTag BUFFER_SIZE_TAG; + static const core::TPersistenceTag NUMBER_SEEDS_TAG; + static const core::TPersistenceTag MAX_ITERATIONS_TAG; static const core::TPersistenceTag CLUSTERS_TAG; static const core::TPersistenceTag POINTS_TAG; static const core::TPersistenceTag RNG_TAG; public: - //! \param[in] k The maximum space in numbers of clusters. - //! A cluster comprises one float point vector, one count and - //! a double holding the spherical variance. - //! \param[in] decayRate The rate at which we data ages out - //! of the clusterer. - //! \param[in] minimumCategoryCount The minimum permitted count - //! for a cluster. - //! \note This will store as much information about the points - //! subject to this constraint so will generally hold \p k - //! clusters. - CKMeansOnline(std::size_t k, double decayRate = 0.0, double minimumCategoryCount = MINIMUM_CATEGORY_COUNT) - : m_K(std::max(k, MINIMUM_SPACE)), m_DecayRate(decayRate), - m_MinimumCategoryCount(minimumCategoryCount) { - m_Clusters.reserve(m_K + MAXIMUM_BUFFER_SIZE + 1u); - m_PointsBuffer.reserve(MAXIMUM_BUFFER_SIZE); + //! \param[in] k The numbers of clusters to maintain. A cluster comprises + //! one float point vector, one count and a double holding the spherical + //! variance. + //! \param[in] decayRate The rate to age old data out of the clusters. + //! \param[in] minClusterSize The minimum permitted number of points in + //! a cluster. + //! \param[in] bufferSize The number of points to buffer before reclustering. + //! \param[in] numberSeeds The number of seeds to use when reclustering. + //! 
\param[in] maxIterations The maximum number of iterations to use when + //! reclustering. + CKMeansOnline(std::size_t k, + double decayRate = 0.0, + double minClusterSize = MINIMUM_CATEGORY_COUNT, + std::size_t bufferSize = BUFFER_SIZE, + std::size_t numberSeeds = NUMBER_SEEDS, + std::size_t maxIterations = MAX_ITERATIONS) + : m_K{std::max(k, MINIMUM_SPACE)}, m_BufferSize{bufferSize}, m_NumberSeeds{numberSeeds}, + m_MaxIterations{maxIterations}, m_DecayRate{decayRate}, m_MinClusterSize{minClusterSize} { + m_Clusters.reserve(m_K + m_BufferSize + 1); } - //! Construct a new classifier with the specified space limit - //! \p space and categories \p categories. + //! Construct with \p clusters. CKMeansOnline(std::size_t k, double decayRate, - double minimumCategoryCount, - TFloatMeanAccumulatorDoublePrVec& clusters) - : m_K(std::max(k, MINIMUM_SPACE)), m_DecayRate(decayRate), - m_MinimumCategoryCount(minimumCategoryCount) { + double minClusterSize, + TFloatPointMeanAccumulatorDoublePrVec& clusters) + : CKMeansOnline{k, decayRate, minClusterSize} { m_Clusters.swap(clusters); - m_Clusters.reserve(m_K + MAXIMUM_BUFFER_SIZE + 1u); - m_PointsBuffer.reserve(MAXIMUM_BUFFER_SIZE); + m_Clusters.reserve(m_K + m_BufferSize + 1); } //! Create from part of a state document. bool acceptRestoreTraverser(const SDistributionRestoreParams& params, core::CStateRestoreTraverser& traverser) { m_DecayRate = params.s_DecayRate; - m_MinimumCategoryCount = params.s_MinimumCategoryCount; + m_MinClusterSize = params.s_MinimumCategoryCount; + TFloatPointDoublePrVec points; do { - const std::string name = traverser.name(); - RESTORE(RNG_TAG, m_Rng.fromString(traverser.value())); + const std::string& name{traverser.name()}; + RESTORE(RNG_TAG, m_Rng.fromString(traverser.value())) + RESTORE(K_TAG, core::CPersistUtils::restore(K_TAG, m_K, traverser)) + RESTORE(BUFFER_SIZE_TAG, + core::CPersistUtils::restore(BUFFER_SIZE_TAG, m_BufferSize, traverser)) + RESTORE(NUMBER_SEEDS_TAG, + core::CPersistUtils::restore(NUMBER_SEEDS_TAG, m_NumberSeeds, traverser)) + RESTORE(MAX_ITERATIONS_TAG, + core::CPersistUtils::restore(MAX_ITERATIONS_TAG, m_MaxIterations, traverser)) RESTORE(K_TAG, core::CPersistUtils::restore(K_TAG, m_K, traverser)) RESTORE(CLUSTERS_TAG, core::CPersistUtils::restore(CLUSTERS_TAG, m_Clusters, traverser)) - RESTORE(POINTS_TAG, core::CPersistUtils::restore(POINTS_TAG, m_PointsBuffer, traverser)) + RESTORE(POINTS_TAG, core::CPersistUtils::restore(POINTS_TAG, points, traverser)) } while (traverser.next()); + + for (const auto& point : points) { + m_Clusters.emplace_back( + CBasicStatistics::momentsAccumulator(point.second, point.first), 0.0); + } + return true; } @@ -162,33 +166,35 @@ class CKMeansOnline { void acceptPersistInserter(core::CStatePersistInserter& inserter) const { inserter.insertValue(RNG_TAG, m_Rng.toString()); core::CPersistUtils::persist(K_TAG, m_K, inserter); + core::CPersistUtils::persist(BUFFER_SIZE_TAG, m_BufferSize, inserter); + core::CPersistUtils::persist(NUMBER_SEEDS_TAG, m_NumberSeeds, inserter); + core::CPersistUtils::persist(MAX_ITERATIONS_TAG, m_MaxIterations, inserter); core::CPersistUtils::persist(CLUSTERS_TAG, m_Clusters, inserter); - core::CPersistUtils::persist(POINTS_TAG, m_PointsBuffer, inserter); } //! Efficiently swap the contents of this and \p other. 
void swap(CKMeansOnline& other) { std::swap(m_Rng, other.m_Rng); std::swap(m_K, other.m_K); + std::swap(m_BufferSize, other.m_BufferSize); + std::swap(m_NumberSeeds, other.m_NumberSeeds); + std::swap(m_MaxIterations, other.m_MaxIterations); std::swap(m_DecayRate, other.m_DecayRate); - std::swap(m_MinimumCategoryCount, other.m_MinimumCategoryCount); + std::swap(m_MinClusterSize, other.m_MinClusterSize); m_Clusters.swap(other.m_Clusters); - m_PointsBuffer.swap(other.m_PointsBuffer); } //! Get the total number of clusters. - std::size_t size() const { - return std::min(m_Clusters.size() + m_PointsBuffer.size(), m_K); - } + std::size_t size() const { return std::min(m_Clusters.size(), m_K); } //! Get the clusters being maintained. void clusters(TSphericalClusterVec& result) const { result.clear(); result.reserve(m_Clusters.size()); for (std::size_t i = 0u; i < m_Clusters.size(); ++i) { - const TFloatPoint& m = CBasicStatistics::mean(m_Clusters[i].first); - double n = CBasicStatistics::count(m_Clusters[i].first); - double v = m_Clusters[i].second; + const TFloatPoint& m{CBasicStatistics::mean(m_Clusters[i].first)}; + double n{CBasicStatistics::count(m_Clusters[i].first)}; + double v{m_Clusters[i].second}; result.emplace_back(m, SCountAndVariance(n, v)); } } @@ -214,7 +220,7 @@ class CKMeansOnline { TSphericalClusterVec clusters; this->clusters(clusters); - return kmeans(m_Rng, clusters, k, result); + return kmeans(m_Rng, clusters, k, result, m_NumberSeeds, m_MaxIterations); } //! Get our best estimate of the \p k means clustering of @@ -226,8 +232,12 @@ class CKMeansOnline { //! \param[out] result Filled in with the \p k means clustering //! of \p clusters. template - static bool - kmeans(RNG& rng, TSphericalClusterVec& clusters, std::size_t k, TSphericalClusterVecVec& result) { + static bool kmeans(RNG& rng, + TSphericalClusterVec& clusters, + std::size_t k, + TSphericalClusterVecVec& result, + std::size_t numberSeeds = NUMBER_SEEDS, + std::size_t maxIterations = MAX_ITERATIONS) { result.clear(); if (k == 0) { @@ -254,18 +264,23 @@ class CKMeansOnline { CKMeans kmeans; kmeans.setPoints(clusters); - CBasicStatistics::COrderStatisticsStack minCost; + CBasicStatistics::SMin::TAccumulator minCost; TSphericalClusterVec centres; TSphericalClusterVecVec candidates; - for (std::size_t i = 0u; i < NUMBER_SEEDS; ++i) { + for (std::size_t i = 0u; i < numberSeeds; ++i) { CKMeansPlusPlusInitialization seedCentres(rng); seedCentres.run(clusters, k, centres); kmeans.setCentres(centres); - kmeans.run(MAX_ITERATIONS); + kmeans.run(maxIterations); kmeans.clusters(candidates); + candidates.erase(std::remove_if(candidates.begin(), candidates.end(), + [](TSphericalClusterVec& cluster) { + return cluster.empty(); + }), + candidates.end()); CSphericalGaussianInfoCriterion criterion; criterion.add(candidates); - double cost = criterion.calculate(); + double cost{criterion.calculate()}; if (minCost.add(cost)) { result.swap(candidates); } @@ -284,20 +299,19 @@ class CKMeansOnline { //! \p split if it is a valid partition and cleared otherwise. 
bool split(const TSizeVecVec& split, TKMeansOnlineVec& result) { result.clear(); - this->reduce(); if (!this->checkSplit(split)) { return false; } result.reserve(split.size()); - TFloatMeanAccumulatorDoublePrVec clusters; + TFloatPointMeanAccumulatorDoublePrVec clusters; for (std::size_t i = 0u; i < split.size(); ++i) { clusters.clear(); clusters.reserve(split[i].size()); for (std::size_t j = 0u; j < split[i].size(); ++j) { clusters.push_back(m_Clusters[split[i][j]]); } - result.emplace_back(m_K, m_DecayRate, m_MinimumCategoryCount, clusters); + result.emplace_back(m_K, m_DecayRate, m_MinClusterSize, clusters); } return true; @@ -308,11 +322,8 @@ class CKMeansOnline { //! \param[in] x A point to add to the clusterer. //! \param[in] count The count weight of this point. void add(const TDoublePoint& x, double count = 1.0) { - if (m_PointsBuffer.size() < MAXIMUM_BUFFER_SIZE) { - m_PointsBuffer.emplace_back(x, count); - } else { - m_Clusters.push_back(TFloatMeanAccumulatorDoublePr()); - CKMeansOnline::add(x, count, m_Clusters.back()); + m_Clusters.emplace_back(CBasicStatistics::momentsAccumulator(count, x), 0.0); + if (m_Clusters.size() > m_K + m_BufferSize) { this->reduce(); } } @@ -323,18 +334,13 @@ class CKMeansOnline { void merge(const CKMeansOnline& other) { LOG_TRACE(<< "Merge"); - for (std::size_t i = 0u; i < other.m_PointsBuffer.size(); ++i) { - m_Clusters.push_back(TFloatMeanAccumulatorDoublePr()); - CKMeansOnline::add(other.m_PointsBuffer[i].first, - other.m_PointsBuffer[i].second, m_Clusters.back()); - } m_Clusters.insert(m_Clusters.end(), other.m_Clusters.begin(), other.m_Clusters.end()); this->reduce(); // Reclaim memory from the vector buffer. - TFloatMeanAccumulatorDoublePrVec categories(m_Clusters); + TFloatPointMeanAccumulatorDoublePrVec categories(m_Clusters); m_Clusters.swap(categories); } @@ -348,7 +354,7 @@ class CKMeansOnline { return; } - double alpha = std::exp(-m_DecayRate * time); + double alpha{std::exp(-m_DecayRate * time)}; LOG_TRACE(<< "alpha = " << alpha); this->age(alpha); @@ -365,14 +371,17 @@ class CKMeansOnline { // Prune any dead categories: we're not interested in // maintaining categories with low counts. m_Clusters.erase(std::remove_if(m_Clusters.begin(), m_Clusters.end(), - CShouldDelete(m_MinimumCategoryCount)), + [this](const auto& cluster) { + return CBasicStatistics::count( + cluster.first) < m_MinClusterSize; + }), m_Clusters.end()); LOG_TRACE(<< "clusters = " << core::CContainerPrinter::print(m_Clusters)); } - //! Get the current points buffer. - bool buffering() const { return m_PointsBuffer.size() > 0; } + //! Check if there are points in the buffer. + bool buffering() const { return m_Clusters.size() > m_K; } //! Get \p n samples of the distribution corresponding to the //! categories we are maintaining. @@ -388,17 +397,15 @@ class CKMeansOnline { using TDoubleVec = std::vector; using TDoubleSizePr = std::pair; - static const double ALMOST_ONE = 0.99999; - // See, for example, Effective C++ item 3. 
const_cast(this)->reduce(); LOG_TRACE(<< "categories = " << core::CContainerPrinter::print(m_Clusters)); TDoubleVec counts; counts.reserve(m_Clusters.size()); - double Z = 0.0; + double Z{0.0}; for (std::size_t i = 0u; i < m_Clusters.size(); ++i) { - double ni = CBasicStatistics::count(m_Clusters[i].first); + double ni{CBasicStatistics::count(m_Clusters[i].first)}; counts.push_back(ni); Z += ni; } @@ -412,35 +419,35 @@ class CKMeansOnline { result.reserve(2 * numberSamples); TDoubleVec weights; - TDoublePointVec categorySamples; + TDoublePointVec clusterSamples; for (std::size_t i = 0u; i < m_Clusters.size(); ++i) { - double ni = counts[i]; + double ni{counts[i]}; - categorySamples.clear(); - TDoublePoint m = CBasicStatistics::mean(m_Clusters[i].first); + clusterSamples.clear(); + TDoublePoint m{CBasicStatistics::mean(m_Clusters[i].first)}; if (m_Clusters[i].second == 0.0) { - categorySamples.push_back(m); + clusterSamples.push_back(m); } else { - std::size_t ni_ = static_cast(std::ceil(ni)); + std::size_t ni_{static_cast(std::ceil(ni))}; TDoublePoint v(m_Clusters[i].second); - sampleGaussian(ni_, m, v.asDiagonal(), categorySamples); + sampleGaussian(ni_, m, v.asDiagonal(), clusterSamples); } - ni /= static_cast(categorySamples.size()); + ni /= static_cast(clusterSamples.size()); - result.insert(result.end(), categorySamples.begin(), categorySamples.end()); - weights.insert(weights.end(), categorySamples.size(), ni); + result.insert(result.end(), clusterSamples.begin(), clusterSamples.end()); + weights.insert(weights.end(), clusterSamples.size(), ni); } LOG_TRACE(<< "samples = " << core::CContainerPrinter::print(result)); LOG_TRACE(<< "weights = " << core::CContainerPrinter::print(weights)); - TDoublePointVec final; - final.reserve(static_cast( + TDoublePointVec finalSamples; + finalSamples.reserve(static_cast( std::ceil(std::accumulate(weights.begin(), weights.end(), 0.0)))); - TDoubleMeanAccumulator sample; + TDoublePointMeanAccumulator sample{las::zero(result[0])}; for (;;) { - CBasicStatistics::COrderStatisticsStack nearest; - const TDoublePoint& sample_ = CBasicStatistics::mean(sample); + CBasicStatistics::SMin::TAccumulator nearest; + const TDoublePoint& sample_{CBasicStatistics::mean(sample)}; for (std::size_t j = 0u; j < result.size(); ++j) { if (weights[j] > 0.0) { nearest.add({las::distance(result[j], sample_), j}); @@ -450,20 +457,20 @@ class CKMeansOnline { break; } - std::size_t j = nearest[0].second; - const TDoublePoint& xj = result[j]; + std::size_t j{nearest[0].second}; + const TDoublePoint& xj{result[j]}; do { - double nj = std::min(1.0 - CBasicStatistics::count(sample), weights[j]); + double nj{std::min(1.0 - CBasicStatistics::count(sample), weights[j])}; sample.add(xj, nj); weights[j] -= nj; if (CBasicStatistics::count(sample) > ALMOST_ONE) { - final.push_back(CBasicStatistics::mean(sample)); - sample = TDoubleMeanAccumulator(); + finalSamples.push_back(CBasicStatistics::mean(sample)); + sample = TDoublePointMeanAccumulator{las::zero(result[0])}; } } while (weights[j] > 0.0); } - result.swap(final); + result = std::move(finalSamples); LOG_TRACE(<< "# samples = " << result.size()); LOG_TRACE(<< "samples = " << core::CContainerPrinter::print(result)); } @@ -474,25 +481,21 @@ class CKMeansOnline { } //! Get a checksum for this object. 
- uint64_t checksum(uint64_t seed = 0) const { + std::uint64_t checksum(std::uint64_t seed = 0) const { seed = CChecksum::calculate(seed, m_K); seed = CChecksum::calculate(seed, m_DecayRate); - seed = CChecksum::calculate(seed, m_Clusters); - return CChecksum::calculate(seed, m_PointsBuffer); + return CChecksum::calculate(seed, m_Clusters); } //! Get the memory used by this component void debugMemoryUsage(const core::CMemoryUsage::TMemoryUsagePtr& mem) const { mem->setName("CKMeansOnline"); core::CMemoryDebug::dynamicSize("m_Clusters", m_Clusters, mem); - core::CMemoryDebug::dynamicSize("m_PointsBuffer", m_PointsBuffer, mem); } //! Get the memory used by this component std::size_t memoryUsage() const { - std::size_t mem = core::CMemory::dynamicSize(m_Clusters); - mem += core::CMemory::dynamicSize(m_PointsBuffer); - return mem; + return core::CMemory::dynamicSize(m_Clusters); } protected: @@ -519,12 +522,7 @@ class CKMeansOnline { //! Reduce the number of clusters to m_K by k-means clustering. void reduce() { - // Add all the points as new spherical clusters and reduce. - for (const auto& point : m_PointsBuffer) { - m_Clusters.push_back(TFloatMeanAccumulatorDoublePr()); - CKMeansOnline::add(point.first, point.second, m_Clusters.back()); - } - m_PointsBuffer.clear(); + deduplicate(m_Clusters); if (m_Clusters.size() < m_K) { return; @@ -533,21 +531,19 @@ class CKMeansOnline { LOG_TRACE(<< "clusters = " << core::CContainerPrinter::print(m_Clusters)); LOG_TRACE(<< "# clusters = " << m_Clusters.size()); - TSphericalClusterVecVec kclusters; - { - TSphericalClusterVec clusters; - this->clusters(clusters); - kmeans(m_Rng, clusters, m_K, kclusters); - } + TSphericalClusterVecVec newClusters; + TSphericalClusterVec oldClusters; + this->clusters(oldClusters); + kmeans(m_Rng, oldClusters, m_K, newClusters, m_NumberSeeds, m_MaxIterations); - m_Clusters.resize(kclusters.size()); - for (std::size_t i = 0u; i < kclusters.size(); ++i) { - TDoubleMeanVarAccumulator cluster; - for (const auto& point : kclusters[i]) { + m_Clusters.resize(newClusters.size()); + for (std::size_t i = 0u; i < newClusters.size(); ++i) { + TDoublePointMeanVarAccumulator cluster{las::zero(oldClusters[0])}; + for (const auto& point : newClusters[i]) { cluster.add(point); } - double n = CBasicStatistics::count(cluster); - const TDoublePoint& m = CBasicStatistics::mean(cluster); + double n{CBasicStatistics::count(cluster)}; + const TDoublePoint& m{CBasicStatistics::mean(cluster)}; m_Clusters[i].first = CBasicStatistics::momentsAccumulator( TFloatCoordinate(n), TFloatPoint(m)); m_Clusters[i].second = variance(cluster); @@ -557,28 +553,41 @@ class CKMeansOnline { LOG_TRACE(<< "# reduced clusters = " << m_Clusters.size()); } - //! Add \p count copies of \p mx to the cluster \p cluster. - static void add(const TDoublePoint& mx, double count, TFloatMeanAccumulatorDoublePr& cluster) { - double nx = count; - TDoublePoint vx(0.0); - double nc = CBasicStatistics::count(cluster.first); - TDoublePoint mc = CBasicStatistics::mean(cluster.first); - TDoublePoint vc(cluster.second); - TDoubleMeanVarAccumulator moments = - CBasicStatistics::momentsAccumulator(nc, mc, vc) + - CBasicStatistics::momentsAccumulator(nx, mx, vx); - TFloatCoordinate ncx = CBasicStatistics::count(moments); - TFloatPoint mcx = CBasicStatistics::mean(moments); - cluster.first = CBasicStatistics::momentsAccumulator(ncx, mcx); - cluster.second = variance(moments); + //! Remove any duplicates in \p points. + //! + //! 
\note We assume \p points is small so the bruteforce approach is fast. + static void deduplicate(TFloatPointMeanAccumulatorDoublePrVec& clusters) { + if (clusters.size() > 1) { + std::stable_sort(clusters.begin(), clusters.end(), + [](const auto& lhs, const auto& rhs) { + return CBasicStatistics::mean(lhs.first) < + CBasicStatistics::mean(rhs.first); + }); + auto back = clusters.begin(); + for (auto i = back + 1; i != clusters.end(); ++i) { + if (CBasicStatistics::mean(back->first) == CBasicStatistics::mean(i->first)) { + back->first += i->first; + double n[]{CBasicStatistics::count(back->first), + CBasicStatistics::count(i->first)}; + back->second = (n[0] * back->second + n[1] * i->second) / + (n[0] + n[1]); + } else if (++back != i) { + *back = std::move(*i); + } + } + clusters.erase(back + 1, clusters.end()); + } } //! Get the spherically symmetric variance from \p moments. - static double variance(const TDoubleMeanVarAccumulator& moments) { - const TDoublePoint& v = CBasicStatistics::maximumLikelihoodVariance(moments); - return v.L1() / static_cast(v.dimension()); + static double variance(const TDoublePointMeanVarAccumulator& moments) { + const TDoublePoint& v{CBasicStatistics::maximumLikelihoodVariance(moments)}; + return las::L1(v) / static_cast(las::dimension(v)); } +private: + static constexpr double ALMOST_ONE = 0.99999; + private: //! The random number generator. CPRNG::CXorOShiro128Plus m_Rng; @@ -586,24 +595,29 @@ class CKMeansOnline { //! The number of clusters to maintain. std::size_t m_K; + //! The number of points to buffer before clustering. + std::size_t m_BufferSize; + + //! The number of seeds to use when reclustering. + std::size_t m_NumberSeeds; + + //! The number of iterations of k-means to use when reclustering. + std::size_t m_MaxIterations; + //! The rate at which the categories lose information. double m_DecayRate; - //! The minimum permitted count for a cluster. - double m_MinimumCategoryCount; + //! The minimum permitted number of points in a cluster. + double m_MinClusterSize; //! The clusters we are maintaining. - TFloatMeanAccumulatorDoublePrVec m_Clusters; - - //! A buffer of the points added while the space constraint - //! is satisfied. 
- TFloatPointDoublePrVec m_PointsBuffer; + TFloatPointMeanAccumulatorDoublePrVec m_Clusters; }; template const std::size_t CKMeansOnline::MINIMUM_SPACE = 4u; template -const std::size_t CKMeansOnline::MAXIMUM_BUFFER_SIZE = 6u; +const std::size_t CKMeansOnline::BUFFER_SIZE = 6u; template const std::size_t CKMeansOnline::NUMBER_SEEDS = 5u; template @@ -617,6 +631,12 @@ template const core::TPersistenceTag CKMeansOnline::POINTS_TAG("c", "points"); template const core::TPersistenceTag CKMeansOnline::RNG_TAG("d", "rng"); +template +const core::TPersistenceTag CKMeansOnline::BUFFER_SIZE_TAG("e", "buffer_size"); +template +const core::TPersistenceTag CKMeansOnline::NUMBER_SEEDS_TAG("f", "number_seeds"); +template +const core::TPersistenceTag CKMeansOnline::MAX_ITERATIONS_TAG("g", "max_iterations"); } } diff --git a/include/maths/CLinearAlgebraEigen.h b/include/maths/CLinearAlgebraEigen.h index 22aca2b029..7af9a97219 100644 --- a/include/maths/CLinearAlgebraEigen.h +++ b/include/maths/CLinearAlgebraEigen.h @@ -25,6 +25,7 @@ #include #include +#include #include namespace Eigen { @@ -43,7 +44,7 @@ bool operator<(const SparseMatrix& lhs, LESS_OR_GREATER(lhs.cols(), rhs.cols()) for (STORAGE_INDEX i = 0; i < lhs.rows(); ++i) { for (STORAGE_INDEX j = 0; j < lhs.cols(); ++j) { - LESS_OR_GREATER(lhs.coeff(i, j), rhs.coeff(i, j)) + LESS_OR_GREATER(lhs(i, j), rhs(i, j)) } } return false; @@ -55,7 +56,7 @@ bool operator<(const SparseVector& lhs, const SparseVector& rhs) { LESS_OR_GREATER(lhs.size(), rhs.size()) for (STORAGE_INDEX i = 0; i < lhs.size(); ++i) { - LESS_OR_GREATER(lhs.coeff(i), rhs(i)) + LESS_OR_GREATER(lhs(i), rhs(i)) } return false; } @@ -66,12 +67,8 @@ bool operator<(const Matrix& lh const Matrix& rhs) { LESS_OR_GREATER(lhs.rows(), rhs.rows()) LESS_OR_GREATER(lhs.cols(), rhs.cols()) - for (decltype(lhs.rows()) i = 0; i < lhs.rows(); ++i) { - for (decltype(lhs.cols()) j = 0; j < lhs.cols(); ++j) { - LESS_OR_GREATER(lhs.coeff(i, j), rhs.coeff(i, j)) - } - } - return false; + return std::lexicographical_compare(lhs.data(), lhs.data() + lhs.size(), + rhs.data(), rhs.data() + rhs.size()); } //! Less than on an Eigen memory mapped matrix. @@ -80,12 +77,8 @@ bool operator<(const Map& lhs, const Map& rhs) { LESS_OR_GREATER(lhs.rows(), rhs.rows()) LESS_OR_GREATER(lhs.cols(), rhs.cols()) - for (decltype(lhs.rows()) i = 0; i < lhs.rows(); ++i) { - for (decltype(lhs.cols()) j = 0; j < lhs.cols(); ++j) { - LESS_OR_GREATER(lhs.coeff(i, j), rhs.coeff(i, j)) - } - } - return false; + return std::lexicographical_compare(lhs.data(), lhs.data() + lhs.size(), + rhs.data(), rhs.data() + rhs.size()); } #undef LESS_OR_GREATER @@ -302,7 +295,7 @@ class CDenseVector : public Eigen::Matrix { return result; } -private: + //! Convert from a std::vector. static CDenseVector fromStdVector(const std::vector& vector) { CDenseVector result(vector.size()); for (std::size_t i = 0; i < vector.size(); ++i) { diff --git a/include/maths/CLinearAlgebraShims.h b/include/maths/CLinearAlgebraShims.h index 914d128abf..cab1e3b8b1 100644 --- a/include/maths/CLinearAlgebraShims.h +++ b/include/maths/CLinearAlgebraShims.h @@ -20,6 +20,32 @@ namespace ml { namespace maths { namespace las { +//! Swap two vectors or matrices efficiently. +template +void swap(T& lhs, T& rhs) { + lhs.swap(rhs); +} + +//! Swap two stack vectors. +template +void swap(CVectorNx1& lhs, CVectorNx1& rhs) { + std::swap(lhs, rhs); +} + +//! Swap two annotated vectors and their annotations. 
+template +void swap(CAnnotatedVector& lhs, + CAnnotatedVector& rhs) { + swap(static_cast(lhs), static_cast(rhs)); + std::swap(lhs.annotation(), rhs.annotation()); +} + +//! Swap two stack matrices. +template +void swap(CSymmetricMatrixNxN& lhs, CSymmetricMatrixNxN& rhs) { + std::swap(lhs, rhs); +} + //! Get the dimension of one of our internal vectors. template std::size_t dimension(const VECTOR& x) { diff --git a/include/maths/CTools.h b/include/maths/CTools.h index d51f760564..0e0ed7c03b 100644 --- a/include/maths/CTools.h +++ b/include/maths/CTools.h @@ -12,6 +12,7 @@ #include #include +#include #include #include @@ -683,6 +684,31 @@ class MATHS_EXPORT CTools : private core::CNonInstantiatable { return sigmoid(std::exp(std::copysign(1.0, sign) * (x - x0) / width)); } + //! Compute the softmax from the multinomial logit values \p logit. + //! + //! i.e. \f$[\sigma(z)]_i = \frac{exp(z_i)}{\sum_j exp(z_j)}\f$. + //! + //! \tparam COLLECTION Is assumed to be a collection type, i.e. it + //! must support iterator based access. + template + static COLLECTION softmax(COLLECTION z) { + COLLECTION probabilities{std::move(z)}; + double Z{0.0}; + double zmax{*std::max_element(z.begin(), z.end())}; + for (auto& pi : probabilities) { + pi = std::exp(pi - zmax); + Z += pi; + } + for (auto& pi : probabilities) { + pi /= Z; + } + return probabilities; + } + + //! Specialize the softmax for our dense vector type. + template + static CDenseVector softmax(CDenseVector z); + //! Linearly interpolate a function on the interval [\p a, \p b]. static double linearlyInterpolate(double a, double b, double fa, double fb, double x); diff --git a/include/maths/CToolsDetail.h b/include/maths/CToolsDetail.h index 1224ee8a89..050d86822d 100644 --- a/include/maths/CToolsDetail.h +++ b/include/maths/CToolsDetail.h @@ -300,6 +300,13 @@ void CTools::spread(double a, double b, double separation, T& points) { LOG_TRACE(<< "# iterations = " << iteration << " # points = " << n + 1); } + +template +CDenseVector CTools::softmax(CDenseVector z) { + double zmax{z.maxCoeff()}; + z = (z.array() - zmax).exp(); + return z / z.template lpNorm<1>(); +} } } diff --git a/include/maths/CXMeansOnline.h b/include/maths/CXMeansOnline.h index 54337669f5..a409da6440 100644 --- a/include/maths/CXMeansOnline.h +++ b/include/maths/CXMeansOnline.h @@ -391,7 +391,7 @@ class CXMeansOnline : public CClusterer> { TKMeansOnline::kmeans(rng, node, 2, candidate); LOG_TRACE(<< "candidate = " << core::CContainerPrinter::print(candidate)); - if (candidate.size() != 2) { + if (candidate.size() > 2) { LOG_ERROR(<< "Expected 2-split: " << core::CContainerPrinter::print(candidate)); break; diff --git a/include/maths/Constants.h b/include/maths/Constants.h index 26b155459d..f15efe6941 100644 --- a/include/maths/Constants.h +++ b/include/maths/Constants.h @@ -25,62 +25,62 @@ namespace maths { //! the sample mean. However, it means we are insensitive anomalous //! deviations in data whose variation is significantly smaller than //! this minimum value. -const double MINIMUM_COEFFICIENT_OF_VARIATION{1e-4}; +constexpr double MINIMUM_COEFFICIENT_OF_VARIATION{1e-4}; //! The largest probability for which an event is considered anomalous //! enough to be worthwhile showing a user. -const double LARGEST_SIGNIFICANT_PROBABILITY{0.05}; +constexpr double LARGEST_SIGNIFICANT_PROBABILITY{0.05}; //! The largest probability that it is deemed significantly anomalous. -const double SMALL_PROBABILITY{1e-4}; +constexpr double SMALL_PROBABILITY{1e-4}; //! 
The largest probability that it is deemed extremely anomalous. //! Probabilities smaller than this are only weakly discriminated //! in the sense that they are given the correct order, but fairly //! similar score. -const double MINUSCULE_PROBABILITY{1e-50}; +constexpr double MINUSCULE_PROBABILITY{1e-50}; //! The margin between the smallest value and the support left end //! to use for the gamma distribution. -const double GAMMA_OFFSET_MARGIN{0.1}; +constexpr double GAMMA_OFFSET_MARGIN{0.1}; //! The margin between the smallest value and the support left end //! to use for the log-normal distribution. -const double LOG_NORMAL_OFFSET_MARGIN{1.0}; +constexpr double LOG_NORMAL_OFFSET_MARGIN{1.0}; //! The minimum amount by which a trend decomposition component can //! reduce the prediction error variance and still be worthwhile //! modeling. We have different thresholds because we have inductive //! bias for particular types of components. -const double COMPONENT_SIGNIFICANT_VARIANCE_REDUCTION[]{0.6, 0.4}; +constexpr double COMPONENT_SIGNIFICANT_VARIANCE_REDUCTION[]{0.6, 0.4}; //! The minimum repeated amplitude of a seasonal component, as a //! multiple of error standard deviation, to be worthwhile modeling. //! We have different thresholds because we have inductive bias for //! particular types of components. -const double SEASONAL_SIGNIFICANT_AMPLITUDE[]{1.0, 2.0}; +constexpr double SEASONAL_SIGNIFICANT_AMPLITUDE[]{1.0, 2.0}; //! The minimum autocorrelation of a seasonal component to be //! worthwhile modeling. We have different thresholds because we //! have inductive bias for particular types of components. -const double SEASONAL_SIGNIFICANT_AUTOCORRELATION[]{0.5, 0.6}; +constexpr double SEASONAL_SIGNIFICANT_AUTOCORRELATION[]{0.5, 0.6}; //! The fraction of values which are treated as outliers when testing //! for and initializing a seasonal component. -const double SEASONAL_OUTLIER_FRACTION{0.1}; +constexpr double SEASONAL_OUTLIER_FRACTION{0.1}; //! The minimum multiplier of the mean inlier fraction difference //! (from a periodic pattern) to constitute an outlier when testing //! for and initializing a seasonal component. -const double SEASONAL_OUTLIER_DIFFERENCE_THRESHOLD{3.0}; +constexpr double SEASONAL_OUTLIER_DIFFERENCE_THRESHOLD{3.0}; //! The weight to assign outliers when testing for and initializing //! a seasonal component. -const double SEASONAL_OUTLIER_WEIGHT{0.1}; +constexpr double SEASONAL_OUTLIER_WEIGHT{0.1}; //! The significance of a test statistic to choose to model //! a trend decomposition component. -const double COMPONENT_STATISTICALLY_SIGNIFICANT{0.001}; +constexpr double COMPONENT_STATISTICALLY_SIGNIFICANT{0.001}; //! The log of COMPONENT_STATISTICALLY_SIGNIFICANT. const double LOG_COMPONENT_STATISTICALLY_SIGNIFICANCE{ @@ -88,17 +88,17 @@ const double LOG_COMPONENT_STATISTICALLY_SIGNIFICANCE{ //! The default number of regression models used in periodic and //! calendar cyclic components of the trend decomposition. -const std::size_t COMPONENT_SIZE{36u}; +constexpr std::size_t COMPONENT_SIZE{36u}; //! The minimum variance scale for which the likelihood function //! can be accurately adjusted. For smaller scales errors are //! introduced for some priors. -const double MINIMUM_ACCURATE_VARIANCE_SCALE{0.5}; +constexpr double MINIMUM_ACCURATE_VARIANCE_SCALE{0.5}; //! The maximum variance scale for which the likelihood function //! can be accurately adjusted. For larger scales errors are //! introduced for some priors. 
-const double MAXIMUM_ACCURATE_VARIANCE_SCALE{2.0}; +constexpr double MAXIMUM_ACCURATE_VARIANCE_SCALE{2.0}; //! The confidence interval to use for the seasonal trend and //! variation. We detrend to the nearest point in the confidence @@ -106,16 +106,16 @@ const double MAXIMUM_ACCURATE_VARIANCE_SCALE{2.0}; //! scaling the likelihood function so that we don't get transient //! anomalies after detecting a periodic trend (when the trend //! can be in significant error). -const double DEFAULT_SEASONAL_CONFIDENCE_INTERVAL{50.0}; +constexpr double DEFAULT_SEASONAL_CONFIDENCE_INTERVAL{50.0}; //! The minimum fractional count of points in a cluster. -const double MINIMUM_CLUSTER_SPLIT_FRACTION{0.0}; +constexpr double MINIMUM_CLUSTER_SPLIT_FRACTION{0.0}; //! The default minimum count of points in a cluster. -const double MINIMUM_CLUSTER_SPLIT_COUNT{24.0}; +constexpr double MINIMUM_CLUSTER_SPLIT_COUNT{24.0}; //! The minimum count of a category in the sketch to cluster. -const double MINIMUM_CATEGORY_COUNT{0.5}; +constexpr double MINIMUM_CATEGORY_COUNT{0.5}; } } diff --git a/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc b/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc index 59892c311e..8777a9e439 100644 --- a/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc +++ b/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc @@ -77,7 +77,8 @@ CDataFrameTrainBoostedTreeClassifierRunner::CDataFrameTrainBoostedTreeClassifier const CDataFrameAnalysisSpecification& spec, const CDataFrameAnalysisParameters& parameters) : CDataFrameTrainBoostedTreeRunner{ - spec, parameters, std::make_unique()} { + spec, parameters, + std::make_unique()} { m_NumTopClasses = parameters[NUM_TOP_CLASSES].fallback(std::size_t{0}); m_PredictionFieldType = diff --git a/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc b/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc index 72d0e28be2..7b702b36e2 100644 --- a/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc @@ -236,7 +236,7 @@ void addPredictionTestData(EPredictionType type, if (type == E_Regression) { loss = std::make_unique(); } else { - loss = std::make_unique(); + loss = std::make_unique(); } maths::CBoostedTreeFactory treeFactory{ diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index f5819c335f..bfa25e413d 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -357,7 +357,8 @@ void CBoostedTreeImpl::initializePerFoldTestLosses() { } void CBoostedTreeImpl::computeProbabilityAtWhichToAssignClassOne(const core::CDataFrame& frame) { - if (m_Loss->name() == boosted_tree::CBinomialLogistic::NAME) { + // TODO generalize for multi-class. 
+ if (m_Loss->name() == boosted_tree::CBinomialLogisticLoss::NAME) { switch (m_ClassAssignmentObjective) { case CBoostedTree::E_Accuracy: break; @@ -911,7 +912,8 @@ void CBoostedTreeImpl::refreshPredictionsAndLossDerivatives(core::CDataFrame& fr using TArgMinLossVec = std::vector; TArgMinLossVec leafValues( - tree.size(), m_Loss->minimizer(m_Regularization.leafWeightPenaltyMultiplier())); + tree.size(), + m_Loss->minimizer(m_Regularization.leafWeightPenaltyMultiplier(), m_Rng)); auto nextPass = [&] { bool done{true}; for (const auto& value : leafValues) { diff --git a/lib/maths/CBoostedTreeLoss.cc b/lib/maths/CBoostedTreeLoss.cc index 1d7196efaa..45a967b8f8 100644 --- a/lib/maths/CBoostedTreeLoss.cc +++ b/lib/maths/CBoostedTreeLoss.cc @@ -6,8 +6,16 @@ #include +#include +#include +#include +#include +#include #include #include +#include + +#include namespace ml { namespace maths { @@ -31,6 +39,16 @@ double logLogistic(double logOdds) { } return CTools::fastLog(CTools::logisticFunction(logOdds)); } + +template +CDenseVector logSoftmax(CDenseVector z) { + // Version which handles overflow and underflow when taking exponentials. + double zmax{z.maxCoeff()}; + z = z - zmax * CDenseVector::Ones(z.size()); + double Z{z.array().exp().matrix().template lpNorm<1>()}; + z = z - std::log(Z) * CDenseVector::Ones(z.size()); + return std::move(z); +} } namespace boosted_tree_detail { @@ -82,31 +100,31 @@ CArgMinMseImpl::TDoubleVector CArgMinMseImpl::value() const { return result; } -CArgMinLogisticImpl::CArgMinLogisticImpl(double lambda) - : CArgMinLossImpl{lambda}, m_CategoryCounts{0}, - m_BucketCategoryCounts(128, TDoubleVector2x1{0.0}) { +CArgMinBinomialLogisticLossImpl::CArgMinBinomialLogisticLossImpl(double lambda) + : CArgMinLossImpl{lambda}, m_ClassCounts{0}, + m_BucketsClassCounts(NUMBER_BUCKETS, TDoubleVector2x1{0.0}) { } -std::unique_ptr CArgMinLogisticImpl::clone() const { - return std::make_unique(*this); +std::unique_ptr CArgMinBinomialLogisticLossImpl::clone() const { + return std::make_unique(*this); } -bool CArgMinLogisticImpl::nextPass() { +bool CArgMinBinomialLogisticLossImpl::nextPass() { m_CurrentPass += this->bucketWidth() > 0.0 ? 
1 : 2; return m_CurrentPass < 2; } -void CArgMinLogisticImpl::add(const TMemoryMappedFloatVector& prediction, - double actual, - double weight) { +void CArgMinBinomialLogisticLossImpl::add(const TMemoryMappedFloatVector& prediction, + double actual, + double weight) { switch (m_CurrentPass) { case 0: { m_PredictionMinMax.add(prediction(0)); - m_CategoryCounts(static_cast(actual)) += weight; + m_ClassCounts(static_cast(actual)) += weight; break; } case 1: { - auto& count = m_BucketCategoryCounts[this->bucket(prediction(0))]; + auto& count = m_BucketsClassCounts[this->bucket(prediction(0))]; count(static_cast(actual)) += weight; break; } @@ -115,17 +133,17 @@ void CArgMinLogisticImpl::add(const TMemoryMappedFloatVector& prediction, } } -void CArgMinLogisticImpl::merge(const CArgMinLossImpl& other) { - const auto* logistic = dynamic_cast(&other); +void CArgMinBinomialLogisticLossImpl::merge(const CArgMinLossImpl& other) { + const auto* logistic = dynamic_cast(&other); if (logistic != nullptr) { switch (m_CurrentPass) { case 0: m_PredictionMinMax += logistic->m_PredictionMinMax; - m_CategoryCounts += logistic->m_CategoryCounts; + m_ClassCounts += logistic->m_ClassCounts; break; case 1: - for (std::size_t i = 0; i < m_BucketCategoryCounts.size(); ++i) { - m_BucketCategoryCounts[i] += logistic->m_BucketCategoryCounts[i]; + for (std::size_t i = 0; i < m_BucketsClassCounts.size(); ++i) { + m_BucketsClassCounts[i] += logistic->m_BucketsClassCounts[i]; } break; default: @@ -134,7 +152,7 @@ void CArgMinLogisticImpl::merge(const CArgMinLossImpl& other) { } } -CArgMinLogisticImpl::TDoubleVector CArgMinLogisticImpl::value() const { +CArgMinBinomialLogisticLossImpl::TDoubleVector CArgMinBinomialLogisticLossImpl::value() const { std::function objective; double minWeight; @@ -151,16 +169,16 @@ CArgMinLogisticImpl::TDoubleVector CArgMinLogisticImpl::value() const { : 0.0}; objective = [prediction, this](double weight) { double logOdds{prediction + weight}; - double c0{m_CategoryCounts(0)}; - double c1{m_CategoryCounts(1)}; + double c0{m_ClassCounts(0)}; + double c1{m_ClassCounts(1)}; return this->lambda() * CTools::pow2(weight) - c0 * logOneMinusLogistic(logOdds) - c1 * logLogistic(logOdds); }; // Weight shrinkage means the optimal weight will be somewhere between // the logit of the empirical probability and zero. 
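        // The ridge penalty lambda * w^2 pulls the solution from the unregularised
        // optimum towards zero, so the minimum can be bracketed using the empirical
        // log-odds. Adding 1.0 to each class count is a Laplace-style correction
        // which keeps the empirical log-odds finite when one class is absent.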
- double c0{m_CategoryCounts(0) + 1.0}; - double c1{m_CategoryCounts(1) + 1.0}; + double c0{m_ClassCounts(0) + 1.0}; + double c1{m_ClassCounts(1) + 1.0}; double empiricalProbabilityC1{c1 / (c0 + c1)}; double empiricalLogOddsC1{ std::log(empiricalProbabilityC1 / (1.0 - empiricalProbabilityC1))}; @@ -170,10 +188,10 @@ CArgMinLogisticImpl::TDoubleVector CArgMinLogisticImpl::value() const { } else { objective = [this](double weight) { double loss{0.0}; - for (std::size_t i = 0; i < m_BucketCategoryCounts.size(); ++i) { + for (std::size_t i = 0; i < m_BucketsClassCounts.size(); ++i) { double logOdds{this->bucketCentre(i) + weight}; - double c0{m_BucketCategoryCounts[i](0)}; - double c1{m_BucketCategoryCounts[i](1)}; + double c0{m_BucketsClassCounts[i](0)}; + double c1{m_BucketsClassCounts[i](1)}; loss -= c0 * logOneMinusLogistic(logOdds) + c1 * logLogistic(logOdds); } return loss + this->lambda() * CTools::pow2(weight); @@ -205,6 +223,216 @@ CArgMinLogisticImpl::TDoubleVector CArgMinLogisticImpl::value() const { result(0) = minimum; return result; } + +CArgMinMultinomialLogisticLossImpl::CArgMinMultinomialLogisticLossImpl(std::size_t numberClasses, + double lambda, + const CPRNG::CXorOShiro128Plus& rng) + : CArgMinLossImpl{lambda}, m_NumberClasses{numberClasses}, m_Rng{rng}, + m_ClassCounts{TDoubleVector::Zero(numberClasses)}, + m_PredictionSketch{NUMBER_CENTRES / 2, // The size of the partition + 0.0, // The rate at which information is aged out (irrelevant) + 0.0, // The minimum permitted cluster size (irrelevant) + NUMBER_CENTRES / 2, // The buffer size + 1, // The number of seeds for k-means to try + 2} { // The number of iterations to use in k-means +} + +std::unique_ptr CArgMinMultinomialLogisticLossImpl::clone() const { + return std::make_unique(*this); +} + +bool CArgMinMultinomialLogisticLossImpl::nextPass() { + + using TMeanAccumulator = CBasicStatistics::SSampleMean::TAccumulator; + + if (m_CurrentPass++ == 0) { + TKMeans::TSphericalClusterVecVec clusters; + if (m_PredictionSketch.kmeans(NUMBER_CENTRES / 2, clusters) == false) { + m_Centres.push_back(TDoubleVector::Zero(m_NumberClasses)); + ++m_CurrentPass; + } else { + // Extract the k-centres. + m_Centres.reserve(clusters.size()); + for (const auto& cluster : clusters) { + TMeanAccumulator centre{TDoubleVector::Zero(m_NumberClasses)}; + for (const auto& point : cluster) { + centre.add(point); + } + m_Centres.push_back(CBasicStatistics::mean(centre)); + } + std::stable_sort(m_Centres.begin(), m_Centres.end()); + m_Centres.erase(std::unique(m_Centres.begin(), m_Centres.end()), + m_Centres.end()); + LOG_TRACE(<< "# centres = " << m_Centres.size()); + m_CurrentPass += m_Centres.size() == 1 ? 1 : 0; + m_CentresClassCounts.resize(m_Centres.size(), + TDoubleVector::Zero(m_NumberClasses)); + } + + // Reclaim the memory used by k-means. + m_PredictionSketch = TKMeans{0}; + } + + LOG_TRACE(<< "current pass = " << m_CurrentPass); + + return m_CurrentPass < 2; +} + +void CArgMinMultinomialLogisticLossImpl::add(const TMemoryMappedFloatVector& prediction, + double actual, + double weight) { + + using TMinAccumulator = CBasicStatistics::SMin>::TAccumulator; + + switch (m_CurrentPass) { + case 0: { + // We have a member variable to avoid allocating a tempory each time. 
+ m_DoublePrediction = prediction; + m_PredictionSketch.add(m_DoublePrediction, weight); + m_ClassCounts(static_cast(actual)) += weight; + break; + } + case 1: { + TMinAccumulator nearest; + for (std::size_t i = 0; i < m_Centres.size(); ++i) { + nearest.add({(m_Centres[i] - prediction).squaredNorm(), i}); + } + auto& count = m_CentresClassCounts[nearest[0].second]; + count(static_cast(actual)) += weight; + break; + } + default: + break; + } +} + +void CArgMinMultinomialLogisticLossImpl::merge(const CArgMinLossImpl& other) { + const auto* logistic = dynamic_cast(&other); + if (logistic != nullptr) { + switch (m_CurrentPass) { + case 0: + m_PredictionSketch.merge(logistic->m_PredictionSketch); + m_ClassCounts += logistic->m_ClassCounts; + break; + case 1: + for (std::size_t i = 0; i < m_CentresClassCounts.size(); ++i) { + m_CentresClassCounts[i] += logistic->m_CentresClassCounts[i]; + } + break; + default: + break; + } + } +} + +CArgMinMultinomialLogisticLossImpl::TDoubleVector +CArgMinMultinomialLogisticLossImpl::value() const { + + using TMinAccumulator = CBasicStatistics::SMin::TAccumulator; + + TDoubleVector weightBoundingBox[2]; + weightBoundingBox[0] = std::numeric_limits::max() * + TDoubleVector::Ones(m_NumberClasses); + weightBoundingBox[1] = -weightBoundingBox[0]; + + if (m_Centres.size() == 1) { + + // Weight shrinkage means the optimal weight will be somewhere between + // the logit of the empirical probability and zero. + TDoubleVector empiricalProbabilities{m_ClassCounts.array() + 0.1}; + empiricalProbabilities = empiricalProbabilities / + empiricalProbabilities.lpNorm<1>(); + TDoubleVector empiricalLogOdds{ + empiricalProbabilities.array().log().matrix() - m_Centres[0]}; + weightBoundingBox[0] = weightBoundingBox[0].array().min(0.0); + weightBoundingBox[1] = weightBoundingBox[1].array().max(0.0); + weightBoundingBox[0] = weightBoundingBox[0].array().min(empiricalLogOdds.array()); + weightBoundingBox[1] = weightBoundingBox[1].array().max(empiricalLogOdds.array()); + + } else { + + for (const auto& centre : m_Centres) { + weightBoundingBox[0] = weightBoundingBox[0].array().min(-centre.array()); + weightBoundingBox[1] = weightBoundingBox[1].array().max(-centre.array()); + } + } + LOG_TRACE(<< "bounding box blc = " << weightBoundingBox[0].transpose()); + LOG_TRACE(<< "bounding box trc = " << weightBoundingBox[1].transpose()); + + // Optimize via LBFGS with multiple restarts. 
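    // Each restart draws a starting point uniformly at random from the bounding
    // box computed above, runs a limited memory BFGS minimisation of the
    // approximate objective and keeps the weight achieving the smallest loss.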
+ + TMinAccumulator minLoss; + TDoubleVector result; + + TDoubleVector x0(m_NumberClasses); + TObjective objective{this->objective()}; + TObjectiveGradient objectiveGradient{this->objectiveGradient()}; + for (std::size_t i = 0; i < NUMBER_RESTARTS; ++i) { + for (int j = 0; j < x0.size(); ++j) { + double alpha{CSampling::uniformSample(m_Rng, 0.0, 1.0)}; + x0(j) = weightBoundingBox[0](j) + + alpha * (weightBoundingBox[1](j) - weightBoundingBox[0](j)); + } + LOG_TRACE(<< "x0 = " << x0.transpose()); + + double loss; + CLbfgs lgbfs{5}; + std::tie(x0, loss) = lgbfs.minimize(objective, objectiveGradient, std::move(x0)); + if (minLoss.add(loss)) { + result = x0; + } + LOG_TRACE(<< "loss = " << loss << " weight for loss = " << x0.transpose()); + } + LOG_TRACE(<< "minimum loss = " << minLoss << " weight* = " << result.transpose()); + + return result; +} + +CArgMinMultinomialLogisticLossImpl::TObjective +CArgMinMultinomialLogisticLossImpl::objective() const { + TDoubleVector logProbabilities; + double lambda{this->lambda()}; + if (m_Centres.size() == 1) { + return [logProbabilities, lambda, this](const TDoubleVector& weight) mutable { + logProbabilities = m_Centres[0] + weight; + logProbabilities = logSoftmax(std::move(logProbabilities)); + return lambda * weight.squaredNorm() - m_ClassCounts.transpose() * logProbabilities; + }; + } + return [logProbabilities, lambda, this](const TDoubleVector& weight) mutable { + double loss{0.0}; + for (std::size_t i = 0; i < m_CentresClassCounts.size(); ++i) { + logProbabilities = m_Centres[i] + weight; + logProbabilities = logSoftmax(std::move(logProbabilities)); + loss -= m_CentresClassCounts[i].transpose() * logProbabilities; + } + return loss + lambda * weight.squaredNorm(); + }; +} + +CArgMinMultinomialLogisticLossImpl::TObjectiveGradient +CArgMinMultinomialLogisticLossImpl::objectiveGradient() const { + TDoubleVector probabilities; + double lambda{this->lambda()}; + if (m_Centres.size() == 1) { + return [probabilities, lambda, this](const TDoubleVector& weight) mutable { + probabilities = m_Centres[0] + weight; + probabilities = CTools::softmax(std::move(probabilities)); + return TDoubleVector{2.0 * lambda * weight - + (m_ClassCounts - m_ClassCounts.array().sum() * probabilities)}; + }; + } + return [probabilities, lambda, this](const TDoubleVector& weight) mutable -> TDoubleVector { + TDoubleVector lossGradient{TDoubleVector::Zero(m_NumberClasses)}; + for (std::size_t i = 0; i < m_CentresClassCounts.size(); ++i) { + probabilities = m_Centres[i] + weight; + probabilities = CTools::softmax(std::move(probabilities)); + lossGradient -= m_CentresClassCounts[i] - + m_CentresClassCounts[i].array().sum() * probabilities; + } + return TDoubleVector{2.0 * lambda * weight + lossGradient}; + }; +} } namespace boosted_tree { @@ -277,7 +505,7 @@ CMse::TDoubleVector CMse::transform(const TMemoryMappedFloatVector& prediction) return TDoubleVector{prediction}; } -CArgMinLoss CMse::minimizer(double lambda) const { +CArgMinLoss CMse::minimizer(double lambda, const CPRNG::CXorOShiro128Plus&) const { return this->makeMinimizer(CArgMinMseImpl{lambda}); } @@ -287,25 +515,25 @@ const std::string& CMse::name() const { const std::string CMse::NAME{"mse"}; -std::unique_ptr CBinomialLogistic::clone() const { - return std::make_unique(*this); +std::unique_ptr CBinomialLogisticLoss::clone() const { + return std::make_unique(*this); } -std::size_t CBinomialLogistic::numberParameters() const { +std::size_t CBinomialLogisticLoss::numberParameters() const { return 1; } -double 
CBinomialLogistic::value(const TMemoryMappedFloatVector& prediction, - double actual, - double weight) const { +double CBinomialLogisticLoss::value(const TMemoryMappedFloatVector& prediction, + double actual, + double weight) const { return -weight * ((1.0 - actual) * logOneMinusLogistic(prediction(0)) + actual * logLogistic(prediction(0))); } -void CBinomialLogistic::gradient(const TMemoryMappedFloatVector& prediction, - double actual, - TWriter writer, - double weight) const { +void CBinomialLogisticLoss::gradient(const TMemoryMappedFloatVector& prediction, + double actual, + TWriter writer, + double weight) const { if (prediction(0) > -LOG_EPSILON && actual == 1.0) { writer(0, -weight * std::exp(-prediction(0))); } else { @@ -313,10 +541,10 @@ void CBinomialLogistic::gradient(const TMemoryMappedFloatVector& prediction, } } -void CBinomialLogistic::curvature(const TMemoryMappedFloatVector& prediction, - double /*actual*/, - TWriter writer, - double weight) const { +void CBinomialLogisticLoss::curvature(const TMemoryMappedFloatVector& prediction, + double /*actual*/, + TWriter writer, + double weight) const { if (prediction(0) > -LOG_EPSILON) { writer(0, weight * std::exp(-prediction(0))); } else { @@ -325,26 +553,133 @@ void CBinomialLogistic::curvature(const TMemoryMappedFloatVector& prediction, } } -bool CBinomialLogistic::isCurvatureConstant() const { +bool CBinomialLogisticLoss::isCurvatureConstant() const { return false; } -CBinomialLogistic::TDoubleVector -CBinomialLogistic::transform(const TMemoryMappedFloatVector& prediction) const { +CBinomialLogisticLoss::TDoubleVector +CBinomialLogisticLoss::transform(const TMemoryMappedFloatVector& prediction) const { TDoubleVector result{prediction}; result(0) = CTools::logisticFunction(result(0)); return result; } -CArgMinLoss CBinomialLogistic::minimizer(double lambda) const { - return this->makeMinimizer(CArgMinLogisticImpl{lambda}); +CArgMinLoss CBinomialLogisticLoss::minimizer(double lambda, + const CPRNG::CXorOShiro128Plus&) const { + return this->makeMinimizer(CArgMinBinomialLogisticLossImpl{lambda}); +} + +const std::string& CBinomialLogisticLoss::name() const { + return NAME; +} + +const std::string CBinomialLogisticLoss::NAME{"binomial_logistic"}; + +CMultinomialLogisticLoss::CMultinomialLogisticLoss(std::size_t numberClasses) + : m_NumberClasses{numberClasses} { +} + +std::unique_ptr CMultinomialLogisticLoss::clone() const { + return std::make_unique(m_NumberClasses); +} + +std::size_t CMultinomialLogisticLoss::numberParameters() const { + return m_NumberClasses; +} + +double CMultinomialLogisticLoss::value(const TMemoryMappedFloatVector& predictions, + double actual, + double weight) const { + double zmax{predictions.maxCoeff()}; + double logZ{0.0}; + for (int i = 0; i < predictions.size(); ++i) { + logZ += std::exp(predictions(i) - zmax); + } + logZ = zmax + CTools::fastLog(logZ); + + // i.e. -log(z(actual)) + return weight * (logZ - predictions(static_cast(actual))); +} + +void CMultinomialLogisticLoss::gradient(const TMemoryMappedFloatVector& predictions, + double actual, + TWriter writer, + double weight) const { + + // We prefer an implementation which avoids any memory allocations. 
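+    //
+    // The normalising constant is computed with the log-sum-exp trick, i.e.
+    //   log(sum_j{exp(p_j)}) = max_k{p_k} + log(sum_j{exp(p_j - max_k{p_k})})
+    // which avoids overflowing the exponentials, and the gradient of the loss
+    // w.r.t. the i'th logit is then softmax(p)_i - 1{i == actual}.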
+ + double zmax{predictions.maxCoeff()}; + double logZ{0.0}; + for (int i = 0; i < predictions.size(); ++i) { + logZ += std::exp(predictions(i) - zmax); + } + logZ = zmax + CTools::fastLog(logZ); + + for (int i = 0; i < predictions.size(); ++i) { + if (i == static_cast(actual)) { + if (predictions(i) - logZ > -LOG_EPSILON) { + writer(i, -weight * std::exp(-(predictions(i) - logZ))); + } else { + writer(i, weight * (std::exp(predictions(i) - logZ) - 1.0)); + } + } else { + writer(i, weight * std::exp(predictions(i) - logZ)); + } + } +} + +void CMultinomialLogisticLoss::curvature(const TMemoryMappedFloatVector& predictions, + double /*actual*/, + TWriter writer, + double weight) const { + + // Return the lower triangle of the Hessian column major. + + // We prefer an implementation which avoids any memory allocations. + + double zmax{predictions.maxCoeff()}; + double logZ{0.0}; + for (int i = 0; i < predictions.size(); ++i) { + logZ += std::exp(predictions(i) - zmax); + } + logZ = zmax + CTools::fastLog(logZ); + + for (std::size_t i = 0, k = 0; i < m_NumberClasses; ++i) { + if (predictions(i) - logZ > -LOG_EPSILON) { + writer(i, weight * std::exp(-(predictions(i) - logZ))); + } else { + double probability{std::exp(predictions(i) - logZ)}; + writer(i, weight * weight * probability * (1.0 - probability)); + } + for (std::size_t j = i + 1; j < m_NumberClasses; ++j, ++k) { + double probabilities[]{std::exp(predictions(i) - logZ), + std::exp(predictions(j) - logZ)}; + writer(k, -weight * probabilities[0] * probabilities[1]); + } + } +} + +bool CMultinomialLogisticLoss::isCurvatureConstant() const { + return false; +} + +CMultinomialLogisticLoss::TDoubleVector +CMultinomialLogisticLoss::transform(const TMemoryMappedFloatVector& prediction) const { + TDoubleVector result{prediction}; + return CTools::softmax(std::move(result)); +} + +CArgMinLoss CMultinomialLogisticLoss::minimizer(double lambda, + const CPRNG::CXorOShiro128Plus& rng) const { + return this->makeMinimizer( + CArgMinMultinomialLogisticLossImpl{m_NumberClasses, lambda, rng}); } -const std::string& CBinomialLogistic::name() const { +const std::string& CMultinomialLogisticLoss::name() const { return NAME; } -const std::string CBinomialLogistic::NAME{"binomial_logistic"}; +const std::string CMultinomialLogisticLoss::NAME{"multinomial_logistic"}; } } } diff --git a/lib/maths/unittest/CBoostedTreeLossTest.cc b/lib/maths/unittest/CBoostedTreeLossTest.cc new file mode 100644 index 0000000000..c1e03da96c --- /dev/null +++ b/lib/maths/unittest/CBoostedTreeLossTest.cc @@ -0,0 +1,678 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +#include "core/CContainerPrinter.h" +#include "maths/CLbfgs.h" +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include + +BOOST_AUTO_TEST_SUITE(CBoostedTreeLossTest) + +using namespace ml; +using TDoubleVec = std::vector; +using TDoubleVecVec = std::vector; +using TDoubleVector = maths::boosted_tree::CLoss::TDoubleVector; +using TDoubleVectorVec = std::vector; +using TMemoryMappedFloatVector = maths::boosted_tree::CLoss::TMemoryMappedFloatVector; +using maths::boosted_tree::CBinomialLogisticLoss; +using maths::boosted_tree::CMultinomialLogisticLoss; +using maths::boosted_tree_detail::CArgMinBinomialLogisticLossImpl; +using maths::boosted_tree_detail::CArgMinMultinomialLogisticLossImpl; + +namespace { +void minimizeGridSearch(std::function objective, + double scale, + int d, + TDoubleVector& x, + double& min, + TDoubleVector& argmin) { + if (d == x.size()) { + double value{objective(x)}; + if (value < min) { + min = value; + argmin = x; + } + return; + } + for (double xd : {-0.5, -0.4, -0.3, -0.2, -0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5}) { + x(d) = scale * xd; + minimizeGridSearch(objective, scale, d + 1, x, min, argmin); + } +} +} + +BOOST_AUTO_TEST_CASE(testBinomialLogisticMinimizerEdgeCases) { + + // All predictions equal and zero. + { + CArgMinBinomialLogisticLossImpl argmin{0.0}; + maths::CFloatStorage storage[]{0.0}; + TMemoryMappedFloatVector prediction{storage, 1}; + argmin.add(prediction, 0.0); + argmin.add(prediction, 1.0); + argmin.add(prediction, 1.0); + argmin.add(prediction, 0.0); + BOOST_REQUIRE_EQUAL(false, argmin.nextPass()); + BOOST_REQUIRE_EQUAL(0.0, argmin.value()[0]); + } + + // All predictions are equal and 0.5. + { + test::CRandomNumbers rng; + + TDoubleVec labels; + rng.generateUniformSamples(0.0, 1.0, 1000, labels); + for (auto& label : labels) { + label = std::floor(label + 0.3); + } + + CArgMinBinomialLogisticLossImpl argmin{0.0}; + std::size_t numberPasses{0}; + std::size_t counts[]{0, 0}; + + do { + ++numberPasses; + for (std::size_t i = 0; i < labels.size(); ++i) { + maths::CFloatStorage storage[]{0.5}; + TMemoryMappedFloatVector prediction{storage, 1}; + argmin.add(prediction, labels[i]); + ++counts[static_cast(labels[i])]; + } + } while (argmin.nextPass()); + + double p{static_cast(counts[1]) / 1000.0}; + double expected{std::log(p / (1.0 - p)) - 0.5}; + double actual{argmin.value()[0]}; + + BOOST_REQUIRE_EQUAL(std::size_t{1}, numberPasses); + BOOST_REQUIRE_CLOSE_ABSOLUTE(expected, actual, 0.01 * std::fabs(expected)); + } + + // Test underflow of probabilities. + { + CArgMinBinomialLogisticLossImpl argmin{0.0}; + + TDoubleVec predictions{-500.0, -30.0, -15.0, -400.0}; + TDoubleVec labels{1.0, 1.0, 0.0, 1.0}; + do { + for (std::size_t i = 0; i < predictions.size(); ++i) { + maths::CFloatStorage storage[]{predictions[i]}; + TMemoryMappedFloatVector prediction{storage, 1}; + argmin.add(prediction, labels[i]); + } + } while (argmin.nextPass()); + + double minimizer{argmin.value()[0]}; + + // Check we're at the minimum. 
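+        // We do this by comparing the total loss at the returned weight with
+        // the total loss at offsets of +/-10 from it and checking that neither
+        // perturbation decreases the loss.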
+ CBinomialLogisticLoss loss; + TDoubleVec losses; + for (double eps : {-10.0, 0.0, 10.0}) { + double lossAtEps{0.0}; + for (std::size_t i = 0; i < predictions.size(); ++i) { + maths::CFloatStorage storage[]{predictions[i] + minimizer + eps}; + TMemoryMappedFloatVector probe{storage, 1}; + lossAtEps += loss.value(probe, labels[i]); + } + losses.push_back(lossAtEps); + } + BOOST_TEST_REQUIRE(losses[0] >= losses[1]); + BOOST_TEST_REQUIRE(losses[2] >= losses[1]); + } +} + +BOOST_AUTO_TEST_CASE(testBinomialLogisticMinimizerRandom) { + + // Test that we a good approximation of the additive term for the log-odds + // which minimises the exact cross entropy objective. + + test::CRandomNumbers rng; + + TDoubleVec labels; + TDoubleVec predictions; + + for (auto lambda : {0.0, 10.0}) { + + LOG_DEBUG(<< "lambda = " << lambda); + + // The exact objective. + auto exactObjective = [&](double weight) { + double loss{0.0}; + for (std::size_t i = 0; i < labels.size(); ++i) { + double p{maths::CTools::logisticFunction(predictions[i] + weight)}; + loss -= (1.0 - labels[i]) * maths::CTools::fastLog(1.0 - p) + + labels[i] * maths::CTools::fastLog(p); + } + return loss + lambda * maths::CTools::pow2(weight); + }; + + // This loop is fuzzing the predicted log-odds and testing we get consistently + // good estimates of the true minimizer. + for (std::size_t t = 0; t < 10; ++t) { + + double min{std::numeric_limits::max()}; + double max{-min}; + + rng.generateUniformSamples(0.0, 1.0, 1000, labels); + for (auto& label : labels) { + label = std::floor(label + 0.5); + } + predictions.clear(); + for (const auto& label : labels) { + TDoubleVec weight; + rng.generateNormalSamples(label, 2.0, 1, weight); + predictions.push_back(weight[0]); + min = std::min(min, weight[0]); + max = std::max(max, weight[0]); + } + + double expected; + double objectiveAtExpected; + std::size_t maxIterations{20}; + maths::CSolvers::minimize(-max, -min, exactObjective(-max), + exactObjective(-min), exactObjective, 1e-3, + maxIterations, expected, objectiveAtExpected); + LOG_DEBUG(<< "expected = " << expected + << " objective(expected) = " << objectiveAtExpected); + + CArgMinBinomialLogisticLossImpl argmin{lambda}; + CArgMinBinomialLogisticLossImpl argminPartition[2]{{lambda}, {lambda}}; + auto nextPass = [&] { + bool done{argmin.nextPass() == false}; + done &= (argminPartition[0].nextPass() == false); + done &= (argminPartition[1].nextPass() == false); + return done == false; + }; + + do { + for (std::size_t i = 0; i < labels.size() / 2; ++i) { + maths::CFloatStorage storage[]{predictions[i]}; + TMemoryMappedFloatVector prediction{storage, 1}; + argmin.add(prediction, labels[i]); + argminPartition[0].add(prediction, labels[i]); + } + for (std::size_t i = labels.size() / 2; i < labels.size(); ++i) { + maths::CFloatStorage storage[]{predictions[i]}; + TMemoryMappedFloatVector prediction{storage, 1}; + argmin.add(prediction, labels[i]); + argminPartition[1].add(prediction, labels[i]); + } + argminPartition[0].merge(argminPartition[1]); + argminPartition[1] = argminPartition[0]; + } while (nextPass()); + + double actual{argmin.value()(0)}; + double actualPartition{argminPartition[0].value()(0)}; + double objectiveAtActual{exactObjective(actual)}; + LOG_DEBUG(<< "actual = " << actual << " objective(actual) = " << objectiveAtActual); + + // We should be within 1% for the value and 0.001% for the objective + // at the value. 
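+            // The partitioned minimizers see the same data split in two and
+            // are merged after every pass so they should produce exactly the
+            // same weight as the single minimizer.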
+ BOOST_REQUIRE_EQUAL(actual, actualPartition); + BOOST_REQUIRE_CLOSE_ABSOLUTE(expected, actual, 0.01 * std::fabs(expected)); + BOOST_REQUIRE_CLOSE_ABSOLUTE(objectiveAtExpected, objectiveAtActual, + 1e-5 * objectiveAtExpected); + } + } +} + +BOOST_AUTO_TEST_CASE(testBinomialLogisticLossForUnderflow) { + + // Test the behaviour of value, gradient and curvature of the logistic loss in + // the vicinity the point at which we switch to using Taylor expansion of the + // logistic function is as expected. + + double eps{100.0 * std::numeric_limits::epsilon()}; + + CBinomialLogisticLoss loss; + + // Losses should be very nearly linear function of log-odds when they're large. + { + maths::CFloatStorage storage[]{1.0 - std::log(eps), 1.0 + std::log(eps)}; + TMemoryMappedFloatVector predictions[]{{&storage[0], 1}, {&storage[1], 1}}; + TDoubleVec previousLoss{loss.value(predictions[0], 0.0), + loss.value(predictions[1], 1.0)}; + LOG_DEBUG(<< core::CContainerPrinter::print(previousLoss)); + for (double scale : {0.75, 0.5, 0.25, 0.0, -0.25, -0.5, -0.75, -1.0}) { + storage[0] = scale - std::log(eps); + storage[1] = scale + std::log(eps); + TDoubleVec currentLoss{loss.value(predictions[0], 0.0), + loss.value(predictions[1], 1.0)}; + BOOST_REQUIRE_CLOSE_ABSOLUTE(0.25, previousLoss[0] - currentLoss[0], 0.005); + BOOST_REQUIRE_CLOSE_ABSOLUTE(-0.25, previousLoss[1] - currentLoss[1], 0.005); + previousLoss = currentLoss; + } + } + + // The gradient and curvature should be proportional to the exponential of the + // log-odds when they're small. + { + auto readDerivatives = [&](double prediction, TDoubleVec& gradients, + TDoubleVec& curvatures) { + maths::CFloatStorage storage[]{prediction + std::log(eps), + prediction - std::log(eps)}; + TMemoryMappedFloatVector predictions[]{{&storage[0], 1}, {&storage[1], 1}}; + loss.gradient(predictions[0], 0.0, [&](std::size_t, double value) { + gradients[0] = value; + }); + loss.gradient(predictions[1], 1.0, [&](std::size_t, double value) { + gradients[1] = value; + }); + loss.curvature(predictions[0], 0.0, [&](std::size_t, double value) { + curvatures[0] = value; + }); + loss.curvature(predictions[1], 1.0, [&](std::size_t, double value) { + curvatures[1] = value; + }); + }; + + TDoubleVec previousGradient(2); + TDoubleVec previousCurvature(2); + readDerivatives(1.0, previousGradient, previousCurvature); + + for (double scale : {0.75, 0.5, 0.25, 0.0, -0.25, -0.5, -0.75, -1.0}) { + TDoubleVec currentGradient(2); + TDoubleVec currentCurvature(2); + readDerivatives(scale, currentGradient, currentCurvature); + BOOST_REQUIRE_CLOSE_ABSOLUTE( + std::exp(0.25), previousGradient[0] / currentGradient[0], 0.01); + BOOST_REQUIRE_CLOSE_ABSOLUTE( + std::exp(-0.25), previousGradient[1] / currentGradient[1], 0.01); + BOOST_REQUIRE_CLOSE_ABSOLUTE( + std::exp(0.25), previousCurvature[0] / currentCurvature[0], 0.01); + BOOST_REQUIRE_CLOSE_ABSOLUTE( + std::exp(-0.25), previousCurvature[1] / currentCurvature[1], 0.01); + previousGradient = currentGradient; + previousCurvature = currentCurvature; + } + } +} + +BOOST_AUTO_TEST_CASE(testMultinomialLogisticGradient) { + + // Test that the gradient function is close to the numerical derivative + // of the objective. 
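+    //
+    // We compare the analytic gradient with a central finite difference
+    // estimate of the objective, i.e. component-wise
+    //   df/dw_i ~ (f(w + eps e_i) - f(w - eps e_i)) / (2 eps)
+    // at a collection of random probe points.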
+ + maths::CPRNG::CXorOShiro128Plus rng; + test::CRandomNumbers testRng; + + for (std::size_t t = 0; t < 10; ++t) { + + CArgMinMultinomialLogisticLossImpl argmin{3, 0.1 * static_cast(t + 1), rng}; + + TDoubleVec labels; + testRng.generateUniformSamples(0.0, 3.0, 20, labels); + + TDoubleVec predictions; + if (t % 2 == 0) { + predictions.resize(3 * labels.size(), 0.0); + } else { + testRng.generateUniformSamples(-1.0, 1.0, 3 * labels.size(), predictions); + } + + do { + for (std::size_t i = 0; i < labels.size(); ++i) { + maths::CFloatStorage storage[]{predictions[3 * i + 0], + predictions[3 * i + 1], + predictions[3 * i + 2]}; + TMemoryMappedFloatVector prediction{storage, 3}; + argmin.add(prediction, std::floor(labels[i])); + } + } while (argmin.nextPass()); + + auto objective = argmin.objective(); + auto objectiveGradient = argmin.objectiveGradient(); + + double eps{1e-3}; + TDoubleVec probes; + testRng.generateUniformSamples(-1.0, 1.0, 30, probes); + for (std::size_t i = 0; i < probes.size(); i += 3) { + TDoubleVector probe{3}; + probe(0) = probes[i]; + probe(1) = probes[i + 1]; + probe(2) = probes[i + 2]; + + TDoubleVector expectedGradient{3}; + for (std::size_t j = 0; j < 3; ++j) { + TDoubleVector shift{TDoubleVector::Zero(3)}; + shift(j) = eps; + expectedGradient(j) = + (objective(probe + shift) - objective(probe - shift)) / (2.0 * eps); + } + TDoubleVector actualGradient{objectiveGradient(probe)}; + + BOOST_REQUIRE_SMALL((expectedGradient - actualGradient).norm(), eps); + } + } +} + +BOOST_AUTO_TEST_CASE(testMultinomialLogisticMinimizerEdgeCases) { + + maths::CPRNG::CXorOShiro128Plus rng; + test::CRandomNumbers testRng; + + // All predictions equal and zero. + { + CArgMinMultinomialLogisticLossImpl argmin{3, 0.0, rng}; + + maths::CFloatStorage storage[]{0.0, 0.0, 0.0}; + TMemoryMappedFloatVector prediction{storage, 3}; + argmin.add(prediction, 0.0); + argmin.add(prediction, 1.0); + argmin.add(prediction, 1.0); + argmin.add(prediction, 0.0); + argmin.add(prediction, 2.0); + argmin.add(prediction, 1.0); + BOOST_REQUIRE_EQUAL(false, argmin.nextPass()); + + TDoubleVector expectedProbabilities{3}; + expectedProbabilities(0) = 2.0 / 6.0; + expectedProbabilities(1) = 3.0 / 6.0; + expectedProbabilities(2) = 1.0 / 6.0; + TDoubleVector actualProbabilities{maths::CTools::softmax(argmin.value())}; + + BOOST_REQUIRE_SMALL((actualProbabilities - expectedProbabilities).norm(), 1e-3); + } + + // All predictions are equal and 0.5. 
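+    //
+    // With identical predictions and no regularisation the minimizer should
+    // shift the logits so that softmax(prediction + weight) matches the
+    // empirical class frequencies.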
+ for (std::size_t t = 0; t < 10; ++t) { + + TDoubleVec labels; + testRng.generateUniformSamples(0.0, 2.0, 20, labels); + for (auto& label : labels) { + label = std::floor(label + 0.3); + } + + CArgMinMultinomialLogisticLossImpl argmin{3, 0.0, rng}; + + std::size_t numberPasses{0}; + std::size_t counts[]{0, 0, 0}; + maths::CFloatStorage storage[]{0.5, 0.5, 0.5}; + TMemoryMappedFloatVector prediction{storage, 3}; + + do { + ++numberPasses; + for (const auto& label : labels) { + argmin.add(prediction, label); + ++counts[static_cast(label)]; + } + } while (argmin.nextPass()); + + BOOST_REQUIRE_EQUAL(std::size_t{1}, numberPasses); + + TDoubleVector expectedProbabilities{3}; + expectedProbabilities(0) = static_cast(counts[0]) / 20.0; + expectedProbabilities(1) = static_cast(counts[1]) / 20.0; + expectedProbabilities(2) = static_cast(counts[2]) / 20.0; + TDoubleVector actualLogit{prediction + argmin.value()}; + TDoubleVector actualProbabilities{maths::CTools::softmax(actualLogit)}; + + BOOST_REQUIRE_SMALL((actualProbabilities - expectedProbabilities).norm(), 0.001); + } + + // Test underflow of probabilities. + LOG_DEBUG(<< "Test underflow"); + { + CArgMinMultinomialLogisticLossImpl argmin{3, 0.0, rng}; + + TDoubleVecVec predictions{{-230.0, -200.0, -200.0}, + {-30.0, -10.0, -20.0}, + {-15.0, -50.0, -30.0}, + {-400.0, -350.0, -300.0}}; + TDoubleVec labels{1.0, 1.0, 0.0, 2.0}; + do { + for (std::size_t i = 0; i < predictions.size(); ++i) { + maths::CFloatStorage storage[]{predictions[i][0], predictions[i][1], + predictions[i][2]}; + TMemoryMappedFloatVector prediction{storage, 3}; + argmin.add(prediction, labels[i]); + } + } while (argmin.nextPass()); + + TDoubleVector minimizer{argmin.value()}; + + // Check we're at a minimum. + CMultinomialLogisticLoss loss{3}; + for (std::size_t i = 0; i < 3; ++i) { + TDoubleVec losses; + for (double eps : {-30.0, 0.0, 30.0}) { + double lossAtEps{0.0}; + for (std::size_t j = 0; j < predictions.size(); ++j) { + maths::CFloatStorage storage[]{predictions[j][0] + minimizer(0), + predictions[j][1] + minimizer(1), + predictions[j][2] + minimizer(2)}; + storage[i] += eps; + TMemoryMappedFloatVector probe{storage, 3}; + lossAtEps += loss.value(probe, labels[j]); + } + losses.push_back(lossAtEps); + } + LOG_DEBUG(<< core::CContainerPrinter::print(losses)); + BOOST_TEST_REQUIRE(losses[0] >= losses[1]); + BOOST_TEST_REQUIRE(losses[2] >= losses[1]); + } + } + + // All labels equal. + { + CArgMinMultinomialLogisticLossImpl argmin{3, 0.0, rng}; + + maths::CFloatStorage storage[]{0.0, 0.0, 0.0}; + TMemoryMappedFloatVector prediction{storage, 3}; + TDoubleVec labels{1.0, 1.0, 1.0, 1.0}; + + do { + for (const auto& label : labels) { + argmin.add(prediction, label); + } + } while (argmin.nextPass()); + + TDoubleVector minimizer{argmin.value()}; + + double totalLoss{0.0}; + CMultinomialLogisticLoss loss{3}; + for (const auto& label : labels) { + maths::CFloatStorage probeStorage[]{prediction(0) + minimizer(0), + prediction(1) + minimizer(1), + prediction(2) + minimizer(2)}; + TMemoryMappedFloatVector probe{probeStorage, 3}; + totalLoss += loss.value(probe, label); + } + + BOOST_REQUIRE_SMALL(totalLoss, 1e-3); + } +} + +BOOST_AUTO_TEST_CASE(testMultinomialLogisticMinimizerRandom) { + + // Test that we have a good approximation of the additive term for the log-odds + // which minimises the exact cross entropy objective. 
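+    //
+    // We use a brute force grid search over candidate weights as the baseline
+    // and require that the objective at the weight we find is no more than 1%
+    // larger than the smallest value found by the grid search.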
+ + maths::CPRNG::CXorOShiro128Plus rng; + test::CRandomNumbers testRng; + + TDoubleVec labels; + TDoubleVectorVec predictions; + + double scales[]{6.0, 1.0}; + double lambdas[]{0.0, 10.0}; + + for (auto i : {0, 1}) { + + double lambda{lambdas[i]}; + + LOG_DEBUG(<< "lambda = " << lambda); + + // The exact objective. + auto exactObjective = [&](const TDoubleVector& weight) { + double loss{0.0}; + for (std::size_t j = 0; j < labels.size(); ++j) { + TDoubleVector probabilities{predictions[j] + weight}; + probabilities = maths::CTools::softmax(std::move(probabilities)); + loss -= maths::CTools::fastLog(probabilities(static_cast(labels[j]))); + } + return loss + lambda * weight.squaredNorm(); + }; + + // This loop is fuzzing the predicted log-odds and testing we get consistently + // good estimates of the true minimizer. + + double sumObjectiveGridSearch{0.0}; + double sumObjectiveAtActual{0.0}; + + for (std::size_t t = 0; t < 10; ++t) { + + testRng.generateUniformSamples(0.0, 2.0, 500, labels); + for (auto& label : labels) { + label = std::floor(label + 0.5); + } + + predictions.clear(); + for (const auto& label : labels) { + TDoubleVec prediction; + testRng.generateNormalSamples(0.0, 2.0, 3, prediction); + prediction[static_cast(label)] += 1.0; + predictions.push_back(TDoubleVector::fromStdVector(prediction)); + } + + TDoubleVector weight{3}; + double objectiveGridSearch{std::numeric_limits::max()}; + TDoubleVector argminGridSearch; + minimizeGridSearch(exactObjective, scales[i], 0, weight, + objectiveGridSearch, argminGridSearch); + LOG_DEBUG(<< "argmin grid search = " << argminGridSearch.transpose() + << ", min objective grid search = " << objectiveGridSearch); + + std::size_t numberPasses{0}; + maths::CFloatStorage storage[]{0.0, 0.0, 0.0}; + TMemoryMappedFloatVector prediction{storage, 3}; + + CArgMinMultinomialLogisticLossImpl argmin{3, lambda, rng}; + + do { + ++numberPasses; + for (std::size_t j = 0; j < labels.size(); ++j) { + storage[0] = predictions[j](0); + storage[1] = predictions[j](1); + storage[2] = predictions[j](2); + argmin.add(prediction, labels[j]); + } + } while (argmin.nextPass()); + + BOOST_REQUIRE_EQUAL(std::size_t{2}, numberPasses); + + TDoubleVector actual{argmin.value()}; + double objectiveAtActual{exactObjective(actual)}; + LOG_DEBUG(<< "actual = " << actual.transpose() + << ", objective(actual) = " << objectiveAtActual); + + BOOST_TEST_REQUIRE(objectiveAtActual < 1.01 * objectiveGridSearch); + + sumObjectiveGridSearch += objectiveGridSearch; + sumObjectiveAtActual += objectiveAtActual; + } + + LOG_DEBUG(<< "sum min objective grid search = " << sumObjectiveGridSearch); + LOG_DEBUG(<< "sum objective(actual) = " << sumObjectiveAtActual); + BOOST_TEST_REQUIRE(sumObjectiveAtActual < sumObjectiveGridSearch); + } +} + +BOOST_AUTO_TEST_CASE(testMultinomialLogisticLossForUnderflow) { + + // Test the behaviour of value, gradient and Hessian of the logistic loss in + // the regime where the probabilities underflow. + + using TFloatVec = std::vector; + + double eps{std::numeric_limits::epsilon()}; + + auto logits = [](double x, TFloatVec& result) { result.assign({0.0, x}); }; + + CMultinomialLogisticLoss loss{2}; + + // Losses should be very nearly linear function of log-odds when they're large. 
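+    //
+    // In this regime the loss is approximately the gap between the largest
+    // logit and the actual class's logit, so stepping the scale by 0.25 should
+    // change each loss by very nearly 0.25.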
+ { + TFloatVec storage[2]; + logits(1.0 - std::log(eps), storage[0]); + logits(1.0 + std::log(eps), storage[1]); + + TMemoryMappedFloatVector predictions[]{{&storage[0][0], 2}, {&storage[1][0], 2}}; + TDoubleVec previousLoss{loss.value(predictions[0], 0.0), + loss.value(predictions[1], 1.0)}; + + for (double scale : {0.75, 0.5, 0.25, 0.0, -0.25, -0.5, -0.75, -1.0}) { + logits(scale - std::log(eps), storage[0]); + logits(scale + std::log(eps), storage[1]); + TDoubleVec currentLoss{loss.value(predictions[0], 0.0), + loss.value(predictions[1], 1.0)}; + BOOST_REQUIRE_CLOSE_ABSOLUTE(0.25, previousLoss[0] - currentLoss[0], 0.005); + BOOST_REQUIRE_CLOSE_ABSOLUTE(-0.25, previousLoss[1] - currentLoss[1], 0.005); + previousLoss = currentLoss; + } + } + + // The gradient and curvature should be proportional to the exponential of the + // log-odds when they're small. + /* TODO { + auto readDerivatives = [&](double prediction, TDoubleVecVec& gradients, + TDoubleVecVec& curvatures) { + TFloatVec storage[2]; + logits(prediction + std::log(eps), storage[0]); + logits(prediction - std::log(eps), storage[1]); + TMemoryMappedFloatVector predictions[]{{&storage[0][0], 2}, + {&storage[1][0], 2}}; + loss.gradient(predictions[0], 0.0, [&](std::size_t i, double value) { + gradients[0][i] = value; + }); + loss.gradient(predictions[1], 1.0, [&](std::size_t i, double value) { + gradients[1][i] = value; + }); + loss.curvature(predictions[0], 0.0, [&](std::size_t i, double value) { + curvatures[0][i] = value; + }); + loss.curvature(predictions[1], 1.0, [&](std::size_t i, double value) { + curvatures[1][i] = value; + }); + }; + + TDoubleVecVec previousGradient(2, TDoubleVec(2)); + TDoubleVecVec previousCurvature(2, TDoubleVec(3)); + readDerivatives(1.0, previousGradient, previousCurvature); + + for (double scale : {0.75, 0.5, 0.25, 0.0, -0.25, -0.5, -0.75, -1.0}) { + TDoubleVecVec currentGradient(2, TDoubleVec(2)); + TDoubleVecVec currentCurvature(2, TDoubleVec(3)); + readDerivatives(scale, currentGradient, currentCurvature); + BOOST_REQUIRE_CLOSE_ABSOLUTE( + std::exp(0.25), previousGradient[0][1] / currentGradient[0][1], 0.01); + BOOST_REQUIRE_CLOSE_ABSOLUTE( + std::exp(-0.25), previousGradient[1][1] / currentGradient[1][1], 0.01); + BOOST_REQUIRE_CLOSE_ABSOLUTE( + std::exp(0.25), previousCurvature[0][2] / currentCurvature[0][1], 0.01); + BOOST_REQUIRE_CLOSE_ABSOLUTE( + std::exp(-0.25), previousCurvature[1][2] / currentCurvature[1][1], 0.01); + previousGradient = currentGradient; + previousCurvature = currentCurvature; + } + }*/ +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/lib/maths/unittest/CBoostedTreeTest.cc b/lib/maths/unittest/CBoostedTreeTest.cc index 2b10bf0cb7..84d7e88832 100644 --- a/lib/maths/unittest/CBoostedTreeTest.cc +++ b/lib/maths/unittest/CBoostedTreeTest.cc @@ -13,7 +13,6 @@ #include #include #include -#include #include #include @@ -23,11 +22,13 @@ #include #include +#include #include #include #include #include #include +#include BOOST_AUTO_TEST_SUITE(CBoostedTreeTest) @@ -43,7 +44,6 @@ using TRowRef = core::CDataFrame::TRowRef; using TRowItr = core::CDataFrame::TRowItr; using TMeanAccumulator = maths::CBasicStatistics::SSampleMean::TAccumulator; using TMeanVarAccumulator = maths::CBasicStatistics::SSampleMeanVar::TAccumulator; -using TMemoryMappedFloatVector = maths::boosted_tree::CLoss::TMemoryMappedFloatVector; namespace { @@ -885,259 +885,6 @@ BOOST_AUTO_TEST_CASE(testDepthBasedRegularization) { } } -BOOST_AUTO_TEST_CASE(testLogisticMinimizerEdgeCases) { - - using 
maths::boosted_tree_detail::CArgMinLogisticImpl; - - // All predictions equal and zero. - { - CArgMinLogisticImpl argmin{0.0}; - maths::CFloatStorage storage[]{0.0}; - TMemoryMappedFloatVector prediction{storage, 1}; - argmin.add(prediction, 0.0); - argmin.add(prediction, 1.0); - argmin.add(prediction, 1.0); - argmin.add(prediction, 0.0); - argmin.nextPass(); - BOOST_REQUIRE_EQUAL(0.0, argmin.value()[0]); - } - - // All predictions are equal. - { - test::CRandomNumbers rng; - - TDoubleVec labels; - TDoubleVec weights; - rng.generateUniformSamples(0.0, 1.0, 1000, labels); - for (auto& label : labels) { - label = std::floor(label + 0.3); - } - weights.resize(labels.size(), 0.0); - - CArgMinLogisticImpl argmin{0.0}; - std::size_t numberPasses{0}; - std::size_t counts[2]{0, 0}; - - do { - ++numberPasses; - for (std::size_t i = 0; i < labels.size(); ++i) { - maths::CFloatStorage storage[]{weights[i]}; - TMemoryMappedFloatVector prediction{storage, 1}; - argmin.add(prediction, labels[i]); - ++counts[static_cast(labels[i])]; - } - } while (argmin.nextPass()); - - double p{static_cast(counts[1]) / 1000.0}; - double expected{std::log(p / (1.0 - p))}; - double actual{argmin.value()[0]}; - - BOOST_REQUIRE_EQUAL(std::size_t{1}, numberPasses); - BOOST_REQUIRE_CLOSE_ABSOLUTE(expected, actual, 0.01 * std::fabs(expected)); - } - - // Test underflow of probabilities. - { - CArgMinLogisticImpl argmin{0.0}; - - TDoubleVec predictions{-500.0, -30.0, -15.0, -400.0}; - TDoubleVec actuals{1.0, 1.0, 0.0, 1.0}; - do { - for (std::size_t i = 0; i < predictions.size(); ++i) { - maths::CFloatStorage storage[]{predictions[i]}; - TMemoryMappedFloatVector prediction{storage, 1}; - argmin.add(prediction, actuals[i]); - } - } while (argmin.nextPass()); - - double minimizer{argmin.value()[0]}; - - // Check we're at the minimum. - maths::boosted_tree::CBinomialLogistic loss; - TDoubleVec losses; - for (double eps : {-10.0, 0.0, 10.0}) { - double lossAtEps{0.0}; - for (std::size_t i = 0; i < predictions.size(); ++i) { - maths::CFloatStorage storage[]{predictions[i] + minimizer + eps}; - TMemoryMappedFloatVector probe{storage, 1}; - lossAtEps += loss.value(probe, actuals[i]); - } - losses.push_back(lossAtEps); - } - BOOST_TEST_REQUIRE(losses[0] >= losses[1]); - BOOST_TEST_REQUIRE(losses[2] >= losses[1]); - } -} - -BOOST_AUTO_TEST_CASE(testLogisticMinimizerRandom) { - - // Test that we a good approximation of the additive term for the log-odds - // which minimises the cross entropy objective. - - using maths::boosted_tree_detail::CArgMinLogisticImpl; - - test::CRandomNumbers rng; - - TDoubleVec labels; - TDoubleVec weights; - - for (auto lambda : {0.0, 10.0}) { - - LOG_DEBUG(<< "lambda = " << lambda); - - // The true objective. - auto objective = [&](double weight) { - double loss{0.0}; - for (std::size_t i = 0; i < labels.size(); ++i) { - double p{maths::CTools::logisticFunction(weights[i] + weight)}; - loss -= (1.0 - labels[i]) * maths::CTools::fastLog(1.0 - p) + - labels[i] * maths::CTools::fastLog(p); - } - return loss + lambda * maths::CTools::pow2(weight); - }; - - // This loop is fuzzing the predicted log-odds and testing we get consistently - // good estimates of the true minimizer. 
- for (std::size_t t = 0; t < 10; ++t) { - - double min{std::numeric_limits::max()}; - double max{-min}; - - rng.generateUniformSamples(0.0, 1.0, 1000, labels); - for (auto& label : labels) { - label = std::floor(label + 0.5); - } - weights.clear(); - for (const auto& label : labels) { - TDoubleVec weight; - rng.generateNormalSamples(label, 2.0, 1, weight); - weights.push_back(weight[0]); - min = std::min(min, weight[0]); - max = std::max(max, weight[0]); - } - - double expected; - double objectiveAtExpected; - std::size_t maxIterations{20}; - maths::CSolvers::minimize(-max, -min, objective(-max), objective(-min), - objective, 1e-3, maxIterations, expected, - objectiveAtExpected); - LOG_DEBUG(<< "expected = " << expected - << " objective at expected = " << objectiveAtExpected); - - CArgMinLogisticImpl argmin{lambda}; - CArgMinLogisticImpl argminPartition[2]{{lambda}, {lambda}}; - auto nextPass = [&] { - bool done{argmin.nextPass() == false}; - done &= (argminPartition[0].nextPass() == false); - done &= (argminPartition[1].nextPass() == false); - return done == false; - }; - - do { - for (std::size_t i = 0; i < labels.size() / 2; ++i) { - maths::CFloatStorage storage[]{weights[i]}; - TMemoryMappedFloatVector prediction{storage, 1}; - argmin.add(prediction, labels[i]); - argminPartition[0].add(prediction, labels[i]); - } - for (std::size_t i = labels.size() / 2; i < labels.size(); ++i) { - maths::CFloatStorage storage[]{weights[i]}; - TMemoryMappedFloatVector prediction{storage, 1}; - argmin.add(prediction, labels[i]); - argminPartition[1].add(prediction, labels[i]); - } - argminPartition[0].merge(argminPartition[1]); - argminPartition[1] = argminPartition[0]; - } while (nextPass()); - - double actual{argmin.value()(0)}; - double actualPartition{argminPartition[0].value()(0)}; - LOG_DEBUG(<< "actual = " << actual - << " objective at actual = " << objective(actual)); - - // We should be within 1% for the value and 0.001% for the objective - // at the value. - BOOST_REQUIRE_EQUAL(actual, actualPartition); - BOOST_REQUIRE_CLOSE_ABSOLUTE(expected, actual, 0.01 * std::fabs(expected)); - BOOST_REQUIRE_CLOSE_ABSOLUTE(objectiveAtExpected, objective(actual), - 1e-5 * objectiveAtExpected); - } - } -} - -BOOST_AUTO_TEST_CASE(testLogisticLossForUnderflow) { - - // Test the behaviour of value, gradient and curvature of the logistic loss in - // the vicinity the point at which we switch to using Taylor expansion of the - // logistic function is as expected. - - double eps{100.0 * std::numeric_limits::epsilon()}; - - maths::boosted_tree::CBinomialLogistic loss; - - // Losses should be very nearly linear function of log-odds when they're large. - { - maths::CFloatStorage predictions[]{1.0 - std::log(eps), 1.0 + std::log(eps)}; - TMemoryMappedFloatVector prediction0{&predictions[0], 1}; - TMemoryMappedFloatVector prediction1{&predictions[1], 1}; - TDoubleVec lastLoss{loss.value(prediction0, 0.0), loss.value(prediction1, 1.0)}; - for (double scale : {0.75, 0.5, 0.25, 0.0, -0.25, -0.5, -0.75, -1.0}) { - predictions[0] = scale - std::log(eps); - predictions[1] = scale + std::log(eps); - TDoubleVec currentLoss{loss.value(prediction0, 0.0), - loss.value(prediction1, 1.0)}; - BOOST_REQUIRE_CLOSE_ABSOLUTE(0.25, lastLoss[0] - currentLoss[0], 0.005); - BOOST_REQUIRE_CLOSE_ABSOLUTE(-0.25, lastLoss[1] - currentLoss[1], 0.005); - lastLoss = currentLoss; - } - } - - // The gradient and curvature should be proportional to the exponential of the - // log-odds when they're small. 
- { - auto readDerivatives = [&](double prediction, TDoubleVec& gradients, - TDoubleVec& curvatures) { - maths::CFloatStorage predictions[]{prediction + std::log(eps), - prediction - std::log(eps)}; - TMemoryMappedFloatVector prediction0{&predictions[0], 1}; - TMemoryMappedFloatVector prediction1{&predictions[1], 1}; - loss.gradient(prediction0, 0.0, [&](std::size_t, double value) { - gradients[0] = value; - }); - loss.gradient(prediction1, 1.0, [&](std::size_t, double value) { - gradients[1] = value; - }); - loss.curvature(prediction0, 0.0, [&](std::size_t, double value) { - curvatures[0] = value; - }); - loss.curvature(prediction1, 1.0, [&](std::size_t, double value) { - curvatures[1] = value; - }); - }; - - TDoubleVec lastGradient(2); - TDoubleVec lastCurvature(2); - readDerivatives(1.0, lastGradient, lastCurvature); - - for (double scale : {0.75, 0.5, 0.25, 0.0, -0.25, -0.5, -0.75, -1.0}) { - TDoubleVec currentGradient(2); - TDoubleVec currentCurvature(2); - readDerivatives(scale, currentGradient, currentCurvature); - BOOST_REQUIRE_CLOSE_ABSOLUTE(std::exp(0.25), - lastGradient[0] / currentGradient[0], 0.01); - BOOST_REQUIRE_CLOSE_ABSOLUTE(std::exp(-0.25), - lastGradient[1] / currentGradient[1], 0.01); - BOOST_REQUIRE_CLOSE_ABSOLUTE( - std::exp(0.25), lastCurvature[0] / currentCurvature[0], 0.01); - BOOST_REQUIRE_CLOSE_ABSOLUTE( - std::exp(-0.25), lastCurvature[1] / currentCurvature[1], 0.01); - lastGradient = currentGradient; - lastCurvature = currentCurvature; - } - } -} - BOOST_AUTO_TEST_CASE(testLogisticRegression) { // The idea of this test is to create a random linear relationship between @@ -1191,9 +938,10 @@ BOOST_AUTO_TEST_CASE(testLogisticRegression) { fillDataFrame(trainRows, rows - trainRows, cols, {false, false, false, true}, x, TDoubleVec(rows, 0.0), target, *frame); - auto regression = maths::CBoostedTreeFactory::constructFromParameters( - 1, std::make_unique()) - .buildFor(*frame, cols - 1); + auto regression = + maths::CBoostedTreeFactory::constructFromParameters( + 1, std::make_unique()) + .buildFor(*frame, cols - 1); regression->train(); regression->predict(); @@ -1263,7 +1011,7 @@ BOOST_AUTO_TEST_CASE(testImbalancedClasses) { frame->finishWritingRows(); auto regression = maths::CBoostedTreeFactory::constructFromParameters( - 1, std::make_unique()) + 1, std::make_unique()) .buildFor(*frame, cols - 1); regression->train(); diff --git a/lib/maths/unittest/CDataFrameUtilsTest.cc b/lib/maths/unittest/CDataFrameUtilsTest.cc index 2394334220..cb89e921bb 100644 --- a/lib/maths/unittest/CDataFrameUtilsTest.cc +++ b/lib/maths/unittest/CDataFrameUtilsTest.cc @@ -975,7 +975,7 @@ BOOST_AUTO_TEST_CASE(testMeanValueOfTargetForCategoriesWithMissing) { } } rng.generateUniformSamples(0.0, 1.0, 1, u01); - if (u01[i] < 0.9) { + if (u01[0] < 0.9) { for (std::size_t j = 0; j + 1 < cols; ++j) { if (maths::CDataFrameUtils::isMissing(values[j][i]) == false) { values[cols - 1][i] += values[j][i]; diff --git a/lib/maths/unittest/CKMeansOnlineTest.cc b/lib/maths/unittest/CKMeansOnlineTest.cc index 56d7179235..0fd144e31e 100644 --- a/lib/maths/unittest/CKMeansOnlineTest.cc +++ b/lib/maths/unittest/CKMeansOnlineTest.cc @@ -4,6 +4,7 @@ * you may not use this file except in compliance with the Elastic License. 
*/ +#include "core/CContainerPrinter.h" #include #include #include @@ -13,6 +14,7 @@ #include #include #include +#include #include #include @@ -32,6 +34,7 @@ using TMeanVarAccumulator = maths::CBasicStatistics::SSampleMeanVar::TAc using TVector2 = maths::CVectorNx1; using TVector2Vec = std::vector; using TVector2VecVec = std::vector; +using TFloatVector2 = maths::CVectorNx1; using TMean2Accumulator = maths::CBasicStatistics::SSampleMean::TAccumulator; using TMeanVar2Accumulator = maths::CBasicStatistics::SSampleMeanVar::TAccumulator; using TVector5 = maths::CVectorNx1; @@ -39,22 +42,43 @@ using TVector5Vec = std::vector; using TMeanVar5Accumulator = maths::CBasicStatistics::SSampleMeanVar::TAccumulator; template -class CKMeansOnlineTestForTest : public maths::CKMeansOnline { +class CKMeansOnlineForTest : public maths::CKMeansOnline { public: using TSphericalClusterVec = typename maths::CKMeansOnline::TSphericalClusterVec; - using TDoubleMeanVarAccumulator = typename maths::CKMeansOnline::TDoubleMeanVarAccumulator; - using TFloatMeanAccumulatorDoublePr = - typename maths::CKMeansOnline::TFloatMeanAccumulatorDoublePr; + using TFloatCoordinate = typename maths::CKMeansOnline::TFloatCoordinate; + using TFloatPoint = typename maths::CKMeansOnline::TFloatPoint; + using TFloatPointMeanAccumulatorDoublePr = + typename maths::CKMeansOnline::TFloatPointMeanAccumulatorDoublePr; + using TFloatPointMeanAccumulatorDoublePrVec = + typename maths::CKMeansOnline::TFloatPointMeanAccumulatorDoublePrVec; + using TDoublePointMeanVarAccumulator = + typename maths::CKMeansOnline::TDoublePointMeanVarAccumulator; + using TDoublePoint = typename maths::CKMeansOnline::TDoublePoint; public: - CKMeansOnlineTestForTest(std::size_t k, double decayRate = 0.0) + CKMeansOnlineForTest(std::size_t k, double decayRate = 0.0) : maths::CKMeansOnline(k, decayRate) {} - static void add(const POINT& x, double count, TFloatMeanAccumulatorDoublePr& cluster) { - maths::CKMeansOnline::add(x, count, cluster); + static void deduplicate(TFloatPointMeanAccumulatorDoublePrVec& clusters) { + maths::CKMeansOnline::deduplicate(clusters); } - static double variance(const TDoubleMeanVarAccumulator& moments) { + static void add(const POINT& mx, double count, TFloatPointMeanAccumulatorDoublePr& cluster) { + double nx{count}; + TDoublePoint vx{maths::las::zero(mx)}; + double nc{maths::CBasicStatistics::count(cluster.first)}; + TDoublePoint mc{maths::CBasicStatistics::mean(cluster.first)}; + TDoublePoint vc{cluster.second * maths::las::ones(mx)}; + TDoublePointMeanVarAccumulator moments{ + maths::CBasicStatistics::momentsAccumulator(nc, mc, vc) + + maths::CBasicStatistics::momentsAccumulator(nx, mx, vx)}; + TFloatCoordinate ncx{maths::CBasicStatistics::count(moments)}; + TFloatPoint mcx{maths::CBasicStatistics::mean(moments)}; + cluster.first = maths::CBasicStatistics::momentsAccumulator(ncx, mcx); + cluster.second = variance(moments); + } + + static double variance(const TDoublePointMeanVarAccumulator& moments) { return maths::CKMeansOnline::variance(moments); } }; @@ -93,13 +117,13 @@ BOOST_AUTO_TEST_CASE(testVariance) { expected.add(coordinates[i] - maths::CBasicStatistics::mean(actual)(i % 5)); } - LOG_DEBUG(<< "actual = " << CKMeansOnlineTestForTest::variance(actual)); + LOG_DEBUG(<< "actual = " << CKMeansOnlineForTest::variance(actual)); LOG_DEBUG(<< "expected = " << maths::CBasicStatistics::maximumLikelihoodVariance(expected)); BOOST_REQUIRE_CLOSE_ABSOLUTE( maths::CBasicStatistics::maximumLikelihoodVariance(expected), - 
CKMeansOnlineTestForTest::variance(actual), + CKMeansOnlineForTest::variance(actual), 1e-10 * maths::CBasicStatistics::maximumLikelihoodVariance(expected)); } } @@ -128,7 +152,7 @@ BOOST_AUTO_TEST_CASE(testAdd) { TMean2AccumulatorDoublePr actual; TMeanVar2Accumulator expected; for (std::size_t i = 0u; i < points.size(); ++i) { - CKMeansOnlineTestForTest::add(points[i], counts[i], actual); + CKMeansOnlineForTest::add(points[i], counts[i], actual); expected.add(points[i], counts[i]); } @@ -152,6 +176,93 @@ BOOST_AUTO_TEST_CASE(testAdd) { } } +BOOST_AUTO_TEST_CASE(testDeduplicate) { + // Test we behaviour: + // - If all points are duplicates + // - If no points are duplicates + // - For random permutation of duplicates + + CKMeansOnlineForTest::TFloatPointMeanAccumulatorDoublePrVec points; + + points.emplace_back(maths::CBasicStatistics::momentsAccumulator( + maths::CFloatStorage{1.0}, TFloatVector2{0.0}), + 0.0); + points.emplace_back(maths::CBasicStatistics::momentsAccumulator( + maths::CFloatStorage{2.0}, TFloatVector2{0.0}), + 0.0); + points.emplace_back(maths::CBasicStatistics::momentsAccumulator( + maths::CFloatStorage{1.0}, TFloatVector2{0.0}), + 0.0); + CKMeansOnlineForTest::deduplicate(points); + BOOST_REQUIRE_EQUAL(1, points.size()); + BOOST_REQUIRE_EQUAL(TFloatVector2{0.0}, + maths::CBasicStatistics::mean(points[0].first)); + BOOST_REQUIRE_EQUAL(4.0, maths::CBasicStatistics::count(points[0].first)); + BOOST_REQUIRE_EQUAL(0.0, points[0].second); + points.clear(); + + points.emplace_back(maths::CBasicStatistics::momentsAccumulator( + maths::CFloatStorage{1.0}, TFloatVector2{0.0}), + 0.0); + points.emplace_back(maths::CBasicStatistics::momentsAccumulator( + maths::CFloatStorage{2.0}, TFloatVector2{1.0}), + 0.0); + points.emplace_back(maths::CBasicStatistics::momentsAccumulator( + maths::CFloatStorage{1.0}, TFloatVector2{2.0}), + 0.0); + CKMeansOnlineForTest::deduplicate(points); + BOOST_REQUIRE_EQUAL(3, points.size()); + BOOST_REQUIRE_EQUAL(TFloatVector2{0.0}, + maths::CBasicStatistics::mean(points[0].first)); + BOOST_REQUIRE_EQUAL(1.0, maths::CBasicStatistics::count(points[0].first)); + BOOST_REQUIRE_EQUAL(0.0, points[0].second); + BOOST_REQUIRE_EQUAL(TFloatVector2{1.0}, + maths::CBasicStatistics::mean(points[1].first)); + BOOST_REQUIRE_EQUAL(2.0, maths::CBasicStatistics::count(points[1].first)); + BOOST_REQUIRE_EQUAL(0.0, points[1].second); + BOOST_REQUIRE_EQUAL(TFloatVector2{2.0}, + maths::CBasicStatistics::mean(points[2].first)); + BOOST_REQUIRE_EQUAL(1.0, maths::CBasicStatistics::count(points[2].first)); + BOOST_REQUIRE_EQUAL(0.0, points[2].second); + points.clear(); + + test::CRandomNumbers rng; + for (std::size_t t = 1; t <= 100; ++t) { + for (std::size_t i = 0; i < 5; ++i) { + points.emplace_back(maths::CBasicStatistics::momentsAccumulator( + maths::CFloatStorage{static_cast(i)}, + TFloatVector2{0.0}), + 0.0); + } + for (std::size_t i = 0; i < 7; ++i) { + points.emplace_back(maths::CBasicStatistics::momentsAccumulator( + maths::CFloatStorage{1.0}, TFloatVector2{1.0}), + 0.0); + } + for (std::size_t i = 0; i < 3; ++i) { + points.emplace_back(maths::CBasicStatistics::momentsAccumulator( + maths::CFloatStorage{2.0}, TFloatVector2{2.0}), + 0.0); + } + rng.random_shuffle(points.begin(), points.end()); + + CKMeansOnlineForTest::deduplicate(points); + + BOOST_REQUIRE_EQUAL(3, points.size()); + BOOST_REQUIRE_EQUAL(TFloatVector2{0.0}, + maths::CBasicStatistics::mean(points[0].first)); + BOOST_REQUIRE_EQUAL(10.0, maths::CBasicStatistics::count(points[0].first)); + 
BOOST_REQUIRE_EQUAL(TFloatVector2{1.0}, + maths::CBasicStatistics::mean(points[1].first)); + BOOST_REQUIRE_EQUAL(7.0, maths::CBasicStatistics::count(points[1].first)); + BOOST_REQUIRE_EQUAL(TFloatVector2{2.0}, + maths::CBasicStatistics::mean(points[2].first)); + BOOST_REQUIRE_EQUAL(6.0, maths::CBasicStatistics::count(points[2].first)); + + points.clear(); + } +} + BOOST_AUTO_TEST_CASE(testReduce) { // Test some invariants: // - Number of clusters should be no more than k after @@ -186,8 +297,8 @@ BOOST_AUTO_TEST_CASE(testReduce) { kmeans.add(points[i], counts[i]); expected.add(points[i], counts[i]); - if ((i + 1) % 7 == 0) { - CKMeansOnlineTestForTest::TSphericalClusterVec clusters; + if (((i - 10) + 1) % 7 == 0) { + CKMeansOnlineForTest::TSphericalClusterVec clusters; kmeans.clusters(clusters); BOOST_TEST_REQUIRE(clusters.size() <= 10); @@ -362,7 +473,7 @@ BOOST_AUTO_TEST_CASE(testSplit) { TVector2Vec points; for (std::size_t i = 0u; i < 2; ++i) { TDoubleVec coordinates; - rng.generateNormalSamples(m[i], v[i], 350, coordinates); + rng.generateNormalSamples(m[i], v[i], 352, coordinates); for (std::size_t j = 0u; j < coordinates.size(); j += 2) { double c[]{coordinates[j + 0], coordinates[j + 1]}; points.push_back(TVector2(c)); @@ -370,10 +481,18 @@ BOOST_AUTO_TEST_CASE(testSplit) { } maths::CKMeansOnline kmeansOnline(30); - for (std::size_t i = 0u; i < points.size(); ++i) { + const std::size_t BUFFERING{0}; + const std::size_t NOT_BUFFERING{1}; + std::size_t counts[2]{0, 0}; + for (std::size_t i = 0; i < 30; ++i) { + kmeansOnline.add(points[i]); + } + for (std::size_t i = 30; i < points.size(); ++i) { kmeansOnline.add(points[i]); + ++counts[kmeansOnline.buffering() ? BUFFERING : NOT_BUFFERING]; } - BOOST_TEST_REQUIRE(!kmeansOnline.buffering()); + BOOST_REQUIRE_EQUAL(counts[BUFFERING], maths::CKMeansOnline::BUFFER_SIZE * + counts[NOT_BUFFERING]); std::size_t one[]{0, 2, 7, 18, 19, 22}; std::size_t two[]{3, 4, 5, 6, 10, 11, 23, 24}; @@ -443,7 +562,7 @@ BOOST_AUTO_TEST_CASE(testMerge) { TMeanVar2Accumulator expected; for (std::size_t i = 0u; i < 2; ++i) { - CKMeansOnlineTestForTest::TSphericalClusterVec clusters; + CKMeansOnlineForTest::TSphericalClusterVec clusters; kmeans[i].clusters(clusters); for (std::size_t j = 0u; j < clusters.size(); ++j) { expected.add(clusters[j]); @@ -453,7 +572,7 @@ BOOST_AUTO_TEST_CASE(testMerge) { kmeans[0].merge(kmeans[1]); TMeanVar2Accumulator actual; - CKMeansOnlineTestForTest::TSphericalClusterVec clusters; + CKMeansOnlineForTest::TSphericalClusterVec clusters; kmeans[0].clusters(clusters); for (std::size_t j = 0u; j < clusters.size(); ++j) { actual.add(clusters[j]); @@ -498,7 +617,7 @@ BOOST_AUTO_TEST_CASE(testPropagateForwardsByTime) { kmeans.add(points[i]); } - CKMeansOnlineTestForTest::TSphericalClusterVec clusters; + CKMeansOnlineForTest::TSphericalClusterVec clusters; kmeans.clusters(clusters); LOG_DEBUG(<< "clusters before = " << core::CContainerPrinter::print(clusters)); diff --git a/lib/maths/unittest/CMultivariateMultimodalPriorTest.cc b/lib/maths/unittest/CMultivariateMultimodalPriorTest.cc index 66aa7c8415..92f405cc9e 100644 --- a/lib/maths/unittest/CMultivariateMultimodalPriorTest.cc +++ b/lib/maths/unittest/CMultivariateMultimodalPriorTest.cc @@ -511,7 +511,7 @@ BOOST_AUTO_TEST_CASE(testSplitAndMerge) { LOG_DEBUG(<< "mean meanError = " << maths::CBasicStatistics::mean(meanMeanError)); LOG_DEBUG(<< "mean covError = " << maths::CBasicStatistics::mean(meanCovError)); BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(meanMeanError) < 0.013); - 
BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(meanCovError) < 0.030); + BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(meanCovError) < 0.044); } } diff --git a/lib/maths/unittest/CXMeansOnlineTest.cc b/lib/maths/unittest/CXMeansOnlineTest.cc index e879883a75..bd7da22a5d 100644 --- a/lib/maths/unittest/CXMeansOnlineTest.cc +++ b/lib/maths/unittest/CXMeansOnlineTest.cc @@ -312,8 +312,8 @@ BOOST_AUTO_TEST_CASE(testClusteringVanilla) { } LOG_DEBUG(<< "mean error = " << meanError[0]); LOG_DEBUG(<< "covariance error = " << covError[0]); - BOOST_TEST_REQUIRE(meanError[0] < 0.034); - BOOST_TEST_REQUIRE(covError[0] < 0.39); + BOOST_TEST_REQUIRE(meanError[0] < 0.045); + BOOST_TEST_REQUIRE(covError[0] < 0.36); meanMeanError.add(meanError[0]); meanCovError.add(covError[0]); } @@ -653,6 +653,9 @@ BOOST_AUTO_TEST_CASE(testLargeHistory) { } BOOST_AUTO_TEST_CASE(testLatLongData) { + // Check that the log likelihood of the data in the lat_long.csv + // is significantly increased by clustering. + using TTimeDoubleVecPr = std::pair; using TTimeDoubleVecPrVec = std::vector; using TMeanAccumulator = maths::CBasicStatistics::SSampleMean::TAccumulator; @@ -714,8 +717,8 @@ BOOST_AUTO_TEST_CASE(testLatLongData) { LOG_DEBUG(<< "gaussian log(L) = " << maths::CBasicStatistics::mean(LLR)); LOG_DEBUG(<< "clustered log(L) = " << maths::CBasicStatistics::mean(LLC)); - BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(LLC) < - 0.6 * maths::CBasicStatistics::mean(LLR)); + BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(LLC) > + 0.4 * maths::CBasicStatistics::mean(LLR)); } BOOST_AUTO_TEST_CASE(testPersist) { diff --git a/lib/maths/unittest/Makefile b/lib/maths/unittest/Makefile index a6dfab62a6..e985f1fc6c 100644 --- a/lib/maths/unittest/Makefile +++ b/lib/maths/unittest/Makefile @@ -25,6 +25,7 @@ SRCS=\ CBayesianOptimisationTest.cc \ CBjkstUniqueValuesTest.cc \ CBoostedTreeLeafNodeStatisticsTest.cc \ + CBoostedTreeLossTest.cc \ CBoostedTreeTest.cc \ CBootstrapClustererTest.cc \ CBoundingBoxTest.cc \