[ML] Improve adaptation of the modelling of cyclic components to very localised features #134

1 change: 1 addition & 0 deletions docs/CHANGELOG.asciidoc
@@ -37,6 +37,7 @@ Explicit change point detection and modelling ({pull}92[#92])
Improve partition analysis memory usage ({pull}97[#97])
Reduce model memory by storing state for periodicity testing in a compressed format ({pull}100[#100])
Improve the accuracy of model memory control ({pull}122[#122])
Improve adaptation of the modelling of cyclic components to very localised features ({pull}134[#134])

Forecasting of Machine Learning job time series is now supported for large jobs by temporarily storing
model state on disk ({pull}89[#89])
198 changes: 128 additions & 70 deletions include/maths/CAdaptiveBucketing.h
@@ -61,10 +61,9 @@ namespace maths {
//!
//! For sufficiently smooth functions and a given number of buckets
//! the objective is minimized by ensuring that "bucket width" x
//! "function range" is approximately equal in all buckets.
//! "function range" is equal in all buckets.
//!
//! The bucketing is aged by relaxing it back towards uniform and
//! aging the counts of the mean value for each bucket as usual.
//! The bucketing is aged by relaxing it back towards uniform.
class MATHS_EXPORT CAdaptiveBucketing {
public:
using TDoubleVec = std::vector<double>;
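The class comment above says that, for smooth functions, the averaging error is minimised when "bucket width" x "function range" is equalised across buckets. A self-contained sketch of that quantity, using hypothetical names and a brute-force scan, purely to make the refinement objective concrete (this is not how the class computes it):

```cpp
#include <algorithm>
#include <cstddef>
#include <limits>
#include <vector>

// For a candidate set of bucket end points and paired samples (times[i],
// values[i]), compute "bucket width" x "function range" per bucket. refine()
// moves the end points so these products become roughly equal. Assumes
// endpoints is sorted and non-empty.
std::vector<double> widthTimesRange(const std::vector<double>& endpoints,
                                    const std::vector<double>& times,
                                    const std::vector<double>& values) {
    std::vector<double> result(endpoints.size() - 1, 0.0);
    for (std::size_t bucket = 0; bucket + 1 < endpoints.size(); ++bucket) {
        double min{std::numeric_limits<double>::max()};
        double max{std::numeric_limits<double>::lowest()};
        for (std::size_t i = 0; i < times.size(); ++i) {
            if (times[i] >= endpoints[bucket] && times[i] < endpoints[bucket + 1]) {
                min = std::min(min, values[i]);
                max = std::max(max, values[i]);
            }
        }
        if (max >= min) {
            result[bucket] = (endpoints[bucket + 1] - endpoints[bucket]) * (max - min);
        }
    }
    return result;
}
```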
@@ -73,26 +72,92 @@ class MATHS_EXPORT CAdaptiveBucketing {
using TFloatMeanAccumulatorVec = std::vector<TFloatMeanAccumulator>;

public:
//! Restore by traversing a state document
bool acceptRestoreTraverser(core::CStateRestoreTraverser& traverser);
//! Refine the bucket end points to minimize the maximum averaging
//! error in any bucket.
//!
//! \param[in] time The time at which to refine.
void refine(core_t::TTime time);

//! Persist by passing information to the supplied inserter.
void acceptPersistInserter(core::CStatePersistInserter& inserter) const;
//! Check if the bucketing has been initialized.
bool initialized() const;

//! Get the number of buckets.
std::size_t size() const;

//! Set the rate at which the bucketing loses information.
void decayRate(double value);

//! Get the rate at which the bucketing loses information.
double decayRate() const;

//! Get the minimum permitted bucket length.
double minimumBucketLength() const;

//! Get the bucket end points.
const TFloatVec& endpoints() const;

//! Get the bucket value centres.
const TFloatVec& centres() const;

//! Get the count of large errors in each bucket.
const TFloatVec& largeErrorCounts() const;

//! Get a set of knot points and knot point values to use for
//! interpolating the bucket values.
//!
//! \param[in] time The time at which to get the knot points.
//! \param[in] boundary Controls the style of start and end knots.
//! \param[out] knots Filled in with the knot points to interpolate.
//! \param[out] values Filled in with the values at \p knots.
//! \param[out] variances Filled in with the variances at \p knots.
//! \return True if there are sufficient knot points to interpolate
//! and false otherwise.
bool knots(core_t::TTime time,
CSplineTypes::EBoundaryCondition boundary,
TDoubleVec& knots,
TDoubleVec& values,
TDoubleVec& variances) const;
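knots() returns a compact set of knot points and values which the caller then interpolates (the production code fits a spline via CSplineTypes). A self-contained sketch using plain linear interpolation, with hypothetical names, only to show the role of the knot and value arrays:

```cpp
#include <cstddef>
#include <vector>

// Linearly interpolate bucket values between knot points. Assumes knots is
// sorted, non-empty and the same length as values. Splines are used in
// practice; linear interpolation is shown for illustration only.
double interpolateAtKnots(const std::vector<double>& knots,
                          const std::vector<double>& values,
                          double x) {
    if (x <= knots.front()) {
        return values.front();
    }
    for (std::size_t i = 1; i < knots.size(); ++i) {
        if (x <= knots[i]) {
            double alpha{(x - knots[i - 1]) / (knots[i] - knots[i - 1])};
            return (1.0 - alpha) * values[i - 1] + alpha * values[i];
        }
    }
    return values.back();
}
```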

//! \name Test Functions
//@{
//! Get the total count of values in the bucketing.
double count() const;

//! Get the bucket regressions.
TDoubleVec values(core_t::TTime time) const;

//! Get the bucket variances.
TDoubleVec variances() const;
//@}

protected:
using TRestoreFunc = std::function<bool(core::CStateRestoreTraverser&)>;
using TPersistFunc = std::function<void(core::CStatePersistInserter&)>;

protected:
//! The minimum number of standard deviations for an error to be
//! considered large.
static const double LARGE_ERROR_STANDARD_DEVIATIONS;

protected:
CAdaptiveBucketing(double decayRate, double minimumBucketLength);
//! Construct by traversing a state document.
CAdaptiveBucketing(double decayRate,
double minimumBucketLength,
core::CStateRestoreTraverser& traverser);
virtual ~CAdaptiveBucketing() = default;

//! Get the restore function bound to this object.
TRestoreFunc getAcceptRestoreTraverser();

//! Get the accept persist function bound to this object.
TPersistFunc getAcceptPersistInserter() const;

//! Restore by traversing a state document
bool acceptRestoreTraverser(core::CStateRestoreTraverser& traverser);

//! Persist by passing information to the supplied inserter.
void acceptPersistInserter(core::CStatePersistInserter& inserter) const;

//! Efficiently swap the contents of two bucketing objects.
void swap(CAdaptiveBucketing& other);

//! Check if the bucketing has been initialized.
bool initialized() const;

//! Create a new uniform bucketing with \p n buckets on the
//! interval [\p a, \p b].
//!
@@ -113,75 +178,32 @@
core_t::TTime endTime,
const TFloatMeanAccumulatorVec& values);

//! Get the number of buckets.
std::size_t size() const;

//! Clear the contents of this bucketing and recover any
//! allocated memory.
void clear();

//! Add the function value at \p time.
//!
//! \param[in] bucket The index of the bucket of \p time.
//! \param[in] time The time of \p value.
//! \param[in] weight The weight of function point. The smaller
//! this is the less influence it has on the bucket.
//! \param[in] time The time of the value being added.
//! \param[in] weight The weight of the value being added. The
//! smaller this is the less influence it has on the bucket.
void add(std::size_t bucket, core_t::TTime time, double weight);

//! Set the rate at which the bucketing loses information.
void decayRate(double value);

//! Get the rate at which the bucketing loses information.
double decayRate() const;
//! Add a large error in \p bucket.
void addLargeError(std::size_t bucket, core_t::TTime time);

//! Age the force moments.
void age(double factor);
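The class comment states that the bucketing is aged by relaxing it back towards uniform, at a rate controlled by decayRate(). A hedged sketch of one such scheme, with hypothetical names; the exact updates performed by age() may differ:

```cpp
#include <cstddef>
#include <vector>

// Age a bucketing by a factor in (0, 1]: counts shrink so old data loses
// influence, and interior end points relax towards uniform spacing. This
// illustrates the idea only, not the exact formulas used here.
void ageSketch(std::vector<double>& endpoints, std::vector<double>& counts, double factor) {
    for (auto& count : counts) {
        count *= factor;
    }
    double a{endpoints.front()};
    double b{endpoints.back()};
    std::size_t n{endpoints.size() - 1};
    for (std::size_t i = 1; i < n; ++i) {
        double uniform{a + (b - a) * static_cast<double>(i) / static_cast<double>(n)};
        endpoints[i] = factor * endpoints[i] + (1.0 - factor) * uniform;
    }
}
```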

//! Get the minimum permitted bucket length.
double minimumBucketLength() const;

//! Refine the bucket end points to minimize the maximum averaging
//! error in any bucket.
//!
//! \param[in] time The time at which to refine.
void refine(core_t::TTime time);

//! Get a set of knot points and knot point values to use for
//! interpolating the bucket values.
//!
//! \param[in] time The time at which to get the knot points.
//! \param[in] boundary Controls the style of start and end knots.
//! \param[out] knots Filled in with the knot points to interpolate.
//! \param[out] values Filled in with the values at \p knots.
//! \param[out] variances Filled in with the variances at \p knots.
//! \return True if there are sufficient knot points to interpolate
//! and false otherwise.
bool knots(core_t::TTime time,
CSplineTypes::EBoundaryCondition boundary,
TDoubleVec& knots,
TDoubleVec& values,
TDoubleVec& variances) const;

//! Get the bucket end points.
const TFloatVec& endpoints() const;

//! Get the bucket end points.
TFloatVec& endpoints();

//! Get the bucket value centres.
const TFloatVec& centres() const;

//! Get the bucket value centres.
TFloatVec& centres();

//! Get the total count of values in the bucketing.
double count() const;

//! Get the bucket regressions.
TDoubleVec values(core_t::TTime time) const;
//! Get the count of large errors in each bucket.
TFloatVec& largeErrorCounts();

//! Get the bucket variances.
TDoubleVec variances() const;
//! Adjust \p weight for significant large error counts.
double adjustedWeight(std::size_t bucket, double weight) const;

//! Compute the index of the bucket to which \p time belongs
bool bucket(core_t::TTime time, std::size_t& result) const;
@@ -192,6 +214,10 @@
//! Get the memory used by this component
std::size_t memoryUsage() const;

private:
using TFloatUInt32Pr = std::pair<CFloatStorage, std::uint32_t>;
using TFloatUInt32PrMinAccumulator = CBasicStatistics::SMin<TFloatUInt32Pr, 2>::TAccumulator;

private:
//! Compute the values corresponding to the change in end
//! points from \p endpoints. The values are assigned based
@@ -208,15 +234,25 @@
//! Get the offset w.r.t. the start of the bucketing of \p time.
virtual double offset(core_t::TTime time) const = 0;

//! The count in \p bucket.
virtual double count(std::size_t bucket) const = 0;
//! Get the count in \p bucket.
virtual double bucketCount(std::size_t bucket) const = 0;

//! Get the predicted value for the \p bucket at \p time.
//! Get the predicted value for \p bucket at \p time.
virtual double predict(std::size_t bucket, core_t::TTime time, double offset) const = 0;

//! Get the variance of \p bucket.
virtual double variance(std::size_t bucket) const = 0;

//! Implement the split of \p bucket in the derived class's state.
virtual void split(std::size_t bucket) = 0;

//! Check if there is evidence of systematically large errors in a
//! bucket and split it if there is.
void maybeSplitBucket();

//! Split \p bucket.
void splitBucket(std::size_t bucket);
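Putting the new pieces together: an error more than LARGE_ERROR_STANDARD_DEVIATIONS standard deviations from the prediction is counted against its bucket (addLargeError), and a bucket whose count looks systematically high is split in two (maybeSplitBucket / splitBucket). A self-contained sketch of that flow; the simple ratio test below stands in for the p-value based significance the class actually tracks in m_LargeErrorCountSignificances, and the threshold value is assumed:

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// Assumed threshold; the real constant is LARGE_ERROR_STANDARD_DEVIATIONS.
constexpr double LARGE_ERROR_SDS{3.0};

// An error is "large" if it is more than LARGE_ERROR_SDS standard deviations
// from the prediction.
bool isLargeError(double error, double sigma) {
    return std::fabs(error) > LARGE_ERROR_SDS * sigma;
}

// Return the index of a bucket whose large error count looks systematically
// high, or counts.size() if none does. A ratio test is used purely for
// illustration; the real code computes significances of the counts.
std::size_t bucketToSplit(const std::vector<double>& largeErrorCounts, double significantRatio) {
    double total{0.0};
    for (double count : largeErrorCounts) {
        total += count;
    }
    double mean{total / static_cast<double>(largeErrorCounts.size())};
    for (std::size_t i = 0; i < largeErrorCounts.size(); ++i) {
        if (largeErrorCounts[i] > significantRatio * mean) {
            return i;
        }
    }
    return largeErrorCounts.size();
}
```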

private:
//! The rate at which information is aged out of the bucket values.
double m_DecayRate;
@@ -225,12 +261,34 @@
//! is ignored.
double m_MinimumBucketLength;

//! The desired number of buckets. We can use more if we determine
//! that we aren't capturing the periodic pattern effectively.
//!
//! \see maybeSplitBucket for details.
std::size_t m_TargetSize = 0;

//! The bucket of the last large error added.
std::size_t m_LastLargeErrorBucket = 0;

//! The period of the last large error added.
core_t::TTime m_LastLargeErrorPeriod = 0;

//! The p-values of the most significant large error counts.
TFloatUInt32PrMinAccumulator m_LargeErrorCountSignificances;

//! The mean weight of values added.
TFloatMeanAccumulator m_MeanWeight;

//! The bucket end points.
TFloatVec m_Endpoints;

//! The mean periodic time of each regression.
//! The mean offset (relative to the start of the bucket) of samples
//! in each bucket.
TFloatVec m_Centres;

//! The count of large errors in each bucket.
TFloatVec m_LargeErrorCounts;

//! An IIR low pass filter for the total desired end point displacement
//! in refine.
TFloatMeanAccumulator m_MeanDesiredDisplacement;
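For intuition on m_LargeErrorCountSignificances above: a natural way to measure whether a bucket's large error count is significant is the one-sided binomial tail probability of seeing at least that many large errors by chance. The sketch below is self-contained and illustrative only; the exact test used by the implementation may differ.

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>

// P(X >= k) for X ~ Binomial(n, p), computed via log-gamma for stability.
// Small values mean the observed number of large errors is unlikely to be
// chance, i.e. the bucket's errors are systematically large. Assumes 0 < p < 1.
double largeErrorCountPValue(std::size_t n, std::size_t k, double p) {
    double result{0.0};
    for (std::size_t j = k; j <= n; ++j) {
        double logTerm{std::lgamma(n + 1.0) - std::lgamma(j + 1.0) - std::lgamma(n - j + 1.0) +
                       static_cast<double>(j) * std::log(p) +
                       static_cast<double>(n - j) * std::log(1.0 - p)};
        result += std::exp(logTerm);
    }
    return std::min(result, 1.0);
}
```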
22 changes: 18 additions & 4 deletions include/maths/CBasicStatistics.h
@@ -1032,6 +1032,8 @@ class MATHS_EXPORT CBasicStatistics {
using const_iterator = typename CONTAINER::const_iterator;
using reverse_iterator = typename CONTAINER::reverse_iterator;
using const_reverse_iterator = typename CONTAINER::const_reverse_iterator;
using TToString = std::function<std::string(const T&)>;
using TFromString = std::function<bool(const std::string&, T&)>;

public:
COrderStatisticsImpl(const CONTAINER& statistics, const LESS& less)
@@ -1043,8 +1045,20 @@
//! Initialize from a delimited string.
bool fromDelimited(const std::string& value);

//! Initialize from a delimited string using \p fromString to initialize
//! values of type T from a string.
//!
//! \warning \p fromString must not use CBasicStatistics::INTERNAL_DELIMITER.
bool fromDelimited(const std::string& value, const TFromString& fromString);

//! Convert to a delimited string.
std::string toDelimited() const;

//! Convert to a delimited string using \p toString to convert individual
//! values of type T to a string.
//!
//! \warning \p toString must not use CBasicStatistics::INTERNAL_DELIMITER.
std::string toDelimited(const TToString& toString) const;
//@}
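These overloads exist so accumulators whose value type has no built-in string conversion (such as the pair type used for m_LargeErrorCountSignificances) can still be persisted. A hedged sketch of how they might be used, assuming the SMin accumulator inherits these members from COrderStatisticsImpl as the surrounding context suggests; the pair delimiter '/' is an assumption and only needs to differ from CBasicStatistics::INTERNAL_DELIMITER:

```cpp
#include <maths/CBasicStatistics.h>

#include <cstddef>
#include <cstdint>
#include <string>
#include <utility>

namespace {
// Delimiter used inside each serialised pair; assumed to differ from
// CBasicStatistics::INTERNAL_DELIMITER.
constexpr char PAIR_DELIMITER{'/'};

using TDoubleUInt32Pr = std::pair<double, std::uint32_t>;
using TMinAccumulator = ml::maths::CBasicStatistics::SMin<TDoubleUInt32Pr, 2>::TAccumulator;

std::string persist(const TMinAccumulator& accumulator) {
    return accumulator.toDelimited([](const TDoubleUInt32Pr& value) {
        return std::to_string(value.first) + PAIR_DELIMITER + std::to_string(value.second);
    });
}

bool restore(const std::string& state, TMinAccumulator& accumulator) {
    return accumulator.fromDelimited(state, [](const std::string& token, TDoubleUInt32Pr& value) {
        std::size_t pos{token.find(PAIR_DELIMITER)};
        if (pos == std::string::npos) {
            return false;
        }
        value.first = std::stod(token.substr(0, pos));
        value.second = static_cast<std::uint32_t>(std::stoul(token.substr(pos + 1)));
        return true;
    });
}
}
```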

//! \name Update
@@ -1367,15 +1381,15 @@ class MATHS_EXPORT CBasicStatistics {
//! \name Accumulator Typedefs
//@{
//! Accumulator object to compute the sample maximum.
template<typename T>
template<typename T, std::size_t N = 1>
struct SMax {
using TAccumulator = COrderStatisticsStack<T, 1, std::greater<T>>;
using TAccumulator = COrderStatisticsStack<T, N, std::greater<T>>;
};

//! Accumulator object to compute the sample minimum.
template<typename T>
template<typename T, std::size_t N = 1>
struct SMin {
using TAccumulator = COrderStatisticsStack<T, 1>;
using TAccumulator = COrderStatisticsStack<T, N>;
};
//@}
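The new N parameter on SMin/SMax lets callers keep the N smallest or largest values rather than just one, which is what CAdaptiveBucketing's TFloatUInt32PrMinAccumulator (SMin<TFloatUInt32Pr, 2>) relies on. A hedged usage sketch, assuming the order statistics accumulators expose add() and operator[] with the best element at index 0, as they are used elsewhere in the code base:

```cpp
#include <maths/CBasicStatistics.h>

#include <cstddef>
#include <utility>

void minTwoExample() {
    using TDoubleSizePr = std::pair<double, std::size_t>;
    // Keep the two lexicographically smallest (value, index) pairs seen so far.
    using TMin2Accumulator = ml::maths::CBasicStatistics::SMin<TDoubleSizePr, 2>::TAccumulator;

    TMin2Accumulator min2;
    min2.add(TDoubleSizePr{3.1, 0});
    min2.add(TDoubleSizePr{0.7, 1});
    min2.add(TDoubleSizePr{1.9, 2});
    // Expect min2[0] == {0.7, 1} and min2[1] == {1.9, 2}.
}
```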
