diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index 96251cdf84..190f86734d 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -37,6 +37,7 @@ Explicit change point detection and modelling ({pull}92[#92]) Improve partition analysis memory usage ({pull}97[#97]) Reduce model memory by storing state for periodicity testing in a compressed format ({pull}100[#100]) Improve the accuracy of model memory control ({pull}122[#122]) +Improve adaptation of the modelling of cyclic components to very localised features ({pull}134[#134]) Forecasting of Machine Learning job time series is now supported for large jobs by temporarily storing model state on disk ({pull}89[#89])
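The heart of this change is a statistical test, implemented in CAdaptiveBucketing below: if large prediction errors concentrate in one bucket far more than a uniform spread over the period would explain, that bucket is split. The following is a minimal, self-contained sketch of the test's shape, with illustrative names and a direct log-space summation standing in for the boost::math::binomial complement CDF the library uses (the library's counts are also damped doubles rather than integers).

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

// Right tail P(X >= k) for X ~ Binomial(n, p), by direct summation of the
// log-space pmf. Fine for a sketch; the library uses boost::math instead.
double binomialRightTail(std::size_t n, double p, std::size_t k) {
    double result{0.0};
    double logP{std::log(p)};
    double logQ{std::log1p(-p)};
    for (std::size_t x = k; x <= n; ++x) {
        double logPmf{std::lgamma(n + 1.0) - std::lgamma(x + 1.0) -
                      std::lgamma(n - x + 1.0) + x * logP + (n - x) * logQ};
        result += std::exp(logPmf);
    }
    return std::min(result, 1.0);
}

int main() {
    // 20 large errors spread over four buckets, each covering a quarter of
    // the period. Under the null hypothesis that errors are uniform on the
    // period, each bucket's count is Binomial(20, 0.25).
    std::vector<double> fractions{0.25, 0.25, 0.25, 0.25};
    std::vector<std::size_t> largeErrorCounts{2, 2, 13, 3};
    for (std::size_t i = 0; i < fractions.size(); ++i) {
        double pValue{binomialRightTail(20, fractions[i], largeErrorCounts[i])};
        // We keep the minimum of one p-value per bucket, so correct it as
        // 1 - (1 - p)^m; CTools::oneMinusPowOneMinusX computes this robustly.
        double significance{
            1.0 - std::pow(1.0 - pValue, static_cast<double>(fractions.size()))};
        std::cout << "bucket " << i << ": p = " << pValue
                  << ", significance = " << significance << '\n';
    }
    return 0;
}
```

A count of 13 out of 20 in one quarter of the period yields a corrected significance below the HIGH_SIGNIFICANCE threshold of 1e-3 that maybeSplitBucket applies later in this diff, so that bucket would be split.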
diff --git a/include/maths/CAdaptiveBucketing.h b/include/maths/CAdaptiveBucketing.h index 8a081e1f0a..be6450a64c 100644 --- a/include/maths/CAdaptiveBucketing.h +++ b/include/maths/CAdaptiveBucketing.h @@ -61,10 +61,9 @@ namespace maths { //! //! For sufficiently smooth functions and a given number of buckets //! the objective is minimized by ensuring that "bucket width" x -//! "function range" is approximately equal in all buckets. +//! "function range" is equal in all buckets. //! -//! The bucketing is aged by relaxing it back towards uniform and -//! aging the counts of the mean value for each bucket as usual. +//! The bucketing is aged by relaxing it back towards uniform. class MATHS_EXPORT CAdaptiveBucketing { public: using TDoubleVec = std::vector<double>; @@ -73,26 +72,92 @@ class MATHS_EXPORT CAdaptiveBucketing { using TFloatMeanAccumulatorVec = std::vector<TFloatMeanAccumulator>; public: - //! Restore by traversing a state document - bool acceptRestoreTraverser(core::CStateRestoreTraverser& traverser); + //! Refine the bucket end points to minimize the maximum averaging + //! error in any bucket. + //! + //! \param[in] time The time at which to refine. + void refine(core_t::TTime time); - //! Persist by passing information to the supplied inserter. - void acceptPersistInserter(core::CStatePersistInserter& inserter) const; + //! Check if the bucketing has been initialized. + bool initialized() const; + + //! Get the number of buckets. + std::size_t size() const; + + //! Set the rate at which the bucketing loses information. + void decayRate(double value); + + //! Get the rate at which the bucketing loses information. + double decayRate() const; + + //! Get the minimum permitted bucket length. + double minimumBucketLength() const; + + //! Get the bucket end points. + const TFloatVec& endpoints() const; + + //! Get the bucket value centres. + const TFloatVec& centres() const; + + //! Get the count of large errors in each bucket. + const TFloatVec& largeErrorCounts() const; + + //! Get a set of knot points and knot point values to use for + //! interpolating the bucket values. + //! + //! \param[in] time The time at which to get the knot points. + //! \param[in] boundary Controls the style of start and end knots. + //! \param[out] knots Filled in with the knot points to interpolate. + //! \param[out] values Filled in with the values at \p knots. + //! \param[out] variances Filled in with the variances at \p knots. + //! \return True if there are sufficient knot points to interpolate + //! and false otherwise. + bool knots(core_t::TTime time, + CSplineTypes::EBoundaryCondition boundary, + TDoubleVec& knots, + TDoubleVec& values, + TDoubleVec& variances) const; + + //! \name Test Functions + //@{ + //! Get the total count in the bucketing. + double count() const; + + //! Get the bucket values. + TDoubleVec values(core_t::TTime time) const; + + //! Get the bucket variances. + TDoubleVec variances() const; + //@} + +protected: + using TRestoreFunc = std::function<bool(core::CStateRestoreTraverser&)>; + using TPersistFunc = std::function<void(core::CStatePersistInserter&)>; + +protected: + //! The minimum number of standard deviations for an error to be + //! considered large. + static const double LARGE_ERROR_STANDARD_DEVIATIONS; protected: CAdaptiveBucketing(double decayRate, double minimumBucketLength); - //! Construct by traversing a state document. - CAdaptiveBucketing(double decayRate, - double minimumBucketLength, - core::CStateRestoreTraverser& traverser); virtual ~CAdaptiveBucketing() = default; + //! Get the restore function bound to this object. + TRestoreFunc getAcceptRestoreTraverser(); + + //! Get the accept persist function bound to this object. + TPersistFunc getAcceptPersistInserter() const; + + //! Restore by traversing a state document + bool acceptRestoreTraverser(core::CStateRestoreTraverser& traverser); + + //! Persist by passing information to the supplied inserter. + void acceptPersistInserter(core::CStatePersistInserter& inserter) const; + //! Efficiently swap the contents of two bucketing objects. void swap(CAdaptiveBucketing& other); - //! Check if the bucketing has been initialized. - bool initialized() const; - //! Create a new uniform bucketing with \p n buckets on the //! interval [\p a, \p b]. //! @@ -113,9 +178,6 @@ class MATHS_EXPORT CAdaptiveBucketing { core_t::TTime endTime, const TFloatMeanAccumulatorVec& values); - //! Get the number of buckets. - std::size_t size() const; - //! Clear the contents of this bucketing and recover any //! allocated memory. void clear(); @@ -123,65 +185,25 @@ class MATHS_EXPORT CAdaptiveBucketing { //! Add the function value at \p time. //! //! \param[in] bucket The index of the bucket of \p time. - //! \param[in] time The time of \p value. - //! \param[in] weight The weight of function point. The smaller - //! this is the less influence it has on the bucket. + //! \param[in] time The time of the value being added. + //! \param[in] weight The weight of the value being added. The + //! smaller this is the less influence it has on the bucket. void add(std::size_t bucket, core_t::TTime time, double weight); - //! Set the rate at which the bucketing loses information. - void decayRate(double value); - - //! Get the rate at which the bucketing loses information. - double decayRate() const; + //! Add a large error in \p bucket. + void addLargeError(std::size_t bucket, core_t::TTime time); //! Age the force moments. void age(double factor); - //! Get the minimum permitted bucket length. - double minimumBucketLength() const; - - //! Refine the bucket end points to minimize the maximum averaging - //! error in any bucket. - //! - //! \param[in] time The time at which to refine. - void refine(core_t::TTime time); - - //! Get a set of knot points and knot point values to use for - //! interpolating the bucket values. - //! - //! \param[in] time The time at which to get the knot points. - //! \param[in] boundary Controls the style of start and end knots. - //! \param[out] knots Filled in with the knot points to interpolate. - //! \param[out] values Filled in with the values at \p knots. - //! \param[out] variances Filled in with the variances at \p knots. - //! \return True if there are sufficient knot points to interpolate - //! and false otherwise. - bool knots(core_t::TTime time, - CSplineTypes::EBoundaryCondition boundary, - TDoubleVec& knots, - TDoubleVec& values, - TDoubleVec& variances) const; - - //! Get the bucket end points. - const TFloatVec& endpoints() const; - - //! Get the bucket end points. - TFloatVec& endpoints(); - - //! Get the bucket value centres. - const TFloatVec& centres() const; - //! Get the bucket value centres. TFloatVec& centres(); - //! Get the total count of in the bucketing. - double count() const; - - //! Get the bucket regressions. - TDoubleVec values(core_t::TTime time) const; + //! Get the count of large errors in each bucket. + TFloatVec& largeErrorCounts(); - //! Get the bucket variances. - TDoubleVec variances() const; + //! Adjust \p weight for significant large error counts. + double adjustedWeight(std::size_t bucket, double weight) const; //! Compute the index of the bucket to which \p time belongs bool bucket(core_t::TTime time, std::size_t& result) const; @@ -192,6 +214,10 @@ class MATHS_EXPORT CAdaptiveBucketing { //! Get the memory used by this component std::size_t memoryUsage() const; +private: + using TFloatUInt32Pr = std::pair<CFloatStorage, uint32_t>; + using TFloatUInt32PrMinAccumulator = CBasicStatistics::SMin<TFloatUInt32Pr, 2>::TAccumulator; + private: //! Compute the values corresponding to the change in end //! points from \p endpoints. The values are assigned based @@ -208,15 +234,25 @@ class MATHS_EXPORT CAdaptiveBucketing { //! Get the offset w.r.t. the start of the bucketing of \p time. virtual double offset(core_t::TTime time) const = 0; - //! The count in \p bucket. - virtual double count(std::size_t bucket) const = 0; + //! Get the count in \p bucket. + virtual double bucketCount(std::size_t bucket) const = 0; - //! Get the predicted value for the \p bucket at \p time. + //! Get the predicted value for \p bucket at \p time. virtual double predict(std::size_t bucket, core_t::TTime time, double offset) const = 0; //! Get the variance of \p bucket. virtual double variance(std::size_t bucket) const = 0; + //! Implements split of \p bucket for derived state. + virtual void split(std::size_t bucket) = 0; + + //! Check if there is evidence of systematically large errors in a + //! bucket and split it if there is. + void maybeSplitBucket(); + + //! Split \p bucket. + void splitBucket(std::size_t bucket); + private: //! The rate at which information is aged out of the bucket values. double m_DecayRate; @@ -225,12 +261,34 @@ class MATHS_EXPORT CAdaptiveBucketing { //! is ignored. double m_MinimumBucketLength; + //! The desired number of buckets. We can use more if we determine + //! that we aren't capturing the periodic pattern effectively. + //! + //! \see maybeSplitBucket for details. + std::size_t m_TargetSize = 0; + + //! The bucket of the last large error added. + std::size_t m_LastLargeErrorBucket = 0; + + //! The period of the last large error added. + core_t::TTime m_LastLargeErrorPeriod = 0; + + //! The p-values of the most significant large error counts. + TFloatUInt32PrMinAccumulator m_LargeErrorCountSignificances; + + //! The mean weight of values added. + TFloatMeanAccumulator m_MeanWeight; + //! The bucket end points. TFloatVec m_Endpoints; - //! The mean periodic time of each regression. + //! The mean offset (relative to the start of the bucket) of samples + //! in each bucket. TFloatVec m_Centres; + //! The count of large errors in each bucket. + TFloatVec m_LargeErrorCounts; + //! An IIR low pass filter for the total desired end point displacement //! in refine. TFloatMeanAccumulator m_MeanDesiredDisplacement;
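One of the new hooks above deserves a gloss: adjustedWeight boosts the weight of samples landing in a bucket whose large error count is significant, interpolating between the sample's own weight and the mean weight on a log-significance scale, so the more significant the evidence the faster the bucket adapts. A sketch of that arithmetic, assuming the clamped linear interpolation that CTools::linearlyInterpolate's signature suggests (the clamping behaviour is an assumption, not confirmed by this diff):

```cpp
#include <algorithm>
#include <cmath>
#include <iostream>

// Clamped linear interpolation of f between (a, fa) and (b, fb); a stand-in
// for CTools::linearlyInterpolate, whose exact behaviour is assumed here.
double linearlyInterpolate(double a, double b, double fa, double fb, double x) {
    x = std::max(a, std::min(b, x));
    return fa + (fb - fa) * (x - a) / (b - a);
}

int main() {
    const double logHigh{std::log(1e-3)};     // LOG_HIGH_SIGNIFICANCE
    const double logModerate{std::log(1e-2)}; // LOG_MODERATE_SIGNIFICANCE
    double meanWeight{1.0}; // mean weight of all values added so far
    double weight{0.1};     // the weight the caller asked for

    // Mirrors CAdaptiveBucketing::adjustedWeight: as the bucket's large error
    // count becomes more significant (smaller p-value), the sample's weight
    // is pulled from the requested weight up towards the mean weight.
    for (double significance : {1e-2, 3e-3, 1e-3, 1e-4}) {
        double adjusted{linearlyInterpolate(logHigh, logModerate, meanWeight,
                                            weight, std::log(significance))};
        std::cout << "p = " << significance << " -> weight = " << adjusted << '\n';
    }
    return 0;
}
```

At p = 1e-2 the requested weight is returned unchanged; at p <= 1e-3 it saturates at the mean weight.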
diff --git a/include/maths/CBasicStatistics.h b/include/maths/CBasicStatistics.h index dfcf624fe8..41ea07e3ac 100644 --- a/include/maths/CBasicStatistics.h +++ b/include/maths/CBasicStatistics.h @@ -1032,6 +1032,8 @@ class MATHS_EXPORT CBasicStatistics { using const_iterator = typename CONTAINER::const_iterator; using reverse_iterator = typename CONTAINER::reverse_iterator; using const_reverse_iterator = typename CONTAINER::const_reverse_iterator; + using TToString = std::function<std::string(const T&)>; + using TFromString = std::function<bool(const std::string&, T&)>; public: COrderStatisticsImpl(const CONTAINER& statistics, const LESS& less) @@ -1043,8 +1045,20 @@ class MATHS_EXPORT CBasicStatistics { //! Initialize from a delimited string. bool fromDelimited(const std::string& value); + //! Initialize from a delimited string using \p fromString to initialize + //! values of type T from a string. + //! + //! \warning \p fromString must not use CBasicStatistics::INTERNAL_DELIMITER. + bool fromDelimited(const std::string& value, const TFromString& fromString); + //! Convert to a delimited string. std::string toDelimited() const; + + //! Convert to a delimited string using \p toString to convert individual + //! values of type T to a string. + //! + //! \warning \p toString must not use CBasicStatistics::INTERNAL_DELIMITER. + std::string toDelimited(const TToString& toString) const; //@} //! \name Update @@ -1367,15 +1381,15 @@ class MATHS_EXPORT CBasicStatistics { //! \name Accumulator Typedefs //@{ //! Accumulator object to compute the sample maximum. - template<typename T> + template<typename T, std::size_t N = 1> struct SMax { - using TAccumulator = COrderStatisticsStack<T, 1, std::greater<T>>; + using TAccumulator = COrderStatisticsStack<T, N, std::greater<T>>; }; //! Accumulator object to compute the sample minimum. - template<typename T> + template<typename T, std::size_t N = 1> struct SMin { - using TAccumulator = COrderStatisticsStack<T, 1>; + using TAccumulator = COrderStatisticsStack<T, N>; }; //@}
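The point of these overloads is that a statistic of type T can now round-trip through a caller-supplied codec instead of the default stringToType/typeToString, which is what lets CAdaptiveBucketing persist a (p-value, bucket) pair inside an order statistics accumulator later in this diff. A sketch of such a pair codec, with an illustrative ';' standing in for the library's EXTERNAL_DELIMITER (the real delimiter characters are not shown in this diff):

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>
#include <utility>

using TDoubleUInt32Pr = std::pair<double, std::uint32_t>;

// Serialize the pair with a secondary delimiter so the whole token can sit
// inside the accumulator's own delimited list of statistics.
std::string pairToDelimited(const TDoubleUInt32Pr& value) {
    return std::to_string(value.first) + ';' + std::to_string(value.second);
}

bool pairFromDelimited(const std::string& delimited, TDoubleUInt32Pr& value) {
    std::size_t pos{delimited.find(';')};
    if (pos == std::string::npos) {
        return false; // no delimiter: malformed input
    }
    try {
        value.first = std::stod(delimited.substr(0, pos));
        value.second = static_cast<std::uint32_t>(std::stoul(delimited.substr(pos + 1)));
    } catch (const std::exception&) {
        return false;
    }
    return true;
}

int main() {
    TDoubleUInt32Pr in{2.5e-3, 7}, out{};
    std::string persisted{pairToDelimited(in)};
    if (pairFromDelimited(persisted, out)) {
        std::cout << persisted << " -> {" << out.first << "," << out.second << "}\n";
    }
    return 0;
}
```

The one contract, per the warnings above, is that the codec's output must never contain the accumulator's INTERNAL_DELIMITER, or fromDelimited would split tokens in the wrong places.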
diff --git a/include/maths/CBasicStatisticsPersist.h b/include/maths/CBasicStatisticsPersist.h index ebd8fcd4fe..e31c91af65 100644 --- a/include/maths/CBasicStatisticsPersist.h +++ b/include/maths/CBasicStatisticsPersist.h @@ -197,6 +197,15 @@ uint64_t CBasicStatistics::SSampleCovariances::checksum() const { template<typename T, typename CONTAINER, typename LESS> bool CBasicStatistics::COrderStatisticsImpl<T, CONTAINER, LESS>::fromDelimited(const std::string& value) { + return this->fromDelimited(value, [](const std::string& value_, T& result) { + return basic_statistics_detail::stringToType(value_, result); + }); +} + +template<typename T, typename CONTAINER, typename LESS> +bool CBasicStatistics::COrderStatisticsImpl<T, CONTAINER, LESS>::fromDelimited( + const std::string& value, + const TFromString& fromString) { this->clear(); if (value.empty()) { @@ -207,7 +216,7 @@ bool CBasicStatistics::COrderStatisticsImpl<T, CONTAINER, LESS>::fromDelimited(c std::size_t delimPos{value.find(INTERNAL_DELIMITER)}; if (delimPos == std::string::npos) { - if (basic_statistics_detail::stringToType(value, statistic) == false) { + if (fromString(value, statistic) == false) { LOG_ERROR(<< "Invalid statistic in '" << value << "'"); return false; } @@ -217,11 +226,11 @@ bool CBasicStatistics::COrderStatisticsImpl<T, CONTAINER, LESS>::fromDelimited(c m_UnusedCount = m_Statistics.size(); - std::string statistic_; - statistic_.reserve(15); - statistic_.assign(value, 0, delimPos); - if (basic_statistics_detail::stringToType(statistic_, statistic) == false) { - LOG_ERROR(<< "Invalid statistic '" << statistic_ << "' in '" << value << "'"); + std::string token; + token.reserve(15); + token.assign(value, 0, delimPos); + if (fromString(token, statistic) == false) { + LOG_ERROR(<< "Invalid statistic '" << token << "' in '" << value << "'"); return false; } m_Statistics[--m_UnusedCount] = statistic; @@ -229,9 +238,9 @@ bool CBasicStatistics::COrderStatisticsImpl<T, CONTAINER, LESS>::fromDelimited(c while (delimPos != value.size()) { std::size_t nextDelimPos{ std::min(value.find(INTERNAL_DELIMITER, delimPos + 1), value.size())}; - statistic_.assign(value, delimPos + 1, nextDelimPos - delimPos - 1); - if (basic_statistics_detail::stringToType(statistic_, statistic) == false) { - LOG_ERROR(<< "Invalid statistic '" << statistic_ << "' in '" << value << "'"); + token.assign(value, delimPos + 1, nextDelimPos - delimPos - 1); + if (fromString(token, statistic) == false) { + LOG_ERROR(<< "Invalid statistic '" << token << "' in '" << value << "'"); return false; } m_Statistics[--m_UnusedCount] = statistic; @@ -243,14 +252,22 @@ bool CBasicStatistics::COrderStatisticsImpl<T, CONTAINER, LESS>::fromDelimited(c template<typename T, typename CONTAINER, typename LESS> std::string CBasicStatistics::COrderStatisticsImpl<T, CONTAINER, LESS>::toDelimited() const { + return this->toDelimited([](const T& value_) { + return basic_statistics_detail::typeToString(value_); + }); +} + +template<typename T, typename CONTAINER, typename LESS> +std::string +CBasicStatistics::COrderStatisticsImpl<T, CONTAINER, LESS>::toDelimited(const TToString& toString) const { if (this->count() == 0) { return std::string{}; } std::size_t i{m_Statistics.size()}; - std::string result{basic_statistics_detail::typeToString(m_Statistics[i - 1])}; + std::string result{toString(m_Statistics[i - 1])}; for (--i; i > m_UnusedCount; --i) { result += INTERNAL_DELIMITER; - result += basic_statistics_detail::typeToString(m_Statistics[i - 1]); + result += toString(m_Statistics[i - 1]); } return result; }
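Before the bucketing changes below, it is worth pinning down what counts as a "large error": in the add() implementations later in this diff, a sample is flagged when it misses the bucket's current prediction by more than LARGE_ERROR_STANDARD_DEVIATIONS (3.0) standard deviations. A toy check of the rule, with made-up numbers:

```cpp
#include <cmath>
#include <iostream>

int main() {
    const double LARGE_ERROR_STANDARD_DEVIATIONS{3.0};
    double prediction{10.0}; // bucket mean before the new sample
    double variance{4.0};    // bucket variance after adding the sample

    // With sigma = 2, the large error threshold is a miss of more than 6.
    for (double value : {11.0, 15.0, 17.0}) {
        bool largeError{std::fabs(value - prediction) >
                        LARGE_ERROR_STANDARD_DEVIATIONS * std::sqrt(variance)};
        std::cout << value << (largeError ? " -> large error\n" : " -> ok\n");
    }
    return 0;
}
```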
diff --git a/include/maths/CCalendarComponentAdaptiveBucketing.h b/include/maths/CCalendarComponentAdaptiveBucketing.h index dd389e9c0d..bbfb427367 100644 --- a/include/maths/CCalendarComponentAdaptiveBucketing.h +++ b/include/maths/CCalendarComponentAdaptiveBucketing.h @@ -32,9 +32,10 @@ class CSeasonalTime; //! //! DESCRIPTION:\n //! See CAdaptiveBucketing for details. -class MATHS_EXPORT CCalendarComponentAdaptiveBucketing : private CAdaptiveBucketing { +class MATHS_EXPORT CCalendarComponentAdaptiveBucketing : public CAdaptiveBucketing { public: using TFloatMeanVarAccumulator = CBasicStatistics::SSampleMeanVar<CFloatStorage>::TAccumulator; + using CAdaptiveBucketing::count; public: CCalendarComponentAdaptiveBucketing(); @@ -52,17 +53,11 @@ class MATHS_EXPORT CCalendarComponentAdaptiveBucket //! Efficiently swap the contents of two bucketing objects. void swap(CCalendarComponentAdaptiveBucketing& other); - //! Check if the bucketing has been initialized. - bool initialized() const; - //! Create a new uniform bucketing with \p n buckets. //! //! \param[in] n The number of buckets. bool initialize(std::size_t n); - //! Get the number of buckets. - std::size_t size() const; - //! Clear the contents of this bucketing and recover any //! allocated memory. void clear(); @@ -78,26 +73,11 @@ class MATHS_EXPORT CCalendarComponentAdaptiveBucket //! this is the less influence it has on the bucket. void add(core_t::TTime time, double value, double weight = 1.0); - //! Get the calendar feature. - CCalendarFeature feature() const; - - //! Set the rate at which the bucketing loses information. - void decayRate(double value); - - //! Get the rate at which the bucketing loses information. - double decayRate() const; - //! Age the bucket values to account for \p time elapsed time. void propagateForwardsByTime(double time); - //! Get the minimum permitted bucket length. - double minimumBucketLength() const; - - //! Refine the bucket end points to minimize the maximum averaging - //! error in any bucket. - //! - //! \param[in] time The time at which to refine. - void refine(core_t::TTime time); + //! Get the calendar feature. + CCalendarFeature feature() const; //! The count in the bucket containing \p time. double count(core_t::TTime time) const; @@ -108,22 +88,6 @@ class MATHS_EXPORT CCalendarComponentAdaptiveBucket //! Get the value at \p time. const TFloatMeanVarAccumulator* value(core_t::TTime time) const; - //! Get a set of knot points and knot point values to use for - //! interpolating the bucket values. - //! - //! \param[in] time The time at which to get the knot points. - //! \param[in] boundary Controls the style of start and end knots. - //! \param[out] knots Filled in with the knot points to interpolate. - //! \param[out] values Filled in with the values at \p knots. - //! \param[out] variances Filled in with the variances at \p knots. - //! \return True if there are sufficient knot points to interpolate - //! and false otherwise. - bool knots(core_t::TTime time, - CSplineTypes::EBoundaryCondition boundary, - TDoubleVec& knots, - TDoubleVec& values, - TDoubleVec& variances) const; - //! Get a checksum for this object. uint64_t checksum(uint64_t seed = 0) const; @@ -133,21 +97,6 @@ class MATHS_EXPORT CCalendarComponentAdaptiveBucket //! Get the memory used by this component std::size_t memoryUsage() const; - //! \name Test Functions - //@{ - //! Get the bucket end points. - const TFloatVec& endpoints() const; - - //! Get the total count of in the bucketing. - double count() const; - - //! Get the bucket regressions. - TDoubleVec values(core_t::TTime time) const; - - //! Get the bucket variances. - TDoubleVec variances() const; - //@} - private: using TFloatMeanVarVec = std::vector<TFloatMeanVarAccumulator>; @@ -172,15 +121,18 @@ class MATHS_EXPORT CCalendarComponentAdaptiveBucket //! Get the offset w.r.t. the start of the bucketing of \p time. virtual double offset(core_t::TTime time) const; - //! The count in \p bucket. - virtual double count(std::size_t bucket) const; + //! Get the count in \p bucket. + virtual double bucketCount(std::size_t bucket) const; - //! Get the predicted value for the \p bucket at \p time. + //! Get the predicted value for \p bucket at \p time. virtual double predict(std::size_t bucket, core_t::TTime time, double offset) const; //! Get the variance of \p bucket. virtual double variance(std::size_t bucket) const; + //! Split \p bucket. + virtual void split(std::size_t bucket); + private: //! The time provider. CCalendarFeature m_Feature;
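The virtual split() hook declared here lets each derived class duplicate its per-bucket state when the base class decides to split; the geometry itself lives in CAdaptiveBucketing::splitBucket further down in this diff. A worked numeric example of that geometry, with plain doubles mirroring splitBucket's statements:

```cpp
// Numeric sketch of CAdaptiveBucketing::splitBucket (defined later in this
// diff): the bucket is halved at its midpoint, the old centre is nudged into
// the half it falls in, and the large error count is discounted and shared.
#include <algorithm>
#include <iostream>
#include <vector>

int main() {
    std::vector<double> endpoints{0.0, 10.0, 20.0};
    std::vector<double> centres{4.0, 15.0};
    std::vector<double> largeErrorCounts{12.0, 1.0};
    std::size_t bucket{0};

    double leftEnd{endpoints[bucket]};
    double rightEnd{endpoints[bucket + 1]};
    double midpoint{(leftEnd + rightEnd) / 2.0};
    double centre{centres[bucket]};
    double offset{std::min(centre - leftEnd, rightEnd - centre) / 2.0};

    endpoints.insert(endpoints.begin() + bucket + 1, midpoint);
    centres[bucket] = std::max(centre + offset, midpoint);
    centres.insert(centres.begin() + bucket, std::min(centre - offset, midpoint));
    largeErrorCounts[bucket] /= 1.75;
    largeErrorCounts.insert(largeErrorCounts.begin() + bucket, largeErrorCounts[bucket]);

    // endpoints: 0 5 10 20, centres: 2 6 15, counts: ~6.86 ~6.86 1
    for (double e : endpoints) { std::cout << e << ' '; }
    std::cout << '\n';
    for (double c : centres) { std::cout << c << ' '; }
    std::cout << '\n';
    return 0;
}
```

Note the error count is divided by 1.75 rather than 2.0 before being shared, so the two halves jointly retain slightly more than the original count and the split region stays comparatively "hot".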
diff --git a/include/maths/CSeasonalComponent.h b/include/maths/CSeasonalComponent.h index acf1bab420..7e0cc92864 100644 --- a/include/maths/CSeasonalComponent.h +++ b/include/maths/CSeasonalComponent.h @@ -97,8 +97,9 @@ class MATHS_EXPORT CSeasonalComponent : private CDecompositionComponent { //! Shift the component's values by \p shift. void shiftLevel(double shift); - //! Shift the component's slope by \p shift. - void shiftSlope(double shift); + //! Shift the component's slope by \p shift keeping the prediction at + //! \p time fixed. + void shiftSlope(core_t::TTime time, double shift); //! Linearly scale the component's values by \p scale.
void linearScale(core_t::TTime time, double scale); diff --git a/include/maths/CSeasonalComponentAdaptiveBucketing.h b/include/maths/CSeasonalComponentAdaptiveBucketing.h index 1d4a23aaa4..063733028e 100644 --- a/include/maths/CSeasonalComponentAdaptiveBucketing.h +++ b/include/maths/CSeasonalComponentAdaptiveBucketing.h @@ -32,11 +32,12 @@ namespace maths { //! //! DESCRIPTION:\n //! See CAdaptiveBucketing for details. -class MATHS_EXPORT CSeasonalComponentAdaptiveBucketing : private CAdaptiveBucketing { +class MATHS_EXPORT CSeasonalComponentAdaptiveBucketing : public CAdaptiveBucketing { public: using CAdaptiveBucketing::TFloatMeanAccumulatorVec; using TDoubleRegression = CRegression::CLeastSquaresOnline<1, double>; using TRegression = CRegression::CLeastSquaresOnline<1, CFloatStorage>; + using CAdaptiveBucketing::count; public: CSeasonalComponentAdaptiveBucketing(); @@ -59,9 +60,6 @@ class MATHS_EXPORT CSeasonalComponentAdaptiveBucketing : private CAdaptiveBucket //! Efficiently swap the contents of two bucketing objects. void swap(CSeasonalComponentAdaptiveBucketing& other); - //! Check if the bucketing has been initialized. - bool initialized() const; - //! Create a new uniform bucketing with \p n buckets. //! //! \param[in] n The number of buckets. @@ -79,9 +77,6 @@ class MATHS_EXPORT CSeasonalComponentAdaptiveBucketing : private CAdaptiveBucket core_t::TTime endTime, const TFloatMeanAccumulatorVec& values); - //! Get the number of buckets. - std::size_t size() const; - //! Clear the contents of this bucketing and recover any //! allocated memory. void clear(); @@ -92,8 +87,9 @@ class MATHS_EXPORT CSeasonalComponentAdaptiveBucketing : private CAdaptiveBucket //! Shift the regressions' ordinates by \p shift. void shiftLevel(double shift); - //! Shift the regressions' gradients by \p shift. - void shiftSlope(double shift); + //! Shift the regressions' gradients by \p shift keeping the prediction + //! at \p time fixed. + void shiftSlope(core_t::TTime time, double shift); //! Linearly scale the regressions by \p scale. void linearScale(double scale); @@ -107,49 +103,18 @@ class MATHS_EXPORT CSeasonalComponentAdaptiveBucketing : private CAdaptiveBucket //! this is the less influence it has on the bucket. void add(core_t::TTime time, double value, double prediction, double weight = 1.0); - //! Get the time provider. - const CSeasonalTime& time() const; - - //! Set the rate at which the bucketing loses information. - void decayRate(double value); - - //! Get the rate at which the bucketing loses information. - double decayRate() const; - //! Age the bucket values to account for \p time elapsed time. void propagateForwardsByTime(double time, bool meanRevert = false); - //! Get the minimum permitted bucket length. - double minimumBucketLength() const; - - //! Refine the bucket end points to minimize the maximum averaging - //! error in any bucket. - //! - //! \param[in] time The time at which to refine. - void refine(core_t::TTime time); + //! Get the time provider. + const CSeasonalTime& time() const; - //! The count in the bucket containing \p time. + //! Get the count in the bucket containing \p time. double count(core_t::TTime time) const; //! Get the regression to use at \p time. const TRegression* regression(core_t::TTime time) const; - //! Get a set of knot points and knot point values to use for - //! interpolating the bucket values. - //! - //! \param[in] time The time at which to get the knot points. - //! \param[in] boundary Controls the style of start and end knots. - //! 
\param[out] knots Filled in with the knot points to interpolate. - //! \param[out] values Filled in with the values at \p knots. - //! \param[out] variances Filled in with the variances at \p knots. - //! \return True if there are sufficient knot points to interpolate - //! and false otherwise. - bool knots(core_t::TTime time, - CSplineTypes::EBoundaryCondition boundary, - TDoubleVec& knots, - TDoubleVec& values, - TDoubleVec& variances) const; - //! Get the common slope of the bucket regression models. double slope() const; @@ -165,21 +130,6 @@ class MATHS_EXPORT CSeasonalComponentAdaptiveBucketing : private CAdaptiveBucket //! Get the memory used by this component std::size_t memoryUsage() const; - //! \name Test Functions - //@{ - //! Get the bucket end points. - const TFloatVec& endpoints() const; - - //! Get the total count of in the bucketing. - double count() const; - - //! Get the bucket regression predictions at \p time. - TDoubleVec values(core_t::TTime time) const; - - //! Get the bucket variances. - TDoubleVec variances() const; - //@} - private: using TSeasonalTimePtr = std::unique_ptr; @@ -225,14 +175,17 @@ class MATHS_EXPORT CSeasonalComponentAdaptiveBucketing : private CAdaptiveBucket virtual double offset(core_t::TTime time) const; //! The count in \p bucket. - virtual double count(std::size_t bucket) const; + virtual double bucketCount(std::size_t bucket) const; - //! Get the predicted value for the \p bucket at \p time. + //! Get the predicted value for \p bucket at \p time. virtual double predict(std::size_t bucket, core_t::TTime time, double offset) const; //! Get the variance of \p bucket. virtual double variance(std::size_t bucket) const; + //! Split \p bucket. + virtual void split(std::size_t bucket); + //! Get the interval which has been observed at \p time. double observedInterval(core_t::TTime time) const; diff --git a/include/maths/CTools.h b/include/maths/CTools.h index 60e726e8cf..caa09fd33b 100644 --- a/include/maths/CTools.h +++ b/include/maths/CTools.h @@ -680,6 +680,9 @@ class MATHS_EXPORT CTools : private core::CNonInstantiatable { return sigmoid(std::exp(std::copysign(1.0, sign) * (x - x0) / width)); } + //! Linearly interpolate a function on the interval [\p a, \p b]. + static double linearlyInterpolate(double a, double b, double fa, double fb, double x); + //! A custom, numerically robust, implementation of \f$(1 - x) ^ p\f$. //! //! \note It is assumed that p is integer. diff --git a/include/maths/CTrendComponent.h b/include/maths/CTrendComponent.h index 8f4d67aab9..e9dcaa4377 100644 --- a/include/maths/CTrendComponent.h +++ b/include/maths/CTrendComponent.h @@ -39,8 +39,8 @@ struct SDistributionRestoreParams; //! scale. It also allows us to accurately estimate confidence intervals //! (since these can be estimated from the variation of observed values //! we see w.r.t. the predictions from the next longer time scale component). -//! This produces plausible looking and this sort of mean reversion is common -//! in many real world time series. +//! This produces plausible looking forecasts and this sort of mean reversion +//! is common in many real world time series. class MATHS_EXPORT CTrendComponent { public: using TDoubleDoublePr = maths_t::TDoubleDoublePr; @@ -79,9 +79,9 @@ class MATHS_EXPORT CTrendComponent { //! Shift the regression models' time origins to \p time. void shiftOrigin(core_t::TTime time); - //! Shift the slope of all regression models' whose decay rate is - //! greater than \p decayRate. 
- void shiftSlope(double decayRate, double shift); + //! Shift the slope of the regression models keeping the prediction + //! at \p time fixed. + void shiftSlope(core_t::TTime time, double shift); //! Apply a level shift of \p shift at \p time and \p value. void shiftLevel(core_t::TTime time, double value, double shift); diff --git a/lib/maths/CAdaptiveBucketing.cc b/lib/maths/CAdaptiveBucketing.cc index 3e7ed9e58a..e0f739c571 100644 --- a/lib/maths/CAdaptiveBucketing.cc +++ b/lib/maths/CAdaptiveBucketing.cc @@ -19,9 +19,11 @@ #include #include +#include #include #include +#include #include #include #include @@ -31,7 +33,31 @@ namespace ml { namespace maths { namespace { -using TDoubleMeanAccumulator = CBasicStatistics::SSampleMean<double>::TAccumulator; +using TSizeVec = std::vector<std::size_t>; +using TFloatUInt32Pr = std::pair<CFloatStorage, uint32_t>; + +//! Convert to a delimited string. +std::string significanceToDelimited(const TFloatUInt32Pr& value) { + return value.first.toString() + CBasicStatistics::EXTERNAL_DELIMITER + + core::CStringUtils::typeToString(value.second); +} + +//! Initialize from a delimited string. +bool significanceFromDelimited(const std::string& delimited, TFloatUInt32Pr& value) { + std::size_t pos{delimited.find(CBasicStatistics::EXTERNAL_DELIMITER)}; + if (pos == std::string::npos) { + LOG_ERROR(<< "Failed to find delimiter in '" << delimited << "'"); + return false; + } + unsigned int count{}; + if (value.first.fromString(delimited.substr(0, pos)) == false || + core::CStringUtils::stringToType(delimited.substr(pos + 1), count) == false) { + LOG_ERROR(<< "Failed to extract value from '" << delimited << "'"); + return false; + } + value.second = count; + return true; +} //! Clear a vector and recover its memory. template<typename T> @@ -45,6 +71,12 @@ const std::string ENDPOINT_TAG{"b"}; const std::string CENTRES_TAG{"c"}; const std::string MEAN_DESIRED_DISPLACEMENT_TAG{"d"}; const std::string MEAN_ABS_DESIRED_DISPLACEMENT_TAG{"e"}; +const std::string LARGE_ERROR_COUNTS_TAG{"f"}; +const std::string TARGET_SIZE_TAG{"g"}; +const std::string LAST_LARGE_ERROR_BUCKET_TAG{"h"}; +const std::string LAST_LARGE_ERROR_PERIOD_TAG{"i"}; +const std::string LARGE_ERROR_COUNT_SIGNIFICANCES_TAG{"j"}; +const std::string MEAN_WEIGHT_TAG{"k"}; const std::string EMPTY_STRING; const double SMOOTHING_FUNCTION[]{0.25, 0.5, 0.25}; @@ -53,49 +85,82 @@ const double ALPHA{0.25}; const double EPS{std::numeric_limits<double>::epsilon()}; const double WEIGHTS[]{1.0, 1.0, 1.0, 0.75, 0.5}; const double MINIMUM_DECAY_RATE{0.001}; +const double MINIMUM_LARGE_ERROR_COUNT_TO_SPLIT{10.0}; +const double MODERATE_SIGNIFICANCE{1e-2}; +const double HIGH_SIGNIFICANCE{1e-3}; +const double LOG_MODERATE_SIGNIFICANCE{std::log(MODERATE_SIGNIFICANCE)}; +const double LOG_HIGH_SIGNIFICANCE{std::log(HIGH_SIGNIFICANCE)}; +} + +CAdaptiveBucketing::CAdaptiveBucketing(double decayRate, double minimumBucketLength) + : m_DecayRate{std::max(decayRate, MINIMUM_DECAY_RATE)}, m_MinimumBucketLength{minimumBucketLength} { +} + +CAdaptiveBucketing::TRestoreFunc CAdaptiveBucketing::getAcceptRestoreTraverser() { + return boost::bind(&CAdaptiveBucketing::acceptRestoreTraverser, this, _1); +} + +CAdaptiveBucketing::TPersistFunc CAdaptiveBucketing::getAcceptPersistInserter() const { + return boost::bind(&CAdaptiveBucketing::acceptPersistInserter, this, _1); } bool CAdaptiveBucketing::acceptRestoreTraverser(core::CStateRestoreTraverser& traverser) { do { const std::string& name{traverser.name()}; RESTORE_BUILT_IN(DECAY_RATE_TAG, m_DecayRate) + RESTORE_BUILT_IN(TARGET_SIZE_TAG, m_TargetSize) +
RESTORE_BUILT_IN(LAST_LARGE_ERROR_BUCKET_TAG, m_LastLargeErrorBucket) + RESTORE_BUILT_IN(LAST_LARGE_ERROR_PERIOD_TAG, m_LastLargeErrorPeriod) + RESTORE(LARGE_ERROR_COUNT_SIGNIFICANCES_TAG, + m_LargeErrorCountSignificances.fromDelimited(traverser.value(), significanceFromDelimited)) + RESTORE(MEAN_WEIGHT_TAG, m_MeanWeight.fromDelimited(traverser.value())) RESTORE(ENDPOINT_TAG, core::CPersistUtils::fromString(traverser.value(), m_Endpoints)) RESTORE(CENTRES_TAG, core::CPersistUtils::fromString(traverser.value(), m_Centres)) + RESTORE(LARGE_ERROR_COUNTS_TAG, + core::CPersistUtils::fromString(traverser.value(), m_LargeErrorCounts)) RESTORE(MEAN_DESIRED_DISPLACEMENT_TAG, m_MeanDesiredDisplacement.fromDelimited(traverser.value())) RESTORE(MEAN_ABS_DESIRED_DISPLACEMENT_TAG, m_MeanAbsDesiredDisplacement.fromDelimited(traverser.value())) } while (traverser.next()); + if (m_TargetSize == 0) { + m_TargetSize = this->size(); + } + if (m_LargeErrorCounts.empty()) { + m_LargeErrorCounts.resize(m_Centres.size(), 0.0); + } return true; } void CAdaptiveBucketing::acceptPersistInserter(core::CStatePersistInserter& inserter) const { inserter.insertValue(DECAY_RATE_TAG, m_DecayRate, core::CIEEE754::E_SinglePrecision); + inserter.insertValue(TARGET_SIZE_TAG, m_TargetSize); + inserter.insertValue(LAST_LARGE_ERROR_BUCKET_TAG, m_LastLargeErrorBucket); + inserter.insertValue(LAST_LARGE_ERROR_PERIOD_TAG, m_LastLargeErrorPeriod); + inserter.insertValue(LARGE_ERROR_COUNT_SIGNIFICANCES_TAG, + m_LargeErrorCountSignificances.toDelimited(significanceToDelimited)); + inserter.insertValue(MEAN_WEIGHT_TAG, m_MeanWeight.toDelimited()); inserter.insertValue(ENDPOINT_TAG, core::CPersistUtils::toString(m_Endpoints)); inserter.insertValue(CENTRES_TAG, core::CPersistUtils::toString(m_Centres)); + inserter.insertValue(LARGE_ERROR_COUNTS_TAG, + core::CPersistUtils::toString(m_LargeErrorCounts)); inserter.insertValue(MEAN_DESIRED_DISPLACEMENT_TAG, m_MeanDesiredDisplacement.toDelimited()); inserter.insertValue(MEAN_ABS_DESIRED_DISPLACEMENT_TAG, m_MeanAbsDesiredDisplacement.toDelimited()); } -CAdaptiveBucketing::CAdaptiveBucketing(double decayRate, double minimumBucketLength) - : m_DecayRate{std::max(decayRate, MINIMUM_DECAY_RATE)}, m_MinimumBucketLength{minimumBucketLength} { -} - -CAdaptiveBucketing::CAdaptiveBucketing(double decayRate, - double minimumBucketLength, - core::CStateRestoreTraverser& traverser) - : m_DecayRate{std::max(decayRate, MINIMUM_DECAY_RATE)}, m_MinimumBucketLength{minimumBucketLength} { - traverser.traverseSubLevel( - boost::bind(&CAdaptiveBucketing::acceptRestoreTraverser, this, _1)); -} - void CAdaptiveBucketing::swap(CAdaptiveBucketing& other) { std::swap(m_DecayRate, other.m_DecayRate); std::swap(m_MinimumBucketLength, other.m_MinimumBucketLength); + std::swap(m_TargetSize, other.m_TargetSize); + std::swap(m_LastLargeErrorBucket, other.m_LastLargeErrorBucket); + std::swap(m_LastLargeErrorPeriod, other.m_LastLargeErrorPeriod); + std::swap(m_LargeErrorCountSignificances, other.m_LargeErrorCountSignificances); + std::swap(m_MeanWeight, other.m_MeanWeight); m_Endpoints.swap(other.m_Endpoints); m_Centres.swap(other.m_Centres); + m_LargeErrorCounts.swap(other.m_LargeErrorCounts); std::swap(m_MeanDesiredDisplacement, other.m_MeanDesiredDisplacement); std::swap(m_MeanAbsDesiredDisplacement, other.m_MeanAbsDesiredDisplacement); } @@ -117,6 +182,7 @@ bool CAdaptiveBucketing::initialize(double a, double b, std::size_t n) { n = std::min(n, static_cast((b - a) / m_MinimumBucketLength)); } + m_TargetSize = n; 
m_Endpoints.clear(); m_Endpoints.reserve(n + 1); double width{(b - a) / static_cast<double>(n)}; @@ -125,6 +191,8 @@ bool CAdaptiveBucketing::initialize(double a, double b, std::size_t n) { } m_Centres.clear(); m_Centres.resize(n); + m_LargeErrorCounts.clear(); + m_LargeErrorCounts.resize(n, 0.0); return true; } @@ -166,13 +234,27 @@ std::size_t CAdaptiveBucketing::size() const { void CAdaptiveBucketing::clear() { clearAndShrink(m_Endpoints); clearAndShrink(m_Centres); + clearAndShrink(m_LargeErrorCounts); } void CAdaptiveBucketing::add(std::size_t bucket, core_t::TTime time, double weight) { + using TDoubleMeanAccumulator = CBasicStatistics::SSampleMean<double>::TAccumulator; TDoubleMeanAccumulator centre{CBasicStatistics::accumulator( - this->count(bucket), static_cast<double>(m_Centres[bucket]))}; + this->bucketCount(bucket), static_cast<double>(m_Centres[bucket]))}; centre.add(this->offset(time), weight); m_Centres[bucket] = CBasicStatistics::mean(centre); + m_MeanWeight.add(weight); +} + +void CAdaptiveBucketing::addLargeError(std::size_t bucket, core_t::TTime time) { + core_t::TTime period{static_cast<core_t::TTime>( + m_Endpoints[m_Endpoints.size() - 1] - m_Endpoints[0])}; + time = CIntegerTools::floor(time, period); + if (bucket != m_LastLargeErrorBucket || time != m_LastLargeErrorPeriod) { + m_LargeErrorCounts[bucket] += 1.0; + } + m_LastLargeErrorBucket = bucket; + m_LastLargeErrorPeriod = time; }
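A detail of addLargeError above that is easy to miss: flooring the time to the start of the containing repeat of the period means a burst of consecutive large errors in the same bucket within one cycle is counted once, so only errors that recur across cycles (or across buckets) accumulate evidence. A self-contained sketch of that bookkeeping:

```cpp
#include <cstddef>
#include <iostream>

int main() {
    const long period{86400}; // one day, in seconds
    std::size_t lastBucket{0};
    long lastPeriod{0};
    double count{0.0};

    auto addLargeError = [&](std::size_t bucket, long time) {
        // (time / period) * period matches CIntegerTools::floor for
        // non-negative times (an assumption for this sketch).
        long periodStart{(time / period) * period};
        if (bucket != lastBucket || periodStart != lastPeriod) {
            count += 1.0;
        }
        lastBucket = bucket;
        lastPeriod = periodStart;
    };

    addLargeError(3, 1000);  // counted
    addLargeError(3, 2000);  // same bucket, same day: ignored
    addLargeError(3, 90000); // same bucket, next day: counted
    addLargeError(4, 90500); // different bucket: counted
    std::cout << "large error count = " << count << '\n'; // prints 3
    return 0;
}
```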
void CAdaptiveBucketing::decayRate(double value) { @@ -184,8 +266,12 @@ double CAdaptiveBucketing::decayRate() const { } void CAdaptiveBucketing::age(double factor) { + for (auto& count : m_LargeErrorCounts) { + count *= factor; + } m_MeanDesiredDisplacement.age(factor); m_MeanAbsDesiredDisplacement.age(factor); + m_MeanWeight.age(factor); } double CAdaptiveBucketing::minimumBucketLength() const { @@ -196,17 +282,18 @@ void CAdaptiveBucketing::refine(core_t::TTime time) { using TDoubleDoublePr = std::pair<double, double>; using TDoubleDoublePrVec = std::vector<TDoubleDoublePr>; using TDoubleSizePr = std::pair<double, std::size_t>; - using TMinAccumulator = CBasicStatistics::SMin<TDoubleSizePr>::TAccumulator; - using TMaxAccumulator = CBasicStatistics::SMax<TDoubleSizePr>::TAccumulator; + using TMinMaxAccumulator = CBasicStatistics::CMinMax<TDoubleSizePr>; LOG_TRACE(<< "refining at " << time); - std::size_t n{m_Endpoints.size()}; - if (n < 2) { + if (m_Endpoints.size() < 2) { return; } - --n; + // Check if any buckets should be split based on the large error counts. + this->maybeSplitBucket(); + + std::size_t n{m_Endpoints.size() - 1}; double a{m_Endpoints[0]}; double b{m_Endpoints[n]}; @@ -214,7 +301,7 @@ TDoubleDoublePrVec values; values.reserve(n); for (std::size_t i = 0u; i < n; ++i) { - values.emplace_back(this->count(i), this->predict(i, time, m_Centres[i])); + values.emplace_back(this->bucketCount(i), this->predict(i, time, m_Centres[i])); } LOG_TRACE(<< "values = " << core::CContainerPrinter::print(values)); @@ -224,23 +311,21 @@ ranges.reserve(n); for (std::size_t i = 0u; i < n; ++i) { TDoubleDoublePr v[]{values[(n + i - 2) % n], values[(n + i - 1) % n], - values[(n + i + 0) % n], values[(n + i + 1) % n], - values[(n + i + 2) % n]}; + values[(n + i + 0) % n], // centre + values[(n + i + 1) % n], values[(n + i + 2) % n]}; - TMinAccumulator min; - TMaxAccumulator max; + TMinMaxAccumulator minmax; for (std::size_t j = 0u; j < sizeof(v) / sizeof(v[0]); ++j) { if (v[j].first > 0.0) { - min.add({v[j].second, j}); - max.add({v[j].second, j}); + minmax.add({v[j].second, j}); } } - if (min.count() > 0) { - ranges.push_back( - WEIGHTS[max[0].second > min[0].second ? max[0].second - min[0].second - : min[0].second - max[0].second] * - std::pow(max[0].first - min[0].first, 0.75)); + if (minmax.initialized()) { + ranges.push_back(WEIGHTS[minmax.max().second > minmax.min().second + ? minmax.max().second - minmax.min().second + : minmax.min().second - minmax.max().second] * + std::pow(minmax.max().first - minmax.min().first, 0.75)); } else { ranges.push_back(0.0); } @@ -250,24 +335,28 @@ // We do this in the "time" domain because the smoothing // function is narrow. Estimate the averaging error in each // bucket by multiplying the smoothed range by the bucket width.
- double totalAveragingError{0.0}; TDoubleVec averagingErrors; averagingErrors.reserve(n); for (std::size_t i = 0u; i < n; ++i) { double ai{m_Endpoints[i]}; double bi{m_Endpoints[i + 1]}; - double error{0.0}; for (std::size_t j = 0u; j < boost::size(SMOOTHING_FUNCTION); ++j) { error += SMOOTHING_FUNCTION[j] * ranges[(n + i + j - WIDTH) % n]; } - double h{bi - ai}; error *= h / (b - a); - averagingErrors.push_back(error); - totalAveragingError += error; } + double maxAveragingError{ + *std::max_element(averagingErrors.begin(), averagingErrors.end())}; + for (const auto& significance : m_LargeErrorCountSignificances) { + if (significance.first < MODERATE_SIGNIFICANCE) { + averagingErrors[significance.second] = maxAveragingError; + } + } + double totalAveragingError{ + std::accumulate(averagingErrors.begin(), averagingErrors.end(), 0.0)}; LOG_TRACE(<< "averagingErrors = " << core::CContainerPrinter::print(averagingErrors)); LOG_TRACE(<< "totalAveragingError = " << totalAveragingError); @@ -349,15 +438,13 @@ bool CAdaptiveBucketing::knots(core_t::TTime time, TDoubleVec& knots, TDoubleVec& values, TDoubleVec& variances) const { - using TSizeVec = std::vector; - knots.clear(); values.clear(); variances.clear(); std::size_t n{m_Centres.size()}; for (std::size_t i = 0u; i < n; ++i) { - if (this->count(i) > 0.0) { + if (this->bucketCount(i) > 0.0) { double wide{3.0 * (m_Endpoints[n] - m_Endpoints[0]) / static_cast(n)}; LOG_TRACE(<< "period " << m_Endpoints[n] - m_Endpoints[0] << ", # buckets = " << n << ", wide = " << wide); @@ -377,7 +464,7 @@ bool CAdaptiveBucketing::knots(core_t::TTime time, values.push_back(this->predict(i, time, c)); variances.push_back(this->variance(i)); for (/**/; i < n; ++i) { - if (this->count(i) > 0.0) { + if (this->bucketCount(i) > 0.0) { a = m_Endpoints[i]; b = m_Endpoints[i + 1]; c = m_Centres[i]; @@ -412,7 +499,7 @@ bool CAdaptiveBucketing::knots(core_t::TTime time, // start and end of the period because the gradient can vary, // but we expect them to be continuous. 
for (std::size_t j = n - 1; j > 0; --j) { - if (this->count(j) > 0.0) { + if (this->bucketCount(j) > 0.0) { double alpha{m_Endpoints[n] - m_Centres[j]}; double beta{c0}; double Z{alpha + beta}; @@ -426,7 +513,7 @@ bool CAdaptiveBucketing::knots(core_t::TTime time, } } for (std::size_t j = 0u; j < n; ++j) { - if (this->count(j) > 0.0) { + if (this->bucketCount(j) > 0.0) { double alpha{m_Centres[j]}; double beta{m_Endpoints[n] - knots.back()}; double Z{alpha + beta}; @@ -476,10 +563,6 @@ const CAdaptiveBucketing::TFloatVec& CAdaptiveBucketing::endpoints() const { return m_Endpoints; } -CAdaptiveBucketing::TFloatVec& CAdaptiveBucketing::endpoints() { - return m_Endpoints; -} - const CAdaptiveBucketing::TFloatVec& CAdaptiveBucketing::centres() const { return m_Centres; } @@ -488,10 +571,30 @@ CAdaptiveBucketing::TFloatVec& CAdaptiveBucketing::centres() { return m_Centres; } +const CAdaptiveBucketing::TFloatVec& CAdaptiveBucketing::largeErrorCounts() const { + return m_LargeErrorCounts; +} + +CAdaptiveBucketing::TFloatVec& CAdaptiveBucketing::largeErrorCounts() { + return m_LargeErrorCounts; +} + +double CAdaptiveBucketing::adjustedWeight(std::size_t bucket, double weight) const { + for (const auto& significance : m_LargeErrorCountSignificances) { + if (bucket == significance.second) { + double maxWeight{CBasicStatistics::mean(m_MeanWeight)}; + double logSignificance{CTools::fastLog(significance.first)}; + return CTools::linearlyInterpolate(LOG_HIGH_SIGNIFICANCE, LOG_MODERATE_SIGNIFICANCE, + maxWeight, weight, logSignificance); + } + } + return weight; +} + double CAdaptiveBucketing::count() const { double result = 0.0; for (std::size_t i = 0u; i < m_Centres.size(); ++i) { - result += this->count(i); + result += this->bucketCount(i); } return result; } @@ -516,7 +619,6 @@ CAdaptiveBucketing::TDoubleVec CAdaptiveBucketing::variances() const { bool CAdaptiveBucketing::bucket(core_t::TTime time, std::size_t& result) const { double t{this->offset(time)}; - std::size_t i(std::upper_bound(m_Endpoints.begin(), m_Endpoints.end(), t) - m_Endpoints.begin()); std::size_t n{m_Endpoints.size()}; @@ -525,7 +627,6 @@ bool CAdaptiveBucketing::bucket(core_t::TTime time, std::size_t& result) const { << m_Endpoints[n - 1] << ")"); return false; } - result = i - 1; return true; } @@ -533,14 +634,87 @@ bool CAdaptiveBucketing::bucket(core_t::TTime time, std::size_t& result) const { uint64_t CAdaptiveBucketing::checksum(uint64_t seed) const { seed = CChecksum::calculate(seed, m_DecayRate); seed = CChecksum::calculate(seed, m_MinimumBucketLength); + seed = CChecksum::calculate(seed, m_TargetSize); + seed = CChecksum::calculate(seed, m_LastLargeErrorBucket); + seed = CChecksum::calculate(seed, m_LastLargeErrorPeriod); + seed = CChecksum::calculate( + seed, m_LargeErrorCountSignificances.toDelimited(significanceToDelimited)); + seed = CChecksum::calculate(seed, m_MeanWeight); seed = CChecksum::calculate(seed, m_Endpoints); - return CChecksum::calculate(seed, m_Centres); + seed = CChecksum::calculate(seed, m_Centres); + seed = CChecksum::calculate(seed, m_LargeErrorCounts); + seed = CChecksum::calculate(seed, m_MeanDesiredDisplacement); + return CChecksum::calculate(seed, m_MeanAbsDesiredDisplacement); } std::size_t CAdaptiveBucketing::memoryUsage() const { std::size_t mem{core::CMemory::dynamicSize(m_Endpoints)}; mem += core::CMemory::dynamicSize(m_Centres); + mem += core::CMemory::dynamicSize(m_LargeErrorCounts); return mem; } + +void CAdaptiveBucketing::maybeSplitBucket() { + double 
largeErrorCount{std::accumulate(m_LargeErrorCounts.begin(), + m_LargeErrorCounts.end(), 0.0)}; + if (largeErrorCount >= MINIMUM_LARGE_ERROR_COUNT_TO_SPLIT) { + m_LargeErrorCountSignificances = TFloatUInt32PrMinAccumulator{}; + + // We compute the right tail p-value of the count of large errors + // in a bucket for the null hypothesis that they are uniformly + // distributed on the total bucketed period and split if this is + // less than a specified threshold. + double period{m_Endpoints[m_Endpoints.size() - 1] - m_Endpoints[0]}; + for (std::size_t i = 1u; i < m_Endpoints.size(); ++i) { + double interval{m_Endpoints[i] - m_Endpoints[i - 1]}; + try { + boost::math::binomial binomial{largeErrorCount, interval / period}; + double oneMinusCdf{ + CTools::safeCdfComplement(binomial, m_LargeErrorCounts[i - 1])}; + m_LargeErrorCountSignificances.add({oneMinusCdf, i - 1}); + } catch (const std::exception& e) { + LOG_ERROR(<< "Failed to calculate splitting significance: " << e.what()); + } + } + if (m_LargeErrorCountSignificances.count() > 0) { + // We're choosing the minimum p-value of number of buckets + // independent statistics so the significance is one minus + // the chance that all of them are greater than the observation. + for (auto& significance : m_LargeErrorCountSignificances) { + significance.first = CTools::oneMinusPowOneMinusX( + significance.first, static_cast(this->size())); + LOG_TRACE(<< "bucket [" << m_Endpoints[significance.second] + << "," << m_Endpoints[significance.second + 1] + << ") split significance = " << significance.first); + } + m_LargeErrorCountSignificances.sort(); + } + + if (2 * this->size() < 3 * m_TargetSize && + largeErrorCount > MINIMUM_LARGE_ERROR_COUNT_TO_SPLIT && + m_LargeErrorCountSignificances.count() > 0 && + m_LargeErrorCountSignificances[0].first < HIGH_SIGNIFICANCE) { + this->splitBucket(m_LargeErrorCountSignificances[0].second); + } + } +} + +void CAdaptiveBucketing::splitBucket(std::size_t bucket) { + double leftEnd{m_Endpoints[bucket]}; + double rightEnd{m_Endpoints[bucket + 1]}; + LOG_TRACE(<< "splitting [" << leftEnd << "," << rightEnd << ")"); + double midpoint{(leftEnd + rightEnd) / 2.0}; + double centre{m_Centres[bucket]}; + double offset{std::min(centre - leftEnd, rightEnd - centre) / 2.0}; + m_Endpoints.insert(m_Endpoints.begin() + bucket + 1, midpoint); + m_Centres[bucket] = std::max(centre + offset, midpoint); + m_Centres.insert(m_Centres.begin() + bucket, std::min(centre - offset, midpoint)); + m_LargeErrorCounts[bucket] /= 1.75; + m_LargeErrorCounts.insert(m_LargeErrorCounts.begin() + bucket, + m_LargeErrorCounts[bucket]); + this->split(bucket); +} + +const double CAdaptiveBucketing::LARGE_ERROR_STANDARD_DEVIATIONS{3.0}; } } diff --git a/lib/maths/CCalendarComponentAdaptiveBucketing.cc b/lib/maths/CCalendarComponentAdaptiveBucketing.cc index 3e518debfc..8bfe17910d 100644 --- a/lib/maths/CCalendarComponentAdaptiveBucketing.cc +++ b/lib/maths/CCalendarComponentAdaptiveBucketing.cc @@ -64,9 +64,7 @@ CCalendarComponentAdaptiveBucketing::CCalendarComponentAdaptiveBucketing( } void CCalendarComponentAdaptiveBucketing::acceptPersistInserter(core::CStatePersistInserter& inserter) const { - inserter.insertLevel(ADAPTIVE_BUCKETING_TAG, - boost::bind(&CAdaptiveBucketing::acceptPersistInserter, - static_cast(this), _1)); + inserter.insertLevel(ADAPTIVE_BUCKETING_TAG, this->getAcceptPersistInserter()); inserter.insertValue(FEATURE_TAG, m_Feature.toDelimited()); core::CPersistUtils::persist(VALUES_TAG, m_Values, inserter); } @@ -77,14 +75,9 @@ void 
CCalendarComponentAdaptiveBucketing::swap(CCalendarComponentAdaptiveBucketi m_Values.swap(other.m_Values); } -bool CCalendarComponentAdaptiveBucketing::initialized() const { - return this->CAdaptiveBucketing::initialized(); -} - bool CCalendarComponentAdaptiveBucketing::initialize(std::size_t n) { double a{0.0}; double b{static_cast(m_Feature.window())}; - if (this->CAdaptiveBucketing::initialize(a, b, n)) { m_Values.clear(); m_Values.resize(this->size()); @@ -93,10 +86,6 @@ bool CCalendarComponentAdaptiveBucketing::initialize(std::size_t n) { return false; } -std::size_t CCalendarComponentAdaptiveBucketing::size() const { - return this->CAdaptiveBucketing::size(); -} - void CCalendarComponentAdaptiveBucketing::clear() { this->CAdaptiveBucketing::clear(); clearAndShrink(m_Values); @@ -112,11 +101,19 @@ void CCalendarComponentAdaptiveBucketing::add(core_t::TTime time, double value, std::size_t bucket{0}; if (this->initialized() && this->bucket(time, bucket)) { this->CAdaptiveBucketing::add(bucket, time, weight); - TFloatMeanVarAccumulator variance{m_Values[bucket]}; - variance.add(value, weight * weight); + + TFloatMeanVarAccumulator moments{m_Values[bucket]}; + double prediction{CBasicStatistics::mean(moments)}; + moments.add(value, weight * weight); + m_Values[bucket].add(value, weight); CBasicStatistics::moment<1>(m_Values[bucket]) = - CBasicStatistics::maximumLikelihoodVariance(variance); + CBasicStatistics::maximumLikelihoodVariance(moments); + if (std::fabs(value - prediction) > + LARGE_ERROR_STANDARD_DEVIATIONS * + std::sqrt(CBasicStatistics::maximumLikelihoodVariance(moments))) { + this->addLargeError(bucket, time); + } } } @@ -124,34 +121,18 @@ CCalendarFeature CCalendarComponentAdaptiveBucketing::feature() const { return m_Feature; } -void CCalendarComponentAdaptiveBucketing::decayRate(double value) { - this->CAdaptiveBucketing::decayRate(value); -} - -double CCalendarComponentAdaptiveBucketing::decayRate() const { - return this->CAdaptiveBucketing::decayRate(); -} - void CCalendarComponentAdaptiveBucketing::propagateForwardsByTime(double time) { if (time < 0.0) { LOG_ERROR(<< "Can't propagate bucketing backwards in time"); } else if (this->initialized()) { - double factor{::exp(-this->CAdaptiveBucketing::decayRate() * time)}; - this->CAdaptiveBucketing::age(factor); + double factor{std::exp(-this->decayRate() * time)}; + this->age(factor); for (auto& value : m_Values) { value.age(factor); } } } -double CCalendarComponentAdaptiveBucketing::minimumBucketLength() const { - return this->CAdaptiveBucketing::minimumBucketLength(); -} - -void CCalendarComponentAdaptiveBucketing::refine(core_t::TTime time) { - this->CAdaptiveBucketing::refine(time); -} - double CCalendarComponentAdaptiveBucketing::count(core_t::TTime time) const { const TFloatMeanVarAccumulator* value = this->value(time); return value ? 
static_cast(CBasicStatistics::count(*value)) : 0.0; @@ -169,14 +150,6 @@ CCalendarComponentAdaptiveBucketing::value(core_t::TTime time) const { return result; } -bool CCalendarComponentAdaptiveBucketing::knots(core_t::TTime time, - CSplineTypes::EBoundaryCondition boundary, - TDoubleVec& knots, - TDoubleVec& values, - TDoubleVec& variances) const { - return this->CAdaptiveBucketing::knots(time, boundary, knots, values, variances); -} - uint64_t CCalendarComponentAdaptiveBucketing::checksum(uint64_t seed) const { seed = this->CAdaptiveBucketing::checksum(seed); seed = CChecksum::calculate(seed, m_Feature); @@ -185,9 +158,9 @@ uint64_t CCalendarComponentAdaptiveBucketing::checksum(uint64_t seed) const { void CCalendarComponentAdaptiveBucketing::debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const { mem->setName("CCalendarComponentAdaptiveBucketing"); - core::CMemoryDebug::dynamicSize("m_Endpoints", - this->CAdaptiveBucketing::endpoints(), mem); - core::CMemoryDebug::dynamicSize("m_Centres", this->CAdaptiveBucketing::centres(), mem); + core::CMemoryDebug::dynamicSize("m_Endpoints", this->endpoints(), mem); + core::CMemoryDebug::dynamicSize("m_Centres", this->centres(), mem); + core::CMemoryDebug::dynamicSize("m_LargeErrorCounts", this->largeErrorCounts(), mem); core::CMemoryDebug::dynamicSize("m_Values", m_Values, mem); } @@ -195,32 +168,11 @@ std::size_t CCalendarComponentAdaptiveBucketing::memoryUsage() const { return this->CAdaptiveBucketing::memoryUsage() + core::CMemory::dynamicSize(m_Values); } -const CCalendarComponentAdaptiveBucketing::TFloatVec& -CCalendarComponentAdaptiveBucketing::endpoints() const { - return this->CAdaptiveBucketing::endpoints(); -} - -double CCalendarComponentAdaptiveBucketing::count() const { - return this->CAdaptiveBucketing::count(); -} - -CCalendarComponentAdaptiveBucketing::TDoubleVec -CCalendarComponentAdaptiveBucketing::values(core_t::TTime time) const { - return this->CAdaptiveBucketing::values(time); -} - -CCalendarComponentAdaptiveBucketing::TDoubleVec -CCalendarComponentAdaptiveBucketing::variances() const { - return this->CAdaptiveBucketing::variances(); -} - bool CCalendarComponentAdaptiveBucketing::acceptRestoreTraverser(core::CStateRestoreTraverser& traverser) { do { const std::string& name{traverser.name()}; RESTORE(ADAPTIVE_BUCKETING_TAG, - traverser.traverseSubLevel( - boost::bind(&CAdaptiveBucketing::acceptRestoreTraverser, - static_cast(this), _1))); + traverser.traverseSubLevel(this->getAcceptRestoreTraverser())); RESTORE(FEATURE_TAG, m_Feature.fromDelimited(traverser.value())); RESTORE(VALUES_TAG, core::CPersistUtils::restore(VALUES_TAG, m_Values, traverser)) } while (traverser.next()); @@ -228,7 +180,7 @@ bool CCalendarComponentAdaptiveBucketing::acceptRestoreTraverser(core::CStateRes return true; } -void CCalendarComponentAdaptiveBucketing::refresh(const TFloatVec& endpoints) { +void CCalendarComponentAdaptiveBucketing::refresh(const TFloatVec& oldEndpoints) { // Values are assigned based on their intersection with each // bucket in the previous configuration. 
The regression and // variance are computed using the appropriate combination @@ -256,70 +208,79 @@ void CCalendarComponentAdaptiveBucketing::refresh(const TFloatVec& endpoints) { using TDoubleMeanVarAccumulator = CBasicStatistics::SSampleMeanVar::TAccumulator; std::size_t m{m_Values.size()}; - std::size_t n{endpoints.size()}; + std::size_t n{oldEndpoints.size()}; if (m + 1 != n) { LOG_ERROR(<< "Inconsistent end points and regressions"); return; } - TFloatVec& m_Endpoints{this->CAdaptiveBucketing::endpoints()}; - TFloatVec& m_Centres{this->CAdaptiveBucketing::centres()}; + const TFloatVec& newEndpoints{this->endpoints()}; + const TFloatVec& oldCentres{this->centres()}; + const TFloatVec& oldLargeErrorCounts{this->largeErrorCounts()}; - TFloatMeanVarVec values; - TFloatVec centres; - values.reserve(m); - centres.reserve(m); + TFloatMeanVarVec newValues; + TFloatVec newCentres; + TFloatVec newLargeErrorCounts; + newValues.reserve(m); + newCentres.reserve(m); + newLargeErrorCounts.reserve(m); for (std::size_t i = 1u; i < n; ++i) { - double yl{m_Endpoints[i - 1]}; - double yr{m_Endpoints[i]}; - std::size_t r = std::lower_bound(endpoints.begin(), endpoints.end(), yr) - - endpoints.begin(); + double yl{newEndpoints[i - 1]}; + double yr{newEndpoints[i]}; + std::size_t r = std::lower_bound(oldEndpoints.begin(), oldEndpoints.end(), yr) - + oldEndpoints.begin(); r = CTools::truncate(r, std::size_t(1), n - 1); - std::size_t l = std::upper_bound(endpoints.begin(), endpoints.end(), yl) - - endpoints.begin(); + std::size_t l = std::upper_bound(oldEndpoints.begin(), oldEndpoints.end(), yl) - + oldEndpoints.begin(); l = CTools::truncate(l, std::size_t(1), r); LOG_TRACE(<< "interval = [" << yl << "," << yr << "]"); LOG_TRACE(<< "l = " << l << ", r = " << r); - LOG_TRACE(<< "[x(l), x(r)] = [" << endpoints[l - 1] << "," << endpoints[r] << "]"); + LOG_TRACE(<< "[x(l), x(r)] = [" << oldEndpoints[l - 1] << "," + << oldEndpoints[r] << "]"); - double xl{endpoints[l - 1]}; - double xr{endpoints[l]}; + double xl{oldEndpoints[l - 1]}; + double xr{oldEndpoints[l]}; if (l == r) { - double interval{m_Endpoints[i] - m_Endpoints[i - 1]}; + double interval{newEndpoints[i] - newEndpoints[i - 1]}; double w{CTools::truncate(interval / (xr - xl), 0.0, 1.0)}; - values.push_back(CBasicStatistics::scaled(m_Values[l - 1], w * w)); - centres.push_back( - CTools::truncate(static_cast(m_Centres[l - 1]), yl, yr)); + newValues.push_back(CBasicStatistics::scaled(m_Values[l - 1], w * w)); + newCentres.push_back( + CTools::truncate(static_cast(oldCentres[l - 1]), yl, yr)); + newLargeErrorCounts.push_back(w * oldLargeErrorCounts[l - 1]); } else { - double interval{xr - m_Endpoints[i - 1]}; + double interval{xr - newEndpoints[i - 1]}; double w{CTools::truncate(interval / (xr - xl), 0.0, 1.0)}; TDoubleMeanVarAccumulator value{CBasicStatistics::scaled(m_Values[l - 1], w)}; TDoubleMeanAccumulator centre{CBasicStatistics::accumulator( w * CBasicStatistics::count(m_Values[l - 1]), - static_cast(m_Centres[l - 1]))}; + static_cast(oldCentres[l - 1]))}; + double largeErrorCount{w * oldLargeErrorCounts[l - 1]}; double count{w * w * CBasicStatistics::count(m_Values[l - 1])}; while (++l < r) { value += m_Values[l - 1]; centre += CBasicStatistics::accumulator( CBasicStatistics::count(m_Values[l - 1]), - static_cast(m_Centres[l - 1])); + static_cast(oldCentres[l - 1])); + largeErrorCount += oldLargeErrorCounts[l - 1]; count += CBasicStatistics::count(m_Values[l - 1]); } - xl = endpoints[l - 1]; - xr = endpoints[l]; - interval = m_Endpoints[i] - xl; + 
xl = oldEndpoints[l - 1]; + xr = oldEndpoints[l]; + interval = newEndpoints[i] - xl; w = CTools::truncate(interval / (xr - xl), 0.0, 1.0); value += CBasicStatistics::scaled(m_Values[l - 1], w); centre += CBasicStatistics::accumulator( w * CBasicStatistics::count(m_Values[l - 1]), - static_cast<double>(m_Centres[l - 1])); + static_cast<double>(oldCentres[l - 1])); + largeErrorCount += w * oldLargeErrorCounts[l - 1]; count += w * w * CBasicStatistics::count(m_Values[l - 1]); double scale{count / CBasicStatistics::count(value)}; - values.push_back(CBasicStatistics::scaled(value, scale)); - centres.push_back(CTools::truncate(CBasicStatistics::mean(centre), yl, yr)); + newValues.push_back(CBasicStatistics::scaled(value, scale)); + newCentres.push_back(CTools::truncate(CBasicStatistics::mean(centre), yl, yr)); + newLargeErrorCounts.push_back(largeErrorCount); } } @@ -328,26 +289,25 @@ void CCalendarComponentAdaptiveBucketing::refresh(const TFloatVec& endpoints) { // that is equal to the number of points they will receive in one // period. double count{0.0}; - for (const auto& value : values) { + for (const auto& value : newValues) { count += CBasicStatistics::count(value); } - count /= (endpoints[m] - endpoints[0]); + count /= (oldEndpoints[m] - oldEndpoints[0]); for (std::size_t i = 0u; i < m; ++i) { - double ci{CBasicStatistics::count(values[i])}; + double ci{CBasicStatistics::count(newValues[i])}; if (ci > 0.0) { - CBasicStatistics::scale(count * (endpoints[i + 1] - endpoints[i]) / ci, - values[i]); + CBasicStatistics::scale( + count * (oldEndpoints[i + 1] - oldEndpoints[i]) / ci, newValues[i]); } } - LOG_TRACE(<< "old endpoints = " << core::CContainerPrinter::print(endpoints)); - LOG_TRACE(<< "old values = " << core::CContainerPrinter::print(m_Values)); - LOG_TRACE(<< "old centres = " << core::CContainerPrinter::print(m_Centres)); - LOG_TRACE(<< "new endpoints = " << core::CContainerPrinter::print(m_Endpoints)); - LOG_TRACE(<< "new value = " << core::CContainerPrinter::print(values)); - LOG_TRACE(<< "new centres = " << core::CContainerPrinter::print(centres)); - m_Values.swap(values); - m_Centres.swap(centres); + LOG_TRACE(<< "old endpoints = " << core::CContainerPrinter::print(oldEndpoints)); + LOG_TRACE(<< "old centres = " << core::CContainerPrinter::print(oldCentres)); + LOG_TRACE(<< "new endpoints = " << core::CContainerPrinter::print(newEndpoints)); + LOG_TRACE(<< "new centres = " << core::CContainerPrinter::print(newCentres)); + m_Values.swap(newValues); + this->centres().swap(newCentres); + this->largeErrorCounts().swap(newLargeErrorCounts); } bool CCalendarComponentAdaptiveBucketing::inWindow(core_t::TTime time) const { @@ -365,7 +325,7 @@ double CCalendarComponentAdaptiveBucketing::offset(core_t::TTime time) const { return static_cast<double>(m_Feature.offset(time)); } -double CCalendarComponentAdaptiveBucketing::count(std::size_t bucket) const { +double CCalendarComponentAdaptiveBucketing::bucketCount(std::size_t bucket) const { return CBasicStatistics::count(m_Values[bucket]); } @@ -378,5 +338,18 @@ double CCalendarComponentAdaptiveBucketing::predict(std::size_t bucket, double CCalendarComponentAdaptiveBucketing::variance(std::size_t bucket) const { return CBasicStatistics::maximumLikelihoodVariance(m_Values[bucket]); } + +void CCalendarComponentAdaptiveBucketing::split(std::size_t bucket) { + // We don't know the fraction of the values (weights) which would + // have fallen in each half of the split bucket. However, some + // fraction of them would ideally not be included in these + // statistics, i.e. the values in the other half of the split. + // If we assume an equal split, but assign a weight of 0.0 to the + // samples which were included in error, we arrive at a multiplier + // of 0.5 x 0.5 = 0.25. + // In practice this simply means we increase the significance + // of new samples for some time, which is reasonable. + CBasicStatistics::scale(0.25, m_Values[bucket]); + m_Values.insert(m_Values.begin() + bucket, m_Values[bucket]); +} } } diff --git a/lib/maths/CSeasonalComponent.cc b/lib/maths/CSeasonalComponent.cc index ecb22ae884..91bc6c9531 100644 --- a/lib/maths/CSeasonalComponent.cc +++ b/lib/maths/CSeasonalComponent.cc @@ -135,8 +135,8 @@ void CSeasonalComponent::shiftLevel(double shift) { m_Bucketing.shiftLevel(shift); } -void CSeasonalComponent::shiftSlope(double shift) { - m_Bucketing.shiftSlope(shift); +void CSeasonalComponent::shiftSlope(core_t::TTime time, double shift) { + m_Bucketing.shiftSlope(time, shift); } void CSeasonalComponent::linearScale(core_t::TTime time, double scale) { diff --git a/lib/maths/CSeasonalComponentAdaptiveBucketing.cc b/lib/maths/CSeasonalComponentAdaptiveBucketing.cc index 4bbbfae67c..1c7b7d1775 100644 --- a/lib/maths/CSeasonalComponentAdaptiveBucketing.cc +++ b/lib/maths/CSeasonalComponentAdaptiveBucketing.cc @@ -111,9 +111,7 @@ operator=(const CSeasonalComponentAdaptiveBucketing& rhs) { void CSeasonalComponentAdaptiveBucketing::acceptPersistInserter(core::CStatePersistInserter& inserter) const { inserter.insertValue(VERSION_6_3_TAG, ""); - inserter.insertLevel(ADAPTIVE_BUCKETING_6_3_TAG, - boost::bind(&CAdaptiveBucketing::acceptPersistInserter, - static_cast<const CAdaptiveBucketing*>(this), _1)); + inserter.insertLevel(ADAPTIVE_BUCKETING_6_3_TAG, this->getAcceptPersistInserter()); inserter.insertLevel(TIME_6_3_TAG, boost::bind(&CSeasonalTimeStateSerializer::acceptPersistInserter, boost::cref(*m_Time), _1)); core::CPersistUtils::persist(BUCKETS_6_3_TAG, m_Buckets, inserter); @@ -125,10 +123,6 @@ void CSeasonalComponentAdaptiveBucketing::swap(CSeasonalComponentAdaptiveBucketi m_Buckets.swap(other.m_Buckets); } -bool CSeasonalComponentAdaptiveBucketing::initialized() const { - return this->CAdaptiveBucketing::initialized(); -} - bool CSeasonalComponentAdaptiveBucketing::initialize(std::size_t n) { double a{0.0}; double b{static_cast<double>( @@ -149,15 +143,11 @@ void CSeasonalComponentAdaptiveBucketing::initialValues(core_t::TTime startTime, this->shiftOrigin(startTime); if (!values.empty()) { this->CAdaptiveBucketing::initialValues(startTime, endTime, values); - this->shiftSlope(-this->slope()); + this->shiftSlope(startTime, -this->slope()); } } } -std::size_t CSeasonalComponentAdaptiveBucketing::size() const { - return this->CAdaptiveBucketing::size(); -} - void CSeasonalComponentAdaptiveBucketing::clear() { this->CAdaptiveBucketing::clear(); clearAndShrink(m_Buckets); @@ -180,9 +170,12 @@ void CSeasonalComponentAdaptiveBucketing::shiftLevel(double shift) { } } -void CSeasonalComponentAdaptiveBucketing::shiftSlope(double shift) { - for (auto& bucket : m_Buckets) { - bucket.s_Regression.shiftGradient(shift); +void CSeasonalComponentAdaptiveBucketing::shiftSlope(core_t::TTime time, double shift) { + if (std::fabs(shift) > 0.0) { + for (auto& bucket : m_Buckets) { + bucket.s_Regression.shiftGradient(shift); + bucket.s_Regression.shiftOrdinate(-shift * m_Time->regression(time)); + } } } @@ -201,6 +194,7 @@ void CSeasonalComponentAdaptiveBucketing::add(core_t::TTime time, return; } + weight = this->adjustedWeight(bucket, weight); this->CAdaptiveBucketing::add(bucket, time, weight); SBucket&
bucket_{m_Buckets[bucket]}; @@ -214,6 +208,11 @@ void CSeasonalComponentAdaptiveBucketing::add(core_t::TTime time, regression.add(t, value, weight); bucket_.s_Variance = CBasicStatistics::maximumLikelihoodVariance(moments); + if (std::fabs(value - prediction) > + LARGE_ERROR_STANDARD_DEVIATIONS * std::sqrt(bucket_.s_Variance)) { + this->addLargeError(bucket, time); + } + if (m_Time->regressionInterval(bucket_.s_FirstUpdate, bucket_.s_LastUpdate) < SUFFICIENT_INTERVAL_TO_ESTIMATE_SLOPE) { double delta{regression.predict(t)}; @@ -234,35 +233,18 @@ const CSeasonalTime& CSeasonalComponentAdaptiveBucketing::time() const { return *m_Time; } -void CSeasonalComponentAdaptiveBucketing::decayRate(double value) { - this->CAdaptiveBucketing::decayRate(value); -} - -double CSeasonalComponentAdaptiveBucketing::decayRate() const { - return this->CAdaptiveBucketing::decayRate(); -} - void CSeasonalComponentAdaptiveBucketing::propagateForwardsByTime(double time, bool meanRevert) { if (time < 0.0) { LOG_ERROR(<< "Can't propagate bucketing backwards in time"); } else if (this->initialized()) { - double factor{std::exp(-this->CAdaptiveBucketing::decayRate() * - m_Time->fractionInWindow() * time)}; - this->CAdaptiveBucketing::age(factor); + double factor{std::exp(-this->decayRate() * m_Time->fractionInWindow() * time)}; + this->age(factor); for (auto& bucket : m_Buckets) { bucket.s_Regression.age(factor, meanRevert); } } } -double CSeasonalComponentAdaptiveBucketing::minimumBucketLength() const { - return this->CAdaptiveBucketing::minimumBucketLength(); -} - -void CSeasonalComponentAdaptiveBucketing::refine(core_t::TTime time) { - this->CAdaptiveBucketing::refine(time); -} - double CSeasonalComponentAdaptiveBucketing::count(core_t::TTime time) const { const TRegression* regression = this->regression(time); return regression ? 
regression->count() : 0.0; @@ -279,14 +261,6 @@ const TRegression* CSeasonalComponentAdaptiveBucketing::regression(core_t::TTime return result; } -bool CSeasonalComponentAdaptiveBucketing::knots(core_t::TTime time, - CSplineTypes::EBoundaryCondition boundary, - TDoubleVec& knots, - TDoubleVec& values, - TDoubleVec& variances) const { - return this->CAdaptiveBucketing::knots(time, boundary, knots, values, variances); -} - double CSeasonalComponentAdaptiveBucketing::slope() const { CBasicStatistics::CMinMax<double> minmax; for (const auto& bucket : m_Buckets) { @@ -309,9 +283,9 @@ uint64_t CSeasonalComponentAdaptiveBucketing::checksum(uint64_t seed) const { void CSeasonalComponentAdaptiveBucketing::debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const { mem->setName("CSeasonalComponentAdaptiveBucketing"); - core::CMemoryDebug::dynamicSize("m_Endpoints", - this->CAdaptiveBucketing::endpoints(), mem); - core::CMemoryDebug::dynamicSize("m_Centres", this->CAdaptiveBucketing::centres(), mem); + core::CMemoryDebug::dynamicSize("m_Endpoints", this->endpoints(), mem); + core::CMemoryDebug::dynamicSize("m_Centres", this->centres(), mem); + core::CMemoryDebug::dynamicSize("m_LargeErrorCounts", this->largeErrorCounts(), mem); core::CMemoryDebug::dynamicSize("m_Buckets", m_Buckets, mem); } @@ -319,33 +293,12 @@ std::size_t CSeasonalComponentAdaptiveBucketing::memoryUsage() const { return this->CAdaptiveBucketing::memoryUsage() + core::CMemory::dynamicSize(m_Buckets); } -const CSeasonalComponentAdaptiveBucketing::TFloatVec& -CSeasonalComponentAdaptiveBucketing::endpoints() const { - return this->CAdaptiveBucketing::endpoints(); -} - -double CSeasonalComponentAdaptiveBucketing::count() const { - return this->CAdaptiveBucketing::count(); -} - -CSeasonalComponentAdaptiveBucketing::TDoubleVec -CSeasonalComponentAdaptiveBucketing::values(core_t::TTime time) const { - return this->CAdaptiveBucketing::values(time); -} - -CSeasonalComponentAdaptiveBucketing::TDoubleVec -CSeasonalComponentAdaptiveBucketing::variances() const { - return this->CAdaptiveBucketing::variances(); -} - bool CSeasonalComponentAdaptiveBucketing::acceptRestoreTraverser(core::CStateRestoreTraverser& traverser) { if (traverser.name() == VERSION_6_3_TAG) { while (traverser.next()) { const std::string& name{traverser.name()}; RESTORE(ADAPTIVE_BUCKETING_6_3_TAG, - traverser.traverseSubLevel( - boost::bind(&CAdaptiveBucketing::acceptRestoreTraverser, - static_cast<CAdaptiveBucketing*>(this), _1))); + traverser.traverseSubLevel(this->getAcceptRestoreTraverser())) RESTORE(TIME_6_3_TAG, traverser.traverseSubLevel(boost::bind( &CSeasonalTimeStateSerializer::acceptRestoreTraverser, boost::ref(m_Time), _1))) @@ -365,9 +318,7 @@ bool CSeasonalComponentAdaptiveBucketing::acceptRestoreTraverser(core::CStateRes do { const std::string& name{traverser.name()}; RESTORE(ADAPTIVE_BUCKETING_OLD_TAG, - traverser.traverseSubLevel( - boost::bind(&CAdaptiveBucketing::acceptRestoreTraverser, - static_cast<CAdaptiveBucketing*>(this), _1))); + traverser.traverseSubLevel(this->getAcceptRestoreTraverser())); RESTORE(TIME_OLD_TAG, traverser.traverseSubLevel(boost::bind( &CSeasonalTimeStateSerializer::acceptRestoreTraverser, boost::ref(m_Time), _1))) @@ -430,13 +381,16 @@ void CSeasonalComponentAdaptiveBucketing::refresh(const TFloatVec& oldEndpoints) return; } - const TFloatVec& newEndpoints{this->CAdaptiveBucketing::endpoints()}; - const TFloatVec& oldCentres{this->CAdaptiveBucketing::centres()}; + const TFloatVec& newEndpoints{this->endpoints()}; + const TFloatVec& oldCentres{this->centres()}; + const
TFloatVec& oldLargeErrorCounts{this->largeErrorCounts()}; TBucketVec buckets; TFloatVec newCentres; + TFloatVec newLargeErrorCounts; buckets.reserve(m); newCentres.reserve(m); + newLargeErrorCounts.reserve(m); for (std::size_t i = 1u; i < n; ++i) { double yl{newEndpoints[i - 1]}; @@ -464,6 +418,7 @@ void CSeasonalComponentAdaptiveBucketing::refresh(const TFloatVec& oldEndpoints) bucket.s_FirstUpdate, bucket.s_LastUpdate); newCentres.push_back( CTools::truncate(static_cast<double>(oldCentres[l - 1]), yl, yr)); + newLargeErrorCounts.push_back(w * oldLargeErrorCounts[l - 1]); } else { double interval{xr - newEndpoints[i - 1]}; double w{CTools::truncate(interval / (xr - xl), 0.0, 1.0)}; @@ -478,6 +433,7 @@ void CSeasonalComponentAdaptiveBucketing::refresh(const TFloatVec& oldEndpoints) lastUpdate.add(bucket->s_LastUpdate); TDoubleMeanAccumulator centre{CBasicStatistics::accumulator( w * bucket->s_Regression.count(), static_cast<double>(oldCentres[l - 1]))}; + double largeErrorCount{w * oldLargeErrorCounts[l - 1]}; double count{w * w * bucket->s_Regression.count()}; while (++l < r) { bucket = &m_Buckets[l - 1]; @@ -489,6 +445,7 @@ lastUpdate.add(bucket->s_LastUpdate); centre += CBasicStatistics::accumulator( bucket->s_Regression.count(), static_cast<double>(oldCentres[l - 1])); + largeErrorCount += oldLargeErrorCounts[l - 1]; count += bucket->s_Regression.count(); } xl = oldEndpoints[l - 1]; @@ -504,12 +461,14 @@ lastUpdate.add(bucket->s_LastUpdate); centre += CBasicStatistics::accumulator( w * bucket->s_Regression.count(), static_cast<double>(oldCentres[l - 1])); + largeErrorCount += w * oldLargeErrorCounts[l - 1]; count += w * w * bucket->s_Regression.count(); double scale{count == regression.count() ? 1.0 : count / regression.count()}; buckets.emplace_back(regression.scaled(scale), CBasicStatistics::maximumLikelihoodVariance(variance), firstUpdate[0], lastUpdate[0]); newCentres.push_back(CTools::truncate(CBasicStatistics::mean(centre), yl, yr)); + newLargeErrorCounts.push_back(largeErrorCount); } } @@ -535,7 +494,8 @@ void CSeasonalComponentAdaptiveBucketing::refresh(const TFloatVec& oldEndpoints) LOG_TRACE(<< "new endpoints = " << core::CContainerPrinter::print(newEndpoints)); LOG_TRACE(<< "new centres = " << core::CContainerPrinter::print(newCentres)); m_Buckets.swap(buckets); - this->CAdaptiveBucketing::centres().swap(newCentres); + this->centres().swap(newCentres); + this->largeErrorCounts().swap(newLargeErrorCounts); } bool CSeasonalComponentAdaptiveBucketing::inWindow(core_t::TTime time) const { @@ -560,7 +520,7 @@ double CSeasonalComponentAdaptiveBucketing::offset(core_t::TTime time) const { return m_Time->periodic(time); } -double CSeasonalComponentAdaptiveBucketing::count(std::size_t bucket) const { +double CSeasonalComponentAdaptiveBucketing::bucketCount(std::size_t bucket) const { return m_Buckets[bucket].s_Regression.count(); } @@ -596,6 +556,19 @@ double CSeasonalComponentAdaptiveBucketing::variance(std::size_t bucket) const { return m_Buckets[bucket].s_Variance; } +void CSeasonalComponentAdaptiveBucketing::split(std::size_t bucket) { + // We don't know the fraction of the values (weights) which would + // have fallen in each half of the split bucket. However, some + // fraction of them would ideally not be included in these + // models, i.e. the values in the other half of the split. + // If we assume an equal split, but assign a weight of 0.0 to the + // samples which were included in error, we arrive at a multiplier + // of 0.5 x 0.5 = 0.25. + // In practice this simply means we increase the significance + // of new samples for some time, which is reasonable. + m_Buckets[bucket].s_Regression.scale(0.25); + m_Buckets.insert(m_Buckets.begin() + bucket, m_Buckets[bucket]); +} + double CSeasonalComponentAdaptiveBucketing::observedInterval(core_t::TTime time) const { return m_Time->regressionInterval( std::min_element(m_Buckets.begin(), m_Buckets.end(), diff --git a/lib/maths/CTimeSeriesDecompositionDetail.cc b/lib/maths/CTimeSeriesDecompositionDetail.cc index c9c9190c60..fa05d66e17 100644 --- a/lib/maths/CTimeSeriesDecompositionDetail.cc +++ b/lib/maths/CTimeSeriesDecompositionDetail.cc @@ -1748,7 +1748,7 @@ void CTimeSeriesDecompositionDetail::CComponents::canonicalize(core_t::TTime tim windowSlopes[component.time().window()] += si; } else { slope += si; - component.shiftSlope(-si); + component.shiftSlope(time, -si); } } } @@ -1761,12 +1761,12 @@ void CTimeSeriesDecompositionDetail::CComponents::canonicalize(core_t::TTime tim for (auto& component : seasonal) { if (component.slopeAccurate(time) && component.time().windowed()) { - component.shiftSlope(-windowedSlope.signMargin()); + component.shiftSlope(time, -windowedSlope.signMargin()); } } if (slope != 0.0) { - m_Trend.shiftSlope(m_DecayRate, slope); + m_Trend.shiftSlope(time, slope); } } } diff --git a/lib/maths/CTools.cc b/lib/maths/CTools.cc index 925c2f8c18..8f0bd0978b 100644 --- a/lib/maths/CTools.cc +++ b/lib/maths/CTools.cc @@ -1908,6 +1908,18 @@ double CTools::shiftRight(double x, double eps) { return (x < 0.0 ? 1.0 - eps : 1.0 + eps) * x; } +double CTools::linearlyInterpolate(double a, double b, double fa, double fb, double x) { + if (x <= a) { + return fa; + } + if (x >= b) { + return fb; + } + double wa{(b - x) / (b - a)}; + double wb{(x - a) / (b - a)}; + return wa * fa + wb * fb; + } + double CTools::powOneMinusX(double x, double p) { // For large p, // (1 - x) ^ p ~= exp(-p * x). diff --git a/lib/maths/CTrendComponent.cc b/lib/maths/CTrendComponent.cc index 56c278d9ac..ec412d3bc4 100644 --- a/lib/maths/CTrendComponent.cc +++ b/lib/maths/CTrendComponent.cc @@ -201,10 +201,19 @@ void CTrendComponent::shiftOrigin(core_t::TTime time) { } } -void CTrendComponent::shiftSlope(double decayRate, double shift) { - for (std::size_t i = 0u; i < NUMBER_MODELS; ++i) { - double shift_{std::min(m_DefaultDecayRate * TIME_SCALES[i] / decayRate, 1.0) * shift}; - m_TrendModels[i].s_Regression.shiftGradient(shift_); +void CTrendComponent::shiftSlope(core_t::TTime time, double shift) { + // This shifts all the models' gradients by a fixed amount whilst + // keeping the prediction at the supplied time fixed. This avoids a + // jump in the current predicted value as a result of adjusting the + // gradient; otherwise, the prediction would change by "shift" x + // "scaled time since the regression origin". + if (std::fabs(shift) > 0.0) { + for (std::size_t i = 0u; i < NUMBER_MODELS; ++i) { + double modelDecayRate{m_DefaultDecayRate * TIME_SCALES[i]}; + double shift_{std::min(modelDecayRate / m_TargetDecayRate, 1.0) * shift}; + m_TrendModels[i].s_Regression.shiftGradient(shift_); + m_TrendModels[i].s_Regression.shiftOrdinate( + -shift_ * scaleTime(time, m_RegressionOrigin)); + } } } @@ -244,12 +253,12 @@ void CTrendComponent::add(core_t::TTime time, double value, double weight) { modelWeight(m_TargetDecayRate, m_DefaultDecayRate * TIME_SCALES[i])); } - // Update the models.
- if (m_FirstUpdate == UNSET_TIME) { m_RegressionOrigin = CIntegerTools::floor(time, core::constants::WEEK); } + // Update the models. + double prediction{CBasicStatistics::mean(this->value(time, 0.0))}; double count{this->count()}; @@ -262,8 +271,8 @@ void CTrendComponent::add(core_t::TTime time, double value, double weight) { double scaledTime{scaleTime(time, m_RegressionOrigin)}; for (auto& model : m_TrendModels) { - model.s_Regression.add(scaledTime, value, weight); model.s_ResidualMoments.add(value - model.s_Regression.predict(scaledTime, MAX_CONDITION)); + model.s_Regression.add(scaledTime, value, weight); } m_ValueMoments.add(value); @@ -387,16 +396,26 @@ void CTrendComponent::forecast(core_t::TTime startTime, TMatrixVec modelCovariances(NUMBER_MODELS); TDoubleVec residualVariances(NUMBER_MODELS); for (std::size_t i = 0u; i < NUMBER_MODELS; ++i) { + // Note in the following we multiply the squared bias by the sample + // count when estimating the regression coefficients' covariance + // matrix. This is because, unlike the noise-like component of the + // residual, it is overly optimistic to assume that the biases will + // cancel in the estimates of the regression coefficients when + // adding multiple samples. Instead, we make the most pessimistic + // assumption that this term comes from a fixed random variable and + // so multiply its variance by the number of samples to undo the + // scaling applied in the regression model covariance method. const SModel& model{m_TrendModels[i]}; + double n{model.s_Regression.count()}; + double bias{CBasicStatistics::mean(model.s_ResidualMoments)}; + double variance{CBasicStatistics::variance(model.s_ResidualMoments)}; + residualVariances[i] = CTools::pow2(bias) + variance; model.s_Regression.parameters(models[i], MAX_CONDITION); - model.s_Regression.covariances(m_PredictionErrorVariance, + model.s_Regression.covariances(n * CTools::pow2(bias) + variance, modelCovariances[i], MAX_CONDITION); - modelCovariances[i] /= std::max(model.s_Regression.count(), 1.0); - residualVariances[i] = CTools::pow2(CBasicStatistics::mean(model.s_ResidualMoments)) + - CBasicStatistics::variance(model.s_ResidualMoments); - LOG_TRACE("params = " << core::CContainerPrinter::print(models[i])); - LOG_TRACE("covariances = " << modelCovariances[i].toDelimited()) - LOG_TRACE("variances = " << residualVariances[i]); + LOG_TRACE(<< "params = " << core::CContainerPrinter::print(models[i])); + LOG_TRACE(<< "covariances = " << modelCovariances[i].toDelimited()) + LOG_TRACE(<< "variances = " << residualVariances[i]); } LOG_TRACE(<< "long time variance = " << CBasicStatistics::variance(m_ValueMoments)); diff --git a/lib/maths/unittest/CBasicStatisticsTest.cc b/lib/maths/unittest/CBasicStatisticsTest.cc index 5c70ec9037..c11b6f0cf5 100644 --- a/lib/maths/unittest/CBasicStatisticsTest.cc +++ b/lib/maths/unittest/CBasicStatisticsTest.cc @@ -1313,6 +1313,42 @@ void CBasicStatisticsTest::testOrderStatistics() { CPPUNIT_ASSERT_EQUAL( false, ml::core::memory_detail::SDynamicSizeAlwaysZero::value()); } + { + // Test to/from delimited with a callback to persist the values.
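+ // The callbacks are needed because the stored statistic is a pair,
+ // which has no built-in string format: the first lambda below writes
+ // each statistic as a delimited string and the second parses one
+ // token back into a pair.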
+ + using TDoubleDoublePr = std::pair<double, double>; + using TDoubleDoublePrMinAccumulator = + ml::maths::CBasicStatistics::COrderStatisticsStack<TDoubleDoublePr, 2u>; + + TDoubleDoublePrMinAccumulator orig; + orig.add({1.0, 3.2}); + orig.add({3.1, 1.2}); + + auto toDelimited = [](const TDoubleDoublePr& value) { + return ml::core::CStringUtils::typeToStringPrecise( + value.first, ml::core::CIEEE754::E_DoublePrecision) + + ml::maths::CBasicStatistics::EXTERNAL_DELIMITER + + ml::core::CStringUtils::typeToStringPrecise( + value.second, ml::core::CIEEE754::E_DoublePrecision); + }; + std::string delimited{orig.toDelimited(toDelimited)}; + LOG_DEBUG(<< "delimited = " << delimited); + + TDoubleDoublePrMinAccumulator restored; + restored.fromDelimited(delimited, [](const std::string& value, TDoubleDoublePr& result) { + std::size_t pos{value.find(ml::maths::CBasicStatistics::EXTERNAL_DELIMITER)}; + return ml::core::CStringUtils::stringToType(value.substr(0, pos), + result.first) && + ml::core::CStringUtils::stringToType(value.substr(pos + 1), + result.second); + }); + + CPPUNIT_ASSERT_EQUAL(delimited, restored.toDelimited(toDelimited)); + CPPUNIT_ASSERT_DOUBLES_EQUAL(orig[0].first, restored[0].first, 1e-15); + CPPUNIT_ASSERT_DOUBLES_EQUAL(orig[0].second, restored[0].second, 1e-15); + CPPUNIT_ASSERT_DOUBLES_EQUAL(orig[1].first, restored[1].first, 1e-15); + CPPUNIT_ASSERT_DOUBLES_EQUAL(orig[1].second, restored[1].second, 1e-15); + } } void CBasicStatisticsTest::testMinMax() { diff --git a/lib/maths/unittest/CForecastTest.cc b/lib/maths/unittest/CForecastTest.cc index 45528ae4eb..69e429235e 100644 --- a/lib/maths/unittest/CForecastTest.cc +++ b/lib/maths/unittest/CForecastTest.cc @@ -152,7 +152,7 @@ void CForecastTest::testDailyNoLongTermTrend() { return 40.0 + alpha * y[i / 6] + beta * y[(i / 6 + 1) % y.size()] + noise; }; - this->test(trend, bucketLength, 63, 64.0, 5.0, 0.14); + this->test(trend, bucketLength, 63, 64.0, 5.0, 0.13); } void CForecastTest::testDailyConstantLongTermTrend() { @@ -208,7 +208,7 @@ void CForecastTest::testComplexNoLongTermTrend() { return scale[d] * (20.0 + y[h] + noise); }; - this->test(trend, bucketLength, 63, 24.0, 9.0, 0.14); + this->test(trend, bucketLength, 63, 24.0, 6.0, 0.14); } void CForecastTest::testComplexConstantLongTermTrend() { @@ -225,7 +225,7 @@ void CForecastTest::testComplexConstantLongTermTrend() { scale[d] * (20.0 + y[h] + noise); }; - this->test(trend, bucketLength, 63, 24.0, 10.0, 0.01); + this->test(trend, bucketLength, 63, 24.0, 5.0, 0.01); } void CForecastTest::testComplexVaryingLongTermTrend() { @@ -255,7 +255,7 @@ void CForecastTest::testComplexVaryingLongTermTrend() { return trend_.value(time_) + scale[d] * (20.0 + y[h] + noise); }; - this->test(trend, bucketLength, 63, 4.0, 24.0, 0.05); + this->test(trend, bucketLength, 63, 4.0, 26.0, 0.03); } void CForecastTest::testNonNegative() { @@ -403,7 +403,7 @@ void CForecastTest::testFinancialIndex() { LOG_DEBUG(<< "% out of bounds = " << percentageOutOfBounds); LOG_DEBUG(<< "error = " << maths::CBasicStatistics::mean(error)); - CPPUNIT_ASSERT(percentageOutOfBounds < 52.0); + CPPUNIT_ASSERT(percentageOutOfBounds < 50.0); CPPUNIT_ASSERT(maths::CBasicStatistics::mean(error) < 0.1); } diff --git a/lib/maths/unittest/CSeasonalComponentAdaptiveBucketingTest.cc b/lib/maths/unittest/CSeasonalComponentAdaptiveBucketingTest.cc index 67c340a48a..33adac4020 100644 --- a/lib/maths/unittest/CSeasonalComponentAdaptiveBucketingTest.cc +++ b/lib/maths/unittest/CSeasonalComponentAdaptiveBucketingTest.cc @@ -656,7 +656,7 @@ void
CSeasonalComponentAdaptiveBucketingTest::testSlope() { LOG_DEBUG(<< "slope = " << slopeBefore); CPPUNIT_ASSERT_DOUBLES_EQUAL(7.0, slopeBefore, 0.25); - bucketing.shiftSlope(10.0); + bucketing.shiftSlope(t, 10.0); double slopeAfter = bucketing.slope(); diff --git a/lib/maths/unittest/CTimeSeriesDecompositionTest.cc b/lib/maths/unittest/CTimeSeriesDecompositionTest.cc index cf23d892c9..502adfe3c1 100644 --- a/lib/maths/unittest/CTimeSeriesDecompositionTest.cc +++ b/lib/maths/unittest/CTimeSeriesDecompositionTest.cc @@ -747,8 +747,8 @@ void CTimeSeriesDecompositionTest::testSeasonalOnset() { LOG_DEBUG(<< "total 'sum residual' / 'sum value' = " << totalSumResidual / totalSumValue); LOG_DEBUG(<< "total 'max residual' / 'max value' = " << totalMaxResidual / totalMaxValue); LOG_DEBUG(<< "total 70% error = " << totalPercentileError / totalSumValue); - CPPUNIT_ASSERT(totalSumResidual < 0.053 * totalSumValue); - CPPUNIT_ASSERT(totalMaxResidual < 0.071 * totalMaxValue); + CPPUNIT_ASSERT(totalSumResidual < 0.054 * totalSumValue); + CPPUNIT_ASSERT(totalMaxResidual < 0.068 * totalMaxValue); CPPUNIT_ASSERT(totalPercentileError < 0.02 * totalSumValue); } @@ -1947,7 +1947,7 @@ void CTimeSeriesDecompositionTest::testComponentLifecycle() { debug.addPrediction(time, prediction, trend(time) + noise[0] - prediction); } - double bounds[]{0.01, 0.017, 0.012, 0.072}; + double bounds[]{0.01, 0.026, 0.012, 0.074}; for (std::size_t i = 0; i < 4; ++i) { double error{maths::CBasicStatistics::mean(errors[i])}; LOG_DEBUG(<< "error = " << error);
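A closing note on the 0.25 multiplier used by both split implementations above: assuming the old bucket's values fall equally in each half contributes a factor of 0.5 to the count, and giving zero weight to the half that was attributed to each new bucket in error contributes another 0.5, hence 0.5 x 0.5 = 0.25. A tiny self-contained sketch of the effect on a bucket's count; the Bucket type is an illustrative stand-in, not the real accumulator types:

#include <cstddef>
#include <cstdio>
#include <vector>

// Count-only stand-in for a bucket's moment accumulator.
struct Bucket {
    double count{0.0};
};

int main() {
    std::vector<Bucket> buckets{{100.0}};
    std::size_t i{0};
    // Split bucket i: equal split (x0.5) and zero weight for the samples
    // attributed to the wrong half (x0.5) give a 0.25 count multiplier.
    buckets[i].count *= 0.25;
    buckets.insert(buckets.begin() + i, buckets[i]);
    // Each half now behaves as if it had seen 25 of the 100 historical
    // values, so new samples dominate these statistics sooner.
    std::printf("%.1f %.1f\n", buckets[0].count, buckets[1].count); // 25.0 25.0
    return 0;
}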