[ML] Improve adaptation of the modelling of cyclic components to very localised features #134

1 change: 1 addition & 0 deletions docs/CHANGELOG.asciidoc
@@ -37,6 +37,7 @@ Explicit change point detection and modelling ({pull}92[#92])
Improve partition analysis memory usage ({pull}97[#97])
Reduce model memory by storing state for periodicity testing in a compressed format ({pull}100[#100])
Improve the accuracy of model memory control ({pull}122[#122])
Improve adaptation of the modelling of cyclic components to very localised features ({pull}134[#134])

Forecasting of Machine Learning job time series is now supported for large jobs by temporarily storing
model state on disk ({pull}89[#89])
198 changes: 128 additions & 70 deletions include/maths/CAdaptiveBucketing.h
@@ -61,10 +61,9 @@ namespace maths {
//!
//! For sufficiently smooth functions and a given number of buckets
//! the objective is minimized by ensuring that "bucket width" x
//! "function range" is approximately equal in all buckets.
//! "function range" is equal in all buckets.
//!
//! The bucketing is aged by relaxing it back towards uniform and
//! aging the counts of the mean value for each bucket as usual.
//! The bucketing is aged by relaxing it back towards uniform.
class MATHS_EXPORT CAdaptiveBucketing {
public:
using TDoubleVec = std::vector<double>;
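The class comment above says that, for smooth functions, the averaging error is minimised when "bucket width" x "function range" is equalised across buckets. A self-contained sketch of that quantity, using hypothetical names and a brute-force scan, purely to make the refinement objective concrete (this is not how the class computes it):

```cpp
#include <algorithm>
#include <cstddef>
#include <limits>
#include <vector>

// For a candidate set of bucket end points and paired samples (times[i],
// values[i]), compute "bucket width" x "function range" per bucket. refine()
// moves the end points so these products become roughly equal. Assumes
// endpoints is sorted and non-empty.
std::vector<double> widthTimesRange(const std::vector<double>& endpoints,
                                    const std::vector<double>& times,
                                    const std::vector<double>& values) {
    std::vector<double> result(endpoints.size() - 1, 0.0);
    for (std::size_t bucket = 0; bucket + 1 < endpoints.size(); ++bucket) {
        double min{std::numeric_limits<double>::max()};
        double max{std::numeric_limits<double>::lowest()};
        for (std::size_t i = 0; i < times.size(); ++i) {
            if (times[i] >= endpoints[bucket] && times[i] < endpoints[bucket + 1]) {
                min = std::min(min, values[i]);
                max = std::max(max, values[i]);
            }
        }
        if (max >= min) {
            result[bucket] = (endpoints[bucket + 1] - endpoints[bucket]) * (max - min);
        }
    }
    return result;
}
```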
@@ -73,26 +72,92 @@ class MATHS_EXPORT CAdaptiveBucketing {
using TFloatMeanAccumulatorVec = std::vector<TFloatMeanAccumulator>;

public:
//! Restore by traversing a state document
bool acceptRestoreTraverser(core::CStateRestoreTraverser& traverser);
//! Refine the bucket end points to minimize the maximum averaging
//! error in any bucket.
//!
//! \param[in] time The time at which to refine.
void refine(core_t::TTime time);

//! Persist by passing information to the supplied inserter.
void acceptPersistInserter(core::CStatePersistInserter& inserter) const;
//! Check if the bucketing has been initialized.
bool initialized() const;

//! Get the number of buckets.
std::size_t size() const;

//! Set the rate at which the bucketing loses information.
void decayRate(double value);

//! Get the rate at which the bucketing loses information.
double decayRate() const;

//! Get the minimum permitted bucket length.
double minimumBucketLength() const;

//! Get the bucket end points.
const TFloatVec& endpoints() const;

//! Get the bucket value centres.
const TFloatVec& centres() const;

//! Get the count of large errors in each bucket.
const TFloatVec& largeErrorCounts() const;

//! Get a set of knot points and knot point values to use for
//! interpolating the bucket values.
//!
//! \param[in] time The time at which to get the knot points.
//! \param[in] boundary Controls the style of start and end knots.
//! \param[out] knots Filled in with the knot points to interpolate.
//! \param[out] values Filled in with the values at \p knots.
//! \param[out] variances Filled in with the variances at \p knots.
//! \return True if there are sufficient knot points to interpolate
//! and false otherwise.
bool knots(core_t::TTime time,
CSplineTypes::EBoundaryCondition boundary,
TDoubleVec& knots,
TDoubleVec& values,
TDoubleVec& variances) const;
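knots() returns a compact set of knot points and values which the caller then interpolates (the production code fits a spline via CSplineTypes). A self-contained sketch using plain linear interpolation, with hypothetical names, only to show the role of the knot and value arrays:

```cpp
#include <cstddef>
#include <vector>

// Linearly interpolate bucket values between knot points. Assumes knots is
// sorted, non-empty and the same length as values. Splines are used in
// practice; linear interpolation is shown for illustration only.
double interpolateAtKnots(const std::vector<double>& knots,
                          const std::vector<double>& values,
                          double x) {
    if (x <= knots.front()) {
        return values.front();
    }
    for (std::size_t i = 1; i < knots.size(); ++i) {
        if (x <= knots[i]) {
            double alpha{(x - knots[i - 1]) / (knots[i] - knots[i - 1])};
            return (1.0 - alpha) * values[i - 1] + alpha * values[i];
        }
    }
    return values.back();
}
```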

//! \name Test Functions
//@{
//! Get the total count of values in the bucketing.
double count() const;

//! Get the bucket regressions.
TDoubleVec values(core_t::TTime time) const;

//! Get the bucket variances.
TDoubleVec variances() const;
//@}

protected:
using TRestoreFunc = std::function<bool(core::CStateRestoreTraverser&)>;
using TPersistFunc = std::function<void(core::CStatePersistInserter&)>;

protected:
//! The minimum number of standard deviations for an error to be
//! considered large.
static const double LARGE_ERROR_STANDARD_DEVIATIONS;

protected:
CAdaptiveBucketing(double decayRate, double minimumBucketLength);
//! Construct by traversing a state document.
CAdaptiveBucketing(double decayRate,
double minimumBucketLength,
core::CStateRestoreTraverser& traverser);
virtual ~CAdaptiveBucketing() = default;

//! Get the restore function bound to this object.
TRestoreFunc getAcceptRestoreTraverser();

//! Get the accept persist function bound to this object.
TPersistFunc getAcceptPersistInserter() const;

//! Restore by traversing a state document
bool acceptRestoreTraverser(core::CStateRestoreTraverser& traverser);

//! Persist by passing information to the supplied inserter.
void acceptPersistInserter(core::CStatePersistInserter& inserter) const;

//! Efficiently swap the contents of two bucketing objects.
void swap(CAdaptiveBucketing& other);

//! Check if the bucketing has been initialized.
bool initialized() const;

//! Create a new uniform bucketing with \p n buckets on the
//! interval [\p a, \p b].
//!
@@ -113,75 +178,32 @@
core_t::TTime endTime,
const TFloatMeanAccumulatorVec& values);

//! Get the number of buckets.
std::size_t size() const;

//! Clear the contents of this bucketing and recover any
//! allocated memory.
void clear();

//! Add the function value at \p time.
//!
//! \param[in] bucket The index of the bucket of \p time.
//! \param[in] time The time of \p value.
//! \param[in] weight The weight of function point. The smaller
//! this is the less influence it has on the bucket.
//! \param[in] time The time of the value being added.
//! \param[in] weight The weight of the value being added. The
//! smaller this is the less influence it has on the bucket.
void add(std::size_t bucket, core_t::TTime time, double weight);

//! Set the rate at which the bucketing loses information.
void decayRate(double value);

//! Get the rate at which the bucketing loses information.
double decayRate() const;
//! Add a large error in \p bucket.
void addLargeError(std::size_t bucket, core_t::TTime time);

//! Age the force moments.
void age(double factor);
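The class comment states that the bucketing is aged by relaxing it back towards uniform, at a rate controlled by decayRate(). A hedged sketch of one such scheme, with hypothetical names; the exact updates performed by age() may differ:

```cpp
#include <cstddef>
#include <vector>

// Age a bucketing by a factor in (0, 1]: counts shrink so old data loses
// influence, and interior end points relax towards uniform spacing. This
// illustrates the idea only, not the exact formulas used here.
void ageSketch(std::vector<double>& endpoints, std::vector<double>& counts, double factor) {
    for (auto& count : counts) {
        count *= factor;
    }
    double a{endpoints.front()};
    double b{endpoints.back()};
    std::size_t n{endpoints.size() - 1};
    for (std::size_t i = 1; i < n; ++i) {
        double uniform{a + (b - a) * static_cast<double>(i) / static_cast<double>(n)};
        endpoints[i] = factor * endpoints[i] + (1.0 - factor) * uniform;
    }
}
```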

//! Get the minimum permitted bucket length.
double minimumBucketLength() const;

//! Refine the bucket end points to minimize the maximum averaging
//! error in any bucket.
//!
//! \param[in] time The time at which to refine.
void refine(core_t::TTime time);

//! Get a set of knot points and knot point values to use for
//! interpolating the bucket values.
//!
//! \param[in] time The time at which to get the knot points.
//! \param[in] boundary Controls the style of start and end knots.
//! \param[out] knots Filled in with the knot points to interpolate.
//! \param[out] values Filled in with the values at \p knots.
//! \param[out] variances Filled in with the variances at \p knots.
//! \return True if there are sufficient knot points to interpolate
//! and false otherwise.
bool knots(core_t::TTime time,
CSplineTypes::EBoundaryCondition boundary,
TDoubleVec& knots,
TDoubleVec& values,
TDoubleVec& variances) const;

//! Get the bucket end points.
const TFloatVec& endpoints() const;

//! Get the bucket end points.
TFloatVec& endpoints();

//! Get the bucket value centres.
const TFloatVec& centres() const;

//! Get the bucket value centres.
TFloatVec& centres();

//! Get the total count of values in the bucketing.
double count() const;

//! Get the bucket regressions.
TDoubleVec values(core_t::TTime time) const;
//! Get the count of large errors in each bucket.
TFloatVec& largeErrorCounts();

//! Get the bucket variances.
TDoubleVec variances() const;
//! Adjust \p weight for significant large error counts.
double adjustedWeight(std::size_t bucket, double weight) const;

//! Compute the index of the bucket to which \p time belongs
bool bucket(core_t::TTime time, std::size_t& result) const;
@@ -192,6 +214,10 @@
//! Get the memory used by this component
std::size_t memoryUsage() const;

private:
using TFloatUInt32Pr = std::pair<CFloatStorage, std::uint32_t>;
using TFloatUInt32PrMinAccumulator = CBasicStatistics::SMin<TFloatUInt32Pr, 2>::TAccumulator;

private:
//! Compute the values corresponding to the change in end
//! points from \p endpoints. The values are assigned based
@@ -208,15 +234,25 @@
//! Get the offset w.r.t. the start of the bucketing of \p time.
virtual double offset(core_t::TTime time) const = 0;

//! The count in \p bucket.
virtual double count(std::size_t bucket) const = 0;
//! Get the count in \p bucket.
virtual double bucketCount(std::size_t bucket) const = 0;

//! Get the predicted value for the \p bucket at \p time.
//! Get the predicted value for \p bucket at \p time.
virtual double predict(std::size_t bucket, core_t::TTime time, double offset) const = 0;

//! Get the variance of \p bucket.
virtual double variance(std::size_t bucket) const = 0;

//! Implement the split of \p bucket in the derived class's state.
virtual void split(std::size_t bucket) = 0;

//! Check if there is evidence of systematically large errors in a
//! bucket and split it if there is.
void maybeSplitBucket();

//! Split \p bucket.
void splitBucket(std::size_t bucket);
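Putting the new pieces together: an error more than LARGE_ERROR_STANDARD_DEVIATIONS standard deviations from the prediction is counted against its bucket (addLargeError), and a bucket whose count looks systematically high is split in two (maybeSplitBucket / splitBucket). A self-contained sketch of that flow; the simple ratio test below stands in for the p-value based significance the class actually tracks in m_LargeErrorCountSignificances, and the threshold value is assumed:

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// Assumed threshold; the real constant is LARGE_ERROR_STANDARD_DEVIATIONS.
constexpr double LARGE_ERROR_SDS{3.0};

// An error is "large" if it is more than LARGE_ERROR_SDS standard deviations
// from the prediction.
bool isLargeError(double error, double sigma) {
    return std::fabs(error) > LARGE_ERROR_SDS * sigma;
}

// Return the index of a bucket whose large error count looks systematically
// high, or counts.size() if none does. A ratio test is used purely for
// illustration; the real code computes significances of the counts.
std::size_t bucketToSplit(const std::vector<double>& largeErrorCounts, double significantRatio) {
    double total{0.0};
    for (double count : largeErrorCounts) {
        total += count;
    }
    double mean{total / static_cast<double>(largeErrorCounts.size())};
    for (std::size_t i = 0; i < largeErrorCounts.size(); ++i) {
        if (largeErrorCounts[i] > significantRatio * mean) {
            return i;
        }
    }
    return largeErrorCounts.size();
}
```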

private:
//! The rate at which information is aged out of the bucket values.
double m_DecayRate;
@@ -225,12 +261,34 @@
//! is ignored.
double m_MinimumBucketLength;

//! The desired number of buckets. We can use more if we determine
//! that we aren't capturing the periodic pattern effectively.
//!
//! \see maybeSplitBucket for details.
std::size_t m_TargetSize = 0;

//! The bucket of the last large error added.
std::size_t m_LastLargeErrorBucket = 0;

//! The period of the last large error added.
core_t::TTime m_LastLargeErrorPeriod = 0;

//! The p-values of the most significant large error counts.
TFloatUInt32PrMinAccumulator m_LargeErrorCountSignificances;

//! The mean weight of values added.
TFloatMeanAccumulator m_MeanWeight;

//! The bucket end points.
TFloatVec m_Endpoints;

//! The mean periodic time of each regression.
//! The mean offset (relative to the start of the bucket) of samples
//! in each bucket.
TFloatVec m_Centres;

//! The count of large errors in each bucket.
TFloatVec m_LargeErrorCounts;

//! An IIR low pass filter for the total desired end point displacement
//! in refine.
TFloatMeanAccumulator m_MeanDesiredDisplacement;
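For intuition on m_LargeErrorCountSignificances above: a natural way to measure whether a bucket's large error count is significant is the one-sided binomial tail probability of seeing at least that many large errors by chance. The sketch below is self-contained and illustrative only; the exact test used by the implementation may differ.

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>

// P(X >= k) for X ~ Binomial(n, p), computed via log-gamma for stability.
// Small values mean the observed number of large errors is unlikely to be
// chance, i.e. the bucket's errors are systematically large. Assumes 0 < p < 1.
double largeErrorCountPValue(std::size_t n, std::size_t k, double p) {
    double result{0.0};
    for (std::size_t j = k; j <= n; ++j) {
        double logTerm{std::lgamma(n + 1.0) - std::lgamma(j + 1.0) - std::lgamma(n - j + 1.0) +
                       static_cast<double>(j) * std::log(p) +
                       static_cast<double>(n - j) * std::log(1.0 - p)};
        result += std::exp(logTerm);
    }
    return std::min(result, 1.0);
}
```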
22 changes: 18 additions & 4 deletions include/maths/CBasicStatistics.h
@@ -1032,6 +1032,8 @@ class MATHS_EXPORT CBasicStatistics {
using const_iterator = typename CONTAINER::const_iterator;
using reverse_iterator = typename CONTAINER::reverse_iterator;
using const_reverse_iterator = typename CONTAINER::const_reverse_iterator;
using TToString = std::function<std::string(const T&)>;
using TFromString = std::function<bool(const std::string&, T&)>;

public:
COrderStatisticsImpl(const CONTAINER& statistics, const LESS& less)
@@ -1043,8 +1045,20 @@
//! Initialize from a delimited string.
bool fromDelimited(const std::string& value);

//! Initialize from a delimited string using \p fromString to initialize
//! values of type T from a string.
//!
//! \warning \p fromString must not use CBasicStatistics::INTERNAL_DELIMITER.
bool fromDelimited(const std::string& value, const TFromString& fromString);

//! Convert to a delimited string.
std::string toDelimited() const;

//! Convert to a delimited string using \p toString to convert individual
//! values of type T to a string.
//!
//! \warning \p toString must not use CBasicStatistics::INTERNAL_DELIMITER.
std::string toDelimited(const TToString& toString) const;
//@}
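These overloads exist so accumulators whose value type has no built-in string conversion (such as the pair type used for m_LargeErrorCountSignificances) can still be persisted. A hedged sketch of how they might be used, assuming the SMin accumulator inherits these members from COrderStatisticsImpl as the surrounding context suggests; the pair delimiter '/' is an assumption and only needs to differ from CBasicStatistics::INTERNAL_DELIMITER:

```cpp
#include <maths/CBasicStatistics.h>

#include <cstddef>
#include <cstdint>
#include <string>
#include <utility>

namespace {
// Delimiter used inside each serialised pair; assumed to differ from
// CBasicStatistics::INTERNAL_DELIMITER.
constexpr char PAIR_DELIMITER{'/'};

using TDoubleUInt32Pr = std::pair<double, std::uint32_t>;
using TMinAccumulator = ml::maths::CBasicStatistics::SMin<TDoubleUInt32Pr, 2>::TAccumulator;

std::string persist(const TMinAccumulator& accumulator) {
    return accumulator.toDelimited([](const TDoubleUInt32Pr& value) {
        return std::to_string(value.first) + PAIR_DELIMITER + std::to_string(value.second);
    });
}

bool restore(const std::string& state, TMinAccumulator& accumulator) {
    return accumulator.fromDelimited(state, [](const std::string& token, TDoubleUInt32Pr& value) {
        std::size_t pos{token.find(PAIR_DELIMITER)};
        if (pos == std::string::npos) {
            return false;
        }
        value.first = std::stod(token.substr(0, pos));
        value.second = static_cast<std::uint32_t>(std::stoul(token.substr(pos + 1)));
        return true;
    });
}
}
```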

//! \name Update
@@ -1367,15 +1381,15 @@ class MATHS_EXPORT CBasicStatistics {
//! \name Accumulator Typedefs
//@{
//! Accumulator object to compute the sample maximum.
template<typename T>
template<typename T, std::size_t N = 1>
struct SMax {
using TAccumulator = COrderStatisticsStack<T, 1, std::greater<T>>;
using TAccumulator = COrderStatisticsStack<T, N, std::greater<T>>;
};

//! Accumulator object to compute the sample minimum.
template<typename T>
template<typename T, std::size_t N = 1>
struct SMin {
using TAccumulator = COrderStatisticsStack<T, 1>;
using TAccumulator = COrderStatisticsStack<T, N>;
};
//@}
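The new N parameter on SMin/SMax lets callers keep the N smallest or largest values rather than just one, which is what CAdaptiveBucketing's TFloatUInt32PrMinAccumulator (SMin<TFloatUInt32Pr, 2>) relies on. A hedged usage sketch, assuming the order statistics accumulators expose add() and operator[] with the best element at index 0, as they are used elsewhere in the code base:

```cpp
#include <maths/CBasicStatistics.h>

#include <cstddef>
#include <utility>

void minTwoExample() {
    using TDoubleSizePr = std::pair<double, std::size_t>;
    // Keep the two lexicographically smallest (value, index) pairs seen so far.
    using TMin2Accumulator = ml::maths::CBasicStatistics::SMin<TDoubleSizePr, 2>::TAccumulator;

    TMin2Accumulator min2;
    min2.add(TDoubleSizePr{3.1, 0});
    min2.add(TDoubleSizePr{0.7, 1});
    min2.add(TDoubleSizePr{1.9, 2});
    // Expect min2[0] == {0.7, 1} and min2[1] == {1.9, 2}.
}
```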
