Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ML] Reduce false positives for the periodic component test for anomaly detection #1177

Merged
merged 9 commits into from May 4, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 2 additions & 5 deletions docs/CHANGELOG.asciidoc
Expand Up @@ -62,20 +62,17 @@
operations. (See {ml-pull}1142[#1142].)
* Fix spurious anomalies for count and sum functions after no data are received for long
periods of time. (See {ml-pull}1158[#1158].)
* Improve false positive rates from periodicity test for time series anomaly detection.
(See {ml-pull}1177[#1177].)
* Break progress reporting of data frame analyses into multiple phases. (See {ml-pull}1179[#1179].)

== {es} version 7.8.0

=== Bug Fixes

* Trap and fail if insufficient features are supplied to data frame analyses. This
caused classification and regression getting stuck at zero progress analyzing.
(See {ml-pull}1160[#1160], issue: {issue}55593[#55593].)
* Make categorization respect the `model_memory_limit`. (See {ml-pull}1167[#1167],
issue: {ml-issue}1130[#1130].)

=== Bug Fixes

* Fix underlying cause for "Failed to calculate splitting significance" log errors.
(See {ml-pull}1157[#1157].)

Expand Down
4 changes: 2 additions & 2 deletions include/maths/CPeriodicityHypothesisTests.h
Expand Up @@ -441,7 +441,7 @@ class MATHS_EXPORT CPeriodicityHypothesisTests final {
STestStats& stats,
double& R,
double& meanRepeats,
double& pVariance,
double& truthVariance,
const TSizeVec& segmentation = TSizeVec{}) const;

//! Run the component amplitude test on the alternative hypothesis.
Expand All @@ -452,7 +452,7 @@ class MATHS_EXPORT CPeriodicityHypothesisTests final {
double v,
double R,
double meanRepeats,
double pVariance,
double truthVariance,
STestStats& stats) const;

private:
Expand Down
167 changes: 100 additions & 67 deletions lib/maths/CPeriodicityHypothesisTests.cc
Expand Up @@ -69,6 +69,44 @@ const std::size_t MINIMUM_REPEATS_TO_TEST_AMPLITUDE{4};
//! A high priority for components we want to take precedence.
double HIGH_PRIORITY{2.0};

//! \brief Fuzzy logical expression with multiplicative AND.
//!
//! DESCRIPTION:
//! This isn't strictly a fuzzy logical expression since we don't ensure
//! that the range of truth values is [0,1]. In fact, we arrange for TRUE
//! to correspond to value > 1. We roll in an implicit threshold such that
//! if individual conditions have values > 0.5 then the expression (just)
//! maps to true.
class CFuzzyExpression {
public:
    explicit CFuzzyExpression(double value = 0.0) : m_Value{value} {}

    //! True iff the truth value exceeds 1. Marked explicit so the wrapper
    //! cannot silently convert to a number in arithmetic contexts; it still
    //! participates in contextual conversions (if, &&, ||, ?:).
    explicit operator bool() const { return m_Value > 1.0; }

    //! Order expressions by truth value (used with std::max).
    bool operator<(const CFuzzyExpression& rhs) const {
        return m_Value < rhs.m_Value;
    }

    //! The underlying (unbounded) truth value.
    double truthValue() const { return m_Value; }

    //! Multiplicative AND: the combined truth value is the product of the
    //! operands' truth values, so two (just) true conditions (> 1 each)
    //! combine to a true expression.
    friend CFuzzyExpression operator&&(const CFuzzyExpression& lhs,
                                       const CFuzzyExpression& rhs) {
        return CFuzzyExpression{lhs.m_Value * rhs.m_Value};
    }

private:
    //! The truth value; TRUE corresponds to > 1.
    double m_Value;
};

//! Fuzzy check if \p value is greater than \p threshold, with the
//! sharpness of the transition controlled by \p margin.
CFuzzyExpression softGreaterThan(double value, double threshold, double margin) {
    double truth{CTools::logisticFunction(value, margin, threshold, +1.0)};
    return CFuzzyExpression{2.0 * truth};
}

//! Fuzzy check if \p value is less than \p threshold, with the
//! sharpness of the transition controlled by \p margin.
CFuzzyExpression softLessThan(double value, double threshold, double margin) {
    double truth{CTools::logisticFunction(value, margin, threshold, -1.0)};
    return CFuzzyExpression{2.0 * truth};
}

//! \brief Accumulates the minimum amplitude.
class CMinAmplitude {
public:
Expand Down Expand Up @@ -1231,26 +1269,27 @@ CPeriodicityHypothesisTests::best(const TNestedHypothesesVec& hypotheses) const
TMinAccumulator vmin;
TMinAccumulator DFmin;
for (const auto& summary : summaries) {
vmin.add(varianceAtPercentile(summary.s_V, summary.s_DF,
50.0 + CONFIDENCE_INTERVAL / 2.0) /
summary.s_VarianceThreshold);
double v{varianceAtPercentile(summary.s_V, summary.s_DF,
50.0 + CONFIDENCE_INTERVAL / 2.0)};
vmin.add(v == summary.s_VarianceThreshold ? 1.0 : v / summary.s_VarianceThreshold);
DFmin.add(summary.s_DF);
}

TMinAccumulator pmin;
TMinAccumulator minMinusTruth;
for (const auto& summary : summaries) {
double v{varianceAtPercentile(summary.s_V, summary.s_DF,
50.0 - CONFIDENCE_INTERVAL / 2.0) /
summary.s_VarianceThreshold / vmin[0]};
double R{summary.s_R / summary.s_AutocorrelationThreshold};
double DF{summary.s_DF / DFmin[0]};
double p{CTools::logisticFunction(v, 0.2, 1.0, -1.0) *
CTools::logisticFunction(R, 0.2, 1.0, +1.0) *
CTools::logisticFunction(DF, 0.2, 1.0, +1.0) *
CTools::logisticFunction(summary.s_TrendSegments, 0.3, 0.0, -1.0) *
CTools::logisticFunction(summary.s_ScaleSegments, 0.3, 0.0, -1.0)};
LOG_TRACE(<< "p = " << p);
if (pmin.add(-p)) {
50.0 - CONFIDENCE_INTERVAL / 2.0)};
v = v == summary.s_VarianceThreshold * vmin[0]
? 1.0
: v / summary.s_VarianceThreshold / vmin[0];
double truth{(softLessThan(v, 1.0, 0.2) &&
softGreaterThan(summary.s_R, summary.s_AutocorrelationThreshold, 0.1) &&
softGreaterThan(summary.s_DF / DFmin[0], 1.0, 0.2) &&
softLessThan(summary.s_TrendSegments, 0.0, 0.3) &&
softLessThan(summary.s_ScaleSegments, 0.0, 0.3))
.truthValue()};
LOG_TRACE(<< "truth(hypothesis) = " << truth);
if (minMinusTruth.add(-truth)) {
result = summary.s_H;
}
}
Expand Down Expand Up @@ -1718,11 +1757,11 @@ bool CPeriodicityHypothesisTests::testPeriod(const TTimeTimePr2Vec& windows,

double R;
double meanRepeats;
double pVariance;
double truthVariance;
return this->testVariance(window, values, period_, df1, v1, stats, R,
meanRepeats, pVariance) ||
meanRepeats, truthVariance) ||
this->testAmplitude(window, values, period_, b, v, R, meanRepeats,
pVariance, stats);
truthVariance, stats);
}

bool CPeriodicityHypothesisTests::testPeriodWithScaling(const TTimeTimePr2Vec& windows,
Expand Down Expand Up @@ -1855,9 +1894,9 @@ bool CPeriodicityHypothesisTests::testPeriodWithScaling(const TTimeTimePr2Vec& w

double R;
double meanRepeats;
double pVariance;
double truthVariance;
return this->testVariance({{0, length(windows)}}, values, period_, df1, v1,
stats, R, meanRepeats, pVariance, segmentation);
stats, R, meanRepeats, truthVariance, segmentation);
}

bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec& partition,
Expand Down Expand Up @@ -2062,7 +2101,7 @@ bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec& partition
return CBasicStatistics::mean(result);
};

double p{0.0};
CFuzzyExpression correlationCondition;
double R{-1.0};

TFloatMeanAccumulatorVec partitionValues;
Expand All @@ -2084,27 +2123,23 @@ bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec& partition
}

double meanRepeats{calculateMeanRepeats(window, period_)};
double relativeMeanRepeats{meanRepeats / MINIMUM_REPEATS_TO_TEST_VARIANCE};
LOG_TRACE(<< " relative mean repeats = " << relativeMeanRepeats);

p = std::max(
p, CTools::logisticFunction(RW / stats.s_AutocorrelationThreshold, 0.15, 1.0) *
CTools::logisticFunction(relativeMeanRepeats, 0.25, 1.0));
double meanRepeatsPerSegment{meanRepeats / std::max(stats.s_TrendSegments, 1.0) /
MINIMUM_REPEATS_TO_TEST_VARIANCE};
LOG_TRACE(<< " mean repeats per segment = " << meanRepeatsPerSegment);

correlationCondition =
std::max(correlationCondition,
softGreaterThan(R, stats.s_AutocorrelationThreshold, 0.1) &&
softGreaterThan(meanRepeatsPerSegment, 1.0, 0.2));
R = std::max(R, RW);
}

double relativeLogSignificance{
double logSignificance{
CTools::fastLog(CStatisticalTests::leftTailFTest(v1 / v0, df1, df0)) /
LOG_COMPONENT_STATISTICALLY_SIGNIFICANCE};
double meanRepeats{calculateMeanRepeats({{0, windowLength}}, repeat)};
double segmentsPerRepeat{(stats.s_TrendSegments - 1.0) / meanRepeats};
p *= CTools::logisticFunction(relativeLogSignificance, 0.1, 1.0) *
(vt > v1 ? CTools::logisticFunction(vt / v1, 1.0, 1.0, +1.0)
: CTools::logisticFunction(v1 / vt, 0.1, 1.0, -1.0)) *
CTools::logisticFunction(segmentsPerRepeat, 0.3, 0.0, -1.0) / 0.03125;
LOG_TRACE(<< " p(partition) = " << p);

if (p >= 1.0) {

if (correlationCondition && softGreaterThan(logSignificance, 1.0, 0.1) &&
(vt > v1 ? softGreaterThan(vt / v1, 1.0, 1.0) : softLessThan(v1 / vt, 1.0, 0.1))) {
stats.s_StartOfPartition = startOfPartition;
stats.s_R0 = R;
return true;
Expand All @@ -2121,7 +2156,7 @@ bool CPeriodicityHypothesisTests::testVariance(const TTimeTimePr2Vec& window,
STestStats& stats,
double& R,
double& meanRepeats,
double& pVariance,
double& truthVariance,
const TSizeVec& segmentation) const {
std::size_t period{static_cast<std::size_t>(period_ / m_BucketLength)};

Expand All @@ -2145,7 +2180,6 @@ bool CPeriodicityHypothesisTests::testVariance(const TTimeTimePr2Vec& window,
result.add(calculateRepeats(window, period_, m_BucketLength, buckets));
return CBasicStatistics::mean(result);
}();
LOG_TRACE(<< " mean repeats = " << meanRepeats);

// We're trading off:
// 1) The significance of the variance reduction,
Expand All @@ -2159,22 +2193,24 @@ bool CPeriodicityHypothesisTests::testVariance(const TTimeTimePr2Vec& window,
// is equal to the threshold, the variance reduction is equal to the
// threshold and we've observed three periods on average.

double relativeLogSignificance{
double logSignificance{
CTools::fastLog(CStatisticalTests::leftTailFTest(v1 / v0, df1, df0)) /
LOG_COMPONENT_STATISTICALLY_SIGNIFICANCE};
double relativeMeanRepeats{meanRepeats / MINIMUM_REPEATS_TO_TEST_VARIANCE};
double segmentsPerRepeat{(stats.s_TrendSegments +
std::max(static_cast<double>(segmentation.size()), 1.0) - 2.0) /
meanRepeats};
pVariance = CTools::logisticFunction(relativeLogSignificance, 0.1, 1.0) *
CTools::logisticFunction(R / stats.s_AutocorrelationThreshold, 0.15, 1.0) *
(vt > v1 ? CTools::logisticFunction(vt / v1, 1.0, 1.0, +1.0)
: CTools::logisticFunction(v1 / vt, 0.1, 1.0, -1.0)) *
CTools::logisticFunction(relativeMeanRepeats, 0.25, 1.0) *
CTools::logisticFunction(segmentsPerRepeat, 0.3, 0.0, -1.0) / 0.03125;
LOG_TRACE(<< " p(variance) = " << pVariance);

if (pVariance >= 1.0) {
double meanRepeatsPerSegment{
meanRepeats /
std::max(stats.s_TrendSegments + static_cast<double>(segmentation.size()), 1.0) /
MINIMUM_REPEATS_TO_TEST_VARIANCE};
LOG_TRACE(<< " mean repeats per segment = " << meanRepeatsPerSegment);

auto condition = softGreaterThan(logSignificance, 1.0, 0.1) &&
softGreaterThan(R, stats.s_AutocorrelationThreshold, 0.1) &&
(vt > v1 ? softGreaterThan(vt / v1, 1.0, 1.0)
: softLessThan(v1 / vt, 0.1, 1.0)) &&
softGreaterThan(meanRepeatsPerSegment, 1.0, 0.2);
truthVariance = condition.truthValue();
LOG_TRACE(<< " truth(variance) = " << truthVariance);

if (condition) {
stats.s_R0 = R;
stats.s_Segmentation = segmentation;
return true;
Expand All @@ -2189,7 +2225,7 @@ bool CPeriodicityHypothesisTests::testAmplitude(const TTimeTimePr2Vec& window,
double v,
double R,
double meanRepeats,
double pVariance,
double truthVariance,
STestStats& stats) const {
core_t::TTime windowLength{length(window)};
std::size_t period{static_cast<std::size_t>(period_ / m_BucketLength)};
Expand Down Expand Up @@ -2226,19 +2262,16 @@ bool CPeriodicityHypothesisTests::testAmplitude(const TTimeTimePr2Vec& window,

// Trade off the test significance and the mean number of repeats
// we've observed.
double relativeLogSignificance{CTools::fastLog(CTools::oneMinusPowOneMinusX(F1, b)) /
LOG_COMPONENT_STATISTICALLY_SIGNIFICANCE};
double relativeMeanRepeats{
meanRepeats / static_cast<double>(MINIMUM_REPEATS_TO_TEST_AMPLITUDE)};
double minusLogPVariance{-CTools::fastLog(pVariance)};
double segmentsPerRepeat{(stats.s_TrendSegments - 1.0) / meanRepeats};
double pAmplitude{CTools::logisticFunction(relativeLogSignificance, 0.2, 1.0) *
CTools::logisticFunction(relativeMeanRepeats, 0.5, 1.0) *
CTools::logisticFunction(minusLogPVariance, 2.0, 0.0, -1.0) *
CTools::logisticFunction(segmentsPerRepeat, 0.3, 0.0, -1.0) / 0.0625};
LOG_TRACE(<< " p(amplitude) = " << pAmplitude);

if (pAmplitude >= 1.0) {
double logSignificance{CTools::fastLog(CTools::oneMinusPowOneMinusX(F1, b)) /
LOG_COMPONENT_STATISTICALLY_SIGNIFICANCE};
double meanRepeatsPerSegment{meanRepeats / std::max(stats.s_TrendSegments, 1.0) /
static_cast<double>(MINIMUM_REPEATS_TO_TEST_AMPLITUDE)};
double minusLogTruthVariance{-CTools::fastLog(truthVariance)};
LOG_TRACE(<< " mean repeats per segment = " << meanRepeatsPerSegment);

if (softLessThan(minusLogTruthVariance, 0.0, 2.0) &&
softGreaterThan(logSignificance, 1.0, 0.2) &&
softGreaterThan(meanRepeatsPerSegment, 1.0, 0.2)) {
stats.s_R0 = R;
return true;
}
Expand Down
18 changes: 12 additions & 6 deletions lib/maths/CTimeSeriesModel.cc
Expand Up @@ -1551,8 +1551,9 @@ CUnivariateTimeSeriesModel::updateTrend(const TTimeDouble2VecSizeTrVec& samples,
}
}

// Time order is not reliable, for example if the data are polled
// or for count feature, the times of all samples will be the same.
// Time order is not a total order, for example if the data are polled
// the times of all samples will be the same. So break ties using the
// sample value.
TSizeVec timeorder(samples.size());
std::iota(timeorder.begin(), timeorder.end(), 0);
std::stable_sort(timeorder.begin(), timeorder.end(),
Expand Down Expand Up @@ -1656,6 +1657,7 @@ void CUnivariateTimeSeriesModel::reinitializeStateGivenNewComponent(TFloatMeanAc
// We can't properly handle periodicity in the variance of the rate if
// using a Poisson process so remove it from model detectio if we detect
// seasonality.
double numberSamples{m_ResidualModel->numberSamples()};
m_ResidualModel->removeModels(
maths::CPrior::CModelFilter().remove(maths::CPrior::E_Poisson));
m_ResidualModel->setToNonInformative(0.0, m_ResidualModel->decayRate());
Expand All @@ -1665,7 +1667,8 @@ void CUnivariateTimeSeriesModel::reinitializeStateGivenNewComponent(TFloatMeanAc
[](double weight, const TFloatMeanAccumulator& sample) {
return weight + CBasicStatistics::count(sample);
})};
double weightScale{10.0 * std::max(this->params().learnRate(), 1.0) / Z};
double weightScale{
std::min(10.0 * std::max(this->params().learnRate(), 1.0), numberSamples) / Z};
maths_t::TDoubleWeightsAry1Vec weights(1);
for (const auto& residual : residuals) {
double weight(CBasicStatistics::count(residual));
Expand Down Expand Up @@ -2862,8 +2865,9 @@ CMultivariateTimeSeriesModel::updateTrend(const TTimeDouble2VecSizeTrVec& sample
}
}

// Time order is not reliable, for example if the data are polled
// or for count feature, the times of all samples will be the same.
// Time order is not a total order, for example if the data are polled
// the times of all samples will be the same. So break ties using the
// sample value.
TSizeVec timeorder(samples.size());
std::iota(timeorder.begin(), timeorder.end(), 0);
std::stable_sort(timeorder.begin(), timeorder.end(),
Expand Down Expand Up @@ -2965,6 +2969,7 @@ void CMultivariateTimeSeriesModel::reinitializeStateGivenNewComponent(TFloatMean
// re-weight so that the total sample weight corresponds to the sample
// weight the model receives from a fixed (shortish) time interval.

double numberSamples{m_ResidualModel->numberSamples()};
m_ResidualModel->setToNonInformative(0.0, m_ResidualModel->decayRate());

if (residuals.size() > 0) {
Expand All @@ -2988,7 +2993,8 @@ void CMultivariateTimeSeriesModel::reinitializeStateGivenNewComponent(TFloatMean
}

double Z{std::accumulate(weights.begin(), weights.end(), 0.0)};
double weightScale{10.0 * std::max(this->params().learnRate(), 1.0) / Z};
double weightScale{
std::min(10.0 * std::max(this->params().learnRate(), 1.0), numberSamples) / Z};
maths_t::TDouble10VecWeightsAry1Vec weight(1);
for (std::size_t i = 0; i < samples.size(); ++i) {
if (weights[i] > 0.0) {
Expand Down