Add EdPeltChangePointDetector
AndreyAkinshin committed Oct 7, 2019
1 parent 51f53a1 commit f89091a
Showing 3 changed files with 326 additions and 0 deletions.
1 change: 1 addition & 0 deletions BenchmarkDotNet.sln.DotSettings
@@ -32,6 +32,7 @@
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpCodeStyle/CONSTRUCTOR_OR_DESTRUCTOR_BODY/@EntryValue">ExpressionBody</s:String>
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpCodeStyle/LOCAL_FUNCTION_BODY/@EntryValue">ExpressionBody</s:String>
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpCodeStyle/METHOD_OR_OPERATOR_BODY/@EntryValue">ExpressionBody</s:String>
<s:Int64 x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/BLANK_LINES_AFTER_BLOCK_STATEMENTS/@EntryValue">0</s:Int64>
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/EMPTY_BLOCK_STYLE/@EntryValue">TOGETHER_SAME_LINE</s:String>
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/KEEP_EXISTING_ATTRIBUTE_ARRANGEMENT/@EntryValue">True</s:Boolean>
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/PLACE_ACCESSORHOLDER_ATTRIBUTE_ON_SAME_LINE_EX/@EntryValue">NEVER</s:String>
@@ -0,0 +1,226 @@
using System;
using System.Collections.Generic;
using System.Linq;

// ReSharper disable CommentTypo, CompareOfFloatsByEqualityOperator
namespace BenchmarkDotNet.Mathematics.ChangePointDetection
{
/// <summary>
/// The ED-PELT algorithm for changepoint detection.
///
/// <remarks>
/// The implementation is based on the following papers:
/// <list type="bullet">
/// <item>
/// <b>[Haynes2017]</b> Haynes, Kaylea, Paul Fearnhead, and Idris A. Eckley.
/// "A computationally efficient nonparametric approach for changepoint detection."
/// Statistics and Computing 27, no. 5 (2017): 1293-1305.
/// https://doi.org/10.1007/s11222-016-9687-5
/// </item>
/// <item>
/// <b>[Killick2012]</b> Killick, Rebecca, Paul Fearnhead, and Idris A. Eckley.
/// "Optimal detection of changepoints with a linear computational cost."
/// Journal of the American Statistical Association 107, no. 500 (2012): 1590-1598.
/// https://arxiv.org/pdf/1101.1438.pdf
/// </item>
/// </list>
/// </remarks>
/// </summary>
public class EdPeltChangePointDetector
{
public static readonly EdPeltChangePointDetector Instance = new EdPeltChangePointDetector();

/// <summary>
/// For a given array of `double` values, detects the locations of changepoints that
/// split the original series into "statistically homogeneous" segments.
/// Such points correspond to moments when the statistical properties of the distribution change.
///
/// This method supports nonparametric distributions and has O(N*log(N)) algorithmic complexity.
/// </summary>
/// <param name="data">An array of double values</param>
/// <param name="minDistance">Minimum distance between changepoints</param>
/// <returns>
/// Returns an `int[]` array with 0-based indexes of changepoints.
/// Changepoints correspond to the ends of the detected segments.
/// For example, changepoints for { 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2 } are { 5, 11 }.
/// </returns>
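/// <example>
/// A usage sketch based on the example above:
/// <code>
/// var data = new double[] { 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2 };
/// int[] changePoints = EdPeltChangePointDetector.Instance.GetChangePointIndexes(data);
/// // changePoints is { 5, 11 }: the 0-based indexes of the last elements of the first two segments
/// </code>
/// </example>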
public int[] GetChangePointIndexes(double[] data, int minDistance = 1)
{
// We will use `n` as the number of elements in the `data` array
int n = data.Length;

// Checking corner cases
if (n <= 2)
return new int[0];
if (minDistance < 1 || minDistance > n)
throw new ArgumentOutOfRangeException(
nameof(minDistance), $"{minDistance} should be in range from 1 to data.Length");

// The penalty which we add to the final cost for each additional changepoint
// Here we use the Modified Bayesian Information Criterion
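// For example, n = 1000 gives penalty = 3 * ln(1000) ≈ 20.7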
double penalty = 3 * Math.Log(n);

// `k` is the number of quantiles that we use to approximate an integral during the segment cost evaluation
// We use `k=Ceiling(4*log(n))` as suggested in Section 4.3 "Choice of K in ED-PELT" in [Haynes2017]
// `k` can't be greater than `n`, so we should always use the `Min` function here (important for n <= 8)
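// For example, n = 1000 gives k = Min(1000, Ceiling(4 * ln(1000))) = Ceiling(27.63...) = 28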
int k = Math.Min(n, (int) Math.Ceiling(4 * Math.Log(n)));

// We precalculate the partial sums for the empirical CDF; this allows fast evaluation of the segment cost
var partialSums = GetPartialSums(data, k);

// Since we use the same values of `partialSums`, `k`, `n` all the time,
// we introduce a shortcut `Cost(tau1, tau2)` for segment cost evaluation.
// Hereinafter, we use `tau` to name variables that are changepoint candidates.
double Cost(int tau1, int tau2) => GetSegmentCost(partialSums, tau1, tau2, k, n);

// We will use dynamic programming to find the best solution; `bestCost` is the cost array.
// `bestCost[i]` is the cost for subarray `data[0..i-1]`.
// It's a 1-based array (`data[0]`..`data[n-1]` correspond to `bestCost[1]`..`bestCost[n]`)
var bestCost = new double[n + 1];
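// `bestCost[0]` is set to `-penalty` so that the first segment is not penalized:
// the `penalty` term added per segment below cancels it, and only additional changepoints are penalized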
bestCost[0] = -penalty;
for (int currentTau = minDistance; currentTau < 2 * minDistance; currentTau++)
bestCost[currentTau] = Cost(0, currentTau);

// `previousChangePointIndex` is an array of references to previous changepoints. If the current segment ends at
// the position `i`, the previous segment ends at the position `previousChangePointIndex[i]`. It's a 1-based
// array (`data[0]`..`data[n-1]` correspond to the `previousChangePointIndex[1]`..`previousChangePointIndex[n]`)
var previousChangePointIndex = new int[n + 1];

// We use the PELT (Pruned Exact Linear Time) approach, which means that instead of enumerating all possible
// previous tau values, we keep a whitelist of "good" tau values that can still appear in the optimal solution.
// If we are 100% sure that some tau value will not help us to form the optimal solution, it is removed from the
// whitelist. See [Killick2012] for details.
var previousTaus = new List<int>(n + 1) { 0, minDistance };
var costForPreviousTau = new List<double>(n + 1);

// Following the dynamic programming approach, we enumerate all tau positions. For each `currentTau`, we pretend
// that it's the end of the last segment and try to find the end of the previous segment.
for (int currentTau = 2 * minDistance; currentTau < n + 1; currentTau++)
{
// For each previous tau, we should calculate the cost of taking this tau as the end of the previous
// segment. This cost equals the cost for the `previousTau` plus the cost of the new segment (from `previousTau`
// to `currentTau`) plus the penalty for the new changepoint.
costForPreviousTau.Clear();
foreach (int previousTau in previousTaus)
costForPreviousTau.Add(bestCost[previousTau] + Cost(previousTau, currentTau) + penalty);

// Now we should choose the tau that provides the minimum possible cost.
int bestPreviousTauIndex = WhichMin(costForPreviousTau);
bestCost[currentTau] = costForPreviousTau[bestPreviousTauIndex];
previousChangePointIndex[currentTau] = previousTaus[bestPreviousTauIndex];

// Prune phase: we remove "useless" tau values that will not help to achieve the minimum cost in the future
double currentBestCost = bestCost[currentTau];
int newPreviousTausSize = 0;
for (int i = 0; i < previousTaus.Count; i++)
if (costForPreviousTau[i] < currentBestCost + penalty)
previousTaus[newPreviousTausSize++] = previousTaus[i];
previousTaus.RemoveRange(newPreviousTausSize, previousTaus.Count - newPreviousTausSize);

// We add a new tau value that is located at distance `minDistance` from the next `currentTau` value
previousTaus.Add(currentTau - (minDistance - 1));
}

// Here we collect the result list of changepoint indexes `changePointIndexes` using `previousChangePointIndex`
var changePointIndexes = new List<int>();
int currentIndex = previousChangePointIndex[n]; // The index of the end of the last segment is `n`
while (currentIndex != 0)
{
changePointIndexes.Add(currentIndex - 1); // 1-based indexes are transformed to 0-based indexes
currentIndex = previousChangePointIndex[currentIndex];
}
changePointIndexes.Reverse(); // The result changepoints should be sorted in ascending order.
return changePointIndexes.ToArray();
}

/// <summary>
/// Partial sums for empirical CDF (formula (2.1) from Section 2.1 "Model" in [Haynes2017])
/// <code>
/// partialSums[i, tau] = (count(data[j] &lt; t) * 2 + count(data[j] == t) * 1) for j=0..tau-1
/// where t is the i-th quantile value (see Section 3.1 "Discrete approximation" in [Haynes2017] for details)
/// </code>
/// <remarks>
/// <list type="bullet">
/// <item>
/// We use doubled sum values in order to use <c>int[,]</c> instead of <c>double[,]</c> (it provides a noticeable
/// performance boost). Thus, the multipliers for <c>count(data[j] &lt; t)</c> and <c>count(data[j] == t)</c> are
/// 2 and 1 instead of the 1 and 0.5 from [Haynes2017].
/// </item>
/// <item>
/// Note that these quantiles are not uniformly distributed: the tails of the <c>data</c> distribution contain
/// more quantile values than the center of the distribution.
/// </item>
/// </list>
/// </remarks>
/// </summary>
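/// <example>
/// A worked sketch (hypothetical input): for <c>data = {1, 2, 3}</c> and <c>k = 3</c>,
/// the quantile values <c>t</c> come out as <c>{1, 2, 2}</c>, so the rows of the result are
/// <c>partialSums[0] = {0, 1, 1, 1}</c> and <c>partialSums[1] = partialSums[2] = {0, 2, 3, 3}</c>.
/// </example>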
private static int[,] GetPartialSums(double[] data, int k)
{
int n = data.Length;
var partialSums = new int[k, n + 1];
var sortedData = data.OrderBy(it => it).ToArray();

for (int i = 0; i < k; i++)
{
double z = -1 + (2 * i + 1.0) / k; // Values from (-1+1/k) to (1-1/k) with step = 2/k
double p = 1.0 / (1 + Math.Pow(2 * n - 1, -z)); // Values from 0.0 to 1.0
double t = sortedData[(int) Math.Truncate((n - 1) * p)]; // Quantile value, formula (2.1) in [Haynes2017]

for (int tau = 1; tau <= n; tau++)
{
partialSums[i, tau] = partialSums[i, tau - 1];
if (data[tau - 1] < t)
partialSums[i, tau] += 2; // We use doubled value (2) instead of original 1.0
if (data[tau - 1] == t)
partialSums[i, tau] += 1; // We use doubled value (1) instead of original 0.5
}
}
return partialSums;
}

/// <summary>
/// Calculates the cost of the (tau1; tau2] segment.
/// </summary>
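/// <example>
/// A worked sketch (hypothetical input): for <c>data = {1, 2, 3}</c> with <c>k = 3</c>, <c>n = 3</c>,
/// the quantile values are <c>{1, 2, 2}</c> (see <c>GetPartialSums</c>), and the full segment (0; 3]
/// gets <c>fit</c> values <c>{1/6, 1/2, 1/2}</c>, so its cost is
/// <c>2 * (-ln 5) / 3 * (3 * (1/6 * ln(1/6) + 5/6 * ln(5/6)) + 2 * 3 * ln(1/2)) ≈ 5.91</c>.
/// </example>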
private static double GetSegmentCost(int[,] partialSums, int tau1, int tau2, int k, int n)
{
double sum = 0;
for (int i = 0; i < k; i++)
{
// actualSum is (count(data[j] < t) * 2 + count(data[j] == t) * 1) for j=tau1..tau2-1
int actualSum = partialSums[i, tau2] - partialSums[i, tau1];

// We skip these two cases (they correspond to fit = 0 or fit = 1) because they would produce invalid Math.Log values
if (actualSum != 0 && actualSum != (tau2 - tau1) * 2)
{
// Empirical CDF $\hat{F}_i(t)$ (Section 2.1 "Model" in [Haynes2017])
double fit = actualSum * 0.5 / (tau2 - tau1);
// Segment cost $\mathcal{L}_{np}$ (Section 2.2 "Nonparametric maximum likelihood" in [Haynes2017])
double lnp = (tau2 - tau1) * (fit * Math.Log(fit) + (1 - fit) * Math.Log(1 - fit));
sum += lnp;
}
}
double c = -Math.Log(2 * n - 1); // Constant from Lemma 3.1 in [Haynes2017]
return 2.0 * c / k * sum; // See Section 3.1 "Discrete approximation" in [Haynes2017]
}

/// <summary>
/// Returns the index of the minimum element.
/// If there are several minimum elements in the given list, the index of the first one is returned.
/// </summary>
private static int WhichMin(IList<double> values)
{
if (values.Count == 0)
throw new InvalidOperationException("The list should contain at least one element");

double minValue = values[0];
int minIndex = 0;
for (int i = 1; i < values.Count; i++)
if (values[i] < minValue)
{
minValue = values[i];
minIndex = i;
}

return minIndex;
}
}
}
@@ -0,0 +1,99 @@
using System.Linq;
using BenchmarkDotNet.Mathematics.ChangePointDetection;
using JetBrains.Annotations;
using Xunit;

namespace BenchmarkDotNet.Tests.Mathematics.ChangePointDetection
{
public class EdPeltTests
{
[AssertionMethod]
private void Check(double[] data, int[] expectedChangePoints)
{
var actualChangePoints = EdPeltChangePointDetector.Instance.GetChangePointIndexes(data);
Assert.Equal(expectedChangePoints, actualChangePoints);
}

[Fact]
public void Test1() => Check(new double[] { 3240, 3207, 2029, 3028, 3021, 2624, 3290, 2823, 3573 }, new int[0]);

[Fact]
public void Test2() => Check(new[]
{
8.51775781386248, 1013.74746867838, 969.560811828691, 9.99903626421251,
11.3498239348095, 990.192098938082, 10.8362538563524, 11.0497172665872,
1054.23425067251, 9.66601964023216, 1037.81637817117, 8.66558370477946,
919.374881701026, 1131.1541993554, 11.2022838597658, 7.98843446730286,
8.02589313889234, 10.8204269900969, 10.3681600526862, 8.93960790455779,
9.35305979981456, 915.395965153361, 8.16535865503972, 10.2120572491007,
1066.64002362996, 11.069835770419, 9.31344240206391, 10.2348674145244,
11.6801773710108, 7.84243526689435, 9.21956447597874, 1112.57213641362,
10.6780602207311, 1079.1784141977, 8.88378695422417, 8.70650073605106,
8.77123401566279, 10.1077223378988, 9.38105207657018, 12.1034309729932,
1053.30004708716, 955.498653876947, 953.97646035295, 9.80143691452122,
987.671577491428, 8.9168418081165, 9.68107569533077, 10.0424589038011,
9.63713378148637, 932.511669228627, 916.740706556999, 972.101180041745,
1057.81026627134, 888.449133758924, 10.4439525673558, 9.91175069231749,
9.16063280158654, 1094.72129249188, 983.216157217079, 878.618768427093,
959.360944583076, 9.81400390730057, 1042.77719758456, 10.0499199152934,
8.87431073340845, 9.76856626664346, 9.54692832839134, 11.3173847481645,
10.8158661076597, 10.1762447729252, 8.59347190198671, 9.89783946138358,
10.4458247651239, 10.5958375090298, 1048.89026506509, 9.53689245159815,
10.3556711294034, 10.5111206253409, 10.9498649241074, 10.0522571754171,
10.427691547751, 1060.88753031917, 9.69076437597138, 11.724350932454,
10.9261779098129, 9.1663020020811, 11.7754734440436, 9.21696404189964,
1109.00024831966, 8.77237176835653, 11.5707985076869, 9.28229456839916,
1059.70559777354, 11.3686880467374, 10.7804164367935, 10.6682087852642,
9.81848935380367, 1117.56915403446, 10.6748380014858, 8.92628692366044,
21.0443737235031, 1105.72241345132, 19.4477654250736, 17.7821802121474,
19.4781999899639, 820.585116566889, 22.7905163513292, 1074.92528880068,
1013.45423941534, 1033.78179165301, 21.7033767555293, 21.2010125600959,
1155.11887755134, 1124.70778630606, 19.3635057382183, 20.8681411244392,
1025.46927962978, 20.9228886069588, 19.7414810417396, 17.6077812642339,
18.7826030943573, 18.4230008183603, 21.5574158030065, 20.3540972663881,
19.8951972303914, 20.5874066514834, 19.5436655021679, 871.540648869152,
1131.09301822079, 909.512157959961, 21.1625922742, 20.1996974368549,
21.4371296125597, 20.6326908527947, 19.003929460121, 22.1891456118526,
846.131237958441, 21.1737225623976, 17.1273840427487, 20.6505297811944,
21.8058704670258, 974.533374319673, 1038.80781759386, 916.525796601127,
19.5550401051096, 23.1690470680813, 21.6574852061953, 23.6718803027189,
1322.22337184262, 15.7593232457946, 1048.75496002333, 19.8561784114712,
20.1016645168086, 21.9293783943216, 1227.01980787618, 1006.39031466135,
20.8804270869218, 1105.5032121435, 18.7780711547771, 19.1795343556248,
20.442492618286, 22.4707697720047, 1078.35810096477, 833.855347209093,
20.3579023933224, 1046.39353756804, 969.025272754239, 18.265175253187,
22.3140634142563, 912.906077936653, 879.367205674359, 21.6433957676533,
20.9004212873356, 932.700450733285, 1022.41607841167, 16.3513165110277,
19.6568173805112, 20.9655767366488, 21.9780598222067, 881.348203812156,
18.2930220268125, 18.9998486812023, 22.3244524568253, 20.4548445140534,
18.720101811033, 20.0178080242701, 20.2804893326285, 1019.0048151298,
21.3107894997328, 21.3910419104093, 18.3314544873099, 18.4213348686018,
16.7293099646236, 22.4272411724304, 18.8659567857605, 19.9033226277375,
15.8084704011053, 23.5263915426487, 22.0073090762765, 21.5468281669676
}, new[] { 99 });

[Fact]
public void Test3() => Check(new double[]
{
0, 0, 0, 0, 0, 100, 100, 100, 100
}, new[] { 4 });

[Fact]
public void Test4() => Check(new double[]
{
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2
}, new[] { 5, 11 });

[Fact]
public void Test5() => Check(Enumerable.Range(1, 1000).Select(it => (double) it).ToArray(), new[]
{
15, 47, 79, 129, 204, 306, 432, 565, 691, 793, 868, 918, 950, 982
});

[Fact]
public void Test6() => Check(Enumerable.Range(1, 10000).Select(it => (double) it).ToArray(), new[]
{
26, 79, 136, 230, 387, 643, 1051, 1671, 2552, 3692, 4998, 6305, 7445, 8326, 8946, 9354, 9610, 9767, 9861, 9918, 9971
});
}
}
