/
EdPeltChangePointDetector.cs
226 lines (203 loc) · 11.9 KB
/
EdPeltChangePointDetector.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
using System;
using System.Collections.Generic;
using System.Linq;
// ReSharper disable CommentTypo, CompareOfFloatsByEqualityOperator
namespace BenchmarkDotNet.Mathematics.ChangePointDetection
{
/// <summary>
/// The ED-PELT algorithm for changepoint detection.
///
/// <remarks>
/// The implementation is based on the following papers:
/// <list type="bullet">
/// <item>
/// <b>[Haynes2017]</b> Haynes, Kaylea, Paul Fearnhead, and Idris A. Eckley.
/// "A computationally efficient nonparametric approach for changepoint detection."
/// Statistics and Computing 27, no. 5 (2017): 1293-1305.
/// https://doi.org/10.1007/s11222-016-9687-5
/// </item>
/// <item>
/// <b>[Killick2012]</b> Killick, Rebecca, Paul Fearnhead, and Idris A. Eckley.
/// "Optimal detection of changepoints with a linear computational cost."
/// Journal of the American Statistical Association 107, no. 500 (2012): 1590-1598.
/// https://arxiv.org/pdf/1101.1438.pdf
/// </item>
/// </list>
/// </remarks>
/// </summary>
public class EdPeltChangePointDetector
{
public static readonly EdPeltChangePointDetector Instance = new EdPeltChangePointDetector();
/// <summary>
/// For given array of `double` values, detects locations of changepoints that
/// splits original series of values into "statistically homogeneous" segments.
/// Such points correspond to moments when statistical properties of the distribution are changing.
///
/// This method supports nonparametric distributions and has O(N*log(N)) algorithmic complexity.
/// </summary>
/// <param name="data">An array of double values</param>
/// <param name="minDistance">Minimum distance between changepoints</param>
/// <returns>
/// Returns an `int[]` array with 0-based indexes of changepoint.
/// Changepoints correspond to the end of the detected segments.
/// For example, changepoints for { 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2 } are { 5, 11 }.
/// </returns>
public int[] GetChangePointIndexes(double[] data, int minDistance = 1)
{
// We will use `n` as the number of elements in the `data` array
int n = data.Length;
// Checking corner cases
if (n <= 2)
return new int[0];
if (minDistance < 1 || minDistance > n)
throw new ArgumentOutOfRangeException(
nameof(minDistance), $"{minDistance} should be in range from 1 to data.Length");
// The penalty which we add to the final cost for each additional changepoint
// Here we use the Modified Bayesian Information Criterion
double penalty = 3 * Math.Log(n);
// `k` is the number of quantiles that we use to approximate an integral during the segment cost evaluation
// We use `k=Ceiling(4*log(n))` as suggested in the Section 4.3 "Choice of K in ED-PELT" in [Haynes2017]
// `k` can't be greater than `n`, so we should always use the `Min` function here (important for n <= 8)
int k = Math.Min(n, (int) Math.Ceiling(4 * Math.Log(n)));
// We should precalculate sums for empirical CDF, it will allow fast evaluating of the segment cost
var partialSums = GetPartialSums(data, k);
// Since we use the same values of `partialSums`, `k`, `n` all the time,
// we introduce a shortcut `Cost(tau1, tau2)` for segment cost evaluation.
// Hereinafter, we use `tau` to name variables that are changepoint candidates.
double Cost(int tau1, int tau2) => GetSegmentCost(partialSums, tau1, tau2, k, n);
// We will use dynamic programming to find the best solution; `bestCost` is the cost array.
// `bestCost[i]` is the cost for subarray `data[0..i-1]`.
// It's a 1-based array (`data[0]`..`data[n-1]` correspond to `bestCost[1]`..`bestCost[n]`)
var bestCost = new double[n + 1];
bestCost[0] = -penalty;
for (int currentTau = minDistance; currentTau < 2 * minDistance; currentTau++)
bestCost[currentTau] = Cost(0, currentTau);
// `previousChangePointIndex` is an array of references to previous changepoints. If the current segment ends at
// the position `i`, the previous segment ends at the position `previousChangePointIndex[i]`. It's a 1-based
// array (`data[0]`..`data[n-1]` correspond to the `previousChangePointIndex[1]`..`previousChangePointIndex[n]`)
var previousChangePointIndex = new int[n + 1];
// We use PELT (Pruned Exact Linear Time) approach which means that instead of enumerating all possible previous
// tau values, we use a whitelist of "good" tau values that can be used in the optimal solution. If we are 100%
// sure that some of the tau values will not help us to form the optimal solution, such values should be
// removed. See [Killick2012] for details.
var previousTaus = new List<int>(n + 1) { 0, minDistance };
var costForPreviousTau = new List<double>(n + 1);
// Following the dynamic programming approach, we enumerate all tau positions. For each `currentTau`, we pretend
// that it's the end of the last segment and trying to find the end of the previous segment.
for (int currentTau = 2 * minDistance; currentTau < n + 1; currentTau++)
{
// For each previous tau, we should calculate the cost of taking this tau as the end of the previous
// segment. This cost equals the cost for the `previousTau` plus cost of the new segment (from `previousTau`
// to `currentTau`) plus penalty for the new changepoint.
costForPreviousTau.Clear();
foreach (int previousTau in previousTaus)
costForPreviousTau.Add(bestCost[previousTau] + Cost(previousTau, currentTau) + penalty);
// Now we should choose the tau that provides the minimum possible cost.
int bestPreviousTauIndex = WhichMin(costForPreviousTau);
bestCost[currentTau] = costForPreviousTau[bestPreviousTauIndex];
previousChangePointIndex[currentTau] = previousTaus[bestPreviousTauIndex];
// Prune phase: we remove "useless" tau values that will not help to achieve minimum cost in the future
double currentBestCost = bestCost[currentTau];
int newPreviousTausSize = 0;
for (int i = 0; i < previousTaus.Count; i++)
if (costForPreviousTau[i] < currentBestCost + penalty)
previousTaus[newPreviousTausSize++] = previousTaus[i];
previousTaus.RemoveRange(newPreviousTausSize, previousTaus.Count - newPreviousTausSize);
// We add a new tau value that is located on the `minDistance` distance from the next `currentTau` value
previousTaus.Add(currentTau - (minDistance - 1));
}
// Here we collect the result list of changepoint indexes `changePointIndexes` using `previousChangePointIndex`
var changePointIndexes = new List<int>();
int currentIndex = previousChangePointIndex[n]; // The index of the end of the last segment is `n`
while (currentIndex != 0)
{
changePointIndexes.Add(currentIndex - 1); // 1-based indexes should be be transformed to 0-based indexes
currentIndex = previousChangePointIndex[currentIndex];
}
changePointIndexes.Reverse(); // The result changepoints should be sorted in ascending order.
return changePointIndexes.ToArray();
}
/// <summary>
/// Partial sums for empirical CDF (formula (2.1) from Section 2.1 "Model" in [Haynes2017])
/// <code>
/// partialSums[i, tau] = (count(data[j] < t) * 2 + count(data[j] == t) * 1) for j=0..tau-1
/// where t is the i-th quantile value (see Section 3.1 "Discrete approximation" in [Haynes2017] for details)
/// </code>
/// <remarks>
/// <list type="bullet">
/// <item>
/// We use doubled sum values in order to use <c>int[,]</c> instead of <c>double[,]</c> (it provides noticeable
/// performance boost). Thus, multipliers for <c>count(data[j] < t)</c> and <c>count(data[j] == t)</c> are
/// 2 and 1 instead of 1 and 0.5 from the [Haynes2017].
/// </item>
/// <item>
/// Note that these quantiles are not uniformly distributed: tails of the <c>data</c> distribution contain more
/// quantile values than the center of the distribution
/// </item>
/// </list>
/// </remarks>
/// </summary>
private static int[,] GetPartialSums(double[] data, int k)
{
int n = data.Length;
var partialSums = new int[k, n + 1];
var sortedData = data.OrderBy(it => it).ToArray();
for (int i = 0; i < k; i++)
{
double z = -1 + (2 * i + 1.0) / k; // Values from (-1+1/k) to (1-1/k) with step = 2/k
double p = 1.0 / (1 + Math.Pow(2 * n - 1, -z)); // Values from 0.0 to 1.0
double t = sortedData[(int) Math.Truncate((n - 1) * p)]; // Quantile value, formula (2.1) in [Haynes2017]
for (int tau = 1; tau <= n; tau++)
{
partialSums[i, tau] = partialSums[i, tau - 1];
if (data[tau - 1] < t)
partialSums[i, tau] += 2; // We use doubled value (2) instead of original 1.0
if (data[tau - 1] == t)
partialSums[i, tau] += 1; // We use doubled value (1) instead of original 0.5
}
}
return partialSums;
}
/// <summary>
/// Calculates the cost of the (tau1; tau2] segment.
/// </summary>
private static double GetSegmentCost(int[,] partialSums, int tau1, int tau2, int k, int n)
{
double sum = 0;
for (int i = 0; i < k; i++)
{
// actualSum is (count(data[j] < t) * 2 + count(data[j] == t) * 1) for j=tau1..tau2-1
int actualSum = partialSums[i, tau2] - partialSums[i, tau1];
// We skip these two cases (correspond to fit = 0 or fit = 1) because of invalid Math.Log values
if (actualSum != 0 && actualSum != (tau2 - tau1) * 2)
{
// Empirical CDF $\hat{F}_i(t)$ (Section 2.1 "Model" in [Haynes2017])
double fit = actualSum * 0.5 / (tau2 - tau1);
// Segment cost $\mathcal{L}_{np}$ (Section 2.2 "Nonparametric maximum likelihood" in [Haynes2017])
double lnp = (tau2 - tau1) * (fit * Math.Log(fit) + (1 - fit) * Math.Log(1 - fit));
sum += lnp;
}
}
double c = -Math.Log(2 * n - 1); // Constant from Lemma 3.1 in [Haynes2017]
return 2.0 * c / k * sum; // See Section 3.1 "Discrete approximation" in [Haynes2017]
}
/// <summary>
/// Returns the index of the minimum element.
/// In case if there are several minimum elements in the given list, the index of the first one will be returned.
/// </summary>
private static int WhichMin(IList<double> values)
{
if (values.Count == 0)
throw new InvalidOperationException("Array should contain elements");
double minValue = values[0];
int minIndex = 0;
for (int i = 1; i < values.Count; i++)
if (values[i] < minValue)
{
minValue = values[i];
minIndex = i;
}
return minIndex;
}
}
}