Add EdPeltChangePointDetector
AndreyAkinshin committed Oct 7, 2019
1 parent 51f53a1 commit f89091a
Showing 3 changed files with 326 additions and 0 deletions.
1 change: 1 addition & 0 deletions BenchmarkDotNet.sln.DotSettings
@@ -32,6 +32,7 @@
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpCodeStyle/CONSTRUCTOR_OR_DESTRUCTOR_BODY/@EntryValue">ExpressionBody</s:String>
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpCodeStyle/LOCAL_FUNCTION_BODY/@EntryValue">ExpressionBody</s:String>
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpCodeStyle/METHOD_OR_OPERATOR_BODY/@EntryValue">ExpressionBody</s:String>
<s:Int64 x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/BLANK_LINES_AFTER_BLOCK_STATEMENTS/@EntryValue">0</s:Int64>
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/EMPTY_BLOCK_STYLE/@EntryValue">TOGETHER_SAME_LINE</s:String>
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/KEEP_EXISTING_ATTRIBUTE_ARRANGEMENT/@EntryValue">True</s:Boolean>
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/PLACE_ACCESSORHOLDER_ATTRIBUTE_ON_SAME_LINE_EX/@EntryValue">NEVER</s:String>
@@ -0,0 +1,226 @@
using System;
using System.Collections.Generic;
using System.Linq;

// ReSharper disable CommentTypo, CompareOfFloatsByEqualityOperator
namespace BenchmarkDotNet.Mathematics.ChangePointDetection
{
/// <summary>
/// The ED-PELT algorithm for changepoint detection.
///
/// <remarks>
/// The implementation is based on the following papers:
/// <list type="bullet">
/// <item>
/// <b>[Haynes2017]</b> Haynes, Kaylea, Paul Fearnhead, and Idris A. Eckley.
/// "A computationally efficient nonparametric approach for changepoint detection."
/// Statistics and Computing 27, no. 5 (2017): 1293-1305.
/// https://doi.org/10.1007/s11222-016-9687-5
/// </item>
/// <item>
/// <b>[Killick2012]</b> Killick, Rebecca, Paul Fearnhead, and Idris A. Eckley.
/// "Optimal detection of changepoints with a linear computational cost."
/// Journal of the American Statistical Association 107, no. 500 (2012): 1590-1598.
/// https://arxiv.org/pdf/1101.1438.pdf
/// </item>
/// </list>
/// </remarks>
/// </summary>
public class EdPeltChangePointDetector
{
public static readonly EdPeltChangePointDetector Instance = new EdPeltChangePointDetector();

/// <summary>
/// For a given array of `double` values, detects the locations of changepoints that
/// split the original series into "statistically homogeneous" segments.
/// Such points correspond to moments when the statistical properties of the distribution change.
///
/// This method supports nonparametric distributions and has O(N*log(N)) algorithmic complexity.
/// </summary>
/// <param name="data">An array of double values</param>
/// <param name="minDistance">Minimum distance between changepoints</param>
/// <returns>
/// Returns an `int[]` array with 0-based indexes of changepoints.
/// Changepoints correspond to the ends of the detected segments.
/// For example, changepoints for { 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2 } are { 5, 11 }.
/// </returns>
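/// <example>
/// A usage sketch based on the example above:
/// <code>
/// var data = new double[] { 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2 };
/// int[] changePoints = EdPeltChangePointDetector.Instance.GetChangePointIndexes(data);
/// // changePoints is { 5, 11 }: the 0-based indexes of the last elements of the first two segments
/// </code>
/// </example>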
public int[] GetChangePointIndexes(double[] data, int minDistance = 1)
{
// We will use `n` as the number of elements in the `data` array
int n = data.Length;

// Checking corner cases
if (n <= 2)
return new int[0];
if (minDistance < 1 || minDistance > n)
throw new ArgumentOutOfRangeException(
nameof(minDistance), $"{minDistance} should be in range from 1 to data.Length");

// The penalty which we add to the final cost for each additional changepoint
// Here we use the Modified Bayesian Information Criterion
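// For example, n = 1000 gives penalty = 3 * ln(1000) ≈ 20.7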
double penalty = 3 * Math.Log(n);

// `k` is the number of quantiles that we use to approximate an integral during the segment cost evaluation
// We use `k=Ceiling(4*log(n))` as suggested in Section 4.3 "Choice of K in ED-PELT" in [Haynes2017]
// `k` can't be greater than `n`, so we should always use the `Min` function here (important for n <= 8)
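// For example, n = 1000 gives k = Min(1000, Ceiling(4 * ln(1000))) = Ceiling(27.63...) = 28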
int k = Math.Min(n, (int) Math.Ceiling(4 * Math.Log(n)));

// We precalculate the partial sums for the empirical CDF; this allows fast evaluation of the segment cost
var partialSums = GetPartialSums(data, k);

// Since we use the same values of `partialSums`, `k`, `n` all the time,
// we introduce a shortcut `Cost(tau1, tau2)` for segment cost evaluation.
// Hereinafter, we use `tau` to name variables that are changepoint candidates.
double Cost(int tau1, int tau2) => GetSegmentCost(partialSums, tau1, tau2, k, n);

// We will use dynamic programming to find the best solution; `bestCost` is the cost array.
// `bestCost[i]` is the cost for subarray `data[0..i-1]`.
// It's a 1-based array (`data[0]`..`data[n-1]` correspond to `bestCost[1]`..`bestCost[n]`)
var bestCost = new double[n + 1];
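// `bestCost[0]` is set to `-penalty` so that the first segment is not penalized:
// the `penalty` term added per segment below cancels it, and only additional changepoints are penalized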
bestCost[0] = -penalty;
for (int currentTau = minDistance; currentTau < 2 * minDistance; currentTau++)
bestCost[currentTau] = Cost(0, currentTau);

// `previousChangePointIndex` is an array of references to previous changepoints. If the current segment ends at
// the position `i`, the previous segment ends at the position `previousChangePointIndex[i]`. It's a 1-based
// array (`data[0]`..`data[n-1]` correspond to the `previousChangePointIndex[1]`..`previousChangePointIndex[n]`)
var previousChangePointIndex = new int[n + 1];

// We use the PELT (Pruned Exact Linear Time) approach, which means that instead of enumerating all possible
// previous tau values, we keep a whitelist of "good" tau values that can still appear in the optimal solution.
// If we are 100% sure that some tau value will not help us to form the optimal solution, it is removed from the
// whitelist. See [Killick2012] for details.
var previousTaus = new List<int>(n + 1) { 0, minDistance };
var costForPreviousTau = new List<double>(n + 1);

// Following the dynamic programming approach, we enumerate all tau positions. For each `currentTau`, we pretend
// that it's the end of the last segment and try to find the end of the previous segment.
for (int currentTau = 2 * minDistance; currentTau < n + 1; currentTau++)
{
// For each previous tau, we should calculate the cost of taking this tau as the end of the previous
// segment. This cost equals the cost for the `previousTau` plus the cost of the new segment (from `previousTau`
// to `currentTau`) plus the penalty for the new changepoint.
costForPreviousTau.Clear();
foreach (int previousTau in previousTaus)
costForPreviousTau.Add(bestCost[previousTau] + Cost(previousTau, currentTau) + penalty);

// Now we should choose the tau that provides the minimum possible cost.
int bestPreviousTauIndex = WhichMin(costForPreviousTau);
bestCost[currentTau] = costForPreviousTau[bestPreviousTauIndex];
previousChangePointIndex[currentTau] = previousTaus[bestPreviousTauIndex];

// Prune phase: we remove "useless" tau values that will not help to achieve the minimum cost in the future
double currentBestCost = bestCost[currentTau];
int newPreviousTausSize = 0;
for (int i = 0; i < previousTaus.Count; i++)
if (costForPreviousTau[i] < currentBestCost + penalty)
previousTaus[newPreviousTausSize++] = previousTaus[i];
previousTaus.RemoveRange(newPreviousTausSize, previousTaus.Count - newPreviousTausSize);

// We add a new tau value that is located at distance `minDistance` from the next `currentTau` value
previousTaus.Add(currentTau - (minDistance - 1));
}

// Here we collect the result list of changepoint indexes `changePointIndexes` using `previousChangePointIndex`
var changePointIndexes = new List<int>();
int currentIndex = previousChangePointIndex[n]; // The index of the end of the last segment is `n`
while (currentIndex != 0)
{
changePointIndexes.Add(currentIndex - 1); // 1-based indexes are transformed to 0-based indexes
currentIndex = previousChangePointIndex[currentIndex];
}
changePointIndexes.Reverse(); // The result changepoints should be sorted in ascending order.
return changePointIndexes.ToArray();
}

/// <summary>
/// Partial sums for empirical CDF (formula (2.1) from Section 2.1 "Model" in [Haynes2017])
/// <code>
/// partialSums[i, tau] = (count(data[j] &lt; t) * 2 + count(data[j] == t) * 1) for j=0..tau-1
/// where t is the i-th quantile value (see Section 3.1 "Discrete approximation" in [Haynes2017] for details)
/// </code>
/// <remarks>
/// <list type="bullet">
/// <item>
/// We use doubled sum values in order to use <c>int[,]</c> instead of <c>double[,]</c> (it provides a noticeable
/// performance boost). Thus, the multipliers for <c>count(data[j] &lt; t)</c> and <c>count(data[j] == t)</c> are
/// 2 and 1 instead of the 1 and 0.5 from [Haynes2017].
/// </item>
/// <item>
/// Note that these quantiles are not uniformly distributed: the tails of the <c>data</c> distribution contain
/// more quantile values than the center of the distribution.
/// </item>
/// </list>
/// </remarks>
/// </summary>
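/// <example>
/// A worked sketch (hypothetical input): for <c>data = {1, 2, 3}</c> and <c>k = 3</c>,
/// the quantile values <c>t</c> come out as <c>{1, 2, 2}</c>, so the rows of the result are
/// <c>partialSums[0] = {0, 1, 1, 1}</c> and <c>partialSums[1] = partialSums[2] = {0, 2, 3, 3}</c>.
/// </example>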
private static int[,] GetPartialSums(double[] data, int k)
{
int n = data.Length;
var partialSums = new int[k, n + 1];
var sortedData = data.OrderBy(it => it).ToArray();

for (int i = 0; i < k; i++)
{
double z = -1 + (2 * i + 1.0) / k; // Values from (-1+1/k) to (1-1/k) with step = 2/k
double p = 1.0 / (1 + Math.Pow(2 * n - 1, -z)); // Values from 0.0 to 1.0
double t = sortedData[(int) Math.Truncate((n - 1) * p)]; // Quantile value, formula (2.1) in [Haynes2017]

for (int tau = 1; tau <= n; tau++)
{
partialSums[i, tau] = partialSums[i, tau - 1];
if (data[tau - 1] < t)
partialSums[i, tau] += 2; // We use doubled value (2) instead of original 1.0
if (data[tau - 1] == t)
partialSums[i, tau] += 1; // We use doubled value (1) instead of original 0.5
}
}
return partialSums;
}

/// <summary>
/// Calculates the cost of the (tau1; tau2] segment.
/// </summary>
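/// <example>
/// A worked sketch (hypothetical input): for <c>data = {1, 2, 3}</c> with <c>k = 3</c>, <c>n = 3</c>,
/// the quantile values are <c>{1, 2, 2}</c> (see <c>GetPartialSums</c>), and the full segment (0; 3]
/// gets <c>fit</c> values <c>{1/6, 1/2, 1/2}</c>, so its cost is
/// <c>2 * (-ln 5) / 3 * (3 * (1/6 * ln(1/6) + 5/6 * ln(5/6)) + 2 * 3 * ln(1/2)) ≈ 5.91</c>.
/// </example>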
private static double GetSegmentCost(int[,] partialSums, int tau1, int tau2, int k, int n)
{
double sum = 0;
for (int i = 0; i < k; i++)
{
// actualSum is (count(data[j] < t) * 2 + count(data[j] == t) * 1) for j=tau1..tau2-1
int actualSum = partialSums[i, tau2] - partialSums[i, tau1];

// We skip these two cases (they correspond to fit = 0 or fit = 1) because they would produce invalid Math.Log values
if (actualSum != 0 && actualSum != (tau2 - tau1) * 2)
{
// Empirical CDF $\hat{F}_i(t)$ (Section 2.1 "Model" in [Haynes2017])
double fit = actualSum * 0.5 / (tau2 - tau1);
// Segment cost $\mathcal{L}_{np}$ (Section 2.2 "Nonparametric maximum likelihood" in [Haynes2017])
double lnp = (tau2 - tau1) * (fit * Math.Log(fit) + (1 - fit) * Math.Log(1 - fit));
sum += lnp;
}
}
double c = -Math.Log(2 * n - 1); // Constant from Lemma 3.1 in [Haynes2017]
return 2.0 * c / k * sum; // See Section 3.1 "Discrete approximation" in [Haynes2017]
}

/// <summary>
/// Returns the index of the minimum element.
/// If there are several minimum elements in the given list, the index of the first one is returned.
/// </summary>
private static int WhichMin(IList<double> values)
{
if (values.Count == 0)
throw new InvalidOperationException("The list should contain at least one element");

double minValue = values[0];
int minIndex = 0;
for (int i = 1; i < values.Count; i++)
if (values[i] < minValue)
{
minValue = values[i];
minIndex = i;
}

return minIndex;
}
}
}
@@ -0,0 +1,99 @@
using System.Linq;
using BenchmarkDotNet.Mathematics.ChangePointDetection;
using JetBrains.Annotations;
using Xunit;

namespace BenchmarkDotNet.Tests.Mathematics.ChangePointDetection
{
public class EdPeltTests
{
[AssertionMethod]
private void Check(double[] data, int[] expectedChangePoints)
{
var actualChangePoints = EdPeltChangePointDetector.Instance.GetChangePointIndexes(data);
Assert.Equal(expectedChangePoints, actualChangePoints);
}

[Fact]
public void Test1() => Check(new double[] { 3240, 3207, 2029, 3028, 3021, 2624, 3290, 2823, 3573 }, new int[0]);

[Fact]
public void Test2() => Check(new[]
{
8.51775781386248, 1013.74746867838, 969.560811828691, 9.99903626421251,
11.3498239348095, 990.192098938082, 10.8362538563524, 11.0497172665872,
1054.23425067251, 9.66601964023216, 1037.81637817117, 8.66558370477946,
919.374881701026, 1131.1541993554, 11.2022838597658, 7.98843446730286,
8.02589313889234, 10.8204269900969, 10.3681600526862, 8.93960790455779,
9.35305979981456, 915.395965153361, 8.16535865503972, 10.2120572491007,
1066.64002362996, 11.069835770419, 9.31344240206391, 10.2348674145244,
11.6801773710108, 7.84243526689435, 9.21956447597874, 1112.57213641362,
10.6780602207311, 1079.1784141977, 8.88378695422417, 8.70650073605106,
8.77123401566279, 10.1077223378988, 9.38105207657018, 12.1034309729932,
1053.30004708716, 955.498653876947, 953.97646035295, 9.80143691452122,
987.671577491428, 8.9168418081165, 9.68107569533077, 10.0424589038011,
9.63713378148637, 932.511669228627, 916.740706556999, 972.101180041745,
1057.81026627134, 888.449133758924, 10.4439525673558, 9.91175069231749,
9.16063280158654, 1094.72129249188, 983.216157217079, 878.618768427093,
959.360944583076, 9.81400390730057, 1042.77719758456, 10.0499199152934,
8.87431073340845, 9.76856626664346, 9.54692832839134, 11.3173847481645,
10.8158661076597, 10.1762447729252, 8.59347190198671, 9.89783946138358,
10.4458247651239, 10.5958375090298, 1048.89026506509, 9.53689245159815,
10.3556711294034, 10.5111206253409, 10.9498649241074, 10.0522571754171,
10.427691547751, 1060.88753031917, 9.69076437597138, 11.724350932454,
10.9261779098129, 9.1663020020811, 11.7754734440436, 9.21696404189964,
1109.00024831966, 8.77237176835653, 11.5707985076869, 9.28229456839916,
1059.70559777354, 11.3686880467374, 10.7804164367935, 10.6682087852642,
9.81848935380367, 1117.56915403446, 10.6748380014858, 8.92628692366044,
21.0443737235031, 1105.72241345132, 19.4477654250736, 17.7821802121474,
19.4781999899639, 820.585116566889, 22.7905163513292, 1074.92528880068,
1013.45423941534, 1033.78179165301, 21.7033767555293, 21.2010125600959,
1155.11887755134, 1124.70778630606, 19.3635057382183, 20.8681411244392,
1025.46927962978, 20.9228886069588, 19.7414810417396, 17.6077812642339,
18.7826030943573, 18.4230008183603, 21.5574158030065, 20.3540972663881,
19.8951972303914, 20.5874066514834, 19.5436655021679, 871.540648869152,
1131.09301822079, 909.512157959961, 21.1625922742, 20.1996974368549,
21.4371296125597, 20.6326908527947, 19.003929460121, 22.1891456118526,
846.131237958441, 21.1737225623976, 17.1273840427487, 20.6505297811944,
21.8058704670258, 974.533374319673, 1038.80781759386, 916.525796601127,
19.5550401051096, 23.1690470680813, 21.6574852061953, 23.6718803027189,
1322.22337184262, 15.7593232457946, 1048.75496002333, 19.8561784114712,
20.1016645168086, 21.9293783943216, 1227.01980787618, 1006.39031466135,
20.8804270869218, 1105.5032121435, 18.7780711547771, 19.1795343556248,
20.442492618286, 22.4707697720047, 1078.35810096477, 833.855347209093,
20.3579023933224, 1046.39353756804, 969.025272754239, 18.265175253187,
22.3140634142563, 912.906077936653, 879.367205674359, 21.6433957676533,
20.9004212873356, 932.700450733285, 1022.41607841167, 16.3513165110277,
19.6568173805112, 20.9655767366488, 21.9780598222067, 881.348203812156,
18.2930220268125, 18.9998486812023, 22.3244524568253, 20.4548445140534,
18.720101811033, 20.0178080242701, 20.2804893326285, 1019.0048151298,
21.3107894997328, 21.3910419104093, 18.3314544873099, 18.4213348686018,
16.7293099646236, 22.4272411724304, 18.8659567857605, 19.9033226277375,
15.8084704011053, 23.5263915426487, 22.0073090762765, 21.5468281669676
}, new[] { 99 });

[Fact]
public void Test3() => Check(new double[]
{
0, 0, 0, 0, 0, 100, 100, 100, 100
}, new[] { 4 });

[Fact]
public void Test4() => Check(new double[]
{
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2
}, new[] { 5, 11 });

[Fact]
public void Test5() => Check(Enumerable.Range(1, 1000).Select(it => (double) it).ToArray(), new[]
{
15, 47, 79, 129, 204, 306, 432, 565, 691, 793, 868, 918, 950, 982
});

[Fact]
public void Test6() => Check(Enumerable.Range(1, 10000).Select(it => (double) it).ToArray(), new[]
{
26, 79, 136, 230, 387, 643, 1051, 1671, 2552, 3692, 4998, 6305, 7445, 8326, 8946, 9354, 9610, 9767, 9861, 9918, 9971
});
}
}
