Commit 430e61e
pythongh-115532: Add kernel density estimation to the statistics module
rhettinger authored and diegorusso committed Apr 17, 2024
1 parent 6b5344a commit 430e61e
Showing 5 changed files with 285 additions and 41 deletions.
Doc/library/statistics.rst (49 additions, 40 deletions)

@@ -76,6 +76,7 @@ or sample.
:func:`fmean` Fast, floating point arithmetic mean, with optional weighting.
:func:`geometric_mean` Geometric mean of data.
:func:`harmonic_mean` Harmonic mean of data.
:func:`kde` Estimate the probability density distribution of the data.
:func:`median` Median (middle value) of data.
:func:`median_low` Low median of data.
:func:`median_high` High median of data.
@@ -259,6 +260,54 @@ However, for reading convenience, most of the examples show sorted sequences.
.. versionchanged:: 3.10
   Added support for *weights*.


.. function:: kde(data, h, kernel='normal')

`Kernel Density Estimation (KDE)
<https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf>`_:
Create a continuous probability density function from discrete samples.

The basic idea is to smooth the data using `a kernel function
<https://en.wikipedia.org/wiki/Kernel_(statistics)>`_
to help draw inferences about a population from a sample.
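
In symbols, the estimated density at a point *x* is an average of
kernel values centered on each of the *n* sample points (this is
exactly what the implementation below computes):

.. math::

   \hat{f}(x) = \frac{1}{nh} \sum_{i=1}^{n} K\left(\frac{x - x_i}{h}\right)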

The degree of smoothing is controlled by the scaling parameter *h*
which is called the bandwidth. Smaller values emphasize local
features while larger values give smoother results.
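
For example, a narrower bandwidth concentrates more of the estimated
density around the individual sample points (an illustrative sketch
using the sample from the example below):

.. doctest::

   >>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
   >>> wide = kde(sample, h=1.5)
   >>> narrow = kde(sample, h=0.5)
   >>> narrow(5.1) > wide(5.1)  # 5.1 is one of the sample points
   True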

The *kernel* determines the relative weights of the sample data
points. Generally, the choice of kernel shape does not matter
as much as the more influential bandwidth smoothing parameter.

Kernels that give some weight to every sample point include
*normal* or *gauss*, *logistic*, and *sigmoid*.

Kernels that only give weight to sample points within the bandwidth
include *rectangular* or *uniform*, *triangular*, *parabolic* or
*epanechnikov*, *quartic* or *biweight*, *triweight*, and *cosine*.
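
For example, a kernel with bounded support assigns no weight to points
that lie more than the bandwidth away from every sample point, whereas
the normal kernel always assigns some weight (a brief sketch):

.. doctest::

   >>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
   >>> kde(sample, h=1.5, kernel='parabolic')(10.0)
   0.0
   >>> kde(sample, h=1.5, kernel='normal')(10.0) > 0.0
   True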

A :exc:`StatisticsError` will be raised if the *data* sequence is empty.

`Wikipedia has an example
<https://en.wikipedia.org/wiki/Kernel_density_estimation#Example>`_
where we can use :func:`kde` to generate and plot a probability
density function estimated from a small sample:

.. doctest::

   >>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
   >>> f_hat = kde(sample, h=1.5)
   >>> xarr = [i/100 for i in range(-750, 1100)]
   >>> yarr = [f_hat(x) for x in xarr]

The points in ``xarr`` and ``yarr`` can be used to make a PDF plot:

.. image:: kde_example.png
   :alt: Scatter plot of the estimated probability density function.
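
One way to render such a plot is with Matplotlib (a sketch that assumes
``matplotlib`` is installed; it is not required by :mod:`statistics`):

.. code-block:: python

   import matplotlib.pyplot as plt

   plt.scatter(xarr, yarr, s=2)                           # estimated density curve
   plt.scatter(sample, [0.0] * len(sample), marker='x')   # raw sample points
   plt.title('Estimated probability density function')
   plt.show()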

.. versionadded:: 3.13


.. function:: median(data)

Return the median (middle value) of numeric data, using the common "mean of
@@ -1095,46 +1144,6 @@ The final prediction goes to the largest posterior. This is known as the
'female'


Kernel density estimation
*************************

It is possible to estimate a continuous probability density function
from a fixed number of discrete samples.

The basic idea is to smooth the data using `a kernel function such as a
normal distribution, triangular distribution, or uniform distribution
<https://en.wikipedia.org/wiki/Kernel_(statistics)#Kernel_functions_in_common_use>`_.
The degree of smoothing is controlled by a scaling parameter, ``h``,
which is called the *bandwidth*.

.. testcode::

   def kde_normal(sample, h):
       "Create a continuous probability density function from a sample."
       # Smooth the sample with a normal distribution kernel scaled by h.
       kernel_h = NormalDist(0.0, h).pdf
       n = len(sample)
       def pdf(x):
           return sum(kernel_h(x - x_i) for x_i in sample) / n
       return pdf

`Wikipedia has an example
<https://en.wikipedia.org/wiki/Kernel_density_estimation#Example>`_
where we can use the ``kde_normal()`` recipe to generate and plot
a probability density function estimated from a small sample:

.. doctest::

   >>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
   >>> f_hat = kde_normal(sample, h=1.5)
   >>> xarr = [i/100 for i in range(-750, 1100)]
   >>> yarr = [f_hat(x) for x in xarr]

The points in ``xarr`` and ``yarr`` can be used to make a PDF plot:

.. image:: kde_example.png
   :alt: Scatter plot of the estimated probability density function.

..
   # This modeline must appear within the last ten lines of the file.
kate: indent-width 3; remove-trailing-space on; replace-tabs on; encoding utf-8;
Doc/whatsnew/3.13.rst (8 additions, 0 deletions)

@@ -476,6 +476,14 @@ sqlite3
for filtering database objects to dump.
(Contributed by Mariusz Felisiak in :gh:`91602`.)

statistics
----------

* Add :func:`statistics.kde` for kernel density estimation.
  This makes it possible to estimate a continuous probability density function
  from a fixed number of discrete samples.
  (Contributed by Raymond Hettinger in :gh:`115863`.)

subprocess
----------

Lib/statistics.py (167 additions, 1 deletion)

@@ -112,6 +112,7 @@
'fmean',
'geometric_mean',
'harmonic_mean',
'kde',
'linear_regression',
'mean',
'median',
@@ -137,7 +138,7 @@
from itertools import count, groupby, repeat
from bisect import bisect_left, bisect_right
from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum, sumprod
from math import isfinite, isinf
from math import isfinite, isinf, pi, cos, cosh
from functools import reduce
from operator import itemgetter
from collections import Counter, namedtuple, defaultdict
@@ -802,6 +803,171 @@ def multimode(data):
return [value for value, count in counts.items() if count == maxcount]


def kde(data, h, kernel='normal'):
    """Kernel Density Estimation: Create a continuous probability
    density function from discrete samples.

    The basic idea is to smooth the data using a kernel function
    to help draw inferences about a population from a sample.

    The degree of smoothing is controlled by the scaling parameter h
    which is called the bandwidth.  Smaller values emphasize local
    features while larger values give smoother results.

    The kernel determines the relative weights of the sample data
    points.  Generally, the choice of kernel shape does not matter
    as much as the more influential bandwidth smoothing parameter.

    Kernels that give some weight to every sample point:

       normal or gauss
       logistic
       sigmoid

    Kernels that only give weight to sample points within the bandwidth:

       rectangular or uniform
       triangular
       parabolic or epanechnikov
       quartic or biweight
       triweight
       cosine

    A StatisticsError will be raised if the data sequence is empty.

    Example
    -------

    Given a sample of six data points, construct a continuous
    function that estimates the underlying probability density:

        >>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
        >>> f_hat = kde(sample, h=1.5)

    Compute the area under the curve:

        >>> sum(f_hat(x) for x in range(-20, 20))
        1.0

    Plot the estimated probability density function at
    evenly spaced points from -6 to 10:

        >>> for x in range(-6, 11):
        ...     density = f_hat(x)
        ...     plot = ' ' * int(density * 400) + 'x'
        ...     print(f'{x:2}: {density:.3f} {plot}')
        ...
        -6: 0.002 x
        -5: 0.009    x
        -4: 0.031             x
        -3: 0.070                             x
        -2: 0.111                                             x
        -1: 0.125                                                   x
         0: 0.110                                            x
         1: 0.086                                   x
         2: 0.068                            x
         3: 0.059                        x
         4: 0.066                           x
         5: 0.082                                 x
         6: 0.082                                 x
         7: 0.058                        x
         8: 0.028            x
         9: 0.009    x
        10: 0.002 x

    References
    ----------

    Kernel density estimation and its application:
    https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf

    Kernel functions in common use:
    https://en.wikipedia.org/wiki/Kernel_(statistics)#Kernel_functions_in_common_use

    Interactive graphical demonstration and exploration:
    https://demonstrations.wolfram.com/KernelDensityEstimation/

    """

    n = len(data)
    if not n:
        raise StatisticsError('Empty data sequence')

    if not isinstance(data[0], (int, float)):
        raise TypeError('Data sequence must contain ints or floats')

    if h <= 0.0:
        raise StatisticsError(f'Bandwidth h must be positive, not {h=!r}')

    match kernel:

        case 'normal' | 'gauss':
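            # exp(-1/2 * t * t) / sqrt(2 * pi)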
            c = 1 / sqrt(2 * pi)
            K = lambda t: c * exp(-1/2 * t * t)
            support = None

        case 'logistic':
            # 1.0 / (exp(t) + 2.0 + exp(-t))
            K = lambda t: 1/2 / (1.0 + cosh(t))
            support = None

        case 'sigmoid':
            # (2/pi) / (exp(t) + exp(-t))
            c = 1 / pi
            K = lambda t: c / cosh(t)
            support = None

        case 'rectangular' | 'uniform':
            K = lambda t: 1/2
            support = 1.0

        case 'triangular':
            K = lambda t: 1.0 - abs(t)
            support = 1.0

        case 'parabolic' | 'epanechnikov':
            K = lambda t: 3/4 * (1.0 - t * t)
            support = 1.0

        case 'quartic' | 'biweight':
            K = lambda t: 15/16 * (1.0 - t * t) ** 2
            support = 1.0

        case 'triweight':
            K = lambda t: 35/32 * (1.0 - t * t) ** 3
            support = 1.0

        case 'cosine':
            c1 = pi / 4
            c2 = pi / 2
            K = lambda t: c1 * cos(c2 * t)
            support = 1.0

        case _:
            raise StatisticsError(f'Unknown kernel name: {kernel!r}')

    if support is None:

        def pdf(x):
            return sum(K((x - x_i) / h) for x_i in data) / (n * h)

    else:

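        # Sort the data once so that bisect can quickly locate the only
        # sample points within `support * h` of the evaluation point x.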
        sample = sorted(data)
        bandwidth = h * support

        def pdf(x):
            i = bisect_left(sample, x - bandwidth)
            j = bisect_right(sample, x + bandwidth)
            supported = sample[i : j]
            return sum(K((x - x_i) / h) for x_i in supported) / (n * h)

    pdf.__doc__ = f'PDF estimate with {kernel=!r} and {h=!r}'

    return pdf


# Notes on methods for computing quantiles
# ----------------------------------------
#
Lib/test/test_statistics.py (60 additions, 0 deletions)

@@ -2353,6 +2353,66 @@ def test_mixed_int_and_float(self):
        self.assertAlmostEqual(actual_mean, expected_mean, places=5)


class TestKDE(unittest.TestCase):

    def test_kde(self):
        kde = statistics.kde
        StatisticsError = statistics.StatisticsError

        kernels = ['normal', 'gauss', 'logistic', 'sigmoid', 'rectangular',
                   'uniform', 'triangular', 'parabolic', 'epanechnikov',
                   'quartic', 'biweight', 'triweight', 'cosine']

        sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]

        # The approximate integral of a PDF should be close to 1.0

        def integrate(func, low, high, steps=10_000):
            "Numeric approximation of a definite function integral."
            dx = (high - low) / steps
            midpoints = (low + (i + 1/2) * dx for i in range(steps))
            return sum(map(func, midpoints)) * dx

        for kernel in kernels:
            with self.subTest(kernel=kernel):
                f_hat = kde(sample, h=1.5, kernel=kernel)
                area = integrate(f_hat, -20, 20)
                self.assertAlmostEqual(area, 1.0, places=4)

        # Check error cases

        with self.assertRaises(StatisticsError):
            kde([], h=1.0)                       # Empty dataset
        with self.assertRaises(TypeError):
            kde(['abc', 'def'], 1.5)             # Non-numeric data
        with self.assertRaises(TypeError):
            kde(iter(sample), 1.5)               # Data is not a sequence
        with self.assertRaises(StatisticsError):
            kde(sample, h=0.0)                   # Zero bandwidth
        with self.assertRaises(StatisticsError):
            kde(sample, h=-1.0)                  # Negative bandwidth
        with self.assertRaises(TypeError):
            kde(sample, h='str')                 # Wrong bandwidth type
        with self.assertRaises(StatisticsError):
            kde(sample, h=1.0, kernel='bogus')   # Invalid kernel

# Test name and docstring of the generated function

h = 1.5
kernel = 'cosine'
f_hat = kde(sample, h, kernel)
self.assertEqual(f_hat.__name__, 'pdf')
self.assertIn(kernel, f_hat.__doc__)
self.assertIn(str(h), f_hat.__doc__)

# Test closed interval for the support boundaries.
# In particular, 'uniform' should non-zero at the boundaries.

f_hat = kde([0], 1.0, 'uniform')
self.assertEqual(f_hat(-1.0), 1/2)
self.assertEqual(f_hat(1.0), 1/2)


class TestQuantiles(unittest.TestCase):

    def test_specific_cases(self):
@@ -0,0 +1 @@
Add kernel density estimation to the statistics module.
