Merge pull request #3947 from grlee77/histogramdd

Add histogram2d and histogramdd
cupy · Sep 14, 2020 · 12ebcf5 · 12ebcf5
2 parents ac2f798 + 7aaff5f
commit 12ebcf5
Show file tree

Hide file tree

Showing 8 changed files with 343 additions and 10 deletions.
diff --git a/cupy/__init__.py b/cupy/__init__.py
@@ -721,6 +721,8 @@ def isscalar(element):
 from cupy._statistics.histogram import bincount  # NOQA
 from cupy._statistics.histogram import digitize  # NOQA
 from cupy._statistics.histogram import histogram  # NOQA
+from cupy._statistics.histogram import histogram2d  # NOQA
+from cupy._statistics.histogram import histogramdd  # NOQA
 
 # -----------------------------------------------------------------------------
 # Undocumented functions

diff --git a/cupy/_statistics/histogram.py b/cupy/_statistics/histogram.py
@@ -9,6 +9,9 @@
 from cupy.cuda import cub
 from cupy.cuda import common
 
+# rename builtin range for use in functions that take a range argument
+_range = range
+
 
 # TODO(unno): use searchsorted
 _histogram_kernel = core.ElementwiseKernel(
@@ -60,14 +63,14 @@ def _ravel_and_check_weights(a, weights):
 
     # Ensure that the array is a "subtractable" dtype
     if a.dtype == cupy.bool_:
-        warnings.warn("Converting input from {} to {} for compatibility."
+        warnings.warn('Converting input from {} to {} for compatibility.'
                       .format(a.dtype, cupy.uint8),
                       RuntimeWarning, stacklevel=3)
         a = a.astype(cupy.uint8)
 
     if weights is not None:
         if not isinstance(weights, cupy.ndarray):
-            raise ValueError("weights must be a cupy.ndarray")
+            raise ValueError('weights must be a cupy.ndarray')
         if weights.shape != a.shape:
             raise ValueError(
                 'weights should have the same shape as a.')
@@ -88,7 +91,7 @@ def _get_outer_edges(a, range):
                 'max must be larger than min in range parameter.')
         if not (numpy.isfinite(first_edge) and numpy.isfinite(last_edge)):
             raise ValueError(
-                "supplied range of [{}, {}] is not finite".format(
+                'supplied range of [{}, {}] is not finite'.format(
                     first_edge, last_edge))
     elif a.size == 0:
         first_edge = 0.0
@@ -98,7 +101,7 @@ def _get_outer_edges(a, range):
         last_edge = float(a.max())
         if not (cupy.isfinite(first_edge) and cupy.isfinite(last_edge)):
             raise ValueError(
-                "autodetected range of [{}, {}] is not finite".format(
+                'autodetected range of [{}, {}] is not finite'.format(
                     first_edge, last_edge))
 
     # expand empty range to avoid divide by zero
@@ -127,7 +130,7 @@ def _get_bin_edges(a, bins, range):
 
     if isinstance(bins, str):
         raise NotImplementedError(
-            "only integer and array bins are implemented")
+            'only integer and array bins are implemented')
     elif isinstance(bins, cupy.ndarray) or numpy.ndim(bins) == 1:
         # TODO(okuta): After #3060 is merged, `if cupy.ndim(bins) == 1:`.
         if isinstance(bins, cupy.ndarray):
@@ -206,7 +209,7 @@ def histogram(x, bins=10, range=None, weights=None, density=False):
         raise NotImplementedError('complex number is not supported')
 
     if not isinstance(x, cupy.ndarray):
-        raise ValueError("x must be a cupy.ndarray")
+        raise ValueError('x must be a cupy.ndarray')
 
     x, weights = _ravel_and_check_weights(x, weights)
     bin_edges = _get_bin_edges(x, bins, range)
@@ -249,8 +252,8 @@ def histogram(x, bins=10, range=None, weights=None, density=False):
         if not simple_weights:
             # object dtype such as Decimal are supported in NumPy, but not here
             raise NotImplementedError(
-                "only weights with dtype that can be cast to float or complex "
-                "are supported")
+                'only weights with dtype that can be cast to float or complex '
+                'are supported')
         if weights.dtype.kind == 'c':
             y = cupy.zeros(bin_edges.size - 1, dtype=cupy.complex128)
             _weighted_histogram_kernel(
@@ -271,10 +274,213 @@ def histogram(x, bins=10, range=None, weights=None, density=False):
     return y, bin_edges
 
 
-# TODO(okuta): Implement histogram2d
+def histogramdd(sample, bins=10, range=None, weights=None, density=False):
+    """Compute the multidimensional histogram of some data.
+
+    Args:
+        sample (cupy.ndarray): The data to be histogrammed. (N, D) or (D, N)
+            array
+
+            Note the unusual interpretation of sample when an array_like:
+
+            * When an array, each row is a coordinate in a D-dimensional
+              space - such as ``histogramdd(cupy.array([p1, p2, p3]))``.
+            * When an array_like, each element is the list of values for single
+              coordinate - such as ``histogramdd((X, Y, Z))``.
+
+            The first form should be preferred.
+        bins (int or tuple of int or cupy.ndarray): The bin specification:
+
+            * A sequence of arrays describing the monotonically increasing bin
+              edges along each dimension.
+            * The number of bins for each dimension (nx, ny, ... =bins)
+            * The number of bins for all dimensions (nx=ny=...=bins).
+        range (sequence, optional): A sequence of length D, each an optional
+            (lower, upper) tuple giving the outer bin edges to be used if the
+            edges are not given explicitly in `bins`. An entry of None in the
+            sequence results in the minimum and maximum values being used for
+            the corresponding dimension. The default, None, is equivalent to
+            passing a tuple of D None values.
+        weights (cupy.ndarray): An array of values `w_i` weighing each sample
+            `(x_i, y_i, z_i, ...)`. The values of the returned histogram are
+            equal to the sum of the weights belonging to the samples falling
+            into each bin.
+        density (bool, optional): If False, the default, returns the number of
+            samples in each bin. If True, returns the probability *density*
+            function at the bin, ``bin_count / sample_count / bin_volume``.
+
+    Returns:
+        H (cupy.ndarray): The multidimensional histogram of sample x. See
+            normed and weights for the different possible semantics.
+        edges (list of cupy.ndarray): A list of D arrays describing the bin
+            edges for each dimension.
+
+    .. warning::
+
+        This function may synchronize the device.
+
+    .. seealso:: :func:`numpy.histogramdd`
+    """
+    if isinstance(sample, cupy.ndarray):
+        # Sample is an ND-array.
+        if sample.ndim == 1:
+            sample = sample[:, cupy.newaxis]
+        nsamples, ndim = sample.shape
+    else:
+        sample = cupy.stack(sample, axis=-1)
+        nsamples, ndim = sample.shape
+
+    nbin = numpy.empty(ndim, int)
+    edges = ndim * [None]
+    dedges = ndim * [None]
+    if weights is not None:
+        weights = cupy.asarray(weights)
+
+    try:
+        nbins = len(bins)
+        if nbins != ndim:
+            raise ValueError(
+                'The dimension of bins must be equal to the dimension of the '
+                ' sample x.'
+            )
+    except TypeError:
+        # bins is an integer
+        bins = ndim * [bins]
+
+    # normalize the range argument
+    if range is None:
+        range = (None,) * ndim
+    elif len(range) != ndim:
+        raise ValueError('range argument must have one entry per dimension')
+
+    # Create edge arrays
+    for i in _range(ndim):
+        if cupy.ndim(bins[i]) == 0:
+            if bins[i] < 1:
+                raise ValueError(
+                    '`bins[{}]` must be positive, when an integer'.format(i)
+                )
+            smin, smax = _get_outer_edges(sample[:, i], range[i])
+            num = int(bins[i] + 1)  # synchronize!
+            edges[i] = cupy.linspace(smin, smax, num)
+        elif cupy.ndim(bins[i]) == 1:
+            if not isinstance(bins[i], cupy.ndarray):
+                raise ValueError('array-like bins not supported')
+            edges[i] = bins[i]
+            if (edges[i][:-1] > edges[i][1:]).any():  # synchronize!
+                raise ValueError(
+                    '`bins[{}]` must be monotonically increasing, when an '
+                    'array'.format(i)
+                )
+        else:
+            raise ValueError(
+                '`bins[{}]` must be a scalar or 1d array'.format(i)
+            )
+
+        nbin[i] = len(edges[i]) + 1  # includes an outlier on each end
+        dedges[i] = cupy.diff(edges[i])
+
+    # Compute the bin number each sample falls into.
+    ncount = tuple(
+        # avoid cupy.digitize to work around NumPy issue gh-11022
+        cupy.searchsorted(edges[i], sample[:, i], side='right')
+        for i in _range(ndim)
+    )
+
+    # Using digitize, values that fall on an edge are put in the right bin.
+    # For the rightmost bin, we want values equal to the right edge to be
+    # counted in the last bin, and not as an outlier.
+    for i in _range(ndim):
+        # Find which points are on the rightmost edge.
+        on_edge = sample[:, i] == edges[i][-1]
+        # Shift these points one bin to the left.
+        ncount[i][on_edge] -= 1
+
+    # Compute the sample indices in the flattened histogram matrix.
+    # This raises an error if the array is too large.
+    xy = cupy.ravel_multi_index(ncount, nbin)
+
+    # Compute the number of repetitions in xy and assign it to the
+    # flattened histmat.
+    hist = cupy.bincount(xy, weights, minlength=numpy.prod(nbin))
+
+    # Shape into a proper matrix
+    hist = hist.reshape(nbin)
+
+    # This preserves the (bad) behavior observed in NumPy gh-7845, for now.
+    hist = hist.astype(float)  # Note: NumPy uses casting='safe' here too
+
+    # Remove outliers (indices 0 and -1 for each dimension).
+    core = ndim * (slice(1, -1),)
+    hist = hist[core]
 
+    if density:
+        # calculate the probability density function
+        s = hist.sum()
+        for i in _range(ndim):
+            shape = [1] * ndim
+            shape[i] = nbin[i] - 2
+            hist = hist / dedges[i].reshape(shape)
+        hist /= s
+
+    if any(hist.shape != numpy.asarray(nbin) - 2):
+        raise RuntimeError('Internal Shape Error')
+    return hist, edges
+
+
+def histogram2d(x, y, bins=10, range=None, weights=None, density=None):
+    """Compute the bi-dimensional histogram of two data samples.
+
+    Args:
+        x (cupy.ndarray): The first array of samples to be histogrammed.
+        y (cupy.ndarray): The second array of samples to be histogrammed.
+        bins (int or tuple of int or cupy.ndarray): The bin specification:
+
+            * A sequence of arrays describing the monotonically increasing bin
+              edges along each dimension.
+            * The number of bins for each dimension (nx, ny)
+            * The number of bins for all dimensions (nx=ny=bins).
+        range (sequence, optional): A sequence of length two, each an optional
+            (lower, upper) tuple giving the outer bin edges to be used if the
+            edges are not given explicitly in `bins`. An entry of None in the
+            sequence results in the minimum and maximum values being used for
+            the corresponding dimension. The default, None, is equivalent to
+            passing a tuple of two None values.
+        weights (cupy.ndarray): An array of values `w_i` weighing each sample
+            `(x_i, y_i)`. The values of the returned histogram are equal to the
+            sum of the weights belonging to the samples falling into each bin.
+        density (bool, optional): If False, the default, returns the number of
+            samples in each bin. If True, returns the probability *density*
+            function at the bin, ``bin_count / sample_count / bin_volume``.
+
+    Returns:
+        H (cupy.ndarray): The multidimensional histogram of sample x. See
+            normed and weights for the different possible semantics.
+        edges0 (tuple of cupy.ndarray): A list of D arrays describing the bin
+            edges for the first dimension.
+        edges1 (tuple of cupy.ndarray): A list of D arrays describing the bin
+            edges for the second dimension.
+
+    .. warning::
+
+        This function may synchronize the device.
+
+    .. seealso:: :func:`numpy.histogram2d`
+    """
+    try:
+        n = len(bins)
+    except TypeError:
+        n = 1
+
+    if n != 1 and n != 2:
+        if isinstance(bins, cupy.ndarray):
+            xedges = yedges = bins
+            bins = [xedges, yedges]
+        else:
+            raise ValueError('array-like bins not supported in CuPy')
 
-# TODO(okuta): Implement histogramdd
+    hist, edges = histogramdd([x, y], bins, range, weights, density)
+    return hist, edges[0], edges[1]
 
 
 _bincount_kernel = core.ElementwiseKernel(

diff --git a/docs/source/reference/statistics.rst b/docs/source/reference/statistics.rst
@@ -15,6 +15,7 @@ Order statistics
    cupy.nanmin
    cupy.nanmax
    cupy.percentile
+   cupy.ptp
 
 
 Means and variances
@@ -41,7 +42,10 @@ Histograms
    :nosignatures:
 
    cupy.histogram
+   cupy.histogram2d
+   cupy.histogramdd
    cupy.bincount
+   cupy.digitize
 
 
 Correlations

diff --git a/tests/cupy_tests/statics_tests/__init__.py → ...s/cupy_tests/statistics_tests/__init__.py b/tests/cupy_tests/statics_tests/__init__.py → ...s/cupy_tests/statistics_tests/__init__.py
diff --git a/...y_tests/statics_tests/test_correlation.py → ...ests/statistics_tests/test_correlation.py b/...y_tests/statics_tests/test_correlation.py → ...ests/statistics_tests/test_correlation.py