Skip to content

Commit

Permalink
Merge pull request #3947 from grlee77/histogramdd
Browse files Browse the repository at this point in the history
Add histogram2d and histogramdd
  • Loading branch information
mergify[bot] committed Sep 14, 2020
2 parents ac2f798 + 7aaff5f commit 12ebcf5
Show file tree
Hide file tree
Showing 8 changed files with 343 additions and 10 deletions.
2 changes: 2 additions & 0 deletions cupy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -721,6 +721,8 @@ def isscalar(element):
from cupy._statistics.histogram import bincount # NOQA
from cupy._statistics.histogram import digitize # NOQA
from cupy._statistics.histogram import histogram # NOQA
from cupy._statistics.histogram import histogram2d # NOQA
from cupy._statistics.histogram import histogramdd # NOQA

# -----------------------------------------------------------------------------
# Undocumented functions
Expand Down
226 changes: 216 additions & 10 deletions cupy/_statistics/histogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
from cupy.cuda import cub
from cupy.cuda import common

# rename builtin range for use in functions that take a range argument
_range = range


# TODO(unno): use searchsorted
_histogram_kernel = core.ElementwiseKernel(
Expand Down Expand Up @@ -60,14 +63,14 @@ def _ravel_and_check_weights(a, weights):

# Ensure that the array is a "subtractable" dtype
if a.dtype == cupy.bool_:
warnings.warn("Converting input from {} to {} for compatibility."
warnings.warn('Converting input from {} to {} for compatibility.'
.format(a.dtype, cupy.uint8),
RuntimeWarning, stacklevel=3)
a = a.astype(cupy.uint8)

if weights is not None:
if not isinstance(weights, cupy.ndarray):
raise ValueError("weights must be a cupy.ndarray")
raise ValueError('weights must be a cupy.ndarray')
if weights.shape != a.shape:
raise ValueError(
'weights should have the same shape as a.')
Expand All @@ -88,7 +91,7 @@ def _get_outer_edges(a, range):
'max must be larger than min in range parameter.')
if not (numpy.isfinite(first_edge) and numpy.isfinite(last_edge)):
raise ValueError(
"supplied range of [{}, {}] is not finite".format(
'supplied range of [{}, {}] is not finite'.format(
first_edge, last_edge))
elif a.size == 0:
first_edge = 0.0
Expand All @@ -98,7 +101,7 @@ def _get_outer_edges(a, range):
last_edge = float(a.max())
if not (cupy.isfinite(first_edge) and cupy.isfinite(last_edge)):
raise ValueError(
"autodetected range of [{}, {}] is not finite".format(
'autodetected range of [{}, {}] is not finite'.format(
first_edge, last_edge))

# expand empty range to avoid divide by zero
Expand Down Expand Up @@ -127,7 +130,7 @@ def _get_bin_edges(a, bins, range):

if isinstance(bins, str):
raise NotImplementedError(
"only integer and array bins are implemented")
'only integer and array bins are implemented')
elif isinstance(bins, cupy.ndarray) or numpy.ndim(bins) == 1:
# TODO(okuta): After #3060 is merged, `if cupy.ndim(bins) == 1:`.
if isinstance(bins, cupy.ndarray):
Expand Down Expand Up @@ -206,7 +209,7 @@ def histogram(x, bins=10, range=None, weights=None, density=False):
raise NotImplementedError('complex number is not supported')

if not isinstance(x, cupy.ndarray):
raise ValueError("x must be a cupy.ndarray")
raise ValueError('x must be a cupy.ndarray')

x, weights = _ravel_and_check_weights(x, weights)
bin_edges = _get_bin_edges(x, bins, range)
Expand Down Expand Up @@ -249,8 +252,8 @@ def histogram(x, bins=10, range=None, weights=None, density=False):
if not simple_weights:
# object dtype such as Decimal are supported in NumPy, but not here
raise NotImplementedError(
"only weights with dtype that can be cast to float or complex "
"are supported")
'only weights with dtype that can be cast to float or complex '
'are supported')
if weights.dtype.kind == 'c':
y = cupy.zeros(bin_edges.size - 1, dtype=cupy.complex128)
_weighted_histogram_kernel(
Expand All @@ -271,10 +274,213 @@ def histogram(x, bins=10, range=None, weights=None, density=False):
return y, bin_edges


# TODO(okuta): Implement histogram2d
def histogramdd(sample, bins=10, range=None, weights=None, density=False):
"""Compute the multidimensional histogram of some data.
Args:
sample (cupy.ndarray): The data to be histogrammed. (N, D) or (D, N)
array
Note the unusual interpretation of sample when an array_like:
* When an array, each row is a coordinate in a D-dimensional
space - such as ``histogramdd(cupy.array([p1, p2, p3]))``.
* When an array_like, each element is the list of values for single
coordinate - such as ``histogramdd((X, Y, Z))``.
The first form should be preferred.
bins (int or tuple of int or cupy.ndarray): The bin specification:
* A sequence of arrays describing the monotonically increasing bin
edges along each dimension.
* The number of bins for each dimension (nx, ny, ... =bins)
* The number of bins for all dimensions (nx=ny=...=bins).
range (sequence, optional): A sequence of length D, each an optional
(lower, upper) tuple giving the outer bin edges to be used if the
edges are not given explicitly in `bins`. An entry of None in the
sequence results in the minimum and maximum values being used for
the corresponding dimension. The default, None, is equivalent to
passing a tuple of D None values.
weights (cupy.ndarray): An array of values `w_i` weighing each sample
`(x_i, y_i, z_i, ...)`. The values of the returned histogram are
equal to the sum of the weights belonging to the samples falling
into each bin.
density (bool, optional): If False, the default, returns the number of
samples in each bin. If True, returns the probability *density*
function at the bin, ``bin_count / sample_count / bin_volume``.
Returns:
H (cupy.ndarray): The multidimensional histogram of sample x. See
normed and weights for the different possible semantics.
edges (list of cupy.ndarray): A list of D arrays describing the bin
edges for each dimension.
.. warning::
This function may synchronize the device.
.. seealso:: :func:`numpy.histogramdd`
"""
if isinstance(sample, cupy.ndarray):
# Sample is an ND-array.
if sample.ndim == 1:
sample = sample[:, cupy.newaxis]
nsamples, ndim = sample.shape
else:
sample = cupy.stack(sample, axis=-1)
nsamples, ndim = sample.shape

nbin = numpy.empty(ndim, int)
edges = ndim * [None]
dedges = ndim * [None]
if weights is not None:
weights = cupy.asarray(weights)

try:
nbins = len(bins)
if nbins != ndim:
raise ValueError(
'The dimension of bins must be equal to the dimension of the '
' sample x.'
)
except TypeError:
# bins is an integer
bins = ndim * [bins]

# normalize the range argument
if range is None:
range = (None,) * ndim
elif len(range) != ndim:
raise ValueError('range argument must have one entry per dimension')

# Create edge arrays
for i in _range(ndim):
if cupy.ndim(bins[i]) == 0:
if bins[i] < 1:
raise ValueError(
'`bins[{}]` must be positive, when an integer'.format(i)
)
smin, smax = _get_outer_edges(sample[:, i], range[i])
num = int(bins[i] + 1) # synchronize!
edges[i] = cupy.linspace(smin, smax, num)
elif cupy.ndim(bins[i]) == 1:
if not isinstance(bins[i], cupy.ndarray):
raise ValueError('array-like bins not supported')
edges[i] = bins[i]
if (edges[i][:-1] > edges[i][1:]).any(): # synchronize!
raise ValueError(
'`bins[{}]` must be monotonically increasing, when an '
'array'.format(i)
)
else:
raise ValueError(
'`bins[{}]` must be a scalar or 1d array'.format(i)
)

nbin[i] = len(edges[i]) + 1 # includes an outlier on each end
dedges[i] = cupy.diff(edges[i])

# Compute the bin number each sample falls into.
ncount = tuple(
# avoid cupy.digitize to work around NumPy issue gh-11022
cupy.searchsorted(edges[i], sample[:, i], side='right')
for i in _range(ndim)
)

# Using digitize, values that fall on an edge are put in the right bin.
# For the rightmost bin, we want values equal to the right edge to be
# counted in the last bin, and not as an outlier.
for i in _range(ndim):
# Find which points are on the rightmost edge.
on_edge = sample[:, i] == edges[i][-1]
# Shift these points one bin to the left.
ncount[i][on_edge] -= 1

# Compute the sample indices in the flattened histogram matrix.
# This raises an error if the array is too large.
xy = cupy.ravel_multi_index(ncount, nbin)

# Compute the number of repetitions in xy and assign it to the
# flattened histmat.
hist = cupy.bincount(xy, weights, minlength=numpy.prod(nbin))

# Shape into a proper matrix
hist = hist.reshape(nbin)

# This preserves the (bad) behavior observed in NumPy gh-7845, for now.
hist = hist.astype(float) # Note: NumPy uses casting='safe' here too

# Remove outliers (indices 0 and -1 for each dimension).
core = ndim * (slice(1, -1),)
hist = hist[core]

if density:
# calculate the probability density function
s = hist.sum()
for i in _range(ndim):
shape = [1] * ndim
shape[i] = nbin[i] - 2
hist = hist / dedges[i].reshape(shape)
hist /= s

if any(hist.shape != numpy.asarray(nbin) - 2):
raise RuntimeError('Internal Shape Error')
return hist, edges


def histogram2d(x, y, bins=10, range=None, weights=None, density=None):
"""Compute the bi-dimensional histogram of two data samples.
Args:
x (cupy.ndarray): The first array of samples to be histogrammed.
y (cupy.ndarray): The second array of samples to be histogrammed.
bins (int or tuple of int or cupy.ndarray): The bin specification:
* A sequence of arrays describing the monotonically increasing bin
edges along each dimension.
* The number of bins for each dimension (nx, ny)
* The number of bins for all dimensions (nx=ny=bins).
range (sequence, optional): A sequence of length two, each an optional
(lower, upper) tuple giving the outer bin edges to be used if the
edges are not given explicitly in `bins`. An entry of None in the
sequence results in the minimum and maximum values being used for
the corresponding dimension. The default, None, is equivalent to
passing a tuple of two None values.
weights (cupy.ndarray): An array of values `w_i` weighing each sample
`(x_i, y_i)`. The values of the returned histogram are equal to the
sum of the weights belonging to the samples falling into each bin.
density (bool, optional): If False, the default, returns the number of
samples in each bin. If True, returns the probability *density*
function at the bin, ``bin_count / sample_count / bin_volume``.
Returns:
H (cupy.ndarray): The multidimensional histogram of sample x. See
normed and weights for the different possible semantics.
edges0 (tuple of cupy.ndarray): A list of D arrays describing the bin
edges for the first dimension.
edges1 (tuple of cupy.ndarray): A list of D arrays describing the bin
edges for the second dimension.
.. warning::
This function may synchronize the device.
.. seealso:: :func:`numpy.histogram2d`
"""
try:
n = len(bins)
except TypeError:
n = 1

if n != 1 and n != 2:
if isinstance(bins, cupy.ndarray):
xedges = yedges = bins
bins = [xedges, yedges]
else:
raise ValueError('array-like bins not supported in CuPy')

# TODO(okuta): Implement histogramdd
hist, edges = histogramdd([x, y], bins, range, weights, density)
return hist, edges[0], edges[1]


_bincount_kernel = core.ElementwiseKernel(
Expand Down
4 changes: 4 additions & 0 deletions docs/source/reference/statistics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Order statistics
cupy.nanmin
cupy.nanmax
cupy.percentile
cupy.ptp


Means and variances
Expand All @@ -41,7 +42,10 @@ Histograms
:nosignatures:

cupy.histogram
cupy.histogram2d
cupy.histogramdd
cupy.bincount
cupy.digitize


Correlations
Expand Down
File renamed without changes.

0 comments on commit 12ebcf5

Please sign in to comment.