/
histogram.py
604 lines (506 loc) · 21.9 KB
/
histogram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
import operator
import warnings
import numpy
import cupy
from cupy import _core
from cupy._core import _accelerator
from cupy.cuda import cub
from cupy.cuda import common
from cupy.cuda import runtime
# Keep a handle on the builtin ``range``: several functions in this module
# take a parameter that is itself named ``range`` and would otherwise shadow
# the builtin inside their bodies.
_range = range
# TODO(unno): use searchsorted
# Unweighted histogram kernel.
#
# For each sample ``x`` the kernel:
#   1. returns without counting when ``x`` lies below ``bins[0]`` or above
#      ``bins[n_bins - 1]``;
#   2. binary-searches the edge array for the index ``low`` such that
#      ``bins[low] <= x`` and ``high - low == 1``.  Because the test is
#      ``bins[mid] <= x``, a sample equal to the last edge ends up with
#      ``low == n_bins - 2``, i.e. the last bin is closed on the right;
#   3. atomically adds one to ``y[low]``.
#
# ``bins`` and ``y`` are declared ``raw`` so the kernel body indexes them
# itself; ``n_bins`` is the number of edges, not the number of bins.
#
# NOTE(review): ``y[low]`` is written with ``low == 0`` whenever the loop does
# not run, so this presumably expects ``n_bins >= 2`` -- confirm that callers
# guarantee it.
# NOTE(review): a NaN sample would presumably fail both range comparisons and
# fall through to ``y[0]`` -- confirm whether that is intended.
_histogram_kernel = _core.ElementwiseKernel(
    'S x, raw T bins, int32 n_bins',
    'raw U y',
    '''
if (x < bins[0] or bins[n_bins - 1] < x) {
return;
}
int high = n_bins - 1;
int low = 0;
while (high - low > 1) {
int mid = (high + low) / 2;
if (bins[mid] <= x) {
low = mid;
} else {
high = mid;
}
}
atomicAdd(&y[low], U(1));
''',
    'cupy_histogram_kernel')
# Weighted histogram kernel.
#
# Same range check and binary search as the unweighted kernel, but instead of
# adding one it atomically adds ``weights[i]`` (cast to the output type ``Y``)
# to ``y[low]``, where ``i`` is the index of the current sample.  ``weights``
# is declared ``raw`` and is indexed explicitly with ``i`` in the kernel body.
#
# NOTE(review): as with the unweighted kernel, ``y[low]`` is written with
# ``low == 0`` when the loop does not run, so this presumably expects
# ``n_bins >= 2`` -- confirm that callers guarantee it.
_weighted_histogram_kernel = _core.ElementwiseKernel(
    'S x, raw T bins, int32 n_bins, raw W weights',
    'raw Y y',
    '''
if (x < bins[0] or bins[n_bins - 1] < x) {
return;
}
int high = n_bins - 1;
int low = 0;
while (high - low > 1) {
int mid = (high + low) / 2;
if (bins[mid] <= x) {
low = mid;
} else {
high = mid;
}
}
atomicAdd(&y[low], (Y)weights[i]);
''',
    'cupy_weighted_histogram_kernel')
def _ravel_and_check_weights(a, weights):
    """Validate ``weights`` against ``a`` and flatten both.

    Args:
        a (cupy.ndarray): Input data. Boolean data is converted to ``uint8``
            (with a ``RuntimeWarning``) so that it can be subtracted.
        weights (cupy.ndarray or None): Optional weights with the same shape
            as ``a``.

    Returns:
        tuple: ``(a, weights)`` where ``a`` is ravelled and ``weights`` is
        either ravelled or ``None``.

    Raises:
        ValueError: If ``weights`` is not a :class:`cupy.ndarray` or its shape
            differs from that of ``a``.
    """
    # Booleans are not a "subtractable" dtype; promote them first.
    if a.dtype == cupy.bool_:
        message = 'Converting input from {} to {} for compatibility.'.format(
            a.dtype, cupy.uint8)
        warnings.warn(message, RuntimeWarning, stacklevel=3)
        a = a.astype(cupy.uint8)

    flat_weights = weights
    if weights is not None:
        if not isinstance(weights, cupy.ndarray):
            raise ValueError('weights must be a cupy.ndarray')
        if weights.shape != a.shape:
            raise ValueError(
                'weights should have the same shape as a.')
        flat_weights = weights.ravel()

    return a.ravel(), flat_weights
def _get_outer_edges(a, range):
"""
Determine the outer bin edges to use, from either the data or the range
argument
"""
if range is not None:
first_edge, last_edge = range
if first_edge > last_edge:
raise ValueError(
'max must be larger than min in range parameter.')
if not (numpy.isfinite(first_edge) and numpy.isfinite(last_edge)):
raise ValueError(
'supplied range of [{}, {}] is not finite'.format(
first_edge, last_edge))
elif a.size == 0:
first_edge = 0.0
last_edge = 1.0
else:
first_edge = float(a.min())
last_edge = float(a.max())
if not (cupy.isfinite(first_edge) and cupy.isfinite(last_edge)):
raise ValueError(
'autodetected range of [{}, {}] is not finite'.format(
first_edge, last_edge))
# expand empty range to avoid divide by zero
if first_edge == last_edge:
first_edge = first_edge - 0.5
last_edge = last_edge + 0.5
return first_edge, last_edge
def _get_bin_edges(a, bins, range):
"""
Computes the bins used internally by `histogram`.
Args:
a (ndarray): Ravelled data array
bins (int or ndarray): Forwarded argument from `histogram`.
range (None or tuple): Forwarded argument from `histogram`.
Returns:
bin_edges (ndarray): Array of bin edges
"""
# parse the overloaded bins argument
n_equal_bins = None
bin_edges = None
if isinstance(bins, str):
raise NotImplementedError(
'only integer and array bins are implemented')
elif isinstance(bins, cupy.ndarray) or numpy.ndim(bins) == 1:
# TODO(okuta): After #3060 is merged, `if cupy.ndim(bins) == 1:`.
if isinstance(bins, cupy.ndarray):
bin_edges = bins
else:
bin_edges = numpy.asarray(bins)
if (bin_edges[:-1] > bin_edges[1:]).any(): # synchronize! when CuPy
raise ValueError(
'`bins` must increase monotonically, when an array')
if isinstance(bin_edges, numpy.ndarray):
bin_edges = cupy.asarray(bin_edges)
elif numpy.ndim(bins) == 0:
try:
n_equal_bins = operator.index(bins)
except TypeError:
raise TypeError(
'`bins` must be an integer, a string, or an array')
if n_equal_bins < 1:
raise ValueError('`bins` must be positive, when an integer')
first_edge, last_edge = _get_outer_edges(a, range)
else:
raise ValueError('`bins` must be 1d, when an array')
if n_equal_bins is not None:
# numpy's gh-10322 means that type resolution rules are dependent on
# array shapes. To avoid this causing problems, we pick a type now and
# stick with it throughout.
bin_type = cupy.result_type(first_edge, last_edge, a)
if cupy.issubdtype(bin_type, cupy.integer):
bin_type = cupy.result_type(bin_type, float)
# bin edges must be computed
bin_edges = cupy.linspace(
first_edge, last_edge, n_equal_bins + 1,
endpoint=True, dtype=bin_type)
return bin_edges
def histogram(x, bins=10, range=None, weights=None, density=False):
    """Computes the histogram of a set of data.

    Args:
        x (cupy.ndarray): Input array.
        bins (int or cupy.ndarray): If ``bins`` is an int, it represents the
            number of bins. If ``bins`` is an :class:`~cupy.ndarray`, it
            represents the bin edges. String bin specifications are not
            implemented.
        range (2-tuple of float, optional): The lower and upper range of the
            bins. If not provided, range is simply ``(x.min(), x.max())``.
            Values outside the range are ignored. The first element of the
            range must be less than or equal to the second.
        density (bool, optional): If False, the default, returns the number of
            samples in each bin. If True, returns the probability *density*
            function at the bin, ``bin_count / sample_count / bin_volume``.
        weights (cupy.ndarray, optional): An array of weights, of the same
            shape as `x`.  Each value in `x` only contributes its associated
            weight towards the bin count (instead of 1).

    Returns:
        tuple: ``(hist, bin_edges)`` where ``hist`` is a :class:`cupy.ndarray`
        storing the values of the histogram, and ``bin_edges`` is a
        :class:`cupy.ndarray` storing the bin edges.

    Raises:
        ValueError: If ``x`` is not a :class:`cupy.ndarray`.
        NotImplementedError: If ``x`` is complex, or if ``weights`` has a
            dtype that can be cast neither to float nor to complex.

    .. warning::

        This function may synchronize the device.

    .. seealso:: :func:`numpy.histogram`
    """
    # Check the type first: reading ``x.dtype`` on something that is not an
    # array would otherwise fail with an unrelated AttributeError.
    if not isinstance(x, cupy.ndarray):
        raise ValueError('x must be a cupy.ndarray')

    if x.dtype.kind == 'c':
        # TODO(unno): comparison between complex numbers is not implemented
        raise NotImplementedError('complex number is not supported')

    x, weights = _ravel_and_check_weights(x, weights)
    bin_edges = _get_bin_edges(x, bins, range)

    if weights is None:
        y = cupy.zeros(bin_edges.size - 1, dtype=cupy.int64)
        for accelerator in _accelerator.get_routine_accelerators():
            # CUB uses int for bin counts
            # TODO(leofang): support >= 2^31 elements in x?
            if (accelerator == _accelerator.ACCELERATOR_CUB
                    and x.size <= 0x7fffffff
                    and bin_edges.size <= 0x7fffffff):
                # Need to ensure the dtype of bin_edges as it's needed for both
                # the CUB call and the correction later
                assert isinstance(bin_edges, cupy.ndarray)
                if numpy.issubdtype(x.dtype, numpy.integer):
                    bin_type = float
                else:
                    bin_type = numpy.result_type(bin_edges.dtype, x.dtype)
                    if (bin_type == numpy.float16 and
                            not common._is_fp16_supported()):
                        bin_type = numpy.float32
                    x = x.astype(bin_type, copy=False)
                # Work on a copy so the edges returned to the caller are not
                # affected by the adjustment below.
                acc_bin_edge = bin_edges.astype(bin_type, copy=True)
                # CUB's upper bin boundary is exclusive for all bins, including
                # the last bin, so we must shift it to comply with NumPy
                if x.dtype.kind in 'ui':
                    acc_bin_edge[-1] += 1
                elif x.dtype.kind == 'f':
                    last = acc_bin_edge[-1]
                    acc_bin_edge[-1] = cupy.nextafter(last, last + 1)
                if runtime.is_hip:
                    y = y.astype(cupy.uint64, copy=False)
                y = cub.device_histogram(x, y, acc_bin_edge)
                if runtime.is_hip:
                    y = y.astype(cupy.int64, copy=False)
                break
        else:
            # No accelerator took the job: use the generic kernel.
            _histogram_kernel(x, bin_edges, bin_edges.size, y)
    else:
        simple_weights = (
            cupy.can_cast(weights.dtype, cupy.float64) or
            cupy.can_cast(weights.dtype, cupy.complex128)
        )
        if not simple_weights:
            # object dtype such as Decimal are supported in NumPy, but not here
            raise NotImplementedError(
                'only weights with dtype that can be cast to float or complex '
                'are supported')
        if weights.dtype.kind == 'c':
            # Accumulate the real and the imaginary parts separately.
            y = cupy.zeros(bin_edges.size - 1, dtype=cupy.complex128)
            _weighted_histogram_kernel(
                x, bin_edges, bin_edges.size, weights.real, y.real)
            _weighted_histogram_kernel(
                x, bin_edges, bin_edges.size, weights.imag, y.imag)
        else:
            if weights.dtype.kind in 'bui':
                y = cupy.zeros(bin_edges.size - 1, dtype=int)
            else:
                y = cupy.zeros(bin_edges.size - 1, dtype=cupy.float64)
            _weighted_histogram_kernel(
                x, bin_edges, bin_edges.size, weights, y)

    if density:
        db = cupy.array(cupy.diff(bin_edges), cupy.float64)
        return y / db / y.sum(), bin_edges
    return y, bin_edges
def histogramdd(sample, bins=10, range=None, weights=None, density=False):
    """Compute the multidimensional histogram of some data.

    Args:
        sample (cupy.ndarray): The data to be histogrammed. (N, D) or (D, N)
            array

            Note the unusual interpretation of sample when an array_like:

            * When an array, each row is a coordinate in a D-dimensional
              space - such as ``histogramdd(cupy.array([p1, p2, p3]))``.
            * When an array_like, each element is the list of values for single
              coordinate - such as ``histogramdd((X, Y, Z))``.

            The first form should be preferred.
        bins (int or tuple of int or cupy.ndarray): The bin specification:

            * A sequence of arrays describing the monotonically increasing bin
              edges along each dimension.
            * The number of bins for each dimension (nx, ny, ... =bins)
            * The number of bins for all dimensions (nx=ny=...=bins).
        range (sequence, optional): A sequence of length D, each an optional
            (lower, upper) tuple giving the outer bin edges to be used if the
            edges are not given explicitly in `bins`. An entry of None in the
            sequence results in the minimum and maximum values being used for
            the corresponding dimension. The default, None, is equivalent to
            passing a tuple of D None values.
        weights (cupy.ndarray): An array of values `w_i` weighing each sample
            `(x_i, y_i, z_i, ...)`. The values of the returned histogram are
            equal to the sum of the weights belonging to the samples falling
            into each bin.
        density (bool, optional): If False, the default, returns the number of
            samples in each bin. If True, returns the probability *density*
            function at the bin, ``bin_count / sample_count / bin_volume``.

    Returns:
        tuple:
        H (cupy.ndarray):
            The multidimensional histogram of sample x. See
            density and weights for the different possible semantics.
        edges (list of cupy.ndarray):
            A list of D arrays describing the bin
            edges for each dimension.

    .. warning::

        This function may synchronize the device.

    .. seealso:: :func:`numpy.histogramdd`
    """
    # Bring the input into (N, D) layout.
    if not isinstance(sample, cupy.ndarray):
        sample = cupy.stack(sample, axis=-1)
    elif sample.ndim == 1:
        sample = sample[:, cupy.newaxis]
    _, ndim = sample.shape

    cells = numpy.empty(ndim, int)
    edges = [None] * ndim
    widths = [None] * ndim
    if weights is not None:
        weights = cupy.asarray(weights)

    try:
        n_specs = len(bins)
    except TypeError:
        # bins is an integer
        bins = [bins] * ndim
    else:
        if n_specs != ndim:
            raise ValueError(
                'The dimension of bins must be equal to the dimension of the '
                ' sample x.'
            )

    # normalize the range argument
    if range is None:
        range = (None,) * ndim
    elif len(range) != ndim:
        raise ValueError('range argument must have one entry per dimension')

    # Create edge arrays
    for axis in _range(ndim):
        spec = bins[axis]
        spec_ndim = cupy.ndim(spec)
        if spec_ndim == 0:
            if spec < 1:
                raise ValueError(
                    '`bins[{}]` must be positive, when an integer'.format(axis)
                )
            lower, upper = _get_outer_edges(sample[:, axis], range[axis])
            n_edges = int(spec + 1)  # synchronize!
            edges[axis] = cupy.linspace(lower, upper, n_edges)
        elif spec_ndim == 1:
            if not isinstance(spec, cupy.ndarray):
                raise ValueError('array-like bins not supported')
            edges[axis] = spec
            if (spec[:-1] > spec[1:]).any():  # synchronize!
                raise ValueError(
                    '`bins[{}]` must be monotonically increasing, when an '
                    'array'.format(axis)
                )
        else:
            raise ValueError(
                '`bins[{}]` must be a scalar or 1d array'.format(axis)
            )

        cells[axis] = len(edges[axis]) + 1  # includes an outlier on each end
        widths[axis] = cupy.diff(edges[axis])

    # Compute the bin number each sample falls into.
    bin_index = []
    for axis in _range(ndim):
        column = sample[:, axis]
        # avoid cupy.digitize to work around NumPy issue gh-11022
        index = cupy.searchsorted(edges[axis], column, side='right')
        # Values on an edge land in the bin to its right.  For the rightmost
        # edge that would make them outliers, so shift those one bin left.
        index[column == edges[axis][-1]] -= 1
        bin_index.append(index)
    bin_index = tuple(bin_index)

    # Compute the sample indices in the flattened histogram matrix.
    # This raises an error if the array is too large.
    flat_index = cupy.ravel_multi_index(bin_index, cells)

    # Count the repetitions of every flat index, then shape the counts into
    # a proper matrix.
    hist = cupy.bincount(flat_index, weights, minlength=numpy.prod(cells))
    hist = hist.reshape(cells)

    # This preserves the (bad) behavior observed in NumPy gh-7845, for now.
    hist = hist.astype(float)  # Note: NumPy uses casting='safe' here too

    # Remove outliers (indices 0 and -1 for each dimension).
    interior = (slice(1, -1),) * ndim
    hist = hist[interior]

    if density:
        # calculate the probability density function
        total = hist.sum()
        for axis in _range(ndim):
            shape = [1] * ndim
            shape[axis] = cells[axis] - 2
            hist = hist / widths[axis].reshape(shape)
        hist /= total

    if any(hist.shape != numpy.asarray(cells) - 2):
        raise RuntimeError('Internal Shape Error')
    return hist, edges
def histogram2d(x, y, bins=10, range=None, weights=None, density=None):
    """Compute the bi-dimensional histogram of two data samples.

    Args:
        x (cupy.ndarray): The first array of samples to be histogrammed.
        y (cupy.ndarray): The second array of samples to be histogrammed.
        bins (int or tuple of int or cupy.ndarray): The bin specification:

            * A sequence of arrays describing the monotonically increasing bin
              edges along each dimension.
            * The number of bins for each dimension (nx, ny)
            * The number of bins for all dimensions (nx=ny=bins).
        range (sequence, optional): A sequence of length two, each an optional
            (lower, upper) tuple giving the outer bin edges to be used if the
            edges are not given explicitly in `bins`. An entry of None in the
            sequence results in the minimum and maximum values being used for
            the corresponding dimension. The default, None, is equivalent to
            passing a tuple of two None values.
        weights (cupy.ndarray): An array of values `w_i` weighing each sample
            `(x_i, y_i)`. The values of the returned histogram are equal to the
            sum of the weights belonging to the samples falling into each bin.
        density (bool, optional): If False, the default, returns the number of
            samples in each bin. If True, returns the probability *density*
            function at the bin, ``bin_count / sample_count / bin_volume``.

    Returns:
        tuple:
        H (cupy.ndarray):
            The bi-dimensional histogram of samples x and y. See
            density and weights for the different possible semantics.
        edges0 (cupy.ndarray):
            The bin edges along the first dimension.
        edges1 (cupy.ndarray):
            The bin edges along the second dimension.

    .. warning::

        This function may synchronize the device.

    .. seealso:: :func:`numpy.histogram2d`
    """
    try:
        n_specs = len(bins)
    except TypeError:
        # Objects without a length count as a single specification.
        n_specs = 1

    if n_specs not in (1, 2):
        # Anything longer is taken as one edge array shared by both axes.
        if not isinstance(bins, cupy.ndarray):
            raise ValueError('array-like bins not supported in CuPy')
        bins = [bins, bins]

    hist, edges = histogramdd([x, y], bins, range, weights, density)
    return hist, edges[0], edges[1]
# Unweighted bincount kernel: each element ``x`` atomically adds one to
# ``bin[x]``.  ``bin`` is declared ``raw`` because it is indexed by the
# element's value; the kernel body performs no bounds check of its own.
_bincount_kernel = _core.ElementwiseKernel(
    'S x', 'raw U bin',
    'atomicAdd(&bin[x], U(1))',
    'cupy_bincount_kernel')
# Weighted bincount kernel: each element ``x`` atomically adds its paired
# weight ``w`` to ``bin[x]``.  ``bin`` is declared ``raw`` because it is
# indexed by the element's value; the kernel body performs no bounds check
# of its own.
_bincount_with_weight_kernel = _core.ElementwiseKernel(
    'S x, T w', 'raw U bin',
    'atomicAdd(&bin[x], w)',
    'cupy_bincount_with_weight_kernel')
def bincount(x, weights=None, minlength=None):
    """Count number of occurrences of each value in array of non-negative ints.

    Args:
        x (cupy.ndarray): Input array.
        weights (cupy.ndarray): Weights array which has the same shape as
            ``x``.
        minlength (int): A minimum number of bins for the output array.

    Returns:
        cupy.ndarray: The result of binning the input array. The length of
        output is equal to ``max(cupy.max(x) + 1, minlength)``. For an empty
        ``x`` the output consists of ``minlength`` zeros (no elements when
        ``minlength`` is not given).

    Raises:
        ValueError: If ``x`` is not one-dimensional, contains a negative
            value, differs in shape from ``weights``, or if ``minlength`` is
            negative.
        TypeError: If ``x`` has a floating-point dtype.

    .. warning::

        This function may synchronize the device.

    .. seealso:: :func:`numpy.bincount`
    """
    if x.ndim > 1:
        raise ValueError('object too deep for desired array')
    if x.ndim < 1:
        raise ValueError('object of too small depth for desired array')
    if x.dtype.kind == 'f':
        raise TypeError('x must be int array')
    if (x < 0).any():  # synchronize!
        raise ValueError('The first argument of bincount must be non-negative')
    if weights is not None and x.shape != weights.shape:
        raise ValueError('The weights and list don\'t have the same length.')
    if minlength is not None:
        minlength = int(minlength)
        if minlength < 0:
            raise ValueError('minlength must be non-negative')

    # Weighted counts are accumulated in floating point.
    out_dtype = numpy.intp if weights is None else numpy.float64

    if x.size == 0:
        # There is no maximum to take from an empty input, and nothing to
        # count: the answer is ``minlength`` zeros.
        return cupy.zeros((minlength or 0,), dtype=out_dtype)

    size = int(cupy.max(x)) + 1  # synchronize!
    if minlength is not None:
        size = max(size, minlength)

    b = cupy.zeros((size,), dtype=out_dtype)
    if weights is None:
        for accelerator in _accelerator.get_routine_accelerators():
            # CUB uses int for bin counts
            # TODO(leofang): support >= 2^31 elements in x?
            if (not runtime.is_hip
                    and accelerator == _accelerator.ACCELERATOR_CUB
                    and x.size <= 0x7fffffff and size <= 0x7fffffff):
                b = cub.device_histogram(x, b, size+1)
                break
        else:
            # No accelerator took the job: use the generic kernel.
            _bincount_kernel(x, b)
    else:
        _bincount_with_weight_kernel(x, weights, b)
    return b
def digitize(x, bins, right=False):
    """Finds the indices of the bins to which each value in input array belongs.

    .. note::

        In order to avoid device synchronization, digitize does not raise
        an exception when the array is not monotonic

    Args:
        x (cupy.ndarray): Input array.
        bins (cupy.ndarray): Array of bins.
            It has to be 1-dimensional and monotonic increasing or decreasing.
        right (bool):
            Indicates whether the intervals include the right or the left bin
            edge.

    Returns:
        cupy.ndarray: Output array of indices, of same shape as ``x``.

    Raises:
        TypeError: If ``x`` is complex.
        ValueError: If ``bins`` is not one-dimensional.

    .. seealso:: :func:`numpy.digitize`
    """
    # Rejected only for NumPy compatibility; the search itself works fine.
    if x.dtype.kind == 'c':
        raise TypeError('x may not be complex')

    bins_ndim = bins.ndim
    if bins_ndim != 1:
        if bins_ndim > 1:
            raise ValueError('object too deep for desired array')
        raise ValueError('object of too small depth for desired array')

    # The search is called with (bins, x), i.e. with the arguments in the
    # opposite order, so the side has to be flipped as well.
    side = 'right'
    if right:
        side = 'left'
    return cupy._sorting.search._searchsorted(bins, x, side, None, False)