Skip to content

Commit

Permalink
set launch bounds and tune block size
Browse files: browse the repository at this point in the history
  • Loading branch information
grlee77 committed Oct 16, 2020
1 parent 5b126c7 commit 3665dcf
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 9 deletions.
7 changes: 5 additions & 2 deletions cupyx/scipy/ndimage/_spline_prefilter_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ def _get_spline1d_code(mode, poles, pole_type):

_batch_spline1d_strided_template = """
extern "C" __global__
__launch_bounds__({block_size})
void cupyx_spline_filter(T* __restrict__ y, const idx_t* __restrict__ info) {{
const idx_t n_signals = info[0], n_samples = info[1],
* __restrict__ shape = info+2;
Expand All @@ -215,7 +216,8 @@ def _get_spline1d_code(mode, poles, pole_type):

@cupy.memoize(for_each_device=True)
def get_raw_spline1d_kernel(axis, ndim, mode, order, index_type="int",
data_type="double", pole_type="double"):
data_type="double", pole_type="double",
block_size=128):
"""Generate a kernel for applying a spline prefilter along a given axis."""
poles = get_poles(order)

Expand All @@ -227,5 +229,6 @@ def get_raw_spline1d_kernel(axis, ndim, mode, order, index_type="int",
code += _get_spline1d_code(mode, poles, pole_type)

# generate code handling batch operation of the 1d filter
code += _batch_spline1d_strided_template.format(ndim=ndim, axis=axis)
code += _batch_spline1d_strided_template.format(ndim=ndim, axis=axis,
block_size=block_size)
return cupy.RawKernel(code, "cupyx_spline_filter")
17 changes: 10 additions & 7 deletions cupyx/scipy/ndimage/interpolation.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,24 +127,27 @@ def spline_filter1d(
index_type = _util._get_inttype(input)
index_dtype = cupy.int32 if index_type == 'int' else cupy.int64

kern =_spline_prefilter_core.get_raw_spline1d_kernel(
n_samples = x.shape[axis]
n_signals = x.size // n_samples
info = cupy.array((n_signals, n_samples) + x.shape, dtype=index_dtype)

# Empirically chosen block size that seemed to work well: the smallest
# power of two >= n_samples / 32, but never fewer than 8 threads.
block_size = max(2 ** math.ceil(numpy.log2(n_samples / 32)), 8)
kern = _spline_prefilter_core.get_raw_spline1d_kernel(
axis,
ndim,
mode,
order=order,
index_type=index_type,
data_type=data_type,
pole_type=pole_type,
block_size=block_size,
)

n_samples = x.shape[axis]
n_signals = x.size // n_samples
info = cupy.array((n_signals, n_samples) + x.shape, dtype=index_dtype)

# Because the filter is recursive, each line of data must be processed
# by a single thread; n_signals lines are processed in total.
block = 128,
grid = (n_signals + block[0] - 1) // block[0],
block = (block_size,)
grid = ((n_signals + block[0] - 1) // block[0],)

# apply prefilter gain
poles = _spline_prefilter_core.get_poles(order=order)
Expand Down

0 comments on commit 3665dcf

Please sign in to comment.