Add transform parameter to convolution cpu (NVIDIA#3535)
* Add transform parameter to convolution cpu

Signed-off-by: Kamil Tokarski <ktokarski@nvidia.com>
stiepan authored and cyyever committed Jan 23, 2022
1 parent f68f86a commit fc75c9a
Showing 4 changed files with 282 additions and 88 deletions.
115 changes: 92 additions & 23 deletions dali/kernels/imgproc/convolution/convolution_cpu.h
@@ -25,6 +25,70 @@

namespace dali {
namespace kernels {

namespace conv_transform {

/** @defgroup ConvolutionTransforms Convolution postprocessing transformations
* Transforms enable postprocessing of values computed by 1D convolution before
 * they are stored in the output. This may be just conversion and scaling,
 * but it can also be used to accumulate results in the output buffer.
* @{
*/

/**
 * @brief Scales the convolution value, converts it to the output type with
 * saturation, and stores it in the output tensor.
 */
template <typename Out, typename W>
struct TransScaleSat {
TransScaleSat(float scale = 1.f) : scale{scale} {} // NOLINT

void operator()(Out *out_ptr, int64_t offset, W val) const {
out_ptr[offset] = ConvertSat<Out>(val * scale);
}

float scale;
};

/**
 * @brief Scales the convolution value, adds it to the value already present in the
 * output tensor, and stores the sum converted to the output type with saturation.
 */
template <typename Out, typename W>
struct TransScaleAddOutSat {
TransScaleAddOutSat(float scale = 1.f) : scale{scale} {} // NOLINT

void operator()(Out *out_ptr, int64_t offset, W val) const {
Out &out = out_ptr[offset];
out = ConvertSat<Out>(val * scale + out);
}

float scale;
};

/**
 * @brief Stores the convolution values in the output tensor with pointwise addition of
 * the values from `acc_ptr`. Performs scaling and converts to the output type with saturation.
 * The `acc_ptr` and `out_ptr` should not overlap; if `acc_ptr` is the same as `out_ptr`,
 * use `TransScaleAddOutSat` instead.
 */
template <typename Intermediate, typename Out, typename W>
struct TransScaleAddBufferSat {
TransScaleAddBufferSat(Intermediate* acc_ptr, float scale = 1.f) // NOLINT
: acc_ptr{acc_ptr}, scale{scale} {}

void operator()(Out *out_ptr, int64_t offset, W val) const {
out_ptr[offset] = ConvertSat<Out>(val * scale + acc_ptr[offset]);
}

Intermediate* acc_ptr;
float scale;
};

/** @} */ // end of ConvolutionTransforms

} // namespace conv_transform
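
For illustration, here is a minimal sketch (not part of this commit) of how these functors apply to already-computed accumulator values; the helper name, buffers, and scale are hypothetical, and it assumes this header is on the include path:

#include <cstdint>

#include "dali/kernels/imgproc/convolution/convolution_cpu.h"

// Hypothetical helper: store float accumulators into a uint8_t output,
// scaled by 0.25 and saturated, which is exactly what TransScaleSat encapsulates.
void StoreScaled(uint8_t *out, const float *acc, int64_t n) {
  dali::kernels::conv_transform::TransScaleSat<uint8_t, float> store(0.25f);
  for (int64_t i = 0; i < n; i++) {
    store(out, i, acc[i]);  // out[i] = ConvertSat<uint8_t>(acc[i] * 0.25f)
  }
}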

/**
* @brief Cyclic buffer used for storing input window for convolution.
*
@@ -104,12 +168,13 @@ class CyclicWindowWrapper {
}
}

template <typename U, typename W>
void CalculateDot(U* __restrict__ output, const W* __restrict__ window, float scale) const {
template <typename U, typename W, typename Transform = conv_transform::TransScaleSat<U, W>>
void CalculateDot(U* __restrict__ output, int64_t offset, const W* __restrict__ window,
const Transform& transform) const {
std::array<W, max_lanes> tmp;
CalculateDot(tmp.data(), window);
for (int c = 0; c < NumLanes(); c++) {
output[c] = ConvertSat<U>(tmp[c] * scale);
transform(output, c + offset, tmp[c]);
}
}

@@ -166,10 +231,11 @@ void load_pixel_no_border(CyclicWindowWrapper<T, max_lanes>& cww, const T* in_pt
cww.PushBack(in_ptr + in_idx * stride);
}

template <bool has_channels, typename Out, typename In, typename W, int ndim>
template <bool has_channels, typename Out, typename In, typename W, int ndim,
typename T = conv_transform::TransScaleSat<Out, W>>
void ConvolveInnerDim(Out* out, const In* in, const W* window, int window_size,
const TensorShape<ndim>& shape, const TensorShape<ndim>& strides,
float scale) {
const T& transform) {
constexpr int last_dim = has_channels ? ndim - 2 : ndim - 1;
int channels = has_channels ? strides[last_dim] : 1;
int64_t outer_elements = volume(&shape[0], &shape[last_dim]);
@@ -181,7 +247,7 @@ void ConvolveInnerDim(Out* out, const In* in, const W* window, int window_size,
for (int64_t o = 0; o < outer_elements; o++) {
int64_t x0 = -radius;
int64_t xout = 0;
Out* out_axis = &out[o * axis_stride];
int64_t axis_offset = o * axis_stride;
const In* in_axis = &in[o * axis_stride];
// Left border
for (; x0 < 0 && xout < axis_size; x0++, xout++) {
Expand All @@ -191,7 +257,7 @@ void ConvolveInnerDim(Out* out, const In* in, const W* window, int window_size,
int x = boundary::idx_reflect_101(x0 + k, axis_size);
acc += in_axis[x * channels + c] * window[k];
}
out_axis[xout * channels + c] = ConvertSat<Out>(acc * scale);
transform(out, axis_offset + xout * channels + c, acc);
}
}
int64_t flat_x = x0 * channels;
@@ -202,7 +268,7 @@ void ConvolveInnerDim(Out* out, const In* in, const W* window, int window_size,
for (int k = 0; k < window_size; k++) {
acc += in_axis[flat_x + k * channels] * window[k];
}
out_axis[flat_xout] = ConvertSat<Out>(acc * scale);
transform(out, axis_offset + flat_xout, acc);
}
// get back from flat coordinates
x0 = flat_x / channels;
@@ -215,23 +281,22 @@ void ConvolveInnerDim(Out* out, const In* in, const W* window, int window_size,
int x = boundary::idx_reflect_101(x0 + k, axis_size);
acc += in_axis[x * channels + c] * window[k];
}
out_axis[xout * channels + c] = ConvertSat<Out>(acc * scale);
transform(out, axis_offset + xout * channels + c, acc);
}
}
}
}

template <int axis, bool has_channels, int max_lanes, typename Out, typename In, typename W,
int ndim>
int ndim, typename T = conv_transform::TransScaleSat<Out, W>>
void ConvolveInplaceAxisLoop(Out* out, const In* in, const W* window,
const TensorShape<ndim>& shape, const TensorShape<ndim>& strides,
int diameter, int64_t offset, In* input_window_buffer, float scale,
int num_lanes) {
int diameter, int64_t offset, In* input_window_buffer, int num_lanes,
const T& transform) {
auto axis_stride = strides[axis];
auto axis_size = shape[axis];
int radius = (diameter - 1) / 2;
// offset <- start of current axis
auto* out_ptr = out + offset;
auto* in_ptr = in + offset;

CyclicWindowWrapper<In, max_lanes> input_window(input_window_buffer, diameter, num_lanes);
Expand All @@ -251,7 +316,7 @@ void ConvolveInplaceAxisLoop(Out* out, const In* in, const W* window,
for (; out_idx < axis_size && in_idx < axis_size; out_idx++, in_idx++) {
// we load the last element of the input window corresponding to out_idx
load_pixel_no_border(input_window, in_ptr, in_idx, axis_stride);
input_window.CalculateDot(out_ptr + out_idx * axis_stride, window, scale);
input_window.CalculateDot(out, offset + out_idx * axis_stride, window, transform);
// remove one element, to make space for next out_idx and in_idx
input_window.PopFront();
}
@@ -260,16 +325,16 @@ void ConvolveInplaceAxisLoop(Out* out, const In* in, const W* window,
// To process in-place, we need to read the values back from the cyclic window buffer,
// as it may happen that we have already stored the element.
reload_pixel_with_border(input_window, in_ptr, in_idx, out_idx, axis_stride, axis_size, radius);
input_window.CalculateDot(out_ptr + out_idx * axis_stride, window, scale);
input_window.CalculateDot(out, offset + out_idx * axis_stride, window, transform);
input_window.PopFront();
}
}

template <int axis, bool has_channels, int max_lanes, typename Out, typename In, typename W,
int ndim>
int ndim, typename T = conv_transform::TransScaleSat<Out, W>>
void ConvolveInplaceOuterLoop(Out* out, const In* in, const W* window,
const TensorShape<ndim>& shape, const TensorShape<ndim>& strides,
int diameter, In* input_window_buffer, float scale = 1.f) {
int diameter, In* input_window_buffer, const T& transform) {
int64_t outer_elements = volume(&shape[0], &shape[axis]);
int64_t axis_elements = shape[axis];
int64_t inner_elements = volume(&shape[axis + 1], &shape[ndim]);
@@ -288,7 +353,8 @@ void ConvolveInplaceOuterLoop(Out* out, const In* in, const W* window,
int64_t offset = outer_idx * (axis > 0 ? strides[axis - 1] : 0) + inner_idx;
int num_lanes = std::min(inner_elements - inner_idx, strip_size);
ConvolveInplaceAxisLoop<axis, has_channels, max_lanes>(
out, in, window, shape, strides, diameter, offset, input_window_buffer, scale, num_lanes);
out, in, window, shape, strides, diameter, offset, input_window_buffer, num_lanes,
transform);
}
}
}
@@ -304,7 +370,8 @@ void ConvolveInplaceOuterLoop(Out* out, const In* in, const W* window,
*
* The same implementation is used for in-place innermost convolution.
*/
template <typename Out, typename In, typename W, int ndim, int axis, bool has_channels = true>
template <typename Out, typename In, typename W, int ndim, int axis, bool has_channels = true,
typename T = conv_transform::TransScaleSat<Out, W>>
struct ConvolutionCpu {
// This can be balanced between the additional memory required and speed;
// it will request memory for a cyclic helper buffer of kStripSize * window_size.
@@ -323,19 +390,21 @@ struct ConvolutionCpu {

void Run(KernelContext& ctx, const TensorView<StorageCPU, Out, ndim> &out,
const TensorView<StorageCPU, const In, ndim>& in,
const TensorView<StorageCPU, const W, 1>& window, float scale = 1) {
const TensorView<StorageCPU, const W, 1>& window,
const T &transform = {}) {
auto diameter = window.num_elements();
int input_window_buf_size = GetInputWindowBufSize(in.shape, diameter);
auto* input_window_buffer = ctx.scratchpad->AllocateHost<In>(input_window_buf_size);
auto strides = GetStrides(in.shape);

if (axis == ndim - has_channels - 1 &&
static_cast<const void*>(out.data) != static_cast<const void*>(in.data)) {
ConvolveInnerDim<has_channels>(out.data, in.data, window.data, diameter, in.shape, strides,
scale);
ConvolveInnerDim<has_channels>(
out.data, in.data, window.data, diameter, in.shape, strides, transform);
} else {
ConvolveInplaceOuterLoop<axis, has_channels, kStripSize, Out, In, W, ndim>(
out.data, in.data, window.data, in.shape, strides, diameter, input_window_buffer, scale);
out.data, in.data, window.data, in.shape, strides, diameter, input_window_buffer,
transform);
}
}


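As a usage note, here is a minimal sketch of how a caller might select the accumulating transform through the new template and run-time parameters. The kernel setup and the tensor views (ctx, out_view, in_view, window_view) are elided and assumed to follow the usual DALI CPU-kernel flow, so treat this as illustrative rather than as part of the patch:

// Illustrative only: pick TransScaleAddOutSat so that consecutive Run() calls
// accumulate (with scaling and saturation) into the same output buffer.
using namespace dali::kernels;
using Trans = conv_transform::TransScaleAddOutSat<float, float>;
using Conv = ConvolutionCpu<float, float, float, /*ndim=*/2, /*axis=*/0,
                            /*has_channels=*/false, Trans>;
Conv conv;
// ... Setup() and scratchpad allocation as for any other DALI CPU kernel ...
conv.Run(ctx, out_view, in_view, window_view, Trans{0.5f});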