DepthWiseConvolution3D for OpenCL backend.
PiperOrigin-RevId: 288688185
Change-Id: I5a305d25ed1ff7ba7dc7dcb36d31762bdc7ae93f
tensorflower-gardener committed Jan 8, 2020
1 parent f947e4d commit b40e538
Showing 3 changed files with 530 additions and 0 deletions.
23 changes: 23 additions & 0 deletions tensorflow/lite/delegates/gpu/cl/kernels/BUILD
@@ -546,6 +546,29 @@ cc_library(
],
)

cc_library(
name = "depth_wise_conv_3d",
srcs = ["depth_wise_conv_3d.cc"],
hdrs = ["depth_wise_conv_3d.h"],
deps = [
":gpu_operation",
":util",
":work_group_picking",
"//tensorflow/lite/delegates/gpu/cl:buffer",
"//tensorflow/lite/delegates/gpu/cl:cl_device",
"//tensorflow/lite/delegates/gpu/cl:linear_storage",
"//tensorflow/lite/delegates/gpu/cl:tensor",
"//tensorflow/lite/delegates/gpu/cl:texture2d",
"//tensorflow/lite/delegates/gpu/cl:util",
"//tensorflow/lite/delegates/gpu/common:data_type",
"//tensorflow/lite/delegates/gpu/common:operations",
"//tensorflow/lite/delegates/gpu/common:shape",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/common:tensor",
"//tensorflow/lite/delegates/gpu/common:types",
],
)

cc_test(
name = "depth_wise_conv_test",
srcs = ["depth_wise_conv_test.cc"],
337 changes: 337 additions & 0 deletions tensorflow/lite/delegates/gpu/cl/kernels/depth_wise_conv_3d.cc
@@ -0,0 +1,337 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/cl/kernels/depth_wise_conv_3d.h"

#include <string>
#include <utility>
#include <vector>

#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"

namespace tflite {
namespace gpu {
namespace cl {
namespace {

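// Channel multipliers of 1, 2 and 4 get unrolled read code in GetSrcValue();
// any other multiplier falls back to the generic gather path.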
bool IsSpecializedCase(int channel_multiplier) {
return channel_multiplier == 1 || channel_multiplier == 2 ||
channel_multiplier == 4;
}

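// Emits OpenCL source that loads the input FLT4 feeding destination slice S.
// With multiplier 1 the read is direct; with 2 or 4 the source slice is
// S / multiplier and the loaded components are replicated into src_final.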
std::string GetSrcValue(const TensorCodeGenerator& src_tensor,
int channel_multiplier,
TextureAddressMode address_mode) {
std::string c;
if (channel_multiplier == 1) {
c += " FLT4 src_final =" +
src_tensor.ReadWHDS("x_c", "y_c", "z_c", "S", address_mode) + ";\n";
} else if (channel_multiplier == 2) {
c += " int z_layer = S / 2;\n";
c += " FLT4 src =" +
src_tensor.ReadWHDS("x_c", "y_c", "z_c", "z_layer", address_mode) +
";\n";
c += " FLT2 t0 = S % 2 == 0 ? src.xy : src.zw;\n";
c += " FLT4 src_final = (FLT4)(t0.x, t0.x, t0.y, t0.y);\n";
} else if (channel_multiplier == 4) {
c += " int z_layer = S / 4;\n";
c += " FLT4 src =" +
src_tensor.ReadWHDS("x_c", "y_c", "z_c", "z_layer", address_mode) +
";\n";
c += " FLT t0 = src.x;\n";
c += " int remainder = S % 4;\n";
c += " if (remainder == 1) t0 = src.y;\n";
c += " if (remainder == 2) t0 = src.z;\n";
c += " if (remainder == 3) t0 = src.w;\n";
c += " FLT4 src_final = (FLT4)(t0, t0, t0, t0);\n";
} else {
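// Generic multiplier: destination channel d = S * 4 + i reads source channel
// d / channel_multiplier; z_offset = (S % channel_multiplier) * 4 turns that
// into the per-component temp_arr index (z_offset + i) / channel_multiplier.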
c += " int z_layer = S / channel_multiplier;\n";
c += " FLT4 src =" +
src_tensor.ReadWHDS("x_c", "y_c", "z_c", "z_layer", address_mode) +
";\n";
c += " int z_offset = (S % channel_multiplier) * 4;\n";
c += " FLT4 src_final;\n";
c += " FLT temp_arr[4] = {src.x, src.y, src.z, src.w};\n";
c += " src_final.x = temp_arr[(z_offset + 0) / "
"channel_multiplier];\n";
c += " src_final.y = temp_arr[(z_offset + 1) / "
"channel_multiplier];\n";
c += " src_final.z = temp_arr[(z_offset + 2) / "
"channel_multiplier];\n";
c += " src_final.w = temp_arr[(z_offset + 3) / "
"channel_multiplier];\n";
}

return c;
}

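// Assembles the full OpenCL kernel: each work item computes one output
// (X, Y, Z, S) position, looping over the 3D filter window. Buffer-backed
// sources are clamped manually; texture-backed sources rely on the sampler.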
std::string GenerateDepthWiseConvolution3DCode(
const OperationDef& op_def, bool stride_correction,
const LinearStorage& biases, int channel_multiplier,
bool weights_are_buffer,
const std::vector<ElementwiseOperation*>& linked_operations,
const CLDevice& device) {
TensorCodeGenerator src_tensor(
"src_data",
WHDSPoint{"src_size.x", "src_size.y", "src_size.z", "src_size.w"},
op_def.src_tensors[0]);
TensorCodeGenerator dst_tensor(
"dst_data",
WHDSPoint{"dst_size.x", "dst_size.y", "dst_size.z", "dst_size.w"},
op_def.dst_tensors[0]);
const auto src_tensor_type = op_def.src_tensors[0].storage_type;

std::string c = GetCommonDefines(op_def.precision);

const bool manual_clamp = src_tensor_type == TensorStorageType::BUFFER ||
src_tensor_type == TensorStorageType::IMAGE_BUFFER;

c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
if (weights_are_buffer) {
c += " __global FLT4* filters, \n";
} else {
c += " __read_only image2d_t filters, \n";
}
c += biases.GetDeclaration();
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int4 kernel_size, \n";
c += " int4 stride, \n";
c += " int4 padding, \n";
c += " int4 dilation, \n";
if (!IsSpecializedCase(channel_multiplier)) {
c += " int channel_multiplier, \n";
}
if (op_def.batch_support) {
c += " int batch_size, \n";
}
c += " int4 src_size, \n";
c += " int4 dst_size \n";
c += ") {\n";
c += " int X = get_global_id(0);\n";
c += " int Y = get_global_id(1);\n";
c += " int linear_id_z = get_global_id(2);\n";
c += " int S = linear_id_z % dst_size.w;\n";
c += " int Z = linear_id_z / dst_size.w;\n";
c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;\n";
c += " ACCUM_FLT4 r = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
if (stride_correction) {
c += " int x_offseted = " +
GetXStrideCorrected("X", "batch_size", "stride.x", "padding.x") +
";\n";
} else {
c += " int x_offseted = X * stride.x + padding.x;\n";
}
c += " int y_offseted = Y * stride.y + padding.y;\n";
c += " int z_offseted = Z * stride.z + padding.z;\n";
if (weights_are_buffer) {
c += " int fx_c = S * kernel_size.x * kernel_size.y * kernel_size.z;\n";
} else {
c += " int fx_c = 0;\n";
}

if (manual_clamp) {
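// BUFFER and IMAGE_BUFFER reads are not clamped by the sampler, so
// out-of-bounds taps are skipped explicitly. fx_c still advances on every
// tap to keep the filter index in step with the window position.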
c += " for (int kz = 0; kz < kernel_size.z; ++kz) {\n";
c += " int z_c = z_offseted + kz * dilation.z;\n";
c += " bool outside_z = z_c < 0 || z_c >= src_size.z;\n";
c += " for (int ky = 0; ky < kernel_size.y; ++ky) {\n";
c += " int y_c = y_offseted + ky * dilation.y;\n";
c += " bool outside_y = y_c < 0 || y_c >= src_size.y;\n";
c += " for (int kx = 0; kx < kernel_size.x; ++kx) {\n";
c += " int x_c = x_offseted + kx * dilation.x;\n";
c += " bool outside_x = x_c < 0 || x_c >= src_size.x;\n";
c += " if (!outside_x && !outside_y && !outside_z) {\n";
if (weights_are_buffer) {
c += " FLT4 f = filters[fx_c];\n";
} else {
c += " FLT4 f = READ_IMAGE(filters, smp_none, (int2)(fx_c, "
"S));\n";
}
c += GetSrcValue(src_tensor, channel_multiplier,
TextureAddressMode::DONT_CARE);
c += " r += TO_ACCUM_TYPE(src_final * f);\n";
c += " };\n";
c += " fx_c++;\n";
c += " }\n";
c += " }\n";
c += " }\n";
} else { // Texture types with ZERO clamping
c += " for (int kz = 0; kz < kernel_size.z; ++kz) {\n";
c += " int z_c = z_offseted + kz * dilation.z;\n";
if (src_tensor_type != TensorStorageType::TEXTURE_3D) {
// Only TEXTURE_3D supports clamping in the DEPTH dimension.
c += " if (z_c < 0 || z_c >= src_size.z) {\n";
c += " fx_c += kernel_size.y * kernel_size.x;\n";
c += " continue;\n";
c += " }\n";
}
c += " for (int ky = 0; ky < kernel_size.y; ++ky) {\n";
c += " int y_c = y_offseted + ky * dilation.y;\n";
c += " for (int kx = 0; kx < kernel_size.x; ++kx) {\n";
c += " int x_c = x_offseted + kx * dilation.x;\n";
const auto access_mode = GetFastestZeroMode(device);
c += GetSrcValue(src_tensor, channel_multiplier, access_mode);
if (weights_are_buffer) {
c += " FLT4 f = filters[fx_c];\n";
} else {
c += " FLT4 f = READ_IMAGE(filters, smp_none, (int2)(fx_c, S));\n";
}
c += " fx_c++;\n";
c += " r += TO_ACCUM_TYPE(src_final * f);\n";
c += " }\n";
c += " }\n";
c += " }\n";
}
c += " FLT4 bias_val = " + biases.ReadLinearFLT4("S") + ";\n";
c += " FLT4 res0 = TO_FLT4(r) + bias_val;\n";
const LinkingContext context{"res0", "X", "Y", "S"};
c += PostProcess(linked_operations, context);
c += " " + dst_tensor.WriteWHDS("res0", "X", "Y", "Z", "S") + "\n";
c += "}\n";
return c;
}
} // namespace

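// Filter weights live in a linear buffer on Mali and in a 2D texture on
// other GPUs; see UploadWeights() and BindArguments().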
DepthWiseConvolution3D::DepthWiseConvolution3D(
const OperationDef& definition,
const DepthwiseConvolution3DAttributes& attr, const CLDevice& device)
: GPUOperation(definition),
weights_are_buffer_(device.IsMali()),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h,
attr.weights.shape.d),
stride_(attr.strides.w, attr.strides.h, attr.strides.d),
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h,
-attr.padding.prepended.d),
dilation_(attr.dilations.w, attr.dilations.h, attr.dilations.d),
channel_multiplier_(attr.weights.shape.o),
work_group_size_(8, 8, 1) {}

DepthWiseConvolution3D::DepthWiseConvolution3D(
DepthWiseConvolution3D&& operation)
: GPUOperation(std::move(operation)),
weights_tex2d_(std::move(operation.weights_tex2d_)),
weights_buf_(std::move(operation.weights_buf_)),
weights_are_buffer_(operation.weights_are_buffer_),
biases_(std::move(operation.biases_)),
kernel_size_(operation.kernel_size_),
stride_(operation.stride_),
padding_(operation.padding_),
dilation_(operation.dilation_),
channel_multiplier_(operation.channel_multiplier_),
kernel_(std::move(operation.kernel_)),
work_group_size_(operation.work_group_size_) {}

DepthWiseConvolution3D& DepthWiseConvolution3D::operator=(
DepthWiseConvolution3D&& operation) {
if (this != &operation) {
weights_tex2d_ = std::move(operation.weights_tex2d_);
weights_buf_ = std::move(operation.weights_buf_);
std::swap(weights_are_buffer_, operation.weights_are_buffer_);
biases_ = std::move(operation.biases_);
std::swap(kernel_size_, operation.kernel_size_);
std::swap(stride_, operation.stride_);
std::swap(padding_, operation.padding_);
std::swap(dilation_, operation.dilation_);
std::swap(channel_multiplier_, operation.channel_multiplier_);
kernel_ = std::move(operation.kernel_);
std::swap(work_group_size_, operation.work_group_size_);
GPUOperation::operator=(std::move(operation));
}
return *this;
}

Status DepthWiseConvolution3D::Compile(
const CreationContext& creation_context) {
const bool stride_correction = definition_.batch_support && stride_.x != 1;
const auto code = GenerateDepthWiseConvolution3DCode(
definition_, stride_correction, biases_, channel_multiplier_,
weights_are_buffer_, linked_operations_, *creation_context.device);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}

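// Arguments are bound positionally, so the order below must mirror the
// declaration order emitted in GenerateDepthWiseConvolution3DCode().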
Status DepthWiseConvolution3D::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
if (weights_are_buffer_) {
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_buf_.GetMemoryPtr()));
} else {
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_tex2d_.GetMemoryPtr()));
}
RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(
int4(kernel_size_.x, kernel_size_.y, kernel_size_.z, 1)));
RETURN_IF_ERROR(
kernel_.SetBytesAuto(int4(stride_.x, stride_.y, stride_.z, 1)));
RETURN_IF_ERROR(kernel_.SetBytesAuto(
int4(padding_.x * src_[0]->Batch(), padding_.y, padding_.z, 1)));
RETURN_IF_ERROR(kernel_.SetBytesAuto(
int4(dilation_.x * src_[0]->Batch(), dilation_.y, dilation_.z, 1)));
if (!IsSpecializedCase(channel_multiplier_)) {
RETURN_IF_ERROR(kernel_.SetBytesAuto(int32_t(channel_multiplier_)));
}
if (definition_.batch_support) {
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->Batch()));
}
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHDS()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHDS()));
return OkStatus();
}

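// The Z grid axis packs slices and depth together; the kernel unpacks it as
// S = linear_id_z % dst_size.w and Z = linear_id_z / dst_size.w.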
int3 DepthWiseConvolution3D::GetGridSize() const {
const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
const int grid_y = dst_[0]->Height();
const int grid_z = dst_[0]->Slices() * dst_[0]->Depth();
return int3(grid_x, grid_y, grid_z);
}

Status DepthWiseConvolution3D::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}

Status DepthWiseConvolution3D::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}

Status CreateDepthWiseConvolution3D(
const CreationContext& creation_context, const OperationDef& definition,
const DepthwiseConvolution3DAttributes& attr,
DepthWiseConvolution3D* result) {
*result = DepthWiseConvolution3D(definition, attr, *creation_context.device);
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
LinearStorageCreateInfo create_info;
create_info.storage_type =
DeduceLinearStorageType(definition.GetPrimaryStorageType());
create_info.data_type = definition.GetDataType();
create_info.name = "biases";
create_info.aligned_size = attr.weights.shape.o * attr.weights.shape.i;
RETURN_IF_ERROR(CreateLinearStorage(
create_info, attr.bias, creation_context.context, &result->biases_));
return OkStatus();
}

} // namespace cl
} // namespace gpu
} // namespace tflite
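
Below is a minimal usage sketch (not part of this commit) of how the new operation could be wired up. It assumes a populated CreationContext creation_context, an OperationDef def with one source and one destination tensor, DepthwiseConvolution3DAttributes attr taken from the model, device-resident tensors src and dst, and a CLCommandQueue* queue; SetSrc/SetDst are assumed to be the generic GPUOperation setters.

DepthWiseConvolution3D op;
RETURN_IF_ERROR(CreateDepthWiseConvolution3D(creation_context, def, attr, &op));
RETURN_IF_ERROR(op.Compile(creation_context));  // builds and caches the CL kernel
op.SetSrc(src);  // Tensor* already allocated on the device
op.SetDst(dst);
RETURN_IF_ERROR(op.AddToQueue(queue));  // binds arguments and dispatches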
