Add files via upload

fqnchina · Nov 7, 2018 · 154a024 · 154a024
1 parent e29aa6c
commit 154a024
Show file tree

Hide file tree

Showing 17 changed files with 18,161 additions and 0 deletions.
diff --git a/compilation/EdgeComputation.cu b/compilation/EdgeComputation.cu
@@ -0,0 +1,62 @@
+#include "THCUNN.h"
+#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+
+#include "EdgeComputation.h"
+
+void THNN_CudaEdgeComputation_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output) {
+
+	long batchSize = input->size[0];
+	long plane = input->size[1];
+	long height = input->size[2];
+	long width = input->size[3];
+
+	THCudaTensor *input_n = THCudaTensor_new(state);
+	THCudaTensor *output_n = THCudaTensor_new(state);
+
+	// For each elt in batch, do:
+	for (int elt = 0; elt < batchSize; elt ++) {
+		// Matrix mulitply per output:
+		THCudaTensor_select(state, input_n, input, 0, elt);
+		THCudaTensor_select(state, output_n, output, 0, elt);
+
+		EdgeComputation(THCState_getCurrentStream(state),
+				THCudaTensor_data(state, input_n),
+				THCudaTensor_data(state, output_n),
+				height, width);
+	}
+
+	THCudaTensor_free(state, input_n);
+	THCudaTensor_free(state, output_n);
+}
+
+void THNN_CudaEdgeComputation_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput) {
+
+	long batchSize = input->size[0];
+	long plane = input->size[1];
+	long height = input->size[2];
+	long width = input->size[3];
+
+	THCudaTensor *input_n = THCudaTensor_new(state);
+	THCudaTensor *gradOutput_n = THCudaTensor_new(state);
+	THCudaTensor *gradInput_n = THCudaTensor_new(state);
+
+	// For each elt in batch, do:
+	for (int elt = 0; elt < batchSize; elt ++) {
+		// Matrix mulitply per output:
+		THCudaTensor_select(state, input_n, input, 0, elt);
+		THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
+		THCudaTensor_select(state, gradInput_n, gradInput, 0, elt);
+
+		EdgeComputation_backward(THCState_getCurrentStream(state),
+				THCudaTensor_data(state, input_n),
+				THCudaTensor_data(state, gradOutput_n),
+				THCudaTensor_data(state, gradInput_n),
+				height, width);
+	}
+
+	THCudaTensor_free(state, input_n);
+	THCudaTensor_free(state, gradOutput_n);
+	THCudaTensor_free(state, gradInput_n);
+}
diff --git a/compilation/EdgeComputation.h b/compilation/EdgeComputation.h
@@ -0,0 +1,101 @@
+#include "common.h"
+
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+template<typename Dtype>
+__global__ void EdgeComputation_kernel(const int num_kernels, Dtype* input, Dtype* output, int height, int width) {
+
+	CUDA_KERNEL_LOOP(index, num_kernels)
+	{
+		int point_offset = index;
+		int x = index % width;
+		int y = index / width;
+
+		int window_size = 1;
+		for (int m = -window_size; m <= window_size; m++) {
+			for (int n = -window_size; n <= window_size; n++) {
+				if (y+m < 0 || y+m >= height || x+n < 0 || x+n >= width)
+					continue;
+				int image_offset = (y + m) * width + x + n;
+				*(output + point_offset) += fabs(*(input + point_offset)-*(input + image_offset));
+			}
+		}
+
+		if (y-2 >= 0)
+			*(output + point_offset) += fabs(*(input + point_offset)-*(input + (y - 2) * width + x));
+		if (y+2 < height)
+			*(output + point_offset) += fabs(*(input + point_offset)-*(input + (y + 2) * width + x));
+		if (x-2 >= 0)
+			*(output + point_offset) += fabs(*(input + point_offset)-*(input + y * width + x - 2));
+		if (x+2 < width)
+			*(output + point_offset) += fabs(*(input + point_offset)-*(input + y * width + x + 2));
+
+		*(output + point_offset) = *(output + point_offset)/6;
+	}
+}
+
+template<typename Dtype>
+void EdgeComputation(cudaStream_t stream, Dtype* input, Dtype* output, int height, int width)
+{
+	int dimSize = 1024;
+	int num_kernels = height * width;
+	int grid = (num_kernels + dimSize - 1) / dimSize;
+	EdgeComputation_kernel<<<grid, dimSize, 0, stream>>>(num_kernels, input, output, height, width);
+}
+
+template<typename Dtype>
+__global__ void EdgeComputation_backward_kernel(const int num_kernels, Dtype* input, Dtype* gradOutput, Dtype* gradInput, int height, int width) {
+
+	CUDA_KERNEL_LOOP(index, num_kernels)
+	{
+		int point_offset = index;
+		int x = index % width;
+		int y = index / width;
+
+		int window_size = 1;
+		for (int m = -window_size; m <= window_size; m++) {
+			for (int n = -window_size; n <= window_size; n++) {
+				if (y+m < 0 || y+m >= height || x+n < 0 || x+n >= width)
+					continue;
+				int image_offset = (y + m) * width + x + n;
+
+				*(gradInput + point_offset) += (*(input + point_offset) > *(input + image_offset) ? 1 : -1) * *(gradOutput + point_offset);
+				*(gradInput + point_offset) += (*(input + point_offset) > *(input + image_offset) ? 1 : -1) * *(gradOutput + image_offset);
+			}
+		}
+
+		if (y-2 >= 0)
+		{
+			*(gradInput + point_offset) += (*(input + point_offset) > *(input + (y - 2) * width + x) ? 1 : -1) * *(gradOutput + point_offset);
+			*(gradInput + point_offset) += (*(input + point_offset) > *(input + (y - 2) * width + x) ? 1 : -1) * *(gradOutput + (y - 2) * width + x);
+		}
+		if (y+2 < height)
+		{
+			*(gradInput + point_offset) += (*(input + point_offset) > *(input + (y + 2) * width + x) ? 1 : -1) * *(gradOutput + point_offset);
+			*(gradInput + point_offset) += (*(input + point_offset) > *(input + (y + 2) * width + x) ? 1 : -1) * *(gradOutput + (y + 2) * width + x);
+		}
+		if (x-2 >= 0)
+		{
+			*(gradInput + point_offset) += (*(input + point_offset) > *(input + y * width + x - 2) ? 1 : -1) * *(gradOutput + point_offset);
+			*(gradInput + point_offset) += (*(input + point_offset) > *(input + y * width + x - 2) ? 1 : -1) * *(gradOutput + y * width + x - 2);
+		}
+		if (x+2 < width)
+		{
+			*(gradInput + point_offset) += (*(input + point_offset) > *(input + y * width + x + 2) ? 1 : -1) * *(gradOutput + point_offset);
+			*(gradInput + point_offset) += (*(input + point_offset) > *(input + y * width + x + 2) ? 1 : -1) * *(gradOutput + y * width + x + 2);
+		}
+
+		*(gradInput + point_offset) = *(gradInput + point_offset)/6;
+
+	}
+}
+
+template<typename Dtype>
+void EdgeComputation_backward(cudaStream_t stream, Dtype* input, Dtype* gradOutput, Dtype* gradInput, int height, int width)
+{
+	int dimSize = 1024;
+	int num_kernels = height * width;
+	int grid = (num_kernels + dimSize - 1) / dimSize;
+	EdgeComputation_backward_kernel<<<grid, dimSize, 0, stream>>>(num_kernels, input, gradOutput, gradInput, height, width);
+}
diff --git a/compilation/EdgeComputation.lua b/compilation/EdgeComputation.lua
@@ -0,0 +1,36 @@
+local THNN = require 'nn.THNN'
+local EdgeComputation, parent = torch.class('nn.EdgeComputation', 'nn.Module')
+
+function EdgeComputation:__init(scale)
+  parent.__init(self)
+  self.scale = scale or 1
+end
+
+function EdgeComputation:updateOutput(input)
+  self.scale = self.scale or 1
+  input = input / self.scale
+
+  local bs,dim,height,width = input:size(1),input:size(2),input:size(3),input:size(4)
+  input = torch.sum(input,2)
+  self.output = torch.CudaTensor():resizeAs(input):fill(0)
+
+  input.THNN.EdgeComputation_updateOutput(
+    input:cdata(),
+    self.output:cdata()
+  )
+  return self.output
+end
+
+function EdgeComputation:updateGradInput(input, gradOutput)
+    local bs,dim,height,width = input:size(1),input:size(2),input:size(3),input:size(4)
+    self.gradInput = torch.CudaTensor():resizeAs(gradOutput):zero()
+
+    input.THNN.EdgeComputation_updateGradInput(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata()
+    )
+    self.gradInput = torch.expand(self.gradInput,bs,dim,height,width) / self.scale
+
+    return self.gradInput
+end
diff --git a/compilation/EdgeDetector.cu b/compilation/EdgeDetector.cu
@@ -0,0 +1,38 @@
+#include "THCUNN.h"
+#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+
+#include "EdgeDetector.h"
+
+void THNN_CudaEdgeDetector_updateOutput(THCState *state, THCudaTensor *input_image, THCudaTensor *input_edge, THCudaTensor *label_preserve, THCudaTensor *label_eliminate, int isSmoothing) {
+
+	long batchSize = input_image->size[0];
+	long plane = input_image->size[1];
+	long height = input_image->size[2];
+	long width = input_image->size[3];
+
+	THCudaTensor *input_image_n = THCudaTensor_new(state);
+	THCudaTensor *input_edge_n = THCudaTensor_new(state);
+	THCudaTensor *label_preserve_n = THCudaTensor_new(state);
+	THCudaTensor *label_eliminate_n = THCudaTensor_new(state);
+
+	for (int elt = 0; elt < batchSize; elt ++) {
+		THCudaTensor_select(state, input_image_n, input_image, 0, elt);
+		THCudaTensor_select(state, input_edge_n, input_edge, 0, elt);
+		THCudaTensor_select(state, label_preserve_n, label_preserve, 0, elt);
+		THCudaTensor_select(state, label_eliminate_n, label_eliminate, 0, elt);
+
+		EdgeDetector(THCState_getCurrentStream(state),
+				THCudaTensor_data(state, input_image_n),
+				THCudaTensor_data(state, input_edge_n),
+				THCudaTensor_data(state, label_preserve_n),
+				THCudaTensor_data(state, label_eliminate_n),
+				height, width, isSmoothing);
+	}
+
+	THCudaTensor_free(state, input_image_n);
+	THCudaTensor_free(state, input_edge_n);
+	THCudaTensor_free(state, label_preserve_n);
+	THCudaTensor_free(state, label_eliminate_n);
+}