added openvino models, allow for explicit selection of openvino/pytorch or autoselect
pspillai committed May 6, 2019
1 parent c432d8c commit 2804041
Showing 35 changed files with 5,037 additions and 4 deletions.
34 changes: 34 additions & 0 deletions clkernels/mvn_custom_layer.xml
@@ -0,0 +1,34 @@
<CustomLayer name="MVN" type="SimpleGPU" version="1">
  <Kernel entry="mvn_kernel">
    <Source filename="mvn_layer.cl"/>
  </Kernel>
  <!-- inputs and outputs of the kernel -->
  <Buffers>
    <Tensor arg-index="0" type="input" port-index="0" format="any"/>
    <Tensor arg-index="1" type="output" port-index="0" format="any"/>
  </Buffers>
  <!-- OpenCL compiler options -->
  <CompilerOptions options="-cl-mad-enable"/>
  <!-- Global work size. The formulas may use the B,F,Y,X dimensions and the operators +,-,/,*,% (all evaluated in integer arithmetic).
       Default value: global="B*F*Y*X,1,1" -->
  <WorkSizes global="B*F,256,1" local="1,256,1"/>
</CustomLayer>
<CustomLayer name="MVN_Scale" type="SimpleGPU" version="1">
  <Kernel entry="mvn_scale_kernel">
    <Source filename="mvn_scale_layer.cl"/>
    <Define name="USE_RELU" type="int" param="use_relu" default="1"/>
  </Kernel>
  <!-- inputs and outputs of the kernel -->
  <Buffers>
    <Tensor arg-index="0" type="input" port-index="0" format="any"/>
    <Tensor arg-index="1" type="output" port-index="0" format="any"/>
    <Data arg-index="2" name="weights"/>
    <Data arg-index="3" name="biases"/>
  </Buffers>
  <!-- OpenCL compiler options -->
  <CompilerOptions options="-cl-mad-enable"/>
  <!-- Global work size. The formulas may use the B,F,Y,X dimensions and the operators +,-,/,*,% (all evaluated in integer arithmetic).
       Default value: global="B*F*Y*X,1,1" -->
  <WorkSizes global="B*F,256,1" local="1,256,1"/>
</CustomLayer>
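Usage note (an editor's sketch, not part of this commit): a custom-layer description like this is normally handed to the GPU (clDNN) plugin through the CONFIG_FILE plugin key before the network is loaded. A minimal sketch, assuming the 2019-era Inference Engine C++ API (Core, CNNNetReader) and placeholder IR paths model.xml / model.bin:

  #include <inference_engine.hpp>
  #include <map>
  #include <string>

  int main() {
      using namespace InferenceEngine;

      Core ie;
      // Point the GPU plugin at the custom-kernel description so that MVN /
      // MVN_Scale layers are dispatched to the OpenCL kernels added here.
      ie.SetConfig({{PluginConfigParams::KEY_CONFIG_FILE,
                     "clkernels/mvn_custom_layer.xml"}}, "GPU");

      // "model.xml" / "model.bin" are placeholders for an actual IR.
      CNNNetReader reader;
      reader.ReadNetwork("model.xml");
      reader.ReadWeights("model.bin");

      ExecutableNetwork exec = ie.LoadNetwork(reader.getNetwork(), "GPU");
      (void)exec;
      return 0;
  }

The name attributes (MVN, MVN_Scale) are expected to match the corresponding layer types in the IR so the plugin can route those layers to mvn_kernel and mvn_scale_kernel.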

74 changes: 74 additions & 0 deletions clkernels/mvn_layer.cl
@@ -0,0 +1,74 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define MIN(a,b) (((a)<(b))?(a):(b))

// Mean-variance normalization of one HxW plane per global id in dimension 0:
// LOCAL_WORKSIZE[1] work-items stride over the plane, and the partial sums are
// reduced across the sub-group with sub_group_reduce_add().
__kernel void mvn_kernel(
    const __global INPUT0_TYPE* input0,
    __global OUTPUT0_TYPE* output)
{
    const uint pxpitch  = MIN( INPUT0_PITCHES[2], INPUT0_PITCHES[3] );
    const uint imgsize  = INPUT0_DIMS[2] * INPUT0_DIMS[3];
    const uint startidx = (get_global_id(0) / INPUT0_DIMS[1]) * INPUT0_PITCHES[0] + (get_global_id(0) % INPUT0_DIMS[1]) * INPUT0_PITCHES[1];
    const uint endidx   = startidx + imgsize*pxpitch;
    const uint stride   = pxpitch*LOCAL_WORKSIZE[1];
    uint lid = get_local_id(1);

    float sum = 0;
    float sqsum = 0;

    // First pass: accumulate sum and sum of squares over this work-item's share of the plane.
    uint idx = startidx + lid*pxpitch;
    uint n = 0;
    while (idx < endidx) {
        sum   = sum + input0[idx];
        sqsum = sqsum + input0[idx]*input0[idx];
        idx += stride;
        n += 1;
    }
    sum   = sub_group_reduce_add(sum);
    sqsum = sub_group_reduce_add(sqsum);
    n     = sub_group_reduce_add(n);

    INPUT0_TYPE mean;
    INPUT0_TYPE invvar;
    mean   = sum / n;
    invvar = 1.0 / sqrt( sqsum / n - mean * mean + 0.0001);

    // Second pass: normalize.
    idx = startidx + lid*pxpitch;
    while (idx < endidx) {
        output[idx] = (input0[idx] - mean)*invvar;
        idx = idx + stride;
    }
}


// seems to work with FP32, with local group up to 1,16,1
__kernel void mvn_kernel_1(
    const __global INPUT0_TYPE* input0,
    __global OUTPUT0_TYPE* output)
{
    const uint pxpitch  = MIN( INPUT0_PITCHES[2], INPUT0_PITCHES[3] );
    const uint imgsize  = INPUT0_DIMS[2] * INPUT0_DIMS[3];
    const uint startidx = (get_global_id(0) / INPUT0_DIMS[1]) * INPUT0_PITCHES[0] + (get_global_id(0) % INPUT0_DIMS[1]) * INPUT0_PITCHES[1];
    const uint endidx   = startidx + imgsize*pxpitch;
    const uint stride   = pxpitch*LOCAL_WORKSIZE[1];
    uint lid = get_local_id(1);

    INPUT0_TYPE sum = 0;
    INPUT0_TYPE sqsum = 0;

    uint idx = startidx + lid*pxpitch;
    while (idx < endidx) {
        sum   = sum + input0[idx];
        sqsum = sqsum + input0[idx]*input0[idx];
        idx = idx + stride;
    }
    //sum = sub_group_reduce_add(sum);
    //sqsum = sub_group_reduce_add(sqsum);

    INPUT0_TYPE mean;
    INPUT0_TYPE invvar;
    mean   = sum / imgsize;
    invvar = 1.0 / sqrt( sqsum / imgsize - mean * mean + 0.0001);
    idx = startidx + lid*pxpitch;
    while (idx < endidx) {
        output[idx] = (input0[idx] - mean)*invvar;
        idx = idx + stride;
    }
}
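
For sanity-checking the kernels above (an editor's sketch, not part of the commit): a plain host-side reference of what mvn_kernel computes over a single, densely packed HxW plane, using the same 1e-4 epsilon. Running it per (batch, channel) plane of an NCHW tensor and comparing against the GPU output is a quick way to validate the custom layer.

  #include <cmath>
  #include <cstdio>
  #include <vector>

  // Reference MVN for one densely packed plane:
  // out[i] = (in[i] - mean) / sqrt(variance + 1e-4), as in mvn_kernel.
  static void mvn_plane_reference(const std::vector<float>& in, std::vector<float>& out) {
      const std::size_t n = in.size();
      double sum = 0.0, sqsum = 0.0;
      for (float v : in) { sum += v; sqsum += double(v) * v; }
      const double mean   = sum / n;
      const double invvar = 1.0 / std::sqrt(sqsum / n - mean * mean + 0.0001);
      out.resize(n);
      for (std::size_t i = 0; i < n; ++i)
          out[i] = float((in[i] - mean) * invvar);
  }

  int main() {
      std::vector<float> plane = {1.0f, 2.0f, 3.0f, 4.0f}, normalized;
      mvn_plane_reference(plane, normalized);
      for (float v : normalized) std::printf("%f\n", v);
      return 0;
  }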
60 changes: 60 additions & 0 deletions clkernels/mvn_scale_layer.cl
@@ -0,0 +1,60 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define MIN(a,b) (((a)<(b))?(a):(b))
#define MAX(a,b) (((a)>(b))?(a):(b))

// MVN fused with a per-channel scale/shift (and optional ReLU): each work-group handles one
// (batch, channel) plane, computes mean and variance, then writes
// output = input*scale + shift, with scale = weights[c]/stddev and shift = biases[c] - mean*scale.
__kernel void mvn_scale_kernel(
    const __global INPUT0_TYPE* input0,
    __global OUTPUT0_TYPE* output,
    const __global INPUT0_TYPE* weights,
    const __global INPUT0_TYPE* biases)
{
    const uint channel   = get_global_id(0) % INPUT0_DIMS[1];
    const uint startidx  = (get_global_id(0) / INPUT0_DIMS[1]) * INPUT0_PITCHES[0] + channel * INPUT0_PITCHES[1] + INPUT0_OFFSET;
    const uint startidx2 = (get_global_id(0) / OUTPUT0_DIMS[1]) * OUTPUT0_PITCHES[0] + channel * OUTPUT0_PITCHES[1] + OUTPUT0_OFFSET;
    const uint lid = get_local_id(1);

    float sum = 0;
    float sqsum = 0;

    // First pass: accumulate sum and sum of squares over this work-item's share of the plane.
    uint n = 0;
    uint y = 0;
    uint x = lid;
    while (y < INPUT0_DIMS[2]) {
        while (x < INPUT0_DIMS[3]) {
            uint idx = startidx + y*INPUT0_PITCHES[2] + x*INPUT0_PITCHES[3];
            sum   = sum + input0[idx];
            sqsum = sqsum + input0[idx]*input0[idx];
            x += LOCAL_WORKSIZE[1];
            n += 1;
        }
        y += x / INPUT0_DIMS[3];
        x  = x % INPUT0_DIMS[3];
    }
    sum   = sub_group_reduce_add(sum);
    sqsum = sub_group_reduce_add(sqsum);
    n     = sub_group_reduce_add(n);

    INPUT0_TYPE mean  = sum / n;
    INPUT0_TYPE scale = weights[channel] / sqrt( sqsum / n - mean * mean + 0.0001);
    INPUT0_TYPE shift = biases[channel] - mean*scale;
    OUTPUT0_TYPE value;

    // Second pass: normalize, scale, shift, and optionally apply ReLU.
    y = 0;
    x = lid;
    while (y < OUTPUT0_DIMS[2]) {
        while (x < OUTPUT0_DIMS[3]) {
            uint idx  = startidx  + y*INPUT0_PITCHES[2]  + x*INPUT0_PITCHES[3];
            uint idx2 = startidx2 + y*OUTPUT0_PITCHES[2] + x*OUTPUT0_PITCHES[3];
            value = input0[idx]*scale + shift;
#if USE_RELU!=0
            output[idx2] = MAX(value, 0.0);
#else
            output[idx2] = value;
#endif
            x += LOCAL_WORKSIZE[1];
        }
        y += x / OUTPUT0_DIMS[3];
        x  = x % OUTPUT0_DIMS[3];
    }
}
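
For reference (an editor's note, not part of the commit): the per-channel scale and shift above are simply MVN followed by a Scale layer, folded into a single multiply-add per element, with epsilon = 1e-4:

\[
y \;=\; w_c\,\frac{x-\mu_c}{\sqrt{\sigma_c^{2}+\varepsilon}} + b_c
  \;=\; x\cdot \mathrm{scale}_c + \mathrm{shift}_c,
\qquad
\mathrm{scale}_c = \frac{w_c}{\sqrt{\sigma_c^{2}+\varepsilon}},
\quad
\mathrm{shift}_c = b_c - \mu_c\,\mathrm{scale}_c .
\]

When USE_RELU is nonzero, the result is additionally clamped at zero before being written to output.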

