-
Notifications
You must be signed in to change notification settings - Fork 20
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
added openvino models, allow for explicit selection of openvino/pytor…
…ch or autoselect
- Loading branch information
Showing
35 changed files
with
5,037 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
<CustomLayer name="MVN" type="SimpleGPU" version="1"> | ||
<Kernel entry="mvn_kernel"> | ||
<Source filename="mvn_layer.cl"/> | ||
</Kernel> | ||
<!-- inputs and outputs of the kernel--> | ||
<Buffers> | ||
<Tensor arg-index="0" type="input" port-index="0" format="any"/> | ||
<Tensor arg-index="1" type="output" port-index="0" format="any"/> | ||
</Buffers> | ||
<!-- OpenCL compiler options--> | ||
<CompilerOptions options="-cl-mad-enable"/> | ||
<!-- define the global worksize. The formulas can use the values of the B,F,Y,X dimensions and contain the operators: +,-,/,*,% (all evaluated in integer arithmetic) | ||
Default value: global="B*F*Y*X,1,1"--> | ||
<WorkSizes global="B*F,256,1" local="1,256,1"/> | ||
</CustomLayer> | ||
<CustomLayer name="MVN_Scale" type="SimpleGPU" version="1"> | ||
<Kernel entry="mvn_scale_kernel"> | ||
<Source filename="mvn_scale_layer.cl"/> | ||
<Define name="USE_RELU" type="int" param="use_relu" default="1" /> | ||
</Kernel> | ||
<!-- inputs and outputs of the kernel--> | ||
<Buffers> | ||
<Tensor arg-index="0" type="input" port-index="0" format="any"/> | ||
<Tensor arg-index="1" type="output" port-index="0" format="any"/> | ||
<Data arg-index="2" name="weights"/> | ||
<Data arg-index="3" name="biases"/> | ||
</Buffers> | ||
<!-- OpenCL compiler options--> | ||
<CompilerOptions options="-cl-mad-enable"/> | ||
<!-- define the global worksize. The formulas can use the values of the B,F,Y,X dimensions and contain the operators: +,-,/,*,% (all evaluated in integer arithmetic) | ||
Default value: global="B*F*Y*X,1,1"--> | ||
<WorkSizes global="B*F,256,1" local="1,256,1"/> | ||
</CustomLayer> | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable | ||
#define MIN(a,b) ((a)<(b))?(a):(b) | ||
__kernel void mvn_kernel( | ||
const __global INPUT0_TYPE* input0, | ||
__global OUTPUT0_TYPE* output) | ||
{ | ||
const uint pxpitch = MIN( INPUT0_PITCHES[2], INPUT0_PITCHES[3] ); | ||
const uint imgsize = INPUT0_DIMS[2] * INPUT0_DIMS[3]; | ||
const uint startidx = (get_global_id(0) / INPUT0_DIMS[1]) * INPUT0_PITCHES[0] + (get_global_id(0) % INPUT0_DIMS[1]) * INPUT0_PITCHES[1]; | ||
const uint endidx = startidx + imgsize*pxpitch; | ||
const uint stride = pxpitch*LOCAL_WORKSIZE[1]; | ||
uint lid = get_local_id(1); | ||
|
||
float sum=0; | ||
float sqsum=0; | ||
|
||
uint idx = startidx + lid*pxpitch; | ||
uint n = 0; | ||
while (idx<endidx) { | ||
sum = sum + input0[idx]; | ||
sqsum = sqsum + input0[idx]*input0[idx]; | ||
idx += stride; | ||
n += 1; | ||
} | ||
sum = sub_group_reduce_add(sum); | ||
sqsum = sub_group_reduce_add(sqsum); | ||
n = sub_group_reduce_add(n); | ||
|
||
INPUT0_TYPE mean; | ||
INPUT0_TYPE invvar; | ||
mean = sum / n; | ||
invvar = 1.0 / sqrt( sqsum / n - mean * mean + 0.0001); | ||
idx = startidx + lid*pxpitch; | ||
while (idx<endidx) { | ||
output[idx] = (input0[idx] - mean)*invvar; | ||
idx = idx+stride; | ||
} | ||
} | ||
|
||
|
||
// seems to work with FP32, with local group up to 1,16,1 | ||
__kernel void mvn_kernel_1( | ||
const __global INPUT0_TYPE* input0, | ||
__global OUTPUT0_TYPE* output) | ||
{ | ||
const uint pxpitch = MIN( INPUT0_PITCHES[2], INPUT0_PITCHES[3] ); | ||
const uint imgsize = INPUT0_DIMS[2] * INPUT0_DIMS[3]; | ||
const uint startidx = (get_global_id(0) / INPUT0_DIMS[1]) * INPUT0_PITCHES[0] + (get_global_id(0) % INPUT0_DIMS[1]) * INPUT0_PITCHES[1]; | ||
const uint endidx = startidx + imgsize*pxpitch; | ||
const uint stride = pxpitch*LOCAL_WORKSIZE[1]; | ||
uint lid = get_local_id(1); | ||
|
||
INPUT0_TYPE sum=0; | ||
INPUT0_TYPE sqsum=0; | ||
|
||
uint idx = startidx + lid*pxpitch; | ||
while (idx<endidx) { | ||
sum = sum + input0[idx]; | ||
sqsum = sqsum + input0[idx]*input0[idx]; | ||
idx = idx+stride; | ||
} | ||
//sum = sub_group_reduce_add(sum); | ||
//sqsum = sub_group_reduce_add(sqsum); | ||
|
||
INPUT0_TYPE mean; | ||
INPUT0_TYPE invvar; | ||
mean = sum / imgsize; | ||
invvar = 1.0 / sqrt( sqsum / imgsize - mean * mean + 0.0001); | ||
idx = startidx + lid*pxpitch; | ||
while (idx<endidx) { | ||
output[idx] = (input0[idx] - mean)*invvar; | ||
idx = idx+stride; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable | ||
#define MIN(a,b) ((a)<(b))?(a):(b) | ||
#define MAX(a,b) ((a)>(b))?(a):(b) | ||
__kernel void mvn_scale_kernel( | ||
const __global INPUT0_TYPE* input0, | ||
__global OUTPUT0_TYPE* output, | ||
const __global INPUT0_TYPE* weights, | ||
const __global INPUT0_TYPE* biases) | ||
{ | ||
const uint channel = get_global_id(0) % INPUT0_DIMS[1]; | ||
const uint startidx = (get_global_id(0) / INPUT0_DIMS[1]) * INPUT0_PITCHES[0] + channel * INPUT0_PITCHES[1] + INPUT0_OFFSET; | ||
const uint startidx2 = (get_global_id(0) / OUTPUT0_DIMS[1]) * OUTPUT0_PITCHES[0] + channel * OUTPUT0_PITCHES[1] + OUTPUT0_OFFSET; | ||
const uint lid = get_local_id(1); | ||
|
||
float sum=0; | ||
float sqsum=0; | ||
|
||
uint n = 0; | ||
uint y = 0; | ||
uint x = lid; | ||
while (y<INPUT0_DIMS[2]) { | ||
while (x<INPUT0_DIMS[3]) { | ||
uint idx = startidx + y*INPUT0_PITCHES[2] + x*INPUT0_PITCHES[3]; | ||
sum = sum + input0[idx]; | ||
sqsum = sqsum + input0[idx]*input0[idx]; | ||
x += LOCAL_WORKSIZE[1]; | ||
n += 1; | ||
} | ||
y += x / INPUT0_DIMS[3]; | ||
x = x % INPUT0_DIMS[3]; | ||
} | ||
sum = sub_group_reduce_add(sum); | ||
sqsum = sub_group_reduce_add(sqsum); | ||
n = sub_group_reduce_add(n); | ||
|
||
INPUT0_TYPE mean = sum / n; | ||
INPUT0_TYPE scale = weights[channel] / sqrt( sqsum / n - mean * mean + 0.0001); | ||
INPUT0_TYPE shift = biases[channel] - mean*scale; | ||
OUTPUT0_TYPE value; | ||
|
||
y = 0; | ||
x = lid; | ||
while (y<OUTPUT0_DIMS[2]) { | ||
while (x<OUTPUT0_DIMS[3]) { | ||
uint idx = startidx + y*INPUT0_PITCHES[2] + x*INPUT0_PITCHES[3]; | ||
uint idx2 = startidx2 + y*OUTPUT0_PITCHES[2] + x*OUTPUT0_PITCHES[3]; | ||
value = input0[idx]*scale + shift; | ||
#if USE_RELU!=0 | ||
output[idx2] = MAX(value, 0.0); | ||
#else | ||
output[idx2] = value; | ||
#endif | ||
x += LOCAL_WORKSIZE[1]; | ||
} | ||
y += x / OUTPUT0_DIMS[3]; | ||
x = x % OUTPUT0_DIMS[3]; | ||
} | ||
} | ||
|
||
|
Oops, something went wrong.