Permalink
Browse files

Merge branch 'tiling'

  • Loading branch information...
2 parents 76a324e + a0cf7f6 commit 2300c2ddaf9ef43582de460158fea7480c7c7f13 @upegelow upegelow committed Apr 23, 2012
Showing with 353 additions and 94 deletions.
  1. +186 −76 data/kernels/nlmeans.cl
  2. +1 −0 po/POTFILES.in
  3. +166 −18 src/iop/nlmeans.c
View
262 data/kernels/nlmeans.cl
@@ -1,6 +1,7 @@
/*
This file is part of darktable,
copyright (c) 2011 johannes hanika.
+ copyright (c) 2012 Ulrich Pegelow.
darktable is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -18,107 +19,216 @@
const sampler_t sampleri = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
const sampler_t samplerf = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
+const sampler_t samplerc = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
#define ICLAMP(a, mn, mx) ((a) < (mn) ? (mn) : ((a) > (mx) ? (mx) : (a)))
+
+
+/*
+ To speed up processing we use an algorithm proposed from B. Goossens, H.Q. Luong, J. Aelterman, A. Pizurica, and W. Philips,
+ "A GPU-Accelerated Real-Time NLMeans Algorithm for Denoising Color Video Sequences", in Proc. ACIVS (2), 2010, pp.46-57.
+
+ Benchmarking figures (export of a 20MPx image on a i7-2600 with an NVIDIA GTS450):
+
+ This GPU-code: 18s
+ Brute force GPU-code: 136s
+ Optimized CPU-code: 27s
+
+*/
+
+
float gh(const float f)
{
// make spread bigger: less smoothing
const float spread = 100.f;
return 1.0f/(1.0f + fabs(f)*spread);
}
+
+float ddirac(const int2 q)
+{
+ return ((q.x || q.y) ? 1.0f : 0.0f);
+}
+
+
kernel void
-nlmeans (read_only image2d_t in, write_only image2d_t out, const int width, const int height, const int P, const int K, const float nL, const float nC)
+nlmeans_init(write_only image2d_t out, const int width, const int height)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
- const int maxx = width - 1;
- const int maxy = height - 1;
- const float4 norm2 = (float4)(nL, nC, nC, 1.0f);
-
-#if 0
- // this is 20s (compared to 29s brute force below) but still unusable:
- // load a block of shared memory, initialize to zero
- local float4 block[32*32];//get_local_size(0)*get_local_size(1)];
- block[get_local_id(0) + get_local_id(1) * get_local_size(0)] = (float4)0.0f;
- barrier(CLK_LOCAL_MEM_FENCE);
if(x >= width || y >= height) return;
- // coalesced mem accesses:
- const float4 p1 = read_imagef(in, sampleri, (int2)(x, y));
+ write_imagef (out, (int2)(x, y), (float4)0.0f);
+}
+
- // for each shift vector
- for(int kj=-K;kj<=K;kj++)
+kernel void
+nlmeans_dist(read_only image2d_t in, write_only image2d_t U4, const int width, const int height,
+ const int2 q, const float nL2, const float nC2)
+{
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+ const float4 norm2 = (float4)(nL2, nC2, nC2, 1.0f);
+
+ if(x >= width || y >= height) return;
+
+ float4 p1 = read_imagef(in, sampleri, (int2)(x, y));
+ float4 p2 = read_imagef(in, sampleri, (int2)(x, y) + q);
+ float4 tmp = (p1 - p2)*(p1 - p2)*norm2;
+ float dist = tmp.x + tmp.y + tmp.z;
+
+ write_imagef (U4, (int2)(x, y), dist);
+}
+
+kernel void
+nlmeans_horiz(read_only image2d_t U4_in, write_only image2d_t U4_out, const int width, const int height,
+ const int2 q, const int P, local float *buffer)
+{
+ const int lid = get_local_id(0);
+ const int lsz = get_local_size(0);
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+
+ if(y >= height) return;
+
+ /* fill center part of buffer */
+ buffer[P + lid] = read_imagef(U4_in, samplerc, (int2)(x, y)).x;
+
+ /* left wing of buffer */
+ for(int n=0; n <= P/lsz; n++)
{
- for(int ki=-K;ki<=K;ki++)
- {
- const float4 p2 = read_imagef(in, sampleri, (int2)(ICLAMP(x+ki, 0, maxx), ICLAMP(y+kj, 0, maxy)));
- const float4 tmp = (p1 - p2)*norm2;
- const float dist = tmp.x + tmp.y + tmp.z;
- for(int pj=-P;pj<=P;pj++)
- {
- for(int pi=-P;pi<=P;pi++)
- {
- float4 p2 = read_imagef(in, sampleri, (int2)(ICLAMP(x+pi+ki, 0, maxx), ICLAMP(y+pj+kj, 0, maxy)));
- p2.w = dist;
- const int i = get_local_id(0) + pi, j = get_local_id(1) + pj;
- if(i >= 0 && i < get_local_size(0) && j >= 0 && j < get_local_size(1))
- {
- // TODO: for non-linear gh(), this produces results different than the CPU
- block[i + get_local_size(0) * j].x += gh(p2.x);
- block[i + get_local_size(0) * j].y += gh(p2.y);
- block[i + get_local_size(0) * j].z += gh(p2.z);
- block[i + get_local_size(0) * j].w += gh(p2.w);
- }
- }
- }
- }
+ const int l = mad24(n, lsz, lid + 1);
+ if(l > P) continue;
+ const int xx = mad24((int)get_group_id(0), lsz, -l);
+ buffer[P - l] = read_imagef(U4_in, samplerc, (int2)(xx, y)).x;
}
- // write back normalized shm
+
+ /* right wing of buffer */
+ for(int n=0; n <= P/lsz; n++)
+ {
+ const int r = mad24(n, lsz, lsz - lid);
+ if(r > P) continue;
+ const int xx = mad24((int)get_group_id(0), lsz, lsz - 1 + r);
+ buffer[P + lsz - 1 + r] = read_imagef(U4_in, samplerc, (int2)(xx, y)).x;
+ }
+
barrier(CLK_LOCAL_MEM_FENCE);
- const float4 tmp = block[get_local_id(0) + get_local_id(1) * get_local_size(0)];
- tmp.x *= 1.0f/tmp.w;
- tmp.y *= 1.0f/tmp.w;
- tmp.z *= 1.0f/tmp.w;
- write_imagef (out, (int2)(x, y), tmp);
-#endif
+ if(x >= width) return;
-#if 1
- if(x >= width || y >= height) return;
+ buffer += lid + P;
+
+ float distacc = 0.0f;
+ for(int pi = -P; pi <= P; pi++)
+ {
+ distacc += buffer[pi];
+ }
+
+ write_imagef (U4_out, (int2)(x, y), distacc);
+}
+
+
+kernel void
+nlmeans_vert(read_only image2d_t U4_in, write_only image2d_t U4_out, const int width, const int height,
+ const int2 q, const int P, local float *buffer)
+{
+ const int lid = get_local_id(1);
+ const int lsz = get_local_size(1);
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+
+ if(x >= width) return;
+
+ /* fill center part of buffer */
+ buffer[P + lid] = read_imagef(U4_in, samplerc, (int2)(x, y)).x;
+
+ /* left wing of buffer */
+ for(int n=0; n <= P/lsz; n++)
+ {
+ const int l = mad24(n, lsz, lid + 1);
+ if(l > P) continue;
+ const int yy = mad24((int)get_group_id(1), lsz, -l);
+ buffer[P - l] = read_imagef(U4_in, samplerc, (int2)(x, yy)).x;
+ }
- const float4 acc = (float4)(0.0f);
- // brute force (superslow baseline)!
- // for each shift vector
- for(int kj=-K;kj<=K;kj++)
+ /* right wing of buffer */
+ for(int n=0; n <= P/lsz; n++)
{
- for(int ki=-K;ki<=K;ki++)
- {
- float dist = 0.0f;
- for(int pj=-P;pj<=P;pj++)
- {
- for(int pi=-P;pi<=P;pi++)
- {
- float4 p1 = read_imagef(in, sampleri, (int2)(ICLAMP(x+pi, 0, maxx), ICLAMP(y+pj, 0, maxy)));
- float4 p2 = read_imagef(in, sampleri, (int2)(ICLAMP(x+pi+ki, 0, maxx), ICLAMP(y+pj+kj, 0, maxy)));
- float4 tmp = (p1 - p2)*norm2;
- dist += tmp.x + tmp.y + tmp.z;
- }
- }
- float4 pin = read_imagef(in, sampleri, (int2)(ICLAMP(x+ki, 0, maxx), ICLAMP(y+kj, 0, maxy)));
- dist = gh(dist);
- acc.x += dist * pin.x;
- acc.y += dist * pin.y;
- acc.z += dist * pin.z;
- acc.w += dist;
- }
+ const int r = mad24(n, lsz, lsz - lid);
+ if(r > P) continue;
+ const int yy = mad24((int)get_group_id(1), lsz, lsz - 1 + r);
+ buffer[P + lsz - 1 + r] = read_imagef(U4_in, samplerc, (int2)(x, yy)).x;
}
- acc.x *= 1.0f/acc.w;
- acc.y *= 1.0f/acc.w;
- acc.z *= 1.0f/acc.w;
- write_imagef (out, (int2)(x, y), acc);
-#endif
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if(y >= height) return;
+
+ buffer += lid + P;
+
+ float distacc = 0.0f;
+ for(int pj = -P; pj <= P; pj++)
+ {
+ distacc += buffer[pj];
+ }
+
+ distacc = gh(distacc);
+
+ write_imagef (U4_out, (int2)(x, y), distacc);
}
+
+
+kernel void
+nlmeans_accu(read_only image2d_t in, read_only image2d_t U2_in, read_only image2d_t U4_in,
+ write_only image2d_t U2_out, const int width, const int height,
+ const int2 q)
+{
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+
+ if(x >= width || y >= height) return;
+
+ float4 u1_pq = read_imagef(in, sampleri, (int2)(x, y) + q);
+ float4 u1_mq = read_imagef(in, sampleri, (int2)(x, y) - q);
+
+ float4 u2 = read_imagef(U2_in, sampleri, (int2)(x, y));
+
+ float u4 = read_imagef(U4_in, sampleri, (int2)(x, y)).x;
+ float u4_mq = read_imagef(U4_in, sampleri, (int2)(x, y) - q).x;
+
+ float u3 = u2.w;
+ float u4_mq_dd = u4_mq * ddirac(q);
+
+ u2 += (u4 * u1_pq) + (u4_mq_dd * u1_mq);
+ u3 += (u4 + u4_mq_dd);
+
+ u2.w = u3;
+
+ write_imagef(U2_out, (int2)(x, y), u2);
+}
+
+
+kernel void
+nlmeans_finish(read_only image2d_t in, read_only image2d_t U2, write_only image2d_t out,
+ const int width, const int height, const float4 weight)
+{
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+
+ if(x >= width || y >= height) return;
+
+ float4 i = read_imagef(in, sampleri, (int2)(x, y));
+ float4 u2 = read_imagef(U2, sampleri, (int2)(x, y));
+ float u3 = u2.w;
+
+ float4 o = i * ((float4)1.0f - weight) + u2/u3 * weight;
+ o.w = i.w;
+
+ write_imagef(out, (int2)(x, y), o);
+}
+
+
+
View
1 po/POTFILES.in
@@ -17,6 +17,7 @@ src/control/jobs/film_jobs.c
src/control/jobs/image_jobs.c
src/develop/develop.c
src/develop/imageop.c
+src/develop/tiling.c
src/dtgtk/resetlabel.c
src/dtgtk/slider.c
src/libs/similarity.c
View
184 src/iop/nlmeans.c
@@ -19,6 +19,7 @@
#include "config.h"
#endif
#include "develop/imageop.h"
+#include "develop/tiling.h"
#include "bauhaus/bauhaus.h"
#include "control/control.h"
#include "gui/accelerators.h"
@@ -28,6 +29,8 @@
#include <stdlib.h>
#include <xmmintrin.h>
+#define BLOCKSIZE 2048 /* maximum blocksize. must be a power of 2 and will be automatically reduced if needed */
+
// this is the version of the modules parameters,
// and includes version information about compile-time dt
DT_MODULE(1)
@@ -51,7 +54,12 @@ typedef dt_iop_nlmeans_params_t dt_iop_nlmeans_data_t;
typedef struct dt_iop_nlmeans_global_data_t
{
- int kernel_nlmeans;
+ int kernel_nlmeans_init;
+ int kernel_nlmeans_dist;
+ int kernel_nlmeans_horiz;
+ int kernel_nlmeans_vert;
+ int kernel_nlmeans_accu;
+ int kernel_nlmeans_finish;
}
dt_iop_nlmeans_global_data_t;
@@ -69,7 +77,7 @@ groups ()
int
flags ()
{
- return IOP_FLAGS_SUPPORTS_BLENDING;
+ return IOP_FLAGS_SUPPORTS_BLENDING | IOP_FLAGS_ALLOW_TILING;
}
void init_key_accels(dt_iop_module_so_t *self)
@@ -99,18 +107,23 @@ static float gh(const float f)
return 1.0f/(1.0f + fabsf(f)*spread);
}
-// temporarily disabled, because it is really quite unbearably slow the way it is implemented now..
-#if 0//def HAVE_OPENCL
+
+#ifdef HAVE_OPENCL
int
process_cl (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_in, cl_mem dev_out, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
dt_iop_nlmeans_params_t *d = (dt_iop_nlmeans_params_t *)piece->data;
dt_iop_nlmeans_global_data_t *gd = (dt_iop_nlmeans_global_data_t *)self->data;
+
+
const int devid = piece->pipe->devid;
const int width = roi_in->width;
const int height = roi_in->height;
+ cl_mem dev_U4 = NULL;
+
cl_int err = -999;
+
const int P = ceilf(3 * roi_in->scale / piece->iscale); // pixel filter size
const int K = ceilf(7 * roi_in->scale / piece->iscale); // nbhood
@@ -122,28 +135,153 @@ process_cl (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem
if (err != CL_SUCCESS) goto error;
return TRUE;
}
- float max_L = 100.0f, max_C = 256.0f;
- float nL = 1.0f/(d->luma*max_L), nC = 1.0f/(d->chroma*max_C);
- nL *= nL; nC *= nC;
+
+ float max_L = 120.0f, max_C = 512.0f;
+ float nL = 1.0f/max_L, nC = 1.0f/max_C;
+ float nL2 = nL*nL, nC2 = nC*nC;
+ float weight[4] = { powf(d->luma, 0.6), powf(d->chroma, 0.6), powf(d->chroma, 0.6), 1.0f };
+
+ dev_U4 = dt_opencl_alloc_device(devid, roi_out->width, roi_out->height, sizeof(float));
+ if (dev_U4 == NULL) goto error;
+
+ // prepare local work group
+ size_t maxsizes[3] = { 0 }; // the maximum dimensions for a work group
+ size_t workgroupsize = 0; // the maximum number of items in a work group
+ unsigned long localmemsize = 0; // the maximum amount of local memory we can use
+ size_t kernelworkgroupsize = 0; // the maximum amount of items in work group of the kernel
+ // assuming this is the same for nlmeans_horiz and nlmeans_vert
+
+ // make sure blocksize is not too large
+ int blocksize = BLOCKSIZE;
+ if(dt_opencl_get_work_group_limits(devid, maxsizes, &workgroupsize, &localmemsize) == CL_SUCCESS &&
+ dt_opencl_get_kernel_work_group_size(devid, gd->kernel_nlmeans_horiz, &kernelworkgroupsize) == CL_SUCCESS)
+ {
+ // reduce blocksize step by step until it fits to limits
+ while(blocksize > maxsizes[0] || blocksize > maxsizes[1] || blocksize > kernelworkgroupsize
+ || blocksize > workgroupsize || (blocksize+2*P)*sizeof(float) > localmemsize)
+ {
+ if(blocksize == 1) break;
+ blocksize >>= 1;
+ }
+ }
+ else
+ {
+ blocksize = 1; // slow but safe
+ }
+
+ const size_t bwidth = width % blocksize == 0 ? width : (width / blocksize + 1)*blocksize;
+ const size_t bheight = height % blocksize == 0 ? height : (height / blocksize + 1)*blocksize;
+
+ size_t sizesl[3];
+ size_t local[3];
size_t sizes[] = { ROUNDUPWD(width), ROUNDUPHT(height), 1};
- dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans, 0, sizeof(cl_mem), (void *)&dev_in);
- dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans, 1, sizeof(cl_mem), (void *)&dev_out);
- dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans, 2, sizeof(int), (void *)&width);
- dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans, 3, sizeof(int), (void *)&height);
- dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans, 4, sizeof(int32_t), (void *)&P);
- dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans, 5, sizeof(int32_t), (void *)&K);
- dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans, 6, sizeof(float), (void *)&nL);
- dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans, 7, sizeof(float), (void *)&nC);
- err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_nlmeans, sizes);
+
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_init, 0, sizeof(cl_mem), (void *)&dev_out);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_init, 1, sizeof(int), (void *)&width);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_init, 2, sizeof(int), (void *)&height);
+ err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_nlmeans_init, sizes);
if(err != CL_SUCCESS) goto error;
+
+
+
+ for(int j = -K; j <= 0; j++)
+ for(int i = -K; i <= K; i++)
+ {
+ int q[2] = { i, j};
+
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_dist, 0, sizeof(cl_mem), (void *)&dev_in);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_dist, 1, sizeof(cl_mem), (void *)&dev_U4);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_dist, 2, sizeof(int), (void *)&width);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_dist, 3, sizeof(int), (void *)&height);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_dist, 4, 2*sizeof(int), (void *)&q);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_dist, 5, sizeof(float), (void *)&nL2);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_dist, 6, sizeof(float), (void *)&nC2);
+ err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_nlmeans_dist, sizes);
+ if(err != CL_SUCCESS) goto error;
+
+ sizesl[0] = bwidth;
+ sizesl[1] = ROUNDUPHT(height);
+ sizesl[2] = 1;
+ local[0] = blocksize;
+ local[1] = 1;
+ local[2] = 1;
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_horiz, 0, sizeof(cl_mem), (void *)&dev_U4);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_horiz, 1, sizeof(cl_mem), (void *)&dev_U4);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_horiz, 2, sizeof(int), (void *)&width);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_horiz, 3, sizeof(int), (void *)&height);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_horiz, 4, 2*sizeof(int), (void *)&q);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_horiz, 5, sizeof(int), (void *)&P);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_horiz, 6, (blocksize+2*P)*sizeof(float), NULL);
+ err = dt_opencl_enqueue_kernel_2d_with_local(devid, gd->kernel_nlmeans_horiz, sizesl, local);
+ if(err != CL_SUCCESS) goto error;
+
+
+ sizesl[0] = ROUNDUPWD(width);
+ sizesl[1] = bheight;
+ sizesl[2] = 1;
+ local[0] = 1;
+ local[1] = blocksize;
+ local[2] = 1;
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_vert, 0, sizeof(cl_mem), (void *)&dev_U4);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_vert, 1, sizeof(cl_mem), (void *)&dev_U4);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_vert, 2, sizeof(int), (void *)&width);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_vert, 3, sizeof(int), (void *)&height);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_vert, 4, 2*sizeof(int), (void *)&q);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_vert, 5, sizeof(int), (void *)&P);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_vert, 6, (blocksize+2*P)*sizeof(float), NULL);
+ err = dt_opencl_enqueue_kernel_2d_with_local(devid, gd->kernel_nlmeans_vert, sizesl, local);
+ if(err != CL_SUCCESS) goto error;
+
+
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_accu, 0, sizeof(cl_mem), (void *)&dev_in);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_accu, 1, sizeof(cl_mem), (void *)&dev_out);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_accu, 2, sizeof(cl_mem), (void *)&dev_U4);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_accu, 3, sizeof(cl_mem), (void *)&dev_out);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_accu, 4, sizeof(int), (void *)&width);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_accu, 5, sizeof(int), (void *)&height);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_accu, 6, 2*sizeof(int), (void *)&q);
+ err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_nlmeans_accu, sizes);
+ if(err != CL_SUCCESS) goto error;
+
+ dt_opencl_finish(devid);
+ }
+
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_finish, 0, sizeof(cl_mem), (void *)&dev_in);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_finish, 1, sizeof(cl_mem), (void *)&dev_out);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_finish, 2, sizeof(cl_mem), (void *)&dev_out);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_finish, 3, sizeof(int), (void *)&width);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_finish, 4, sizeof(int), (void *)&height);
+ dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans_finish, 5, 4*sizeof(float), (void *)&weight);
+ err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_nlmeans_finish, sizes);
+ if(err != CL_SUCCESS) goto error;
+
+ dt_opencl_release_mem_object(dev_U4);
return TRUE;
error:
+ if(dev_U4 != NULL) dt_opencl_release_mem_object(dev_U4);
dt_print(DT_DEBUG_OPENCL, "[opencl_nlmeans] couldn't enqueue kernel! %d\n", err);
return FALSE;
}
#endif
+
+void tiling_callback (struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out, struct dt_develop_tiling_t *tiling)
+{
+ const int P = ceilf(3 * roi_in->scale / piece->iscale); // pixel filter size
+ const int K = ceilf(7 * roi_in->scale / piece->iscale); // nbhood
+
+ tiling->factor = 2.25f; // in + out + tmp
+ tiling->maxbuf = 1.0f;
+ tiling->overhead = 0;
+ tiling->overlap = P+K;
+ tiling->xalign = 1;
+ tiling->yalign = 1;
+ return;
+}
+
+
+
/** process, all real work is done here. */
void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
@@ -371,13 +509,23 @@ void init_global(dt_iop_module_so_t *module)
const int program = 5; // nlmeans.cl, from programs.conf
dt_iop_nlmeans_global_data_t *gd = (dt_iop_nlmeans_global_data_t *)malloc(sizeof(dt_iop_nlmeans_global_data_t));
module->data = gd;
- gd->kernel_nlmeans = dt_opencl_create_kernel(program, "nlmeans");
+ gd->kernel_nlmeans_init = dt_opencl_create_kernel(program, "nlmeans_init");
+ gd->kernel_nlmeans_dist = dt_opencl_create_kernel(program, "nlmeans_dist");
+ gd->kernel_nlmeans_horiz = dt_opencl_create_kernel(program, "nlmeans_horiz");
+ gd->kernel_nlmeans_vert = dt_opencl_create_kernel(program, "nlmeans_vert");
+ gd->kernel_nlmeans_accu = dt_opencl_create_kernel(program, "nlmeans_accu");
+ gd->kernel_nlmeans_finish = dt_opencl_create_kernel(program, "nlmeans_finish");
}
void cleanup_global(dt_iop_module_so_t *module)
{
dt_iop_nlmeans_global_data_t *gd = (dt_iop_nlmeans_global_data_t *)module->data;
- dt_opencl_free_kernel(gd->kernel_nlmeans);
+ dt_opencl_free_kernel(gd->kernel_nlmeans_init);
+ dt_opencl_free_kernel(gd->kernel_nlmeans_dist);
+ dt_opencl_free_kernel(gd->kernel_nlmeans_horiz);
+ dt_opencl_free_kernel(gd->kernel_nlmeans_vert);
+ dt_opencl_free_kernel(gd->kernel_nlmeans_accu);
+ dt_opencl_free_kernel(gd->kernel_nlmeans_finish);
free(module->data);
module->data = NULL;
}

0 comments on commit 2300c2d

Please sign in to comment.