Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve precision of HLR chroma corrections #13646

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 19 additions & 11 deletions data/kernels/basic.cl
Original file line number Diff line number Diff line change
Expand Up @@ -395,18 +395,26 @@ highlights_dilatemask (global char *in, global char *out,
}

kernel void
highlights_chroma (read_only image2d_t in, global char *mask, global float *accu,
const int width, const int height,
const int pwidth, const int psize,
const int filters, global const unsigned char (*const xtrans)[6],
global const float *clips, global const float *dark)
highlights_chroma (
read_only image2d_t in,
global char *mask,
global double *accu,
const int width,
const int height,
const int pwidth,
const int psize,
const int filters,
global const unsigned char (*const xtrans)[6],
global const float *clips,
global const float *dark,
const int aligner)
{
const int row = get_global_id(0);

if((row < 3) || (row > height - 3)) return;

float sum[4] = {0.0f, 0.0f, 0.0f, 0.0f};
float cnt[4] = {0.0f, 0.0f, 0.0f, 0.0f};
double sum[4] = {0.0, 0.0, 0.0, 0.0};
double cnt[4] = {0.0, 0.0, 0.0, 0.0};

for(int col = 3; col < width-3; col++)
{
Expand All @@ -417,14 +425,14 @@ highlights_chroma (read_only image2d_t in, global char *mask, global float *accu
if(mask[px] && (inval > dark[color]) && (inval < clips[color]))
{
const float ref = _calc_refavg(in, xtrans, filters, row, col, width);
sum[color] += inval - ref;
cnt[color] += 1.0f;
sum[color] += (double)(inval - ref);
cnt[color] += 1.0;
}
}
for(int c = 0; c < 3; c++)
{
accu[row*6 + c] = sum[c];
accu[row*6 + 3 + c] = cnt[c];
accu[row*aligner + c] = sum[c];
accu[row*aligner + 3 + c] = cnt[c];
}
}

Expand Down
4 changes: 4 additions & 0 deletions src/common/opencl.c
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,7 @@ static int dt_opencl_device_init(dt_opencl_t *cl, const int dev, cl_device_id *d
cl->dev[dev].asyncmode = 0;
cl->dev[dev].disabled = 0;
cl->dev[dev].forced_headroom = 0;
cl->dev[dev].cacheline = 0;
cl->dev[dev].tuneactive = 0;
cl->dev[dev].runtime_error = 0;
cl_device_id devid = cl->dev[dev].devid = devices[k];
Expand Down Expand Up @@ -596,6 +597,9 @@ static int dt_opencl_device_init(dt_opencl_t *cl, const int dev, cl_device_id *d
dt_print_nts(DT_DEBUG_OPENCL, " MAX IMAGE SIZE: %zd x %zd\n", cl->dev[dev].max_image_width, cl->dev[dev].max_image_height);
(cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(infoint), &infoint, NULL);
dt_print_nts(DT_DEBUG_OPENCL, " MAX WORK GROUP SIZE: %zu\n", infoint);
(cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(infoint), &infoint, NULL);
dt_print_nts(DT_DEBUG_OPENCL, " CACHELINE SIZE: %zu\n", infoint);
cl->dev[dev].cacheline = (int)infoint;
(cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(infoint), &infoint, NULL);
dt_print_nts(DT_DEBUG_OPENCL, " MAX WORK ITEM DIMENSIONS: %zu\n", infoint);

Expand Down
7 changes: 5 additions & 2 deletions src/common/opencl.h
Original file line number Diff line number Diff line change
Expand Up @@ -194,11 +194,14 @@ typedef struct dt_opencl_device_t
int disabled;

// Some devices are known to be unused by other apps so there is no need to test for available memory at all.
int forced_headroom;
unsigned int forced_headroom;

// As the benchmarks are not good enough to calculate tiled-gpu vs untiled-cpu we have a parameter exposed
// in the cldevice conf key to balance this
float advantage;
float advantage;

// size of the device cacheline in bytes
int cacheline;
} dt_opencl_device_t;

struct dt_bilateral_cl_global_t;
Expand Down
54 changes: 30 additions & 24 deletions src/iop/opposed.c
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,8 @@ static void _process_linear_opposed(struct dt_iop_module_t *self, dt_dev_pixelpi
}
}

dt_aligned_pixel_t cr_sum = {0.0f, 0.0f, 0.0f, 0.0f};
dt_aligned_pixel_t cr_cnt = {0.0f, 0.0f, 0.0f, 0.0f};
double cr_sum[4] = {0.0, 0.0, 0.0, 0.0};
int cr_cnt[4] = {0, 0, 0, 0};
#ifdef _OPENMP
#pragma omp parallel for default(none) \
dt_omp_firstprivate(input, roi_in, clips, clipdark, mask) \
Expand All @@ -195,13 +195,13 @@ static void _process_linear_opposed(struct dt_iop_module_t *self, dt_dev_pixelpi
const float inval = fmaxf(0.0f, input[idx+c]);
if((inval > clipdark[c]) && (inval < clips[c]) && (mask[(c+3) * msize + _raw_to_cmap(mwidth, row, col)]))
{
cr_sum[c] += inval - _calc_linear_refavg(&input[idx], c);
cr_cnt[c] += 1.0f;
cr_sum[c] += (double)(inval - _calc_linear_refavg(&input[idx], c));
cr_cnt[c] += 1;
}
}
}
}
for_each_channel(c) chrominance[c] = cr_sum[c] / fmaxf(1.0f, cr_cnt[c]);
for_each_channel(c) chrominance[c] = (float)(cr_sum[c] / (double) MAX(1, cr_cnt[c]));
}

// we only have valid precalculated data when running the full pipe on the complete image (allowing for some rounding)
Expand All @@ -211,7 +211,7 @@ static void _process_linear_opposed(struct dt_iop_module_t *self, dt_dev_pixelpi
{
for_each_channel(c) img_oppchroma[c] = chrominance[c];
img_opphash = opphash;
dt_print(DT_DEBUG_PIPE, "[opposed chroma cache] %f %f %f for opphash%22" PRIu64 "\n", chrominance[0], chrominance[1], chrominance[2], opphash);
dt_print(DT_DEBUG_PIPE, "[opposed chroma cache CPU] %f %f %f for opphash%22" PRIu64 "\n", chrominance[0], chrominance[1], chrominance[2], opphash);
}
}
dt_free_align(mask);
Expand Down Expand Up @@ -316,9 +316,11 @@ static float *_process_opposed(struct dt_iop_module_t *self, dt_dev_pixelpipe_io
}
}

/* After having the surrounding mask for each color channel we can calculate the chrominance corrections. */
dt_aligned_pixel_t cr_sum = {0.0f, 0.0f, 0.0f, 0.0f};
dt_aligned_pixel_t cr_cnt = {0.0f, 0.0f, 0.0f, 0.0f};
/* After having the surrounding mask for each color channel we can calculate the chrominance corrections.
We use doubles here for improved precision and fewer differences from the OpenCL code.
*/
double cr_sum[4] = {0.0, 0.0, 0.0, 0.0};
int cr_cnt[4] = {0, 0, 0, 0};
#ifdef _OPENMP
#pragma omp parallel for default(none) \
dt_omp_firstprivate(input, roi_in, xtrans, clips, clipdark, mask) \
Expand All @@ -336,18 +338,18 @@ static float *_process_opposed(struct dt_iop_module_t *self, dt_dev_pixelpipe_io
/* we only use the unclipped photosites very close to the true clipped data to calculate the chrominance offset */
if((inval < clips[color]) && (inval > clipdark[color]) && (mask[(color+3) * msize + _raw_to_cmap(mwidth, row, col)]))
{
cr_sum[color] += inval - _calc_refavg(&input[idx], xtrans, filters, row, col, roi_in, TRUE);
cr_cnt[color] += 1.0f;
cr_sum[color] += (double)(inval - _calc_refavg(&input[idx], xtrans, filters, row, col, roi_in, TRUE));
cr_cnt[color] += 1;
}
}
}
for_each_channel(c) chrominance[c] = cr_sum[c] / fmaxf(1.0f, cr_cnt[c]);
for_each_channel(c) chrominance[c] = (float)(cr_sum[c] / (double)MAX(1, cr_cnt[c]));
}
if(piece->pipe->type == DT_DEV_PIXELPIPE_FULL)
{
for_each_channel(c) img_oppchroma[c] = chrominance[c];
img_opphash = opphash;
dt_print(DT_DEBUG_PIPE, "[opposed chroma cache] %f %f %f for opphash%22" PRIu64 "\n", chrominance[0], chrominance[1], chrominance[2], opphash);
dt_print(DT_DEBUG_PIPE, "[opposed chroma cache CPU] %f %f %f for opphash%22" PRIu64 "\n", chrominance[0], chrominance[1], chrominance[2], opphash);
}
}
dt_free_align(mask);
Expand Down Expand Up @@ -444,7 +446,7 @@ static cl_int process_opposed_cl(struct dt_iop_module_t *self, dt_dev_pixelpipe_
cl_mem dev_inmask = NULL;
cl_mem dev_outmask = NULL;
cl_mem dev_accu = NULL;
float *claccu = NULL;
double *claccu = NULL;

const size_t iwidth = ROUNDUPDWD(roi_in->width, devid);
const size_t iheight = ROUNDUPDHT(roi_in->height, devid);
Expand Down Expand Up @@ -489,42 +491,46 @@ static cl_int process_opposed_cl(struct dt_iop_module_t *self, dt_dev_pixelpipe_
CLARG(dev_inmask), CLARG(dev_outmask), CLARG(mwidth), CLARG(mheight), CLARG(msize));
if(err != CL_SUCCESS) goto error;

dev_accu = dt_opencl_alloc_device_buffer(devid, sizeof(float) * 6 * iheight);
const int cacheline = dt_round_size(6, MAX(2, darktable.opencl->dev[devid].cacheline / sizeof(double)));
const size_t linesize = sizeof(double) * cacheline * iheight;

dev_accu = dt_opencl_alloc_device_buffer(devid, linesize);
if(dev_accu == NULL) goto error;
claccu = dt_calloc_align_float(6 * iheight);

claccu = (double*) dt_calloc_align(64, linesize);
if(claccu == NULL) goto error;

size_t sizes[] = { iheight, 1, 1};

dt_opencl_set_kernel_args(devid, gd->kernel_highlights_chroma, 0,
CLARG(dev_in), CLARG(dev_outmask), CLARG(dev_accu),
CLARG(roi_in->width), CLARG(roi_in->height), CLARG(mwidth), CLARG(msize),
CLARG(filters), CLARG(dev_xtrans), CLARG(dev_clips), CLARG(dev_dark));
CLARG(filters), CLARG(dev_xtrans), CLARG(dev_clips), CLARG(dev_dark), CLARG(cacheline));

err = dt_opencl_enqueue_kernel_ndim_with_local(devid, gd->kernel_highlights_chroma, sizes, NULL, 1);
if(err != CL_SUCCESS) goto error;

err = dt_opencl_read_buffer_from_device(devid, claccu, dev_accu, 0, 6 * iheight * sizeof(float), TRUE);
err = dt_opencl_read_buffer_from_device(devid, claccu, dev_accu, 0, linesize, TRUE);
if(err != CL_SUCCESS) goto error;

// collect row data and accumulate
dt_aligned_pixel_t sums = { 0.0f, 0.0f, 0.0f};
dt_aligned_pixel_t cnts = { 0.0f, 0.0f, 0.0f};
double sums[4] = { 0.0, 0.0, 0.0, 0.0};
int cnts[4] = { 0, 0, 0, 0};
for(int grp = 3; grp < roi_in->height-3; grp++)
{
for(int c = 0; c < 3; c++)
{
sums[c] += claccu[grp*6 + c];
cnts[c] += claccu[grp*6 + 3 + c];
sums[c] += claccu[grp*cacheline + c];
cnts[c] += (int)claccu[grp*cacheline + 3 + c];
}
}
for_each_channel(c) chrominance[c] = sums[c] / fmaxf(1.0f, cnts[c]);
for_each_channel(c) chrominance[c] = (float)(sums[c] / (double)MAX(1, cnts[c]));

if(piece->pipe->type == DT_DEV_PIXELPIPE_FULL)
{
for_each_channel(c) img_oppchroma[c] = chrominance[c];
img_opphash = opphash;
dt_print(DT_DEBUG_PIPE, "[opposed chroma cache] %f %f %f for opphash%22" PRIu64 "\n", chrominance[0], chrominance[1], chrominance[2], opphash);
dt_print(DT_DEBUG_PIPE, "[opposed chroma cache GPU] %f %f %f for opphash%22" PRIu64 "\n", chrominance[0], chrominance[1], chrominance[2], opphash);
}
}

Expand Down