// reduction.cpp — SYCL parallel reduction examples (reduction classes 0-4).
#include <SYCL/sycl.hpp>

#include <exception>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <vector>
// Primary template for the reduction kernels. Each explicit specialization
// below implements a different work-group reduction strategy, selected at
// compile time by the `reduction_class` integer tag (0-4).
template <int reduction_class>
class reduce_t;
// Reduction class 0: naive tree-based reduction using "even" threads.
// Every thread loads one element into local memory; at step i only threads
// whose local id is a multiple of 2*i accumulate, selected with a modulo.
// Runs in log2(local_size) steps over n threads.
// NOTE(review): assumes the work-group size is a power of two and that the
// element type is zero-initializable with 0 — confirm at call sites.
template <>
class reduce_t<0> {
 public:
  // Reduces `in_size` elements of `in_acc` to one partial sum per work-group,
  // written to out_acc[group_id]. `local_acc` is local-memory scratch with at
  // least one slot per work-item.
  template <typename index_t, typename read_accessor_t,
            typename local_accessor_t, typename write_accessor_t>
  static void inline reduce(const index_t &in_size,
                            const read_accessor_t &in_acc,
                            local_accessor_t &local_acc,
                            write_accessor_t &out_acc,
                            cl::sycl::nd_item<1> &item_id) {
    // in_size is equivalent of total number of thread
    index_t global_id = item_id.get_global(0);
    index_t local_id = item_id.get_local(0);
    // Out-of-range threads contribute the additive identity.
    local_acc[local_id] = (global_id < in_size) ? in_acc[global_id] : 0;
    for (index_t i = 1; i < item_id.get_local_range(0); i *= 2) {
      // wait for all thread to put the data in the local memory
      item_id.barrier(cl::sycl::access::fence_space::local_space);
      // Only threads at even multiples of the stride accumulate; the modulo
      // leaves most threads idle (the inefficiency class 1 addresses).
      if (local_id % (2 * i) == 0)
        local_acc[local_id] += local_acc[local_id + i];
    }
    // Thread 0 publishes the work-group's partial result.
    if (item_id.get_local(0) == 0) {
      out_acc[item_id.get_group(0)] = local_acc[0];
    }
  }
};
// Reduction class 1: tree reduction using consecutive threads. At step i the
// first local_range/(2*i) threads do the additions (id = local_id * 2 * i)
// instead of the strided "even" threads of class 0, keeping the active
// threads packed together.
template <>
class reduce_t<1> {
 public:
  // Reduces `in_size` elements of `in_acc` to one partial sum per work-group,
  // written to out_acc[group_id]. `local_acc` is local-memory scratch.
  template <typename index_t, typename read_accessor_t,
            typename local_accessor_t, typename write_accessor_t>
  static void inline reduce(const index_t &in_size,
                            const read_accessor_t &in_acc,
                            local_accessor_t &local_acc,
                            write_accessor_t &out_acc,
                            cl::sycl::nd_item<1> &item_id) {
    // in_size is equivalent of total number of thread
    index_t global_id = item_id.get_global(0);
    index_t local_id = item_id.get_local(0);
    // Out-of-range threads contribute the additive identity.
    local_acc[local_id] = (global_id < in_size) ? in_acc[global_id] : 0;
    for (index_t i = 1; i < item_id.get_local_range(0); i *= 2) {
      // wait for all thread to put the data in the local memory
      item_id.barrier(cl::sycl::access::fence_space::local_space);
      // Map consecutive thread ids onto the strided slots that class 0
      // handled with scattered "even" threads.
      auto id = local_id * 2 * i;
      if (id < item_id.get_local_range(0)) local_acc[id] += local_acc[id + i];
    }
    // Thread 0 publishes the work-group's partial result.
    if (item_id.get_local(0) == 0) {
      out_acc[item_id.get_group(0)] = local_acc[0];
    }
  }
};
// Reduction class 2: tree reduction with sequential addressing. The stride
// starts at half the work-group size and halves each step; the active threads
// (local_id < i) read contiguous local-memory locations, avoiding both the
// idle-thread modulo of class 0 and the strided indexing of class 1.
template <>
class reduce_t<2> {
 public:
  // Reduces `in_size` elements of `in_acc` to one partial sum per work-group,
  // written to out_acc[group_id]. `local_acc` is local-memory scratch.
  template <typename index_t, typename read_accessor_t,
            typename local_accessor_t, typename write_accessor_t>
  static void inline reduce(const index_t &in_size,
                            const read_accessor_t &in_acc,
                            local_accessor_t &local_acc,
                            write_accessor_t &out_acc,
                            cl::sycl::nd_item<1> &item_id) {
    // in_size is equivalent of total number of thread
    index_t global_id = item_id.get_global(0);
    index_t local_id = item_id.get_local(0);
    // Out-of-range threads contribute the additive identity.
    local_acc[local_id] = (global_id < in_size) ? in_acc[global_id] : 0;
    for (index_t i = item_id.get_local_range(0) / 2; i > 0; i >>= 1) {
      // wait for all thread to put the data in the local memory
      item_id.barrier(cl::sycl::access::fence_space::local_space);
      // The first i threads each fold in the element one stride away.
      if (local_id < i) local_acc[local_id] += local_acc[local_id + i];
    }
    // Thread 0 publishes the work-group's partial result.
    if (item_id.get_local(0) == 0) {
      out_acc[item_id.get_group(0)] = local_acc[0];
    }
  }
};
// Reduction class 3: class-2 tree reduction preceded by a per-thread serial
// accumulation. Each thread first sums several input elements (striding by
// the total global range) into a private register, so fewer threads and
// fewer barrier rounds are needed for the same input size.
template <>
class reduce_t<3> {
 public:
  // Reduces `in_size` elements of `in_acc` to one partial sum per work-group,
  // written to out_acc[group_id]. `local_acc` is local-memory scratch.
  template <typename index_t, typename read_accessor_t,
            typename local_accessor_t, typename write_accessor_t>
  static void inline reduce(const index_t &in_size,
                            const read_accessor_t &in_acc,
                            local_accessor_t &local_acc,
                            write_accessor_t &out_acc,
                            cl::sycl::nd_item<1> &item_id) {
    using output_type = typename write_accessor_t::value_type;
    // in_size is equivalent of total number of thread
    index_t global_id = item_id.get_global(0);
    index_t local_id = item_id.get_local(0);
    output_type private_sum = output_type(0);
    // Per-thread serial reduction. Use index_t (not int) for the counter so
    // it matches the index type everywhere else; the loop guard already
    // ensures i < in_size, so no per-element bounds check is needed.
    for (index_t i = global_id; i < in_size;
         i += item_id.get_global_range(0)) {
      private_sum += in_acc[i];
    }
    local_acc[local_id] = private_sum;
    // Sequential-addressing tree combine (same as class 2).
    for (index_t i = item_id.get_local_range(0) / 2; i > 0; i >>= 1) {
      // wait for all thread to put the data in the local memory
      item_id.barrier(cl::sycl::access::fence_space::local_space);
      if (local_id < i) local_acc[local_id] += local_acc[local_id + i];
    }
    // Thread 0 publishes the work-group's partial result.
    if (item_id.get_local(0) == 0) {
      out_acc[item_id.get_group(0)] = local_acc[0];
    }
  }
};
// Reduction class 4: identical strategy to class 3, but the work-group size
// is a compile-time template parameter (`local_size`) so the compiler can
// fully unroll the tree-combine loop.
template <>
class reduce_t<4> {
 public:
  // Reduces `in_size` elements of `in_acc` to one partial sum per work-group,
  // written to out_acc[group_id]. `local_acc` is local-memory scratch of at
  // least `local_size` elements.
  template <int local_size, typename index_t, typename read_accessor_t,
            typename local_accessor_t, typename write_accessor_t>
  static void inline reduce(const index_t &in_size,
                            const read_accessor_t &in_acc,
                            local_accessor_t &local_acc,
                            write_accessor_t &out_acc,
                            cl::sycl::nd_item<1> &item_id) {
    using output_type = typename write_accessor_t::value_type;
    // in_size is equivalent of total number of thread
    index_t global_id = item_id.get_global(0);
    index_t local_id = item_id.get_local(0);
    output_type private_sum = output_type(0);
    // Per-thread serial reduction. Use index_t (not int) for the counter so
    // it matches the index type everywhere else; the loop guard already
    // ensures i < in_size, so no per-element bounds check is needed.
    for (index_t i = global_id; i < in_size;
         i += item_id.get_global_range(0)) {
      private_sum += in_acc[i];
    }
    local_acc[local_id] = private_sum;
    // Tree combine over a compile-time trip count — unrollable.
    for (index_t i = local_size / 2; i > 0; i >>= 1) {
      // wait for all thread to put the data in the local memory
      item_id.barrier(cl::sycl::access::fence_space::local_space);
      if (local_id < i) local_acc[local_id] += local_acc[local_id + i];
    }
    // Thread 0 publishes the work-group's partial result.
    if (item_id.get_local(0) == 0) {
      out_acc[item_id.get_group(0)] = local_acc[0];
    }
  }
};
// Dispatch shim selecting the right call shape for the chosen kernel.
// This primary template forwards the compile-time `local_size` to the kernel
// (required by class 4 so its reduction loop can be unrolled).
template <int reduction_class, int local_size>
struct reduction_factory {
  template <typename index_t, typename read_accessor_t,
            typename local_accessor_t, typename write_accessor_t>
  static void inline reduce(const index_t &in_size,
                            const read_accessor_t &in_acc,
                            local_accessor_t &local_acc,
                            write_accessor_t &out_acc,
                            cl::sycl::nd_item<1> &item_id) {
    reduce_t<reduction_class>::template reduce<local_size>(
        in_size, in_acc, local_acc, out_acc, item_id);
  }
};
// Specialization for the -1 sentinel: reduction classes 0-3 query the
// work-group size at run time, so no compile-time local_size is forwarded.
template <int reduction_class>
struct reduction_factory<reduction_class, -1> {
  template <typename index_t, typename read_accessor_t,
            typename local_accessor_t, typename write_accessor_t>
  static void inline reduce(const index_t &in_size,
                            const read_accessor_t &in_acc,
                            local_accessor_t &local_acc,
                            write_accessor_t &out_acc,
                            cl::sycl::nd_item<1> &item_id) {
    reduce_t<reduction_class>::reduce(in_size, in_acc, local_acc, out_acc,
                                      item_id);
  }
};
// Function object passed to parallel_for: captures the accessors and the
// input size, and forwards each nd_item to the selected reduction kernel.
template <int reduction_class, int local_size, typename index_t,
          typename read_accessor_t, typename local_accessor_t,
          typename write_accessor_t>
class reduction_t {
 private:
  const read_accessor_t in_acc;  // device input data
  local_accessor_t local_acc;    // per-work-group scratch (local memory)
  write_accessor_t out_acc;      // one partial result slot per work-group
  const index_t in_size;         // number of valid input elements

 public:
  reduction_t(const read_accessor_t in_acc_, local_accessor_t local_acc_,
              write_accessor_t out_acc_, const index_t in_size_)
      : in_acc(in_acc_),
        local_acc(local_acc_),
        out_acc(out_acc_),
        in_size(in_size_) {}
  // kernel code
  void inline operator()(cl::sycl::nd_item<1> item_id) {
    // Classes 0-3 use run-time work-group sizes (the -1 sentinel picks the
    // factory specialization); class 4 gets the compile-time local_size.
    reduction_factory<reduction_class,
                      ((reduction_class > 3) ? local_size
                                             : -1)>::reduce(in_size, in_acc,
                                                            local_acc, out_acc,
                                                            item_id);
  }
};
//#define static_reduction_class 4;
// Chooses where a reduction pass writes its result: when more than one
// work-group remains, a fresh temporary buffer with one slot per group holds
// the partial sums; otherwise the caller's output buffer is used directly.
template <typename index_t, typename data_t>
cl::sycl::buffer<data_t> inline get_out_buffer(
    const index_t num_group, cl::sycl::buffer<data_t> out_buffer) {
  if (num_group > 1) {
    return cl::sycl::buffer<data_t>(cl::sycl::range<1>{size_t(num_group)});
  }
  return out_buffer;
}
// Rounds x up to the nearest multiple of y, so that a global range becomes an
// exact multiple of the work-group size.
template <typename index_t>
inline index_t round_up(const index_t x, const index_t y) {
  const index_t num_chunks = (x + y - 1) / y;
  return num_chunks * y;
}
// Fully reduces in_buff into out_buffer[0].
// One device pass reduces `in_size` elements to `num_group` partial sums.
// While the partial-sum count still exceeds work_group_load, another pass is
// launched recursively on the temporary buffer; once it is small enough, the
// remaining partial sums are accumulated on the host.
template <int work_group_load, int k_factor, int reduction_class,
          typename index_t, typename data_t>
void reduction(index_t in_size, cl::sycl::queue &q,
               cl::sycl::buffer<data_t> in_buff,
               cl::sycl::buffer<data_t> out_buffer) {
  using read_accessor_t =
      cl::sycl::accessor<data_t, 1, cl::sycl::access::mode::read,
                         cl::sycl::access::target::global_buffer>;
  using write_accessor_t =
      cl::sycl::accessor<data_t, 1, cl::sycl::access::mode::write,
                         cl::sycl::access::target::global_buffer>;
  using local_accessor_t =
      cl::sycl::accessor<data_t, 1, cl::sycl::access::mode::read_write,
                         cl::sycl::access::target::local>;
  // Classes 3/4 pre-reduce k_factor elements per thread, so fewer threads.
  const constexpr index_t local_size = work_group_load / k_factor;
  const index_t global_size = round_up(in_size / k_factor, local_size);
  index_t num_group = global_size / local_size;
  // Another device pass is needed while the partial results still exceed one
  // work-group's load.
  const bool needs_another_pass = (num_group > work_group_load);
  auto temp_buff = get_out_buffer(num_group, out_buffer);
  // submitting the SYCL kernel to the cvengine SYCL queue.
  q.submit([&](cl::sycl::handler &cgh) {
    // read access over the input buffer inside the device kernel
    auto in_acc =
        in_buff.template get_access<cl::sycl::access::mode::read>(cgh);
    // write access over the output buffer inside the device kernel
    auto out_acc =
        temp_buff.template get_access<cl::sycl::access::mode::write>(cgh);
    auto local_acc = local_accessor_t(local_size, cgh);
    // constructing the kernel
    cgh.parallel_for(
        cl::sycl::nd_range<1>{cl::sycl::range<1>{size_t(global_size)},
                              cl::sycl::range<1>{size_t(local_size)}},
        reduction_t<reduction_class, local_size, index_t, read_accessor_t,
                    local_accessor_t, write_accessor_t>(in_acc, local_acc,
                                                        out_acc, in_size));
  });
  if (needs_another_pass) {
    // launching a new kernel and passing temp_buff as an input
    reduction<work_group_load, k_factor, reduction_class>(
        num_group, q, temp_buff, out_buffer);
  } else if (num_group > 1) {
    // The temp_buff size is smaller than the work_group_load.
    // read_write (not write): the loop below both reads and updates
    // host_out_acc[0]; reading through a write-only accessor is undefined.
    auto host_out_acc =
        out_buffer.template get_access<cl::sycl::access::mode::read_write>();
    auto host_in_acc =
        temp_buff.template get_access<cl::sycl::access::mode::read>();
    // reduce the remaining partial sums on the host
    for (index_t i = 0; i < num_group; i++) {
      host_out_acc[0] += host_in_acc[i];
    }
  }
}
// Entry point: reads the input size from argv[1], runs the reduction on the
// default SYCL device, and verifies the result against a host-computed sum.
// Returns 0 on success, 1 on bad arguments or a wrong result.
int main(int argc, char *argv[]) {
  using data_t = double;
  using index_t = int;
  static constexpr index_t work_group_load = 256;
  static constexpr index_t reduction_class = 4;
  // classes 3 and 4 pre-reduce 2 elements per thread.
  const constexpr index_t k_factor = (reduction_class > 2) ? 2 : 1;
  // Guard argv[1]: the original dereferenced it without checking argc.
  if (argc < 2) {
    std::cerr << "Usage: " << argv[0] << " <input-size>" << '\n';
    return 1;
  }
  index_t in_size;
  std::istringstream ss(argv[1]);
  if (!(ss >> in_size)) {
    std::cerr << "Invalid input size " << argv[1]
              << ". Please insert the correct input size " << '\n';
    // Bail out instead of continuing with an uninitialized in_size.
    return 1;
  }
  // Input vector: in_size ones, so the expected sum equals in_size.
  std::vector<data_t> input(in_size, data_t(1));
  // Output holds the single final result (value-initialized to 0).
  std::vector<data_t> output(1);
  { // beginning of SYCL objects' scope
    // constructing a SYCL queue for CVengine OpenCL device where automatically
    // build the underlying context and command_queue for the chosen device.
    auto q = cl::sycl::queue(
        (cl::sycl::default_selector()), [&](cl::sycl::exception_list l) {
          bool error = false;
          for (auto e : l) {
            try {
              std::rethrow_exception(e);
            } catch (const cl::sycl::exception &e) {
              auto clError = e.get_cl_code();
              std::cout << e.what() << "CLERRORCODE : " << clError << std::endl;
              error = true;
            }
          }
          if (error) {
            throw std::runtime_error("SYCL errors detected");
          }
        });
    // input SYCL buffer A
    auto in_buff =
        cl::sycl::buffer<data_t>(input.data(), cl::sycl::range<1>(in_size));
    // output SYCL buffer C
    auto out_buff = cl::sycl::buffer<data_t>(output.data(), 1);
    // call reduction function
    reduction<work_group_load, k_factor, reduction_class>(in_size, q, in_buff,
                                                          out_buff);
  } // end of SYCL objects' scope
  // Use data_t for the reference sum: the original `auto reference = 0`
  // accumulated doubles into an int (overflow risk and type mismatch).
  data_t reference = 0;
  for (index_t i = 0; i < in_size; i++) {
    reference += input[i];
  }
  if (output[0] != reference) {
    std::cout << "The result is wrong. expected : " << reference
              << " vs calculated: " << output[0] << "\n";
    return 1;
  } else {
    std::cout << "The result is correct."
              << "\n";
  }
  return 0;
}