/
memtest_parallel.cpp
260 lines (232 loc) · 9.92 KB
/
memtest_parallel.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
/*
* Copyright (c) 2014-2020 Embedded Systems and Applications, TU Darmstadt.
*
* This file is part of TaPaSCo
* (see https://github.com/esa-tu-darmstadt/tapasco).
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <array>
#include <climits>
#include <iomanip>
#include <iostream>
#include <tapasco.hpp>
#include <vector>
using namespace tapasco;
// Duration of one random access benchmark run, in milliseconds.
constexpr int random_time_ms{1000};
// Number of iterations executed by the batch access benchmark.
constexpr int batch_iterations{1000};
// Number of iterations executed by the latency benchmark.
constexpr int latency_iterations{100000};
// When true, the memory is initialized before the benchmarks are run. This is
// required for ECC memory. It should later be determined automatically, once
// the status core provides this information.
constexpr bool ecc_memory{true};
// ID handed to the TaPaSCo API to count and launch the benchmark PEs.
constexpr int PE_ID{321};
// Width of one column of the printed result tables.
constexpr int col_width{20};
// Fill character used to pad the table columns.
constexpr char space{' '};
// Exponent of the smallest batch transfer; the transfer size is
// 2^batch_min_length bytes.
constexpr int batch_min_length{10};
// Exponent of the biggest batch transfer; the transfer size is
// 2^batch_max_length bytes.
constexpr int batch_max_length{25};
// Number of different request sizes used by the random benchmark.
constexpr int random_byte_length_c{10};
// Request sizes of the random benchmark, in bytes.
constexpr int random_byte_length[random_byte_length_c]{
    8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
// Seeds for the read and the write address generation of the random benchmark.
constexpr int random_read_seed{1234};
constexpr int random_write_seed{5678};
// Calculates a transfer speed in GiB/s.
//
// amount: transferred data in bytes
// cycles: duration of the transfer in clock cycles
// clock:  clock frequency in MHz
//
// Returns the speed in GiB/s. A cycle count of zero carries no timing
// information; 0.0 is returned in that case instead of dividing by zero, which
// would produce inf (or NaN for amount == 0).
double calcSpeed(unsigned long amount, unsigned long cycles, float clock) {
  if (cycles == 0)
    return 0.0;
  // elapsed time in seconds: cycles / (MHz * 10^6)
  const double seconds = cycles / (clock * 1000000.0);
  constexpr double bytes_per_gib = 1024.0 * 1024.0 * 1024.0;
  return (amount / seconds) / bytes_per_gib;
}
// Runs one random access benchmark on all PE instances at the same time and
// prints one table cell per instance, followed by one cell for the sum over
// all instances. No line break is printed.
//
// tapasco:     TaPaSCo handle used to launch the jobs
// op:          0 = Random Read, 1 = Random Write, 2 = Random Read/Write
//              (the PE is launched with operation code 3 + op)
// cycles:      runtime of the benchmark in cycles
// byte_length: request size in bytes (maximum 4096)
// designclk:   design clock in MHz
// instances:   number of PE instances to run in parallel
void executeRandomBenchmark(tapasco::Tapasco &tapasco, int op,
                            unsigned long cycles, int byte_length,
                            float designclk, int instances) {
  char text[20];
  const std::size_t count =
      instances > 0 ? static_cast<std::size_t>(instances) : 0;

  // Storage for the return value of each job. A std::vector is used because a
  // variable-length array is not standard C++. The vector is never resized, so
  // the pointers stored in the RetVal wrappers stay valid.
  std::vector<unsigned long> rets(count, 0);
  std::vector<tapasco::job_future> jobs;
  std::vector<tapasco::RetVal<unsigned long>> retvals;
  jobs.reserve(count);
  // reserving keeps references to already inserted elements stable
  retvals.reserve(count);

  // start one job for each instance of the PE found
  for (int instance = 0; instance < instances; instance++) {
    retvals.emplace_back(&rets[instance]);
    // first argument is the operation: 3 means Random Read, 4 Random Write,
    // 5 Random Read/Write
    // second argument is the runtime in cycles
    // third argument is the request size in bytes (maximum 4096)
    // fourth and fifth arguments are the seeds for the read and write address
    // generation respectively
    jobs.push_back(tapasco.launch(PE_ID, retvals[instance], 3 + op, cycles,
                                  byte_length, random_read_seed,
                                  random_write_seed));
  }

  // benchmark runtime in seconds: cycles / (MHz * 10^6)
  const double seconds = cycles / (designclk * 1000000.0);
  // prints one table cell with the rate for the given number of requests
  const auto print_miops = [&](unsigned long requests) {
    const double iops = requests / seconds;
    snprintf(text, sizeof(text), "%.2fMIOPS", iops / 1000000);
    std::cout << std::left << std::setw(col_width) << std::setfill(space)
              << text;
  };

  unsigned long acc = 0;
  for (int instance = 0; instance < instances; instance++) {
    // wait for the job; its return value is the number of completed requests
    jobs[instance]();
    const unsigned long completed = *retvals[instance].value;
    acc += completed;
    print_miops(completed);
  }
  // accumulated performance of all instances
  print_miops(acc);
}
// Prints the result table of the random access read/write benchmark: one row
// per request size in random_byte_length, one column per PE instance and a
// final column for the total.
//
// tapasco:   TaPaSCo handle used to launch the jobs
// designclk: design clock in MHz
// cycles:    runtime of each benchmark run in cycles
// instances: number of PE instances to run in parallel
void benchmarkRandom(tapasco::Tapasco &tapasco, float designclk,
                     unsigned long cycles, int instances) {
  if (ecc_memory) {
    std::cout << "Initializing memory (required for ECC memory)" << std::endl;
    // To initialize the memory, write with the biggest request size for twice
    // the benchmark time: once with the read address seed and once with the
    // write address seed. This ensures that all addresses which are accessed
    // later have already been initialized.
    const auto write_pass = [&](const int seed) {
      auto job = tapasco.launch(PE_ID, 4, 2 * cycles, 4096, seed, seed);
      job();
    };
    write_pass(random_read_seed);
    write_pass(random_write_seed);
  }

  // prints one left-aligned, padded table cell
  const auto print_cell = [](const char *cell) {
    std::cout << std::left << std::setw(col_width) << std::setfill(space)
              << cell;
  };
  char label[20];

  std::cout << std::endl
            << std::endl
            << "Random Access Read/Write (" << cycles << " Cycles)" << std::endl
            << std::endl;

  // table header
  print_cell("Size");
  for (int idx = 0; idx < instances; ++idx) {
    snprintf(label, 20, "Instance %i", idx);
    print_cell(label);
  }
  print_cell("Total");
  std::cout << std::endl;

  // one table row per request size
  for (const int request_size : random_byte_length) {
    snprintf(label, 20, "%iB", request_size);
    print_cell(label);
    // operation 2 selects Random Read/Write
    executeRandomBenchmark(tapasco, 2, cycles, request_size, designclk,
                           instances);
    std::cout << std::endl;
  }
}
// Runs the batch benchmark for one transfer size on all PE instances at the
// same time, repeated batch_iterations times, and prints one table cell per
// instance followed by one cell for the total. No line break is printed.
//
// tapasco:   TaPaSCo handle used to launch the jobs
// designclk: design clock in MHz
// op:        0 for Batch Read, 1 for Batch Write, 2 for Batch Read/Write
// size:      exponent of the transfer size; 2^size bytes are transferred.
//            Must be smaller than the bit width of size_t.
// instances: number of PE instances to run in parallel; nothing is run or
//            printed if this is not positive
void executeBatchBenchmark(tapasco::Tapasco &tapasco, float designclk, int op,
                           size_t size, int instances) {
  // Without instances there is nothing to measure; returning here also avoids
  // the integer division by zero when averaging the cycles below.
  if (instances <= 0)
    return;
  const size_t count = static_cast<size_t>(instances);

  // calculate transfer size (2^size); a size_t is shifted so that exponents of
  // 31 and above do not overflow an int
  size_t len = size_t{1} << size;

  // one "global" accumulator and one accumulator for each PE, used to
  // accumulate the return values of the jobs over all iterations
  // (std::vector, as variable-length arrays are not standard C++)
  unsigned long acc = 0;
  std::vector<unsigned long> accs(count, 0);

  // execute given number of iterations
  for (int i = 0; i < batch_iterations; i++) {
    // Storage for the return values, the jobs and the return value wrappers.
    // rets is never resized, so the pointers held by the wrappers stay valid.
    std::vector<unsigned long> rets(count, 0);
    std::vector<tapasco::job_future> jobs;
    std::vector<tapasco::RetVal<unsigned long>> retvals;
    jobs.reserve(count);
    retvals.reserve(count);

    // start one job for each instance of the PE found
    for (int instance = 0; instance < instances; instance++) {
      retvals.emplace_back(&rets[instance]);
      // first argument is the operation: 0 for Batch Read, 1 for Batch Write,
      // 2 for Batch Read/Write
      // second argument is unused
      // third argument is the transfer size
      jobs.push_back(tapasco.launch(PE_ID, retvals[instance], op, 0, len));
    }

    for (int instance = 0; instance < instances; instance++) {
      // wait for the job and add its return value (transfer time in cycles)
      // to the accumulators
      jobs[instance]();
      const unsigned long transfer_cycles = *retvals[instance].value;
      accs[instance] += transfer_cycles;
      acc += transfer_cycles;
    }
  }

  // use the accumulators to compute individual and total performance values
  unsigned long total_data = len * batch_iterations;
  // read/write moves the data in both directions
  if (op == 2)
    total_data *= 2;

  char text[20];
  for (int instance = 0; instance < instances; instance++) {
    snprintf(text, sizeof(text), "%#.3fGiB/s",
             calcSpeed(total_data, accs[instance], designclk));
    std::cout << std::left << std::setw(col_width) << std::setfill(space)
              << text;
  }
  // total: data of all instances over the average number of cycles
  snprintf(text, sizeof(text), "%#.3fGiB/s",
           calcSpeed(total_data * instances, acc / instances, designclk));
  std::cout << std::left << std::setw(col_width) << std::setfill(space) << text;
}
// Prints the result table of the batch access read/write benchmark: one row
// per transfer size from 2^batch_min_length to 2^batch_max_length bytes, one
// column per PE instance and a final column for the total.
//
// tapasco:   TaPaSCo handle used to launch the jobs
// designclk: design clock in MHz
// instances: number of PE instances to run in parallel
void benchmarkBatch(tapasco::Tapasco &tapasco, float designclk, int instances) {
  char text[20];
  std::cout << std::endl << std::endl;
  std::cout << "Batch Access Read/Write (" << batch_iterations << " Iterations)"
            << std::endl
            << std::endl;

  // print table header
  std::cout << std::left << std::setw(col_width) << std::setfill(space)
            << "Size";
  for (int instance = 0; instance < instances; instance++) {
    snprintf(text, sizeof(text), "Instance %i", instance);
    std::cout << std::left << std::setw(col_width) << std::setfill(space)
              << text;
  }
  std::cout << std::left << std::setw(col_width) << std::setfill(space)
            << "Total";
  std::cout << std::endl;

  // execute batch benchmark for different transfer sizes
  for (size_t s = batch_min_length; s <= batch_max_length; s++) {
    // The transfer size is a byte count, so the row is labelled in KiB
    // (kibibytes). The size is computed as size_t so that the shift cannot
    // overflow an int for large exponents.
    snprintf(text, sizeof(text), "%zuKiB", (size_t{1} << s) / 1024);
    std::cout << std::left << std::setw(col_width) << std::setfill(space)
              << text;
    // execute Batch Read/Write
    executeBatchBenchmark(tapasco, designclk, 2, s, instances);
    std::cout << std::endl;
  }
}
// Entry point: runs the random access benchmark and then the batch benchmark
// on all instances of the benchmark PE. Returns 1 if fewer than two instances
// are available, 0 otherwise.
int main(int argc, char **argv) {
  // command line arguments are not used
  (void)argc;
  (void)argv;

  // Initialize TaPaSCo
  tapasco::Tapasco tapasco;

  // Check PE count
  const uint64_t instances = tapasco.kernel_pe_count(PE_ID);
  // design clock in MHz, queried once and reused below
  const auto design_clk = tapasco.design_frequency();
  std::cout << "Got " << instances << " instances @ " << design_clk << "MHz"
            << std::endl;
  if (instances < 2) {
    std::cout << "Need at least two instance to run." << std::endl;
    // return instead of exit(): exit() would skip the destructor of the
    // tapasco object
    return 1;
  }

  // runtime for random access benchmark in cycles: ms * MHz * 1000
  unsigned long cycles = random_time_ms * design_clk * 1000;
  // the benchmark functions take the instance count as int
  const int instance_count = static_cast<int>(instances);
  benchmarkRandom(tapasco, design_clk, cycles, instance_count);
  benchmarkBatch(tapasco, design_clk, instance_count);
  return 0;
}