Skip to content
This repository
Newer
Older
100644 394 lines (346 sloc) 11.174 kb
27494a20 »
2012-06-02 Pull from FB rev 63ce89e
1 /*
2 * Copyright 2012 Facebook, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 // @author Andrei Alexandrescu (andrei.alexandrescu@fb.com)
18
19 #include "Benchmark.h"
20 #include "Foreach.h"
21 #include "String.h"
22 #include <algorithm>
23 #include <cmath>
24 #include <iostream>
25 #include <limits>
26 #include <utility>
27 #include <vector>
28
29 using namespace std;
30
31 DEFINE_bool(benchmark, false, "Run benchmarks.");
32
33 namespace folly {
34
35 BenchmarkSuspender::NanosecondsSpent BenchmarkSuspender::nsSpent;
36
37 typedef function<uint64_t(unsigned int)> BenchmarkFun;
38 static vector<tuple<const char*, const char*, BenchmarkFun>> benchmarks;
39
40 // Add the global baseline
41 BENCHMARK(globalBenchmarkBaseline) {
42 asm volatile("");
43 }
44
45 void detail::addBenchmarkImpl(const char* file, const char* name,
46 BenchmarkFun fun) {
47 benchmarks.emplace_back(file, name, std::move(fun));
48 }
49
50 /**
51 * Given a point, gives density at that point as a number 0.0 < x <=
52 * 1.0. The result is 1.0 if all samples are equal to where, and
53 * decreases near 0 if all points are far away from it. The density is
54 * computed with the help of a radial basis function.
55 */
56 static double density(const double * begin, const double *const end,
57 const double where, const double bandwidth) {
58 assert(begin < end);
59 assert(bandwidth > 0.0);
60 double sum = 0.0;
61 FOR_EACH_RANGE (i, begin, end) {
62 auto d = (*i - where) / bandwidth;
63 sum += exp(- d * d);
64 }
65 return sum / (end - begin);
66 }
67
68 /**
69 * Computes mean and variance for a bunch of data points. Note that
70 * mean is currently not being used.
71 */
72 static pair<double, double>
73 meanVariance(const double * begin, const double *const end) {
74 assert(begin < end);
75 double sum = 0.0, sum2 = 0.0;
76 FOR_EACH_RANGE (i, begin, end) {
77 sum += *i;
78 sum2 += *i * *i;
79 }
80 auto const n = end - begin;
81 return make_pair(sum / n, sqrt((sum2 - sum * sum / n) / n));
82 }
83
84 /**
85 * Computes the mode of a sample set through brute force. Assumes
86 * input is sorted.
87 */
88 static double mode(const double * begin, const double *const end) {
89 assert(begin < end);
90 // Lower bound and upper bound for result and their respective
91 // densities.
92 auto
93 result = 0.0,
94 bestDensity = 0.0;
95
96 // Get the variance so we pass it down to density()
97 auto const sigma = meanVariance(begin, end).second;
98 if (!sigma) {
99 // No variance means constant signal
100 return *begin;
101 }
102
103 FOR_EACH_RANGE (i, begin, end) {
104 assert(i == begin || *i >= i[-1]);
105 auto candidate = density(begin, end, *i, sigma * sqrt(2.0));
106 if (candidate > bestDensity) {
107 // Found a new best
108 bestDensity = candidate;
109 result = *i;
110 } else {
111 // Density is decreasing... we could break here if we definitely
112 // knew this is unimodal.
113 }
114 }
115
116 return result;
117 }
118
119 /**
120 * Given a bunch of benchmark samples, estimate the actual run time.
121 */
122 static double estimateTime(double * begin, double * end) {
123 assert(begin < end);
124
125 // Current state of the art: get the minimum. After some
126 // experimentation, it seems taking the minimum is the best.
127
128 return *min_element(begin, end);
129
130 // What follows after estimates the time as the mode of the
131 // distribution.
132
133 // Select the awesomest (i.e. most frequent) result. We do this by
134 // sorting and then computing the longest run length.
135 sort(begin, end);
136
137 // Eliminate outliers. A time much larger than the minimum time is
138 // considered an outlier.
139 while (end[-1] > 2.0 * *begin) {
140 --end;
141 if (begin == end) {
142 LOG(INFO) << *begin;
143 }
144 assert(begin < end);
145 }
146
147 double result = 0;
148
149 /* Code used just for comparison purposes */ {
150 unsigned bestFrequency = 0;
151 unsigned candidateFrequency = 1;
152 double candidateValue = *begin;
153 for (auto current = begin + 1; ; ++current) {
154 if (current == end || *current != candidateValue) {
155 // Done with the current run, see if it was best
156 if (candidateFrequency > bestFrequency) {
157 bestFrequency = candidateFrequency;
158 result = candidateValue;
159 }
160 if (current == end) {
161 break;
162 }
163 // Start a new run
164 candidateValue = *current;
165 candidateFrequency = 1;
166 } else {
167 // Cool, inside a run, increase the frequency
168 ++candidateFrequency;
169 }
170 }
171 }
172
173 result = mode(begin, end);
174
175 return result;
176 }
177
178 static double runBenchmarkGetNSPerIteration(const BenchmarkFun& fun,
179 const double globalBaseline) {
180 // They key here is accuracy; too low numbers means the accuracy was
181 // coarse. We up the ante until we get to at least minNanoseconds
182 // timings.
183 static uint64_t resolutionInNs = 0, coarseResolutionInNs = 0;
184 if (!resolutionInNs) {
185 timespec ts;
186 CHECK_EQ(0, clock_getres(detail::DEFAULT_CLOCK_ID, &ts));
187 CHECK_EQ(0, ts.tv_sec) << "Clock sucks.";
188 CHECK_LT(0, ts.tv_nsec) << "Clock too fast for its own good.";
189 CHECK_EQ(1, ts.tv_nsec) << "Clock too coarse, upgrade your kernel.";
190 resolutionInNs = ts.tv_nsec;
191 }
192 // Whe choose a minimum minimum (sic) of 10,000 nanoseconds, but if
193 // the clock resolution is worse than that, it will be larger. In
194 // essence we're aiming at making the quantization noise 0.01%.
195 static const auto minNanoseconds = min(resolutionInNs * 100000, 1000000000UL);
196
197 // We do measurements in several epochs and take the minimum, to
198 // account for jitter.
199 static const unsigned int epochs = 1000;
200 // We establish a total time budget as we don't want a measurement
201 // to take too long. This will curtail the number of actual epochs.
202 static const uint64_t timeBudgetInNs = 1000000000;
203 timespec global;
204 CHECK_EQ(0, clock_gettime(CLOCK_REALTIME, &global));
205
206 double epochResults[epochs] = { 0 };
207 size_t actualEpochs = 0;
208
209 for (; actualEpochs < epochs; ++actualEpochs) {
210 for (unsigned int n = 1; n < (1U << 30); n *= 2) {
211 auto const nsecs = fun(n);
212 if (nsecs < minNanoseconds) {
213 continue;
214 }
215 // We got an accurate enough timing, done. But only save if
216 // smaller than the current result.
217 epochResults[actualEpochs] = max(0.0, double(nsecs) / n - globalBaseline);
218 // Done with the current epoch, we got a meaningful timing.
219 break;
220 }
221 timespec now;
222 CHECK_EQ(0, clock_gettime(CLOCK_REALTIME, &now));
223 if (detail::timespecDiff(now, global) >= timeBudgetInNs) {
224 // No more time budget available.
225 ++actualEpochs;
226 break;
227 }
228 }
229
230 // If the benchmark was basically drowned in baseline noise, it's
231 // possible it became negative.
232 return max(0.0, estimateTime(epochResults, epochResults + actualEpochs));
233 }
234
235 static string humanReadable(double n, unsigned int decimals) {
236 auto a = fabs(n);
237 char suffix = ' ';
238
239 if (a >= 1E21) {
240 // Too big to be comprehended by the puny human brain
241 suffix = '!';
242 n /= 1E21;
243 } else if (a >= 1E18) {
244 // "EXA" written with suffix 'X' so as to not create confusion
245 // with scientific notation.
246 suffix = 'X';
247 n /= 1E18;
248 } else if (a >= 1E15) {
249 // "PETA"
250 suffix = 'P';
251 n /= 1E15;
252 } else if (a >= 1E12) {
253 // "TERA"
254 suffix = 'T';
255 n /= 1E12;
256 } else if (a >= 1E9) {
257 // "GIGA"
258 suffix = 'G';
259 n /= 1E9;
260 } else if (a >= 1E6) {
261 // "MEGA"
262 suffix = 'M';
263 n /= 1E6;
264 } else if (a >= 1E3) {
265 // "KILO"
266 suffix = 'K';
267 n /= 1E3;
268 } else if (a == 0.0) {
269 suffix = ' ';
270 } else if (a < 1E-15) {
271 // too small
272 suffix = '?';
273 n *= 1E18;
274 } else if (a < 1E-12) {
275 // "femto"
276 suffix = 'f';
277 n *= 1E15;
278 } else if (a < 1E-9) {
279 // "pico"
280 suffix = 'p';
281 n *= 1E12;
282 } else if (a < 1E-6) {
283 // "nano"
284 suffix = 'n';
285 n *= 1E9;
286 } else if (a < 1E-3) {
287 // "micro"
288 suffix = 'u';
289 n *= 1E6;
290 } else if (a < 1) {
291 // "mili"
292 suffix = 'm';
293 n *= 1E3;
294 }
295
296 return stringPrintf("%*.*f%c", decimals + 3 + 1, decimals, n, suffix);
297 }
298
299 static void printBenchmarkResults(
300 const vector<tuple<const char*, const char*, double> >& data) {
301 // Width available
302 static const uint columns = 76;
303
304 // Compute the longest benchmark name
305 size_t longestName = 0;
306 FOR_EACH_RANGE (i, 1, benchmarks.size()) {
307 longestName = max(longestName, strlen(get<1>(benchmarks[i])));
308 }
309
310 // Print a horizontal rule
311 auto separator = [&](char pad) {
312 puts(string(columns, pad).c_str());
313 };
314
315 // Print header for a file
316 auto header = [&](const char* file) {
317 separator('=');
318 printf("%-*srelative ns/iter iters/s\n",
319 columns - 26, file);
320 separator('=');
321 };
322
323 double baselineNsPerIter = numeric_limits<double>::max();
324 const char* lastFile = "";
325
326 for (auto& datum : data) {
327 auto file = get<0>(datum);
328 if (strcmp(file, lastFile)) {
329 // New file starting
330 header(file);
331 lastFile = file;
332 }
333
334 string s = get<1>(datum);
335 if (s == "-") {
336 separator('-');
337 continue;
338 }
339 bool useBaseline /* = void */;
340 if (s[0] == '%') {
341 s.erase(0, 1);
342 useBaseline = true;
343 } else {
344 baselineNsPerIter = get<2>(datum);
345 useBaseline = false;
346 }
347 s.resize(columns - 27, ' ');
348 auto nsPerIter = get<2>(datum);
349 auto itersPerSec = 1E9 / nsPerIter;
350 if (!useBaseline) {
351 // Print without baseline
352 printf("%*s %s %s\n",
353 static_cast<int>(s.size()), s.c_str(),
354 humanReadable(nsPerIter, 2).c_str(),
355 humanReadable(itersPerSec, 2).c_str());
356 } else {
357 // Print with baseline
358 auto rel = baselineNsPerIter / nsPerIter * 100.0;
359 printf("%*s %7.2f%% %s %s\n",
360 static_cast<int>(s.size()), s.c_str(),
361 rel,
362 humanReadable(nsPerIter, 2).c_str(),
363 humanReadable(itersPerSec, 2).c_str());
364 }
365 }
366 separator('=');
367 }
368
369 void runBenchmarks() {
370 CHECK(!benchmarks.empty());
371
372 vector<tuple<const char*, const char*, double>> results;
373 results.reserve(benchmarks.size() - 1);
374
375 // PLEASE KEEP QUIET. MEASUREMENTS IN PROGRESS.
376
377 auto const globalBaseline = runBenchmarkGetNSPerIteration(
378 get<2>(benchmarks.front()), 0);
379 FOR_EACH_RANGE (i, 1, benchmarks.size()) {
380 auto elapsed = strcmp(get<1>(benchmarks[i]), "-") == 0
381 ? 0.0 // skip the separators
382 : runBenchmarkGetNSPerIteration(get<2>(benchmarks[i]),
383 globalBaseline);
384 results.emplace_back(get<0>(benchmarks[i]),
385 get<1>(benchmarks[i]), elapsed);
386 }
387
388 // PLEASE MAKE NOISE. MEASUREMENTS DONE.
389
390 printBenchmarkResults(results);
391 }
392
393 } // namespace folly
Something went wrong with that request. Please try again.