Skip to content
This repository has been archived by the owner on Oct 12, 2022. It is now read-only.


Merge pull request #1529 from MartinNowak/arrayop_bench
Browse files Browse the repository at this point in the history
benchmark for array ops
  • Loading branch information
WalterBright committed Aug 5, 2016
2 parents 2db9857 + dcd31e0 commit 73f00e8
Show file tree
Hide file tree
Showing 4 changed files with 222 additions and 2 deletions.
187 changes: 187 additions & 0 deletions benchmark/arrayops/arrayops.d
@@ -0,0 +1,187 @@
/**
 * Benchmark for array ops.
 *
 * Copyright: Copyright Martin Nowak 2016 -.
 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Authors:   Martin Nowak
 */
import core.cpuid, std.algorithm, std.datetime, std.meta, std.range, std.stdio, std.string;

/**
 * Measures the best-case time of an array operation on short slices.
 *
 * For each slice length 1, 2, 4, 8, 16 and 32 the operation is applied to 64
 * non-overlapping slices of `a`, `b` and `c`, and the whole batch is timed.
 * Each batch is repeated 31 times and the minimum time is kept.
 *
 * Params:
 *   T  = element type of the arrays
 *   op = array statement written in terms of `a`, `b`, `c` and `const`,
 *        e.g. `"a += b * const"`; `const` is replaced by the literal `2`
 *
 * Returns: per slice length, the minimum batch time in nanoseconds divided
 *          by 1024.
 */
float[6] getLatencies(T, string op)()
{
    enum N = (64 * (1 << 6) + 64) * T.sizeof;
    auto a = Array!T(N), b = Array!T(N), c = Array!T(N);
    float[6] latencies = float.max;
    foreach (i, ref latency; latencies)
    {
        auto len = 1 << i;
        foreach (_; 1 .. 32)
        {
            // Reset the operands so every repetition works on the same values.
            a[] = 24;
            b[] = 4;
            c[] = 2;
            auto sw = StopWatch(AutoStart.yes);
            foreach (off; size_t(0) .. size_t(64))
            {
                // Start of the off-th slice; consecutive slices are separated
                // by one unused element.
                off = off * len + off;
                // Rewrite the operands into slice expressions at compile time.
                enum slicedOp = op.replace("const", "2").replace("a",
                        "a[off .. off + len]").replace("b",
                        "b[off .. off + len]").replace("c", "c[off .. off + len]");
                mixin(slicedOp ~ ";");
            }
            latency = min(latency, sw.peek.nsecs);
        }
    }
    float[6] res = latencies[] / 1024;
    return res;
}

/**
 * Measures the throughput of an array operation for four working-set sizes.
 *
 * For each size (8 KB, 32 KB, 512 KB and 32 MB worth of `T` elements) the
 * operation is applied to 64 slices, each 1/64 of the working set, and the
 * whole batch is timed. Each batch is repeated 3 times and the minimum time
 * is kept.
 *
 * Params:
 *   T  = element type of the arrays
 *   op = array statement written in terms of `a`, `b`, `c` and `const`,
 *        e.g. `"a += b * const"`; `const` is replaced by the literal `2`
 *
 * Returns: per working-set size, `T.sizeof * elementCount / minimumNanoseconds`.
 */
float[4] getThroughput(T, string op)()
{
    enum N = (40 * 1024 * 1024 + 64 * T.sizeof) / T.sizeof;
    auto a = Array!T(N), b = Array!T(N), c = Array!T(N);
    float[4] latencies = float.max;
    // Working-set sizes expressed as element counts.
    size_t[4] lengths = [
        8 * 1024 / T.sizeof, 32 * 1024 / T.sizeof, 512 * 1024 / T.sizeof,
        32 * 1024 * 1024 / T.sizeof
    ];
    foreach (i, ref latency; latencies)
    {
        auto len = lengths[i] / 64;
        foreach (_; 1 .. 4)
        {
            // Reset the operands so every repetition works on the same values.
            a[] = 24;
            b[] = 4;
            c[] = 2;
            auto sw = StopWatch(AutoStart.yes);
            foreach (off; size_t(0) .. size_t(64))
            {
                // Start of the off-th slice; consecutive slices are separated
                // by one unused element.
                off = off * len + off;
                // Rewrite the operands into slice expressions at compile time.
                enum slicedOp = op.replace("const", "2").replace("a",
                        "a[off .. off + len]").replace("b",
                        "b[off .. off + len]").replace("c", "c[off .. off + len]");
                mixin(slicedOp ~ ";");
            }
            immutable nsecs = sw.peek.nsecs;
            // The bookkeeping arithmetic runs with FPU exceptions masked.
            runMasked({latency = min(latency, nsecs);});
        }
    }
    float[4] throughputs = void;
    runMasked({throughputs = T.sizeof * lengths[] / latencies[];});
    return throughputs;
}

/**
 * Generates the list of array statements to benchmark.
 *
 * For every assignment operator `+=`, `-=`, `*=`, `/=` this yields the two
 * simple forms (`a op= b`, `a op= const`) followed by the eight compound forms
 * `a op= b op2 c` and `a op= b op2 const` for `op2` in `+`, `-`, `*`, `/`.
 *
 * Returns: 40 statement strings, grouped by assignment operator.
 */
string[] genOps()
{
    string[] ops;
    foreach (op1; ["+", "-", "*", "/"])
    {
        ops ~= "a " ~ op1 ~ "= b";
        ops ~= "a " ~ op1 ~ "= const";
        foreach (op2; ["+", "-", "*", "/"])
        {
            ops ~= "a " ~ op1 ~ "= b " ~ op2 ~ " c";
            ops ~= "a " ~ op1 ~ "= b " ~ op2 ~ " const";
        }
    }
    return ops;
}

/**
 * Benchmarks one array statement for every element type and prints one
 * comma-separated row per type: type name, statement, the six latency values
 * (two decimals) and the four throughput values.
 *
 * Params:
 *   op = array statement as produced by `genOps`
 */
void runOp(string op)()
{
    foreach (T; AliasSeq!(ubyte, ushort, uint, ulong, byte, short, int, long, float,
            double))
    {
        writefln("%s, %s, %(%.2f, %), %(%s, %)", T.stringof, op,
                getLatencies!(T, op), getThroughput!(T, op));
    }
}

/**
 * Minimal malloc-backed array of `n` elements that behaves like a `T[]`
 * through `alias this`.
 *
 * The elements are not initialized by the constructor. The buffer is released
 * by the destructor; copying is disabled so that two instances can never free
 * the same buffer.
 */
struct Array(T)
{
    import core.stdc.stdlib : free, malloc;

    /// Allocates room for `n` elements of `T`.
    this(size_t n)
    {
        auto p = cast(T*) malloc(T.sizeof * n);
        // malloc may legitimately return null for a zero-sized request.
        if (p is null && n != 0)
        {
            import core.exception : onOutOfMemoryError;

            onOutOfMemoryError();
        }
        ary = p[0 .. n];
    }

    @disable this(this);

    ~this()
    {
        free(ary.ptr);
        ary = null;
    }

    T[] ary;
    alias ary this;
}

// Enable the SSE code path on 32- and 64-bit x86; any other target is
// rejected at compile time.
version (X86)
    version = SSE;
else version (X86_64)
    version = SSE;
else
    static assert(0, "unimplemented");

version (SSE)
{
    /// Returns the current value of the SSE control/status register (MXCSR).
    uint mxcsr()
    {
        uint ret = void;
        asm
        {
            stmxcsr ret;
        }
        return ret;
    }

    /// Loads `val` into MXCSR.
    void mxcsr(uint val)
    {
        asm
        {
            ldmxcsr val;
        }
    }

    /// Bits 7 .. 12 of MXCSR.
    enum FPU_EXCEPTION_MASKS = 1 << 12 | 1 << 11 | 1 << 10 | 1 << 9 | 1 << 8 | 1 << 7;
    /// Bits 0 .. 5 of MXCSR.
    enum FPU_EXCEPTION_FLAGS = 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0;

    /// Sets every bit of `FPU_EXCEPTION_MASKS` in MXCSR.
    void maskFPUExceptions()
    {
        mxcsr = mxcsr | FPU_EXCEPTION_MASKS;
    }

    /// Clears every bit of `FPU_EXCEPTION_MASKS` in MXCSR.
    void unmaskFPUExceptions()
    {
        mxcsr = mxcsr & ~FPU_EXCEPTION_MASKS;
    }

    /// Returns the `FPU_EXCEPTION_FLAGS` bits currently set in MXCSR.
    uint FPUExceptionFlags()
    {
        return mxcsr & FPU_EXCEPTION_FLAGS;
    }

    /// Clears every bit of `FPU_EXCEPTION_FLAGS` in MXCSR.
    void clearFPUExceptionFlags()
    {
        mxcsr = mxcsr & ~FPU_EXCEPTION_FLAGS;
    }
}

/**
 * Runs `dg` with the SSE floating-point exceptions masked.
 *
 * The exception flags must be clear on entry. On exit, whether `dg` returns
 * or throws, the MXCSR value from before the call is restored with the
 * exception flags cleared, so flags raised inside `dg` do not leak out.
 *
 * Params:
 *   dg = code to execute while exceptions are masked
 */
void runMasked(scope void delegate() dg)
{
    assert(FPUExceptionFlags == 0);
    immutable saved = mxcsr;
    maskFPUExceptions();
    scope (exit)
        mxcsr = saved & ~FPU_EXCEPTION_FLAGS;
    dg();
}

/**
 * Prints the CSV header (type, op, latency1 .. latency32, throughput8KB ..
 * throughput32MB) and then one block of rows per statement from `genOps`.
 */
void main()
{
    // NOTE(review): the FPU exception masks are left as the process starts
    // with them; call unmaskFPUExceptions() here if the benchmarked
    // operations themselves are meant to trap - confirm intent.

    writefln("type, op, %(latency%s, %), %-(throughput%s, %)", iota(6)
        .map!(i => 1 << i), ["8KB", "32KB", "512KB", "32MB"]);
    // genOps is evaluated at compile time so that each statement can be
    // passed to runOp as a template argument.
    foreach (op; mixin("AliasSeq!(%(%s, %))".format(genOps)))
        runOp!op;
}
Empty file.
33 changes: 33 additions & 0 deletions benchmark/arrayops/plot.R
@@ -0,0 +1,33 @@
# Use `R --vanilla < plot.R` to run this script.
# It will read all *.csv files from the current folder and create a comparison plot for them.

library(dplyr)
library(tidyr)
library(ggplot2)

# Read every CSV file in the current folder into one table. Each row is tagged
# with the name of the file it came from; the plots facet on that column.
dat <- NULL
# list.files() takes a regular expression, not a shell glob.
files <- list.files(pattern='\\.csv$')
for (file in files)
{
    datFile <- read.csv(file) %>% tbl_df() %>%
        mutate(file=file)
    if (is.null(dat))
        dat = datFile
    else
        dat = bind_rows(dat, datFile)
}

# Reshape to long form: drop the throughput columns and fold every latency*
# column into a (num_elems, latency) pair; likewise for throughput below.
latencies <- gather(dat %>% select(-starts_with('throughput')), num_elems, latency, starts_with('latency'))
throughputs <- gather(dat %>% select(-starts_with('latency')), array_size, throughput, starts_with('throughput'))

# Strip the "latency" / "throughput" prefix from the level names so that only
# the element count / array size remains as the label.
levels(latencies$num_elems) <- sub("latency(\\d+)", "\\1", levels(latencies$num_elems))
levels(throughputs$array_size) <- sub("throughput(.+)", "\\1", levels(throughputs$array_size))

# Latency plot: one line per element type, one facet row per op and one facet
# column per input file; the image width grows with the number of files.
img <- qplot(num_elems, latency, group=type, data=latencies, geom="line", color=type) +
facet_grid(op ~ file, scales="free_y") +
labs(x="num elements", y="latency / ns")
ggsave('array_ops_latency.svg', plot = img, width = 2 + 3 * length(files), height = 40)

# Throughput plot with the same layout as the latency plot.
img <- qplot(array_size, throughput, group=type, data=throughputs, geom="line", color=type) +
facet_grid(op ~ file, scales="free_y") +
labs(x="array size", y="throughput / (ops / ns)")
ggsave('array_ops_throughput.svg', plot = img, width = 2 + 3 * length(files), height = 40)
4 changes: 2 additions & 2 deletions benchmark/runbench.d
Expand Up @@ -53,9 +53,9 @@ void runTests(Config cfg)
string[string] extra_sources;
auto re = regex(cfg.pattern, "g");
auto self = buildPath(".", "runbench.d");
foreach(DirEntry src; dirEntries(".", SpanMode.depth))
foreach(DirEntry src; dirEntries(".", "*.d", SpanMode.depth))
if (!src.isFile || !endsWith(, ".d") || == self)
if (!src.isFile || == self ||".ignore").exists)

string mainsrc = extraSourceOf(;
Expand Down

0 comments on commit 73f00e8

Please sign in to comment.