Skip to content

Commit

Permalink
Merge pull request snabbco#59 from lukego/nix-benchmarks
Browse files Browse the repository at this point in the history
Import revamped nix benchmark framework
  • Loading branch information
lukego committed Jul 9, 2017
2 parents 67864a9 + 66fab90 commit f8bd4d8
Show file tree
Hide file tree
Showing 7 changed files with 219 additions and 9 deletions.
34 changes: 34 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,40 @@ $ make

... but make sure you have at least `make`, `clang`, and `luajit` in your `$PATH`.

### Run the benchmarks

Nix can also run the full benchmark suite and generate visualizations
with R/ggplot2.

The simplest incantation tests one branch:

```shell
$ nix-build testsuite/bench --arg Asrc ./. # note: ./. is the current directory
```

You can also test several branches (A-E), give them names, specify
command-line arguments, say how many tests to run, and allow parallel
execution:

```shell
# Run the benchmarks and create result visualizations in result/
$ nix-build testsuite/bench \
--arg Asrc ~/git/raptorjit \
--argstr Aname master \
--arg Bsrc ~/git/raptorjit-hack \
--argstr Bname hacked \
--arg Csrc ~/git/raptorjit-hack2 \
--argstr Cname hacked-O1 \
--argstr Cargs -O1 \
--arg runs 100 \
-j 5 # Run up to 5 tests in parallel
```

If you are using a distributed nix environment such
as [Hydra](https://nixos.org/hydra/) then the tests can be
automatically parallelized and distributed across a suitable build
farm.

### Quotes

Here are some borrowed words to put this branch into context:
Expand Down
13 changes: 6 additions & 7 deletions testsuite/bench/PARAM_x86_CI.txt
Original file line number Diff line number Diff line change
@@ -1,22 +1,21 @@
array3d 300
array3d 500
binary-trees 16
chameneos 1e7
coroutine-ring 2e7
coroutine-ring 5e7
euler14-bit 2e7
fannkuch 11
fasta 5e6
life
mandelbrot 5000
mandelbrot-bit 5000
md5 20000
nbody 5e6
md5 30000
nbody 8e6
nsieve 12
nsieve-bit 12
nsieve-bit 13
nsieve-bit-fp 12
partialsums 1e7
partialsums 3e7
pidigits-nogmp 5000
ray 9
recursive-ack 10
recursive-fib 40
scimark-fft 50000
scimark-lu 5000
Expand Down
49 changes: 49 additions & 0 deletions testsuite/bench/bench.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# R subroutines for reading and visualizing benchmark results.

suppressPackageStartupMessages({
library(dplyr)
library(ggplot2)
})

## Read benchmark results from a CSV file and derive relative performance.
##
## The CSV is expected to have the columns written by
## testsuite/bench/default.nix: letter,version,benchmark,run,instructions,cycles.
## Returns the input rows, grouped by (benchmark, version), with two extra
## columns: 'baseline' (mean cycle count of the "A" version for that
## benchmark) and 'relative' (baseline / cycles, so 1.0 means "as fast as
## the baseline average" and larger is faster).
bench.read <- function(filename) {
  data <- read.csv(filename)
  ## baseline is the mean performance of the "A" version
  baseline <- data %>%
    filter(letter=="A") %>%
    group_by(benchmark) %>%
    summarize(baseline = mean(cycles))
  ## Add 'relative' performance column: compared to mean from baseline branch.
  ## After the join every row of a benchmark shares one baseline value, so
  ## first(baseline) just picks that shared value within each group.
  relative <- data %>%
    left_join(baseline, by="benchmark") %>%
    group_by(benchmark, version) %>%
    mutate(relative = first(baseline) / cycles)
  return(relative)
}

## Jitter plot of relative performance: one facet per benchmark, one
## jittered point per benchmark run, colored by version.
bench.jitterplot <- function(data) {
  base <- ggplot(data=data, aes(x=version, y=relative, color=version))
  base +
    geom_jitter(alpha=0.5, shape=1) +
    scale_y_continuous(labels=scales::percent, breaks=seq(0, 3, 0.1)) +
    theme(aspect.ratio = 1,
          axis.text.x = element_text(angle=90)) +
    ylab("Performance relative to baseline average") +
    ggtitle("Comparative performance between RaptorJIT versions") +
    facet_wrap(~ benchmark, scales="free_x")
}

## ECDF plot faceted by benchmark: the empirical cumulative distribution
## of relative performance for each version, one facet per benchmark.
## x = relative performance, y = cumulative proportion of runs.
bench.ecdfplot <- function(data) {
  ggplot(aes(x=relative, color=version), data=data) +
    stat_ecdf() +
    scale_x_continuous(labels=scales::percent) +
    scale_y_log10(labels=scales::percent) +
    theme(aspect.ratio = 1) +
    theme(axis.text.x = element_text(angle=90)) +
    ## Fix: the axis labels were swapped -- 'relative' is mapped to x,
    ## and the cumulative percentage is what stat_ecdf puts on y.
    ## NOTE(review): stat_ecdf plots P(X <= x), so "at or above" in the
    ## y label may be inverted ("at or below") -- confirm intended wording.
    xlab("Performance relative to baseline average") +
    ylab("Percentage of results at or above this performance level") +
    ggtitle("Comparative performance between RaptorJIT variants") +
    facet_wrap(~ benchmark)
}

103 changes: 103 additions & 0 deletions testsuite/bench/default.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# Run a large parallel benchmark campaign and generate R/ggplot2 reports.
#
# Up to five source trees (A..E) are benchmarked. A is mandatory and
# serves as the baseline for the relative-performance plots; B..E are
# optional. Each (branch, run) pair is its own derivation, so nix can
# parallelize (-j N) or distribute the runs across a build farm.

{ pkgs ? (import ../../pkgs.nix) {},
  # Per branch: source tree, display name for the plots, and extra
  # raptorjit command-line arguments (e.g. "-O1").
  Asrc, Aname ? "A", Aargs ? "",
  Bsrc ? null, Bname ? "B", Bargs ? "",
  Csrc ? null, Cname ? "C", Cargs ? "",
  Dsrc ? null, Dname ? "D", Dargs ? "",
  Esrc ? null, Ename ? "E", Eargs ? "",
  # Optional system-feature tag to pin all runs onto uniform hardware
  # (fed to requiredSystemFeatures below).
  hardware ? null,
  # Number of repetitions of the full benchmark suite per branch.
  runs ? 30 }:

with pkgs;
with stdenv;

# Derivation to run benchmarks and produce a CSV result.
# benchmark :: letter -> name -> src -> args -> run -> derivation
# Runs every benchmark listed in PARAM_x86_CI.txt once (iteration
# number 'run') under 'perf stat' and emits one CSV row per benchmark.
let benchmark = letter: name: src: args: run:
  # Build the raptorjit under test from the branch's own source tree.
  let raptorjit = (import src {inherit pkgs; version = name;}).raptorjit; in
  mkDerivation {
    name = "benchmark-${name}-${toString run}";
    src = pkgs.lib.cleanSource ./.;
    # Force consistent hardware
    requiredSystemFeatures = if hardware != null then [hardware] else [];
    buildInputs = [ raptorjit linuxPackages.perf utillinux ];
    # Each benchmark is killed after 60s; a failed/timed-out run has its
    # .perf file removed so it is simply absent from the CSV.
    buildPhase = ''
      # Run multiple iterations of the benchmarks
      echo "Run $run"
      mkdir -p result/$run
      # Run each individual benchmark
      cat PARAM_x86_CI.txt |
      (while read benchmark params; do
        echo "running $benchmark"
        # Execute with performance monitoring & time supervision
        # Note: discard stdout due to overwhelming output
        timeout -sKILL 60 \
          perf stat -x, -o result/$run/$benchmark.perf \
          raptorjit ${args} -e "math.randomseed(${toString run})" $benchmark.lua $params \
          > /dev/null || \
          rm result/$run/$benchmark.perf
      done)
    '';
    # Parse the perf CSV output (-x,) into rows of
    # letter,version,benchmark,run,instructions,cycles.
    installPhase = ''
      # Copy the raw perf output for reference
      cp -r result $out
      # Log the exact CPU
      lscpu > $out/cpu.txt
      # Create a CSV file
      # Create the rows based on the perf logs
      for result in result/*.perf; do
        version=${name}
        benchmark=$(basename -s.perf -a $result)
        instructions=$(awk -F, -e '$3 == "instructions" { print $1; }' $result)
        cycles=$( awk -F, -e '$3 == "cycles" { print $1; }' $result)
        echo ${letter},$version,$benchmark,${toString run},$instructions,$cycles >> $out/bench.csv
      done
    '';
  };

  # Run a set of benchmarks and aggregate the results into a CSV file.
  # Each benchmark run is a separate derivation. This allows nix to
  # parallelize and distribute the benchmarking.
  benchmarkSet = letter: name: src: args:
    let benchmarks = map (benchmark letter name src args) (pkgs.lib.range 1 runs);
    in
    runCommand "benchmarks-${name}" { buildInputs = benchmarks; } ''
      source $stdenv/setup
      mkdir -p $out
      for dir in ${pkgs.lib.fold (acc: x: "${acc} ${x}") "" benchmarks}; do
        cat $dir/bench.csv >> $out/bench.csv
      done
    '';

  # Unselected branches collapse to "" so the builder below can test
  # for them with [ -n ... ].
  benchA = (benchmarkSet "A" Aname Asrc Aargs);
  benchB = if Bsrc != null then (benchmarkSet "B" Bname Bsrc Bargs) else "";
  benchC = if Csrc != null then (benchmarkSet "C" Cname Csrc Cargs) else "";
  benchD = if Dsrc != null then (benchmarkSet "D" Dname Dsrc Dargs) else "";
  benchE = if Esrc != null then (benchmarkSet "E" Ename Esrc Eargs) else "";
in

rec {
  # Final report: merged bench.csv plus PNG visualizations rendered by
  # generate.R, registered as Hydra build products.
  benchmarkResults = mkDerivation {
    name = "benchmark-results";
    buildInputs = with pkgs.rPackages; [ pkgs.R ggplot2 dplyr ];
    # NOTE(review): the builder script is named "builder.csv" -- presumably
    # meant "builder.sh"; only the store-path name is affected, but it is
    # confusing. Also, under 'set -e' a `[ -n "" ] && cat ...` line returns
    # nonzero when a branch is unset -- confirm the builder shell tolerates
    # this (an `if ...; then ...; fi` form would be unambiguous).
    builder = pkgs.writeText "builder.csv" ''
      source $stdenv/setup
      # Get the CSV file
      mkdir -p $out/nix-support
      echo "letter,version,benchmark,run,instructions,cycles" > bench.csv
      cat ${benchA}/bench.csv >> bench.csv
      [ -n "${benchB}" ] && cat ${benchB}/bench.csv >> bench.csv
      [ -n "${benchC}" ] && cat ${benchC}/bench.csv >> bench.csv
      [ -n "${benchD}" ] && cat ${benchD}/bench.csv >> bench.csv
      [ -n "${benchE}" ] && cat ${benchE}/bench.csv >> bench.csv
      cp bench.csv $out
      echo "file CSV $out/bench.csv" >> $out/nix-support/hydra-build-products
      # Generate the report
      (cd ${./.}; Rscript ./generate.R $out/bench.csv $out)
      for png in $out/*.png; do
        echo "file PNG $png" >> $out/nix-support/hydra-build-products
      done
    '';
  };
}

25 changes: 25 additions & 0 deletions testsuite/bench/generate.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/usr/bin/env nix-shell
#!nix-shell -i Rscript -p R rpkgs.dplyr rpkgs.ggplot2

# Command-line driver: render benchmark visualizations from a CSV file.
# Usage: generate.R <csv> <outdir>

suppressWarnings(source("bench.R"))

argv <- commandArgs(trailingOnly=TRUE)
if (length(argv) != 2) {
  message("Usage: generate.R <csv> <outdir>"); quit(status=1)
}

csv.file <- argv[[1]]
out.dir <- argv[[2]]

results <- bench.read(csv.file)
if (!dir.exists(out.dir)) { dir.create(out.dir, recursive=TRUE) }

# Write one 12x12 PNG into the output directory.
save.plot <- function(basename, plot) {
  ggsave(filename = file.path(out.dir, basename), plot = plot,
         width=12, height=12)
}

save.plot("bench-jitter.png", bench.jitterplot(results))
save.plot("bench-ecdf.png", bench.ecdfplot(results))
2 changes: 1 addition & 1 deletion testsuite/bench/life.lua
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ function LIFE(w,h)
thisgen:draw()
write("Life - generation ",gen,"\n")
gen=gen+1
if gen>2000 then break end
if gen>10000 then break end
--delay() -- no delay
end
end
Expand Down
2 changes: 1 addition & 1 deletion testsuite/bench/roulette.lua
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
-- (Let the test harness determine the random seed)
-- math.randomseed(os.time())

local population = 100e6
local population = 200e6
local live = 0
local die = 0

Expand Down

0 comments on commit f8bd4d8

Please sign in to comment.