From 689e73f30c508fbceb5d2b3f8216ea61b21f5b49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Britto?= Date: Wed, 1 Jul 2020 15:25:03 -0300 Subject: [PATCH] Add protobuf standard benchmarks --- bench/.gitignore | 3 ++ bench/README.md | 48 ++++++++++++++++++++++++++++ bench/script/standard_bench.exs | 56 +++++++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+) create mode 100644 bench/script/standard_bench.exs diff --git a/bench/.gitignore b/bench/.gitignore index 2a9a0581..8b1e2452 100644 --- a/bench/.gitignore +++ b/bench/.gitignore @@ -2,3 +2,6 @@ /deps erl_crash.dump benchmarks +/data/datasets.tar.gz +/data/dataset.google_message3*.pb +/data/dataset.google_message4.pb diff --git a/bench/README.md b/bench/README.md index e8419909..cbd7ad41 100644 --- a/bench/README.md +++ b/bench/README.md @@ -144,6 +144,54 @@ Generated benchmarks/output/encode.html Opened report using open ``` +## Protobuf standard benchmarks + +Protobuf includes benchmarks for its official language implementations, such as Python, C++ +and Golang. They measure average encode and decode throughput for each built-in dataset. This +is useful to check how Elixir matches up with them. You can read more about these benchmarks +[here](https://github.com/protocolbuffers/protobuf/blob/master/benchmarks/README.md). + +To run the standard benchmarks for Elixir, download the datasets then run `standard_bench.exs`. 
+ +```console +$ mix run script/standard_bench.exs +Message benchmarks.proto2.GoogleMessage1 of dataset file data/dataset.google_message1_proto2.pb +Average throughput for parse_from_benchmark: 18.48 MB/s +Average throughput for serialize_to_benchmark: 6.19 MB/s + +Message benchmarks.proto3.GoogleMessage1 of dataset file data/dataset.google_message1_proto3.pb +Average throughput for parse_from_benchmark: 18.4 MB/s +Average throughput for serialize_to_benchmark: 11.1 MB/s + +Message benchmarks.proto2.GoogleMessage2 of dataset file data/dataset.google_message2.pb +Average throughput for parse_from_benchmark: 47.82 MB/s +Average throughput for serialize_to_benchmark: 5656.75 MB/s + +Message benchmarks.google_message3.GoogleMessage3 of dataset file data/dataset.google_message3_1.pb +Average throughput for parse_from_benchmark: 19.94 MB/s +Average throughput for serialize_to_benchmark: 45.5 MB/s + +Message benchmarks.google_message3.GoogleMessage3 of dataset file data/dataset.google_message3_2.pb +Average throughput for parse_from_benchmark: 110.65 MB/s +Average throughput for serialize_to_benchmark: 164.96 MB/s + +Message benchmarks.google_message3.GoogleMessage3 of dataset file data/dataset.google_message3_3.pb +Average throughput for parse_from_benchmark: 9.8 MB/s +Average throughput for serialize_to_benchmark: 6.84 MB/s + +Message benchmarks.google_message3.GoogleMessage3 of dataset file data/dataset.google_message3_4.pb +Average throughput for parse_from_benchmark: 5254.14 MB/s +Average throughput for serialize_to_benchmark: 737.71 MB/s + +Message benchmarks.google_message3.GoogleMessage3 of dataset file data/dataset.google_message3_5.pb +Average throughput for parse_from_benchmark: 3.77 MB/s +Average throughput for serialize_to_benchmark: 3.29 MB/s + +Message benchmarks.google_message4.GoogleMessage4 of dataset file data/dataset.google_message4.pb +Average throughput for parse_from_benchmark: 20.06 MB/s +Average throughput for serialize_to_benchmark: 32.46 MB/s 
+```
+
 ## Contributing
 
 If you have trouble using the downloaded datasets, they might have been upgraded and their
diff --git a/bench/script/standard_bench.exs b/bench/script/standard_bench.exs
new file mode 100644
index 00000000..8162fba2
--- /dev/null
+++ b/bench/script/standard_bench.exs
@@ -0,0 +1,65 @@
+# Standard benchmark. Its output is compatible with the built-in benchmarks from
+# protobuf for official language implementations, including encoding and decoding
+# throughput on each dataset.
+#
+# Based on Python's implementation:
+# https://github.com/protocolbuffers/protobuf/blob/master/benchmarks/python/py_benchmark.py
+
+# Total wall-clock time, in microseconds, to apply `fun` to every input once.
+single = fn fun, inputs ->
+  Enum.reduce(inputs, 0, fn input, total ->
+    {time, _result} = :timer.tc(fun, [input])
+    total + time
+  end)
+end
+
+# Total time, in microseconds, for `reps` full passes over `inputs`.
+repeat = fn fun, inputs, reps ->
+  Enum.reduce(1..reps, 0, fn _, total ->
+    total + single.(fun, inputs)
+  end)
+end
+
+# Average time for one pass over `inputs`. When a single pass finishes under
+# the 3-second target, the pass is repeated enough times to fill the target
+# and the mean is reported, which smooths out timer jitter on fast runs.
+run = fn fun, inputs ->
+  target_run_time = 3_000_000
+  single_run_time = single.(fun, inputs)
+
+  # Require single_run_time > 0 so a pass faster than the microsecond clock
+  # resolution cannot cause a division by zero below. ceil/1 already returns
+  # an integer, so no trunc/1 is needed.
+  with true <- single_run_time > 0 and single_run_time < target_run_time,
+       reps when reps > 1 <- ceil(target_run_time / single_run_time) do
+    repeat.(fun, inputs, reps) / reps
+  else
+    _ -> single_run_time
+  end
+end
+
+# Convert bytes processed over elapsed microseconds into MB/s, two decimals.
+throughput = fn bytes, microseconds ->
+  megabytes = bytes / 1_048_576
+  seconds = microseconds / 1_000_000
+  Float.round(megabytes / seconds, 2)
+end
+
+# Decode and re-encode every bundled dataset, reporting MB/s for each.
+for file <- Path.wildcard("data/*.pb") do
+  %{payload: payloads, message_name: mod_name} = ProtoBench.load(file)
+  module = ProtoBench.mod_name(mod_name)
+
+  IO.puts("Message #{mod_name} of dataset file #{file}")
+
+  bytes = Enum.reduce(payloads, 0, &(byte_size(&1) + &2))
+  messages = Enum.map(payloads, &module.decode/1)
+
+  parse = throughput.(bytes, run.(&module.decode/1, payloads))
+
+  IO.puts("Average throughput for parse_from_benchmark: #{parse} MB/s")
+
+  serialize = throughput.(bytes, run.(&module.encode/1, messages))
+
+  IO.puts("Average throughput for serialize_to_benchmark: #{serialize} MB/s")
+  IO.puts("")
+end