Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add benchmarks comparing performance of Table operations 'vectorized' in Java vs performed in Enso #7270

Merged
merged 35 commits into from
Jul 21, 2023
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
1be9e55
add basic benchmarks, modify suite to allow to request GC
radeusgd Jul 12, 2023
b6cf40e
Split into files
radeusgd Jul 12, 2023
6e5d718
forgot an `if`
radeusgd Jul 12, 2023
aa829f6
update iters
radeusgd Jul 13, 2023
b5781eb
Add callback benchmark, clean imports
radeusgd Jul 13, 2023
23f5396
fixes
radeusgd Jul 13, 2023
81a33a1
Add also a comparison on primitives
radeusgd Jul 13, 2023
25d465a
Check a bit better Column.from_vector implementation
radeusgd Jul 13, 2023
d93ff75
Test avoiding date conversion when building Column.from_vector
radeusgd Jul 13, 2023
6f46c42
Implement Column_Map_2: Date.year
radeusgd Jul 14, 2023
fc305f7
Merge branch 'develop' into wip/radeusgd/benchmark-vectorized-column-…
radeusgd Jul 14, 2023
1b2fda6
new aggregate tests
radeusgd Jul 14, 2023
4641920
Merge branch 'develop' into wip/radeusgd/benchmark-vectorized-column-…
radeusgd Jul 17, 2023
81e0db4
Move benchmarks to a separate project
radeusgd Jul 17, 2023
171cdb2
Merge branch 'develop' into wip/radeusgd/benchmark-vectorized-column-…
radeusgd Jul 17, 2023
b6fe3b7
Adding polyglot helpers for Exploratory_Benchmarks, to avoid adding c…
radeusgd Jul 17, 2023
1561444
Move away from modifying stdlib into special helpers for benchmarks
radeusgd Jul 17, 2023
2cb4188
fix imports
radeusgd Jul 17, 2023
a9e0825
GC configurable
radeusgd Jul 17, 2023
ba375fc
Add analysis workflow
radeusgd Jul 17, 2023
b642b1c
Add README
radeusgd Jul 17, 2023
d8ad9ca
Add README 2
radeusgd Jul 17, 2023
0e5d8a7
Make Java implementations stable, add `current_implementation`
radeusgd Jul 17, 2023
d650516
Adding verification if all benchmarks actually compute the same thing
radeusgd Jul 17, 2023
d3224f7
update settings
radeusgd Jul 17, 2023
863e486
javafmt
radeusgd Jul 17, 2023
f5d8ae1
Merge branch 'develop' into wip/radeusgd/benchmark-vectorized-column-…
radeusgd Jul 18, 2023
a09f73f
Update benchmark workflow to work with new API changes
radeusgd Jul 18, 2023
be66a76
scalafmtSbt
radeusgd Jul 18, 2023
863903a
formatting
radeusgd Jul 18, 2023
e4853f9
Merge branch 'develop' into wip/radeusgd/benchmark-vectorized-column-…
radeusgd Jul 20, 2023
cd0a984
revert fromItems rename
radeusgd Jul 20, 2023
d10009a
rename
radeusgd Jul 20, 2023
06d17ba
Merge branch 'develop' into wip/radeusgd/benchmark-vectorized-column-…
mergify[bot] Jul 21, 2023
f1bf33e
Update after merge
radeusgd Jul 21, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,8 @@ lazy val enso = (project in file("."))
`std-table`,
`std-aws`,
`simple-httpbin`,
`enso-test-java-helpers`
`enso-test-java-helpers`,
`exploratory-benchmark-java-helpers`
)
.settings(Global / concurrentRestrictions += Tags.exclusive(Exclusive))
.settings(
Expand Down Expand Up @@ -1337,6 +1338,7 @@ lazy val runtime = (project in file("engine/runtime"))
(Runtime / compile) := (Runtime / compile)
.dependsOn(`std-base` / Compile / packageBin)
.dependsOn(`enso-test-java-helpers` / Compile / packageBin)
.dependsOn(`exploratory-benchmark-java-helpers` / Compile / packageBin)
.dependsOn(`std-image` / Compile / packageBin)
.dependsOn(`std-database` / Compile / packageBin)
.dependsOn(`std-google-api` / Compile / packageBin)
Expand Down Expand Up @@ -1985,6 +1987,20 @@ lazy val `enso-test-java-helpers` = project
.dependsOn(`std-base` % "provided")
.dependsOn(`std-table` % "provided")

// Java helper classes used only by the `test/Exploratory_Benchmarks` Enso
// project. The packaged JAR is written directly into that project's
// `polyglot/java` directory; the standard libraries and the Graal SDK are
// only needed at compile time, hence the "provided" scope.
lazy val `exploratory-benchmark-java-helpers` = project
  .in(
    file(
      "test/Exploratory_Benchmarks/polyglot-sources/exploratory-benchmark-java-helpers"
    )
  )
  .settings(
    frgaalJavaCompilerSetting,
    autoScalaLibrary := false,
    libraryDependencies += "org.graalvm.sdk" % "graal-sdk" % graalVersion % "provided",
    Compile / packageBin / artifactPath := file(
      "test/Exploratory_Benchmarks/polyglot/java/exploratory-benchmark-java-helpers.jar"
    )
  )
  .dependsOn(`std-base` % "provided")
  .dependsOn(`std-table` % "provided")

lazy val `std-table` = project
.in(file("std-bits") / "table")
.enablePlugins(Antlr4Plugin)
Expand Down Expand Up @@ -2292,11 +2308,13 @@ pkgStdLibInternal := Def.inputTask {
(`std-table` / Compile / packageBin).value
case "TestHelpers" =>
(`enso-test-java-helpers` / Compile / packageBin).value
(`exploratory-benchmark-java-helpers` / Compile / packageBin).value
case "AWS" =>
(`std-aws` / Compile / packageBin).value
case _ if buildAllCmd =>
(`std-base` / Compile / packageBin).value
(`enso-test-java-helpers` / Compile / packageBin).value
(`exploratory-benchmark-java-helpers` / Compile / packageBin).value
(`std-table` / Compile / packageBin).value
(`std-database` / Compile / packageBin).value
(`std-image` / Compile / packageBin).value
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,9 @@ type Column
Column.from_vector "My Column" [1, 2, 3, 4, 5]
from_vector : Text -> Vector -> Column
from_vector name items =
expected_storage_type = Nothing
Illegal_Argument.handle_java_exception <|
Column.Value (Java_Column.fromItems name items)
Column.Value (Java_Column.fromItemsConvertPolyglot name items expected_storage_type)
radeusgd marked this conversation as resolved.
Show resolved Hide resolved

## PRIVATE
Creates a new column given a name and an internal Java storage.
Expand Down
15 changes: 13 additions & 2 deletions distribution/lib/Standard/Test/0.0.0-dev/src/Bench.enso
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,15 @@ type Bench
- label: A name for the measurement.
- iter_size: The number of runs per iteration.
- num_iters: The number of iterations per measurement.
- run_gc_between_iterations: Whether to try running the garbage collector
between iterations. Defaults to False. This is helpful when testing
memory intensive operations, to ensure that GC runs between iterations
and not _during_ iterations. The time taken to run the requested
garbage collection will not be counted into the iteration time, however
there is no guarantee that the JVM will actually accept the GC hint and
it is still possible the JVM may run GC during an iteration. But
setting this option to True should make it less likely for GC to
interrupt measurements.

> Example
Measure a computation called "foo" with an iteration size of 2 and a number
Expand All @@ -19,8 +28,8 @@ type Bench

example_measure =
Bench.measure Examples.get_boolean "foo" iter_size=2 num_iters=1
measure : Any -> Text -> Integer -> Integer -> Nothing
measure ~act label iter_size num_iters =
measure : Any -> Text -> Integer -> Integer -> Boolean -> Nothing
measure ~act label iter_size num_iters run_gc_between_iterations=False =
dry_run = Environment.get "ENSO_BENCHMARK_TEST_DRY_RUN" "False" == "True"
result = Ref.new 0.0
single_call = _ ->
Expand All @@ -29,6 +38,8 @@ type Bench
x2 = System.nano_time
x2 - x1
iteration = it_size -> it_num ->
if run_gc_between_iterations then
Runtime.gc
act_it_num = num_iters - it_num
res = times it_size single_call
avg = avg_list res
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,14 @@

import org.enso.base.Text_Utils;
import org.enso.base.polyglot.Polyglot_Utils;
import org.enso.table.data.column.builder.Builder;
import org.enso.table.data.column.builder.InferredBuilder;
import org.enso.table.data.column.builder.LongBuilder;
import org.enso.table.data.column.builder.NumericBuilder;
import org.enso.table.data.column.storage.BoolStorage;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.numeric.LongStorage;
import org.enso.table.data.column.storage.type.StorageType;
import org.enso.table.data.index.DefaultIndex;
import org.enso.table.data.index.Index;
import org.enso.table.data.mask.OrderMask;
Expand Down Expand Up @@ -116,18 +121,13 @@ public Column rename(String name) {
return new Column(name, storage);
}

/**
* Creates a new column with given name and elements.
*
* @param name the name to use
* @param items the items contained in the column
* @return a column with given name and items
*/
public static Column fromItems(String name, List<Value> items) {
/** Creates a column from an Enso array, ensuring Enso dates are converted to Java dates. */
public static Column fromItemsConvertPolyglot(String name, List<Value> items, StorageType expectedType) throws ClassCastException {
Context context = Context.getCurrent();
InferredBuilder builder = new InferredBuilder(items.size());
int n = items.size();
Builder builder = expectedType == null ? new InferredBuilder(n) : Builder.getForType(expectedType, n);

// ToDo: This a workaround for an issue with polyglot layer. #5590 is related.
// to revert replace with: for (Value item : items) {
for (Object item : items) {
if (item instanceof Value v) {
Object converted = Polyglot_Utils.convertPolyglotValue(v);
Expand All @@ -142,6 +142,20 @@ public static Column fromItems(String name, List<Value> items) {
return new Column(name, storage);
}

/** Creates a column from an Enso array. No polyglot conversion happens. This is unsafe */
public static Column fromItemsRaw(String name, List<Object> items, StorageType expectedType) throws ClassCastException {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do remember when fromItems introduced List<Value>... deciding whether List<Value> is needed or whether List<Object> is enough is a delicate dance balancing GraalVM Host Interop, GraalVM version, Enso implementation of interop and needs of the libraries.

In any case, I believe the #1 thing we need is to get stable, independent benchmarks being executed daily by the CI. Without it we will just shift the code back and forth without being sure the system is improved.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For now I will keep it as is, but I will be adding tickets to revisit this.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At this point, I don't think we should add this to the core Column type.
It can be a static method anywhere from my read so I suggest it goes into the exploratory helper Java.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wanted to keep it in std-lib, because I actually plan to use it in #6111 - there when we know the expected value type, if the value type is not Mixed nor Date (etc.), we can guarantee no dates will come in - and so we do not need the conversion to happen. This will make the map operation and construction of columns like integer column much faster.

I can make it a static, but since I plan to use it in the main codebase soon, I did not want to move it to the helpers - since I'd have to move it back again when implementing #6111

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fair - felt for an experiment we shouldn't be changing the core but happy given next step will be to change it.

Context context = Context.getCurrent();
int n = items.size();
Builder builder = expectedType == null ? new InferredBuilder(n) : Builder.getForType(expectedType, n);

for (Object item : items) {
builder.appendNoGrow(item);
context.safepoint();
}
var storage = builder.seal();
return new Column(name, storage);
}

/**
* Creates a new column with given name and elements.
*
Expand All @@ -155,7 +169,7 @@ public static Column fromRepeatedItems(String name, List<Value> items, int repea
}

if (repeat == 1) {
return fromItems(name, items);
return fromItemsConvertPolyglot(name, items, null);
}

Context context = Context.getCurrent();
Expand Down
52 changes: 52 additions & 0 deletions test/Exploratory_Benchmarks/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Exploring Table operation performance

These benchmarks are used to compare various approaches to computing operations
on Table columns, to find out what best practices we should use for these and
to find avenues for optimization of the language and Table implementation.

These benchmarks are not meant to be used for tracking performance of the
current implementation itself. That is supposed to be done by another
project - `Table_Benchmarks`.

## Structure

Currently, the benchmarks are split into a few files, each exploring some
separate topic, like mapping a single column, combining two columns with
some operation, or computing an aggregate operation over a column. In each
file, there may be a few Enso types, each representing a separate benchmark.
Usually, we have two benchmarks for each operation type - one dealing with a
primitive value type like integers (`long` in the Java side) and another
dealing with a reference type like `String` or `Date`. We expect the
performance characteristics of these may differ, e.g. because Java
allows using `long` without boxing, so we compare them separately.

Each Enso type for a given benchmark contains multiple methods which represent
various 'approaches' to computing the same operation.

Each benchmark run has a name that consists of the type that defines it, a dot
and the method representing the particular approach,
e.g. `Boxed_Map_Test.enso_map_as_vector`.

## Running

The runner is very simple. If any options are to be customized, the Enso
file itself needs to be modified. One can run the whole project to run all
the benchmarks, or run only a specific file.

## Analysis

The output of the benchmarks should be saved to a file. Then that file can
be loaded using the Enso workflow in `tools/performance/benchmark-analysis`.

The workflow is tuned to analysing these comparative benchmarks.

At the top, one can select which file is to be analyzed. Below there is a
dropdown allowing one to select one particular benchmark (represented by the type,
e.g. `Boxed_Map_Test`). With that selected, one can display a scatter plot
visualization comparing various approaches of that one given benchmark. On
the plot we can see runtimes of subsequent iterations. Later, we drop the
first 40 iterations (the number can easily be customized in the workflow) to
ensure sufficient warm-up for each benchmark. Then a table is displayed
computing the average runtime of each approach and how they compare relative
to each other - a dropdown allows selecting one benchmark that will be used
as a reference point (100%) for the average runtime comparison.
6 changes: 6 additions & 0 deletions test/Exploratory_Benchmarks/package.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
name: Exploratory_Benchmarks
enso-version: default
version: 0.0.1
license: MIT
author: enso-dev@enso.org
maintainer: enso-dev@enso.org
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
package org.enso.exploratory_benchmark_helpers;

import java.util.BitSet;
import java.util.function.Function;
import org.enso.base.Text_Utils;
import org.enso.table.data.column.builder.Builder;
import org.enso.table.data.column.builder.InferredBuilder;
import org.enso.table.data.column.storage.BoolStorage;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.StringStorage;
import org.enso.table.data.column.storage.datetime.DateStorage;
import org.enso.table.data.column.storage.numeric.LongStorage;
import org.enso.table.data.column.storage.type.StorageType;

/**
 * Java-side implementations of element-wise column operations, working directly on storages.
 *
 * <p>Every method walks the rows of its input storage(s) by index, propagates missing values to
 * the output, and returns a freshly allocated storage.
 */
public class MapHelpers {
  /** Throws {@link IllegalArgumentException} unless both row counts are equal. */
  private static void requireEqualSizes(int leftSize, int rightSize) {
    if (leftSize != rightSize) {
      throw new IllegalArgumentException("Storage sizes must match");
    }
  }

  /**
   * Concatenates two string storages row by row.
   *
   * @param storage1 supplies the left part of every concatenation
   * @param storage2 supplies the right part of every concatenation
   * @return a storage holding {@code left + right} per row, or {@code null} for rows where either
   *     input is missing
   * @throws IllegalArgumentException if the storages differ in size
   */
  public static StringStorage stringConcatBimap(StringStorage storage1, StringStorage storage2) {
    final int rows = storage1.size();
    requireEqualSizes(rows, storage2.size());

    // A new String[] is filled with nulls, so rows that are skipped stay null.
    final String[] joined = new String[rows];
    for (int row = 0; row < rows; row++) {
      if (storage1.isNa(row) || storage2.isNa(row)) {
        continue;
      }
      joined[row] = storage1.getItem(row) + storage2.getItem(row);
    }
    return new StringStorage(joined, rows);
  }

  /**
   * Adds two long storages row by row.
   *
   * @param storage1 first addend
   * @param storage2 second addend
   * @return a storage holding the per-row sums; rows where either input is missing are marked
   *     missing in the result
   * @throws IllegalArgumentException if the storages differ in size
   */
  public static LongStorage longAddBimap(LongStorage storage1, LongStorage storage2) {
    final int rows = storage1.size();
    requireEqualSizes(rows, storage2.size());

    final long[] sums = new long[rows];
    final BitSet isMissing = new BitSet();
    for (int row = 0; row < rows; row++) {
      if (storage1.isNa(row) || storage2.isNa(row)) {
        isMissing.set(row);
      } else {
        sums[row] = storage1.getItem(row) + storage2.getItem(row);
      }
    }
    return new LongStorage(sums, rows, isMissing);
  }

  /**
   * Checks, for every row, whether the text ends with {@code suffix} according to {@link
   * Text_Utils#ends_with}.
   *
   * @param storage the texts to test
   * @param suffix the suffix to look for
   * @return a boolean storage with a bit set for every matching row; missing input rows are
   *     marked missing in the result
   */
  public static BoolStorage textEndsWith(StringStorage storage, String suffix) {
    final int rows = storage.size();
    final BitSet matches = new BitSet();
    final BitSet isMissing = new BitSet();
    for (int row = 0; row < rows; row++) {
      if (storage.isNa(row)) {
        isMissing.set(row);
        continue;
      }
      if (Text_Utils.ends_with(storage.getItem(row), suffix)) {
        matches.set(row);
      }
    }
    return new BoolStorage(matches, isMissing, rows, false);
  }

  /**
   * Adds a constant to every value of a long storage.
   *
   * @param storage the input values
   * @param shift the constant added to each present value
   * @return a storage holding the shifted values; missing input rows are marked missing
   */
  public static LongStorage longAdd(LongStorage storage, long shift) {
    final int rows = storage.size();
    final long[] shifted = new long[rows];
    final BitSet isMissing = new BitSet();
    for (int row = 0; row < rows; row++) {
      if (storage.isNa(row)) {
        isMissing.set(row);
      } else {
        shifted[row] = storage.getItem(row) + shift;
      }
    }
    return new LongStorage(shifted, rows, isMissing);
  }

  /**
   * Extracts the year of every date in the storage.
   *
   * @param storage the input dates
   * @return a long storage holding {@code getYear()} of each present date; missing input rows are
   *     marked missing
   */
  public static LongStorage getYear(DateStorage storage) {
    final int rows = storage.size();
    final long[] years = new long[rows];
    final BitSet isMissing = new BitSet();
    for (int row = 0; row < rows; row++) {
      if (storage.isNa(row)) {
        isMissing.set(row);
      } else {
        years[row] = storage.getItem(row).getYear();
      }
    }
    return new LongStorage(years, rows, isMissing);
  }

  /**
   * Applies a callback to every present value of a storage and collects the results.
   *
   * @param storage the input values, passed to the callback in boxed form
   * @param fn the callback invoked once per non-missing row
   * @param expectedType the storage type of the result; when {@code null}, an {@link
   *     InferredBuilder} is used instead of a type-specific builder
   * @return the sealed storage produced by the builder; missing input rows yield a null entry
   *     without invoking the callback
   */
  public static Storage<?> mapCallback(
      Storage<?> storage, Function<Object, Object> fn, StorageType expectedType) {
    final int rows = storage.size();
    final Builder output;
    if (expectedType != null) {
      output = Builder.getForType(expectedType, rows);
    } else {
      output = new InferredBuilder(rows);
    }

    for (int row = 0; row < rows; row++) {
      if (storage.isNa(row)) {
        output.appendNulls(1);
      } else {
        output.append(fn.apply(storage.getItemBoxed(row)));
      }
    }
    return output.seal();
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package org.enso.exploratory_benchmark_helpers;

import java.time.LocalDate;
import org.enso.base.Text_Utils;
import org.enso.table.data.column.storage.StringStorage;
import org.enso.table.data.column.storage.datetime.DateStorage;
import org.enso.table.data.column.storage.numeric.LongStorage;

/**
 * Java-side implementations of simple aggregations that reduce a whole storage to a single value.
 * Missing entries are skipped by every method.
 */
public class SimpleStorageAggregateHelpers {
  /**
   * Sums all present values of a long storage.
   *
   * @param storage the values to add up
   * @return the sum of the non-missing values, or 0 if there are none
   */
  public static long sumLongStorage(LongStorage storage) {
    long total = 0;
    for (int row = 0; row < storage.size(); row++) {
      if (storage.isNa(row)) {
        continue;
      }
      total += storage.getItem(row);
    }
    return total;
  }

  /**
   * Sums the month numbers of all dates in the storage.
   *
   * @param storage the dates to aggregate
   * @return the sum of {@code getMonthValue()} over all non-null entries of the storage's data
   */
  public static long sumMonthsOfDateStorage(DateStorage storage) {
    long total = 0;
    // Iterates the data returned by getData() directly; null entries are ignored.
    for (LocalDate entry : storage.getData()) {
      if (entry == null) {
        continue;
      }
      total += entry.getMonthValue();
    }
    return total;
  }

  /**
   * Finds the text with the greatest length as measured by {@link Text_Utils#grapheme_length}.
   *
   * @param storage the texts to scan
   * @return the first text that attains the maximum length, or {@code null} when the storage has
   *     no present values
   */
  public static String longestText(StringStorage storage) {
    String best = null;
    // Starts below any possible length so that the first present text always wins.
    long bestLength = -1;
    final int rows = storage.size();
    for (int row = 0; row < rows; row++) {
      if (storage.isNa(row)) {
        continue;
      }
      final String candidate = storage.getItem(row);
      final long candidateLength = Text_Utils.grapheme_length(candidate);
      // Strict comparison keeps the earliest text among ties.
      if (candidateLength > bestLength) {
        bestLength = candidateLength;
        best = candidate;
      }
    }
    return best;
  }
}
3 changes: 3 additions & 0 deletions test/Exploratory_Benchmarks/src/Main.enso
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import project.Table.Main as Table_Main

# Entry point of the project: delegates to `spec` of the `project.Table.Main`
# module, imported above as `Table_Main`.
main = Table_Main.spec
Loading
Loading