enso-org · mergify · Jul 21, 2023 · Jul 12, 2023 · Jul 12, 2023 · Jul 12, 2023
@@ -299,7 +299,8 @@ lazy val enso = (project in file("."))
     `std-table`,
     `std-aws`,
     `simple-httpbin`,
-    `enso-test-java-helpers`
+    `enso-test-java-helpers`,
+    `exploratory-benchmark-java-helpers`
   )
   .settings(Global / concurrentRestrictions += Tags.exclusive(Exclusive))
   .settings(
@@ -1337,6 +1338,7 @@ lazy val runtime = (project in file("engine/runtime"))
     (Runtime / compile) := (Runtime / compile)
       .dependsOn(`std-base` / Compile / packageBin)
       .dependsOn(`enso-test-java-helpers` / Compile / packageBin)
+      .dependsOn(`exploratory-benchmark-java-helpers` / Compile / packageBin)
       .dependsOn(`std-image` / Compile / packageBin)
       .dependsOn(`std-database` / Compile / packageBin)
       .dependsOn(`std-google-api` / Compile / packageBin)
@@ -1985,6 +1987,20 @@ lazy val `enso-test-java-helpers` = project
   .dependsOn(`std-base` % "provided")
   .dependsOn(`std-table` % "provided")
 
+// Java helper classes for the `Exploratory_Benchmarks` test project. The
+// packaged jar is written into that project's `polyglot/java` directory (see
+// the `artifactPath` override below). Everything it depends on is only
+// `provided`, so nothing is bundled into the jar.
+lazy val `exploratory-benchmark-java-helpers` = project
+  .in(file("test/Exploratory_Benchmarks/polyglot-sources/exploratory-benchmark-java-helpers"))
+  .settings(
+    frgaalJavaCompilerSetting,
+    // Pure Java project: do not add the Scala standard library.
+    autoScalaLibrary := false,
+    Compile / packageBin / artifactPath :=
+      file("test/Exploratory_Benchmarks/polyglot/java/exploratory-benchmark-java-helpers.jar"),
+    libraryDependencies +=
+      "org.graalvm.sdk" % "graal-sdk" % graalVersion % "provided"
+  )
+  .dependsOn(`std-base` % "provided", `std-table` % "provided")
+
 lazy val `std-table` = project
   .in(file("std-bits") / "table")
   .enablePlugins(Antlr4Plugin)
@@ -2292,11 +2308,13 @@ pkgStdLibInternal := Def.inputTask {
       (`std-table` / Compile / packageBin).value
     case "TestHelpers" =>
       (`enso-test-java-helpers` / Compile / packageBin).value
+      (`exploratory-benchmark-java-helpers` / Compile / packageBin).value
     case "AWS" =>
       (`std-aws` / Compile / packageBin).value
     case _ if buildAllCmd =>
       (`std-base` / Compile / packageBin).value
       (`enso-test-java-helpers` / Compile / packageBin).value
+      (`exploratory-benchmark-java-helpers` / Compile / packageBin).value
       (`std-table` / Compile / packageBin).value
       (`std-database` / Compile / packageBin).value
       (`std-image` / Compile / packageBin).value

@@ -47,8 +47,9 @@ type Column
                  Column.from_vector "My Column" [1, 2, 3, 4, 5]
     from_vector : Text -> Vector -> Column
     from_vector name items =
+        expected_storage_type = Nothing
         Illegal_Argument.handle_java_exception <|
-            Column.Value (Java_Column.fromItems name items)
+            Column.Value (Java_Column.fromItemsConvertPolyglot name items expected_storage_type)
 
     ## PRIVATE
        Creates a new column given a name and an internal Java storage.

@@ -9,6 +9,15 @@ type Bench
        - label: A name for the measurement.
        - iter_size: The number of runs per iteration.
        - num_iters: The number of iterations per measurement.
+       - run_gc_between_iterations: Whether to try running the garbage collector
+         between iterations. Defaults to False. This is helpful when testing
+         memory intensive operations, to ensure that GC runs between iterations
+         and not _during_ iterations. The time taken to run the requested
+         garbage collection will not be counted into the iteration time, however
+         there is no guarantee that the JVM will actually accept the GC hint and
+         it is still possible the JVM may run GC during an iteration. But
+         setting this option to True should make it less likely for GC to
+         interrupt measurements.
 
        > Example
          Measure a computation called "foo" with an iteration size of 2 and a number
@@ -19,8 +28,8 @@ type Bench
 
              example_measure =
                  Bench.measure Examples.get_boolean "foo" iter_size=2 num_iters=1
-    measure : Any -> Text -> Integer -> Integer -> Nothing
-    measure ~act label iter_size num_iters =
+    measure : Any -> Text -> Integer -> Integer -> Boolean -> Nothing
+    measure ~act label iter_size num_iters run_gc_between_iterations=False =
         dry_run = Environment.get "ENSO_BENCHMARK_TEST_DRY_RUN" "False" == "True"
         result = Ref.new 0.0
         single_call = _ ->
@@ -29,6 +38,8 @@ type Bench
             x2 = System.nano_time
             x2 - x1
         iteration = it_size -> it_num ->
+            if run_gc_between_iterations then
+                Runtime.gc
             act_it_num = num_iters - it_num
             res = times it_size single_call
             avg = avg_list res

@@ -2,9 +2,14 @@
 
 import org.enso.base.Text_Utils;
 import org.enso.base.polyglot.Polyglot_Utils;
+import org.enso.table.data.column.builder.Builder;
 import org.enso.table.data.column.builder.InferredBuilder;
+import org.enso.table.data.column.builder.LongBuilder;
+import org.enso.table.data.column.builder.NumericBuilder;
 import org.enso.table.data.column.storage.BoolStorage;
 import org.enso.table.data.column.storage.Storage;
+import org.enso.table.data.column.storage.numeric.LongStorage;
+import org.enso.table.data.column.storage.type.StorageType;
 import org.enso.table.data.index.DefaultIndex;
 import org.enso.table.data.index.Index;
 import org.enso.table.data.mask.OrderMask;
@@ -116,18 +121,13 @@ public Column rename(String name) {
     return new Column(name, storage);
   }
 
-  /**
-   * Creates a new column with given name and elements.
-   *
-   * @param name the name to use
-   * @param items the items contained in the column
-   * @return a column with given name and items
-   */
-  public static Column fromItems(String name, List<Value> items) {
+  /** Creates a column from an Enso array, ensuring Enso dates are converted to Java dates. */
+  public static Column fromItemsConvertPolyglot(String name, List<Value> items, StorageType expectedType) throws ClassCastException {
     Context context = Context.getCurrent();
-    InferredBuilder builder = new InferredBuilder(items.size());
+    int n = items.size();
+    Builder builder = expectedType == null ? new InferredBuilder(n) : Builder.getForType(expectedType, n);
+
     // ToDo: This a workaround for an issue with polyglot layer. #5590 is related.
-    // to revert replace with: for (Value item : items) {
     for (Object item : items) {
       if (item instanceof Value v) {
         Object converted = Polyglot_Utils.convertPolyglotValue(v);
@@ -142,6 +142,20 @@ public static Column fromItems(String name, List<Value> items) {
     return new Column(name, storage);
   }
 
+  /**
+   * Creates a column from an Enso array, appending every item to the builder exactly as received.
+   *
+   * <p>No polyglot conversion happens. This is unsafe.
+   *
+   * @param name the name of the resulting column
+   * @param items the items to store; the builder is sized to {@code items.size()} up front
+   * @param expectedType the storage type to build, or {@code null} to fall back to an {@link
+   *     InferredBuilder}
+   * @return a column with the given name, backed by the sealed builder storage
+   * @throws ClassCastException declared by the signature; presumably raised when an item does not
+   *     fit the requested storage type (to be confirmed against the builders)
+   */
+  public static Column fromItemsRaw(String name, List<Object> items, StorageType expectedType)
+      throws ClassCastException {
+    Context context = Context.getCurrent();
+    int itemCount = items.size();
+
+    Builder builder;
+    if (expectedType == null) {
+      builder = new InferredBuilder(itemCount);
+    } else {
+      builder = Builder.getForType(expectedType, itemCount);
+    }
+
+    for (Object rawItem : items) {
+      builder.appendNoGrow(rawItem);
+      // Poll the safepoint once per appended item.
+      context.safepoint();
+    }
+    return new Column(name, builder.seal());
+  }
+
   /**
    * Creates a new column with given name and elements.
    *
@@ -155,7 +169,7 @@ public static Column fromRepeatedItems(String name, List<Value> items, int repea
     }
 
     if (repeat == 1) {
-      return fromItems(name, items);
+      return fromItemsConvertPolyglot(name, items, null);
     }
 
     Context context = Context.getCurrent();

@@ -0,0 +1,52 @@
+# Exploring Table operation performance
+
+These benchmarks are used to compare various approaches to computing operations
+on Table columns, to find out what best practices we should use for these and
+find avenues for optimization of the language and Table implementation.
+
+These benchmarks are not meant to be used for tracking performance of the
+current implementation itself. That is supposed to be done by another
+project - `Table_Benchmarks`.
+
+## Structure
+
+Currently, the benchmarks are split into a few files, each exploring some
+separate topic, like mapping a single column, combining two columns with
+some operation, or computing an aggregate operation over a column. In each
+file, there may be a few Enso types, each representing a separate benchmark.
+Usually, we have two benchmarks for each operation type - one dealing with a
+primitive value type like integers (`long` on the Java side) and another
+dealing with a reference type like `String` or `Date`. We expect that the
+performance characteristics of these may differ, e.g. because Java
+allows using `long` without boxing, so we compare them separately.
+
+Each Enso type for a given benchmark contains multiple methods which represent
+various 'approaches' to computing the same operation.
+
+Each benchmark run has a name that consists of the type that defines it, a
+dot and the method representing the particular approach,
+e.g. `Boxed_Map_Test.enso_map_as_vector`.
+
+## Running
+
+The runner is very simple. If any options are to be customized, the Enso
+file itself needs to be modified. One can run the whole project to run all
+the benchmarks, or run only a specific file.
+
+## Analysis
+
+The output of the benchmarks should be saved to a file. Then that file can
+be loaded using the Enso workflow in `tools/performance/benchmark-analysis`.
+
+The workflow is tuned to analysing these comparative benchmarks.
+
+At the top, one can select which file is to be analyzed. Below there is a
+dropdown that allows selecting one particular benchmark (represented by the
+type, e.g. `Boxed_Map_Test`). With that selected, one can display a scatter
+plot visualization comparing the various approaches of that one benchmark. On
+the plot we can see the runtimes of subsequent iterations. Later, we drop the
+first 40 iterations (the number can easily be customized in the workflow) to
+ensure sufficient warm-up for each benchmark. Then a table is displayed
+computing the average runtime of each approach and how they compare relative
+to each other - a dropdown allows selecting one benchmark that will be used
+as a reference point (100%) for the average runtime comparison.
@@ -0,0 +1,6 @@
+name: Exploratory_Benchmarks
+enso-version: default
+version: 0.0.1
+license: MIT
+author: enso-dev@enso.org
+maintainer: enso-dev@enso.org
@@ -0,0 +1,109 @@
+package org.enso.exploratory_benchmark_helpers;
+
+import java.util.BitSet;
+import java.util.function.Function;
+import org.enso.base.Text_Utils;
+import org.enso.table.data.column.builder.Builder;
+import org.enso.table.data.column.builder.InferredBuilder;
+import org.enso.table.data.column.storage.BoolStorage;
+import org.enso.table.data.column.storage.Storage;
+import org.enso.table.data.column.storage.StringStorage;
+import org.enso.table.data.column.storage.datetime.DateStorage;
+import org.enso.table.data.column.storage.numeric.LongStorage;
+import org.enso.table.data.column.storage.type.StorageType;
+
+/**
+ * Hand-written Java implementations of element-wise column operations that work directly on the
+ * storage classes. Part of the exploratory benchmark helpers package.
+ */
+public class MapHelpers {
+  /**
+   * Concatenates two text storages row by row.
+   *
+   * @param storage1 the left-hand operands
+   * @param storage2 the right-hand operands
+   * @return a storage whose row {@code i} is {@code storage1[i] + storage2[i]}, or {@code null}
+   *     when either input is missing at that row
+   * @throws IllegalArgumentException if the two storages differ in size
+   */
+  public static StringStorage stringConcatBimap(StringStorage storage1, StringStorage storage2) {
+    int size = storage1.size();
+    if (size != storage2.size()) {
+      throw new IllegalArgumentException("Storage sizes must match");
+    }
+
+    String[] concatenated = new String[size];
+    for (int row = 0; row < size; row++) {
+      boolean anyMissing = storage1.isNa(row) || storage2.isNa(row);
+      concatenated[row] = anyMissing ? null : storage1.getItem(row) + storage2.getItem(row);
+    }
+    return new StringStorage(concatenated, size);
+  }
+
+  /**
+   * Adds two integer storages row by row.
+   *
+   * @param storage1 the left-hand operands
+   * @param storage2 the right-hand operands
+   * @return a storage holding the row-wise sums; a row is marked missing when either input is
+   *     missing at that row
+   * @throws IllegalArgumentException if the two storages differ in size
+   */
+  public static LongStorage longAddBimap(LongStorage storage1, LongStorage storage2) {
+    int size = storage1.size();
+    if (size != storage2.size()) {
+      throw new IllegalArgumentException("Storage sizes must match");
+    }
+
+    long[] sums = new long[size];
+    BitSet isMissing = new BitSet();
+    for (int row = 0; row < size; row++) {
+      if (storage1.isNa(row) || storage2.isNa(row)) {
+        isMissing.set(row);
+      } else {
+        sums[row] = storage1.getItem(row) + storage2.getItem(row);
+      }
+    }
+    return new LongStorage(sums, size, isMissing);
+  }
+
+  /**
+   * Checks, for every row of a text storage, whether the text ends with the given suffix (as
+   * decided by {@code Text_Utils.ends_with}).
+   *
+   * @param storage the texts to test
+   * @param suffix the suffix to look for
+   * @return a boolean storage with the bit set for matching rows; rows missing in the input are
+   *     marked missing in the result
+   */
+  public static BoolStorage textEndsWith(StringStorage storage, String suffix) {
+    int size = storage.size();
+    BitSet matches = new BitSet();
+    BitSet isMissing = new BitSet();
+    for (int row = 0; row < size; row++) {
+      if (storage.isNa(row)) {
+        isMissing.set(row);
+        continue;
+      }
+      if (Text_Utils.ends_with(storage.getItem(row), suffix)) {
+        matches.set(row);
+      }
+    }
+    return new BoolStorage(matches, isMissing, size, false);
+  }
+
+  /**
+   * Adds a constant to every present value of an integer storage.
+   *
+   * @param storage the values to shift
+   * @param shift the constant added to each value
+   * @return a storage with the shifted values; rows missing in the input stay missing
+   */
+  public static LongStorage longAdd(LongStorage storage, long shift) {
+    int size = storage.size();
+    long[] shifted = new long[size];
+    BitSet isMissing = new BitSet();
+    for (int row = 0; row < size; row++) {
+      if (storage.isNa(row)) {
+        isMissing.set(row);
+      } else {
+        shifted[row] = storage.getItem(row) + shift;
+      }
+    }
+    return new LongStorage(shifted, size, isMissing);
+  }
+
+  /**
+   * Extracts the year of every present date in a date storage.
+   *
+   * @param storage the dates to read
+   * @return an integer storage with the years; rows missing in the input stay missing
+   */
+  public static LongStorage getYear(DateStorage storage) {
+    int size = storage.size();
+    long[] years = new long[size];
+    BitSet isMissing = new BitSet();
+    for (int row = 0; row < size; row++) {
+      if (storage.isNa(row)) {
+        isMissing.set(row);
+      } else {
+        years[row] = storage.getItem(row).getYear();
+      }
+    }
+    return new LongStorage(years, size, isMissing);
+  }
+
+  /**
+   * Applies a callback to every present (boxed) value of a storage and collects the results.
+   *
+   * @param storage the input values
+   * @param fn the callback invoked once per non-missing row
+   * @param expectedType the storage type of the result, or {@code null} to use an {@link
+   *     InferredBuilder}
+   * @return the sealed storage built from the callback results; rows missing in the input are
+   *     appended as nulls without invoking the callback
+   */
+  public static Storage<?> mapCallback(
+      Storage<?> storage, Function<Object, Object> fn, StorageType expectedType) {
+    int size = storage.size();
+    Builder builder;
+    if (expectedType == null) {
+      builder = new InferredBuilder(size);
+    } else {
+      builder = Builder.getForType(expectedType, size);
+    }
+
+    for (int row = 0; row < size; row++) {
+      if (storage.isNa(row)) {
+        builder.appendNulls(1);
+      } else {
+        builder.append(fn.apply(storage.getItemBoxed(row)));
+      }
+    }
+    return builder.seal();
+  }
+}
@@ -0,0 +1,46 @@
+package org.enso.exploratory_benchmark_helpers;
+
+import java.time.LocalDate;
+import org.enso.base.Text_Utils;
+import org.enso.table.data.column.storage.StringStorage;
+import org.enso.table.data.column.storage.datetime.DateStorage;
+import org.enso.table.data.column.storage.numeric.LongStorage;
+
+/**
+ * Hand-written Java aggregations that work directly on the storage classes. Part of the
+ * exploratory benchmark helpers package.
+ */
+public class SimpleStorageAggregateHelpers {
+  /**
+   * Sums all present values of an integer storage, skipping missing rows.
+   *
+   * @param storage the values to add up
+   * @return the sum of the non-missing values ({@code 0} if there are none)
+   */
+  public static long sumLongStorage(LongStorage storage) {
+    long total = 0;
+    for (int row = 0; row < storage.size(); row++) {
+      if (storage.isNa(row)) {
+        continue;
+      }
+      total += storage.getItem(row);
+    }
+    return total;
+  }
+
+  /**
+   * Sums the month numbers of all non-null dates, iterating the array returned by {@code
+   * getData()} directly.
+   *
+   * @param storage the dates to read
+   * @return the sum of {@code getMonthValue()} over the non-null entries
+   */
+  public static long sumMonthsOfDateStorage(DateStorage storage) {
+    long total = 0;
+    for (LocalDate entry : storage.getData()) {
+      if (entry == null) {
+        continue;
+      }
+      total += entry.getMonthValue();
+    }
+    return total;
+  }
+
+  /**
+   * Finds the longest text in a storage, with length measured by {@code
+   * Text_Utils.grapheme_length}.
+   *
+   * @param storage the texts to scan
+   * @return the first text with the greatest length, or {@code null} if every row is missing or
+   *     the storage is empty
+   */
+  public static String longestText(StringStorage storage) {
+    String best = null;
+    long bestLength = -1;
+    int size = storage.size();
+    for (int row = 0; row < size; row++) {
+      if (storage.isNa(row)) {
+        continue;
+      }
+      String candidate = storage.getItem(row);
+      long candidateLength = Text_Utils.grapheme_length(candidate);
+      // Strict comparison: on ties the earlier row wins.
+      if (candidateLength > bestLength) {
+        bestLength = candidateLength;
+        best = candidate;
+      }
+    }
+    return best;
+  }
+}
@@ -0,0 +1,3 @@
+import project.Table.Main as Table_Main
+
+main = Table_Main.spec