diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/bytes/BytesArrayIndexOfBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/bytes/BytesArrayIndexOfBenchmark.java new file mode 100644 index 0000000000000..13446b8ef6c77 --- /dev/null +++ b/benchmarks/src/main/java/org/elasticsearch/benchmark/bytes/BytesArrayIndexOfBenchmark.java @@ -0,0 +1,74 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ +package org.elasticsearch.benchmark.bytes; + +import org.elasticsearch.common.bytes.BytesArray; +import org.elasticsearch.common.logging.LogConfigurator; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; + +import java.util.concurrent.TimeUnit; + +@Warmup(iterations = 4, time = 1) +@Measurement(iterations = 5, time = 1) +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@State(Scope.Thread) +@Fork(value = 1) +public class BytesArrayIndexOfBenchmark { + + static { + LogConfigurator.configureESLogging(); // native access requires logging to be initialized + } + + static final byte MARKER = (byte) '\n'; + + @Param(value = { "64", "127", "128", "4096", "16384", "65536", "1048576" }) + public int size; + + BytesArray bytesArray; + + @Setup + public void setup() { + byte[] bytes = new byte[size]; + bytes[bytes.length - 1] = MARKER; + bytesArray = new BytesArray(bytes, 0, bytes.length); + } + + @Benchmark + public int indexOf() { + return bytesArray.indexOf(MARKER, 0); + } + + @Benchmark + @Fork(jvmArgsPrepend = { "--add-modules=jdk.incubator.vector" }) + public int indexOfPanama() { + return bytesArray.indexOf(MARKER, 0); + } + + @Benchmark + public int withOffsetIndexOf() { + return bytesArray.indexOf(MARKER, 1); + } + + @Benchmark + @Fork(jvmArgsPrepend = { "--add-modules=jdk.incubator.vector" }) + public int withOffsetIndexPanama() { + return bytesArray.indexOf(MARKER, 1); + } +} diff --git a/benchmarks/src/test/java/org/elasticsearch/benchmark/bytes/BytesArrayIndexOfBenchmarkTests.java b/benchmarks/src/test/java/org/elasticsearch/benchmark/bytes/BytesArrayIndexOfBenchmarkTests.java new file mode 100644 index 0000000000000..feea8eae29eea --- /dev/null +++ b/benchmarks/src/test/java/org/elasticsearch/benchmark/bytes/BytesArrayIndexOfBenchmarkTests.java @@ -0,0 +1,46 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.benchmark.bytes; + +import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; + +import org.elasticsearch.test.ESTestCase; +import org.openjdk.jmh.annotations.Param; + +import java.util.Arrays; + +public class BytesArrayIndexOfBenchmarkTests extends ESTestCase { + + final int size; + + public BytesArrayIndexOfBenchmarkTests(int size) { + this.size = size; + } + + public void testIndexOf() { + var bench = new BytesArrayIndexOfBenchmark(); + bench.size = size; + bench.setup(); + assertEquals(size - 1, bench.indexOf()); + assertEquals(size - 1, bench.indexOfPanama()); + assertEquals(size - 1, bench.withOffsetIndexOf()); + assertEquals(size - 1, bench.withOffsetIndexPanama()); + } + + @ParametersFactory + public static Iterable parametersFactory() { + try { + var params = BytesArrayIndexOfBenchmark.class.getField("size").getAnnotationsByType(Param.class)[0].value(); + return () -> Arrays.stream(params).map(Integer::parseInt).map(i -> new Object[] { i }).iterator(); + } catch (NoSuchFieldException e) { + throw new AssertionError(e); + } + } +} diff --git a/docs/changelog/135087.yaml b/docs/changelog/135087.yaml new file mode 100644 index 0000000000000..a2c2cda54d3d7 --- /dev/null +++ b/docs/changelog/135087.yaml @@ -0,0 +1,5 @@ +pr: 135087 +summary: "Optimize `BytesArray::indexOf,` which is used heavily in ndjson parsing" +area: "Performance" +type: feature +issues: [] diff --git a/libs/simdvec/src/main/java/org/elasticsearch/simdvec/ESVectorUtil.java b/libs/simdvec/src/main/java/org/elasticsearch/simdvec/ESVectorUtil.java index c083f1c92a4fd..ec7f8cbfc16aa 100644 --- a/libs/simdvec/src/main/java/org/elasticsearch/simdvec/ESVectorUtil.java +++ b/libs/simdvec/src/main/java/org/elasticsearch/simdvec/ESVectorUtil.java @@ -19,6 +19,7 @@ import java.lang.invoke.MethodHandle; import java.lang.invoke.MethodHandles; import java.lang.invoke.MethodType; +import java.util.Objects; import static org.elasticsearch.simdvec.internal.vectorization.ESVectorUtilSupport.B_QUERY; @@ -399,4 +400,22 @@ public static void transposeHalfByte(int[] q, byte[] quantQueryByte) { } IMPL.transposeHalfByte(q, quantQueryByte); } + + /** + * Searches for the first occurrence of the given marker byte in the specified range of the array. + * + *

The search starts at {@code offset} and examines at most {@code length} bytes. The return + * value is the relative index of the first occurrence of {@code marker} within this slice, + * or {@code -1} if not found. + * + * @param bytes the byte array to search + * @param offset the starting index within the array + * @param length the number of bytes to examine + * @param marker the byte to search for + * @return the relative index (0..length-1) of the first match, or {@code -1} if not found + */ + public static int indexOf(byte[] bytes, int offset, int length, byte marker) { + Objects.checkFromIndexSize(offset, length, bytes.length); + return IMPL.indexOf(bytes, offset, length, marker); + } } diff --git a/libs/simdvec/src/main/java/org/elasticsearch/simdvec/internal/vectorization/ByteArrayUtils.java b/libs/simdvec/src/main/java/org/elasticsearch/simdvec/internal/vectorization/ByteArrayUtils.java new file mode 100644 index 0000000000000..b755c90c764bb --- /dev/null +++ b/libs/simdvec/src/main/java/org/elasticsearch/simdvec/internal/vectorization/ByteArrayUtils.java @@ -0,0 +1,110 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.simdvec.internal.vectorization; + +import static org.apache.lucene.util.BitUtil.VH_LE_LONG; + +/** Byte array utilities. */ +final class ByteArrayUtils { + + /** + * Implementation of {@link ESVectorUtilSupport#indexOf(byte[], int, int, byte)} using fast + * SWAR (SIMD Within A Register) loop. + */ + static int indexOf(final byte[] bytes, final int offset, final int len, final byte marker) { + final int end = offset + len; + int i = offset; + + // First, try to find the marker in the first few bytes, so we can enter the faster 8-byte aligned loop below. + // The idea for this logic is taken from Netty's io.netty.buffer.ByteBufUtil.firstIndexOf and optimized for little endian hardware. + // See e.g. https://richardstartin.github.io/posts/finding-bytes for the idea behind this optimization. + final int byteCount = len & 7; + if (byteCount > 0) { + final int index = unrolledFirstIndexOf(bytes, i, byteCount, marker); + if (index != -1) { + return index - offset; + } + i += byteCount; + if (i == end) { + return -1; + } + } + final int longCount = len >>> 3; + // faster SWAR (SIMD Within A Register) loop + final long pattern = compilePattern(marker); + for (int j = 0; j < longCount; j++) { + int index = findInLong(readLongLE(bytes, i), pattern); + if (index < Long.BYTES) { + return i + index - offset; + } + i += Long.BYTES; + } + return -1; + } + + private static long readLongLE(byte[] arr, int offset) { + return (long) VH_LE_LONG.get(arr, offset); + } + + private static long compilePattern(byte byteToFind) { + return (byteToFind & 0xFFL) * 0x101010101010101L; + } + + private static int findInLong(long word, long pattern) { + long input = word ^ pattern; + long tmp = (input & 0x7F7F7F7F7F7F7F7FL) + 0x7F7F7F7F7F7F7F7FL; + tmp = ~(tmp | input | 0x7F7F7F7F7F7F7F7FL); + final int binaryPosition = Long.numberOfTrailingZeros(tmp); + return binaryPosition >>> 3; + } + + private static int unrolledFirstIndexOf(byte[] buffer, int fromIndex, int byteCount, byte value) { + if (buffer[fromIndex] == value) { + return fromIndex; + } + if (byteCount == 1) { + return -1; + } + if (buffer[fromIndex + 1] == value) { + return fromIndex + 1; + } + if (byteCount == 2) { + return -1; + } + if (buffer[fromIndex + 2] == value) { + return fromIndex + 2; + } + if (byteCount == 3) { + return -1; + } + if (buffer[fromIndex + 3] == value) { + return fromIndex + 3; + } + if (byteCount == 4) { + return -1; + } + if (buffer[fromIndex + 4] == value) { + return fromIndex + 4; + } + if (byteCount == 5) { + return -1; + } + if (buffer[fromIndex + 5] == value) { + return fromIndex + 5; + } + if (byteCount == 6) { + return -1; + } + if (buffer[fromIndex + 6] == value) { + return fromIndex + 6; + } + return -1; + } +} diff --git a/libs/simdvec/src/main/java/org/elasticsearch/simdvec/internal/vectorization/DefaultESVectorUtilSupport.java b/libs/simdvec/src/main/java/org/elasticsearch/simdvec/internal/vectorization/DefaultESVectorUtilSupport.java index c78970a0c8794..5bc8d024ccebd 100644 --- a/libs/simdvec/src/main/java/org/elasticsearch/simdvec/internal/vectorization/DefaultESVectorUtilSupport.java +++ b/libs/simdvec/src/main/java/org/elasticsearch/simdvec/internal/vectorization/DefaultESVectorUtilSupport.java @@ -403,4 +403,9 @@ public static void transposeHalfByteImpl(int[] q, byte[] quantQueryByte) { quantQueryByte[index + quantQueryByte.length / 2] = (byte) upperMiddleByte; quantQueryByte[index + 3 * quantQueryByte.length / 4] = (byte) upperByte; } + + @Override + public int indexOf(byte[] bytes, int offset, int length, byte marker) { + return ByteArrayUtils.indexOf(bytes, offset, length, marker); + } } diff --git a/libs/simdvec/src/main/java/org/elasticsearch/simdvec/internal/vectorization/ESVectorUtilSupport.java b/libs/simdvec/src/main/java/org/elasticsearch/simdvec/internal/vectorization/ESVectorUtilSupport.java index 08c256051661e..48c3fa9675f35 100644 --- a/libs/simdvec/src/main/java/org/elasticsearch/simdvec/internal/vectorization/ESVectorUtilSupport.java +++ b/libs/simdvec/src/main/java/org/elasticsearch/simdvec/internal/vectorization/ESVectorUtilSupport.java @@ -67,4 +67,6 @@ void soarDistanceBulk( void packAsBinary(int[] vector, byte[] packed); void transposeHalfByte(int[] q, byte[] quantQueryByte); + + int indexOf(byte[] bytes, int offset, int length, byte marker); } diff --git a/libs/simdvec/src/main21/java/org/elasticsearch/simdvec/internal/vectorization/PanamaESVectorUtilSupport.java b/libs/simdvec/src/main21/java/org/elasticsearch/simdvec/internal/vectorization/PanamaESVectorUtilSupport.java index 62637a621cd0b..2ede0df76970c 100644 --- a/libs/simdvec/src/main21/java/org/elasticsearch/simdvec/internal/vectorization/PanamaESVectorUtilSupport.java +++ b/libs/simdvec/src/main21/java/org/elasticsearch/simdvec/internal/vectorization/PanamaESVectorUtilSupport.java @@ -1122,4 +1122,26 @@ private void transposeHalfByte128(int[] q, byte[] quantQueryByte) { quantQueryByte[index + quantQueryByte.length / 2] = (byte) upperMiddleByte; quantQueryByte[index + 3 * quantQueryByte.length / 4] = (byte) upperByte; } + + @Override + public int indexOf(final byte[] bytes, final int offset, final int length, final byte marker) { + final ByteVector markerVector = ByteVector.broadcast(ByteVector.SPECIES_PREFERRED, marker); + final int loopBound = ByteVector.SPECIES_PREFERRED.loopBound(length); + for (int i = 0; i < loopBound; i += ByteVector.SPECIES_PREFERRED.length()) { + ByteVector chunk = ByteVector.fromArray(ByteVector.SPECIES_PREFERRED, bytes, offset + i); + VectorMask mask = chunk.eq(markerVector); + if (mask.anyTrue()) { + return i + mask.firstTrue(); + } + } + // tail + if (loopBound < length) { + int remaining = length - loopBound; + int tail = ByteArrayUtils.indexOf(bytes, offset + loopBound, remaining, marker); + if (tail >= 0) { + return loopBound + tail; + } + } + return -1; + } } diff --git a/libs/simdvec/src/test/java/org/elasticsearch/simdvec/ESVectorUtilTests.java b/libs/simdvec/src/test/java/org/elasticsearch/simdvec/ESVectorUtilTests.java index 24aff1107d7e7..89f9352976669 100644 --- a/libs/simdvec/src/test/java/org/elasticsearch/simdvec/ESVectorUtilTests.java +++ b/libs/simdvec/src/test/java/org/elasticsearch/simdvec/ESVectorUtilTests.java @@ -440,4 +440,91 @@ public static int popcount(byte[] a, int aOffset, byte[] b, int length) { } return res; } + + // -- indexOf + + static final Class IOOBE = IndexOutOfBoundsException.class; + + public void testIndexOfBounds() { + int iterations = atLeast(50); + for (int i = 0; i < iterations; i++) { + int size = random().nextInt(2, 5000); + var bytes = new byte[size]; + expectThrows(IOOBE, () -> ESVectorUtil.indexOf(bytes, 0, bytes.length + 1, (byte) 0x0A)); + expectThrows(IOOBE, () -> ESVectorUtil.indexOf(bytes, 1, bytes.length, (byte) 0x0A)); + expectThrows(IOOBE, () -> ESVectorUtil.indexOf(bytes, bytes.length, 1, (byte) 0x0A)); + expectThrows(IOOBE, () -> ESVectorUtil.indexOf(bytes, bytes.length - 1, 2, (byte) 0x0A)); + expectThrows(IOOBE, () -> ESVectorUtil.indexOf(bytes, randomIntBetween(2, size), bytes.length, (byte) 0x0A)); + } + } + + public void testIndexOfSimple() { + int iterations = atLeast(50); + for (int i = 0; i < iterations; i++) { + int size = random().nextInt(2, 5000); + var bytes = new byte[size]; + byte marker = (byte) 0x0A; + int markerIdx = randomIntBetween(0, bytes.length - 1); + bytes[markerIdx] = marker; + + assertEquals(markerIdx, ESVectorUtil.indexOf(bytes, 0, bytes.length, marker)); + assertEquals(markerIdx, defaultedProvider.getVectorUtilSupport().indexOf(bytes, 0, bytes.length, marker)); + assertEquals(markerIdx, defOrPanamaProvider.getVectorUtilSupport().indexOf(bytes, 0, bytes.length, marker)); + + bytes = new byte[size]; + bytes[bytes.length - 1] = marker; + assertEquals(bytes.length - 1, ESVectorUtil.indexOf(bytes, 0, bytes.length, marker)); + assertEquals(bytes.length - 1, defaultedProvider.getVectorUtilSupport().indexOf(bytes, 0, bytes.length, marker)); + assertEquals(bytes.length - 1, defOrPanamaProvider.getVectorUtilSupport().indexOf(bytes, 0, bytes.length, marker)); + + assertEquals(bytes.length - 2, ESVectorUtil.indexOf(bytes, 1, bytes.length - 1, marker)); + assertEquals(bytes.length - 2, defaultedProvider.getVectorUtilSupport().indexOf(bytes, 1, bytes.length - 1, marker)); + assertEquals(bytes.length - 2, defOrPanamaProvider.getVectorUtilSupport().indexOf(bytes, 1, bytes.length - 1, marker)); + + // not found + assertEquals(-1, ESVectorUtil.indexOf(bytes, 0, bytes.length - 1, marker)); + assertEquals(-1, defaultedProvider.getVectorUtilSupport().indexOf(bytes, 0, bytes.length - 1, marker)); + assertEquals(-1, defOrPanamaProvider.getVectorUtilSupport().indexOf(bytes, 0, bytes.length - 1, marker)); + + bytes = new byte[size]; + bytes[0] = marker; + assertEquals(0, ESVectorUtil.indexOf(bytes, 0, bytes.length, marker)); + assertEquals(0, defaultedProvider.getVectorUtilSupport().indexOf(bytes, 0, bytes.length, marker)); + assertEquals(0, defOrPanamaProvider.getVectorUtilSupport().indexOf(bytes, 0, bytes.length, marker)); + + // not found + assertEquals(-1, ESVectorUtil.indexOf(bytes, 1, bytes.length - 1, marker)); + assertEquals(-1, defaultedProvider.getVectorUtilSupport().indexOf(bytes, 1, bytes.length - 1, marker)); + assertEquals(-1, defOrPanamaProvider.getVectorUtilSupport().indexOf(bytes, 1, bytes.length - 1, marker)); + } + } + + public void testIndexOfRandom() { + int iterations = atLeast(50); + for (int i = 0; i < iterations; i++) { + int size = random().nextInt(2, 5000); + var bytes = new byte[size]; + random().nextBytes(bytes); + byte marker = randomByte(); + int markerIdx = randomIntBetween(0, bytes.length - 1); + bytes[markerIdx] = marker; + + final int offset = randomIntBetween(0, bytes.length - 2); + final int length = randomIntBetween(0, bytes.length - offset); + final int expectedIdx = scalarIndexOf(bytes, offset, length, marker); + assertEquals(expectedIdx, ESVectorUtil.indexOf(bytes, offset, length, marker)); + assertEquals(expectedIdx, defaultedProvider.getVectorUtilSupport().indexOf(bytes, offset, length, marker)); + assertEquals(expectedIdx, defOrPanamaProvider.getVectorUtilSupport().indexOf(bytes, offset, length, marker)); + } + } + + static int scalarIndexOf(byte[] bytes, final int offset, final int length, final byte marker) { + final int end = offset + length; + for (int i = offset; i < end; i++) { + if (bytes[i] == marker) { + return i - offset; + } + } + return -1; + } } diff --git a/server/src/main/java/org/elasticsearch/common/bytes/BytesArray.java b/server/src/main/java/org/elasticsearch/common/bytes/BytesArray.java index 811c3c1c9875b..ab6a5a49fec3f 100644 --- a/server/src/main/java/org/elasticsearch/common/bytes/BytesArray.java +++ b/server/src/main/java/org/elasticsearch/common/bytes/BytesArray.java @@ -13,6 +13,7 @@ import org.apache.lucene.util.BytesRefIterator; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.util.ByteUtils; +import org.elasticsearch.simdvec.ESVectorUtil; import java.io.IOException; import java.io.OutputStream; @@ -59,90 +60,9 @@ public byte get(int index) { @Override public int indexOf(byte marker, int from) { - final int len = length - from; - // cache object fields (even when final this is a valid optimization, see https://openjdk.org/jeps/8132243) - final int offsetAsLocal = offset; - int off = offsetAsLocal + from; - final int toIndex = offsetAsLocal + length; - final byte[] bytesAsLocal = bytes; - // First, try to find the marker in the first few bytes, so we can enter the faster 8-byte aligned loop below. - // The idea for this logic is taken from Netty's io.netty.buffer.ByteBufUtil.firstIndexOf and optimized for little endian hardware. - // See e.g. https://richardstartin.github.io/posts/finding-bytes for the idea behind this optimization. - final int byteCount = len & 7; - if (byteCount > 0) { - final int index = unrolledFirstIndexOf(bytesAsLocal, off, byteCount, marker); - if (index != -1) { - return index - offsetAsLocal; - } - off += byteCount; - if (off == toIndex) { - return -1; - } - } - final int longCount = len >>> 3; - // faster SWAR (SIMD Within A Register) loop - final long pattern = compilePattern(marker); - for (int i = 0; i < longCount; i++) { - int index = findInLong(ByteUtils.readLongLE(bytesAsLocal, off), pattern); - if (index < Long.BYTES) { - return off + index - offsetAsLocal; - } - off += Long.BYTES; - } - return -1; - } - - private static long compilePattern(byte byteToFind) { - return (byteToFind & 0xFFL) * 0x101010101010101L; - } - - private static int findInLong(long word, long pattern) { - long input = word ^ pattern; - long tmp = (input & 0x7F7F7F7F7F7F7F7FL) + 0x7F7F7F7F7F7F7F7FL; - tmp = ~(tmp | input | 0x7F7F7F7F7F7F7F7FL); - final int binaryPosition = Long.numberOfTrailingZeros(tmp); - return binaryPosition >>> 3; - } - - private static int unrolledFirstIndexOf(byte[] buffer, int fromIndex, int byteCount, byte value) { - if (buffer[fromIndex] == value) { - return fromIndex; - } - if (byteCount == 1) { - return -1; - } - if (buffer[fromIndex + 1] == value) { - return fromIndex + 1; - } - if (byteCount == 2) { - return -1; - } - if (buffer[fromIndex + 2] == value) { - return fromIndex + 2; - } - if (byteCount == 3) { - return -1; - } - if (buffer[fromIndex + 3] == value) { - return fromIndex + 3; - } - if (byteCount == 4) { - return -1; - } - if (buffer[fromIndex + 4] == value) { - return fromIndex + 4; - } - if (byteCount == 5) { - return -1; - } - if (buffer[fromIndex + 5] == value) { - return fromIndex + 5; - } - if (byteCount == 6) { - return -1; - } - if (buffer[fromIndex + 6] == value) { - return fromIndex + 6; + int idx = ESVectorUtil.indexOf(bytes, offset + from, length - from, marker); + if (idx >= 0) { + return from + idx; } return -1; }