elastic · ChrisHegarty · Sep 20, 2025 · Sep 19, 2025 · Sep 19, 2025 · Sep 19, 2025
diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/bytes/BytesArrayIndexOfBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/bytes/BytesArrayIndexOfBenchmark.java
@@ -0,0 +1,74 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+package org.elasticsearch.benchmark.bytes;
+
+import org.elasticsearch.common.bytes.BytesArray;
+import org.elasticsearch.common.logging.LogConfigurator;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+
+import java.util.concurrent.TimeUnit;
+
+@Warmup(iterations = 4, time = 1)
+@Measurement(iterations = 5, time = 1)
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Thread)
+@Fork(value = 1)
+public class BytesArrayIndexOfBenchmark {
+
+    static {
+        LogConfigurator.configureESLogging(); // native access requires logging to be initialized
+    }
+
+    static final byte MARKER = (byte) '\n';
+
+    @Param(value = { "64", "127", "128", "4096", "16384", "65536", "1048576" })
+    public int size;
+
+    BytesArray bytesArray;
+
+    @Setup
+    public void setup() {
+        byte[] bytes = new byte[size];
+        bytes[bytes.length - 1] = MARKER;
+        bytesArray = new BytesArray(bytes, 0, bytes.length);
+    }
+
+    @Benchmark
+    public int indexOf() {
+        return bytesArray.indexOf(MARKER, 0);
+    }
+
+    @Benchmark
+    @Fork(jvmArgsPrepend = { "--add-modules=jdk.incubator.vector" })
+    public int indexOfPanama() {
+        return bytesArray.indexOf(MARKER, 0);
+    }
+
+    @Benchmark
+    public int withOffsetIndexOf() {
+        return bytesArray.indexOf(MARKER, 1);
+    }
+
+    @Benchmark
+    @Fork(jvmArgsPrepend = { "--add-modules=jdk.incubator.vector" })
+    public int withOffsetIndexPanama() {
+        return bytesArray.indexOf(MARKER, 1);
+    }
+}
diff --git a/...arks/src/test/java/org/elasticsearch/benchmark/bytes/BytesArrayIndexOfBenchmarkTests.java b/...arks/src/test/java/org/elasticsearch/benchmark/bytes/BytesArrayIndexOfBenchmarkTests.java
@@ -0,0 +1,46 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.benchmark.bytes;
+
+import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
+
+import org.elasticsearch.test.ESTestCase;
+import org.openjdk.jmh.annotations.Param;
+
+import java.util.Arrays;
+
+public class BytesArrayIndexOfBenchmarkTests extends ESTestCase {
+
+    final int size;
+
+    public BytesArrayIndexOfBenchmarkTests(int size) {
+        this.size = size;
+    }
+
+    public void testIndexOf() {
+        var bench = new BytesArrayIndexOfBenchmark();
+        bench.size = size;
+        bench.setup();
+        assertEquals(size - 1, bench.indexOf());
+        assertEquals(size - 1, bench.indexOfPanama());
+        assertEquals(size - 1, bench.withOffsetIndexOf());
+        assertEquals(size - 1, bench.withOffsetIndexPanama());
+    }
+
+    @ParametersFactory
+    public static Iterable<Object[]> parametersFactory() {
+        try {
+            var params = BytesArrayIndexOfBenchmark.class.getField("size").getAnnotationsByType(Param.class)[0].value();
+            return () -> Arrays.stream(params).map(Integer::parseInt).map(i -> new Object[] { i }).iterator();
+        } catch (NoSuchFieldException e) {
+            throw new AssertionError(e);
+        }
+    }
+}
diff --git a/docs/changelog/135087.yaml b/docs/changelog/135087.yaml
@@ -0,0 +1,5 @@
+pr: 135087
+summary: "Optimize `BytesArray::indexOf,` which is used heavily in ndjson parsing"
+area: "Performance"
+type: feature
+issues: []
diff --git a/libs/simdvec/src/main/java/org/elasticsearch/simdvec/ESVectorUtil.java b/libs/simdvec/src/main/java/org/elasticsearch/simdvec/ESVectorUtil.java
@@ -19,6 +19,7 @@
 import java.lang.invoke.MethodHandle;
 import java.lang.invoke.MethodHandles;
 import java.lang.invoke.MethodType;
+import java.util.Objects;
 
 import static org.elasticsearch.simdvec.internal.vectorization.ESVectorUtilSupport.B_QUERY;
 
@@ -399,4 +400,22 @@ public static void transposeHalfByte(int[] q, byte[] quantQueryByte) {
         }
         IMPL.transposeHalfByte(q, quantQueryByte);
     }
+
+    /**
+     * Searches for the first occurrence of the given marker byte in the specified range of the array.
+     *
+     * <p>The search starts at {@code offset} and examines at most {@code length} bytes. The return
+     * value is the relative index of the first occurrence of {@code marker} within this slice,
+     * or {@code -1} if not found.
+     *
+     * @param bytes  the byte array to search
+     * @param offset the starting index within the array
+     * @param length the number of bytes to examine
+     * @param marker the byte to search for
+     * @return the relative index (0..length-1) of the first match, or {@code -1} if not found
+     */
+    public static int indexOf(byte[] bytes, int offset, int length, byte marker) {
+        Objects.checkFromIndexSize(offset, length, bytes.length);
+        return IMPL.indexOf(bytes, offset, length, marker);
+    }
 }
diff --git a/...imdvec/src/main/java/org/elasticsearch/simdvec/internal/vectorization/ByteArrayUtils.java b/...imdvec/src/main/java/org/elasticsearch/simdvec/internal/vectorization/ByteArrayUtils.java
@@ -0,0 +1,110 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.simdvec.internal.vectorization;
+
+import static org.apache.lucene.util.BitUtil.VH_LE_LONG;
+
+/** Byte array utilities. */
+final class ByteArrayUtils {
+
+    /**
+     * Implementation of {@link ESVectorUtilSupport#indexOf(byte[], int, int, byte)} using fast
+     * SWAR (SIMD Within A Register) loop.
+     */
+    static int indexOf(final byte[] bytes, final int offset, final int len, final byte marker) {
+        final int end = offset + len;
+        int i = offset;
+
+        // First, try to find the marker in the first few bytes, so we can enter the faster 8-byte aligned loop below.
+        // The idea for this logic is taken from Netty's io.netty.buffer.ByteBufUtil.firstIndexOf and optimized for little endian hardware.
+        // See e.g. https://richardstartin.github.io/posts/finding-bytes for the idea behind this optimization.
+        final int byteCount = len & 7;
+        if (byteCount > 0) {
+            final int index = unrolledFirstIndexOf(bytes, i, byteCount, marker);
+            if (index != -1) {
+                return index - offset;
+            }
+            i += byteCount;
+            if (i == end) {
+                return -1;
+            }
+        }
+        final int longCount = len >>> 3;
+        // faster SWAR (SIMD Within A Register) loop
+        final long pattern = compilePattern(marker);
+        for (int j = 0; j < longCount; j++) {
+            int index = findInLong(readLongLE(bytes, i), pattern);
+            if (index < Long.BYTES) {
+                return i + index - offset;
+            }
+            i += Long.BYTES;
+        }
+        return -1;
+    }
+
+    private static long readLongLE(byte[] arr, int offset) {
+        return (long) VH_LE_LONG.get(arr, offset);
+    }
+
+    private static long compilePattern(byte byteToFind) {
+        return (byteToFind & 0xFFL) * 0x101010101010101L;
+    }
+
+    private static int findInLong(long word, long pattern) {
+        long input = word ^ pattern;
+        long tmp = (input & 0x7F7F7F7F7F7F7F7FL) + 0x7F7F7F7F7F7F7F7FL;
+        tmp = ~(tmp | input | 0x7F7F7F7F7F7F7F7FL);
+        final int binaryPosition = Long.numberOfTrailingZeros(tmp);
+        return binaryPosition >>> 3;
+    }
+
+    private static int unrolledFirstIndexOf(byte[] buffer, int fromIndex, int byteCount, byte value) {
+        if (buffer[fromIndex] == value) {
+            return fromIndex;
+        }
+        if (byteCount == 1) {
+            return -1;
+        }
+        if (buffer[fromIndex + 1] == value) {
+            return fromIndex + 1;
+        }
+        if (byteCount == 2) {
+            return -1;
+        }
+        if (buffer[fromIndex + 2] == value) {
+            return fromIndex + 2;
+        }
+        if (byteCount == 3) {
+            return -1;
+        }
+        if (buffer[fromIndex + 3] == value) {
+            return fromIndex + 3;
+        }
+        if (byteCount == 4) {
+            return -1;
+        }
+        if (buffer[fromIndex + 4] == value) {
+            return fromIndex + 4;
+        }
+        if (byteCount == 5) {
+            return -1;
+        }
+        if (buffer[fromIndex + 5] == value) {
+            return fromIndex + 5;
+        }
+        if (byteCount == 6) {
+            return -1;
+        }
+        if (buffer[fromIndex + 6] == value) {
+            return fromIndex + 6;
+        }
+        return -1;
+    }
+}
diff --git a/...ain/java/org/elasticsearch/simdvec/internal/vectorization/DefaultESVectorUtilSupport.java b/...ain/java/org/elasticsearch/simdvec/internal/vectorization/DefaultESVectorUtilSupport.java
@@ -403,4 +403,9 @@ public static void transposeHalfByteImpl(int[] q, byte[] quantQueryByte) {
         quantQueryByte[index + quantQueryByte.length / 2] = (byte) upperMiddleByte;
         quantQueryByte[index + 3 * quantQueryByte.length / 4] = (byte) upperByte;
     }
+
+    @Override
+    public int indexOf(byte[] bytes, int offset, int length, byte marker) {
+        return ByteArrayUtils.indexOf(bytes, offset, length, marker);
+    }
 }
diff --git a/...c/src/main/java/org/elasticsearch/simdvec/internal/vectorization/ESVectorUtilSupport.java b/...c/src/main/java/org/elasticsearch/simdvec/internal/vectorization/ESVectorUtilSupport.java
@@ -67,4 +67,6 @@ void soarDistanceBulk(
     void packAsBinary(int[] vector, byte[] packed);
 
     void transposeHalfByte(int[] q, byte[] quantQueryByte);
+
+    int indexOf(byte[] bytes, int offset, int length, byte marker);
 }
diff --git a/...in21/java/org/elasticsearch/simdvec/internal/vectorization/PanamaESVectorUtilSupport.java b/...in21/java/org/elasticsearch/simdvec/internal/vectorization/PanamaESVectorUtilSupport.java
@@ -1122,4 +1122,26 @@ private void transposeHalfByte128(int[] q, byte[] quantQueryByte) {
         quantQueryByte[index + quantQueryByte.length / 2] = (byte) upperMiddleByte;
         quantQueryByte[index + 3 * quantQueryByte.length / 4] = (byte) upperByte;
     }
+
+    @Override
+    public int indexOf(final byte[] bytes, final int offset, final int length, final byte marker) {
+        final ByteVector markerVector = ByteVector.broadcast(ByteVector.SPECIES_PREFERRED, marker);
+        final int loopBound = ByteVector.SPECIES_PREFERRED.loopBound(length);
+        for (int i = 0; i < loopBound; i += ByteVector.SPECIES_PREFERRED.length()) {
+            ByteVector chunk = ByteVector.fromArray(ByteVector.SPECIES_PREFERRED, bytes, offset + i);
+            VectorMask<Byte> mask = chunk.eq(markerVector);
+            if (mask.anyTrue()) {
+                return i + mask.firstTrue();
+            }
+        }
+        // tail
+        if (loopBound < length) {
+            int remaining = length - loopBound;
+            int tail = ByteArrayUtils.indexOf(bytes, offset + loopBound, remaining, marker);
+            if (tail >= 0) {
+                return loopBound + tail;
+            }
+        }
+        return -1;
+    }
 }