From 416589d13eb785633bc01112144b8e884cba5ff7 Mon Sep 17 00:00:00 2001
From: David Kyle
Date: Fri, 22 Mar 2024 11:58:57 +0000
Subject: [PATCH] [ML] Fix Array out of bounds exception in the XLM Roberta
 tokenizer (#106655) (#106661)

Increases the buffer size for the normalised form of the input unicode
character. Certain characters can have surprisingly long normalised forms
---
 docs/changelog/106655.yaml                                        | 5 +++++
 .../inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java    | 5 +----
 .../nlp/tokenizers/PrecompiledCharMapNormalizerTests.java         | 5 +++++
 3 files changed, 11 insertions(+), 4 deletions(-)
 create mode 100644 docs/changelog/106655.yaml

diff --git a/docs/changelog/106655.yaml b/docs/changelog/106655.yaml
new file mode 100644
index 0000000000000..98078595d5f0c
--- /dev/null
+++ b/docs/changelog/106655.yaml
@@ -0,0 +1,5 @@
+pr: 106655
+summary: Fix Array out of bounds exception in the XLM Roberta tokenizer
+area: Machine Learning
+type: bug
+issues: []
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java
index 836c9a78f19d9..93dc8077196d7 100644
--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java
@@ -73,10 +73,8 @@ static Config fromBase64EncodedResource(String resourcePath) throws IOException
     private final int[] offsets;
     // The entire normalized bytes representations delimited by NULL
     private final byte[] normalizedStrUtf8Bytes;
-    // Continually reused to copy a single char into utf8 bytes
-    private final byte[] reusableCharByteBuffer = new byte[4];
     // reusable char buffer for decoding utf8 bytes to determine char offset corrections
-    private final char[] reusableCharDecodeBuffer = new char[8];
+    private final char[] reusableCharDecodeBuffer = new char[64];
     private Reader transformedInput;

     public PrecompiledCharMapNormalizer(int[] offsets, String normalizedStr, Reader in) {
@@ -172,7 +170,6 @@ Reader normalize(CharSequence str) {
         ByteBuffer byteBuffer = StandardCharsets.UTF_8.encode(CharBuffer.wrap(str));
         byte[] strBytes = new byte[byteBuffer.limit()];
         byteBuffer.get(strBytes);
-        int[] strCp = str.codePoints().toArray();
         BreakIterator b = BreakIterator.getCharacterInstance(Locale.ROOT);
         b.setText(str);
         // We iterate the whole string, so b.first() is always `0`
diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizerTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizerTests.java
index d542b97eee192..eef9902d35e59 100644
--- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizerTests.java
+++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizerTests.java
@@ -57,6 +57,11 @@ public void testEmoji() throws IOException {
         assertNormalization("😀", parsed, "😀");
     }

+    public void testCharThatNormalizesToLongText() throws IOException {
+        PrecompiledCharMapNormalizer.Config parsed = loadTestCharMap();
+        assertNormalization("ﷺ", parsed, "صلى الله عليه وسلم");
+    }
+
     private void assertNormalization(String input, PrecompiledCharMapNormalizer.Config config, String expected) throws IOException {
         PrecompiledCharMapNormalizer normalizer = new PrecompiledCharMapNormalizer(
             config.offsets(),