[ML] Fix Array out of bounds exception in the XLM Roberta tokenizer (#106655) (#106661)

Increases the buffer size for the normalised form of the input Unicode character. Certain characters can have surprisingly long normalised forms.
elastic · Mar 22, 2024 · 416589d · 416589d
1 parent 72aa514
commit 416589d
Show file tree

Hide file tree

Showing 3 changed files with 11 additions and 4 deletions.
diff --git a/docs/changelog/106655.yaml b/docs/changelog/106655.yaml
@@ -0,0 +1,5 @@
+pr: 106655
+summary: Fix Array out of bounds exception in the XLM Roberta tokenizer
+area: Machine Learning
+type: bug
+issues: []
diff --git a/...ava/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java b/...ava/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java
@@ -73,10 +73,8 @@ static Config fromBase64EncodedResource(String resourcePath) throws IOException
     private final int[] offsets;
     // The entire normalized bytes representations delimited by NULL
     private final byte[] normalizedStrUtf8Bytes;
-    // Continually reused to copy a single char into utf8 bytes
-    private final byte[] reusableCharByteBuffer = new byte[4];
     // reusable char buffer for decoding utf8 bytes to determine char offset corrections
-    private final char[] reusableCharDecodeBuffer = new char[8];
+    private final char[] reusableCharDecodeBuffer = new char[64];
     private Reader transformedInput;
 
     public PrecompiledCharMapNormalizer(int[] offsets, String normalizedStr, Reader in) {
@@ -172,7 +170,6 @@ Reader normalize(CharSequence str) {
         ByteBuffer byteBuffer = StandardCharsets.UTF_8.encode(CharBuffer.wrap(str));
         byte[] strBytes = new byte[byteBuffer.limit()];
         byteBuffer.get(strBytes);
-        int[] strCp = str.codePoints().toArray();
         BreakIterator b = BreakIterator.getCharacterInstance(Locale.ROOT);
         b.setText(str);
         // We iterate the whole string, so b.first() is always `0`

diff --git a/...rg/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizerTests.java b/...rg/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizerTests.java
@@ -57,6 +57,11 @@ public void testEmoji() throws IOException {
         assertNormalization("😀", parsed, "😀");
     }
 
+    public void testCharThatNormalizesToLongText() throws IOException {
+        PrecompiledCharMapNormalizer.Config parsed = loadTestCharMap();
+        assertNormalization("ﷺ", parsed, "صلى الله عليه وسلم");
+    }
+
     private void assertNormalization(String input, PrecompiledCharMapNormalizer.Config config, String expected) throws IOException {
         PrecompiledCharMapNormalizer normalizer = new PrecompiledCharMapNormalizer(
             config.offsets(),