From 416589d13eb785633bc01112144b8e884cba5ff7 Mon Sep 17 00:00:00 2001
From: David Kyle
Date: Fri, 22 Mar 2024 11:58:57 +0000
Subject: [PATCH] [ML] Fix Array out of bounds exception in the XLM Roberta
 tokenizer (#106655) (#106661)

Increases the buffer size for the normalised form of the input unicode
character. Certain characters can have surprisingly long normalised forms
---
 docs/changelog/106655.yaml                                        | 5 +++++
 .../inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java    | 5 +----
 .../nlp/tokenizers/PrecompiledCharMapNormalizerTests.java         | 5 +++++
 3 files changed, 11 insertions(+), 4 deletions(-)
 create mode 100644 docs/changelog/106655.yaml

diff --git a/docs/changelog/106655.yaml b/docs/changelog/106655.yaml
new file mode 100644
index 0000000000000..98078595d5f0c
--- /dev/null
+++ b/docs/changelog/106655.yaml
@@ -0,0 +1,5 @@
+pr: 106655
+summary: Fix Array out of bounds exception in the XLM Roberta tokenizer
+area: Machine Learning
+type: bug
+issues: []
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java
index 836c9a78f19d9..93dc8077196d7 100644
--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java
@@ -73,10 +73,8 @@ static Config fromBase64EncodedResource(String resourcePath) throws IOException
     private final int[] offsets;
     // The entire normalized bytes representations delimited by NULL
     private final byte[] normalizedStrUtf8Bytes;
-    // Continually reused to copy a single char into utf8 bytes
-    private final byte[] reusableCharByteBuffer = new byte[4];
     // reusable char buffer for decoding utf8 bytes to determine char offset corrections
-    private final char[] reusableCharDecodeBuffer = new char[8];
+    private final char[] reusableCharDecodeBuffer = new char[64];
     private Reader transformedInput;

     public PrecompiledCharMapNormalizer(int[] offsets, String normalizedStr, Reader in) {
@@ -172,7 +170,6 @@ Reader normalize(CharSequence str) {
         ByteBuffer byteBuffer = StandardCharsets.UTF_8.encode(CharBuffer.wrap(str));
         byte[] strBytes = new byte[byteBuffer.limit()];
         byteBuffer.get(strBytes);
-        int[] strCp = str.codePoints().toArray();
         BreakIterator b = BreakIterator.getCharacterInstance(Locale.ROOT);
         b.setText(str);
         // We iterate the whole string, so b.first() is always `0`
diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizerTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizerTests.java
index d542b97eee192..eef9902d35e59 100644
--- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizerTests.java
+++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizerTests.java
@@ -57,6 +57,11 @@ public void testEmoji() throws IOException {
         assertNormalization("😀", parsed, "😀");
     }

+    public void testCharThatNormalizesToLongText() throws IOException {
+        PrecompiledCharMapNormalizer.Config parsed = loadTestCharMap();
+        assertNormalization("ﷺ", parsed, "صلى الله عليه وسلم");
+    }
+
     private void assertNormalization(String input, PrecompiledCharMapNormalizer.Config config, String expected) throws IOException {
         PrecompiledCharMapNormalizer normalizer = new PrecompiledCharMapNormalizer(
             config.offsets(),