Skip to content

Commit

Permalink
[ML] Fix Array out of bounds exception in the XLM Roberta tokenizer (#…
Browse files Browse the repository at this point in the history
…106655) (#106661)

Increases the buffer size for the normalised form of the input Unicode
character. Certain characters can have surprisingly long normalised forms.
  • Loading branch information
davidkyle committed Mar 22, 2024
1 parent 72aa514 commit 416589d
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 4 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/106655.yaml
@@ -0,0 +1,5 @@
pr: 106655
summary: Fix Array out of bounds exception in the XLM Roberta tokenizer
area: Machine Learning
type: bug
issues: []
Expand Up @@ -73,10 +73,8 @@ static Config fromBase64EncodedResource(String resourcePath) throws IOException
private final int[] offsets;
// The entire normalized bytes representations delimited by NULL
private final byte[] normalizedStrUtf8Bytes;
// Continually reused to copy a single char into utf8 bytes
private final byte[] reusableCharByteBuffer = new byte[4];
// reusable char buffer for decoding utf8 bytes to determine char offset corrections
private final char[] reusableCharDecodeBuffer = new char[8];
private final char[] reusableCharDecodeBuffer = new char[64];
private Reader transformedInput;

public PrecompiledCharMapNormalizer(int[] offsets, String normalizedStr, Reader in) {
Expand Down Expand Up @@ -172,7 +170,6 @@ Reader normalize(CharSequence str) {
ByteBuffer byteBuffer = StandardCharsets.UTF_8.encode(CharBuffer.wrap(str));
byte[] strBytes = new byte[byteBuffer.limit()];
byteBuffer.get(strBytes);
int[] strCp = str.codePoints().toArray();
BreakIterator b = BreakIterator.getCharacterInstance(Locale.ROOT);
b.setText(str);
// We iterate the whole string, so b.first() is always `0`
Expand Down
Expand Up @@ -57,6 +57,11 @@ public void testEmoji() throws IOException {
assertNormalization("😀", parsed, "😀");
}

/**
 * Regression test for the array-out-of-bounds failure in the normalizer:
 * a single input character whose normalized form is a much longer,
 * multi-word phrase must be expanded in full.
 */
public void testCharThatNormalizesToLongText() throws IOException {
    // One ligature character in, a four-word phrase out.
    final String ligature = "ﷺ";
    final String expanded = "صلى الله عليه وسلم";
    PrecompiledCharMapNormalizer.Config charMapConfig = loadTestCharMap();
    assertNormalization(ligature, charMapConfig, expanded);
}

private void assertNormalization(String input, PrecompiledCharMapNormalizer.Config config, String expected) throws IOException {
PrecompiledCharMapNormalizer normalizer = new PrecompiledCharMapNormalizer(
config.offsets(),
Expand Down

0 comments on commit 416589d

Please sign in to comment.