diff --git a/CodenameOne/src/com/codename1/util/regex/RECharacter.java b/CodenameOne/src/com/codename1/util/regex/RECharacter.java index de2561c004..595237077e 100644 --- a/CodenameOne/src/com/codename1/util/regex/RECharacter.java +++ b/CodenameOne/src/com/codename1/util/regex/RECharacter.java @@ -192,12 +192,30 @@ public static byte getType(char c) { //# return CHAR_CLASSES[i][c - spaceIndex]; //# } //# } +//# return UNASSIGNED; //#else if (c < 128) { return CHAR_CLASSES[c]; } -//#endif + // The framework is compiled against the CLDC11 stub, which does not + // expose Character.getType or isLetter. Compose what we need from the + // available primitives. Uncased letters are not covered by these checks + // (modifier letters, OTHER_LETTER such as CJK ideographs) and still fall + // through to UNASSIGNED. NOTE(review): on Java SE isSpaceChar also accepts line/paragraph separators, reported here as SPACE_SEPARATOR - confirm intended. + if (Character.isLowerCase(c)) { + return LOWERCASE_LETTER; + } + if (Character.isUpperCase(c)) { + return UPPERCASE_LETTER; + } + if (Character.isDigit(c)) { + return DECIMAL_DIGIT_NUMBER; + } + if (Character.isSpaceChar(c)) { + return SPACE_SEPARATOR; + } return UNASSIGNED; +//#endif } //#endif diff --git a/maven/core-unittests/src/test/java/com/codename1/util/regex/RETest.java b/maven/core-unittests/src/test/java/com/codename1/util/regex/RETest.java index fbee325046..8617f4d8a7 100644 --- a/maven/core-unittests/src/test/java/com/codename1/util/regex/RETest.java +++ b/maven/core-unittests/src/test/java/com/codename1/util/regex/RETest.java @@ -70,4 +70,94 @@ void testPosixClassesAndEscapes() throws Exception { assertFalse(wordThenDigits.match("item-42")); } + // Non-Latin coverage. Source files must remain ASCII-only (CI javac uses + // the platform default encoding), so non-ASCII test data is written with + // Java's backslash-u escape syntax inside string literals. 
+ // + // U+00E7 c-cedilla (lower) U+00C7 C-cedilla (upper) + // U+03B1 Greek alpha (lower) U+03A3 Greek Sigma (upper) + // U+044F Cyrillic ya (lower) U+042F Cyrillic YA (upper) + // U+00BD vulgar fraction one-half (OTHER_NUMBER, not a decimal digit) + // U+20AC euro sign (CURRENCY_SYMBOL) + // + // The framework is compiled against the CLDC11 stub, which exposes only + // isLowerCase / isUpperCase / isDigit / isSpaceChar (no isLetter or + // getType). That is enough for cased letters in Latin / Greek / Cyrillic + // and decimal digits, but uncased letters such as CJK ideographs + // (OTHER_LETTER) cannot be classified and remain unmatched here. + + @FormTest + void testPosixAlphaMatchesNonLatinLetters() throws Exception { + RE alpha = new RE("^[[:alpha:]]+$"); + assertTrue(alpha.match("\u00E7\u00C7"), "Latin with cedilla"); + assertTrue(alpha.match("\u03B1\u03A3"), "Greek letters"); + assertTrue(alpha.match("\u042F\u044F"), "Cyrillic letters"); + assertTrue(alpha.match("abc\u00E7\u03B1\u042F"), "mixed scripts"); + + assertFalse(alpha.match("\u00E71"), "letter followed by ASCII digit"); + assertFalse(alpha.match("\u00BD"), "vulgar fraction is not alpha"); + assertFalse(alpha.match("\u20AC"), "currency symbol is not alpha"); + } + + @FormTest + void testPosixAlnumMatchesNonLatinLettersAndDigits() throws Exception { + RE alnum = new RE("^[[:alnum:]]+$"); + assertTrue(alnum.match("\u00E7123"), "c-cedilla followed by digits"); + assertTrue(alnum.match("\u03B1\u03B2\u03B3"), "Greek run"); + assertTrue(alnum.match("abc\u042F9"), "ASCII + Cyrillic + digit"); + + assertFalse(alnum.match("\u00E7-123"), "hyphen breaks alnum"); + assertFalse(alnum.match("\u00BD"), "fraction is not alnum (not a decimal digit)"); + assertFalse(alnum.match("\u20AC"), "currency symbol is not alnum"); + } + + @FormTest + void testPosixLowerUpperOnNonLatinLetters() throws Exception { + RE lower = new RE("^[[:lower:]]+$"); + assertTrue(lower.match("\u00E7"), "c-cedilla is lower"); + 
assertTrue(lower.match("\u03B1"), "Greek alpha is lower"); + assertTrue(lower.match("\u044F"), "Cyrillic ya is lower"); + assertFalse(lower.match("\u00C7"), "C-cedilla is not lower"); + assertFalse(lower.match("\u042F"), "Cyrillic YA is not lower"); + + RE upper = new RE("^[[:upper:]]+$"); + assertTrue(upper.match("\u00C7"), "C-cedilla is upper"); + assertTrue(upper.match("\u03A3"), "Greek Sigma is upper"); + assertTrue(upper.match("\u042F"), "Cyrillic YA is upper"); + assertFalse(upper.match("\u00E7"), "c-cedilla is not upper"); + } + + @FormTest + void testReportedAlphaAlnumCaptureBug() throws Exception { + // Regression: "test:\\s*([[:alpha:]][[:alnum:]]*)" used to silently fail + // to match identifiers that begin with a non-ASCII letter, because + // RECharacter.getType() returned UNASSIGNED for any char >= 128. + RE expression = new RE("test:\\s*([[:alpha:]][[:alnum:]]*)"); + + assertTrue(expression.match("test: \u00E7123"), + "alpha+alnum should match identifier starting with c-cedilla"); + assertEquals("\u00E7123", expression.getParen(1)); + + assertTrue(expression.match("test: \u03B1\u03B2\u03B30"), + "alpha+alnum should match a Greek identifier"); + assertEquals("\u03B1\u03B2\u03B30", expression.getParen(1)); + + assertTrue(expression.match("test: \u042F\u044F1"), + "alpha+alnum should match a Cyrillic identifier"); + assertEquals("\u042F\u044F1", expression.getParen(1)); + + // A leading ASCII digit is still rejected (must start with [[:alpha:]]). + assertFalse(expression.match("test: 9abc")); + } + + @FormTest + void testPosixDigitIsAsciiOnlyForOtherNumbers() throws Exception { + // [[:digit:]] matches decimal digits only; vulgar fractions / superscripts + // (OTHER_NUMBER), currency symbols and letters must not match. + RE digit = new RE("^[[:digit:]]+$"); + assertFalse(digit.match("\u00BD"), "one-half is not a decimal digit"); + assertFalse(digit.match("\u20AC"), "euro sign is not a digit"); + assertFalse(digit.match("\u00E7"), "letter is not a digit"); + } + }