Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion CodenameOne/src/com/codename1/util/regex/RECharacter.java
Original file line number Diff line number Diff line change
Expand Up @@ -192,12 +192,30 @@ public static byte getType(char c) {
//# return CHAR_CLASSES[i][c - spaceIndex];
//# }
//# }
//# return UNASSIGNED;
//#else
if (c < 128) {
return CHAR_CLASSES[c];
}
//#endif
// The framework is compiled against the CLDC11 stub, which does not
// expose Character.getType or isLetter. Compose what we need from the
// available primitives. Letters that are neither cased nor digits
// (modifier letters, OTHER_LETTER such as CJK ideographs) still fall
// through to UNASSIGNED.
if (Character.isLowerCase(c)) {
return LOWERCASE_LETTER;
}
if (Character.isUpperCase(c)) {
return UPPERCASE_LETTER;
}
if (Character.isDigit(c)) {
return DECIMAL_DIGIT_NUMBER;
}
if (Character.isSpaceChar(c)) {
return SPACE_SEPARATOR;
}
return UNASSIGNED;
//#endif
}
//#endif

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,4 +70,94 @@ void testPosixClassesAndEscapes() throws Exception {
assertFalse(wordThenDigits.match("item-42"));
}

// Non-Latin coverage. Source files must remain ASCII-only (CI javac uses
// the platform default encoding), so non-ASCII test data is written with
// Java's backslash-u escape syntax inside string literals.
//
// U+00E7 c-cedilla (lower) U+00C7 C-cedilla (upper)
// U+03B1 Greek alpha (lower) U+03A3 Greek Sigma (upper)
// U+044F Cyrillic ya (lower) U+042F Cyrillic YA (upper)
// U+00BD vulgar fraction one-half (OTHER_NUMBER, not a decimal digit)
// U+20AC euro sign (CURRENCY_SYMBOL)
//
// The framework is compiled against the CLDC11 stub, which exposes only
// isLowerCase / isUpperCase / isDigit / isSpaceChar (no isLetter or
// getType). That is enough for cased letters in Latin / Greek / Cyrillic
// and decimal digits, but uncased letters such as CJK ideographs
// (OTHER_LETTER) cannot be classified and remain unmatched here.

@FormTest
void testPosixAlphaMatchesNonLatinLetters() throws Exception {
    // Anchored at both ends, so every character of the input must be [[:alpha:]].
    final RE lettersOnly = new RE("^[[:alpha:]]+$");

    // Each row is {input, failure message}; all of these must be accepted.
    final String[][] accepted = {
        {"\u00E7\u00C7", "Latin with cedilla"},
        {"\u03B1\u03A3", "Greek letters"},
        {"\u042F\u044F", "Cyrillic letters"},
        {"abc\u00E7\u03B1\u042F", "mixed scripts"},
    };
    for (String[] sample : accepted) {
        assertTrue(lettersOnly.match(sample[0]), sample[1]);
    }

    // Each row is {input, failure message}; none of these may be accepted.
    final String[][] rejected = {
        {"\u00E71", "letter followed by ASCII digit"},
        {"\u00BD", "vulgar fraction is not alpha"},
        {"\u20AC", "currency symbol is not alpha"},
    };
    for (String[] sample : rejected) {
        assertFalse(lettersOnly.match(sample[0]), sample[1]);
    }
}

@FormTest
void testPosixAlnumMatchesNonLatinLettersAndDigits() throws Exception {
    // Whole-input match against one or more [[:alnum:]] characters.
    final RE lettersAndDigits = new RE("^[[:alnum:]]+$");

    // Inputs made only of letters and decimal digits are accepted.
    final boolean cedillaWithDigits = lettersAndDigits.match("\u00E7123");
    assertTrue(cedillaWithDigits, "c-cedilla followed by digits");

    final boolean greekRun = lettersAndDigits.match("\u03B1\u03B2\u03B3");
    assertTrue(greekRun, "Greek run");

    final boolean mixedScripts = lettersAndDigits.match("abc\u042F9");
    assertTrue(mixedScripts, "ASCII + Cyrillic + digit");

    // Punctuation, fractions and symbols are rejected.
    final boolean withHyphen = lettersAndDigits.match("\u00E7-123");
    assertFalse(withHyphen, "hyphen breaks alnum");

    final boolean oneHalf = lettersAndDigits.match("\u00BD");
    assertFalse(oneHalf, "fraction is not alnum (not a decimal digit)");

    final boolean euroSign = lettersAndDigits.match("\u20AC");
    assertFalse(euroSign, "currency symbol is not alnum");
}

@FormTest
void testPosixLowerUpperOnNonLatinLetters() throws Exception {
    // --- [[:lower:]] ---------------------------------------------------
    final RE lowerOnly = new RE("^[[:lower:]]+$");

    // Rows are {input, failure message}.
    final String[][] lowerAccepted = {
        {"\u00E7", "c-cedilla is lower"},
        {"\u03B1", "Greek alpha is lower"},
        {"\u044F", "Cyrillic ya is lower"},
    };
    for (String[] sample : lowerAccepted) {
        assertTrue(lowerOnly.match(sample[0]), sample[1]);
    }

    final String[][] lowerRejected = {
        {"\u00C7", "C-cedilla is not lower"},
        {"\u042F", "Cyrillic YA is not lower"},
    };
    for (String[] sample : lowerRejected) {
        assertFalse(lowerOnly.match(sample[0]), sample[1]);
    }

    // --- [[:upper:]] ---------------------------------------------------
    final RE upperOnly = new RE("^[[:upper:]]+$");

    final String[][] upperAccepted = {
        {"\u00C7", "C-cedilla is upper"},
        {"\u03A3", "Greek Sigma is upper"},
        {"\u042F", "Cyrillic YA is upper"},
    };
    for (String[] sample : upperAccepted) {
        assertTrue(upperOnly.match(sample[0]), sample[1]);
    }

    // The lowercase counterpart must not be classified as upper.
    assertFalse(upperOnly.match("\u00E7"), "c-cedilla is not upper");
}

@FormTest
void testReportedAlphaAlnumCaptureBug() throws Exception {
    // Regression guard: this pattern used to silently miss identifiers whose
    // first character is a non-ASCII letter, because RECharacter.getType()
    // reported UNASSIGNED for every char >= 128.
    final RE labelledIdentifier = new RE("test:\\s*([[:alpha:]][[:alnum:]]*)");

    // Rows are {input, failure message, expected contents of group 1}.
    final String[][] scenarios = {
        {
            "test: \u00E7123",
            "alpha+alnum should match identifier starting with c-cedilla",
            "\u00E7123",
        },
        {
            "test: \u03B1\u03B2\u03B30",
            "alpha+alnum should match a Greek identifier",
            "\u03B1\u03B2\u03B30",
        },
        {
            "test: \u042F\u044F1",
            "alpha+alnum should match a Cyrillic identifier",
            "\u042F\u044F1",
        },
    };
    for (String[] scenario : scenarios) {
        // The capture group is read only after its own match call.
        assertTrue(labelledIdentifier.match(scenario[0]), scenario[1]);
        assertEquals(scenario[2], labelledIdentifier.getParen(1));
    }

    // An identifier must begin with [[:alpha:]], so a leading ASCII digit
    // is still refused.
    assertFalse(labelledIdentifier.match("test: 9abc"));
}

@FormTest
void testPosixDigitIsAsciiOnlyForOtherNumbers() throws Exception {
    // [[:digit:]] means decimal digits only: OTHER_NUMBER characters such as
    // vulgar fractions, as well as currency symbols and letters, must be
    // refused.
    final RE digitsOnly = new RE("^[[:digit:]]+$");

    // Rows are {input, failure message}.
    final String[][] nonDigits = {
        {"\u00BD", "one-half is not a decimal digit"},
        {"\u20AC", "euro sign is not a digit"},
        {"\u00E7", "letter is not a digit"},
    };
    for (String[] sample : nonDigits) {
        assertFalse(digitsOnly.match(sample[0]), sample[1]);
    }
}

}
Loading