From 25049de90fba35afac8ef30a6e36cfb2f43e66e8 Mon Sep 17 00:00:00 2001 From: Doug Ilijev Date: Wed, 11 Jan 2017 17:16:43 -0800 Subject: [PATCH] Fix incorrect range which causes incorrect matches. Updated range and added regression tests. Relevant lines of the UnicodeData.txt for Unicode 8.0 [1][2]: 01DE;LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON;Lu;0;L;00C4 0304;;;;N;LATIN CAPITAL LETTER A DIAERESIS MACRON;;;01DF; 01DF;LATIN SMALL LETTER A WITH DIAERESIS AND MACRON;Ll;0;L;00E4 0304;;;;N;LATIN SMALL LETTER A DIAERESIS MACRON;;01DE;;01DE 01E0;LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON;Lu;0;L;0226 0304;;;;N;LATIN CAPITAL LETTER A DOT MACRON;;;01E1; 01E1;LATIN SMALL LETTER A WITH DOT ABOVE AND MACRON;Ll;0;L;0227 0304;;;;N;LATIN SMALL LETTER A DOT MACRON;;01E0;;01E0 01E2;LATIN CAPITAL LETTER AE WITH MACRON;Lu;0;L;00C6 0304;;;;N;LATIN CAPITAL LETTER A E MACRON;;;01E3; 01E3;LATIN SMALL LETTER AE WITH MACRON;Ll;0;L;00E6 0304;;;;N;LATIN SMALL LETTER A E MACRON;;01E2;;01E2 01E4;LATIN CAPITAL LETTER G WITH STROKE;Lu;0;L;;;;;N;LATIN CAPITAL LETTER G BAR;;;01E5; 01E5;LATIN SMALL LETTER G WITH STROKE;Ll;0;L;;;;;N;LATIN SMALL LETTER G BAR;;01E4;;01E4 01E6;LATIN CAPITAL LETTER G WITH CARON;Lu;0;L;0047 030C;;;;N;LATIN CAPITAL LETTER G HACEK;;;01E7; 01E7;LATIN SMALL LETTER G WITH CARON;Ll;0;L;0067 030C;;;;N;LATIN SMALL LETTER G HACEK;;01E6;;01E6 01E8;LATIN CAPITAL LETTER K WITH CARON;Lu;0;L;004B 030C;;;;N;LATIN CAPITAL LETTER K HACEK;;;01E9; 01E9;LATIN SMALL LETTER K WITH CARON;Ll;0;L;006B 030C;;;;N;LATIN SMALL LETTER K HACEK;;01E8;;01E8 01EA;LATIN CAPITAL LETTER O WITH OGONEK;Lu;0;L;004F 0328;;;;N;LATIN CAPITAL LETTER O OGONEK;;;01EB; 01EB;LATIN SMALL LETTER O WITH OGONEK;Ll;0;L;006F 0328;;;;N;LATIN SMALL LETTER O OGONEK;;01EA;;01EA 01EC;LATIN CAPITAL LETTER O WITH OGONEK AND MACRON;Lu;0;L;01EA 0304;;;;N;LATIN CAPITAL LETTER O OGONEK MACRON;;;01ED; 01ED;LATIN SMALL LETTER O WITH OGONEK AND MACRON;Ll;0;L;01EB 0304;;;;N;LATIN SMALL LETTER O OGONEK MACRON;;01EC;;01EC 
01EE;LATIN CAPITAL LETTER EZH WITH CARON;Lu;0;L;01B7 030C;;;;N;LATIN CAPITAL LETTER YOGH HACEK;;;01EF; 01EF;LATIN SMALL LETTER EZH WITH CARON;Ll;0;L;0292 030C;;;;N;LATIN SMALL LETTER YOGH HACEK;;01EE;;01EE 01F0;LATIN SMALL LETTER J WITH CARON;Ll;0;L;006A 030C;;;;N;LATIN SMALL LETTER J HACEK;;;; 01F1;LATIN CAPITAL LETTER DZ;Lu;0;L;<compat> 0044 005A;;;;N;;;;01F3;01F2 DZ (uppercase) 01F2;LATIN CAPITAL LETTER D WITH SMALL LETTER Z;Lt;0;L;<compat> 0044 007A;;;;N;;;01F1;01F3;01F2 DZ (titlecase) 01F3;LATIN SMALL LETTER DZ;Ll;0;L;<compat> 0064 007A;;;;N;;;01F1;;01F2 DZ (lowercase) 01F4;LATIN CAPITAL LETTER G WITH ACUTE;Lu;0;L;0047 0301;;;;N;;;;01F5; [3] 01F5;LATIN SMALL LETTER G WITH ACUTE;Ll;0;L;0067 0301;;;;N;;;01F4;;01F4 [3] -- [1] Currently fixing bugs in Unicode 8.0 because the source code claims compliance with Unicode 8.0 at the moment. Will update to Unicode 9.0 later. [2] These lines in Unicode 8.0 are equivalent to the lines in Unicode 9.0. [3] Already included in the table as a pair mapping. --- lib/Parser/CaseInsensitive.cpp | 2 +- test/es6/regex-unicode-CaseInsensitive.js | 78 +++++++++++++++++++++++ test/es6/rlexe.xml | 5 ++ 3 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 test/es6/regex-unicode-CaseInsensitive.js diff --git a/lib/Parser/CaseInsensitive.cpp b/lib/Parser/CaseInsensitive.cpp index c6c4cc28029..f8ab41e5597 100644 --- a/lib/Parser/CaseInsensitive.cpp +++ b/lib/Parser/CaseInsensitive.cpp @@ -295,7 +295,7 @@ END { 1, MappingSource::UnicodeData, 0x01cc, 0x01cc, -2, -1, 0, 0, 2, MappingSource::UnicodeData, 0x01cd, 0x01dc, -1, 1, 1, 1, 1, MappingSource::UnicodeData, 0x01dd, 0x01dd, -79, 0, 0, 0, - 2, MappingSource::UnicodeData, 0x01de, 0x01f5, -1, 1, 1, 1, + 2, MappingSource::UnicodeData, 0x01de, 0x01ef, -1, 1, 1, 1, 1, MappingSource::UnicodeData, 0x01f1, 0x01f1, 0, 1, 2, 2, 1, MappingSource::UnicodeData, 0x01f2, 0x01f2, -1, 0, 1, 1, 1, MappingSource::UnicodeData, 0x01f3, 0x01f3, -2, -1, 0, 0, diff --git a/test/es6/regex-unicode-CaseInsensitive.js 
b/test/es6/regex-unicode-CaseInsensitive.js new file mode 100644 index 00000000000..b066f23db6c --- /dev/null +++ b/test/es6/regex-unicode-CaseInsensitive.js @@ -0,0 +1,78 @@ +//------------------------------------------------------------------------------------------------------- +// Copyright (C) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information. +//------------------------------------------------------------------------------------------------------- + +function assertMatches(re, codepoint, str) { + let passed = re.test(str); + if (!passed) { + console.log("FAILED -- regex: " + re.toString() + " should match codepoint: " + codepoint.toString(16)); + } +} + +function assertDoesNotMatch(re, codepoint, str) { + let passed = re.test(str); + if (passed) { + console.log("FAILED -- regex: " + re.toString() + " should not match codepoint: " + codepoint.toString(16)); + } +} + +// Detect regressions in the CaseInsensitive table + +// 01BA != 01BB under /i. 
+assertDoesNotMatch(/\u{01ba}/iu, 0x01bb, "\u01bb"); +assertDoesNotMatch(/\u{01bb}/iu, 0x01ba, "\u01ba"); + +// 01F0 doesn't match anything +assertDoesNotMatch(/\u{01f0}/iu, 0x01f1, "\u01f1"); +assertDoesNotMatch(/\u{01f1}/iu, 0x01f0, "\u01f0"); + +// 01F4-5 match (G with ACUTE) +assertMatches(/\u{01f4}/iu, 0x01f5, "\u01f5"); +assertMatches(/\u{01f5}/iu, 0x01f4, "\u01f4"); + +// +// Latin ligature triples DZ WITH CARON, LJ, NJ (01C4-01CC); DZ (01F1-3) +// + +assertMatches(/\u{01c4}/iu, 0x01c4, '\u{01c4}'); +assertMatches(/\u{01c4}/iu, 0x01c5, '\u{01c5}'); +assertMatches(/\u{01c4}/iu, 0x01c6, '\u{01c6}'); +assertMatches(/\u{01c5}/iu, 0x01c4, '\u{01c4}'); +assertMatches(/\u{01c5}/iu, 0x01c5, '\u{01c5}'); +assertMatches(/\u{01c5}/iu, 0x01c6, '\u{01c6}'); +assertMatches(/\u{01c6}/iu, 0x01c4, '\u{01c4}'); +assertMatches(/\u{01c6}/iu, 0x01c5, '\u{01c5}'); +assertMatches(/\u{01c6}/iu, 0x01c6, '\u{01c6}'); + +assertMatches(/\u{01c7}/iu, 0x01c7, '\u{01c7}'); +assertMatches(/\u{01c7}/iu, 0x01c8, '\u{01c8}'); +assertMatches(/\u{01c7}/iu, 0x01c9, '\u{01c9}'); +assertMatches(/\u{01c9}/iu, 0x01c7, '\u{01c7}'); +assertMatches(/\u{01c9}/iu, 0x01c8, '\u{01c8}'); +assertMatches(/\u{01c9}/iu, 0x01c9, '\u{01c9}'); +assertMatches(/\u{01c8}/iu, 0x01c7, '\u{01c7}'); +assertMatches(/\u{01c8}/iu, 0x01c8, '\u{01c8}'); +assertMatches(/\u{01c8}/iu, 0x01c9, '\u{01c9}'); + +assertMatches(/\u{01ca}/iu, 0x01ca, '\u{01ca}'); +assertMatches(/\u{01ca}/iu, 0x01cb, '\u{01cb}'); +assertMatches(/\u{01ca}/iu, 0x01cc, '\u{01cc}'); +assertMatches(/\u{01cb}/iu, 0x01ca, '\u{01ca}'); +assertMatches(/\u{01cb}/iu, 0x01cb, '\u{01cb}'); +assertMatches(/\u{01cb}/iu, 0x01cc, '\u{01cc}'); +assertMatches(/\u{01cc}/iu, 0x01ca, '\u{01ca}'); +assertMatches(/\u{01cc}/iu, 0x01cb, '\u{01cb}'); +assertMatches(/\u{01cc}/iu, 0x01cc, '\u{01cc}'); + +assertMatches(/\u{01f1}/iu, 0x01f1, '\u{01f1}'); +assertMatches(/\u{01f1}/iu, 0x01f2, '\u{01f2}'); +assertMatches(/\u{01f1}/iu, 0x01f3, '\u{01f3}'); +assertMatches(/\u{01f2}/iu, 
0x01f2, '\u{01f2}'); +assertMatches(/\u{01f2}/iu, 0x01f1, '\u{01f1}'); +assertMatches(/\u{01f2}/iu, 0x01f3, '\u{01f3}'); +assertMatches(/\u{01f3}/iu, 0x01f1, '\u{01f1}'); +assertMatches(/\u{01f3}/iu, 0x01f2, '\u{01f2}'); +assertMatches(/\u{01f3}/iu, 0x01f3, '\u{01f3}'); + +console.log("PASS"); diff --git a/test/es6/rlexe.xml b/test/es6/rlexe.xml index 236520f42c9..013e3f36901 100644 --- a/test/es6/rlexe.xml +++ b/test/es6/rlexe.xml @@ -997,6 +997,11 @@ <compile-flags>-args summary -endargs</compile-flags> </default> </test> + <test> + <default> + <files>regex-unicode-CaseInsensitive.js</files> + </default> + </test> <test> <default> <files>regex-set.js</files>