feat: add option allowEscape to no-misleading-character-class rule (

#18208) * feat: add option `allowEscape` to `no-misleading-character-class` rule * fix and clarify docs * add `incorrect` / `correct` tags * change a test case * don't allow unescaped combining marks
eslint · May 10, 2024 · 069aa68 · 069aa68
1 parent 7065196
commit 069aa68
Show file tree

Hide file tree

Showing 3 changed files with 286 additions and 36 deletions.
diff --git a/docs/src/rules/no-misleading-character-class.md b/docs/src/rules/no-misleading-character-class.md
@@ -7,10 +7,10 @@ rule_type: problem
 
 
 
-Unicode includes the characters which are made with multiple code points.
-RegExp character class syntax (`/[abc]/`) cannot handle characters which are made by multiple code points as a character; those characters will be dissolved to each code point. For example, `❇️` is made by `❇` (`U+2747`) and VARIATION SELECTOR-16 (`U+FE0F`). If this character is in RegExp character class, it will match to either `❇` (`U+2747`) or VARIATION SELECTOR-16 (`U+FE0F`) rather than `❇️`.
+Unicode includes characters which are made by multiple code points.
+RegExp character class syntax (`/[abc]/`) cannot handle characters which are made by multiple code points as a character; those characters will be dissolved to each code point. For example, `❇️` is made by `❇` (`U+2747`) and VARIATION SELECTOR-16 (`U+FE0F`). If this character is in a RegExp character class, it will match either `❇` (`U+2747`) or VARIATION SELECTOR-16 (`U+FE0F`) rather than `❇️`.
 
-This rule reports the regular expressions which include multiple code point characters in character class syntax. This rule considers the following characters as multiple code point characters.
+This rule reports regular expressions which include multiple code point characters in character class syntax. This rule considers the following characters as multiple code point characters.
 
 **A character with combining characters:**
 
@@ -51,7 +51,7 @@ The combining characters are characters which belong to one of `Mc`, `Me`, and `
 
 ## Rule Details
 
-This rule reports the regular expressions which include multiple code point characters in character class syntax.
+This rule reports regular expressions which include multiple code point characters in character class syntax.
 
 Examples of **incorrect** code for this rule:
 
@@ -66,6 +66,7 @@ Examples of **incorrect** code for this rule:
 /^[🇯🇵]$/u;
 /^[👨‍👩‍👦]$/u;
 /^[👍]$/;
+new RegExp("[🎵]");
 ```
 
 :::
@@ -80,6 +81,50 @@ Examples of **correct** code for this rule:
 /^[abc]$/;
 /^[👍]$/u;
 /^[\q{👶🏻}]$/v;
+new RegExp("^[]$");
+new RegExp(`[Á-${z}]`, "u"); // variable pattern
+```
+
+:::
+
+## Options
+
+This rule has an object option:
+
+* `"allowEscape"`: When set to `true`, the rule allows any grouping of code points inside a character class as long as they are written using escape sequences. This option only has effect on regular expression literals and on regular expressions created with the `RegExp` constructor with a literal argument as a pattern.
+
+### allowEscape
+
+Examples of **incorrect** code for this rule with the `{ "allowEscape": true }` option:
+
+::: incorrect
+
+```js
+/* eslint no-misleading-character-class: ["error", { "allowEscape": true }] */
+
+/[\👍]/; // backslash can be omitted
+
+new RegExp("[\ud83d" + "\udc4d]");
+
+const pattern = "[\ud83d\udc4d]";
+new RegExp(pattern);
+```
+
+:::
+
+Examples of **correct** code for this rule with the `{ "allowEscape": true }` option:
+
+::: correct
+
+```js
+/* eslint no-misleading-character-class: ["error", { "allowEscape": true }] */
+
+/[\ud83d\udc4d]/;
+/[\u00B7\u0300-\u036F]/u;
+/[👨\u200d👩]/u;
+new RegExp("[\x41\u0301]");
+new RegExp(`[\u{1F1EF}\u{1F1F5}]`, "u");
+new RegExp("[\\u{1F1EF}\\u{1F1F5}]", "u");
 ```
 
 :::

diff --git a/lib/rules/no-misleading-character-class.js b/lib/rules/no-misleading-character-class.js
@@ -85,12 +85,10 @@ function isUnicodeCodePointEscape(char) {
 const findCharacterSequences = {
     *surrogatePairWithoutUFlag(chars) {
         for (const [index, char] of chars.entries()) {
-            if (index === 0) {
-                continue;
-            }
             const previous = chars[index - 1];
 
             if (
+                previous && char &&
                 isSurrogatePair(previous.value, char.value) &&
                 !isUnicodeCodePointEscape(previous) &&
                 !isUnicodeCodePointEscape(char)
@@ -102,12 +100,10 @@ const findCharacterSequences = {
 
     *surrogatePair(chars) {
         for (const [index, char] of chars.entries()) {
-            if (index === 0) {
-                continue;
-            }
             const previous = chars[index - 1];
 
             if (
+                previous && char &&
                 isSurrogatePair(previous.value, char.value) &&
                 (
                     isUnicodeCodePointEscape(previous) ||
@@ -119,14 +115,17 @@ const findCharacterSequences = {
         }
     },
 
-    *combiningClass(chars) {
+    *combiningClass(chars, unfilteredChars) {
+
+        /*
+         * When `allowEscape` is `true`, a combined character should only be allowed if the combining mark appears as an escape sequence.
+         * This means that the base character should be considered even if it's escaped.
+         */
         for (const [index, char] of chars.entries()) {
-            if (index === 0) {
-                continue;
-            }
-            const previous = chars[index - 1];
+            const previous = unfilteredChars[index - 1];
 
             if (
+                previous && char &&
                 isCombiningCharacter(char.value) &&
                 !isCombiningCharacter(previous.value)
             ) {
@@ -137,12 +136,10 @@ const findCharacterSequences = {
 
     *emojiModifier(chars) {
         for (const [index, char] of chars.entries()) {
-            if (index === 0) {
-                continue;
-            }
             const previous = chars[index - 1];
 
             if (
+                previous && char &&
                 isEmojiModifier(char.value) &&
                 !isEmojiModifier(previous.value)
             ) {
@@ -153,12 +150,10 @@ const findCharacterSequences = {
 
     *regionalIndicatorSymbol(chars) {
         for (const [index, char] of chars.entries()) {
-            if (index === 0) {
-                continue;
-            }
             const previous = chars[index - 1];
 
             if (
+                previous && char &&
                 isRegionalIndicatorSymbol(char.value) &&
                 isRegionalIndicatorSymbol(previous.value)
             ) {
@@ -171,17 +166,18 @@ const findCharacterSequences = {
         let sequence = null;
 
         for (const [index, char] of chars.entries()) {
-            if (index === 0 || index === chars.length - 1) {
-                continue;
-            }
+            const previous = chars[index - 1];
+            const next = chars[index + 1];
+
             if (
+                previous && char && next &&
                 char.value === 0x200d &&
-                chars[index - 1].value !== 0x200d &&
-                chars[index + 1].value !== 0x200d
+                previous.value !== 0x200d &&
+                next.value !== 0x200d
             ) {
                 if (sequence) {
-                    if (sequence.at(-1) === chars[index - 1]) {
-                        sequence.push(char, chars[index + 1]); // append to the sequence
+                    if (sequence.at(-1) === previous) {
+                        sequence.push(char, next); // append to the sequence
                     } else {
                         yield sequence;
                         sequence = chars.slice(index - 1, index + 2);
@@ -227,6 +223,41 @@ function getStaticValueOrRegex(node, initialScope) {
     return staticValue;
 }
 
+/**
+ * Checks whether a specified regexpp character is represented as an acceptable escape sequence.
+ * This function requires the source text of the character to be known.
+ * @param {Character} char Character to check.
+ * @param {string} charSource Source text of the character to check.
+ * @returns {boolean} Whether the specified regexpp character is represented as an acceptable escape sequence.
+ */
+function checkForAcceptableEscape(char, charSource) {
+    if (!charSource.startsWith("\\")) {
+        return false;
+    }
+    const match = /(?<=^\\+).$/su.exec(charSource);
+
+    return match?.[0] !== String.fromCodePoint(char.value);
+}
+
+/**
+ * Checks whether a specified regexpp character is represented as an acceptable escape sequence.
+ * This function works with characters that are produced by a string or template literal.
+ * It requires the source text and the CodeUnit list of the literal to be known.
+ * @param {Character} char Character to check.
+ * @param {string} nodeSource Source text of the string or template literal that produces the character.
+ * @param {CodeUnit[]} codeUnits List of CodeUnit objects of the literal that produces the character.
+ * @returns {boolean} Whether the specified regexpp character is represented as an acceptable escape sequence.
+ */
+function checkForAcceptableEscapeInString(char, nodeSource, codeUnits) {
+    const firstIndex = char.start;
+    const lastIndex = char.end - 1;
+    const start = codeUnits[firstIndex].start;
+    const end = codeUnits[lastIndex].end;
+    const charSource = nodeSource.slice(start, end);
+
+    return checkForAcceptableEscape(char, charSource);
+}
+
 //------------------------------------------------------------------------------
 // Rule Definition
 //------------------------------------------------------------------------------
@@ -244,7 +275,18 @@ module.exports = {
 
         hasSuggestions: true,
 
-        schema: [],
+        schema: [
+            {
+                type: "object",
+                properties: {
+                    allowEscape: {
+                        type: "boolean",
+                        default: false
+                    }
+                },
+                additionalProperties: false
+            }
+        ],
 
         messages: {
             surrogatePairWithoutUFlag: "Unexpected surrogate pair in character class. Use 'u' flag.",
@@ -257,6 +299,7 @@ module.exports = {
         }
     },
     create(context) {
+        const allowEscape = context.options[0]?.allowEscape;
         const sourceCode = context.sourceCode;
         const parser = new RegExpParser();
         const checkedPatternNodes = new Set();
@@ -288,24 +331,62 @@ module.exports = {
                 return;
             }
 
+            let codeUnits = null;
+
+            /**
+             * Checks whether a specified regexpp character is represented as an acceptable escape sequence.
+             * For the purposes of this rule, an escape sequence is considered acceptable if it consists of one or more backslashes followed by the character being escaped.
+             * @param {Character} char Character to check.
+             * @returns {boolean} Whether the specified regexpp character is represented as an acceptable escape sequence.
+             */
+            function isAcceptableEscapeSequence(char) {
+                if (node.type === "Literal" && node.regex) {
+                    return checkForAcceptableEscape(char, char.raw);
+                }
+                if (node.type === "Literal" && typeof node.value === "string") {
+                    const nodeSource = node.raw;
+
+                    codeUnits ??= parseStringLiteral(nodeSource);
+
+                    return checkForAcceptableEscapeInString(char, nodeSource, codeUnits);
+                }
+                if (astUtils.isStaticTemplateLiteral(node)) {
+                    const nodeSource = sourceCode.getText(node);
+
+                    codeUnits ??= parseTemplateToken(nodeSource);
+
+                    return checkForAcceptableEscapeInString(char, nodeSource, codeUnits);
+                }
+                return false;
+            }
+
             const foundKindMatches = new Map();
 
             visitRegExpAST(patternNode, {
                 onCharacterClassEnter(ccNode) {
-                    for (const chars of iterateCharacterSequence(ccNode.elements)) {
+                    for (const unfilteredChars of iterateCharacterSequence(ccNode.elements)) {
+                        let chars;
+
+                        if (allowEscape) {
+
+                            // Replace escape sequences with null to avoid having them flagged.
+                            chars = unfilteredChars.map(char => (isAcceptableEscapeSequence(char) ? null : char));
+                        } else {
+                            chars = unfilteredChars;
+                        }
                         for (const kind of kinds) {
+                            const matches = findCharacterSequences[kind](chars, unfilteredChars);
+
                             if (foundKindMatches.has(kind)) {
-                                foundKindMatches.get(kind).push(...findCharacterSequences[kind](chars));
+                                foundKindMatches.get(kind).push(...matches);
                             } else {
-                                foundKindMatches.set(kind, [...findCharacterSequences[kind](chars)]);
+                                foundKindMatches.set(kind, [...matches]);
                             }
                         }
                     }
                 }
             });
 
-            let codeUnits = null;
-
             /**
              * Finds the report loc(s) for a range of matches.
              * Only literals and expression-less templates generate granular errors.