diff --git a/python/ql/lib/change-notes/2024-01-17-regex-flag-parsing.md b/python/ql/lib/change-notes/2024-01-17-regex-flag-parsing.md new file mode 100644 index 000000000000..532ab1a88dc0 --- /dev/null +++ b/python/ql/lib/change-notes/2024-01-17-regex-flag-parsing.md @@ -0,0 +1,4 @@ +--- +category: fix +--- +* Fixed regular expressions containing flags not being parsed correctly in some cases. diff --git a/python/ql/lib/semmle/python/regexp/internal/ParseRegExp.qll b/python/ql/lib/semmle/python/regexp/internal/ParseRegExp.qll index 90a976824ac3..6a70108c7f9c 100644 --- a/python/ql/lib/semmle/python/regexp/internal/ParseRegExp.qll +++ b/python/ql/lib/semmle/python/regexp/internal/ParseRegExp.qll @@ -116,13 +116,14 @@ class RegExp extends Expr instanceof StrConst { /** * Gets a mode (if any) of this regular expression. Can be any of: - * DEBUG - * IGNORECASE - * LOCALE - * MULTILINE - * DOTALL - * UNICODE - * VERBOSE + * - DEBUG + * - ASCII + * - IGNORECASE + * - LOCALE + * - MULTILINE + * - DOTALL + * - UNICODE + * - VERBOSE */ string getAMode() { result = FindRegexMode::getAMode(this) @@ -695,7 +696,10 @@ class RegExp extends Expr instanceof StrConst { */ private predicate flag_group_start(int start, int end) { this.flag_group_start_no_modes(start, _) and - end = max(int i | this.mode_character(start, i) | i + 1) + // Check if this is a group with flags, and therefore the `:` should be excluded + exists(int maybe_end | maybe_end = max(int i | this.mode_character(start, i) | i + 1) | + if this.getChar(maybe_end) = ":" then end = maybe_end + 1 else end = maybe_end + ) } /** @@ -705,19 +709,21 @@ class RegExp extends Expr instanceof StrConst { private predicate flag_group_start_no_modes(int start, int end) { this.isGroupStart(start) and this.getChar(start + 1) = "?" and - this.getChar(start + 2) in ["i", "L", "m", "s", "u", "x"] and + // "-" is for removing flags, e.g. `(?-i:text)` + this.getChar(start + 2) in ["-", "a", "i", "L", "m", "s", "u", "x"] and end = start + 2 } /** - * Holds if `pos` contains a mo character from the + * Holds if `pos` contains a mode character from the * flag group starting at `start`. */ private predicate mode_character(int start, int pos) { this.flag_group_start_no_modes(start, pos) or this.mode_character(start, pos - 1) and - this.getChar(pos) in ["i", "L", "m", "s", "u", "x"] + // "-" is for removing flags, e.g. `(?-i:text)` + this.getChar(pos) in ["-", "a", "i", "L", "m", "s", "u", "x"] } /** @@ -728,9 +734,13 @@ class RegExp extends Expr instanceof StrConst { * ``` */ private predicate flag(string c) { - exists(int pos | - this.mode_character(_, pos) and - this.getChar(pos) = c + exists(int start, int pos | + this.mode_character(start, pos) and + this.getChar(pos) = c and + // Ignore if flag is removed using `-`; use `<=` to also exclude `-` itself + not exists(int minus_pos | + this.mode_character(start, minus_pos) and this.getChar(minus_pos) = "-" and minus_pos <= pos + ) ) } @@ -740,6 +750,8 @@ class RegExp extends Expr instanceof StrConst { */ string getModeFromPrefix() { exists(string c | this.flag(c) | + c = "a" and result = "ASCII" + or c = "i" and result = "IGNORECASE" or c = "L" and result = "LOCALE" diff --git a/python/ql/test/library-tests/regex/Characters.expected b/python/ql/test/library-tests/regex/Characters.expected index 7a1ca9c874fa..dcbdd6b7d7ca 100644 --- a/python/ql/test/library-tests/regex/Characters.expected +++ b/python/ql/test/library-tests/regex/Characters.expected @@ -20,6 +20,7 @@ | (?!not-this)^[A-Z_]+$ | 16 | 17 | | (?!not-this)^[A-Z_]+$ | 17 | 18 | | (?!not-this)^[A-Z_]+$ | 20 | 21 | +| (?-imsx:a+) | 8 | 9 | | (?:(?:\n\r?)\|^)( *)\\S | 6 | 7 | | (?:(?:\n\r?)\|^)( *)\\S | 7 | 8 | | (?:(?:\n\r?)\|^)( *)\\S | 11 | 12 | @@ -35,9 +36,14 @@ | (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 19 | 21 | | (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 22 | 23 | | (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 24 | 25 | +| (?Li)a+ | 5 | 6 | | (?P[\\w]+)\| | 10 | 12 | +| (?a-imsx:a+) | 9 | 10 | +| (?aimsx)a+ | 8 | 9 | +| (?aimsx:a+) | 8 | 9 | | (?m)^(?!$) | 4 | 5 | | (?m)^(?!$) | 8 | 9 | +| (?ui)a+ | 5 | 6 | | (\\033\|~{) | 1 | 5 | | (\\033\|~{) | 6 | 7 | | (\\033\|~{) | 7 | 8 | diff --git a/python/ql/test/library-tests/regex/FirstLast.expected b/python/ql/test/library-tests/regex/FirstLast.expected index e388e0d1fdf7..43695539c0b7 100644 --- a/python/ql/test/library-tests/regex/FirstLast.expected +++ b/python/ql/test/library-tests/regex/FirstLast.expected @@ -7,6 +7,10 @@ | (?!not-this)^[A-Z_]+$ | last | 13 | 19 | | (?!not-this)^[A-Z_]+$ | last | 13 | 20 | | (?!not-this)^[A-Z_]+$ | last | 20 | 21 | +| (?-imsx:a+) | first | 8 | 9 | +| (?-imsx:a+) | first | 8 | 10 | +| (?-imsx:a+) | last | 8 | 9 | +| (?-imsx:a+) | last | 8 | 10 | | (?:(?:\n\r?)\|^)( *)\\S | first | 6 | 7 | | (?:(?:\n\r?)\|^)( *)\\S | first | 11 | 12 | | (?:(?:\n\r?)\|^)( *)\\S | last | 17 | 19 | @@ -18,14 +22,34 @@ | (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | first | 8 | 9 | | (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | first | 11 | 12 | | (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | last | 21 | 26 | +| (?Li)a+ | first | 5 | 6 | +| (?Li)a+ | first | 5 | 7 | +| (?Li)a+ | last | 5 | 6 | +| (?Li)a+ | last | 5 | 7 | | (?P[\\w]+)\| | first | 9 | 13 | | (?P[\\w]+)\| | first | 9 | 14 | | (?P[\\w]+)\| | last | 9 | 13 | | (?P[\\w]+)\| | last | 9 | 14 | +| (?a-imsx:a+) | first | 9 | 10 | +| (?a-imsx:a+) | first | 9 | 11 | +| (?a-imsx:a+) | last | 9 | 10 | +| (?a-imsx:a+) | last | 9 | 11 | +| (?aimsx)a+ | first | 8 | 9 | +| (?aimsx)a+ | first | 8 | 10 | +| (?aimsx)a+ | last | 8 | 9 | +| (?aimsx)a+ | last | 8 | 10 | +| (?aimsx:a+) | first | 8 | 9 | +| (?aimsx:a+) | first | 8 | 10 | +| (?aimsx:a+) | last | 8 | 9 | +| (?aimsx:a+) | last | 8 | 10 | | (?m)^(?!$) | first | 4 | 5 | | (?m)^(?!$) | first | 8 | 9 | | (?m)^(?!$) | last | 4 | 5 | | (?m)^(?!$) | last | 8 | 9 | +| (?ui)a+ | first | 5 | 6 | +| (?ui)a+ | first | 5 | 7 | +| (?ui)a+ | last | 5 | 6 | +| (?ui)a+ | last | 5 | 7 | | (\\033\|~{) | first | 1 | 5 | | (\\033\|~{) | first | 6 | 8 | | (\\033\|~{) | last | 1 | 5 | diff --git a/python/ql/test/library-tests/regex/GroupContents.expected b/python/ql/test/library-tests/regex/GroupContents.expected index c7c4ac97a1e3..0e2ca7cda612 100644 --- a/python/ql/test/library-tests/regex/GroupContents.expected +++ b/python/ql/test/library-tests/regex/GroupContents.expected @@ -1,4 +1,5 @@ | (?!not-this)^[A-Z_]+$ | 0 | 12 | (?!not-this) | 3 | 11 | not-this | +| (?-imsx:a+) | 0 | 11 | (?-imsx:a+) | 8 | 10 | a+ | | (?:(?:\n\r?)\|^)( *)\\S | 0 | 13 | (?:(?:\n\r?)\|^) | 3 | 12 | (?:\n\r?)\|^ | | (?:(?:\n\r?)\|^)( *)\\S | 3 | 10 | (?:\n\r?) | 6 | 9 | \n\r? | | (?:(?:\n\r?)\|^)( *)\\S | 13 | 17 | ( *) | 14 | 16 | * | @@ -8,6 +9,8 @@ | (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 0 | 10 | (?:[^%]\|^) | 3 | 9 | [^%]\|^ | | (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 14 | 19 | (\\w*) | 15 | 18 | \\w* | | (?P[\\w]+)\| | 0 | 15 | (?P[\\w]+) | 9 | 14 | [\\w]+ | +| (?a-imsx:a+) | 0 | 12 | (?a-imsx:a+) | 9 | 11 | a+ | +| (?aimsx:a+) | 0 | 11 | (?aimsx:a+) | 8 | 10 | a+ | | (?m)^(?!$) | 5 | 10 | (?!$) | 8 | 9 | $ | | (\\033\|~{) | 0 | 9 | (\\033\|~{) | 1 | 8 | \\033\|~{ | | \\[(?P[^[]*)\\]\\((?P[^)]*) | 2 | 16 | (?P[^[]*) | 10 | 15 | [^[]* | diff --git a/python/ql/test/library-tests/regex/Mode.expected b/python/ql/test/library-tests/regex/Mode.expected index 3fcfc8672c1b..eeb2c911214e 100644 --- a/python/ql/test/library-tests/regex/Mode.expected +++ b/python/ql/test/library-tests/regex/Mode.expected @@ -10,4 +10,19 @@ | 54 | DOTALL | | 54 | VERBOSE | | 56 | VERBOSE | -| 68 | MULTILINE | +| 59 | ASCII | +| 59 | DOTALL | +| 59 | IGNORECASE | +| 59 | MULTILINE | +| 59 | VERBOSE | +| 60 | IGNORECASE | +| 60 | UNICODE | +| 61 | IGNORECASE | +| 61 | LOCALE | +| 62 | ASCII | +| 62 | DOTALL | +| 62 | IGNORECASE | +| 62 | MULTILINE | +| 62 | VERBOSE | +| 64 | ASCII | +| 76 | MULTILINE | diff --git a/python/ql/test/library-tests/regex/Qualified.expected b/python/ql/test/library-tests/regex/Qualified.expected index b698a1d09557..01540b83c55b 100644 --- a/python/ql/test/library-tests/regex/Qualified.expected +++ b/python/ql/test/library-tests/regex/Qualified.expected @@ -1,9 +1,15 @@ | (?!not-this)^[A-Z_]+$ | 13 | 20 | false | true | +| (?-imsx:a+) | 8 | 10 | false | true | | (?:(?:\n\r?)\|^)( *)\\S | 7 | 9 | true | false | | (?:(?:\n\r?)\|^)( *)\\S | 14 | 16 | true | true | | (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 0 | 11 | true | false | | (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 15 | 18 | true | true | +| (?Li)a+ | 5 | 7 | false | true | | (?P[\\w]+)\| | 9 | 14 | false | true | +| (?a-imsx:a+) | 9 | 11 | false | true | +| (?aimsx)a+ | 8 | 10 | false | true | +| (?aimsx:a+) | 8 | 10 | false | true | +| (?ui)a+ | 5 | 7 | false | true | | \\A[+-]?\\d+ | 2 | 7 | true | false | | \\A[+-]?\\d+ | 7 | 10 | false | true | | \\[(?P[^[]*)\\]\\((?P[^)]*) | 10 | 15 | true | true | diff --git a/python/ql/test/library-tests/regex/Regex.expected b/python/ql/test/library-tests/regex/Regex.expected index e5e0ea1719ec..be32d3a4da37 100644 --- a/python/ql/test/library-tests/regex/Regex.expected +++ b/python/ql/test/library-tests/regex/Regex.expected @@ -26,6 +26,10 @@ | (?!not-this)^[A-Z_]+$ | qualified | 13 | 20 | | (?!not-this)^[A-Z_]+$ | sequence | 0 | 21 | | (?!not-this)^[A-Z_]+$ | sequence | 3 | 11 | +| (?-imsx:a+) | char | 8 | 9 | +| (?-imsx:a+) | non-empty group | 0 | 11 | +| (?-imsx:a+) | qualified | 8 | 10 | +| (?-imsx:a+) | sequence | 0 | 11 | | (?:(?:\n\r?)\|^)( *)\\S | ^ | 11 | 12 | | (?:(?:\n\r?)\|^)( *)\\S | char | 6 | 7 | | (?:(?:\n\r?)\|^)( *)\\S | char | 7 | 8 | @@ -69,18 +73,38 @@ | (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | sequence | 0 | 26 | | (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | sequence | 3 | 7 | | (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | sequence | 8 | 9 | +| (?Li)a+ | char | 5 | 6 | +| (?Li)a+ | empty group | 0 | 5 | +| (?Li)a+ | qualified | 5 | 7 | +| (?Li)a+ | sequence | 0 | 7 | | (?P[\\w]+)\| | char | 10 | 12 | | (?P[\\w]+)\| | char-set | 9 | 13 | | (?P[\\w]+)\| | choice | 0 | 16 | | (?P[\\w]+)\| | non-empty group | 0 | 15 | | (?P[\\w]+)\| | qualified | 9 | 14 | | (?P[\\w]+)\| | sequence | 0 | 15 | +| (?a-imsx:a+) | char | 9 | 10 | +| (?a-imsx:a+) | non-empty group | 0 | 12 | +| (?a-imsx:a+) | qualified | 9 | 11 | +| (?a-imsx:a+) | sequence | 0 | 12 | +| (?aimsx)a+ | char | 8 | 9 | +| (?aimsx)a+ | empty group | 0 | 8 | +| (?aimsx)a+ | qualified | 8 | 10 | +| (?aimsx)a+ | sequence | 0 | 10 | +| (?aimsx:a+) | char | 8 | 9 | +| (?aimsx:a+) | non-empty group | 0 | 11 | +| (?aimsx:a+) | qualified | 8 | 10 | +| (?aimsx:a+) | sequence | 0 | 11 | | (?m)^(?!$) | $ | 8 | 9 | | (?m)^(?!$) | ^ | 4 | 5 | | (?m)^(?!$) | empty group | 0 | 4 | | (?m)^(?!$) | empty group | 5 | 10 | | (?m)^(?!$) | sequence | 0 | 10 | | (?m)^(?!$) | sequence | 8 | 9 | +| (?ui)a+ | char | 5 | 6 | +| (?ui)a+ | empty group | 0 | 5 | +| (?ui)a+ | qualified | 5 | 7 | +| (?ui)a+ | sequence | 0 | 7 | | (\\033\|~{) | char | 1 | 5 | | (\\033\|~{) | char | 6 | 7 | | (\\033\|~{) | char | 7 | 8 | diff --git a/python/ql/test/library-tests/regex/test.py b/python/ql/test/library-tests/regex/test.py index b8e6daea7e6e..713f69b7270c 100644 --- a/python/ql/test/library-tests/regex/test.py +++ b/python/ql/test/library-tests/regex/test.py @@ -55,6 +55,14 @@ # re.X is an alias for re.VERBOSE re.compile("", re.X) +#Inline flags; 'a', 'L' and 'u' are mutually exclusive +re.compile("(?aimsx)a+") +re.compile("(?ui)a+") +re.compile(b"(?Li)a+") +re.compile("(?aimsx:a+)") +re.compile("(?-imsx:a+)") +re.compile("(?a-imsx:a+)") + #empty choice re.compile(r'|x') re.compile(r'x|')