From d95f533d196f6419989ac10136328d3d6200b027 Mon Sep 17 00:00:00 2001 From: Harry Maclean Date: Mon, 28 Mar 2022 11:18:55 +1300 Subject: [PATCH 1/8] Ruby: Add getLastChild to RegExpParent --- ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll b/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll index 32f84fb16f66..42feb5b27f5a 100644 --- a/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll +++ b/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll @@ -63,6 +63,9 @@ class RegExpParent extends TRegExpParent { /** Gets the number of child terms. */ int getNumChild() { result = count(this.getAChild()) } + /** Gets the last child term of this element. */ + RegExpTerm getLastChild() { result = this.getChild(this.getNumChild() - 1) } + /** * Gets the name of a primary CodeQL class to which this regular * expression term belongs. From debc57b417d87ddbaea2083c0f4c5ab358f6f64a Mon Sep 17 00:00:00 2001 From: Harry Maclean Date: Mon, 28 Mar 2022 11:19:30 +1300 Subject: [PATCH 2/8] Ruby: Add RegExpAnchor to RegExpTreeView --- .../lib/codeql/ruby/regexp/RegExpTreeView.qll | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll b/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll index 42feb5b27f5a..872aebf938d2 100644 --- a/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll +++ b/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll @@ -860,6 +860,21 @@ class RegExpDot extends RegExpSpecialChar { override string getAPrimaryQlClass() { result = "RegExpDot" } } +/** + * A term that matches a specific position between characters in the string. + * + * Example: + * + * ``` + * \A + * ``` + */ +class RegExpAnchor extends RegExpSpecialChar { + RegExpAnchor() { this.getChar() = ["^", "$", "\\A", "\\Z", "\\z"] } + + override string getAPrimaryQlClass() { result = "RegExpAnchor" } +} + /** * A dollar assertion `$` or `\Z` matching the end of a line. * @@ -869,7 +884,7 @@ class RegExpDot extends RegExpSpecialChar { * $ * ``` */ -class RegExpDollar extends RegExpSpecialChar { +class RegExpDollar extends RegExpAnchor { RegExpDollar() { this.getChar() = ["$", "\\Z", "\\z"] } override string getAPrimaryQlClass() { result = "RegExpDollar" } @@ -884,7 +899,7 @@ class RegExpDollar extends RegExpSpecialChar { * ^ * ``` */ -class RegExpCaret extends RegExpSpecialChar { +class RegExpCaret extends RegExpAnchor { RegExpCaret() { this.getChar() = ["^", "\\A"] } override string getAPrimaryQlClass() { result = "RegExpCaret" } From e3c3c00c684440cdb8165d9582847f14f92282b1 Mon Sep 17 00:00:00 2001 From: Harry Maclean Date: Mon, 28 Mar 2022 11:20:14 +1300 Subject: [PATCH 3/8] Ruby: Add MissingRegExpAnchor query --- .../cwe-020/MissingRegExpAnchor.qhelp | 85 ++++++++++++++ .../security/cwe-020/MissingRegExpAnchor.ql | 105 ++++++++++++++++++ .../examples/missing_regexp_anchor_bad.rb | 8 ++ .../examples/missing_regexp_anchor_good.rb | 8 ++ .../MissingRegExpAnchor.expected | 4 + .../MissingRegExpAnchor.qlref | 1 + .../missing_regexp_anchor.rb | 18 +++ 7 files changed, 229 insertions(+) create mode 100644 ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp create mode 100644 ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql create mode 100644 ruby/ql/src/queries/security/cwe-020/examples/missing_regexp_anchor_bad.rb create mode 100644 ruby/ql/src/queries/security/cwe-020/examples/missing_regexp_anchor_good.rb create mode 100644 ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.expected create mode 100644 ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.qlref create mode 100644 ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/missing_regexp_anchor.rb diff --git a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp new file mode 100644 index 000000000000..cb2cddb6573c --- /dev/null +++ b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp @@ -0,0 +1,85 @@ + + + + +

+ + Sanitizing untrusted input with regular expressions is a + common technique. However, it is error-prone to match untrusted input + against regular expressions without anchors such as ^ or + $. Malicious input can bypass such security checks by + embedding one of the allowed patterns in an unexpected location. + +

+ +

+ + Even if the matching is not done in a security-critical + context, it may still cause undesirable behavior when the regular + expression accidentally matches. + +

+
+ + +

+ + Use anchors to ensure that regular expressions match at + the expected locations. + +

+
+ + + +

+ + The following example code checks that a URL redirection + will reach the example.com domain, or one of its + subdomains, and not some malicious site. + +

+ + + +

+ + The check with the regular expression match is, however, easy to bypass. For example + by embedding http://example.com/ in the query + string component: http://evil-example.net/?x=http://example.com/. + + Address these shortcomings by using anchors in the regular expression instead: + +

+ + + +

+ + A related mistake is to write a regular expression with + multiple alternatives, but to only include an anchor for one of the + alternatives. As an example, the regular expression + /^www\.example\.com|beta\.example\.com/ will match the host + evil.beta.example.com because the regular expression is parsed + as /(^www\.example\.com)|(beta\.example\.com)/ + + TODO: implement this part of the query + +

+ +

+ + TODO: describe the danger of using line anchors like ^ + or $. + +

+ +
+ + +
  • OWASP: SSRF
  • +
  • OWASP: XSS Unvalidated Redirects and Forwards Cheat Sheet.
  • +
    +
    diff --git a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql new file mode 100644 index 000000000000..3c67b83ac610 --- /dev/null +++ b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql @@ -0,0 +1,105 @@ +/** + * @name Missing regular expression anchor + * @description Regular expressions without anchors can be vulnerable to bypassing. + * @kind problem + * @problem.severity warning + * @security-severity 7.8 + * @precision medium + * @id rb/regex/missing-regexp-anchor + * @tags correctness + * security + * external/cwe/cwe-020 + */ + +import HostnameRegexpShared +import codeql.ruby.DataFlow +import codeql.ruby.security.performance.RegExpTreeView + +/** + * Holds if `term` is a final term, that is, no term will match anything after this one. + */ +predicate isFinalRegExpTerm(RegExpTerm term) { + term.isRootTerm() + or + exists(RegExpSequence seq | + isFinalRegExpTerm(seq) and + term = seq.getLastChild() + ) + or + exists(RegExpTerm parent | + isFinalRegExpTerm(parent) and + term = parent.getAChild() and + not parent instanceof RegExpSequence and + not parent instanceof RegExpQuantifier + ) +} + +/** + * Holds if `src` contains a hostname pattern that uses the `^/$` line anchors + * rather than `\A/\z` which match the start/end of the whole string. + */ +predicate isLineAnchoredHostnameRegExp(RegExpPatternSource src, string msg) { + not isSemiAnchoredHostnameRegExp(src, msg) and // avoid double reporting + exists(RegExpTerm term, RegExpSequence tld, int i | term = src.getRegExpTerm() | + not isConstantInvalidInsideOrigin(term.getAChild*()) and + tld = term.getAChild*() and + hasTopLevelDomainEnding(tld, i) and + isFinalRegExpTerm(tld.getChild(i)) and // nothing is matched after the TLD + ( + tld.getChild(0).(RegExpCaret).getChar() = "^" or + tld.getLastChild().(RegExpDollar).getChar() = "$" + ) and + msg = + "This hostname pattern uses anchors such as '^' and '$', which match the start and end of a line, not the whole string. Use '\\A' and '\\z' instead." + ) +} + +/** + * Holds if `src` contains a hostname pattern that is missing a `$` anchor. + */ +predicate isSemiAnchoredHostnameRegExp(RegExpPatternSource src, string msg) { + // not hasMisleadingAnchorPrecedence(src, _) and // avoid double reporting + exists(RegExpTerm term, RegExpSequence tld, int i | term = src.getRegExpTerm() | + not isConstantInvalidInsideOrigin(term.getAChild*()) and + tld = term.getAChild*() and + hasTopLevelDomainEnding(tld, i) and + isFinalRegExpTerm(tld.getChild(i)) and // nothing is matched after the TLD + tld.getChild(0) instanceof RegExpCaret and + msg = + "This hostname pattern may match any domain name, as it is missing a '\\z' or '/' at the end." + ) +} + +/** + * Holds if `src` is an unanchored pattern for a URL, indicating a + * mistake explained by `msg`. + */ +predicate isUnanchoredHostnameRegExp(RegExpPatternSource src, string msg) { + exists(RegExpTerm term, RegExpSequence tld | term = src.getRegExpTerm() | + alwaysMatchesHostname(term) and + tld = term.getAChild*() and + hasTopLevelDomainEnding(tld) and + not isConstantInvalidInsideOrigin(term.getAChild*()) and + not term.getAChild*() instanceof RegExpAnchor and + // that is not used for capture or replace + not exists(DataFlow::CallNode mcn, DataFlow::Node arg, string name | + name = mcn.getMethodName() and + arg = mcn.getArgument(0) + | + ( + src.getAParse().(DataFlow::LocalSourceNode).flowsTo(arg) or + src.getAParse() = arg + ) and + name = ["sub", "sub!", "gsub", "gsub!"] + ) and + msg = + "When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it." + ) +} + +from DataFlow::Node nd, string msg +where + isUnanchoredHostnameRegExp(nd, msg) or + isSemiAnchoredHostnameRegExp(nd, msg) or + isLineAnchoredHostnameRegExp(nd, msg) +select nd, msg diff --git a/ruby/ql/src/queries/security/cwe-020/examples/missing_regexp_anchor_bad.rb b/ruby/ql/src/queries/security/cwe-020/examples/missing_regexp_anchor_bad.rb new file mode 100644 index 000000000000..624467cc40a3 --- /dev/null +++ b/ruby/ql/src/queries/security/cwe-020/examples/missing_regexp_anchor_bad.rb @@ -0,0 +1,8 @@ +class UsersController < ActionController::Base + def index + # BAD: the host of `params[:url]` may be controlled by an attacker + if params[:url].match? /https?:\/\/www\.example\.com\// + redirect_to params[:url] + end + end +end \ No newline at end of file diff --git a/ruby/ql/src/queries/security/cwe-020/examples/missing_regexp_anchor_good.rb b/ruby/ql/src/queries/security/cwe-020/examples/missing_regexp_anchor_good.rb new file mode 100644 index 000000000000..4443e111f8c7 --- /dev/null +++ b/ruby/ql/src/queries/security/cwe-020/examples/missing_regexp_anchor_good.rb @@ -0,0 +1,8 @@ +class UsersController < ActionController::Base + def index + # GOOD: the host of `params[:url]` can not be controlled by an attacker + if params[:url].match? /\Ahttps?:\/\/www\.example\.com\// + redirect_to params[:url] + end + end +end \ No newline at end of file diff --git a/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.expected b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.expected new file mode 100644 index 000000000000..c4825104b7ee --- /dev/null +++ b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.expected @@ -0,0 +1,4 @@ +| missing_regexp_anchor.rb:1:1:1:17 | /www.example.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. | +| missing_regexp_anchor.rb:7:1:7:21 | /https?:\\/\\/good.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. | +| missing_regexp_anchor.rb:8:1:8:22 | /^https?:\\/\\/good.com/ | This hostname pattern may match any domain name, as it is missing a '\\z' or '/' at the end. | +| missing_regexp_anchor.rb:8:1:8:22 | /^https?:\\/\\/good.com/ | This hostname pattern uses anchors such as '^' and '$', which match the start and end of a line, not the whole string. Use '\\A' and '\\z' instead. | diff --git a/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.qlref b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.qlref new file mode 100644 index 000000000000..bd3ad563aec1 --- /dev/null +++ b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.qlref @@ -0,0 +1 @@ +queries/security/cwe-020/MissingRegExpAnchor.ql \ No newline at end of file diff --git a/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/missing_regexp_anchor.rb b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/missing_regexp_anchor.rb new file mode 100644 index 000000000000..e7c9ae6a6617 --- /dev/null +++ b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/missing_regexp_anchor.rb @@ -0,0 +1,18 @@ +/www.example.com/ # BAD +/^www.example.com$/ # BAD: uses end-of-line anchors rather than end-of-string anchors +/\Awww.example.com\z/ # GOOD + +/foo.bar/ # GOOD + +/https?:\/\/good.com/ # BAD +/^https?:\/\/good.com/ # BAD: missing end-of-string anchor +/(^https?:\/\/good1.com)|(^https?://good2.com)/ # BAD: missing end-of-string anchor + +/bar/ # GOOD + +foo.gsub(/www.example.com/, "bar") # GOOD +foo.sub(/www.example.com/, "bar") # GOOD +foo.gsub!(/www.example.com/, "bar") # GOOD +foo.sub!(/www.example.com/, "bar") # GOOD + + From 3f8b27c0cdc796b4a7e8e35bc83c3a5acc270b57 Mon Sep 17 00:00:00 2001 From: Harry Maclean Date: Mon, 28 Mar 2022 17:04:31 +1300 Subject: [PATCH 4/8] Ruby: Add RegExpNonWordBoundary to RegExpTreeView --- ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll | 9 +++++++++ ruby/ql/test/library-tests/regexp/parse.expected | 4 ++-- ruby/ql/test/library-tests/regexp/regexp.expected | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll b/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll index 872aebf938d2..ac88df8de232 100644 --- a/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll +++ b/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll @@ -581,6 +581,15 @@ class RegExpWordBoundary extends RegExpSpecialChar { RegExpWordBoundary() { this.getChar() = "\\b" } } +/** + * A non-word boundary, that is, a regular expression term of the form `\B`. + */ +class RegExpNonWordBoundary extends RegExpSpecialChar { + RegExpNonWordBoundary() { this.getChar() = "\\B" } + + override string getAPrimaryQlClass() { result = "RegExpNonWordBoundary" } +} + /** * A character class escape in a regular expression. * That is, an escaped character that denotes multiple characters. diff --git a/ruby/ql/test/library-tests/regexp/parse.expected b/ruby/ql/test/library-tests/regexp/parse.expected index 9256fefd67a8..3df78cdb4fbf 100644 --- a/ruby/ql/test/library-tests/regexp/parse.expected +++ b/ruby/ql/test/library-tests/regexp/parse.expected @@ -266,11 +266,11 @@ regexp.rb: # 43| [RegExpSequence] \b!a\B #-----| 0 -> [RegExpSpecialChar] \b #-----| 1 -> [RegExpConstant, RegExpNormalChar] !a -#-----| 2 -> [RegExpSpecialChar] \B +#-----| 2 -> [RegExpNonWordBoundary] \B # 43| [RegExpConstant, RegExpNormalChar] !a -# 43| [RegExpSpecialChar] \B +# 43| [RegExpNonWordBoundary] \B # 46| [RegExpGroup] (foo) #-----| 0 -> [RegExpConstant, RegExpNormalChar] foo diff --git a/ruby/ql/test/library-tests/regexp/regexp.expected b/ruby/ql/test/library-tests/regexp/regexp.expected index 4d1837bedea2..e464b5ce5ea5 100644 --- a/ruby/ql/test/library-tests/regexp/regexp.expected +++ b/ruby/ql/test/library-tests/regexp/regexp.expected @@ -108,7 +108,7 @@ term | regexp.rb:43:2:43:3 | \\b | RegExpSpecialChar | | regexp.rb:43:2:43:7 | \\b!a\\B | RegExpSequence | | regexp.rb:43:4:43:5 | !a | RegExpConstant,RegExpNormalChar | -| regexp.rb:43:6:43:7 | \\B | RegExpSpecialChar | +| regexp.rb:43:6:43:7 | \\B | RegExpNonWordBoundary | | regexp.rb:46:2:46:6 | (foo) | RegExpGroup | | regexp.rb:46:2:46:7 | (foo)* | RegExpStar | | regexp.rb:46:2:46:10 | (foo)*bar | RegExpSequence | From 2feb4a48beb8c67ca738a90938394fdab2baea74 Mon Sep 17 00:00:00 2001 From: Harry Maclean Date: Mon, 28 Mar 2022 17:08:42 +1300 Subject: [PATCH 5/8] Ruby: Add hasMisleadingAnchorPrecedence to MissingRegExpAnchor --- .../cwe-020/MissingRegExpAnchor.qhelp | 2 - .../security/cwe-020/MissingRegExpAnchor.ql | 155 +++++++++++++++++- .../MissingRegExpAnchor.expected | 22 ++- .../missing_regexp_anchor.rb | 57 +++++-- 4 files changed, 215 insertions(+), 21 deletions(-) diff --git a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp index cb2cddb6573c..eac017305321 100644 --- a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp +++ b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp @@ -65,8 +65,6 @@ evil.beta.example.com because the regular expression is parsed as /(^www\.example\.com)|(beta\.example\.com)/ - TODO: implement this part of the query -

    diff --git a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql index 3c67b83ac610..d06ef7efa9b3 100644 --- a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql +++ b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql @@ -34,12 +34,158 @@ predicate isFinalRegExpTerm(RegExpTerm term) { ) } +/** Holds if `term` is one of the transitive left children of a regexp. */ +predicate isLeftArmTerm(RegExpTerm term) { + term.isRootTerm() + or + exists(RegExpTerm parent | + term = parent.getChild(0) and + isLeftArmTerm(parent) + ) +} + +/** Holds if `term` is one of the transitive right children of a regexp. */ +predicate isRightArmTerm(RegExpTerm term) { + term.isRootTerm() + or + exists(RegExpTerm parent | + term = parent.getLastChild() and + isRightArmTerm(parent) + ) +} + +/** + * Holds if `term` is an anchor that is not the first or last node + * in its tree. + */ +predicate isInteriorAnchor(RegExpAnchor term) { + not isLeftArmTerm(term) and + not isRightArmTerm(term) +} + +/** + * Holds if `term` contains an anchor that is not the first or last node + * in its tree, such as `(foo|bar$|baz)`. + */ +predicate containsInteriorAnchor(RegExpTerm term) { isInteriorAnchor(term.getAChild*()) } + +/** + * Holds if `term` starts with a word boundary or lookbehind assertion, + * indicating that it's not intended to be anchored on that side. + */ +predicate containsLeadingPseudoAnchor(RegExpSequence term) { + exists(RegExpTerm child | child = term.getChild(0) | + child instanceof RegExpWordBoundary or + child instanceof RegExpNonWordBoundary or + child instanceof RegExpLookbehind + ) +} + +/** + * Holds if `term` ends with a word boundary or lookahead assertion, + * indicating that it's not intended to be anchored on that side. + */ +predicate containsTrailingPseudoAnchor(RegExpSequence term) { + exists(RegExpTerm child | child = term.getLastChild() | + child instanceof RegExpWordBoundary or + child instanceof RegExpNonWordBoundary or + child instanceof RegExpLookahead + ) +} + +/** + * Holds if `term` is an empty sequence, usually arising from + * literals with a trailing alternative such as `foo|`. + */ +predicate isEmpty(RegExpSequence term) { term.getNumChild() = 0 } + +/** + * Holds if `term` contains a letter constant. + * + * We use this as a heuristic to filter out uninteresting results. + */ +predicate containsLetters(RegExpTerm term) { + term.getAChild*().(RegExpConstant).getValue().regexpMatch(".*[a-zA-Z].*") +} + +/** + * Holds if `alt` has an explicitly anchored group, such as `^(foo|bar)|baz` + * and doesn't have any unnecessary groups, such as in `^(foo)|(bar)`. + */ +predicate hasExplicitAnchorPrecedence(RegExpAlt alt) { + isAnchoredGroup(alt.getAChild()) and + not alt.getAChild() instanceof RegExpGroup +} + +/** + * Holds if `term` consists only of an anchor and a parenthesized term, + * such as the left side of `^(foo|bar)|baz`. + * + * The precedence of the anchor is likely to be intentional in this case, + * as the group wouldn't be needed otherwise. + */ +predicate isAnchoredGroup(RegExpSequence term) { + term.getNumChild() = 2 and + term.getAChild() instanceof RegExpAnchor and + term.getAChild() instanceof RegExpGroup +} + +/** + * Holds if `src` is a pattern for a collection of alternatives where + * only the first or last alternative is anchored, indicating a + * precedence mistake explained by `msg`. + * + * The canonical example of such a mistake is: `^a|b|c`, which is + * parsed as `(^a)|(b)|(c)`. + */ +predicate hasMisleadingAnchorPrecedence(RegExpPatternSource src, string msg) { + exists(RegExpAlt root, RegExpSequence anchoredTerm, string direction | + root = src.getRegExpTerm() and + not containsInteriorAnchor(root) and + not isEmpty(root.getAChild()) and + not hasExplicitAnchorPrecedence(root) and + containsLetters(anchoredTerm) and + ( + anchoredTerm = root.getChild(0) and + anchoredTerm.getChild(0) instanceof RegExpCaret and + not containsLeadingPseudoAnchor(root.getChild([1 .. root.getNumChild() - 1])) and + containsLetters(root.getChild([1 .. root.getNumChild() - 1])) and + direction = "beginning" + or + anchoredTerm = root.getLastChild() and + anchoredTerm.getLastChild() instanceof RegExpDollar and + not containsTrailingPseudoAnchor(root.getChild([0 .. root.getNumChild() - 2])) and + containsLetters(root.getChild([0 .. root.getNumChild() - 2])) and + direction = "end" + ) and + // that is not used for string replacement + not exists(DataFlow::CallNode mcn, DataFlow::Node arg, string name | + name = mcn.getMethodName() and + arg = mcn.getArgument(0) + | + ( + src.getAParse().(DataFlow::LocalSourceNode).flowsTo(arg) or + src.getAParse() = arg + ) and + name = ["sub", "sub!", "gsub", "gsub!"] + ) and + msg = + "Misleading operator precedence. The subexpression '" + anchoredTerm.getRawValue() + + "' is anchored at the " + direction + + ", but the other parts of this regular expression are not" + ) +} + /** * Holds if `src` contains a hostname pattern that uses the `^/$` line anchors * rather than `\A/\z` which match the start/end of the whole string. */ predicate isLineAnchoredHostnameRegExp(RegExpPatternSource src, string msg) { - not isSemiAnchoredHostnameRegExp(src, msg) and // avoid double reporting + // avoid double reporting + not ( + isSemiAnchoredHostnameRegExp(src, _) or + hasMisleadingAnchorPrecedence(src, _) + ) and exists(RegExpTerm term, RegExpSequence tld, int i | term = src.getRegExpTerm() | not isConstantInvalidInsideOrigin(term.getAChild*()) and tld = term.getAChild*() and @@ -58,7 +204,7 @@ predicate isLineAnchoredHostnameRegExp(RegExpPatternSource src, string msg) { * Holds if `src` contains a hostname pattern that is missing a `$` anchor. */ predicate isSemiAnchoredHostnameRegExp(RegExpPatternSource src, string msg) { - // not hasMisleadingAnchorPrecedence(src, _) and // avoid double reporting + not hasMisleadingAnchorPrecedence(src, _) and // avoid double reporting exists(RegExpTerm term, RegExpSequence tld, int i | term = src.getRegExpTerm() | not isConstantInvalidInsideOrigin(term.getAChild*()) and tld = term.getAChild*() and @@ -81,7 +227,7 @@ predicate isUnanchoredHostnameRegExp(RegExpPatternSource src, string msg) { hasTopLevelDomainEnding(tld) and not isConstantInvalidInsideOrigin(term.getAChild*()) and not term.getAChild*() instanceof RegExpAnchor and - // that is not used for capture or replace + // that is not used for string replacement not exists(DataFlow::CallNode mcn, DataFlow::Node arg, string name | name = mcn.getMethodName() and arg = mcn.getArgument(0) @@ -101,5 +247,6 @@ from DataFlow::Node nd, string msg where isUnanchoredHostnameRegExp(nd, msg) or isSemiAnchoredHostnameRegExp(nd, msg) or - isLineAnchoredHostnameRegExp(nd, msg) + isLineAnchoredHostnameRegExp(nd, msg) or + hasMisleadingAnchorPrecedence(nd, msg) select nd, msg diff --git a/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.expected b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.expected index c4825104b7ee..9f5a8b11fc6b 100644 --- a/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.expected +++ b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.expected @@ -1,4 +1,18 @@ -| missing_regexp_anchor.rb:1:1:1:17 | /www.example.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. | -| missing_regexp_anchor.rb:7:1:7:21 | /https?:\\/\\/good.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. | -| missing_regexp_anchor.rb:8:1:8:22 | /^https?:\\/\\/good.com/ | This hostname pattern may match any domain name, as it is missing a '\\z' or '/' at the end. | -| missing_regexp_anchor.rb:8:1:8:22 | /^https?:\\/\\/good.com/ | This hostname pattern uses anchors such as '^' and '$', which match the start and end of a line, not the whole string. Use '\\A' and '\\z' instead. | +| missing_regexp_anchor.rb:1:1:1:19 | /www\\.example\\.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. | +| missing_regexp_anchor.rb:7:1:7:22 | /https?:\\/\\/good\\.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. | +| missing_regexp_anchor.rb:8:1:8:23 | /^https?:\\/\\/good\\.com/ | This hostname pattern may match any domain name, as it is missing a '\\z' or '/' at the end. | +| missing_regexp_anchor.rb:19:1:19:6 | /^a\|b/ | Misleading operator precedence. The subexpression '^a' is anchored at the beginning, but the other parts of this regular expression are not | +| missing_regexp_anchor.rb:22:1:22:8 | /^a\|b\|c/ | Misleading operator precedence. The subexpression '^a' is anchored at the beginning, but the other parts of this regular expression are not | +| missing_regexp_anchor.rb:28:1:28:8 | /^a\|(b)/ | Misleading operator precedence. The subexpression '^a' is anchored at the beginning, but the other parts of this regular expression are not | +| missing_regexp_anchor.rb:30:1:30:10 | /^(a)\|(b)/ | Misleading operator precedence. The subexpression '^(a)' is anchored at the beginning, but the other parts of this regular expression are not | +| missing_regexp_anchor.rb:33:1:33:6 | /a\|b$/ | Misleading operator precedence. The subexpression 'b$' is anchored at the end, but the other parts of this regular expression are not | +| missing_regexp_anchor.rb:36:1:36:8 | /a\|b\|c$/ | Misleading operator precedence. The subexpression 'c$' is anchored at the end, but the other parts of this regular expression are not | +| missing_regexp_anchor.rb:42:1:42:8 | /(a)\|b$/ | Misleading operator precedence. The subexpression 'b$' is anchored at the end, but the other parts of this regular expression are not | +| missing_regexp_anchor.rb:44:1:44:10 | /(a)\|(b)$/ | Misleading operator precedence. The subexpression '(b)$' is anchored at the end, but the other parts of this regular expression are not | +| missing_regexp_anchor.rb:46:1:46:22 | /^good.com\|better.com/ | Misleading operator precedence. The subexpression '^good.com' is anchored at the beginning, but the other parts of this regular expression are not | +| missing_regexp_anchor.rb:47:1:47:24 | /^good\\.com\|better\\.com/ | Misleading operator precedence. The subexpression '^good\\.com' is anchored at the beginning, but the other parts of this regular expression are not | +| missing_regexp_anchor.rb:48:1:48:26 | /^good\\\\.com\|better\\\\.com/ | Misleading operator precedence. The subexpression '^good\\\\.com' is anchored at the beginning, but the other parts of this regular expression are not | +| missing_regexp_anchor.rb:49:1:49:28 | /^good\\\\\\.com\|better\\\\\\.com/ | Misleading operator precedence. The subexpression '^good\\\\\\.com' is anchored at the beginning, but the other parts of this regular expression are not | +| missing_regexp_anchor.rb:50:1:50:30 | /^good\\\\\\\\.com\|better\\\\\\\\.com/ | Misleading operator precedence. The subexpression '^good\\\\\\\\.com' is anchored at the beginning, but the other parts of this regular expression are not | +| missing_regexp_anchor.rb:52:1:52:15 | /^foo\|bar\|baz$/ | Misleading operator precedence. The subexpression '^foo' is anchored at the beginning, but the other parts of this regular expression are not | +| missing_regexp_anchor.rb:52:1:52:15 | /^foo\|bar\|baz$/ | Misleading operator precedence. The subexpression 'baz$' is anchored at the end, but the other parts of this regular expression are not | diff --git a/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/missing_regexp_anchor.rb b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/missing_regexp_anchor.rb index e7c9ae6a6617..646a3574447c 100644 --- a/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/missing_regexp_anchor.rb +++ b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/missing_regexp_anchor.rb @@ -1,18 +1,53 @@ -/www.example.com/ # BAD -/^www.example.com$/ # BAD: uses end-of-line anchors rather than end-of-string anchors -/\Awww.example.com\z/ # GOOD +/www\.example\.com/ # BAD +/^www\.example\.com$/ # BAD: uses end-of-line anchors rather than end-of-string anchors +/\Awww\.example\.com\z/ # GOOD -/foo.bar/ # GOOD +/foo\.bar/ # GOOD -/https?:\/\/good.com/ # BAD -/^https?:\/\/good.com/ # BAD: missing end-of-string anchor -/(^https?:\/\/good1.com)|(^https?://good2.com)/ # BAD: missing end-of-string anchor +/https?:\/\/good\.com/ # BAD +/^https?:\/\/good\.com/ # BAD: missing end-of-string anchor +/(^https?:\/\/good1\.com)|(^https?:#good2\.com)/ # BAD: missing end-of-string anchor /bar/ # GOOD -foo.gsub(/www.example.com/, "bar") # GOOD -foo.sub(/www.example.com/, "bar") # GOOD -foo.gsub!(/www.example.com/, "bar") # GOOD -foo.sub!(/www.example.com/, "bar") # GOOD +foo.gsub(/www\.example\.com/, "bar") # GOOD +foo.sub(/www\.example.com/, "bar") # GOOD +foo.gsub!(/www\.example\.com/, "bar") # GOOD +foo.sub!(/www\.example\.com/, "bar") # GOOD +/^a|/ +/^a|b/ # BAD +/a|^b/ +/^a|^b/ +/^a|b|c/ # BAD +/a|^b|c/ +/a|b|^c/ +/^a|^b|c/ +/(^a)|b/ +/^a|(b)/ # BAD +/^a|(^b)/ +/^(a)|(b)/ # BAD + + +/a|b$/ # BAD +/a$|b/ +/a$|b$/ +/a|b|c$/ # BAD +/a|b$|c/ +/a$|b|c/ +/a|b$|c$/ + +/a|(b$)/ +/(a)|b$/ # BAD +/(a$)|b$/ +/(a)|(b)$/ # BAD + +/^good.com|better.com/ # BAD +/^good\.com|better\.com/ # BAD +/^good\\.com|better\\.com/ # BAD +/^good\\\.com|better\\\.com/ # BAD +/^good\\\\.com|better\\\\.com/ # BAD + +/^foo|bar|baz$/ # BAD +/^foo|%/ # OK \ No newline at end of file From 6f9dc5eb7ee70a715c90e3b3974fd4d107aaeff7 Mon Sep 17 00:00:00 2001 From: Harry Maclean Date: Mon, 4 Apr 2022 11:42:57 +1200 Subject: [PATCH 6/8] Ruby: Update import for file move --- ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql index d06ef7efa9b3..45653af568dc 100644 --- a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql +++ b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql @@ -13,7 +13,8 @@ import HostnameRegexpShared import codeql.ruby.DataFlow -import codeql.ruby.security.performance.RegExpTreeView +import codeql.ruby.regexp.RegExpTreeView +import codeql.ruby.Regexp /** * Holds if `term` is a final term, that is, no term will match anything after this one. From af2965c2a0db9a48a2c66de5f2c73542646acacb Mon Sep 17 00:00:00 2001 From: Harry Maclean Date: Mon, 4 Apr 2022 14:26:38 +1200 Subject: [PATCH 7/8] Explain anchors in MissingRegExpAnchor qlhelp --- .../security/cwe-020/MissingRegExpAnchor.qhelp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp index eac017305321..838cfb728ab4 100644 --- a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp +++ b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp @@ -8,8 +8,8 @@ Sanitizing untrusted input with regular expressions is a common technique. However, it is error-prone to match untrusted input - against regular expressions without anchors such as ^ or - $. Malicious input can bypass such security checks by + against regular expressions without anchors such as \A or + \z. Malicious input can bypass such security checks by embedding one of the allowed patterns in an unexpected location.

    @@ -68,10 +68,17 @@

    + In Ruby the anchors ^ and $ match the + start and end of a line, whereas the anchors \A and + \z match the start and end of the entire string. - TODO: describe the danger of using line anchors like ^ - or $. + Using line anchors can be dangerous, as this can allow malicious + input to be hidden using newlines, leading to vulnerabilities such + as HTTP header injection. + Unless you specifically need the line-matching behaviour of + ^ and $, you should use \A + and \z instead.

    From bbc3043836ab795c9c84141d1acf9df653cf71cc Mon Sep 17 00:00:00 2001 From: Harry Maclean Date: Thu, 14 Apr 2022 15:44:30 +1200 Subject: [PATCH 8/8] Add change note for rb/regex/missing-regexp-anchor --- .../src/change-notes/2022-04-14-rb-missing-regexp-anchor.md | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 ruby/ql/src/change-notes/2022-04-14-rb-missing-regexp-anchor.md diff --git a/ruby/ql/src/change-notes/2022-04-14-rb-missing-regexp-anchor.md b/ruby/ql/src/change-notes/2022-04-14-rb-missing-regexp-anchor.md new file mode 100644 index 000000000000..7f18700d693f --- /dev/null +++ b/ruby/ql/src/change-notes/2022-04-14-rb-missing-regexp-anchor.md @@ -0,0 +1,4 @@ +--- +category: newQuery +--- +* Added a new query, `rb/regex/missing-regexp-anchor`, which finds regular expressions which are improperly anchored. Validations using such expressions are at risk of being bypassed. \ No newline at end of file