From d95f533d196f6419989ac10136328d3d6200b027 Mon Sep 17 00:00:00 2001
From: Harry Maclean
Date: Mon, 28 Mar 2022 11:18:55 +1300
Subject: [PATCH 1/8] Ruby: Add getLastChild to RegExpParent
---
ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll | 3 +++
1 file changed, 3 insertions(+)
diff --git a/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll b/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll
index 32f84fb16f66..42feb5b27f5a 100644
--- a/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll
+++ b/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll
@@ -63,6 +63,9 @@ class RegExpParent extends TRegExpParent {
/** Gets the number of child terms. */
int getNumChild() { result = count(this.getAChild()) }
+ /** Gets the last child term of this element. */
+ RegExpTerm getLastChild() { result = this.getChild(this.getNumChild() - 1) }
+
/**
* Gets the name of a primary CodeQL class to which this regular
* expression term belongs.
From debc57b417d87ddbaea2083c0f4c5ab358f6f64a Mon Sep 17 00:00:00 2001
From: Harry Maclean
Date: Mon, 28 Mar 2022 11:19:30 +1300
Subject: [PATCH 2/8] Ruby: Add RegExpAnchor to RegExpTreeView
---
.../lib/codeql/ruby/regexp/RegExpTreeView.qll | 19 +++++++++++++++++--
1 file changed, 17 insertions(+), 2 deletions(-)
diff --git a/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll b/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll
index 42feb5b27f5a..872aebf938d2 100644
--- a/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll
+++ b/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll
@@ -860,6 +860,21 @@ class RegExpDot extends RegExpSpecialChar {
override string getAPrimaryQlClass() { result = "RegExpDot" }
}
+/**
+ * A term that matches a specific position between characters in the string.
+ *
+ * Example:
+ *
+ * ```
+ * \A
+ * ```
+ */
+class RegExpAnchor extends RegExpSpecialChar {
+ RegExpAnchor() { this.getChar() = ["^", "$", "\\A", "\\Z", "\\z"] }
+
+ override string getAPrimaryQlClass() { result = "RegExpAnchor" }
+}
+
/**
* A dollar assertion `$` or `\Z` matching the end of a line.
*
@@ -869,7 +884,7 @@ class RegExpDot extends RegExpSpecialChar {
* $
* ```
*/
-class RegExpDollar extends RegExpSpecialChar {
+class RegExpDollar extends RegExpAnchor {
RegExpDollar() { this.getChar() = ["$", "\\Z", "\\z"] }
override string getAPrimaryQlClass() { result = "RegExpDollar" }
@@ -884,7 +899,7 @@ class RegExpDollar extends RegExpSpecialChar {
* ^
* ```
*/
-class RegExpCaret extends RegExpSpecialChar {
+class RegExpCaret extends RegExpAnchor {
RegExpCaret() { this.getChar() = ["^", "\\A"] }
override string getAPrimaryQlClass() { result = "RegExpCaret" }
From e3c3c00c684440cdb8165d9582847f14f92282b1 Mon Sep 17 00:00:00 2001
From: Harry Maclean
Date: Mon, 28 Mar 2022 11:20:14 +1300
Subject: [PATCH 3/8] Ruby: Add MissingRegExpAnchor query
---
.../cwe-020/MissingRegExpAnchor.qhelp | 85 ++++++++++++++
.../security/cwe-020/MissingRegExpAnchor.ql | 105 ++++++++++++++++++
.../examples/missing_regexp_anchor_bad.rb | 8 ++
.../examples/missing_regexp_anchor_good.rb | 8 ++
.../MissingRegExpAnchor.expected | 4 +
.../MissingRegExpAnchor.qlref | 1 +
.../missing_regexp_anchor.rb | 18 +++
7 files changed, 229 insertions(+)
create mode 100644 ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp
create mode 100644 ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql
create mode 100644 ruby/ql/src/queries/security/cwe-020/examples/missing_regexp_anchor_bad.rb
create mode 100644 ruby/ql/src/queries/security/cwe-020/examples/missing_regexp_anchor_good.rb
create mode 100644 ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.expected
create mode 100644 ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.qlref
create mode 100644 ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/missing_regexp_anchor.rb
diff --git a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp
new file mode 100644
index 000000000000..cb2cddb6573c
--- /dev/null
+++ b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp
@@ -0,0 +1,85 @@
+
+
+
+
+
+
+ Sanitizing untrusted input with regular expressions is a
+ common technique. However, it is error-prone to match untrusted input
+ against regular expressions without anchors such as ^
or
+ $
. Malicious input can bypass such security checks by
+ embedding one of the allowed patterns in an unexpected location.
+
+
+
+
+
+ Even if the matching is not done in a security-critical
+ context, it may still cause undesirable behavior when the regular
+ expression accidentally matches.
+
+
+
+
+
+
+
+ Use anchors to ensure that regular expressions match at
+ the expected locations.
+
+
+
+
+
+
+
+
+ The following example code checks that a URL redirection
+ will reach the example.com
domain, or one of its
+ subdomains, and not some malicious site.
+
+
+
+
+
+
+
+ The check with the regular expression match is, however, easy to bypass. For example
+ by embedding http://example.com/
in the query
+ string component: http://evil-example.net/?x=http://example.com/
.
+
+ Address these shortcomings by using anchors in the regular expression instead:
+
+
+
+
+
+
+
+ A related mistake is to write a regular expression with
+ multiple alternatives, but to only include an anchor for one of the
+ alternatives. As an example, the regular expression
+ /^www\.example\.com|beta\.example\.com/
will match the host
+ evil.beta.example.com
because the regular expression is parsed
+ as /(^www\.example\.com)|(beta\.example\.com)/
+
+ TODO: implement this part of the query
+
+
+
+
+
+ TODO: describe the danger of using line anchors like ^
+ or $
.
+
+
+
+
+
+
+ OWASP: SSRF
+ OWASP: XSS Unvalidated Redirects and Forwards Cheat Sheet.
+
+
diff --git a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql
new file mode 100644
index 000000000000..3c67b83ac610
--- /dev/null
+++ b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql
@@ -0,0 +1,105 @@
+/**
+ * @name Missing regular expression anchor
+ * @description Regular expressions without anchors can be vulnerable to bypassing.
+ * @kind problem
+ * @problem.severity warning
+ * @security-severity 7.8
+ * @precision medium
+ * @id rb/regex/missing-regexp-anchor
+ * @tags correctness
+ * security
+ * external/cwe/cwe-020
+ */
+
+import HostnameRegexpShared
+import codeql.ruby.DataFlow
+import codeql.ruby.security.performance.RegExpTreeView
+
+/**
+ * Holds if `term` is a final term, that is, no term will match anything after this one.
+ */
+predicate isFinalRegExpTerm(RegExpTerm term) {
+ term.isRootTerm()
+ or
+ exists(RegExpSequence seq |
+ isFinalRegExpTerm(seq) and
+ term = seq.getLastChild()
+ )
+ or
+ exists(RegExpTerm parent |
+ isFinalRegExpTerm(parent) and
+ term = parent.getAChild() and
+ not parent instanceof RegExpSequence and
+ not parent instanceof RegExpQuantifier
+ )
+}
+
+/**
+ * Holds if `src` contains a hostname pattern that uses the `^/$` line anchors
+ * rather than `\A/\z` which match the start/end of the whole string.
+ */
+predicate isLineAnchoredHostnameRegExp(RegExpPatternSource src, string msg) {
+ not isSemiAnchoredHostnameRegExp(src, msg) and // avoid double reporting
+ exists(RegExpTerm term, RegExpSequence tld, int i | term = src.getRegExpTerm() |
+ not isConstantInvalidInsideOrigin(term.getAChild*()) and
+ tld = term.getAChild*() and
+ hasTopLevelDomainEnding(tld, i) and
+ isFinalRegExpTerm(tld.getChild(i)) and // nothing is matched after the TLD
+ (
+ tld.getChild(0).(RegExpCaret).getChar() = "^" or
+ tld.getLastChild().(RegExpDollar).getChar() = "$"
+ ) and
+ msg =
+ "This hostname pattern uses anchors such as '^' and '$', which match the start and end of a line, not the whole string. Use '\\A' and '\\z' instead."
+ )
+}
+
+/**
+ * Holds if `src` contains a hostname pattern that is missing a `$` anchor.
+ */
+predicate isSemiAnchoredHostnameRegExp(RegExpPatternSource src, string msg) {
+ // not hasMisleadingAnchorPrecedence(src, _) and // avoid double reporting
+ exists(RegExpTerm term, RegExpSequence tld, int i | term = src.getRegExpTerm() |
+ not isConstantInvalidInsideOrigin(term.getAChild*()) and
+ tld = term.getAChild*() and
+ hasTopLevelDomainEnding(tld, i) and
+ isFinalRegExpTerm(tld.getChild(i)) and // nothing is matched after the TLD
+ tld.getChild(0) instanceof RegExpCaret and
+ msg =
+ "This hostname pattern may match any domain name, as it is missing a '\\z' or '/' at the end."
+ )
+}
+
+/**
+ * Holds if `src` is an unanchored pattern for a URL, indicating a
+ * mistake explained by `msg`.
+ */
+predicate isUnanchoredHostnameRegExp(RegExpPatternSource src, string msg) {
+ exists(RegExpTerm term, RegExpSequence tld | term = src.getRegExpTerm() |
+ alwaysMatchesHostname(term) and
+ tld = term.getAChild*() and
+ hasTopLevelDomainEnding(tld) and
+ not isConstantInvalidInsideOrigin(term.getAChild*()) and
+ not term.getAChild*() instanceof RegExpAnchor and
+ // that is not used for capture or replace
+ not exists(DataFlow::CallNode mcn, DataFlow::Node arg, string name |
+ name = mcn.getMethodName() and
+ arg = mcn.getArgument(0)
+ |
+ (
+ src.getAParse().(DataFlow::LocalSourceNode).flowsTo(arg) or
+ src.getAParse() = arg
+ ) and
+ name = ["sub", "sub!", "gsub", "gsub!"]
+ ) and
+ msg =
+ "When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it."
+ )
+}
+
+from DataFlow::Node nd, string msg
+where
+ isUnanchoredHostnameRegExp(nd, msg) or
+ isSemiAnchoredHostnameRegExp(nd, msg) or
+ isLineAnchoredHostnameRegExp(nd, msg)
+select nd, msg
diff --git a/ruby/ql/src/queries/security/cwe-020/examples/missing_regexp_anchor_bad.rb b/ruby/ql/src/queries/security/cwe-020/examples/missing_regexp_anchor_bad.rb
new file mode 100644
index 000000000000..624467cc40a3
--- /dev/null
+++ b/ruby/ql/src/queries/security/cwe-020/examples/missing_regexp_anchor_bad.rb
@@ -0,0 +1,8 @@
+class UsersController < ActionController::Base
+ def index
+ # BAD: the host of `params[:url]` may be controlled by an attacker
+ if params[:url].match? /https?:\/\/www\.example\.com\//
+ redirect_to params[:url]
+ end
+ end
+end
\ No newline at end of file
diff --git a/ruby/ql/src/queries/security/cwe-020/examples/missing_regexp_anchor_good.rb b/ruby/ql/src/queries/security/cwe-020/examples/missing_regexp_anchor_good.rb
new file mode 100644
index 000000000000..4443e111f8c7
--- /dev/null
+++ b/ruby/ql/src/queries/security/cwe-020/examples/missing_regexp_anchor_good.rb
@@ -0,0 +1,8 @@
+class UsersController < ActionController::Base
+ def index
+ # GOOD: the host of `params[:url]` can not be controlled by an attacker
+ if params[:url].match? /\Ahttps?:\/\/www\.example\.com\//
+ redirect_to params[:url]
+ end
+ end
+end
\ No newline at end of file
diff --git a/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.expected b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.expected
new file mode 100644
index 000000000000..c4825104b7ee
--- /dev/null
+++ b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.expected
@@ -0,0 +1,4 @@
+| missing_regexp_anchor.rb:1:1:1:17 | /www.example.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
+| missing_regexp_anchor.rb:7:1:7:21 | /https?:\\/\\/good.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
+| missing_regexp_anchor.rb:8:1:8:22 | /^https?:\\/\\/good.com/ | This hostname pattern may match any domain name, as it is missing a '\\z' or '/' at the end. |
+| missing_regexp_anchor.rb:8:1:8:22 | /^https?:\\/\\/good.com/ | This hostname pattern uses anchors such as '^' and '$', which match the start and end of a line, not the whole string. Use '\\A' and '\\z' instead. |
diff --git a/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.qlref b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.qlref
new file mode 100644
index 000000000000..bd3ad563aec1
--- /dev/null
+++ b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.qlref
@@ -0,0 +1 @@
+queries/security/cwe-020/MissingRegExpAnchor.ql
\ No newline at end of file
diff --git a/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/missing_regexp_anchor.rb b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/missing_regexp_anchor.rb
new file mode 100644
index 000000000000..e7c9ae6a6617
--- /dev/null
+++ b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/missing_regexp_anchor.rb
@@ -0,0 +1,18 @@
+/www.example.com/ # BAD
+/^www.example.com$/ # BAD: uses end-of-line anchors rather than end-of-string anchors
+/\Awww.example.com\z/ # GOOD
+
+/foo.bar/ # GOOD
+
+/https?:\/\/good.com/ # BAD
+/^https?:\/\/good.com/ # BAD: missing end-of-string anchor
+/(^https?:\/\/good1.com)|(^https?://good2.com)/ # BAD: missing end-of-string anchor
+
+/bar/ # GOOD
+
+foo.gsub(/www.example.com/, "bar") # GOOD
+foo.sub(/www.example.com/, "bar") # GOOD
+foo.gsub!(/www.example.com/, "bar") # GOOD
+foo.sub!(/www.example.com/, "bar") # GOOD
+
+
From 3f8b27c0cdc796b4a7e8e35bc83c3a5acc270b57 Mon Sep 17 00:00:00 2001
From: Harry Maclean
Date: Mon, 28 Mar 2022 17:04:31 +1300
Subject: [PATCH 4/8] Ruby: Add RegExpNonWordBoundary to RegExpTreeView
---
ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll | 9 +++++++++
ruby/ql/test/library-tests/regexp/parse.expected | 4 ++--
ruby/ql/test/library-tests/regexp/regexp.expected | 2 +-
3 files changed, 12 insertions(+), 3 deletions(-)
diff --git a/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll b/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll
index 872aebf938d2..ac88df8de232 100644
--- a/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll
+++ b/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll
@@ -581,6 +581,15 @@ class RegExpWordBoundary extends RegExpSpecialChar {
RegExpWordBoundary() { this.getChar() = "\\b" }
}
+/**
+ * A non-word boundary, that is, a regular expression term of the form `\B`.
+ */
+class RegExpNonWordBoundary extends RegExpSpecialChar {
+ RegExpNonWordBoundary() { this.getChar() = "\\B" }
+
+ override string getAPrimaryQlClass() { result = "RegExpNonWordBoundary" }
+}
+
/**
* A character class escape in a regular expression.
* That is, an escaped character that denotes multiple characters.
diff --git a/ruby/ql/test/library-tests/regexp/parse.expected b/ruby/ql/test/library-tests/regexp/parse.expected
index 9256fefd67a8..3df78cdb4fbf 100644
--- a/ruby/ql/test/library-tests/regexp/parse.expected
+++ b/ruby/ql/test/library-tests/regexp/parse.expected
@@ -266,11 +266,11 @@ regexp.rb:
# 43| [RegExpSequence] \b!a\B
#-----| 0 -> [RegExpSpecialChar] \b
#-----| 1 -> [RegExpConstant, RegExpNormalChar] !a
-#-----| 2 -> [RegExpSpecialChar] \B
+#-----| 2 -> [RegExpNonWordBoundary] \B
# 43| [RegExpConstant, RegExpNormalChar] !a
-# 43| [RegExpSpecialChar] \B
+# 43| [RegExpNonWordBoundary] \B
# 46| [RegExpGroup] (foo)
#-----| 0 -> [RegExpConstant, RegExpNormalChar] foo
diff --git a/ruby/ql/test/library-tests/regexp/regexp.expected b/ruby/ql/test/library-tests/regexp/regexp.expected
index 4d1837bedea2..e464b5ce5ea5 100644
--- a/ruby/ql/test/library-tests/regexp/regexp.expected
+++ b/ruby/ql/test/library-tests/regexp/regexp.expected
@@ -108,7 +108,7 @@ term
| regexp.rb:43:2:43:3 | \\b | RegExpSpecialChar |
| regexp.rb:43:2:43:7 | \\b!a\\B | RegExpSequence |
| regexp.rb:43:4:43:5 | !a | RegExpConstant,RegExpNormalChar |
-| regexp.rb:43:6:43:7 | \\B | RegExpSpecialChar |
+| regexp.rb:43:6:43:7 | \\B | RegExpNonWordBoundary |
| regexp.rb:46:2:46:6 | (foo) | RegExpGroup |
| regexp.rb:46:2:46:7 | (foo)* | RegExpStar |
| regexp.rb:46:2:46:10 | (foo)*bar | RegExpSequence |
From 2feb4a48beb8c67ca738a90938394fdab2baea74 Mon Sep 17 00:00:00 2001
From: Harry Maclean
Date: Mon, 28 Mar 2022 17:08:42 +1300
Subject: [PATCH 5/8] Ruby: Add hasMisleadingAnchorPrecedence to
MissingRegExpAnchor
---
.../cwe-020/MissingRegExpAnchor.qhelp | 2 -
.../security/cwe-020/MissingRegExpAnchor.ql | 155 +++++++++++++++++-
.../MissingRegExpAnchor.expected | 22 ++-
.../missing_regexp_anchor.rb | 57 +++++--
4 files changed, 215 insertions(+), 21 deletions(-)
diff --git a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp
index cb2cddb6573c..eac017305321 100644
--- a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp
+++ b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp
@@ -65,8 +65,6 @@
evil.beta.example.com
because the regular expression is parsed
as /(^www\.example\.com)|(beta\.example\.com)/
- TODO: implement this part of the query
-
diff --git a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql
index 3c67b83ac610..d06ef7efa9b3 100644
--- a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql
+++ b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql
@@ -34,12 +34,158 @@ predicate isFinalRegExpTerm(RegExpTerm term) {
)
}
+/** Holds if `term` is one of the transitive left children of a regexp. */
+predicate isLeftArmTerm(RegExpTerm term) {
+ term.isRootTerm()
+ or
+ exists(RegExpTerm parent |
+ term = parent.getChild(0) and
+ isLeftArmTerm(parent)
+ )
+}
+
+/** Holds if `term` is one of the transitive right children of a regexp. */
+predicate isRightArmTerm(RegExpTerm term) {
+ term.isRootTerm()
+ or
+ exists(RegExpTerm parent |
+ term = parent.getLastChild() and
+ isRightArmTerm(parent)
+ )
+}
+
+/**
+ * Holds if `term` is an anchor that is not the first or last node
+ * in its tree.
+ */
+predicate isInteriorAnchor(RegExpAnchor term) {
+ not isLeftArmTerm(term) and
+ not isRightArmTerm(term)
+}
+
+/**
+ * Holds if `term` contains an anchor that is not the first or last node
+ * in its tree, such as `(foo|bar$|baz)`.
+ */
+predicate containsInteriorAnchor(RegExpTerm term) { isInteriorAnchor(term.getAChild*()) }
+
+/**
+ * Holds if `term` starts with a word boundary or lookbehind assertion,
+ * indicating that it's not intended to be anchored on that side.
+ */
+predicate containsLeadingPseudoAnchor(RegExpSequence term) {
+ exists(RegExpTerm child | child = term.getChild(0) |
+ child instanceof RegExpWordBoundary or
+ child instanceof RegExpNonWordBoundary or
+ child instanceof RegExpLookbehind
+ )
+}
+
+/**
+ * Holds if `term` ends with a word boundary or lookahead assertion,
+ * indicating that it's not intended to be anchored on that side.
+ */
+predicate containsTrailingPseudoAnchor(RegExpSequence term) {
+ exists(RegExpTerm child | child = term.getLastChild() |
+ child instanceof RegExpWordBoundary or
+ child instanceof RegExpNonWordBoundary or
+ child instanceof RegExpLookahead
+ )
+}
+
+/**
+ * Holds if `term` is an empty sequence, usually arising from
+ * literals with a trailing alternative such as `foo|`.
+ */
+predicate isEmpty(RegExpSequence term) { term.getNumChild() = 0 }
+
+/**
+ * Holds if `term` contains a letter constant.
+ *
+ * We use this as a heuristic to filter out uninteresting results.
+ */
+predicate containsLetters(RegExpTerm term) {
+ term.getAChild*().(RegExpConstant).getValue().regexpMatch(".*[a-zA-Z].*")
+}
+
+/**
+ * Holds if `alt` has an explicitly anchored group, such as `^(foo|bar)|baz`
+ * and doesn't have any unnecessary groups, such as in `^(foo)|(bar)`.
+ */
+predicate hasExplicitAnchorPrecedence(RegExpAlt alt) {
+ isAnchoredGroup(alt.getAChild()) and
+ not alt.getAChild() instanceof RegExpGroup
+}
+
+/**
+ * Holds if `term` consists only of an anchor and a parenthesized term,
+ * such as the left side of `^(foo|bar)|baz`.
+ *
+ * The precedence of the anchor is likely to be intentional in this case,
+ * as the group wouldn't be needed otherwise.
+ */
+predicate isAnchoredGroup(RegExpSequence term) {
+ term.getNumChild() = 2 and
+ term.getAChild() instanceof RegExpAnchor and
+ term.getAChild() instanceof RegExpGroup
+}
+
+/**
+ * Holds if `src` is a pattern for a collection of alternatives where
+ * only the first or last alternative is anchored, indicating a
+ * precedence mistake explained by `msg`.
+ *
+ * The canonical example of such a mistake is: `^a|b|c`, which is
+ * parsed as `(^a)|(b)|(c)`.
+ */
+predicate hasMisleadingAnchorPrecedence(RegExpPatternSource src, string msg) {
+ exists(RegExpAlt root, RegExpSequence anchoredTerm, string direction |
+ root = src.getRegExpTerm() and
+ not containsInteriorAnchor(root) and
+ not isEmpty(root.getAChild()) and
+ not hasExplicitAnchorPrecedence(root) and
+ containsLetters(anchoredTerm) and
+ (
+ anchoredTerm = root.getChild(0) and
+ anchoredTerm.getChild(0) instanceof RegExpCaret and
+ not containsLeadingPseudoAnchor(root.getChild([1 .. root.getNumChild() - 1])) and
+ containsLetters(root.getChild([1 .. root.getNumChild() - 1])) and
+ direction = "beginning"
+ or
+ anchoredTerm = root.getLastChild() and
+ anchoredTerm.getLastChild() instanceof RegExpDollar and
+ not containsTrailingPseudoAnchor(root.getChild([0 .. root.getNumChild() - 2])) and
+ containsLetters(root.getChild([0 .. root.getNumChild() - 2])) and
+ direction = "end"
+ ) and
+ // that is not used for string replacement
+ not exists(DataFlow::CallNode mcn, DataFlow::Node arg, string name |
+ name = mcn.getMethodName() and
+ arg = mcn.getArgument(0)
+ |
+ (
+ src.getAParse().(DataFlow::LocalSourceNode).flowsTo(arg) or
+ src.getAParse() = arg
+ ) and
+ name = ["sub", "sub!", "gsub", "gsub!"]
+ ) and
+ msg =
+ "Misleading operator precedence. The subexpression '" + anchoredTerm.getRawValue() +
+ "' is anchored at the " + direction +
+ ", but the other parts of this regular expression are not"
+ )
+}
+
/**
* Holds if `src` contains a hostname pattern that uses the `^/$` line anchors
* rather than `\A/\z` which match the start/end of the whole string.
*/
predicate isLineAnchoredHostnameRegExp(RegExpPatternSource src, string msg) {
- not isSemiAnchoredHostnameRegExp(src, msg) and // avoid double reporting
+ // avoid double reporting
+ not (
+ isSemiAnchoredHostnameRegExp(src, _) or
+ hasMisleadingAnchorPrecedence(src, _)
+ ) and
exists(RegExpTerm term, RegExpSequence tld, int i | term = src.getRegExpTerm() |
not isConstantInvalidInsideOrigin(term.getAChild*()) and
tld = term.getAChild*() and
@@ -58,7 +204,7 @@ predicate isLineAnchoredHostnameRegExp(RegExpPatternSource src, string msg) {
* Holds if `src` contains a hostname pattern that is missing a `$` anchor.
*/
predicate isSemiAnchoredHostnameRegExp(RegExpPatternSource src, string msg) {
- // not hasMisleadingAnchorPrecedence(src, _) and // avoid double reporting
+ not hasMisleadingAnchorPrecedence(src, _) and // avoid double reporting
exists(RegExpTerm term, RegExpSequence tld, int i | term = src.getRegExpTerm() |
not isConstantInvalidInsideOrigin(term.getAChild*()) and
tld = term.getAChild*() and
@@ -81,7 +227,7 @@ predicate isUnanchoredHostnameRegExp(RegExpPatternSource src, string msg) {
hasTopLevelDomainEnding(tld) and
not isConstantInvalidInsideOrigin(term.getAChild*()) and
not term.getAChild*() instanceof RegExpAnchor and
- // that is not used for capture or replace
+ // that is not used for string replacement
not exists(DataFlow::CallNode mcn, DataFlow::Node arg, string name |
name = mcn.getMethodName() and
arg = mcn.getArgument(0)
@@ -101,5 +247,6 @@ from DataFlow::Node nd, string msg
where
isUnanchoredHostnameRegExp(nd, msg) or
isSemiAnchoredHostnameRegExp(nd, msg) or
- isLineAnchoredHostnameRegExp(nd, msg)
+ isLineAnchoredHostnameRegExp(nd, msg) or
+ hasMisleadingAnchorPrecedence(nd, msg)
select nd, msg
diff --git a/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.expected b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.expected
index c4825104b7ee..9f5a8b11fc6b 100644
--- a/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.expected
+++ b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.expected
@@ -1,4 +1,18 @@
-| missing_regexp_anchor.rb:1:1:1:17 | /www.example.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
-| missing_regexp_anchor.rb:7:1:7:21 | /https?:\\/\\/good.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
-| missing_regexp_anchor.rb:8:1:8:22 | /^https?:\\/\\/good.com/ | This hostname pattern may match any domain name, as it is missing a '\\z' or '/' at the end. |
-| missing_regexp_anchor.rb:8:1:8:22 | /^https?:\\/\\/good.com/ | This hostname pattern uses anchors such as '^' and '$', which match the start and end of a line, not the whole string. Use '\\A' and '\\z' instead. |
+| missing_regexp_anchor.rb:1:1:1:19 | /www\\.example\\.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
+| missing_regexp_anchor.rb:7:1:7:22 | /https?:\\/\\/good\\.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
+| missing_regexp_anchor.rb:8:1:8:23 | /^https?:\\/\\/good\\.com/ | This hostname pattern may match any domain name, as it is missing a '\\z' or '/' at the end. |
+| missing_regexp_anchor.rb:19:1:19:6 | /^a\|b/ | Misleading operator precedence. The subexpression '^a' is anchored at the beginning, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:22:1:22:8 | /^a\|b\|c/ | Misleading operator precedence. The subexpression '^a' is anchored at the beginning, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:28:1:28:8 | /^a\|(b)/ | Misleading operator precedence. The subexpression '^a' is anchored at the beginning, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:30:1:30:10 | /^(a)\|(b)/ | Misleading operator precedence. The subexpression '^(a)' is anchored at the beginning, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:33:1:33:6 | /a\|b$/ | Misleading operator precedence. The subexpression 'b$' is anchored at the end, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:36:1:36:8 | /a\|b\|c$/ | Misleading operator precedence. The subexpression 'c$' is anchored at the end, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:42:1:42:8 | /(a)\|b$/ | Misleading operator precedence. The subexpression 'b$' is anchored at the end, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:44:1:44:10 | /(a)\|(b)$/ | Misleading operator precedence. The subexpression '(b)$' is anchored at the end, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:46:1:46:22 | /^good.com\|better.com/ | Misleading operator precedence. The subexpression '^good.com' is anchored at the beginning, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:47:1:47:24 | /^good\\.com\|better\\.com/ | Misleading operator precedence. The subexpression '^good\\.com' is anchored at the beginning, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:48:1:48:26 | /^good\\\\.com\|better\\\\.com/ | Misleading operator precedence. The subexpression '^good\\\\.com' is anchored at the beginning, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:49:1:49:28 | /^good\\\\\\.com\|better\\\\\\.com/ | Misleading operator precedence. The subexpression '^good\\\\\\.com' is anchored at the beginning, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:50:1:50:30 | /^good\\\\\\\\.com\|better\\\\\\\\.com/ | Misleading operator precedence. The subexpression '^good\\\\\\\\.com' is anchored at the beginning, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:52:1:52:15 | /^foo\|bar\|baz$/ | Misleading operator precedence. The subexpression '^foo' is anchored at the beginning, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:52:1:52:15 | /^foo\|bar\|baz$/ | Misleading operator precedence. The subexpression 'baz$' is anchored at the end, but the other parts of this regular expression are not |
diff --git a/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/missing_regexp_anchor.rb b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/missing_regexp_anchor.rb
index e7c9ae6a6617..646a3574447c 100644
--- a/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/missing_regexp_anchor.rb
+++ b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/missing_regexp_anchor.rb
@@ -1,18 +1,53 @@
-/www.example.com/ # BAD
-/^www.example.com$/ # BAD: uses end-of-line anchors rather than end-of-string anchors
-/\Awww.example.com\z/ # GOOD
+/www\.example\.com/ # BAD
+/^www\.example\.com$/ # BAD: uses end-of-line anchors rather than end-of-string anchors
+/\Awww\.example\.com\z/ # GOOD
-/foo.bar/ # GOOD
+/foo\.bar/ # GOOD
-/https?:\/\/good.com/ # BAD
-/^https?:\/\/good.com/ # BAD: missing end-of-string anchor
-/(^https?:\/\/good1.com)|(^https?://good2.com)/ # BAD: missing end-of-string anchor
+/https?:\/\/good\.com/ # BAD
+/^https?:\/\/good\.com/ # BAD: missing end-of-string anchor
+/(^https?:\/\/good1\.com)|(^https?:#good2\.com)/ # BAD: missing end-of-string anchor
/bar/ # GOOD
-foo.gsub(/www.example.com/, "bar") # GOOD
-foo.sub(/www.example.com/, "bar") # GOOD
-foo.gsub!(/www.example.com/, "bar") # GOOD
-foo.sub!(/www.example.com/, "bar") # GOOD
+foo.gsub(/www\.example\.com/, "bar") # GOOD
+foo.sub(/www\.example.com/, "bar") # GOOD
+foo.gsub!(/www\.example\.com/, "bar") # GOOD
+foo.sub!(/www\.example\.com/, "bar") # GOOD
+/^a|/
+/^a|b/ # BAD
+/a|^b/
+/^a|^b/
+/^a|b|c/ # BAD
+/a|^b|c/
+/a|b|^c/
+/^a|^b|c/
+/(^a)|b/
+/^a|(b)/ # BAD
+/^a|(^b)/
+/^(a)|(b)/ # BAD
+
+
+/a|b$/ # BAD
+/a$|b/
+/a$|b$/
+/a|b|c$/ # BAD
+/a|b$|c/
+/a$|b|c/
+/a|b$|c$/
+
+/a|(b$)/
+/(a)|b$/ # BAD
+/(a$)|b$/
+/(a)|(b)$/ # BAD
+
+/^good.com|better.com/ # BAD
+/^good\.com|better\.com/ # BAD
+/^good\\.com|better\\.com/ # BAD
+/^good\\\.com|better\\\.com/ # BAD
+/^good\\\\.com|better\\\\.com/ # BAD
+
+/^foo|bar|baz$/ # BAD
+/^foo|%/ # OK
\ No newline at end of file
From 6f9dc5eb7ee70a715c90e3b3974fd4d107aaeff7 Mon Sep 17 00:00:00 2001
From: Harry Maclean
Date: Mon, 4 Apr 2022 11:42:57 +1200
Subject: [PATCH 6/8] Ruby: Update import for file move
---
ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql
index d06ef7efa9b3..45653af568dc 100644
--- a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql
+++ b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql
@@ -13,7 +13,8 @@
import HostnameRegexpShared
import codeql.ruby.DataFlow
-import codeql.ruby.security.performance.RegExpTreeView
+import codeql.ruby.regexp.RegExpTreeView
+import codeql.ruby.Regexp
/**
* Holds if `term` is a final term, that is, no term will match anything after this one.
From af2965c2a0db9a48a2c66de5f2c73542646acacb Mon Sep 17 00:00:00 2001
From: Harry Maclean
Date: Mon, 4 Apr 2022 14:26:38 +1200
Subject: [PATCH 7/8] Explain anchors in MissingRegExpAnchor qlhelp
---
.../security/cwe-020/MissingRegExpAnchor.qhelp | 15 +++++++++++----
1 file changed, 11 insertions(+), 4 deletions(-)
diff --git a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp
index eac017305321..838cfb728ab4 100644
--- a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp
+++ b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp
@@ -8,8 +8,8 @@
Sanitizing untrusted input with regular expressions is a
common technique. However, it is error-prone to match untrusted input
- against regular expressions without anchors such as ^
or
- $
. Malicious input can bypass such security checks by
+ against regular expressions without anchors such as \A
or
+ \z
. Malicious input can bypass such security checks by
embedding one of the allowed patterns in an unexpected location.
@@ -68,10 +68,17 @@
+ In Ruby the anchors ^
and $
match the
+ start and end of a line, whereas the anchors \A
and
+ \z
match the start and end of the entire string.
- TODO: describe the danger of using line anchors like ^
- or $
.
+ Using line anchors can be dangerous, as this can allow malicious
+ input to be hidden using newlines, leading to vulnerabilities such
+ as HTTP header injection.
+ Unless you specifically need the line-matching behaviour of
+ ^
and $
, you should use \A
+ and \z
instead.
From bbc3043836ab795c9c84141d1acf9df653cf71cc Mon Sep 17 00:00:00 2001
From: Harry Maclean
Date: Thu, 14 Apr 2022 15:44:30 +1200
Subject: [PATCH 8/8] Add change note for rb/regex/missing-regexp-anchor
---
.../src/change-notes/2022-04-14-rb-missing-regexp-anchor.md | 4 ++++
1 file changed, 4 insertions(+)
create mode 100644 ruby/ql/src/change-notes/2022-04-14-rb-missing-regexp-anchor.md
diff --git a/ruby/ql/src/change-notes/2022-04-14-rb-missing-regexp-anchor.md b/ruby/ql/src/change-notes/2022-04-14-rb-missing-regexp-anchor.md
new file mode 100644
index 000000000000..7f18700d693f
--- /dev/null
+++ b/ruby/ql/src/change-notes/2022-04-14-rb-missing-regexp-anchor.md
@@ -0,0 +1,4 @@
+---
+category: newQuery
+---
+* Added a new query, `rb/regex/missing-regexp-anchor`, which finds regular expressions which are improperly anchored. Validations using such expressions are at risk of being bypassed.
\ No newline at end of file