github · hmac · Apr 27, 2022 · Mar 27, 2022 · Mar 27, 2022 · Mar 27, 2022
@@ -63,6 +63,9 @@ class RegExpParent extends TRegExpParent {
   /**
    * Gets the number of child terms, computed by counting the results of
    * `getAChild()`.
    */
   int getNumChild() { result = count(this.getAChild()) }
 
+  /**
+   * Gets the last child term of this element, that is, the child at index
+   * `getNumChild() - 1`.
+   */
+  RegExpTerm getLastChild() { result = this.getChild(this.getNumChild() - 1) }
+
   /**
    * Gets the name of a primary CodeQL class to which this regular
    * expression term belongs.
@@ -578,6 +581,15 @@ class RegExpWordBoundary extends RegExpSpecialChar {
   RegExpWordBoundary() { this.getChar() = "\\b" }
 }
 
+/**
+ * A non-word boundary, that is, a regular expression term of the form `\B`.
+ *
+ * Example:
+ *
+ * ```
+ * \B
+ * ```
+ */
+class RegExpNonWordBoundary extends RegExpSpecialChar {
+  RegExpNonWordBoundary() { this.getChar() = "\\B" }
+
+  override string getAPrimaryQlClass() { result = "RegExpNonWordBoundary" }
+}
+
 /**
  * A character class escape in a regular expression.
  * That is, an escaped character that denotes multiple characters.
@@ -857,6 +869,21 @@ class RegExpDot extends RegExpSpecialChar {
   override string getAPrimaryQlClass() { result = "RegExpDot" }
 }
 
+/**
+ * A term that matches a specific position between characters in the string.
+ * This covers the special characters `^`, `$`, `\A`, `\Z` and `\z`; the
+ * word-boundary terms `\b` and `\B` are not included.
+ *
+ * Example:
+ *
+ * ```
+ * \A
+ * ```
+ */
+class RegExpAnchor extends RegExpSpecialChar {
+  RegExpAnchor() { this.getChar() = ["^", "$", "\\A", "\\Z", "\\z"] }
+
+  override string getAPrimaryQlClass() { result = "RegExpAnchor" }
+}
+
 /**
  * A dollar assertion `$`, `\Z`, or `\z`, matching the end of a line or of the string.
  *
@@ -866,7 +893,7 @@ class RegExpDot extends RegExpSpecialChar {
  * $
  * ```
  */
-class RegExpDollar extends RegExpSpecialChar {
+class RegExpDollar extends RegExpAnchor {
   RegExpDollar() { this.getChar() = ["$", "\\Z", "\\z"] }
 
   override string getAPrimaryQlClass() { result = "RegExpDollar" }
@@ -881,7 +908,7 @@ class RegExpDollar extends RegExpSpecialChar {
  * ^
  * ```
  */
-class RegExpCaret extends RegExpSpecialChar {
+class RegExpCaret extends RegExpAnchor {
   RegExpCaret() { this.getChar() = ["^", "\\A"] }
 
   override string getAPrimaryQlClass() { result = "RegExpCaret" }

@@ -0,0 +1,4 @@
+---
+category: newQuery
+---
+* Added a new query, `rb/regex/missing-regexp-anchor`, which finds regular expressions that are improperly anchored. Validations using such expressions are at risk of being bypassed.
@@ -0,0 +1,90 @@
+<!DOCTYPE qhelp PUBLIC
+"-//Semmle//qhelp//EN"
+"qhelp.dtd">
+<qhelp>
+
+	<overview>
+		<p>
+
+			Sanitizing untrusted input with regular expressions is a
+			common technique.  However, it is error-prone to match untrusted input
+			against regular expressions without anchors such as <code>\A</code> or
+			<code>\z</code>.  Malicious input can bypass such security checks by
+			embedding one of the allowed patterns in an unexpected location.
+
+		</p>
+
+		<p>
+
+			Even if the matching is not done in a security-critical
+			context, it may still cause undesirable behavior when the regular
+			expression accidentally matches.
+
+		</p>
+	</overview>
+
+	<recommendation>
+		<p>
+
+			Use anchors to ensure that regular expressions match at
+			the expected locations.
+
+		</p>
+	</recommendation>
+
+	<example>
+
+		<p>
+
+			The following example code checks that a URL redirection
+			will reach the <code>example.com</code> domain, or one of its
+			subdomains, and not some malicious site.
+
+		</p>
+
+		<sample src="examples/missing_regexp_anchor_bad.rb"/>
+
+		<p>
+
+			The check with the regular expression match is, however, easy to bypass, for example
+			by embedding <code>http://example.com/</code> in the query
+			string component: <code>http://evil-example.net/?x=http://example.com/</code>.
+
+			Address these shortcomings by using anchors in the regular expression instead:
+
+		</p>
+
+		<sample src="examples/missing_regexp_anchor_good.rb"/>
+
+		<p>
+
+			A related mistake is to write a regular expression with
+			multiple alternatives, but to only include an anchor for one of the
+			alternatives. As an example, the regular expression
+			<code>/^www\.example\.com|beta\.example\.com/</code> will match the host
+			<code>evil.beta.example.com</code> because the regular expression is parsed
+			as <code>/(^www\.example\.com)|(beta\.example\.com)/</code>.
+
+		</p>
+
+		<p>
+			In Ruby the anchors <code>^</code> and <code>$</code> match the
+			start and end of a line, whereas the anchors <code>\A</code> and
+			<code>\z</code> match the start and end of the entire string.
+
+			Using line anchors can be dangerous, as this can allow malicious
+			input to be hidden using newlines, leading to vulnerabilities such
+			as HTTP header injection.
+
+			Unless you specifically need the line-matching behavior of
+			<code>^</code> and <code>$</code>, you should use <code>\A</code>
+			and <code>\z</code> instead.
+		</p>
+
+	</example>
+
+	<references>
+		<li>OWASP: <a href="https://www.owasp.org/index.php/Server_Side_Request_Forgery">SSRF</a>.</li>
+		<li>OWASP: <a href="https://cheatsheetseries.owasp.org/cheatsheets/Unvalidated_Redirects_and_Forwards_Cheat_Sheet.html">Unvalidated Redirects and Forwards Cheat Sheet</a>.</li>
+	</references>
+</qhelp>