From 88d8cb514cb80c7ea3d744f6ac30c7b62a4e0f95 Mon Sep 17 00:00:00 2001
From: Mark Shannon
Date: Thu, 24 Jan 2019 12:57:14 +0000
Subject: [PATCH 1/3] Python: Two new queries for URL and hostname sanitization
(CWE-020).
---
change-notes/1.20/analysis-python.md | 2 +
.../CWE-020/IncompleteHostnameRegExp.qhelp | 70 +++++++++++++++
.../CWE-020/IncompleteHostnameRegExp.ql | 44 ++++++++++
.../IncompleteUrlSubstringSanitization.qhelp | 86 +++++++++++++++++++
.../IncompleteUrlSubstringSanitization.ql | 58 +++++++++++++
.../examples/IncompleteHostnameRegExp.py | 19 ++++
.../IncompleteUrlSubstringSanitization.py | 41 +++++++++
.../CWE-020/IncompleteHostnameRegExp.expected | 1 +
.../CWE-020/IncompleteHostnameRegExp.qlref | 1 +
...ncompleteUrlSubstringSanitization.expected | 2 +
.../IncompleteUrlSubstringSanitization.qlref | 1 +
.../query-tests/Security/CWE-020/hosttest.py | 19 ++++
.../query-tests/Security/CWE-020/urltest.py | 41 +++++++++
13 files changed, 385 insertions(+)
create mode 100644 python/ql/src/Security/CWE-020/IncompleteHostnameRegExp.qhelp
create mode 100644 python/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql
create mode 100644 python/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.qhelp
create mode 100644 python/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.ql
create mode 100644 python/ql/src/Security/CWE-020/examples/IncompleteHostnameRegExp.py
create mode 100644 python/ql/src/Security/CWE-020/examples/IncompleteUrlSubstringSanitization.py
create mode 100644 python/ql/test/query-tests/Security/CWE-020/IncompleteHostnameRegExp.expected
create mode 100644 python/ql/test/query-tests/Security/CWE-020/IncompleteHostnameRegExp.qlref
create mode 100644 python/ql/test/query-tests/Security/CWE-020/IncompleteUrlSubstringSanitization.expected
create mode 100644 python/ql/test/query-tests/Security/CWE-020/IncompleteUrlSubstringSanitization.qlref
create mode 100644 python/ql/test/query-tests/Security/CWE-020/hosttest.py
create mode 100644 python/ql/test/query-tests/Security/CWE-020/urltest.py
diff --git a/change-notes/1.20/analysis-python.md b/change-notes/1.20/analysis-python.md
index 109230f11253..ed7d8e1a72a5 100644
--- a/change-notes/1.20/analysis-python.md
+++ b/change-notes/1.20/analysis-python.md
@@ -15,6 +15,8 @@ Removes false positives seen when using Python 3.6, but not when using earlier v
|-----------------------------|-----------|--------------------------------------------------------------------|
| Default version of SSL/TLS may be insecure (`py/insecure-default-protocol`) | security, external/cwe/cwe-327 | Finds instances where an insecure default protocol may be used. Results are shown on LGTM by default. |
| Use of insecure SSL/TLS version (`py/insecure-protocol`) | security, external/cwe/cwe-327 | Finds instances where a known insecure protocol has been specified. Results are shown on LGTM by default. |
+| Incomplete regular expression for hostnames (`py/incomplete-hostname-regexp`) | security, external/cwe/cwe-020 | Finds instances where a hostname is incompletely sanitized due to enescaped character in a regular expression. Results are shown on LGTM by default. |
+| Incomplete URL substring sanitization (`py/incomplete-url-substring-sanitization`) | security, external/cwe/cwe-020 | Finds instances where a URL is incompletely sanitized due to insufficient checks. Results are shown on LGTM by default. |
## Changes to existing queries
diff --git a/python/ql/src/Security/CWE-020/IncompleteHostnameRegExp.qhelp b/python/ql/src/Security/CWE-020/IncompleteHostnameRegExp.qhelp
new file mode 100644
index 000000000000..346923da0216
--- /dev/null
+++ b/python/ql/src/Security/CWE-020/IncompleteHostnameRegExp.qhelp
@@ -0,0 +1,70 @@
+
+
+
+
+
+
+ Sanitizing untrusted URLs is an important technique for
+ preventing attacks such as request forgeries and malicious
+ redirections. Often, this is done by checking that the host of a URL
+ is in a set of allowed hosts.
+
+
+
+
+
+ If a regular expression implements such a check, it is
+ easy to accidentally make the check too permissive by not escaping the
+ . meta-characters appropriately.
+
+ Even if the check is not used in a security-critical
+ context, the incomplete check may still cause undesirable behaviors
+ when it accidentally succeeds.
+
+
+
+
+
+
+
+ Escape all meta-characters appropriately when constructing
+ regular expressions for security checks, pay special attention to the
+ . meta-character.
+
+
+
+
+
+
+
+
+ The following example code checks that a URL redirection
+ will reach the example.com domain, or one of its
+ subdomains.
+
+
+
+
+
+
+ The unsafe check is easy to bypass because the unescaped
+ . allows for any character before
+ example.com, effectively allowing the redirect to go to
+ an attacker-controlled domain such as wwwXexample.com.
+
+
+
+ This vulnerability is addressed in the safe check, which
+ escapes the . and will reject wwwXexample.com.
+
+
+
+
+
+
+ OWASP: SSRF
+ OWASP: XSS Unvalidated Redirects and Forwards Cheat Sheet.
+
+
diff --git a/python/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql b/python/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql
new file mode 100644
index 000000000000..11238db40b42
--- /dev/null
+++ b/python/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql
@@ -0,0 +1,44 @@
+/**
+ * @name Incomplete regular expression for hostnames
+ * @description Matching a URL or hostname against a regular expression that contains an unescaped dot as part of the hostname might match more hostnames than expected.
+ * @kind problem
+ * @problem.severity warning
+ * @precision high
+ * @id py/incomplete-hostname-regexp
+ * @tags correctness
+ * security
+ * external/cwe/cwe-20
+ */
+
+import python
+import semmle.python.regex
+
+private string commonTopLevelDomainRegex() {
+ result = "com|org|edu|gov|uk|net|io"
+}
+
+/**
+ * Holds if `pattern` is a regular expression pattern for URLs with a host matched by `hostPart`,
+ * and `pattern` contains a subtle mistake that allows it to match unexpected hosts.
+ */
+bindingset[pattern]
+predicate isIncompleteHostNameRegExpPattern(string pattern, string hostPart) {
+ hostPart = pattern
+ .regexpCapture("(?i).*" +
+ // an unescaped single `.`
+ "(?
+
+
+
+
+
+ Sanitizing untrusted URLs is an important technique for
+ preventing attacks such as request forgeries and malicious
+ redirections. Usually, this is done by checking that the host of a URL
+ is in a set of allowed hosts.
+
+
+
+
+
+ However, it is notoriously error-prone to treat the URL as
+ a string and check if one of the allowed hosts is a substring of the
+ URL. Malicious URLs can bypass such security checks by embedding one
+ of the allowed hosts in an unexpected location.
+
+
+
+
+
+ Even if the substring check is not used in a
+ security-critical context, the incomplete check may still cause
+ undesirable behaviors when the check succeeds accidentally.
+
+
+
+
+
+
+
+ Parse a URL before performing a check on its host value,
+ and ensure that the check handles arbitrary subdomain sequences
+ correctly.
+
+
+
+
+
+
+
+
+ The following example code checks that a URL redirection
+ will reach the example.com domain.
+
+
+
+
+
+
+
+ The first two examples show unsafe checks that are easily bypassed.
+ In unsafe1 the attacker can simply add
+ example.com anywhere in the url. For example,
+ http://evil-example.net/example.com.
+
+
+ In unsafe2 the attacker must use a hostname ending in
+ example.com, but that is easy to do. For example,
+ http://benign-looking-prefix-example.com.
+
+
+
+
+
+ The second two examples show safe checks.
+ In safe1, a white-list is used. Although fairly inflexible,
+ this is easy to get right and is most likely to be safe.
+
+
+ In safe2, urlparse is used to parse the URL,
+ then the hostname is checked to make sure it ends with .example.com.
+
+
+
+
+
+ OWASP: SSRF
+ OWASP: XSS Unvalidated Redirects and Forwards Cheat Sheet.
+
+
diff --git a/python/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.ql b/python/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.ql
new file mode 100644
index 000000000000..bc4fdb3d437b
--- /dev/null
+++ b/python/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.ql
@@ -0,0 +1,58 @@
+/**
+ * @name Incomplete URL substring sanitization
+ * @description Security checks on the substrings of an unparsed URL are often vulnerable to bypassing.
+ * @kind problem
+ * @problem.severity warning
+ * @precision high
+ * @id py/incomplete-url-substring-sanitization
+ * @tags correctness
+ * security
+ * external/cwe/cwe-20
+ */
+
+
+import python
+import semmle.python.regex
+
+private string commonTopLevelDomainRegex() {
+ result = "com|org|edu|gov|uk|net|io"
+}
+
+predicate looksLikeUrl(StrConst s) {
+ exists(string text |
+ text = s.getText()
+ |
+ text.regexpMatch("(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+(" +
+ commonTopLevelDomainRegex() +")(:[0-9]+)?/?")
+ or
+ // target is a HTTP URL to a domain on any TLD
+ text.regexpMatch("(?i)https?://([a-z0-9-]+\\.)+([a-z]+)(:[0-9]+)?/?")
+ )
+}
+
+predicate incomplete_sanitization(Expr sanitizer, StrConst url) {
+ looksLikeUrl(url) and
+ (
+ sanitizer.(Compare).compares(url, any(In i), _)
+ or
+ call_to_startswith(sanitizer, url)
+ or
+ unsafe_call_to_endswith(sanitizer, url)
+ )
+}
+
+predicate call_to_startswith(Call sanitizer, StrConst url) {
+ sanitizer.getFunc().(Attribute).getName() = "startswith"
+ and
+ sanitizer.getArg(0) = url
+}
+
+predicate unsafe_call_to_endswith(Call sanitizer, StrConst url) {
+ sanitizer.getFunc().(Attribute).getName() = "endswith" and
+ sanitizer.getArg(0) = url and
+ not url.getText().regexpMatch("(?i)\\.([a-z0-9-]+)(\\.[a-z0-9-]+)+")
+}
+
+from Expr sanitizer, StrConst url
+where incomplete_sanitization(sanitizer, url)
+select sanitizer, "'$@' may be at an arbitrary position in the sanitized URL.", url, url.getText()
diff --git a/python/ql/src/Security/CWE-020/examples/IncompleteHostnameRegExp.py b/python/ql/src/Security/CWE-020/examples/IncompleteHostnameRegExp.py
new file mode 100644
index 000000000000..957f16fb72b4
--- /dev/null
+++ b/python/ql/src/Security/CWE-020/examples/IncompleteHostnameRegExp.py
@@ -0,0 +1,19 @@
+from flask import Flask, request, redirect
+import re
+
+app = Flask(__name__)
+
+UNSAFE_REGEX = re.compile("(www|beta).example.com/")
+SAFE_REGEX = re.compile(r"(www|beta)\.example\.com/")
+
+@app.route('/some/path/bad')
+def unsafe(request):
+ target = request.args.get('target', '')
+ if UNSAFE_REGEX.match(target):
+ return redirect(target)
+
+@app.route('/some/path/good')
+def safe(request):
+ target = request.args.get('target', '')
+ if SAFE_REGEX.match(target):
+ return redirect(target)
diff --git a/python/ql/src/Security/CWE-020/examples/IncompleteUrlSubstringSanitization.py b/python/ql/src/Security/CWE-020/examples/IncompleteUrlSubstringSanitization.py
new file mode 100644
index 000000000000..d1f629794600
--- /dev/null
+++ b/python/ql/src/Security/CWE-020/examples/IncompleteUrlSubstringSanitization.py
@@ -0,0 +1,41 @@
+from flask import Flask, request, redirect
+from urllib.parse import urlparse
+
+app = Flask(__name__)
+
+@app.route('/some/path/bad1')
+def unsafe1(request):
+ target = request.args.get('target', '')
+ if "example.com" in target:
+ return redirect(target)
+
+@app.route('/some/path/bad2')
+def unsafe2(request):
+ target = request.args.get('target', '')
+ if target.endswith("example.com"):
+ return redirect(target)
+
+
+
+#Simplest and safest approach is to use a white-list
+
+@app.route('/some/path/good1')
+def safe1(request):
+ whitelist = [
+ "example.com/home",
+ "example.com/login",
+ ]
+ target = request.args.get('target', '')
+ if target in whitelist:
+ return redirect(target)
+
+#More complex example allowing sub-domains.
+
+@app.route('/some/path/good2')
+def safe2(request):
+ target = request.args.get('target', '')
+ host = urlparse(target).hostname
+ #Note the '.' preceding example.com
+ if host and host.endswith(".example.com"):
+ return redirect(target)
+
diff --git a/python/ql/test/query-tests/Security/CWE-020/IncompleteHostnameRegExp.expected b/python/ql/test/query-tests/Security/CWE-020/IncompleteHostnameRegExp.expected
new file mode 100644
index 000000000000..00e0fcb95a0e
--- /dev/null
+++ b/python/ql/test/query-tests/Security/CWE-020/IncompleteHostnameRegExp.expected
@@ -0,0 +1 @@
+| hosttest.py:6:27:6:51 | Str | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. |
diff --git a/python/ql/test/query-tests/Security/CWE-020/IncompleteHostnameRegExp.qlref b/python/ql/test/query-tests/Security/CWE-020/IncompleteHostnameRegExp.qlref
new file mode 100644
index 000000000000..e818d9472521
--- /dev/null
+++ b/python/ql/test/query-tests/Security/CWE-020/IncompleteHostnameRegExp.qlref
@@ -0,0 +1 @@
+Security/CWE-020/IncompleteHostnameRegExp.ql
\ No newline at end of file
diff --git a/python/ql/test/query-tests/Security/CWE-020/IncompleteUrlSubstringSanitization.expected b/python/ql/test/query-tests/Security/CWE-020/IncompleteUrlSubstringSanitization.expected
new file mode 100644
index 000000000000..4b2ba67ecdab
--- /dev/null
+++ b/python/ql/test/query-tests/Security/CWE-020/IncompleteUrlSubstringSanitization.expected
@@ -0,0 +1,2 @@
+| urltest.py:9:8:9:30 | Compare | '$@' may be at an arbitrary position in the sanitized URL. | urltest.py:9:8:9:20 | Str | example.com |
+| urltest.py:15:8:15:37 | Attribute() | '$@' may be at an arbitrary position in the sanitized URL. | urltest.py:15:24:15:36 | Str | example.com |
diff --git a/python/ql/test/query-tests/Security/CWE-020/IncompleteUrlSubstringSanitization.qlref b/python/ql/test/query-tests/Security/CWE-020/IncompleteUrlSubstringSanitization.qlref
new file mode 100644
index 000000000000..3fa6794419d7
--- /dev/null
+++ b/python/ql/test/query-tests/Security/CWE-020/IncompleteUrlSubstringSanitization.qlref
@@ -0,0 +1 @@
+Security/CWE-020/IncompleteUrlSubstringSanitization.ql
\ No newline at end of file
diff --git a/python/ql/test/query-tests/Security/CWE-020/hosttest.py b/python/ql/test/query-tests/Security/CWE-020/hosttest.py
new file mode 100644
index 000000000000..957f16fb72b4
--- /dev/null
+++ b/python/ql/test/query-tests/Security/CWE-020/hosttest.py
@@ -0,0 +1,19 @@
+from flask import Flask, request, redirect
+import re
+
+app = Flask(__name__)
+
+UNSAFE_REGEX = re.compile("(www|beta).example.com/")
+SAFE_REGEX = re.compile(r"(www|beta)\.example\.com/")
+
+@app.route('/some/path/bad')
+def unsafe(request):
+ target = request.args.get('target', '')
+ if UNSAFE_REGEX.match(target):
+ return redirect(target)
+
+@app.route('/some/path/good')
+def safe(request):
+ target = request.args.get('target', '')
+ if SAFE_REGEX.match(target):
+ return redirect(target)
diff --git a/python/ql/test/query-tests/Security/CWE-020/urltest.py b/python/ql/test/query-tests/Security/CWE-020/urltest.py
new file mode 100644
index 000000000000..d1f629794600
--- /dev/null
+++ b/python/ql/test/query-tests/Security/CWE-020/urltest.py
@@ -0,0 +1,41 @@
+from flask import Flask, request, redirect
+from urllib.parse import urlparse
+
+app = Flask(__name__)
+
+@app.route('/some/path/bad1')
+def unsafe1(request):
+ target = request.args.get('target', '')
+ if "example.com" in target:
+ return redirect(target)
+
+@app.route('/some/path/bad2')
+def unsafe2(request):
+ target = request.args.get('target', '')
+ if target.endswith("example.com"):
+ return redirect(target)
+
+
+
+#Simplest and safest approach is to use a white-list
+
+@app.route('/some/path/good1')
+def safe1(request):
+ whitelist = [
+ "example.com/home",
+ "example.com/login",
+ ]
+ target = request.args.get('target', '')
+ if target in whitelist:
+ return redirect(target)
+
+#More complex example allowing sub-domains.
+
+@app.route('/some/path/good2')
+def safe2(request):
+ target = request.args.get('target', '')
+ host = urlparse(target).hostname
+ #Note the '.' preceding example.com
+ if host and host.endswith(".example.com"):
+ return redirect(target)
+
From 6ddbed7d95bada014614548823750c2d27e2cea9 Mon Sep 17 00:00:00 2001
From: Mark Shannon
Date: Fri, 25 Jan 2019 11:34:41 +0000
Subject: [PATCH 2/3] Python: Minor tweaks to qldoc and release note.
---
change-notes/1.20/analysis-python.md | 4 ++--
.../ql/src/Security/CWE-020/IncompleteHostnameRegExp.qhelp | 5 ++---
.../CWE-020/examples/IncompleteUrlSubstringSanitization.py | 4 ++++
3 files changed, 8 insertions(+), 5 deletions(-)
diff --git a/change-notes/1.20/analysis-python.md b/change-notes/1.20/analysis-python.md
index ed7d8e1a72a5..27cb554eb8c9 100644
--- a/change-notes/1.20/analysis-python.md
+++ b/change-notes/1.20/analysis-python.md
@@ -14,9 +14,9 @@ Removes false positives seen when using Python 3.6, but not when using earlier v
| **Query** | **Tags** | **Purpose** |
|-----------------------------|-----------|--------------------------------------------------------------------|
| Default version of SSL/TLS may be insecure (`py/insecure-default-protocol`) | security, external/cwe/cwe-327 | Finds instances where an insecure default protocol may be used. Results are shown on LGTM by default. |
-| Use of insecure SSL/TLS version (`py/insecure-protocol`) | security, external/cwe/cwe-327 | Finds instances where a known insecure protocol has been specified. Results are shown on LGTM by default. |
-| Incomplete regular expression for hostnames (`py/incomplete-hostname-regexp`) | security, external/cwe/cwe-020 | Finds instances where a hostname is incompletely sanitized due to enescaped character in a regular expression. Results are shown on LGTM by default. |
+| Incomplete regular expression for hostnames (`py/incomplete-hostname-regexp`) | security, external/cwe/cwe-020 | Finds instances where a hostname is incompletely sanitized because a regular expression contains an unescaped character. Results are shown on LGTM by default. |
| Incomplete URL substring sanitization (`py/incomplete-url-substring-sanitization`) | security, external/cwe/cwe-020 | Finds instances where a URL is incompletely sanitized due to insufficient checks. Results are shown on LGTM by default. |
+| Use of insecure SSL/TLS version (`py/insecure-protocol`) | security, external/cwe/cwe-327 | Finds instances where a known insecure protocol has been specified. Results are shown on LGTM by default. |
## Changes to existing queries
diff --git a/python/ql/src/Security/CWE-020/IncompleteHostnameRegExp.qhelp b/python/ql/src/Security/CWE-020/IncompleteHostnameRegExp.qhelp
index 346923da0216..b542ae252eb1 100644
--- a/python/ql/src/Security/CWE-020/IncompleteHostnameRegExp.qhelp
+++ b/python/ql/src/Security/CWE-020/IncompleteHostnameRegExp.qhelp
@@ -56,9 +56,8 @@
- This vulnerability is addressed in the safe check, which
- escapes the . and will reject wwwXexample.com.
-
+ The safe check closes this vulnerability by escaping the .
+ so that URLs of the form wwwXexample.com are rejected.
diff --git a/python/ql/src/Security/CWE-020/examples/IncompleteUrlSubstringSanitization.py b/python/ql/src/Security/CWE-020/examples/IncompleteUrlSubstringSanitization.py
index d1f629794600..937a23f806f3 100644
--- a/python/ql/src/Security/CWE-020/examples/IncompleteUrlSubstringSanitization.py
+++ b/python/ql/src/Security/CWE-020/examples/IncompleteUrlSubstringSanitization.py
@@ -3,12 +3,16 @@
app = Flask(__name__)
+# Not safe, as "evil-example.net/example.com" would be accepted
+
@app.route('/some/path/bad1')
def unsafe1(request):
target = request.args.get('target', '')
if "example.com" in target:
return redirect(target)
+# Not safe, as "benign-looking-prefix-example.com" would be accepted
+
@app.route('/some/path/bad2')
def unsafe2(request):
target = request.args.get('target', '')
From 3850f878793f5373ef6bdbef3fd7bf4df4e63348 Mon Sep 17 00:00:00 2001
From: Mark Shannon
Date: Fri, 25 Jan 2019 16:47:23 +0000
Subject: [PATCH 3/3] Make qhelp for 'Incomplete URL substring sanitization'
consistent across languages.
---
.../CWE-020/IncompleteUrlSubstringSanitization.qhelp | 6 +++---
.../CWE-020/IncompleteUrlSubstringSanitization.qhelp | 7 +++----
2 files changed, 6 insertions(+), 7 deletions(-)
diff --git a/javascript/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.qhelp b/javascript/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.qhelp
index 80f4a8b44fdb..b23557aa557d 100644
--- a/javascript/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.qhelp
+++ b/javascript/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.qhelp
@@ -15,9 +15,9 @@
- However, it is notoriously error-prone to treat the URL as
- a string and check if one of the allowed hosts is a substring of the
- URL. Malicious URLs can bypass such security checks by embedding one
+ However, treating the URL as a string and checking if one of the
+ allowed hosts is a substring of the URL is very prone to errors.
+ Malicious URLs can bypass such security checks by embedding one
of the allowed hosts in an unexpected location.
diff --git a/python/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.qhelp b/python/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.qhelp
index a5e8c524adeb..6c783a4f729c 100644
--- a/python/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.qhelp
+++ b/python/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.qhelp
@@ -14,10 +14,9 @@
-
- However, it is notoriously error-prone to treat the URL as
- a string and check if one of the allowed hosts is a substring of the
- URL. Malicious URLs can bypass such security checks by embedding one
+ However, treating the URL as a string and checking if one of the
+ allowed hosts is a substring of the URL is very prone to errors.
+ Malicious URLs can bypass such security checks by embedding one
of the allowed hosts in an unexpected location.