diff --git a/python/extractor/cli-integration-test/string-encoding/repo_dir/test.py b/python/extractor/cli-integration-test/string-encoding/repo_dir/test.py new file mode 100644 index 000000000000..8e7efcaf9260 --- /dev/null +++ b/python/extractor/cli-integration-test/string-encoding/repo_dir/test.py @@ -0,0 +1,2 @@ +"\uD800" +"?" diff --git a/python/extractor/cli-integration-test/string-encoding/test.sh b/python/extractor/cli-integration-test/string-encoding/test.sh new file mode 100755 index 000000000000..3bf1a6b03014 --- /dev/null +++ b/python/extractor/cli-integration-test/string-encoding/test.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +set -Eeuo pipefail # see https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/ + +set -x + +CODEQL=${CODEQL:-codeql} + +SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +cd "$SCRIPTDIR" + +rm -rf db + +$CODEQL database create db --language python --source-root repo_dir/ + +$CODEQL dataset check db/db-python + +echo "Test successfully completed." diff --git a/python/extractor/semmle/python/passes/objects.py b/python/extractor/semmle/python/passes/objects.py index 599539bc5419..335603c131d7 100644 --- a/python/extractor/semmle/python/passes/objects.py +++ b/python/extractor/semmle/python/passes/objects.py @@ -43,6 +43,23 @@ LITERALS = (ast.Num, ast.Str) +# A variant of the 'replace' error handler that replaces unencodable characters with U+FFFD +# rather than '?'. Without this, a string like '\uD800' (which is not encodable) would get mapped +# to '?', and potentially clash with the regular string '?' if it appeared elsewhere in the source +# code. Used in 'get_label_for_object' below. Based on code from https://peps.python.org/pep-0293/ +def fffd_replace(exc): + if isinstance(exc, UnicodeEncodeError): + return ((exc.end-exc.start)*u"\\ufffd", exc.end) + elif isinstance(exc, UnicodeDecodeError): + return (u"\\ufffd", exc.end) + elif isinstance(exc, UnicodeTranslateError): + return ((exc.end-exc.start)*u"\\ufffd", exc.end) + else: + raise TypeError("can't handle %s" % exc.__name__) + +import codecs +codecs.register_error("fffdreplace", fffd_replace) + class _CObject(object): '''Utility class to wrap arbitrary C objects. Treat all objects as unique. Rely on naming in the @@ -239,7 +256,7 @@ def get_label_for_object(self, obj, default_label, obj_type): else: prefix = u"C_bytes$" if t is str: - obj = obj.encode("utf8", errors='replace') + obj = obj.encode("utf8", errors='fffdreplace') return prefix + hashlib.sha1(obj).hexdigest() if t is bytes: return prefix + hashlib.sha1(obj).hexdigest()