Fix regex-to-LIKE literal rewrite for non-ASCII characters (issue #10058)

GetLikeStringEscaped used to pass every rune of a regex literal through
toascii() before appending it to the LIKE string. toascii() keeps only the
low 7 bits, so \x{00A0} (non-breaking space) was rewritten as a regular
space (0x20).

This patch adds AddCodepoint(), which encodes the rune as UTF-8 with
Utf8Proc::CodepointToUtf8 and feeds each resulting byte to AddCharacter.
If the codepoint cannot be encoded, ret.exists is set to false and nothing
is appended. Both the multi-rune loop and the single-rune branch of
GetLikeStringEscaped now call it instead of toascii() + AddCharacter.

The new sqllogictest covers U+0020, U+00A0 and U+1F986, plus an out-of-range
escape that must be rejected with "invalid escape sequence".

NOTE(review): this patch was recovered from a copy whose line breaks and
whitespace runs had been collapsed. Tabs (C++ indentation, result-column
separators) and the U+00A0 in the wsc = 160 result rows were restored by
hand; confirm against upstream before applying.

diff --git a/src/optimizer/rule/regex_optimizations.cpp b/src/optimizer/rule/regex_optimizations.cpp
index 00f332ea8d1..5aa4401c3f4 100644
--- a/src/optimizer/rule/regex_optimizations.cpp
+++ b/src/optimizer/rule/regex_optimizations.cpp
@@ -5,6 +5,7 @@
 #include "duckdb/planner/expression/bound_constant_expression.hpp"
 #include "duckdb/function/scalar/string_functions.hpp"
 #include "duckdb/function/scalar/regexp.hpp"
+#include "utf8proc_wrapper.hpp"
 
 #include "re2/re2.h"
 #include "re2/regexp.h"
@@ -39,6 +40,19 @@ static void AddCharacter(char chr, LikeString &ret, bool contains) {
 	ret.like_string += run_as_str;
 }
 
+static void AddCodepoint(int32_t codepoint, LikeString &ret, bool contains) {
+	int sz = 0;
+	char utf8_str[4];
+	if (!Utf8Proc::CodepointToUtf8(codepoint, sz, utf8_str)) {
+		// invalid codepoint
+		ret.exists = false;
+		return;
+	}
+	for (idx_t i = 0; i < idx_t(sz); i++) {
+		AddCharacter(utf8_str[i], ret, contains);
+	}
+}
+
 static LikeString GetLikeStringEscaped(duckdb_re2::Regexp *regexp, bool contains = false) {
 	D_ASSERT(regexp->op() == duckdb_re2::kRegexpLiteralString || regexp->op() == duckdb_re2::kRegexpLiteral);
 	LikeString ret;
@@ -57,16 +71,14 @@ static LikeString GetLikeStringEscaped(duckdb_re2::Regexp *regexp, bool contains
 		auto nrunes = (idx_t)regexp->nrunes();
 		auto runes = regexp->runes();
 		for (idx_t i = 0; i < nrunes; i++) {
-			char chr = toascii(runes[i]);
-			AddCharacter(chr, ret, contains);
+			AddCodepoint(runes[i], ret, contains);
 			if (!ret.exists) {
 				return ret;
 			}
 		}
 	} else {
 		auto rune = regexp->rune();
-		char chr = toascii(rune);
-		AddCharacter(chr, ret, contains);
+		AddCodepoint(rune, ret, contains);
 	}
 	D_ASSERT(ret.like_string.size() >= 1 || !ret.exists);
 	return ret;
diff --git a/test/sql/function/string/regexp_unicode_literal.test b/test/sql/function/string/regexp_unicode_literal.test
new file mode 100644
index 00000000000..66dc439d0f1
--- /dev/null
+++ b/test/sql/function/string/regexp_unicode_literal.test
@@ -0,0 +1,55 @@
+# name: test/sql/function/string/regexp_unicode_literal.test
+# description: Issue #10058: Regex match turns non-breakable space into regular space
+# group: [string]
+
+statement ok
+PRAGMA enable_verification
+
+statement ok
+CREATE TABLE data(wsc INT, zipcode VARCHAR)
+
+statement ok
+INSERT INTO data VALUES (32, '00' || chr(32) || '001'), (160, '00' || chr(160) || '001'), (0, '00🦆001');
+
+query II
+from data
+where regexp_matches(zipcode, '^00\x{0020}001$')
+----
+32	00 001
+
+query II
+from data
+where regexp_matches(zipcode, '^00\x{00A0}001$')
+----
+160	00 001
+
+query II
+from data
+where regexp_matches(zipcode, '\x{00A0}001$')
+----
+160	00 001
+
+query II
+from data
+where regexp_matches(zipcode, '^00\x{1F986}001$')
+----
+0	00🦆001
+
+query II
+from data
+where regexp_matches(zipcode, '\x{1F986}')
+----
+0	00🦆001
+
+query II
+select *
+from data
+where regexp_matches(zipcode, '^00\x{00A0}001$')
+and regexp_matches(zipcode, '^00\x{0020}001$')
+----
+
+
+statement error
+select regexp_matches(zipcode, '^00\x{FFFFFFFF}001$') from data
+----
+invalid escape sequence