Skip to content

Commit

Permalink
Merge pull request #10061 from Mytherin/issue10058
Browse files Browse the repository at this point in the history
Fix #10058: correctly handle unicode literals in regexp optimizer
  • Loading branch information
Mytherin committed Dec 22, 2023
2 parents 0ad4044 + 64a32cf commit 11e1868
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 4 deletions.
20 changes: 16 additions & 4 deletions src/optimizer/rule/regex_optimizations.cpp
Expand Up @@ -5,6 +5,7 @@
#include "duckdb/planner/expression/bound_constant_expression.hpp"
#include "duckdb/function/scalar/string_functions.hpp"
#include "duckdb/function/scalar/regexp.hpp"
#include "utf8proc_wrapper.hpp"

#include "re2/re2.h"
#include "re2/regexp.h"
Expand Down Expand Up @@ -39,6 +40,19 @@ static void AddCharacter(char chr, LikeString &ret, bool contains) {
ret.like_string += run_as_str;
}

static void AddCodepoint(int32_t codepoint, LikeString &ret, bool contains) {
int sz = 0;
char utf8_str[4];
if (!Utf8Proc::CodepointToUtf8(codepoint, sz, utf8_str)) {
// invalid codepoint
ret.exists = false;
return;
}
for (idx_t i = 0; i < idx_t(sz); i++) {
AddCharacter(utf8_str[i], ret, contains);
}
}

static LikeString GetLikeStringEscaped(duckdb_re2::Regexp *regexp, bool contains = false) {
D_ASSERT(regexp->op() == duckdb_re2::kRegexpLiteralString || regexp->op() == duckdb_re2::kRegexpLiteral);
LikeString ret;
Expand All @@ -57,16 +71,14 @@ static LikeString GetLikeStringEscaped(duckdb_re2::Regexp *regexp, bool contains
auto nrunes = (idx_t)regexp->nrunes();
auto runes = regexp->runes();
for (idx_t i = 0; i < nrunes; i++) {
char chr = toascii(runes[i]);
AddCharacter(chr, ret, contains);
AddCodepoint(runes[i], ret, contains);
if (!ret.exists) {
return ret;
}
}
} else {
auto rune = regexp->rune();
char chr = toascii(rune);
AddCharacter(chr, ret, contains);
AddCodepoint(rune, ret, contains);
}
D_ASSERT(ret.like_string.size() >= 1 || !ret.exists);
return ret;
Expand Down
55 changes: 55 additions & 0 deletions test/sql/function/string/regexp_unicode_literal.test
@@ -0,0 +1,55 @@
# name: test/sql/function/string/regexp_unicode_literal.test
# description: Issue #10058: Regex match turns non-breakable space into regular space
# group: [string]

statement ok
PRAGMA enable_verification

statement ok
CREATE TABLE data(wsc INT, zipcode VARCHAR)

statement ok
INSERT INTO data VALUES (32, '00' || chr(32) || '001'), (160, '00' || chr(160) || '001'), (0, '00🦆001');

query II
from data
where regexp_matches(zipcode, '^00\x{0020}001$')
----
32 00 001

query II
from data
where regexp_matches(zipcode, '^00\x{00A0}001$')
----
160 00 001

query II
from data
where regexp_matches(zipcode, '\x{00A0}001$')
----
160 00 001

query II
from data
where regexp_matches(zipcode, '^00\x{1F986}001$')
----
0 00🦆001

query II
from data
where regexp_matches(zipcode, '\x{1F986}')
----
0 00🦆001

query II
select *
from data
where regexp_matches(zipcode, '^00\x{00A0}001$')
and regexp_matches(zipcode, '^00\x{0020}001$')
----


statement error
select regexp_matches(zipcode, '^00\x{FFFFFFFF}001$') from data
----
invalid escape sequence

0 comments on commit 11e1868

Please sign in to comment.