Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement operator LIKE for FixedString at left hand side. #9890

Merged
merged 6 commits into from
Mar 27, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
160 changes: 160 additions & 0 deletions dbms/src/Functions/FunctionsStringRegex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,166 @@ struct MatchImpl
}
}

/** Very carefully crafted copy-paste.
  * NOTE(review): presumably mirrors the variable-length String variant - confirm and keep the two in sync.
  *
  * Matches every row of a fixed-width string column against a constant pattern.
  *
  * data    - bytes of all rows stored back to back, `n` bytes per row.
  * n       - width of one row in bytes.
  * pattern - constant pattern; handled as a LIKE pattern when `like` is set.
  * res     - one byte per row; it is indexed directly and its size is used for the tail fill,
  *           so it must already be sized to the number of rows.
  *
  * `revert` inverts every produced result.
  */
static void vectorFixedConstant(
    const ColumnString::Chars & data, size_t n, const std::string & pattern, PaddedPODArray<UInt8> & res)
{
    if (data.empty())
        return;

    String strstr_pattern;
    /// A simple case where the LIKE expression reduces to finding a substring in a string
    if (like && likePatternIsStrstr(pattern, strstr_pattern))
    {
        const UInt8 * begin = data.data();
        const UInt8 * pos = begin;
        const UInt8 * end = pos + data.size();

        size_t i = 0;
        const UInt8 * next_pos = begin;

        /// If pattern is larger than string size - it cannot be found.
        if (strstr_pattern.size() <= n)
        {
            Volnitsky searcher(strstr_pattern.data(), strstr_pattern.size(), end - pos);

            /// We will search for the next occurrence in all rows at once.
            while (pos < end && end != (pos = searcher.search(pos, end - pos)))
            {
                /// Let's determine which index it refers to. Rows skipped on the way contain no occurrence.
                while (next_pos + n <= pos)
                {
                    res[i] = revert;
                    next_pos += n;
                    ++i;
                }
                next_pos += n;

                /// We check that the entry does not pass through the boundaries of strings.
                if (pos + strstr_pattern.size() <= next_pos)
                    res[i] = !revert;
                else
                    res[i] = revert;

                /// The row is decided; continue from the beginning of the next one.
                pos = next_pos;
                ++i;
            }
        }

        /// Tail, in which there can be no substring.
        if (i < res.size())
            memset(&res[i], revert, (res.size() - i) * sizeof(res[0]));
    }
    else
    {
        size_t size = data.size() / n;

        const auto & regexp = Regexps::get<like, true>(pattern);

        std::string required_substring;
        bool is_trivial;
        bool required_substring_is_prefix; /// for `anchored` execution of the regexp.

        regexp->getAnalyzeResult(required_substring, is_trivial, required_substring_is_prefix);

        if (required_substring.empty())
        {
            if (!regexp->getRE2()) /// An empty regexp. Always matches.
            {
                /// "Always matches" still has to honour `revert`, otherwise the negated function reports a match too.
                if (size)
                    memset(res.data(), !revert, size * sizeof(res[0]));
            }
            else
            {
                /// No substring to pre-filter with: run the regexp on every row.
                size_t offset = 0;
                for (size_t i = 0; i < size; ++i)
                {
                    res[i] = revert
                        ^ regexp->getRE2()->Match(
                            re2_st::StringPiece(reinterpret_cast<const char *>(&data[offset]), n),
                            0,
                            n,
                            re2_st::RE2::UNANCHORED,
                            nullptr,
                            0);

                    offset += n;
                }
            }
        }
        else
        {
            /// NOTE This almost matches with the case of LikePatternIsStrstr.

            const UInt8 * begin = data.data();
            const UInt8 * pos = begin;
            const UInt8 * end = pos + data.size();

            size_t i = 0;
            const UInt8 * next_pos = begin;

            /// If required substring is larger than string size - it cannot be found.
            /// The needle on this path is `required_substring`, not `strstr_pattern`.
            if (required_substring.size() <= n)
            {
                Volnitsky searcher(required_substring.data(), required_substring.size(), end - pos);

                /// We will search for the next occurrence in all rows at once.
                while (pos < end && end != (pos = searcher.search(pos, end - pos)))
                {
                    /// Let's determine which index it refers to. Rows skipped on the way contain no occurrence.
                    while (next_pos + n <= pos)
                    {
                        res[i] = revert;
                        next_pos += n;
                        ++i;
                    }
                    next_pos += n;

                    /// We check that the entry does not pass through the boundaries of strings.
                    if (pos + required_substring.size() <= next_pos)
                    {
                        /// And if it does not, if necessary, we check the regexp.

                        if (is_trivial)
                            res[i] = !revert;
                        else
                        {
                            /// Beginning of the row that contains the occurrence.
                            const char * str_data = reinterpret_cast<const char *>(next_pos - n);

                            /** Even in the case of `required_substring_is_prefix` use UNANCHORED check for regexp,
                              * so that it can match when `required_substring` occurs into the string several times,
                              * and at the first occurrence, the regexp is not a match.
                              */

                            if (required_substring_is_prefix)
                                res[i] = revert
                                    ^ regexp->getRE2()->Match(
                                        re2_st::StringPiece(str_data, n),
                                        reinterpret_cast<const char *>(pos) - str_data,
                                        n,
                                        re2_st::RE2::UNANCHORED,
                                        nullptr,
                                        0);
                            else
                                res[i] = revert
                                    ^ regexp->getRE2()->Match(
                                        re2_st::StringPiece(str_data, n), 0, n, re2_st::RE2::UNANCHORED, nullptr, 0);
                        }
                    }
                    else
                        res[i] = revert;

                    /// The row is decided; continue from the beginning of the next one.
                    pos = next_pos;
                    ++i;
                }
            }

            /// Tail, in which there can be no substring.
            if (i < res.size())
                memset(&res[i], revert, (res.size() - i) * sizeof(res[0]));
        }
    }
}

template <typename... Args>
static void vectorVector(Args &&...)
{
Expand Down
12 changes: 12 additions & 0 deletions dbms/src/Functions/FunctionsStringSearch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,12 @@ struct PositionImpl
prev_needle_offset = needle_offsets[i];
}
}

/// FixedString haystack is not implemented here: whatever the arguments, the call is rejected with ILLEGAL_COLUMN.
template <typename... Ignored>
static void vectorFixedConstant(Ignored &&...)
{
    const char * const message = "Functions 'position' don't support FixedString haystack argument";
    throw Exception(message, ErrorCodes::ILLEGAL_COLUMN);
}
};

template <typename Impl>
Expand Down Expand Up @@ -519,6 +525,12 @@ struct HasTokenImpl
{
throw Exception("Function 'hasToken' does not support non-constant needle argument", ErrorCodes::ILLEGAL_COLUMN);
}

/// FixedString haystack is not implemented here: whatever the arguments, the call is rejected with ILLEGAL_COLUMN.
template <typename... Ignored>
static void vectorFixedConstant(Ignored &&...)
{
    const char * const message = "Functions 'hasToken' don't support FixedString haystack argument";
    throw Exception(message, ErrorCodes::ILLEGAL_COLUMN);
}
};


Expand Down
8 changes: 7 additions & 1 deletion dbms/src/Functions/FunctionsStringSearch.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <Columns/ColumnArray.h>
#include <Columns/ColumnConst.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnFixedString.h>
#include <Columns/ColumnVector.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeString.h>
Expand All @@ -13,6 +14,7 @@
#include <Interpreters/Context.h>
#include <common/StringRef.h>


namespace DB
{
/** Search and replace functions in strings:
Expand Down Expand Up @@ -93,7 +95,7 @@ class FunctionsStringSearch : public IFunction

DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (!isString(arguments[0]))
if (!isStringOrFixedString(arguments[0]))
throw Exception(
"Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);

Expand Down Expand Up @@ -132,6 +134,7 @@ class FunctionsStringSearch : public IFunction
vec_res.resize(column_haystack->size());

const ColumnString * col_haystack_vector = checkAndGetColumn<ColumnString>(&*column_haystack);
const ColumnFixedString * col_haystack_vector_fixed = checkAndGetColumn<ColumnFixedString>(&*column_haystack);
const ColumnString * col_needle_vector = checkAndGetColumn<ColumnString>(&*column_needle);

if (col_haystack_vector && col_needle_vector)
Expand All @@ -144,6 +147,9 @@ class FunctionsStringSearch : public IFunction
else if (col_haystack_vector && col_needle_const)
Impl::vectorConstant(
col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), col_needle_const->getValue<String>(), vec_res);
else if (col_haystack_vector_fixed && col_needle_const)
Impl::vectorFixedConstant(
col_haystack_vector_fixed->getChars(), col_haystack_vector_fixed->getN(), col_needle_const->getValue<String>(), vec_res);
else if (col_haystack_const && col_needle_vector)
Impl::constantVector(
col_haystack_const->getValue<String>(), col_needle_vector->getChars(), col_needle_vector->getOffsets(), vec_res);
Expand Down
6 changes: 6 additions & 0 deletions dbms/src/Functions/FunctionsVisitParam.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,12 @@ struct ExtractParamImpl
{
throw Exception("Functions 'visitParamHas' and 'visitParamExtract*' doesn't support non-constant needle argument", ErrorCodes::ILLEGAL_COLUMN);
}

/// FixedString haystack is not implemented here: whatever the arguments, the call is rejected with ILLEGAL_COLUMN.
/// The message names the same set of functions as the non-constant needle error reported by this struct.
template <typename... Args>
static void vectorFixedConstant(Args &&...)
{
    throw Exception("Functions 'visitParamHas' and 'visitParamExtract*' don't support FixedString haystack argument", ErrorCodes::ILLEGAL_COLUMN);
}
};


Expand Down
78 changes: 78 additions & 0 deletions dbms/tests/queries/0_stateless/01104_fixed_string_like.reference
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
1
0
0
1
0
0
0
0
0
0
1
0
0
1
0
0
1
0
0
1
1
0
0
1
0
0
0
0
0
1
1
0
0
1
0
0
1
0
0
0
1
0
0
1
0
0
1
0
0
1
1
0
0
1
0
0
1
0
0
1
0
0
0
0
0
0
0
0
0
0
1
1
1
1
1
0
0
1
46 changes: 46 additions & 0 deletions dbms/tests/queries/0_stateless/01104_fixed_string_like.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
-- LIKE with a FixedString(5) column on the left-hand side.
-- Every query unfolds the two values 'hello' and 'world' into separate rows and prints one result per row.

-- Patterns without wildcards: the full values, an unrelated string, and a proper prefix / suffix.
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE 'hello';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE 'world';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE 'xyz';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE 'hell';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE 'orld';

-- The same strings wrapped in '%' on both sides.
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE '%hello%';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE '%world%';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE '%xyz%';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE '%hell%';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE '%orld%';

-- Leading '%' only.
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE '%hello';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE '%world';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE '%xyz';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE '%hell';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE '%orld';

-- Trailing '%' only.
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE 'hello%';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE 'world%';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE 'xyz%';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE 'hell%';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE 'orld%';

-- '%' in the middle of the pattern, plus a single '_' next to a '%'.
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE '%he%o%';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE '%w%ld%';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE '%x%z%';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE '%hell_';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE '_orld%';

-- '_' placeholders, with and without surrounding '%'.
-- NOTE(review): the '%x%z%' query below is identical to the one in the previous group - confirm it is intentional.
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE '%he__o%';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE '%w__ld%';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE '%x%z%';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE 'hell_';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE '_orld';

-- Patterns that neither value contains on its own; they occur only in the concatenation 'helloworld'.
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE 'helloworld';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE '%helloworld%';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE '%elloworl%';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE '%ow%';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE '%o%w%';

-- Single characters that occur in both values, alone and in both orders.
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE '%o%';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE '%l%';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE '%l%o%';
SELECT arrayJoin(CAST(['hello', 'world'] AS Array(FixedString(5)))) LIKE '%o%l%';