Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Backport #49750 to 23.3: Fix msan issue in randomStringUTF8(<uneven number>) #49806

Merged
merged 1 commit into from
May 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 2 additions & 4 deletions src/Common/Volnitsky.h
Original file line number Diff line number Diff line change
Expand Up @@ -386,8 +386,6 @@ class VolnitskyBase
FallbackSearcher fallback_searcher;

public:
using Searcher = FallbackSearcher;

/** haystack_size_hint - the expected total size of the haystack for `search` calls. Optional (zero means unspecified).
* If you specify it small enough, the fallback algorithm will be used,
* since it is considered that it's useless to waste time initializing the hash table.
Expand Down Expand Up @@ -729,15 +727,15 @@ class MultiVolnitskyBase


using Volnitsky = VolnitskyBase<true, true, ASCIICaseSensitiveStringSearcher>;
using VolnitskyUTF8 = VolnitskyBase<true, false, ASCIICaseSensitiveStringSearcher>; /// exactly same as Volnitsky
using VolnitskyUTF8 = VolnitskyBase<true, false, UTF8CaseSensitiveStringSearcher>;
using VolnitskyCaseInsensitive = VolnitskyBase<false, true, ASCIICaseInsensitiveStringSearcher>; /// ignores non-ASCII bytes
using VolnitskyCaseInsensitiveUTF8 = VolnitskyBase<false, false, UTF8CaseInsensitiveStringSearcher>;

using VolnitskyCaseSensitiveToken = VolnitskyBase<true, true, ASCIICaseSensitiveTokenSearcher>;
using VolnitskyCaseInsensitiveToken = VolnitskyBase<false, true, ASCIICaseInsensitiveTokenSearcher>;

using MultiVolnitsky = MultiVolnitskyBase<true, true, ASCIICaseSensitiveStringSearcher>;
using MultiVolnitskyUTF8 = MultiVolnitskyBase<true, false, ASCIICaseSensitiveStringSearcher>;
using MultiVolnitskyUTF8 = MultiVolnitskyBase<true, false, UTF8CaseSensitiveStringSearcher>;
using MultiVolnitskyCaseInsensitive = MultiVolnitskyBase<false, true, ASCIICaseInsensitiveStringSearcher>;
using MultiVolnitskyCaseInsensitiveUTF8 = MultiVolnitskyBase<false, false, UTF8CaseInsensitiveStringSearcher>;

Expand Down
13 changes: 6 additions & 7 deletions src/Common/format.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,18 @@ namespace Format
{
using IndexPositions = PODArrayWithStackMemory<UInt64, 64>;

static inline void parseNumber(const String & description, UInt64 l, UInt64 r, UInt64 & res, UInt64 argument_number)
static inline UInt64 parseNumber(const String & description, UInt64 l, UInt64 r, UInt64 argument_number)
{
res = 0;
UInt64 res = 0;
for (UInt64 pos = l; pos < r; ++pos)
{
if (!isNumericASCII(description[pos]))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Not a number in curly braces at position {}", std::to_string(pos));
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Not a number in curly braces at position {}", pos);
res = res * 10 + description[pos] - '0';
if (res >= argument_number)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Too big number for arguments, must be at most {}",
argument_number - 1);
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Too big number for arguments, must be at most {}", argument_number - 1);
}
return res;
}

static inline void init(
Expand Down Expand Up @@ -132,8 +132,7 @@ namespace Format
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot switch from automatic field numbering to manual field specification");
is_plain_numbering = false;

UInt64 arg;
parseNumber(pattern, last_open, i, arg, argument_number);
UInt64 arg = parseNumber(pattern, last_open, i, argument_number);

if (arg >= argument_number)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Argument is too big for formatting. Note that indexing starts from zero");
Expand Down
64 changes: 31 additions & 33 deletions src/Functions/randomStringUTF8.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,26 +61,26 @@ class FunctionRandomStringUTF8 : public IFunction

offsets_to.resize(input_rows_count);

const IColumn & length_column = *arguments[0].column;
size_t summary_utf8_len = 0;
const IColumn & col_length = *arguments[0].column;
size_t total_codepoints = 0;
for (size_t row_num = 0; row_num < input_rows_count; ++row_num)
{
size_t utf8_len = length_column.getUInt(row_num);
summary_utf8_len += utf8_len;
size_t codepoints = col_length.getUInt(row_num);
total_codepoints += codepoints;
}

/* As we generate only assigned planes, the mathematical expectation of the number of bytes
* per generated code point ~= 3.85. So, reserving for coefficient 4 will not be an overhead
*/

if (summary_utf8_len > (1 << 29))
if (total_codepoints > (1 << 29))
throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too large string size in function {}", getName());

size_t size_in_bytes_with_margin = summary_utf8_len * 4 + input_rows_count;
data_to.resize(size_in_bytes_with_margin);
pcg64_fast rng(randomSeed()); // TODO It is inefficient. We should use SIMD PRNG instead.
size_t max_byte_size = total_codepoints * 4 + input_rows_count;
data_to.resize(max_byte_size);

const auto generate_code_point = [](UInt32 rand) -> UInt32 {
const auto generate_code_point = [](UInt32 rand)
{
/// We want to generate number in [0x0, 0x70000) and shift it if need

/// Generate highest byte in [0, 6]
Expand All @@ -104,43 +104,41 @@ class FunctionRandomStringUTF8 : public IFunction
return code_point;
};

pcg64_fast rng(randomSeed());
IColumn::Offset offset = 0;

for (size_t row_num = 0; row_num < input_rows_count; ++row_num)
{
size_t utf8_len = length_column.getUInt(row_num);
size_t codepoints = col_length.getUInt(row_num);
auto * pos = data_to.data() + offset;

size_t last_writen_bytes = 0;
size_t i = 0;
for (; i < utf8_len; i += 2)
for (size_t i = 0; i < codepoints; i +=2)
{
UInt64 rand = rng();
UInt64 rand = rng(); /// that's the bottleneck

UInt32 code_point1 = generate_code_point(static_cast<UInt32>(rand));
UInt32 code_point2 = generate_code_point(static_cast<UInt32>(rand >> 32u));

/// We have padding in column buffers that we can overwrite.
size_t length1 = UTF8::convertCodePointToUTF8(code_point1, pos, sizeof(int));
assert(length1 <= 4);
pos += length1;

size_t length2 = UTF8::convertCodePointToUTF8(code_point2, pos, sizeof(int));
assert(length2 <= 4);
last_writen_bytes = length2;
pos += last_writen_bytes;
}
offset = pos - data_to.data() + 1;
if (i > utf8_len)
{
offset -= last_writen_bytes;
size_t bytes1 = UTF8::convertCodePointToUTF8(code_point1, pos, 4);
chassert(bytes1 <= 4);
pos += bytes1;

if (i + 1 != codepoints)
{
UInt32 code_point2 = generate_code_point(static_cast<UInt32>(rand >> 32u));
size_t bytes2 = UTF8::convertCodePointToUTF8(code_point2, pos, 4);
chassert(bytes2 <= 4);
pos += bytes2;
}
}

*pos = 0;
++pos;

offset = pos - data_to.data();
offsets_to[row_num] = offset;
}

/// Put zero bytes in between.
auto * pos = data_to.data();
for (size_t row_num = 0; row_num < input_rows_count; ++row_num)
pos[offsets_to[row_num] - 1] = 0;
data_to.resize(offset);

return col_to;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
String
1

99
1 change: 1 addition & 0 deletions tests/queries/0_stateless/01278_random_string_utf8.sql
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ SELECT lengthUTF8(randomStringUTF8(100));
SELECT toTypeName(randomStringUTF8(10));
SELECT isValidUTF8(randomStringUTF8(100000));
SELECT randomStringUTF8(0);
SELECT lengthUTF8(lowerUTF8(randomStringUTF8(99))); -- bug #49672: msan assert