ClickHouse · rschu1ze · May 12, 2023 · May 12, 2023
diff --git a/src/Common/Volnitsky.h b/src/Common/Volnitsky.h
@@ -386,8 +386,6 @@ class VolnitskyBase
     FallbackSearcher fallback_searcher;
 
 public:
-    using Searcher = FallbackSearcher;
-
     /** haystack_size_hint - the expected total size of the haystack for `search` calls. Optional (zero means unspecified).
       * If you specify it small enough, the fallback algorithm will be used,
       *  since it is considered that it's useless to waste time initializing the hash table.
@@ -729,15 +727,15 @@ class MultiVolnitskyBase
 
 
 using Volnitsky = VolnitskyBase<true, true, ASCIICaseSensitiveStringSearcher>;
-using VolnitskyUTF8 = VolnitskyBase<true, false, ASCIICaseSensitiveStringSearcher>; /// exactly same as Volnitsky
+using VolnitskyUTF8 = VolnitskyBase<true, false, UTF8CaseSensitiveStringSearcher>;
 using VolnitskyCaseInsensitive = VolnitskyBase<false, true, ASCIICaseInsensitiveStringSearcher>; /// ignores non-ASCII bytes
 using VolnitskyCaseInsensitiveUTF8 = VolnitskyBase<false, false, UTF8CaseInsensitiveStringSearcher>;
 
 using VolnitskyCaseSensitiveToken = VolnitskyBase<true, true, ASCIICaseSensitiveTokenSearcher>;
 using VolnitskyCaseInsensitiveToken = VolnitskyBase<false, true, ASCIICaseInsensitiveTokenSearcher>;
 
 using MultiVolnitsky = MultiVolnitskyBase<true, true, ASCIICaseSensitiveStringSearcher>;
-using MultiVolnitskyUTF8 = MultiVolnitskyBase<true, false, ASCIICaseSensitiveStringSearcher>;
+using MultiVolnitskyUTF8 = MultiVolnitskyBase<true, false, UTF8CaseSensitiveStringSearcher>;
 using MultiVolnitskyCaseInsensitive = MultiVolnitskyBase<false, true, ASCIICaseInsensitiveStringSearcher>;
 using MultiVolnitskyCaseInsensitiveUTF8 = MultiVolnitskyBase<false, false, UTF8CaseInsensitiveStringSearcher>;
 

diff --git a/src/Common/format.h b/src/Common/format.h
@@ -17,18 +17,18 @@ namespace Format
 {
     using IndexPositions = PODArrayWithStackMemory<UInt64, 64>;
 
-    static inline void parseNumber(const String & description, UInt64 l, UInt64 r, UInt64 & res, UInt64 argument_number)
+    static inline UInt64 parseNumber(const String & description, UInt64 l, UInt64 r, UInt64 argument_number)
     {
-        res = 0;
+        UInt64 res = 0;
         for (UInt64 pos = l; pos < r; ++pos)
         {
             if (!isNumericASCII(description[pos]))
-                throw Exception(ErrorCodes::BAD_ARGUMENTS, "Not a number in curly braces at position {}", std::to_string(pos));
+                throw Exception(ErrorCodes::BAD_ARGUMENTS, "Not a number in curly braces at position {}", pos);
             res = res * 10 + description[pos] - '0';
             if (res >= argument_number)
-                throw Exception(ErrorCodes::BAD_ARGUMENTS, "Too big number for arguments, must be at most {}",
-                    argument_number - 1);
+                throw Exception(ErrorCodes::BAD_ARGUMENTS, "Too big number for arguments, must be at most {}", argument_number - 1);
         }
+        return res;
     }
 
     static inline void init(
@@ -132,8 +132,7 @@ namespace Format
                         throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot switch from automatic field numbering to manual field specification");
                     is_plain_numbering = false;
 
-                    UInt64 arg;
-                    parseNumber(pattern, last_open, i, arg, argument_number);
+                    UInt64 arg = parseNumber(pattern, last_open, i, argument_number);
 
                     if (arg >= argument_number)
                         throw Exception(ErrorCodes::BAD_ARGUMENTS, "Argument is too big for formatting. Note that indexing starts from zero");

diff --git a/src/Functions/randomStringUTF8.cpp b/src/Functions/randomStringUTF8.cpp
@@ -61,26 +61,26 @@ class FunctionRandomStringUTF8 : public IFunction
 
         offsets_to.resize(input_rows_count);
 
-        const IColumn & length_column = *arguments[0].column;
-        size_t summary_utf8_len = 0;
+        const IColumn & col_length = *arguments[0].column;
+        size_t total_codepoints = 0;
         for (size_t row_num = 0; row_num < input_rows_count; ++row_num)
         {
-            size_t utf8_len = length_column.getUInt(row_num);
-            summary_utf8_len += utf8_len;
+            size_t codepoints = col_length.getUInt(row_num);
+            total_codepoints += codepoints;
         }
 
         /* As we generate only assigned planes, the mathematical expectation of the number of bytes
          * per generated code point ~= 3.85. So, reserving for coefficient 4 will not be an overhead
          */
 
-        if (summary_utf8_len > (1 << 29))
+        if (total_codepoints > (1 << 29))
             throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too large string size in function {}", getName());
 
-        size_t size_in_bytes_with_margin = summary_utf8_len * 4 + input_rows_count;
-        data_to.resize(size_in_bytes_with_margin);
-        pcg64_fast rng(randomSeed()); // TODO It is inefficient. We should use SIMD PRNG instead.
+        size_t max_byte_size = total_codepoints * 4 + input_rows_count;
+        data_to.resize(max_byte_size);
 
-        const auto generate_code_point = [](UInt32 rand) -> UInt32 {
+        const auto generate_code_point = [](UInt32 rand)
+        {
             /// We want to generate number in [0x0, 0x70000) and shift it if need
 
             /// Generate highest byte in [0, 6]
@@ -104,43 +104,41 @@ class FunctionRandomStringUTF8 : public IFunction
             return code_point;
         };
 
+        pcg64_fast rng(randomSeed());
         IColumn::Offset offset = 0;
+
         for (size_t row_num = 0; row_num < input_rows_count; ++row_num)
         {
-            size_t utf8_len = length_column.getUInt(row_num);
+            size_t codepoints = col_length.getUInt(row_num);
             auto * pos = data_to.data() + offset;
 
-            size_t last_writen_bytes = 0;
-            size_t i = 0;
-            for (; i < utf8_len; i += 2)
+            for (size_t i = 0; i < codepoints; i +=2)
             {
-                UInt64 rand = rng();
+                UInt64 rand = rng(); /// that's the bottleneck
 
                 UInt32 code_point1 = generate_code_point(static_cast<UInt32>(rand));
-                UInt32 code_point2 = generate_code_point(static_cast<UInt32>(rand >> 32u));
 
-                /// We have padding in column buffers that we can overwrite.
-                size_t length1 = UTF8::convertCodePointToUTF8(code_point1, pos, sizeof(int));
-                assert(length1 <= 4);
-                pos += length1;
-
-                size_t length2 = UTF8::convertCodePointToUTF8(code_point2, pos, sizeof(int));
-                assert(length2 <= 4);
-                last_writen_bytes = length2;
-                pos += last_writen_bytes;
-            }
-            offset = pos - data_to.data() + 1;
-            if (i > utf8_len)
-            {
-                offset -= last_writen_bytes;
+                size_t bytes1 = UTF8::convertCodePointToUTF8(code_point1, pos, 4);
+                chassert(bytes1 <= 4);
+                pos += bytes1;
+
+                if (i + 1 != codepoints)
+                {
+                    UInt32 code_point2 = generate_code_point(static_cast<UInt32>(rand >> 32u));
+                    size_t bytes2 = UTF8::convertCodePointToUTF8(code_point2, pos, 4);
+                    chassert(bytes2 <= 4);
+                    pos += bytes2;
+                }
             }
+
+            *pos = 0;
+            ++pos;
+
+            offset = pos - data_to.data();
             offsets_to[row_num] = offset;
         }
 
-        /// Put zero bytes in between.
-        auto * pos = data_to.data();
-        for (size_t row_num = 0; row_num < input_rows_count; ++row_num)
-            pos[offsets_to[row_num] - 1] = 0;
+        data_to.resize(offset);
 
         return col_to;
     }

diff --git a/tests/queries/0_stateless/01278_random_string_utf8.reference b/tests/queries/0_stateless/01278_random_string_utf8.reference
@@ -2,3 +2,4 @@
 String
 1
 
+99
diff --git a/tests/queries/0_stateless/01278_random_string_utf8.sql b/tests/queries/0_stateless/01278_random_string_utf8.sql
@@ -3,3 +3,4 @@ SELECT lengthUTF8(randomStringUTF8(100));
 SELECT toTypeName(randomStringUTF8(10));
 SELECT isValidUTF8(randomStringUTF8(100000));
 SELECT randomStringUTF8(0);
+SELECT lengthUTF8(lowerUTF8(randomStringUTF8(99))); -- bug #49672: msan assert