From aa1cd1a35e948da2eb9d097cad5e32b773d27079 Mon Sep 17 00:00:00 2001 From: Justin King Date: Wed, 10 Sep 2025 15:00:38 -0700 Subject: [PATCH] Refactor UTF8 encoding/decoding functions PiperOrigin-RevId: 805528572 --- internal/BUILD | 1 + internal/utf8.cc | 131 ++++++++++++++++++++++++++++-------------- internal/utf8.h | 25 +++++++- internal/utf8_test.cc | 4 ++ 4 files changed, 114 insertions(+), 47 deletions(-) diff --git a/internal/BUILD b/internal/BUILD index 889279c26..e215b3da7 100644 --- a/internal/BUILD +++ b/internal/BUILD @@ -344,6 +344,7 @@ cc_library( deps = [ ":unicode", "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/base:nullability", "@com_google_absl//absl/log:absl_check", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:cord", diff --git a/internal/utf8.cc b/internal/utf8.cc index b6de9d74b..8cda91505 100644 --- a/internal/utf8.cc +++ b/internal/utf8.cc @@ -20,8 +20,8 @@ #include #include -#include "absl/base/attributes.h" #include "absl/base/macros.h" +#include "absl/base/nullability.h" #include "absl/base/optimization.h" #include "absl/log/absl_check.h" #include "absl/strings/cord.h" @@ -355,77 +355,109 @@ std::pair Utf8Validate(const absl::Cord& str) { namespace { -std::pair Utf8DecodeImpl(uint8_t b, uint8_t leading, - size_t size, absl::string_view str) { +size_t Utf8DecodeImpl(uint8_t b, uint8_t leading, size_t size, + absl::string_view str, + char32_t* absl_nullable code_point) { const auto& accept = kAccept[leading >> 4]; const auto b1 = static_cast(str.front()); if (ABSL_PREDICT_FALSE(b1 < accept.first || b1 > accept.second)) { - return {kUnicodeReplacementCharacter, 1}; + if (code_point != nullptr) { + *code_point = kUnicodeReplacementCharacter; + } + return 1; } if (size <= 1) { - return {(static_cast(b & kMask2) << 6) | - static_cast(b1 & kMaskX), - 2}; + if (code_point != nullptr) { + *code_point = (static_cast(b & kMask2) << 6) | + static_cast(b1 & kMaskX); + } + return 2; } str.remove_prefix(1); const auto b2 = static_cast(str.front()); if (ABSL_PREDICT_FALSE(b2 < kLow || b2 > kHigh)) { - return {kUnicodeReplacementCharacter, 1}; + if (code_point != nullptr) { + *code_point = kUnicodeReplacementCharacter; + } + return 1; } if (size <= 2) { - return {(static_cast(b & kMask3) << 12) | - (static_cast(b1 & kMaskX) << 6) | - static_cast(b2 & kMaskX), - 3}; + if (code_point != nullptr) { + *code_point = (static_cast(b & kMask3) << 12) | + (static_cast(b1 & kMaskX) << 6) | + static_cast(b2 & kMaskX); + } + return 3; } str.remove_prefix(1); const auto b3 = static_cast(str.front()); if (ABSL_PREDICT_FALSE(b3 < kLow || b3 > kHigh)) { - return {kUnicodeReplacementCharacter, 1}; + if (code_point != nullptr) { + *code_point = kUnicodeReplacementCharacter; + } + return 1; } - return {(static_cast(b & kMask4) << 18) | - (static_cast(b1 & kMaskX) << 12) | - (static_cast(b2 & kMaskX) << 6) | - static_cast(b3 & kMaskX), - 4}; + if (code_point != nullptr) { + *code_point = (static_cast(b & kMask4) << 18) | + (static_cast(b1 & kMaskX) << 12) | + (static_cast(b2 & kMaskX) << 6) | + static_cast(b3 & kMaskX); + } + return 4; } } // namespace -std::pair Utf8Decode(absl::string_view str) { +size_t Utf8Decode(absl::string_view str, char32_t* absl_nullable code_point) { ABSL_DCHECK(!str.empty()); const auto b = static_cast(str.front()); if (b < kUtf8RuneSelf) { - return {static_cast(b), 1}; + if (code_point != nullptr) { + *code_point = static_cast(b); + } + return 1; } const auto leading = kLeading[b]; if (ABSL_PREDICT_FALSE(leading == kXX)) { - return {kUnicodeReplacementCharacter, 1}; + if (code_point != nullptr) { + *code_point = kUnicodeReplacementCharacter; + } + return 1; } auto size = static_cast(leading & 7) - 1; str.remove_prefix(1); if (ABSL_PREDICT_FALSE(size > str.size())) { - return {kUnicodeReplacementCharacter, 1}; + if (code_point != nullptr) { + *code_point = kUnicodeReplacementCharacter; + } + return 1; } - return Utf8DecodeImpl(b, leading, size, str); + return Utf8DecodeImpl(b, leading, size, str, code_point); } -std::pair Utf8Decode(const absl::Cord::CharIterator& it) { +size_t Utf8Decode(const absl::Cord::CharIterator& it, + char32_t* absl_nullable code_point) { absl::string_view str = absl::Cord::ChunkRemaining(it); ABSL_DCHECK(!str.empty()); const auto b = static_cast(str.front()); if (b < kUtf8RuneSelf) { - return {static_cast(b), 1}; + if (code_point != nullptr) { + *code_point = static_cast(b); + } + return 1; } const auto leading = kLeading[b]; if (ABSL_PREDICT_FALSE(leading == kXX)) { - return {kUnicodeReplacementCharacter, 1}; + if (code_point != nullptr) { + *code_point = kUnicodeReplacementCharacter; + } + return 1; } auto size = static_cast(leading & 7) - 1; str.remove_prefix(1); if (ABSL_PREDICT_TRUE(size <= str.size())) { // Fast path. - return Utf8DecodeImpl(b, leading, size, str); + return Utf8DecodeImpl(b, leading, size, str, code_point); } absl::Cord::CharIterator current = it; absl::Cord::Advance(¤t, 1); @@ -434,49 +466,60 @@ std::pair Utf8Decode(const absl::Cord::CharIterator& it) { while (buffer_len < size) { str = absl::Cord::ChunkRemaining(current); if (ABSL_PREDICT_FALSE(str.empty())) { - return {kUnicodeReplacementCharacter, 1}; + if (code_point != nullptr) { + *code_point = kUnicodeReplacementCharacter; + } + return 1; } size_t to_copy = std::min(size_t{3} - buffer_len, str.size()); std::memcpy(buffer + buffer_len, str.data(), to_copy); buffer_len += to_copy; absl::Cord::Advance(¤t, to_copy); } - return Utf8DecodeImpl(b, leading, size, - absl::string_view(buffer, buffer_len)); + return Utf8DecodeImpl(b, leading, size, absl::string_view(buffer, buffer_len), + code_point); } -size_t Utf8Encode(std::string& buffer, char32_t code_point) { +size_t Utf8Encode(char32_t code_point, std::string* absl_nonnull buffer) { + ABSL_DCHECK(buffer != nullptr); + + char storage[4]; + size_t storage_len = Utf8Encode(code_point, storage); + buffer->append(storage, storage_len); + return storage_len; +} + +size_t Utf8Encode(char32_t code_point, char* absl_nonnull buffer) { + ABSL_DCHECK(buffer != nullptr); + if (ABSL_PREDICT_FALSE(!UnicodeIsValid(code_point))) { code_point = kUnicodeReplacementCharacter; } - char storage[4]; size_t storage_len = 0; if (code_point <= 0x7f) { - storage[storage_len++] = - static_cast(static_cast(code_point)); + buffer[storage_len++] = static_cast(static_cast(code_point)); } else if (code_point <= 0x7ff) { - storage[storage_len++] = + buffer[storage_len++] = static_cast(kT2 | static_cast(code_point >> 6)); - storage[storage_len++] = + buffer[storage_len++] = static_cast(kTX | (static_cast(code_point) & kMaskX)); } else if (code_point <= 0xffff) { - storage[storage_len++] = + buffer[storage_len++] = static_cast(kT3 | static_cast(code_point >> 12)); - storage[storage_len++] = static_cast( + buffer[storage_len++] = static_cast( kTX | (static_cast(code_point >> 6) & kMaskX)); - storage[storage_len++] = + buffer[storage_len++] = static_cast(kTX | (static_cast(code_point) & kMaskX)); } else { - storage[storage_len++] = + buffer[storage_len++] = static_cast(kT4 | static_cast(code_point >> 18)); - storage[storage_len++] = static_cast( + buffer[storage_len++] = static_cast( kTX | (static_cast(code_point >> 12) & kMaskX)); - storage[storage_len++] = static_cast( + buffer[storage_len++] = static_cast( kTX | (static_cast(code_point >> 6) & kMaskX)); - storage[storage_len++] = + buffer[storage_len++] = static_cast(kTX | (static_cast(code_point) & kMaskX)); } - buffer.append(storage, storage_len); return storage_len; } diff --git a/internal/utf8.h b/internal/utf8.h index 8aa1b7457..f6b530636 100644 --- a/internal/utf8.h +++ b/internal/utf8.h @@ -19,6 +19,8 @@ #include #include +#include "absl/base/attributes.h" +#include "absl/base/nullability.h" #include "absl/strings/cord.h" #include "absl/strings/string_view.h" @@ -50,13 +52,30 @@ std::pair Utf8Validate(const absl::Cord& str); // sequence is returned the replacement character, U+FFFD, is returned with a // code unit count of 1. As U+FFFD requires 3 code units when encoded, this can // be used to differentiate valid input from malformed input. -std::pair Utf8Decode(absl::string_view str); -std::pair Utf8Decode(const absl::Cord::CharIterator& it); +size_t Utf8Decode(absl::string_view str, char32_t* absl_nullable code_point); +size_t Utf8Decode(const absl::Cord::CharIterator& it, + char32_t* absl_nullable code_point); +inline std::pair Utf8Decode(absl::string_view str) { + char32_t code_point; + size_t code_units = Utf8Decode(str, &code_point); + return std::pair{code_point, code_units}; +} +inline std::pair Utf8Decode( + const absl::Cord::CharIterator& it) { + char32_t code_point; + size_t code_units = Utf8Decode(it, &code_point); + return std::pair{code_point, code_units}; +} // Encodes the given code point and appends it to the buffer. If the code point // is an unpaired surrogate or outside of the valid Unicode range it is replaced // with the replacement character, U+FFFD. -size_t Utf8Encode(std::string& buffer, char32_t code_point); +size_t Utf8Encode(char32_t code_point, std::string* absl_nonnull buffer); +size_t Utf8Encode(char32_t code_point, char* absl_nonnull buffer); +ABSL_DEPRECATED("Use other overload") +inline size_t Utf8Encode(std::string& buffer, char32_t code_point) { + return Utf8Encode(code_point, &buffer); +} } // namespace cel::internal diff --git a/internal/utf8_test.cc b/internal/utf8_test.cc index 2569dbce0..800102b12 100644 --- a/internal/utf8_test.cc +++ b/internal/utf8_test.cc @@ -226,6 +226,8 @@ TEST_P(Utf8DecodeTest, StringView) { << absl::CHexEscape(test_case.code_units); EXPECT_EQ(code_point, test_case.code_point) << absl::CHexEscape(test_case.code_units); + EXPECT_EQ(Utf8Decode(test_case.code_units, nullptr), + test_case.code_units.size()); } TEST_P(Utf8DecodeTest, Cord) { @@ -239,6 +241,8 @@ TEST_P(Utf8DecodeTest, Cord) { << absl::CHexEscape(test_case.code_units); EXPECT_EQ(code_point, test_case.code_point) << absl::CHexEscape(test_case.code_units); + it = cord.char_begin(); + EXPECT_EQ(Utf8Decode(it, nullptr), test_case.code_units.size()); } std::vector FragmentString(absl::string_view text) {