Roll TF Models lib

New version removes usage of libutf since it uses the unsafe chartorune function.

Bug: b/245564409
Change-Id: I2a1c195db7581d84f16e7ad54de016f3edf373d1
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/4019579
Commit-Queue: Robert Ogden <robertogden@chromium.org>
Reviewed-by: Michael Crouse <mcrouse@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1069848}
chromium · Nov 10, 2022 · d9a2f86 · d9a2f86
1 parent 9a9461f
commit d9a2f86
Show file tree

Hide file tree

Showing 12 changed files with 128 additions and 113 deletions.
diff --git a/third_party/tensorflow_models/BUILD.gn b/third_party/tensorflow_models/BUILD.gn
@@ -20,7 +20,6 @@ static_library("tflite_custom_ops") {
     # Shim headers.
     "shims/icu4c/source/common/unicode/uchar.h",
     "shims/icu4c/source/common/unicode/utf8.h",
-    "shims/libutf/utf.h",
 
     # When adding/removing entries from this list, also change |update.sh| in
     # this directory.
@@ -46,7 +45,6 @@ static_library("tflite_custom_ops") {
     "//third_party/flatbuffers",
     "//third_party/icu",
     "//third_party/tflite",
-    "//third_party/utf",
   ]
 
   configs -= [ "//build/config/compiler:chromium_code" ]

diff --git a/third_party/tensorflow_models/README.chromium b/third_party/tensorflow_models/README.chromium
@@ -1,8 +1,8 @@
 Name: TensorFlow Models
 Short Name: tensorflow_models
 URL: https://github.com/tensorflow/models
-Version: 45fd32c82e1bb55f8eac44d77d40902302c5aee2
-Date: 2022/09/29
+Version: a2e1e19eea5e7b926dafcc7e10c4e4e475d42f51
+Date: 2022/11/09
 License: Apache 2.0
 License File: src/LICENSE
 Security Critical: Yes

diff --git a/third_party/tensorflow_models/shims/libutf/utf.h b/third_party/tensorflow_models/shims/libutf/utf.h
diff --git a/third_party/tensorflow_models/src/research/seq_flow_lite/tf_ops/projection_normalizer_util.cc b/third_party/tensorflow_models/src/research/seq_flow_lite/tf_ops/projection_normalizer_util.cc
@@ -20,18 +20,18 @@ limitations under the License.
 #include <sstream>
 #include <utility>
 
+#include "icu4c/source/common/unicode/utf8.h"
 #include "tf_ops/projection_util.h"  // seq_flow_lite
 
 // Returns true if the given text contains a number.
 bool IsDigit(const std::string& text) {
-  Rune rune;
   for (size_t i = 0; i < text.length();) {
-    const int bytes_read = chartorune(&rune, const_cast<char*>(text.data()));
-    if (rune == Runeerror || bytes_read == 0) break;
-    if (rune >= static_cast<Rune>('0') && rune <= static_cast<Rune>('9')) {
+    UChar32 c;
+    U8_NEXT(text.data(), i, text.length(), c);
+    if (c < 0) break;
+    if (c >= '0' && c <= '9') {
       return true;
     }
-    i += bytes_read;
   }
   return false;
 }

diff --git a/third_party/tensorflow_models/src/research/seq_flow_lite/tf_ops/projection_normalizer_util.h b/third_party/tensorflow_models/src/research/seq_flow_lite/tf_ops/projection_normalizer_util.h
@@ -19,8 +19,6 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
-#include "libutf/utf.h"
-
 // Normalizes the input with the given |separators| by adding a space before and
 // after each separator. When |normalize_repetition| is true, it removes the
 // repeated characters (except numbers) which consecutively appeared more than

diff --git a/third_party/tensorflow_models/src/research/seq_flow_lite/tf_ops/projection_util.cc b/third_party/tensorflow_models/src/research/seq_flow_lite/tf_ops/projection_util.cc
@@ -22,7 +22,11 @@ limitations under the License.
 #include <sstream>
 #include <unordered_set>
 
+#include "icu4c/source/common/unicode/uchar.h"
+#include "icu4c/source/common/unicode/utf8.h"
+
 namespace {
+
 constexpr int kInvalid = -1;
 constexpr char kSpace = ' ';
 
@@ -168,25 +172,33 @@ class UnicodeHash : public HashEngine {
                                      int feature_size) override {
     std::vector<uint64_t> hash_codes;
     hash_codes.reserve(2 * (feature_size / 64 + 1));
-    auto word_ptr = word.c_str();
-    int utflength = utflen(const_cast<char*>(word_ptr));
-    // Both `feature_size` and `bits_per_unicode` are bit lengths.
-    const int max_usable_runes = feature_size * 2 / bits_per_unicode_;
-    if (max_usable_runes < utflength) {
-      const int unicode_skip = (utflength - max_usable_runes) / 2;
+    const char* word_ptr = word.c_str();
+    int word_index = 0;
+    int utflength = 0;
+    while (word_index < word.length()) {
+      UChar32 c;
+      U8_NEXT(word_ptr, word_index, word.length(), c);
+      if (c < 0) break;
+      utflength++;
+    }
+    word_index = 0;
+    //  Both `feature_size` and `bits_per_unicode` are bit lengths.
+    const int max_usable_chars = feature_size * 2 / bits_per_unicode_;
+    if (max_usable_chars < utflength) {
+      const int unicode_skip = (utflength - max_usable_chars) / 2;
       for (int i = 0; i < unicode_skip; ++i) {
-        Rune rune;
-        word_ptr += chartorune(&rune, const_cast<char*>(word_ptr));
+        UChar32 c;
+        U8_NEXT(word_ptr, word_index, word.length(), c);
       }
-      utflength = max_usable_runes;
+      utflength = max_usable_chars;
     }
 
     std::vector<uint64_t> unicode_hashes;
     unicode_hashes.reserve(utflength);
     for (int i = 0; i < utflength; ++i) {
-      Rune rune;
-      word_ptr += chartorune(&rune, const_cast<char*>(word_ptr));
-      unicode_hashes.push_back((rune * kMul) & bit_mask_);
+      UChar32 c;
+      U8_NEXT(word_ptr, word_index, word.length(), c);
+      unicode_hashes.push_back((c * kMul) & bit_mask_);
     }
 
     uint64_t hash = 0;
@@ -252,42 +264,37 @@ std::string ProjectionUnicodeHandler::LowerCaseUTF8WithSupportedUnicodes(
   // is allocated for target.
   const char* csource = source.first;
   int len = source.second;
+  int i = 0;
   auto target = std::unique_ptr<char[]>(new char[len * 4]);
   auto target_ptr = target.get();
-  int i = 0;
+  int target_len = 0;
   bool first_char = true;
   bool first_cap_value = false;
   bool all_caps_value = false;
   while (i < len) {
-    Rune rune;
-    const int bytes_read = chartorune(&rune, const_cast<char*>(csource + i));
-    if (bytes_read == 0 || bytes_read > len - i) {
-      break;
+    UChar32 c;
+    U8_NEXT(csource, i, len, c);
+    if (c < 0) break;
+    UChar32 lower = u_tolower(c);
+    // Skip processing the unicode if exclude_nonalphaspace_unicodes_ is
+    // true and the unicode is not alpha and not space.
+    const UChar32 kSpaceChar = ' ';
+    if (exclude_nonalphaspace_unicodes_ && !u_isUAlphabetic(lower) &&
+        lower != kSpaceChar) {
+      continue;
     }
-    i += bytes_read;
-    if (rune != Runeerror) {
-      Rune lower = tolowerrune(rune);
-      // Skip processing the unicode if exclude_nonalphaspace_unicodes_ is
-      // true and the unicode is not alpha and not space.
-      const Rune kSpaceRune = ' ';
-      if (exclude_nonalphaspace_unicodes_ && !isalpharune(lower) &&
-          lower != kSpaceRune) {
-        continue;
-      }
-      if (IsUnrestrictedVocabulary() || IsValidUnicode(lower)) {
-        const int bytes_written = runetochar(target_ptr, &lower);
-        target_ptr += bytes_written;
-
-        const bool lower_case = (lower == rune);
-        if (first_char) {
-          first_cap_value = !lower_case;
-          all_caps_value = !lower_case;
-        } else {
-          first_cap_value &= lower_case;
-          all_caps_value &= !lower_case;
-        }
-        first_char = false;
+    if (IsUnrestrictedVocabulary() || IsValidUnicode(lower)) {
+      U8_APPEND_UNSAFE(target_ptr, target_len, lower);
+
+      const bool lower_case = (lower == c);
+      if (first_char) {
+        first_cap_value = !lower_case;
+        all_caps_value = !lower_case;
+      } else {
+        first_cap_value &= lower_case;
+        all_caps_value &= !lower_case;
       }
+      first_char = false;
     }
   }
   if (first_cap) {
@@ -296,28 +303,24 @@ std::string ProjectionUnicodeHandler::LowerCaseUTF8WithSupportedUnicodes(
   if (all_caps) {
     *all_caps = all_caps_value;
   }
-  return std::string(target.get(), target_ptr);
+  return std::string(target_ptr, target_len);
 }
 
 void ProjectionUnicodeHandler::InitializeVocabulary(
     const std::string& vocabulary) {
   for (size_t i = 0, index = 0; i < vocabulary.length();) {
-    Rune rune;
-    const int bytes_read =
-        chartorune(&rune, const_cast<char*>(vocabulary.c_str() + i));
-    if (!bytes_read || bytes_read > (vocabulary.length() - i)) {
-      break;
-    }
-    i += bytes_read;
+    UChar32 c;
+    U8_NEXT(const_cast<char*>(vocabulary.c_str()), i, vocabulary.length(), c);
+    if (c < 0) break;
     // Include novel lower case unicode segments as part of valid chars.
-    if (rune == Runeerror) {
-      std::clog << "Invalid rune in vocabulary.";
-    } else if (IsValidUnicode(rune)) {
-      std::clog << "Duplicate rune " << rune << " found in vocabulary.";
-    } else if (rune != tolowerrune(rune)) {
-      std::clog << "Upper case rune " << rune << " found in vocabulary.";
+    if (c == 0xFFFD) {
+      std::clog << "Invalid character in vocabulary.";
+    } else if (IsValidUnicode(c)) {
+      std::clog << "Duplicate character " << c << " found in vocabulary.";
+    } else if (u_isUUppercase(c)) {
+      std::clog << "Upper case character " << c << " found in vocabulary.";
     } else {
-      valid_chars_[rune] = index++;
+      valid_chars_[c] = index++;
     }
   }
 }
@@ -379,15 +382,15 @@ std::vector<std::string> SplitBySpace(const char* input_ptr, size_t len,
 template <typename T>
 void SplitByCharInternal(std::vector<T>* tokens, const char* input_ptr,
                          size_t len, size_t max_tokens) {
-  Rune rune;
   for (size_t i = 0; i < len;) {
-    auto bytes_read = chartorune(&rune, const_cast<char*>(input_ptr + i));
-    if (bytes_read == 0 || bytes_read > (len - i)) break;
-    tokens->emplace_back(input_ptr + i, bytes_read);
+    UChar32 c;
+    size_t old_i = i;
+    U8_NEXT(input_ptr, i, len, c);
+    if (c < 0) break;
+    tokens->emplace_back(input_ptr + old_i, i - old_i);
     if (max_tokens != kInvalid && tokens->size() == max_tokens) {
       break;
     }
-    i += bytes_read;
   }
 }
 

diff --git a/third_party/tensorflow_models/src/research/seq_flow_lite/tf_ops/projection_util.h b/third_party/tensorflow_models/src/research/seq_flow_lite/tf_ops/projection_util.h
@@ -14,12 +14,13 @@ limitations under the License.
 ==============================================================================*/
 #ifndef TENSORFLOW_MODELS_SEQ_FLOW_LITE_TF_OPS_PROJECTION_UTIL_H_
 #define TENSORFLOW_MODELS_SEQ_FLOW_LITE_TF_OPS_PROJECTION_UTIL_H_
+
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
 
-#include "libutf/utf.h"
+#include "icu4c/source/common/unicode/utf8.h"
 
 inline constexpr int kFirstCapOffset = 3;
 inline constexpr int kAllCapsOffset = 4;
@@ -105,14 +106,14 @@ class ProjectionUnicodeHandler {
 
   // Returns a boolean flag indicating if the unicode segment is part of the
   // vocabulary.
-  bool IsValidUnicode(Rune rune) const {
-    return valid_chars_.find(rune) != valid_chars_.end();
+  bool IsValidUnicode(UChar32 c) const {
+    return valid_chars_.find(c) != valid_chars_.end();
   }
 
   // Returns an index in [0, |vocabulary|), if the unicode is part of the
   // vocabulary and -1 if it's not.
-  int UnicodeIndex(Rune rune) const {
-    return IsValidUnicode(rune) ? valid_chars_.at(rune) : -1;
+  int UnicodeIndex(UChar32 c) const {
+    return IsValidUnicode(c) ? valid_chars_.at(c) : -1;
   }
 
   // Returns |vocabulary|.
@@ -137,9 +138,9 @@ class ProjectionUnicodeHandler {
   // Parses and extracts supported or allowed unicode segments, also referred
   // to as vocabulary, from a utf8 string.
   void InitializeVocabulary(const std::string& vocabulary);
-  // A variable that maps a valid Unicode rune to its index in valid character
-  // vocabulary.
-  std::unordered_map<Rune, int> valid_chars_;
+  // A variable that maps a valid Unicode character to its index in the valid
+  // character vocabulary.
+  std::unordered_map<UChar32, int> valid_chars_;
   // Controls whether to exclude non-alphabetic, non-space characters from the
   // output text.
   bool exclude_nonalphaspace_unicodes_;

diff --git a/third_party/tensorflow_models/src/research/seq_flow_lite/tflite_ops/denylist.cc b/third_party/tensorflow_models/src/research/seq_flow_lite/tflite_ops/denylist.cc
@@ -76,10 +76,12 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       }
     }
   } else if (output_categories->type == kTfLiteUInt8) {
-    const uint8_t one = PodQuantize(1.0, output_categories->params.zero_point,
-                                    1.0 / output_categories->params.scale);
-    const uint8_t zero = PodQuantize(0.0, output_categories->params.zero_point,
-                                     1.0 / output_categories->params.scale);
+    const uint8_t one =
+        PodQuantize<uint8_t>(1.0, output_categories->params.zero_point,
+                             1.0 / output_categories->params.scale);
+    const uint8_t zero =
+        PodQuantize<uint8_t>(0.0, output_categories->params.zero_point,
+                             1.0 / output_categories->params.scale);
     for (int i = 0; i < input_size; i++) {
       absl::flat_hash_set<int> categories;
       TF_LITE_ENSURE_STATUS(op->GetCategories(context, i, categories));

diff --git a/third_party/tensorflow_models/src/research/seq_flow_lite/tflite_ops/quantization_util.h b/third_party/tensorflow_models/src/research/seq_flow_lite/tflite_ops/quantization_util.h
@@ -17,35 +17,60 @@ limitations under the License.
 
 #include <algorithm>
 #include <cmath>
+#include <limits>
 
+#include "absl/base/macros.h"
 #include "tensorflow/lite/context.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 
 namespace seq_flow_lite {
 
-// Returns the original (dequantized) value of 8bit value.
-inline float PodDequantizeValue(const TfLiteTensor& tensor, uint8_t value) {
+// Returns the original (dequantized) value.
+template <typename T>
+inline float PodDequantizeValue(const TfLiteTensor& tensor, T value) {
   const int32_t zero_point = tensor.params.zero_point;
   const float scale = tensor.params.scale;
   return (static_cast<int32_t>(value) - zero_point) * scale;
 }
 
 // Returns the original (dequantized) value of the 'index'-th element of
 // 'tensor.
+template <typename T>
 inline float PodDequantize(const TfLiteTensor& tensor, int index) {
-  return PodDequantizeValue(tensor, tensor.data.uint8[index]);
+  return PodDequantizeValue<T>(tensor,
+                               tflite::GetTensorData<T>(&tensor)[index]);
 }
 
-// Quantizes 'value' to 8bit, given the quantization bias (zero_point) and
-// factor (inverse_scale).
-inline uint8_t PodQuantize(float value, int32_t zero_point,
-                           float inverse_scale) {
+ABSL_DEPRECATED("Use PodDequantizeValue<uint8_t> instead.")
+inline float PodDequantizeValue(const TfLiteTensor& tensor, uint8_t value) {
+  return PodDequantizeValue<uint8_t>(tensor, value);
+}
+
+ABSL_DEPRECATED("Use PodDequantize<uint8_t> instead.")
+inline float PodDequantize(const TfLiteTensor& tensor, int index) {
+  return PodDequantizeValue<uint8_t>(tensor, tensor.data.uint8[index]);
+}
+
+// Quantizes 'value', given the quantization bias (zero_point) and factor
+// (inverse_scale).
+template <typename T>
+inline T PodQuantize(float value, int32_t zero_point, float inverse_scale) {
   const float integer_value_in_float = value * inverse_scale;
   const float offset = (integer_value_in_float >= 0.0) ? 0.5f : -0.5f;
   // NOTE(sfeuz): This assumes value * inverse_scale is within [INT_MIN,
   // INT_MAX].
   int32_t integer_value =
       static_cast<int32_t>(integer_value_in_float + offset) + zero_point;
-  return static_cast<uint8_t>(std::max(std::min(255, integer_value), 0));
+  return static_cast<T>(
+      std::max(std::min(static_cast<int32_t>(std::numeric_limits<T>::max()),
+                        integer_value),
+               static_cast<int32_t>(std::numeric_limits<T>::min())));
+}
+
+ABSL_DEPRECATED("Use PodQuantize<uint8_t> instead.")
+inline uint8_t PodQuantize(float value, int32_t zero_point,
+                           float inverse_scale) {
+  return PodQuantize<uint8_t>(value, zero_point, inverse_scale);
 }
 
 }  // namespace seq_flow_lite