Skip to content

Commit

Permalink
Roll TF Models lib
Browse files Browse the repository at this point in the history
New version removes usage of libutf since it uses the unsafe
chartorune function.

Bug: b/245564409
Change-Id: I2a1c195db7581d84f16e7ad54de016f3edf373d1
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/4019579
Commit-Queue: Robert Ogden <robertogden@chromium.org>
Reviewed-by: Michael Crouse <mcrouse@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1069848}
  • Loading branch information
Robert Ogden authored and Chromium LUCI CQ committed Nov 10, 2022
1 parent 9a9461f commit d9a2f86
Show file tree
Hide file tree
Showing 12 changed files with 128 additions and 113 deletions.
2 changes: 0 additions & 2 deletions third_party/tensorflow_models/BUILD.gn
Expand Up @@ -20,7 +20,6 @@ static_library("tflite_custom_ops") {
# Shim headers.
"shims/icu4c/source/common/unicode/uchar.h",
"shims/icu4c/source/common/unicode/utf8.h",
"shims/libutf/utf.h",

# When adding/removing entries from this list, also change |update.sh| in
# this directory.
Expand All @@ -46,7 +45,6 @@ static_library("tflite_custom_ops") {
"//third_party/flatbuffers",
"//third_party/icu",
"//third_party/tflite",
"//third_party/utf",
]

configs -= [ "//build/config/compiler:chromium_code" ]
Expand Down
4 changes: 2 additions & 2 deletions third_party/tensorflow_models/README.chromium
@@ -1,8 +1,8 @@
Name: TensorFlow Models
Short Name: tensorflow_models
URL: https://github.com/tensorflow/models
Version: 45fd32c82e1bb55f8eac44d77d40902302c5aee2
Date: 2022/09/29
Version: a2e1e19eea5e7b926dafcc7e10c4e4e475d42f51
Date: 2022/11/09
License: Apache 2.0
License File: src/LICENSE
Security Critical: Yes
Expand Down
15 changes: 0 additions & 15 deletions third_party/tensorflow_models/shims/libutf/utf.h

This file was deleted.

Expand Up @@ -20,18 +20,18 @@ limitations under the License.
#include <sstream>
#include <utility>

#include "icu4c/source/common/unicode/utf8.h"
#include "tf_ops/projection_util.h" // seq_flow_lite

// Returns true if |text| contains at least one ASCII digit ('0'-'9').
//
// |text| is decoded as UTF-8 one code point at a time with ICU's U8_NEXT,
// which advances the byte offset |i| itself. Scanning stops at the first
// ill-formed sequence (U8_NEXT reports it as a negative code point), so digits
// located after invalid bytes are not considered.
bool IsDigit(const std::string& text) {
  for (size_t i = 0; i < text.length();) {
    UChar32 c;
    U8_NEXT(text.data(), i, text.length(), c);
    if (c < 0) break;
    if (c >= '0' && c <= '9') {
      return true;
    }
  }
  return false;
}
Expand Down
Expand Up @@ -19,8 +19,6 @@ limitations under the License.
#include <unordered_set>
#include <vector>

#include "libutf/utf.h"

// Normalizes the input with the given |separators| by adding a space before and
// after each separator. When |normalize_repetition| is true, it removes the
// repeated characters (except numbers) which consecutively appeared more than
Expand Down
Expand Up @@ -22,7 +22,11 @@ limitations under the License.
#include <sstream>
#include <unordered_set>

#include "icu4c/source/common/unicode/uchar.h"
#include "icu4c/source/common/unicode/utf8.h"

namespace {

constexpr int kInvalid = -1;
constexpr char kSpace = ' ';

Expand Down Expand Up @@ -168,25 +172,33 @@ class UnicodeHash : public HashEngine {
int feature_size) override {
std::vector<uint64_t> hash_codes;
hash_codes.reserve(2 * (feature_size / 64 + 1));
auto word_ptr = word.c_str();
int utflength = utflen(const_cast<char*>(word_ptr));
// Both `feature_size` and `bits_per_unicode` are bit lengths.
const int max_usable_runes = feature_size * 2 / bits_per_unicode_;
if (max_usable_runes < utflength) {
const int unicode_skip = (utflength - max_usable_runes) / 2;
const char* word_ptr = word.c_str();
int word_index = 0;
int utflength = 0;
while (word_index < word.length()) {
UChar32 c;
U8_NEXT(word_ptr, word_index, word.length(), c);
if (c < 0) break;
utflength++;
}
word_index = 0;
// Both `feature_size` and `bits_per_unicode` are bit lengths.
const int max_usable_chars = feature_size * 2 / bits_per_unicode_;
if (max_usable_chars < utflength) {
const int unicode_skip = (utflength - max_usable_chars) / 2;
for (int i = 0; i < unicode_skip; ++i) {
Rune rune;
word_ptr += chartorune(&rune, const_cast<char*>(word_ptr));
UChar32 c;
U8_NEXT(word_ptr, word_index, word.length(), c);
}
utflength = max_usable_runes;
utflength = max_usable_chars;
}

std::vector<uint64_t> unicode_hashes;
unicode_hashes.reserve(utflength);
for (int i = 0; i < utflength; ++i) {
Rune rune;
word_ptr += chartorune(&rune, const_cast<char*>(word_ptr));
unicode_hashes.push_back((rune * kMul) & bit_mask_);
UChar32 c;
U8_NEXT(word_ptr, word_index, word.length(), c);
unicode_hashes.push_back((c * kMul) & bit_mask_);
}

uint64_t hash = 0;
Expand Down Expand Up @@ -252,42 +264,37 @@ std::string ProjectionUnicodeHandler::LowerCaseUTF8WithSupportedUnicodes(
// is allocated for target.
const char* csource = source.first;
int len = source.second;
int i = 0;
auto target = std::unique_ptr<char[]>(new char[len * 4]);
auto target_ptr = target.get();
int i = 0;
int target_len = 0;
bool first_char = true;
bool first_cap_value = false;
bool all_caps_value = false;
while (i < len) {
Rune rune;
const int bytes_read = chartorune(&rune, const_cast<char*>(csource + i));
if (bytes_read == 0 || bytes_read > len - i) {
break;
UChar32 c;
U8_NEXT(csource, i, len, c);
if (c < 0) break;
UChar32 lower = u_tolower(c);
// Skip processing the unicode if exclude_nonalphaspace_unicodes_ is
// true and the unicode is not alpha and not space.
const UChar32 kSpaceChar = ' ';
if (exclude_nonalphaspace_unicodes_ && !u_isUAlphabetic(lower) &&
lower != kSpaceChar) {
continue;
}
i += bytes_read;
if (rune != Runeerror) {
Rune lower = tolowerrune(rune);
// Skip processing the unicode if exclude_nonalphaspace_unicodes_ is
// true and the unicode is not alpha and not space.
const Rune kSpaceRune = ' ';
if (exclude_nonalphaspace_unicodes_ && !isalpharune(lower) &&
lower != kSpaceRune) {
continue;
}
if (IsUnrestrictedVocabulary() || IsValidUnicode(lower)) {
const int bytes_written = runetochar(target_ptr, &lower);
target_ptr += bytes_written;

const bool lower_case = (lower == rune);
if (first_char) {
first_cap_value = !lower_case;
all_caps_value = !lower_case;
} else {
first_cap_value &= lower_case;
all_caps_value &= !lower_case;
}
first_char = false;
if (IsUnrestrictedVocabulary() || IsValidUnicode(lower)) {
U8_APPEND_UNSAFE(target_ptr, target_len, lower);

const bool lower_case = (lower == c);
if (first_char) {
first_cap_value = !lower_case;
all_caps_value = !lower_case;
} else {
first_cap_value &= lower_case;
all_caps_value &= !lower_case;
}
first_char = false;
}
}
if (first_cap) {
Expand All @@ -296,28 +303,24 @@ std::string ProjectionUnicodeHandler::LowerCaseUTF8WithSupportedUnicodes(
if (all_caps) {
*all_caps = all_caps_value;
}
return std::string(target.get(), target_ptr);
return std::string(target_ptr, target_len);
}

// Populates |valid_chars_| from the UTF-8 encoded |vocabulary|.
//
// Code points are decoded in order with U8_NEXT. Each accepted code point is
// mapped to the next free index, starting at 0. Decoding stops at the first
// ill-formed sequence (U8_NEXT yields a negative code point). A code point is
// skipped, with a message written to std::clog, when it is U+FFFD, is already
// present in |valid_chars_|, or is upper case according to u_isUUppercase().
void ProjectionUnicodeHandler::InitializeVocabulary(
    const std::string& vocabulary) {
  const char* const data = vocabulary.c_str();
  const size_t length = vocabulary.length();
  // |valid_chars_| stores int indices, so count with an int.
  int index = 0;
  for (size_t i = 0; i < length;) {
    UChar32 c;
    U8_NEXT(data, i, length, c);
    if (c < 0) break;
    // Include novel lower case unicode segments as part of valid chars.
    if (c == 0xFFFD) {
      std::clog << "Invalid character in vocabulary.";
    } else if (IsValidUnicode(c)) {
      std::clog << "Duplicate character " << c << " found in vocabulary.";
    } else if (u_isUUppercase(c)) {
      std::clog << "Upper case character " << c << " found in vocabulary.";
    } else {
      valid_chars_[c] = index++;
    }
  }
}
Expand Down Expand Up @@ -379,15 +382,15 @@ std::vector<std::string> SplitBySpace(const char* input_ptr, size_t len,
// Splits the UTF-8 buffer [input_ptr, input_ptr + len) into one token per code
// point and appends the tokens to |tokens|.
//
// Each token is constructed in place from (pointer to the first byte, byte
// count), so T must be constructible from (const char*, size_t). Splitting
// stops at the first ill-formed sequence (U8_NEXT yields a negative code
// point), or once |tokens| holds |max_tokens| entries. Passing kInvalid as
// |max_tokens| disables the limit.
template <typename T>
void SplitByCharInternal(std::vector<T>* tokens, const char* input_ptr,
                         size_t len, size_t max_tokens) {
  // kInvalid is a signed constant; convert it once so the comparison below is
  // between two size_t values.
  const bool has_limit = max_tokens != static_cast<size_t>(kInvalid);
  for (size_t i = 0; i < len;) {
    const size_t start = i;
    UChar32 c;
    // U8_NEXT advances |i| past the code point it decodes.
    U8_NEXT(input_ptr, i, len, c);
    if (c < 0) break;
    tokens->emplace_back(input_ptr + start, i - start);
    if (has_limit && tokens->size() == max_tokens) {
      break;
    }
  }
}

Expand Down
Expand Up @@ -14,12 +14,13 @@ limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_MODELS_SEQ_FLOW_LITE_TF_OPS_PROJECTION_UTIL_H_
#define TENSORFLOW_MODELS_SEQ_FLOW_LITE_TF_OPS_PROJECTION_UTIL_H_

#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "libutf/utf.h"
#include "icu4c/source/common/unicode/utf8.h"

inline constexpr int kFirstCapOffset = 3;
inline constexpr int kAllCapsOffset = 4;
Expand Down Expand Up @@ -105,14 +106,14 @@ class ProjectionUnicodeHandler {

// Returns a boolean flag indicating if the unicode segment is part of the
// vocabulary.
bool IsValidUnicode(Rune rune) const {
return valid_chars_.find(rune) != valid_chars_.end();
bool IsValidUnicode(UChar32 c) const {
return valid_chars_.find(c) != valid_chars_.end();
}

// Returns an index in [0, |vocabulary|), if the unicode is part of the
// vocabulary and -1 if it's not.
int UnicodeIndex(Rune rune) const {
return IsValidUnicode(rune) ? valid_chars_.at(rune) : -1;
int UnicodeIndex(UChar32 c) const {
return IsValidUnicode(c) ? valid_chars_.at(c) : -1;
}

// Returns |vocabulary|.
Expand All @@ -137,9 +138,9 @@ class ProjectionUnicodeHandler {
// Parses and extracts supported or allowed unicode segments, also referred
// to as vocabulary, from a utf8 string.
void InitializeVocabulary(const std::string& vocabulary);
// A variable that maps a valid Unicode rune to its index in valid character
// vocabulary.
std::unordered_map<Rune, int> valid_chars_;
// A variable that maps a valid Unicode character to its index in the valid
// character vocabulary.
std::unordered_map<UChar32, int> valid_chars_;
// Controls whether to exclude non-alphabetic, non-space characters from the
// output text.
bool exclude_nonalphaspace_unicodes_;
Expand Down
Expand Up @@ -76,10 +76,12 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
}
}
} else if (output_categories->type == kTfLiteUInt8) {
const uint8_t one = PodQuantize(1.0, output_categories->params.zero_point,
1.0 / output_categories->params.scale);
const uint8_t zero = PodQuantize(0.0, output_categories->params.zero_point,
1.0 / output_categories->params.scale);
const uint8_t one =
PodQuantize<uint8_t>(1.0, output_categories->params.zero_point,
1.0 / output_categories->params.scale);
const uint8_t zero =
PodQuantize<uint8_t>(0.0, output_categories->params.zero_point,
1.0 / output_categories->params.scale);
for (int i = 0; i < input_size; i++) {
absl::flat_hash_set<int> categories;
TF_LITE_ENSURE_STATUS(op->GetCategories(context, i, categories));
Expand Down
Expand Up @@ -17,35 +17,60 @@ limitations under the License.

#include <algorithm>
#include <cmath>
#include <limits>

#include "absl/base/macros.h"
#include "tensorflow/lite/context.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"

namespace seq_flow_lite {

// Returns the original (dequantized) value of 8bit value.
inline float PodDequantizeValue(const TfLiteTensor& tensor, uint8_t value) {
// Returns the original (dequantized) value.
template <typename T>
inline float PodDequantizeValue(const TfLiteTensor& tensor, T value) {
const int32_t zero_point = tensor.params.zero_point;
const float scale = tensor.params.scale;
return (static_cast<int32_t>(value) - zero_point) * scale;
}

// Returns the original (dequantized) value of the 'index'-th element of
// 'tensor', reading the tensor's data as elements of type T.
template <typename T>
inline float PodDequantize(const TfLiteTensor& tensor, int index) {
  return PodDequantizeValue<T>(tensor,
                               tflite::GetTensorData<T>(&tensor)[index]);
}

// Quantizes 'value' to 8bit, given the quantization bias (zero_point) and
// factor (inverse_scale).
inline uint8_t PodQuantize(float value, int32_t zero_point,
float inverse_scale) {
ABSL_DEPRECATED("Use PodDequantizeValue<uint8_t> instead.")
inline float PodDequantizeValue(const TfLiteTensor& tensor, uint8_t value) {
return PodDequantizeValue<uint8_t>(tensor, value);
}

// Non-template overload kept for existing callers: reads element |index| from
// tensor.data.uint8 and dequantizes it via PodDequantizeValue<uint8_t>.
ABSL_DEPRECATED("Use PodDequantize<uint8_t> instead.")
inline float PodDequantize(const TfLiteTensor& tensor, int index) {
return PodDequantizeValue<uint8_t>(tensor, tensor.data.uint8[index]);
}

// Quantizes 'value', given the quantization bias (zero_point) and factor
// (inverse_scale).
//
// Computes round(value * inverse_scale) + zero_point and clamps the result to
// the representable range of T before converting.
// NOTE(review): the clamp bounds are held in int32_t, so this presumes T's
// range fits in int32_t -- confirm no caller instantiates a wider type.
template <typename T>
inline T PodQuantize(float value, int32_t zero_point, float inverse_scale) {
  const float integer_value_in_float = value * inverse_scale;
  // Adding +/-0.5 before the truncating cast rounds to nearest, with ties
  // going away from zero.
  const float offset = (integer_value_in_float >= 0.0) ? 0.5f : -0.5f;
  // NOTE(sfeuz): This assumes value * inverse_scale is within [INT_MIN,
  // INT_MAX].
  const int32_t integer_value =
      static_cast<int32_t>(integer_value_in_float + offset) + zero_point;
  const int32_t lowest = static_cast<int32_t>(std::numeric_limits<T>::min());
  const int32_t highest = static_cast<int32_t>(std::numeric_limits<T>::max());
  return static_cast<T>(std::max(std::min(highest, integer_value), lowest));
}

// Non-template overload kept for existing callers; forwards to
// PodQuantize<uint8_t> with the same arguments.
ABSL_DEPRECATED("Use PodQuantize<uint8_t> instead.")
inline uint8_t PodQuantize(float value, int32_t zero_point,
float inverse_scale) {
return PodQuantize<uint8_t>(value, zero_point, inverse_scale);
}

} // namespace seq_flow_lite
Expand Down

0 comments on commit d9a2f86

Please sign in to comment.