Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
PiperOrigin-RevId: 516871638
- Loading branch information
MediaPipe Team
authored and
Copybara-Service
committed
Mar 15, 2023
1 parent
04ffb84
commit ce3cd94
Showing
10 changed files
with
970 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
42 changes: 42 additions & 0 deletions
42
mediapipe/tasks/cc/text/language_detector/custom_ops/utils/BUILD
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# Copyright 2023 The MediaPipe Authors. All Rights Reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
package(default_visibility = ["//mediapipe/tasks:internal"])

licenses(["notice"])

# String tokenization / lowercasing helpers used by the language detector's
# n-gram hashing custom op.
cc_library(
    name = "ngram_hash_ops_utils",
    srcs = [
        "ngram_hash_ops_utils.cc",
    ],
    hdrs = [
        "ngram_hash_ops_utils.h",
    ],
    deps = [
        # UTF-8 rune decoding/encoding library (rune.c / runetype.c).
        "//mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf",
    ],
)

# Unit tests for :ngram_hash_ops_utils.
cc_test(
    name = "ngram_hash_ops_utils_test",
    size = "small",
    srcs = [
        "ngram_hash_ops_utils_test.cc",
    ],
    deps = [
        ":ngram_hash_ops_utils",
        "//mediapipe/framework/port:gtest_main",
    ],
)
96 changes: 96 additions & 0 deletions
96
mediapipe/tasks/cc/text/language_detector/custom_ops/utils/ngram_hash_ops_utils.cc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
/* Copyright 2023 The MediaPipe Authors. All Rights Reserved. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
==============================================================================*/ | ||
|
||
#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/ngram_hash_ops_utils.h" | ||
|
||
#include <string> | ||
#include <utility> | ||
#include <vector> | ||
|
||
#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/utf.h" | ||
|
||
namespace mediapipe::tasks::text::language_detector::custom_ops { | ||
|
||
// Tokenizes `input_str` (UTF-8, `len` bytes) rune-by-rune into at most
// `max_tokens` tokens, wrapping the result in a "^" prefix token and a "$"
// suffix token. Returns the processed string plus each token's (offset, size)
// within it.
TokenizedOutput Tokenize(const char* input_str, int len, int max_tokens,
                         bool exclude_nonalphaspace_tokens) {
  const std::string kPrefix = "^";
  const std::string kSuffix = "$";
  const std::string kReplacementToken = " ";

  TokenizedOutput output;

  size_t token_start = 0;
  // +2 leaves room for the prefix and suffix tokens.
  output.str.reserve(len + 2);
  output.tokens.reserve(len + 2);

  output.str.append(kPrefix);
  output.tokens.push_back(std::make_pair(token_start, kPrefix.size()));
  token_start += kPrefix.size();

  Rune token;
  // One slot of the token budget is reserved for the suffix. The cast keeps
  // the comparison signed: a non-positive `max_tokens` now yields just "^$"
  // instead of being converted to a huge unsigned bound (effectively an
  // unlimited budget).
  for (int i = 0;
       i < len && static_cast<int>(output.tokens.size()) + 1 < max_tokens;) {
    // Use the standard UTF-8 library to find the next token.
    size_t bytes_read = utf_charntorune(&token, input_str + i, len - i);

    // Stop processing if no more runes can be decoded (e.g. truncated or
    // invalid UTF-8 input).
    if (bytes_read == 0) {
      break;
    }

    // If `exclude_nonalphaspace_tokens` is set to true, and the token is not
    // alphabetic (per utf_isalpharune), replace it with the replacement token.
    if (exclude_nonalphaspace_tokens && !utf_isalpharune(token)) {
      output.str.append(kReplacementToken);
      output.tokens.push_back(
          std::make_pair(token_start, kReplacementToken.size()));
      token_start += kReplacementToken.size();
      i += bytes_read;
      continue;
    }

    // Append the token in the output string, and note its position and the
    // number of bytes that token consumed.
    output.str.append(input_str + i, bytes_read);
    output.tokens.push_back(std::make_pair(token_start, bytes_read));
    token_start += bytes_read;
    i += bytes_read;
  }
  output.str.append(kSuffix);
  output.tokens.push_back(std::make_pair(token_start, kSuffix.size()));
  token_start += kSuffix.size();

  return output;
}
|
||
void LowercaseUnicodeStr(const char* input_str, int len, | ||
std::string* output_str) { | ||
for (int i = 0; i < len;) { | ||
Rune token; | ||
|
||
// Tokenize the given string, and get the appropriate lowercase token. | ||
size_t bytes_read = utf_charntorune(&token, input_str + i, len - i); | ||
token = utf_isalpharune(token) ? utf_tolowerrune(token) : token; | ||
|
||
// Write back the token to the output string. | ||
char token_buf[UTFmax]; | ||
size_t bytes_to_write = utf_runetochar(token_buf, &token); | ||
output_str->append(token_buf, bytes_to_write); | ||
|
||
i += bytes_read; | ||
} | ||
} | ||
|
||
} // namespace mediapipe::tasks::text::language_detector::custom_ops |
56 changes: 56 additions & 0 deletions
56
mediapipe/tasks/cc/text/language_detector/custom_ops/utils/ngram_hash_ops_utils.h
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
/* Copyright 2023 The MediaPipe Authors. All Rights Reserved. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
==============================================================================*/ | ||
|
||
#ifndef MEDIAPIPE_TASKS_CC_TEXT_LANGUAGE_DETECTOR_CUSTOM_OPS_UTILS_NGRAM_HASH_OPS_UTILS_H_ | ||
#define MEDIAPIPE_TASKS_CC_TEXT_LANGUAGE_DETECTOR_CUSTOM_OPS_UTILS_NGRAM_HASH_OPS_UTILS_H_ | ||
|
||
#include <string> | ||
#include <utility> | ||
#include <vector> | ||
|
||
namespace mediapipe::tasks::text::language_detector::custom_ops { | ||
|
||
// Result of Tokenize(): the processed string plus the location of every token
// inside it.
struct TokenizedOutput {
  // The processed string (with necessary prefix, suffix, skipped tokens, etc.).
  std::string str;

  // This vector contains pairs, where each pair has two members. The first
  // denoting the starting index of the token in the `str` string, and the
  // second denoting the length of that token in bytes.
  //
  // NOTE: the pair members are deliberately non-const. `std::pair<const
  // size_t, const size_t>` is not assignable, which makes most std::vector
  // operations — and copy/move assignment of this struct — ill-formed.
  std::vector<std::pair<size_t, size_t>> tokens;
};
|
||
// Tokenizes the given input string on Unicode token boundaries, with a maximum
// of `max_tokens` tokens. The output always begins with a "^" prefix token and
// ends with a "$" suffix token.
//
// If `exclude_nonalphaspace_tokens` is enabled, the tokenization replaces
// non-alphabetic tokens with a replacement token (" ").
//
// The method returns the output in the `TokenizedOutput` struct, which stores
// both, the processed input string, and the indices and sizes of each token
// within that string.
TokenizedOutput Tokenize(const char* input_str, int len, int max_tokens,
                         bool exclude_nonalphaspace_tokens);

// Converts the given unicode string (`input_str`) with the specified length
// (`len`) to a lowercase string.
//
// The method populates the lowercased string in `output_str`. Only alphabetic
// runes are lowercased; all other runes are copied through unchanged.
void LowercaseUnicodeStr(const char* input_str, int len,
                         std::string* output_str);
|
||
} // namespace mediapipe::tasks::text::language_detector::custom_ops | ||
|
||
#endif // MEDIAPIPE_TASKS_CC_TEXT_LANGUAGE_DETECTOR_CUSTOM_OPS_UTILS_NGRAM_HASH_OPS_UTILS_H_ |
135 changes: 135 additions & 0 deletions
135
mediapipe/tasks/cc/text/language_detector/custom_ops/utils/ngram_hash_ops_utils_test.cc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
/* Copyright 2023 The MediaPipe Authors. All Rights Reserved. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
==============================================================================*/ | ||
|
||
#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/ngram_hash_ops_utils.h" | ||
|
||
#include <string> | ||
|
||
#include "mediapipe/framework/port/gmock.h" | ||
#include "mediapipe/framework/port/gtest.h" | ||
|
||
namespace mediapipe::tasks::text::language_detector::custom_ops { | ||
|
||
namespace { | ||
|
||
using ::testing::Values; | ||
|
||
std::string ReconstructStringFromTokens(TokenizedOutput output) { | ||
std::string reconstructed_str; | ||
for (int i = 0; i < output.tokens.size(); i++) { | ||
reconstructed_str.append( | ||
output.str.c_str() + output.tokens[i].first, | ||
output.str.c_str() + output.tokens[i].first + output.tokens[i].second); | ||
} | ||
return reconstructed_str; | ||
} | ||
|
||
// Parameters for one Tokenize() test case.
struct TokenizeTestParams {
  // UTF-8 input passed to Tokenize().
  std::string input_str;
  // Maximum number of tokens (including the prefix and suffix tokens).
  size_t max_tokens;
  // Whether non-alphabetic tokens are replaced with " ".
  bool exclude_nonalphaspace_tokens;
  // Expected processed output string, e.g. "^hi$".
  std::string expected_output_str;
};
|
||
// Value-parameterized fixture: each instantiation below supplies one
// TokenizeTestParams case to the Tokenize test.
class TokenizeParameterizedTest
    : public ::testing::Test,
      public testing::WithParamInterface<TokenizeTestParams> {};
|
||
// Checks that Tokenize() produces the expected processed string, and that the
// (offset, length) pairs it reports exactly reconstruct that string.
TEST_P(TokenizeParameterizedTest, Tokenize) {
  // Checks that the Tokenize method returns the expected value.
  const TokenizeTestParams params = TokenizeParameterizedTest::GetParam();
  const TokenizedOutput output = Tokenize(
      /*input_str=*/params.input_str.c_str(),
      /*len=*/params.input_str.size(),
      /*max_tokens=*/params.max_tokens,
      /*exclude_nonalphaspace_tokens=*/params.exclude_nonalphaspace_tokens);

  // The processed string must match the expectation for this case, and
  // re-assembling it from the reported token spans must yield the same string.
  EXPECT_EQ(output.str, params.expected_output_str);
  EXPECT_EQ(ReconstructStringFromTokens(output), params.expected_output_str);
}
|
||
INSTANTIATE_TEST_SUITE_P(
    TokenizeParameterizedTests, TokenizeParameterizedTest,
    Values(
        // Keeping non-alphabetic characters: "!" passes through.
        TokenizeTestParams({/*input_str=*/"hi!", /*max_tokens=*/100,
                            /*exclude_nonalphaspace_tokens=*/false,
                            /*expected_output_str=*/"^hi!$"}),
        // Replacing non-alphabetic characters: "!" becomes " ".
        TokenizeTestParams({/*input_str=*/"hi!", /*max_tokens=*/100,
                            /*exclude_nonalphaspace_tokens=*/true,
                            /*expected_output_str=*/"^hi $"}),
        // A budget of 3 tokens leaves room for only one input token between
        // the prefix and suffix.
        TokenizeTestParams({/*input_str=*/"hi!", /*max_tokens=*/3,
                            /*exclude_nonalphaspace_tokens=*/true,
                            /*expected_output_str=*/"^h$"}),
        // Non-latin (multi-byte UTF-8) characters.
        TokenizeTestParams({/*input_str=*/"ありがと", /*max_tokens=*/100,
                            /*exclude_nonalphaspace_tokens=*/true,
                            /*expected_output_str=*/"^ありがと$"})));
|
||
TEST(LowercaseUnicodeTest, TestLowercaseUnicode) {
  {
    // Check that the method is a no-op when the string is lowercase.
    std::string input_str = "hello";
    std::string output_str;
    LowercaseUnicodeStr(
        /*input_str=*/input_str.c_str(),
        /*len=*/input_str.size(),
        /*output_str=*/&output_str);

    EXPECT_EQ(output_str, "hello");
  }
  {
    // Check that the method lowercases a string containing uppercase
    // characters.
    std::string input_str = "hElLo";
    std::string output_str;
    LowercaseUnicodeStr(
        /*input_str=*/input_str.c_str(),
        /*len=*/input_str.size(),
        /*output_str=*/&output_str);

    EXPECT_EQ(output_str, "hello");
  }
  {
    // Check that the method works with non-latin scripts.
    // Cyrillic has the concept of cases, so it should change the input.
    std::string input_str = "БЙп";
    std::string output_str;
    LowercaseUnicodeStr(
        /*input_str=*/input_str.c_str(),
        /*len=*/input_str.size(),
        /*output_str=*/&output_str);

    EXPECT_EQ(output_str, "бйп");
  }
  {
    // Check that the method works with non-latin scripts.
    // Japanese doesn't have the concept of cases, so it should not change.
    std::string input_str = "ありがと";
    std::string output_str;
    LowercaseUnicodeStr(
        /*input_str=*/input_str.c_str(),
        /*len=*/input_str.size(),
        /*output_str=*/&output_str);

    EXPECT_EQ(output_str, "ありがと");
  }
}
|
||
} // namespace | ||
} // namespace mediapipe::tasks::text::language_detector::custom_ops |
27 changes: 27 additions & 0 deletions
27
mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/BUILD
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# Copyright 2022 The MediaPipe Authors. All Rights Reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
package(default_visibility = ["//mediapipe/tasks:internal"])

licenses(["notice"])

# Minimal UTF-8 rune decoding/encoding library.
# `runetypebody.h` is listed in srcs (not hdrs) because it is a textual
# include of runetype.c, not a standalone public header.
cc_library(
    name = "utf",
    srcs = [
        "rune.c",
        "runetype.c",
        "runetypebody.h",
    ],
    hdrs = ["utf.h"],
)
Oops, something went wrong.