Skip to content

Commit

Permalink
Internal change
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 516871638
  • Loading branch information
MediaPipe Team authored and Copybara-Service committed Mar 15, 2023
1 parent 04ffb84 commit ce3cd94
Show file tree
Hide file tree
Showing 10 changed files with 970 additions and 0 deletions.
17 changes: 17 additions & 0 deletions LICENSE
Expand Up @@ -199,3 +199,20 @@
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

===========================================================================
For files under tasks/cc/text/language_detector/custom_ops/utils/utf/
===========================================================================
/*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 2002 by Lucent Technologies.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
42 changes: 42 additions & 0 deletions mediapipe/tasks/cc/text/language_detector/custom_ops/utils/BUILD
@@ -0,0 +1,42 @@
# Copyright 2023 The MediaPipe Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

package(default_visibility = ["//mediapipe/tasks:internal"])

licenses(["notice"])

cc_library(
name = "ngram_hash_ops_utils",
srcs = [
"ngram_hash_ops_utils.cc",
],
hdrs = [
"ngram_hash_ops_utils.h",
],
deps = [
"//mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf",
],
)

cc_test(
name = "ngram_hash_ops_utils_test",
size = "small",
srcs = [
"ngram_hash_ops_utils_test.cc",
],
deps = [
":ngram_hash_ops_utils",
"//mediapipe/framework/port:gtest_main",
],
)
@@ -0,0 +1,96 @@
/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/ngram_hash_ops_utils.h"

#include <string>
#include <utility>
#include <vector>

#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/utf.h"

namespace mediapipe::tasks::text::language_detector::custom_ops {

TokenizedOutput Tokenize(const char* input_str, int len, int max_tokens,
bool exclude_nonalphaspace_tokens) {
const std::string kPrefix = "^";
const std::string kSuffix = "$";
const std::string kReplacementToken = " ";

TokenizedOutput output;

size_t token_start = 0;
output.str.reserve(len + 2);
output.tokens.reserve(len + 2);

output.str.append(kPrefix);
output.tokens.push_back(std::make_pair(token_start, kPrefix.size()));
token_start += kPrefix.size();

Rune token;
for (int i = 0; i < len && output.tokens.size() + 1 < max_tokens;) {
// Use the standard UTF-8 library to find the next token.
size_t bytes_read = utf_charntorune(&token, input_str + i, len - i);

// Stop processing, if we can't read any more tokens, or we have reached
// maximum allowed tokens, allocating one token for the suffix.
if (bytes_read == 0) {
break;
}

// If `exclude_nonalphaspace_tokens` is set to true, and the token is not
// alphanumeric, replace it with a replacement token.
if (exclude_nonalphaspace_tokens && !utf_isalpharune(token)) {
output.str.append(kReplacementToken);
output.tokens.push_back(
std::make_pair(token_start, kReplacementToken.size()));
token_start += kReplacementToken.size();
i += bytes_read;
continue;
}

// Append the token in the output string, and note its position and the
// number of bytes that token consumed.
output.str.append(input_str + i, bytes_read);
output.tokens.push_back(std::make_pair(token_start, bytes_read));
token_start += bytes_read;
i += bytes_read;
}
output.str.append(kSuffix);
output.tokens.push_back(std::make_pair(token_start, kSuffix.size()));
token_start += kSuffix.size();

return output;
}

void LowercaseUnicodeStr(const char* input_str, int len,
std::string* output_str) {
for (int i = 0; i < len;) {
Rune token;

// Tokenize the given string, and get the appropriate lowercase token.
size_t bytes_read = utf_charntorune(&token, input_str + i, len - i);
token = utf_isalpharune(token) ? utf_tolowerrune(token) : token;

// Write back the token to the output string.
char token_buf[UTFmax];
size_t bytes_to_write = utf_runetochar(token_buf, &token);
output_str->append(token_buf, bytes_to_write);

i += bytes_read;
}
}

} // namespace mediapipe::tasks::text::language_detector::custom_ops
@@ -0,0 +1,56 @@
/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef MEDIAPIPE_TASKS_CC_TEXT_LANGUAGE_DETECTOR_CUSTOM_OPS_UTILS_NGRAM_HASH_OPS_UTILS_H_
#define MEDIAPIPE_TASKS_CC_TEXT_LANGUAGE_DETECTOR_CUSTOM_OPS_UTILS_NGRAM_HASH_OPS_UTILS_H_

#include <string>
#include <utility>
#include <vector>

namespace mediapipe::tasks::text::language_detector::custom_ops {

struct TokenizedOutput {
// The processed string (with necessary prefix, suffix, skipped tokens, etc.).
std::string str;

// This vector contains pairs, where each pair has two members. The first
// denoting the starting index of the token in the `str` string, and the
// second denoting the length of that token in bytes.
std::vector<std::pair<const size_t, const size_t>> tokens;
};

// Tokenizes the given input string on Unicode token boundaries, with a maximum
// of `max_tokens` tokens.
//
// If `exclude_nonalphaspace_tokens` is enabled, the tokenization ignores
// non-alphanumeric tokens, and replaces them with a replacement token (" ").
//
// The method returns the output in the `TokenizedOutput` struct, which stores
// both, the processed input string, and the indices and sizes of each token
// within that string.
TokenizedOutput Tokenize(const char* input_str, int len, int max_tokens,
bool exclude_nonalphaspace_tokens);

// Converts the given unicode string (`input_str`) with the specified length
// (`len`) to a lowercase string.
//
// The method populates the lowercased string in `output_str`.
void LowercaseUnicodeStr(const char* input_str, int len,
std::string* output_str);

} // namespace mediapipe::tasks::text::language_detector::custom_ops

#endif // MEDIAPIPE_TASKS_CC_TEXT_LANGUAGE_DETECTOR_CUSTOM_OPS_UTILS_NGRAM_HASH_OPS_UTILS_H_
@@ -0,0 +1,135 @@
/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/ngram_hash_ops_utils.h"

#include <string>

#include "mediapipe/framework/port/gmock.h"
#include "mediapipe/framework/port/gtest.h"

namespace mediapipe::tasks::text::language_detector::custom_ops {

namespace {

using ::testing::Values;

std::string ReconstructStringFromTokens(TokenizedOutput output) {
std::string reconstructed_str;
for (int i = 0; i < output.tokens.size(); i++) {
reconstructed_str.append(
output.str.c_str() + output.tokens[i].first,
output.str.c_str() + output.tokens[i].first + output.tokens[i].second);
}
return reconstructed_str;
}

struct TokenizeTestParams {
std::string input_str;
size_t max_tokens;
bool exclude_nonalphaspace_tokens;
std::string expected_output_str;
};

class TokenizeParameterizedTest
: public ::testing::Test,
public testing::WithParamInterface<TokenizeTestParams> {};

TEST_P(TokenizeParameterizedTest, Tokenize) {
// Checks that the Tokenize method returns the expected value.
const TokenizeTestParams params = TokenizeParameterizedTest::GetParam();
const TokenizedOutput output = Tokenize(
/*input_str=*/params.input_str.c_str(),
/*len=*/params.input_str.size(),
/*max_tokens=*/params.max_tokens,
/*exclude_nonalphaspace_tokens=*/params.exclude_nonalphaspace_tokens);

// The output string should have the necessary prefixes, and the "!" token
// should have been replaced with a " ".
EXPECT_EQ(output.str, params.expected_output_str);
EXPECT_EQ(ReconstructStringFromTokens(output), params.expected_output_str);
}

INSTANTIATE_TEST_SUITE_P(
TokenizeParameterizedTests, TokenizeParameterizedTest,
Values(
// Test including non-alphanumeric characters.
TokenizeTestParams({/*input_str=*/"hi!", /*max_tokens=*/100,
/*exclude_alphanonspace=*/false,
/*expected_output_str=*/"^hi!$"}),
// Test not including non-alphanumeric characters.
TokenizeTestParams({/*input_str=*/"hi!", /*max_tokens=*/100,
/*exclude_alphanonspace=*/true,
/*expected_output_str=*/"^hi $"}),
// Test with a maximum of 3 tokens.
TokenizeTestParams({/*input_str=*/"hi!", /*max_tokens=*/3,
/*exclude_alphanonspace=*/true,
/*expected_output_str=*/"^h$"}),
// Test with non-latin characters.
TokenizeTestParams({/*input_str=*/"ありがと", /*max_tokens=*/100,
/*exclude_alphanonspace=*/true,
/*expected_output_str=*/"^ありがと$"})));

TEST(LowercaseUnicodeTest, TestLowercaseUnicode) {
{
// Check that the method is a no-op when the string is lowercase.
std::string input_str = "hello";
std::string output_str;
LowercaseUnicodeStr(
/*input_str=*/input_str.c_str(),
/*len=*/input_str.size(),
/*output_str=*/&output_str);

EXPECT_EQ(output_str, "hello");
}
{
// Check that the method has uppercase characters.
std::string input_str = "hElLo";
std::string output_str;
LowercaseUnicodeStr(
/*input_str=*/input_str.c_str(),
/*len=*/input_str.size(),
/*output_str=*/&output_str);

EXPECT_EQ(output_str, "hello");
}
{
// Check that the method works with non-latin scripts.
// Cyrillic has the concept of cases, so it should change the input.
std::string input_str = "БЙп";
std::string output_str;
LowercaseUnicodeStr(
/*input_str=*/input_str.c_str(),
/*len=*/input_str.size(),
/*output_str=*/&output_str);

EXPECT_EQ(output_str, "бйп");
}
{
// Check that the method works with non-latin scripts.
// Japanese doesn't have the concept of cases, so it should not change.
std::string input_str = "ありがと";
std::string output_str;
LowercaseUnicodeStr(
/*input_str=*/input_str.c_str(),
/*len=*/input_str.size(),
/*output_str=*/&output_str);

EXPECT_EQ(output_str, "ありがと");
}
}

} // namespace
} // namespace mediapipe::tasks::text::language_detector::custom_ops
@@ -0,0 +1,27 @@
# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

package(default_visibility = ["//mediapipe/tasks:internal"])

licenses(["notice"])

cc_library(
name = "utf",
srcs = [
"rune.c",
"runetype.c",
"runetypebody.h",
],
hdrs = ["utf.h"],
)

0 comments on commit ce3cd94

Please sign in to comment.