Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
PiperOrigin-RevId: 516871638
- Loading branch information
MediaPipe Team
authored and
Copybara-Service
committed
Mar 15, 2023
1 parent
04ffb84
commit ce3cd94
Showing
10 changed files
with
970 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
42 changes: 42 additions & 0 deletions
42
mediapipe/tasks/cc/text/language_detector/custom_ops/utils/BUILD
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# Copyright 2023 The MediaPipe Authors. All Rights Reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
package(default_visibility = ["//mediapipe/tasks:internal"])

licenses(["notice"])

# String tokenization / lowercasing helpers used by the language detector's
# n-gram hashing custom op.
cc_library(
    name = "ngram_hash_ops_utils",
    srcs = [
        "ngram_hash_ops_utils.cc",
    ],
    hdrs = [
        "ngram_hash_ops_utils.h",
    ],
    deps = [
        # UTF-8 rune decoding/encoding library (rune.c / runetype.c).
        "//mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf",
    ],
)

# Unit tests for :ngram_hash_ops_utils.
cc_test(
    name = "ngram_hash_ops_utils_test",
    size = "small",
    srcs = [
        "ngram_hash_ops_utils_test.cc",
    ],
    deps = [
        ":ngram_hash_ops_utils",
        "//mediapipe/framework/port:gtest_main",
    ],
)
96 changes: 96 additions & 0 deletions
96
mediapipe/tasks/cc/text/language_detector/custom_ops/utils/ngram_hash_ops_utils.cc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
/* Copyright 2023 The MediaPipe Authors. All Rights Reserved. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
==============================================================================*/ | ||
|
||
#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/ngram_hash_ops_utils.h" | ||
|
||
#include <string> | ||
#include <utility> | ||
#include <vector> | ||
|
||
#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/utf.h" | ||
|
||
namespace mediapipe::tasks::text::language_detector::custom_ops { | ||
|
||
// Tokenizes `input_str` (UTF-8, `len` bytes) rune-by-rune into at most
// `max_tokens` tokens, wrapping the result in a "^" prefix token and a "$"
// suffix token. Returns the processed string plus each token's (offset, size)
// within it.
TokenizedOutput Tokenize(const char* input_str, int len, int max_tokens,
                         bool exclude_nonalphaspace_tokens) {
  const std::string kPrefix = "^";
  const std::string kSuffix = "$";
  const std::string kReplacementToken = " ";

  TokenizedOutput output;

  size_t token_start = 0;
  // +2 leaves room for the prefix and suffix tokens.
  output.str.reserve(len + 2);
  output.tokens.reserve(len + 2);

  output.str.append(kPrefix);
  output.tokens.push_back(std::make_pair(token_start, kPrefix.size()));
  token_start += kPrefix.size();

  Rune token;
  // One slot of the token budget is reserved for the suffix. The cast keeps
  // the comparison signed: a non-positive `max_tokens` now yields just "^$"
  // instead of being converted to a huge unsigned bound (effectively an
  // unlimited budget).
  for (int i = 0;
       i < len && static_cast<int>(output.tokens.size()) + 1 < max_tokens;) {
    // Use the standard UTF-8 library to find the next token.
    size_t bytes_read = utf_charntorune(&token, input_str + i, len - i);

    // Stop processing if no more runes can be decoded (e.g. truncated or
    // invalid UTF-8 input).
    if (bytes_read == 0) {
      break;
    }

    // If `exclude_nonalphaspace_tokens` is set to true, and the token is not
    // alphabetic (per utf_isalpharune), replace it with the replacement token.
    if (exclude_nonalphaspace_tokens && !utf_isalpharune(token)) {
      output.str.append(kReplacementToken);
      output.tokens.push_back(
          std::make_pair(token_start, kReplacementToken.size()));
      token_start += kReplacementToken.size();
      i += bytes_read;
      continue;
    }

    // Append the token in the output string, and note its position and the
    // number of bytes that token consumed.
    output.str.append(input_str + i, bytes_read);
    output.tokens.push_back(std::make_pair(token_start, bytes_read));
    token_start += bytes_read;
    i += bytes_read;
  }
  output.str.append(kSuffix);
  output.tokens.push_back(std::make_pair(token_start, kSuffix.size()));
  token_start += kSuffix.size();

  return output;
}
|
||
void LowercaseUnicodeStr(const char* input_str, int len, | ||
std::string* output_str) { | ||
for (int i = 0; i < len;) { | ||
Rune token; | ||
|
||
// Tokenize the given string, and get the appropriate lowercase token. | ||
size_t bytes_read = utf_charntorune(&token, input_str + i, len - i); | ||
token = utf_isalpharune(token) ? utf_tolowerrune(token) : token; | ||
|
||
// Write back the token to the output string. | ||
char token_buf[UTFmax]; | ||
size_t bytes_to_write = utf_runetochar(token_buf, &token); | ||
output_str->append(token_buf, bytes_to_write); | ||
|
||
i += bytes_read; | ||
} | ||
} | ||
|
||
} // namespace mediapipe::tasks::text::language_detector::custom_ops |
56 changes: 56 additions & 0 deletions
56
mediapipe/tasks/cc/text/language_detector/custom_ops/utils/ngram_hash_ops_utils.h
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
/* Copyright 2023 The MediaPipe Authors. All Rights Reserved. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
==============================================================================*/ | ||
|
||
#ifndef MEDIAPIPE_TASKS_CC_TEXT_LANGUAGE_DETECTOR_CUSTOM_OPS_UTILS_NGRAM_HASH_OPS_UTILS_H_ | ||
#define MEDIAPIPE_TASKS_CC_TEXT_LANGUAGE_DETECTOR_CUSTOM_OPS_UTILS_NGRAM_HASH_OPS_UTILS_H_ | ||
|
||
#include <string> | ||
#include <utility> | ||
#include <vector> | ||
|
||
namespace mediapipe::tasks::text::language_detector::custom_ops { | ||
|
||
// Result of Tokenize(): the processed string plus the location of every token
// inside it.
struct TokenizedOutput {
  // The processed string (with necessary prefix, suffix, skipped tokens, etc.).
  std::string str;

  // This vector contains pairs, where each pair has two members. The first
  // denoting the starting index of the token in the `str` string, and the
  // second denoting the length of that token in bytes.
  //
  // NOTE: the pair members are deliberately non-const. `std::pair<const
  // size_t, const size_t>` is not assignable, which makes most std::vector
  // operations — and copy/move assignment of this struct — ill-formed.
  std::vector<std::pair<size_t, size_t>> tokens;
};
|
||
// Tokenizes the given input string on Unicode token boundaries, with a maximum
// of `max_tokens` tokens. The output always begins with a "^" prefix token and
// ends with a "$" suffix token.
//
// If `exclude_nonalphaspace_tokens` is enabled, the tokenization replaces
// non-alphabetic tokens with a replacement token (" ").
//
// The method returns the output in the `TokenizedOutput` struct, which stores
// both, the processed input string, and the indices and sizes of each token
// within that string.
TokenizedOutput Tokenize(const char* input_str, int len, int max_tokens,
                         bool exclude_nonalphaspace_tokens);

// Converts the given unicode string (`input_str`) with the specified length
// (`len`) to a lowercase string.
//
// The method populates the lowercased string in `output_str`. Only alphabetic
// runes are lowercased; all other runes are copied through unchanged.
void LowercaseUnicodeStr(const char* input_str, int len,
                         std::string* output_str);
|
||
} // namespace mediapipe::tasks::text::language_detector::custom_ops | ||
|
||
#endif // MEDIAPIPE_TASKS_CC_TEXT_LANGUAGE_DETECTOR_CUSTOM_OPS_UTILS_NGRAM_HASH_OPS_UTILS_H_ |
135 changes: 135 additions & 0 deletions
135
mediapipe/tasks/cc/text/language_detector/custom_ops/utils/ngram_hash_ops_utils_test.cc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
/* Copyright 2023 The MediaPipe Authors. All Rights Reserved. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
==============================================================================*/ | ||
|
||
#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/ngram_hash_ops_utils.h" | ||
|
||
#include <string> | ||
|
||
#include "mediapipe/framework/port/gmock.h" | ||
#include "mediapipe/framework/port/gtest.h" | ||
|
||
namespace mediapipe::tasks::text::language_detector::custom_ops { | ||
|
||
namespace { | ||
|
||
using ::testing::Values; | ||
|
||
std::string ReconstructStringFromTokens(TokenizedOutput output) { | ||
std::string reconstructed_str; | ||
for (int i = 0; i < output.tokens.size(); i++) { | ||
reconstructed_str.append( | ||
output.str.c_str() + output.tokens[i].first, | ||
output.str.c_str() + output.tokens[i].first + output.tokens[i].second); | ||
} | ||
return reconstructed_str; | ||
} | ||
|
||
// Parameters for one Tokenize() test case.
struct TokenizeTestParams {
  // UTF-8 input passed to Tokenize().
  std::string input_str;
  // Maximum number of tokens (including the prefix and suffix tokens).
  size_t max_tokens;
  // Whether non-alphabetic tokens are replaced with " ".
  bool exclude_nonalphaspace_tokens;
  // Expected processed output string, e.g. "^hi$".
  std::string expected_output_str;
};
|
||
// Value-parameterized fixture: each instantiation below supplies one
// TokenizeTestParams case to the Tokenize test.
class TokenizeParameterizedTest
    : public ::testing::Test,
      public testing::WithParamInterface<TokenizeTestParams> {};
|
||
// Checks that Tokenize() produces the expected processed string, and that the
// (offset, length) pairs it reports exactly reconstruct that string.
TEST_P(TokenizeParameterizedTest, Tokenize) {
  // Checks that the Tokenize method returns the expected value.
  const TokenizeTestParams params = TokenizeParameterizedTest::GetParam();
  const TokenizedOutput output = Tokenize(
      /*input_str=*/params.input_str.c_str(),
      /*len=*/params.input_str.size(),
      /*max_tokens=*/params.max_tokens,
      /*exclude_nonalphaspace_tokens=*/params.exclude_nonalphaspace_tokens);

  // The processed string must match the expectation for this case, and
  // re-assembling it from the reported token spans must yield the same string.
  EXPECT_EQ(output.str, params.expected_output_str);
  EXPECT_EQ(ReconstructStringFromTokens(output), params.expected_output_str);
}
|
||
INSTANTIATE_TEST_SUITE_P(
    TokenizeParameterizedTests, TokenizeParameterizedTest,
    Values(
        // Keeping non-alphabetic characters: "!" passes through.
        TokenizeTestParams({/*input_str=*/"hi!", /*max_tokens=*/100,
                            /*exclude_nonalphaspace_tokens=*/false,
                            /*expected_output_str=*/"^hi!$"}),
        // Replacing non-alphabetic characters: "!" becomes " ".
        TokenizeTestParams({/*input_str=*/"hi!", /*max_tokens=*/100,
                            /*exclude_nonalphaspace_tokens=*/true,
                            /*expected_output_str=*/"^hi $"}),
        // A budget of 3 tokens leaves room for only one input token between
        // the prefix and suffix.
        TokenizeTestParams({/*input_str=*/"hi!", /*max_tokens=*/3,
                            /*exclude_nonalphaspace_tokens=*/true,
                            /*expected_output_str=*/"^h$"}),
        // Non-latin (multi-byte UTF-8) characters.
        TokenizeTestParams({/*input_str=*/"ありがと", /*max_tokens=*/100,
                            /*exclude_nonalphaspace_tokens=*/true,
                            /*expected_output_str=*/"^ありがと$"})));
|
||
TEST(LowercaseUnicodeTest, TestLowercaseUnicode) {
  {
    // Check that the method is a no-op when the string is lowercase.
    std::string input_str = "hello";
    std::string output_str;
    LowercaseUnicodeStr(
        /*input_str=*/input_str.c_str(),
        /*len=*/input_str.size(),
        /*output_str=*/&output_str);

    EXPECT_EQ(output_str, "hello");
  }
  {
    // Check that the method lowercases a string containing uppercase
    // characters.
    std::string input_str = "hElLo";
    std::string output_str;
    LowercaseUnicodeStr(
        /*input_str=*/input_str.c_str(),
        /*len=*/input_str.size(),
        /*output_str=*/&output_str);

    EXPECT_EQ(output_str, "hello");
  }
  {
    // Check that the method works with non-latin scripts.
    // Cyrillic has the concept of cases, so it should change the input.
    std::string input_str = "БЙп";
    std::string output_str;
    LowercaseUnicodeStr(
        /*input_str=*/input_str.c_str(),
        /*len=*/input_str.size(),
        /*output_str=*/&output_str);

    EXPECT_EQ(output_str, "бйп");
  }
  {
    // Check that the method works with non-latin scripts.
    // Japanese doesn't have the concept of cases, so it should not change.
    std::string input_str = "ありがと";
    std::string output_str;
    LowercaseUnicodeStr(
        /*input_str=*/input_str.c_str(),
        /*len=*/input_str.size(),
        /*output_str=*/&output_str);

    EXPECT_EQ(output_str, "ありがと");
  }
}
|
||
} // namespace | ||
} // namespace mediapipe::tasks::text::language_detector::custom_ops |
27 changes: 27 additions & 0 deletions
27
mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/BUILD
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# Copyright 2022 The MediaPipe Authors. All Rights Reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
package(default_visibility = ["//mediapipe/tasks:internal"])

licenses(["notice"])

# Minimal UTF-8 rune decoding/encoding library.
# `runetypebody.h` is listed in srcs (not hdrs) because it is a textual
# include of runetype.c, not a standalone public header.
cc_library(
    name = "utf",
    srcs = [
        "rune.c",
        "runetype.c",
        "runetypebody.h",
    ],
    hdrs = ["utf.h"],
)
Oops, something went wrong.