From ce3cd94f457970502adb855fc723d2d13ae47980 Mon Sep 17 00:00:00 2001
From: MediaPipe Team <mediapipe-team@google.com>
Date: Wed, 15 Mar 2023 10:54:21 -0700
Subject: [PATCH] Internal change

PiperOrigin-RevId: 516871638
---
 LICENSE                                       |  17 ++
 .../language_detector/custom_ops/utils/BUILD  |  42 ++++
 .../custom_ops/utils/ngram_hash_ops_utils.cc  |  96 ++++++++
 .../custom_ops/utils/ngram_hash_ops_utils.h   |  56 +++++
 .../utils/ngram_hash_ops_utils_test.cc        | 135 ++++++++++
 .../custom_ops/utils/utf/BUILD                |  27 ++
 .../custom_ops/utils/utf/rune.c               | 233 ++++++++++++++++++
 .../custom_ops/utils/utf/runetype.c           |  54 ++++
 .../custom_ops/utils/utf/runetypebody.h       | 212 ++++++++++++++++
 .../custom_ops/utils/utf/utf.h                |  98 ++++++++
 10 files changed, 970 insertions(+)
 create mode 100644 mediapipe/tasks/cc/text/language_detector/custom_ops/utils/BUILD
 create mode 100644 mediapipe/tasks/cc/text/language_detector/custom_ops/utils/ngram_hash_ops_utils.cc
 create mode 100644 mediapipe/tasks/cc/text/language_detector/custom_ops/utils/ngram_hash_ops_utils.h
 create mode 100644 mediapipe/tasks/cc/text/language_detector/custom_ops/utils/ngram_hash_ops_utils_test.cc
 create mode 100644 mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/BUILD
 create mode 100644 mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/rune.c
 create mode 100644 mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/runetype.c
 create mode 100644 mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/runetypebody.h
 create mode 100644 mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/utf.h

diff --git a/LICENSE b/LICENSE
index 261eeb9e9f..0e03e3911e 100644
--- a/LICENSE
+++ b/LICENSE
@@ -199,3 +199,20 @@
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
+
+===========================================================================
+For files under tasks/cc/text/language_detector/custom_ops/utils/utf/
+===========================================================================
+/*
+ * The authors of this software are Rob Pike and Ken Thompson.
+ *              Copyright (c) 2002 by Lucent Technologies.
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose without fee is hereby granted, provided that this entire notice
+ * is included in all copies of any software which is or includes a copy
+ * or modification of this software and in all copies of the supporting
+ * documentation for such software.
+ * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
+ * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
+ */
diff --git a/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/BUILD b/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/BUILD
new file mode 100644
index 0000000000..9f2fe298ad
--- /dev/null
+++ b/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/BUILD
@@ -0,0 +1,42 @@
+# Copyright 2023 The MediaPipe Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+package(default_visibility = ["//mediapipe/tasks:internal"])
+
+licenses(["notice"])
+
+cc_library(
+    name = "ngram_hash_ops_utils",  # UTF-8 tokenizing and lowercasing helpers.
+    srcs = [
+        "ngram_hash_ops_utils.cc",
+    ],
+    hdrs = [
+        "ngram_hash_ops_utils.h",
+    ],
+    deps = [  # utf.h supplies the rune primitives called from the .cc file.
+        "//mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf",
+    ],
+)
+
+cc_test(
+    name = "ngram_hash_ops_utils_test",  # Unit tests for :ngram_hash_ops_utils.
+    size = "small",
+    srcs = [
+        "ngram_hash_ops_utils_test.cc",
+    ],
+    deps = [
+        ":ngram_hash_ops_utils",  # Library under test.
+        "//mediapipe/framework/port:gtest_main",
+    ],
+)
diff --git a/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/ngram_hash_ops_utils.cc b/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/ngram_hash_ops_utils.cc
new file mode 100644
index 0000000000..f1ad71fc14
--- /dev/null
+++ b/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/ngram_hash_ops_utils.cc
@@ -0,0 +1,96 @@
+/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/ngram_hash_ops_utils.h"
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/utf.h"
+
+namespace mediapipe::tasks::text::language_detector::custom_ops {
+
+TokenizedOutput Tokenize(const char* input_str, int len, int max_tokens,
+                         bool exclude_nonalphaspace_tokens) {
+  // Marker tokens that bracket the output, and the single-space token that
+  // stands in for each rune dropped by `exclude_nonalphaspace_tokens`.
+  const std::string kPrefix = "^";
+  const std::string kSuffix = "$";
+  const std::string kReplacementToken = " ";
+
+  TokenizedOutput output;
+  output.str.reserve(len + 2);
+  output.tokens.reserve(len + 2);
+
+  // Byte offset within `output.str` at which the next token will start.
+  size_t next_offset = 0;
+
+  // Appends `num_bytes` bytes starting at `bytes` to the output string and
+  // records them as a single token.
+  const auto emit_token = [&output, &next_offset](const char* bytes,
+                                                  size_t num_bytes) {
+    output.str.append(bytes, num_bytes);
+    output.tokens.push_back(std::make_pair(next_offset, num_bytes));
+    next_offset += num_bytes;
+  };
+
+  emit_token(kPrefix.data(), kPrefix.size());
+
+  // The `+ 1` in the loop bound keeps one slot of `max_tokens` free for the
+  // suffix token appended after the loop.
+  int pos = 0;
+  while (pos < len && output.tokens.size() + 1 < max_tokens) {
+    // Decode the next rune. Zero bytes consumed means the decoder ran out of
+    // input before completing a sequence, so there is nothing more to read.
+    Rune rune;
+    const size_t consumed = utf_charntorune(&rune, input_str + pos, len - pos);
+    if (consumed == 0) {
+      break;
+    }
+
+    if (exclude_nonalphaspace_tokens && !utf_isalpharune(rune)) {
+      // Non-alphabetic rune: emit the replacement token in its place.
+      emit_token(kReplacementToken.data(), kReplacementToken.size());
+    } else {
+      // Copy the rune's original bytes through unchanged.
+      emit_token(input_str + pos, consumed);
+    }
+    pos += consumed;
+  }
+
+  emit_token(kSuffix.data(), kSuffix.size());
+  return output;
+}
+
+// Appends the lowercased form of the UTF-8 `input_str` to `*output_str`.
+void LowercaseUnicodeStr(const char* input_str, int len,
+                         std::string* output_str) {
+  for (int i = 0; i < len;) {
+    // Decode the next rune. Zero bytes consumed means the decoder ran out of
+    // input mid-sequence; stop (as Tokenize does) instead of looping forever.
+    Rune token;
+    const int bytes_read = utf_charntorune(&token, input_str + i, len - i);
+    if (bytes_read == 0) break;
+    // Lowercase alphabetic runes, then write the rune back out as UTF-8.
+    if (utf_isalpharune(token)) token = utf_tolowerrune(token);
+    char token_buf[UTFmax];
+    const int bytes_to_write = utf_runetochar(token_buf, &token);
+    output_str->append(token_buf, bytes_to_write);
+    i += bytes_read;
+  }
+}
+
+}  // namespace mediapipe::tasks::text::language_detector::custom_ops
diff --git a/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/ngram_hash_ops_utils.h b/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/ngram_hash_ops_utils.h
new file mode 100644
index 0000000000..9a80554c8b
--- /dev/null
+++ b/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/ngram_hash_ops_utils.h
@@ -0,0 +1,56 @@
+/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef MEDIAPIPE_TASKS_CC_TEXT_LANGUAGE_DETECTOR_CUSTOM_OPS_UTILS_NGRAM_HASH_OPS_UTILS_H_
+#define MEDIAPIPE_TASKS_CC_TEXT_LANGUAGE_DETECTOR_CUSTOM_OPS_UTILS_NGRAM_HASH_OPS_UTILS_H_
+
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace mediapipe::tasks::text::language_detector::custom_ops {
+
+struct TokenizedOutput {
+  // The processed string: "^" prefix, kept or replaced tokens, "$" suffix.
+  std::string str;
+
+  // One (offset, length) pair per token, in output order: `first` is the byte
+  // offset at which the token starts within `str`, and `second` is the
+  // token's length in bytes.
+  std::vector<std::pair<const size_t, const size_t>> tokens;
+};
+
+// Splits the first `len` bytes of the UTF-8 string `input_str` into one token
+// per decoded rune, wrapped in a "^" prefix token and a "$" suffix token.
+// Input tokens are emitted only while the total, counting the prefix and the
+// suffix, stays within `max_tokens`; the prefix and suffix are always emitted.
+//
+// If `exclude_nonalphaspace_tokens` is true, every token that is not
+// alphabetic according to `utf_isalpharune` (e.g. "!") is replaced by " ".
+//
+// Returns the processed string plus the offset and byte length of each token.
+TokenizedOutput Tokenize(const char* input_str, int len, int max_tokens,
+                         bool exclude_nonalphaspace_tokens);
+
+// Lowercases the first `len` bytes of the UTF-8 string `input_str`, one rune
+// at a time: alphabetic runes are mapped through `utf_tolowerrune`, all other
+// runes are re-encoded as they were decoded.
+// The result is appended to `*output_str`; its existing contents are kept.
+void LowercaseUnicodeStr(const char* input_str, int len,
+                         std::string* output_str);
+
+}  // namespace mediapipe::tasks::text::language_detector::custom_ops
+
+#endif  // MEDIAPIPE_TASKS_CC_TEXT_LANGUAGE_DETECTOR_CUSTOM_OPS_UTILS_NGRAM_HASH_OPS_UTILS_H_
diff --git a/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/ngram_hash_ops_utils_test.cc b/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/ngram_hash_ops_utils_test.cc
new file mode 100644
index 0000000000..d22af1c95a
--- /dev/null
+++ b/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/ngram_hash_ops_utils_test.cc
@@ -0,0 +1,135 @@
+/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/ngram_hash_ops_utils.h"
+
+#include <string>
+
+#include "mediapipe/framework/port/gmock.h"
+#include "mediapipe/framework/port/gtest.h"
+
+namespace mediapipe::tasks::text::language_detector::custom_ops {
+
+namespace {
+
+using ::testing::Values;
+
+std::string ReconstructStringFromTokens(TokenizedOutput output) {
+  // Concatenates the bytes of every recorded token span, in order.
+  std::string rebuilt;
+  for (const auto& span : output.tokens) {
+    const char* span_begin = output.str.c_str() + span.first;
+    rebuilt.append(span_begin, span_begin + span.second);
+  }
+  return rebuilt;
+}
+
+struct TokenizeTestParams {
+  std::string input_str;  // UTF-8 text handed to Tokenize.
+  size_t max_tokens;  // Forwarded as Tokenize's `max_tokens` argument.
+  bool exclude_nonalphaspace_tokens;  // Forwarded to Tokenize unchanged.
+  std::string expected_output_str;  // Expected `TokenizedOutput::str`.
+};
+
+// Fixture whose parameter is a single TokenizeTestParams case.
+class TokenizeParameterizedTest
+    : public ::testing::TestWithParam<TokenizeTestParams> {};
+
+TEST_P(TokenizeParameterizedTest, Tokenize) {
+  // Runs Tokenize on the parameterized input and checks both output views.
+  const TokenizeTestParams& test_case = GetParam();
+  const TokenizedOutput result = Tokenize(
+      test_case.input_str.c_str(), test_case.input_str.size(),
+      test_case.max_tokens, test_case.exclude_nonalphaspace_tokens);
+
+  // The processed string must match the expectation for this case ...
+  EXPECT_EQ(result.str, test_case.expected_output_str);
+  // ... and stitching the recorded token spans back together must yield the
+  // same string.
+  EXPECT_EQ(ReconstructStringFromTokens(result),
+            test_case.expected_output_str);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    TokenizeParameterizedTests, TokenizeParameterizedTest,
+    Values(
+        // Exclusion off: the non-alphabetic "!" is kept as-is.
+        TokenizeTestParams({/*input_str=*/"hi!", /*max_tokens=*/100,
+                            /*exclude_nonalphaspace_tokens=*/false,
+                            /*expected_output_str=*/"^hi!$"}),
+        // Exclusion on: the non-alphabetic "!" is replaced by " ".
+        TokenizeTestParams({/*input_str=*/"hi!", /*max_tokens=*/100,
+                            /*exclude_nonalphaspace_tokens=*/true,
+                            /*expected_output_str=*/"^hi $"}),
+        // max_tokens counts the prefix and suffix, so 3 leaves room for "h".
+        TokenizeTestParams({/*input_str=*/"hi!", /*max_tokens=*/3,
+                            /*exclude_nonalphaspace_tokens=*/true,
+                            /*expected_output_str=*/"^h$"}),
+        // Multi-byte, non-Latin alphabetic characters pass through unchanged.
+        TokenizeTestParams({/*input_str=*/"ありがと", /*max_tokens=*/100,
+                            /*exclude_nonalphaspace_tokens=*/true,
+                            /*expected_output_str=*/"^ありがと$"})));
+
+TEST(LowercaseUnicodeTest, TestLowercaseUnicode) {
+  {
+    // An already-lowercase ASCII string must come back unchanged.
+    std::string input_str = "hello";
+    std::string output_str;
+    LowercaseUnicodeStr(
+        /*input_str=*/input_str.c_str(),
+        /*len=*/input_str.size(),
+        /*output_str=*/&output_str);
+
+    EXPECT_EQ(output_str, "hello");
+  }
+  {
+    // Uppercase ASCII characters in the input must be lowercased.
+    std::string input_str = "hElLo";
+    std::string output_str;
+    LowercaseUnicodeStr(
+        /*input_str=*/input_str.c_str(),
+        /*len=*/input_str.size(),
+        /*output_str=*/&output_str);
+
+    EXPECT_EQ(output_str, "hello");
+  }
+  {
+    // Non-Latin script with letter case: Cyrillic uppercase letters must be
+    // lowercased, while the already-lowercase letter stays as it is.
+    std::string input_str = "БЙп";
+    std::string output_str;
+    LowercaseUnicodeStr(
+        /*input_str=*/input_str.c_str(),
+        /*len=*/input_str.size(),
+        /*output_str=*/&output_str);
+
+    EXPECT_EQ(output_str, "бйп");
+  }
+  {
+    // Non-Latin script without letter case: Japanese hiragana must come back
+    // unchanged.
+    std::string input_str = "ありがと";
+    std::string output_str;
+    LowercaseUnicodeStr(
+        /*input_str=*/input_str.c_str(),
+        /*len=*/input_str.size(),
+        /*output_str=*/&output_str);
+
+    EXPECT_EQ(output_str, "ありがと");
+  }
+}
+
+}  // namespace
+}  // namespace mediapipe::tasks::text::language_detector::custom_ops
diff --git a/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/BUILD b/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/BUILD
new file mode 100644
index 0000000000..a718453051
--- /dev/null
+++ b/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/BUILD
@@ -0,0 +1,27 @@
+# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+package(default_visibility = ["//mediapipe/tasks:internal"])
+
+licenses(["notice"])
+
+cc_library(
+    name = "utf",  # UTF-8 rune encode/decode and rune type tables (C sources).
+    srcs = [
+        "rune.c",
+        "runetype.c",
+        "runetypebody.h",  # Included by runetype.c; not exported via hdrs.
+    ],
+    hdrs = ["utf.h"],
+)
diff --git a/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/rune.c b/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/rune.c
new file mode 100644
index 0000000000..b74450f44d
--- /dev/null
+++ b/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/rune.c
@@ -0,0 +1,233 @@
+/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Forked from a library written by Rob Pike and Ken Thompson. Original
+// copyright message below.
+/*
+ * The authors of this software are Rob Pike and Ken Thompson.
+ *              Copyright (c) 2002 by Lucent Technologies.
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose without fee is hereby granted, provided that this entire notice
+ * is included in all copies of any software which is or includes a copy
+ * or modification of this software and in all copies of the supporting
+ * documentation for such software.
+ * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
+ * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
+ */
+#include <stdarg.h>
+#include <string.h>
+#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/utf.h"
+
+enum
+{
+  Bit1  = 7,  /* payload bits in the only byte of a 1-byte sequence */
+  Bitx  = 6,  /* payload bits in each continuation byte */
+  Bit2  = 5,  /* payload bits in the lead byte of a 2-byte sequence */
+  Bit3  = 4,  /* payload bits in the lead byte of a 3-byte sequence */
+  Bit4  = 3,  /* payload bits in the lead byte of a 4-byte sequence */
+  Bit5  = 2,  /* only used to build T5 below */
+
+  T1  = ((1<<(Bit1+1))-1) ^ 0xFF,  /* 0000 0000: 1-byte tag */
+  Tx  = ((1<<(Bitx+1))-1) ^ 0xFF,  /* 1000 0000: continuation-byte tag */
+  T2  = ((1<<(Bit2+1))-1) ^ 0xFF,  /* 1100 0000: 2-byte lead tag */
+  T3  = ((1<<(Bit3+1))-1) ^ 0xFF,  /* 1110 0000: 3-byte lead tag */
+  T4  = ((1<<(Bit4+1))-1) ^ 0xFF,  /* 1111 0000: 4-byte lead tag */
+  T5  = ((1<<(Bit5+1))-1) ^ 0xFF,  /* 1111 1000: first unsupported lead */
+
+  Rune1  = (1<<(Bit1+0*Bitx))-1,    /* 0000 0000 0111 1111: max 1-byte value */
+  Rune2  = (1<<(Bit2+1*Bitx))-1,    /* 0000 0111 1111 1111: max 2-byte value */
+  Rune3  = (1<<(Bit3+2*Bitx))-1,    /* 1111 1111 1111 1111: max 3-byte value */
+  Rune4  = (1<<(Bit4+3*Bitx))-1,
+                                        /* 0001 1111 1111 1111 1111 1111 */
+
+  Maskx  = (1<<Bitx)-1,      /* 0011 1111: payload of a continuation byte */
+  Testx  = Maskx ^ 0xFF,      /* 1100 0000: must be clear after ^ Tx */
+
+  Bad  = Runeerror,  /* rune stored when decoding fails */
+};
+
+/*
+ * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
+ * This is a slower but "safe" version of the old chartorune
+ * that works on strings that are not necessarily null-terminated.
+ *
+ * If you know for sure that your string is null-terminated,
+ * chartorune will be a bit faster.
+ *
+ * It is guaranteed not to read more than "length" bytes
+ * starting at the incoming pointer.  This is to avoid
+ * possible access violations.  If the string appears to be
+ * well-formed but incomplete (i.e., to get the whole Rune
+ * we'd need to read past str+length) then we'll set the Rune
+ * to Bad and return 0.
+ *
+ * Note that if we have decoding problems for other
+ * reasons, we return 1 instead of 0.
+ */
+int
+utf_charntorune(Rune *rune, const char *str, int length)
+{
+  int c, c1, c2, c3;  /* lead byte, then payloads of continuation bytes */
+  long l;  /* decoded value before it is stored in *rune */
+
+  /* When we're not allowed to read anything */
+  if(length <= 0) {
+    goto badlen;
+  }
+
+  /*
+   * one character sequence (7-bit value)
+   *  00000-0007F => T1
+   */
+  c = *(uchar*)str;
+  if(c < Tx) {
+    *rune = c;
+    return 1;
+  }
+
+  // If we can't read more than one byte we must stop
+  if(length <= 1) {
+    goto badlen;  /* taken for any non-ASCII final byte, valid lead or not */
+  }
+
+  /*
+   * two character sequence (11-bit value)
+   *  0080-07FF => T2 Tx
+   */
+  c1 = *(uchar*)(str+1) ^ Tx;  /* a 10xxxxxx byte becomes 00xxxxxx */
+  if(c1 & Testx)  /* second byte is not a continuation byte */
+    goto bad;
+  if(c < T3) {
+    if(c < T2)  /* lead byte is itself a continuation byte */
+      goto bad;
+    l = ((c << Bitx) | c1) & Rune2;
+    if(l <= Rune1)  /* overlong: value fits in one byte */
+      goto bad;
+    *rune = l;
+    return 2;
+  }
+
+  // If we can't read more than two bytes we must stop
+  if(length <= 2) {
+    goto badlen;
+  }
+
+  /*
+   * three character sequence (16-bit value)
+   *  0800-FFFF => T3 Tx Tx
+   */
+  c2 = *(uchar*)(str+2) ^ Tx;
+  if(c2 & Testx)  /* third byte is not a continuation byte */
+    goto bad;
+  if(c < T4) {
+    l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
+    if(l <= Rune2)  /* overlong: value fits in two bytes */
+      goto bad;
+    *rune = l;
+    return 3;
+  }
+
+  if (length <= 3)  /* a fourth byte is needed but not available */
+    goto badlen;
+
+  /*
+   * four character sequence (21-bit value)
+   *  10000-1FFFFF => T4 Tx Tx Tx
+   */
+  c3 = *(uchar*)(str+3) ^ Tx;
+  if (c3 & Testx)  /* fourth byte is not a continuation byte */
+    goto bad;
+  if (c < T5) {
+    l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
+    if (l <= Rune3)  /* overlong: value fits in three bytes */
+      goto bad;
+    if (l > Runemax)  /* decodes to a value above Runemax */
+      goto bad;
+    *rune = l;
+    return 4;
+  }
+
+  // Support for 5-byte or longer UTF-8 would go here, but
+  // since we don't have that, we'll just fall through to bad.
+
+  /*
+   * bad decoding: store the error rune and consume exactly one byte
+   */
+bad:
+  *rune = Bad;
+  return 1;
+badlen:  /* ran out of input: store the error rune and consume nothing */
+  *rune = Bad;
+  return 0;
+
+}
+
+int
+utf_runetochar(char *str, const Rune *rune)
+{
+  /*
+   * Encodes *rune as UTF-8 into str and returns the number of bytes
+   * written (1 to 4); no terminating NUL is stored.  The value is held
+   * unsigned so that a negative Rune fails the range check below.
+   */
+  unsigned long code = *rune;
+
+  /*
+   * one byte
+   *  00000-0007F => 00-7F
+   */
+  if(code <= Rune1) {
+    str[0] = code;
+    return 1;
+  }
+
+  /* two bytes: 0080-07FF => T2 Tx */
+  if(code <= Rune2) {
+    str[0] = T2 | (code >> 1*Bitx);
+    str[1] = Tx | (code & Maskx);
+    return 2;
+  }
+
+  /*
+   * Out-of-range values are replaced by the error rune.  The check
+   * sits here because such a value could not have matched the one-
+   * or two-byte cases above, and the error rune itself is encoded by
+   * the three-byte case below.
+   */
+  if(code > Runemax)
+    code = Runeerror;
+
+  /*
+   * four bytes (21-bit value)
+   *     10000-1FFFFF => T4 Tx Tx Tx
+   */
+  if(code > Rune3) {
+    str[0] = T4 | (code >> 3*Bitx);
+    str[1] = Tx | ((code >> 2*Bitx) & Maskx);
+    str[2] = Tx | ((code >> 1*Bitx) & Maskx);
+    str[3] = Tx | (code & Maskx);
+    return 4;
+  }
+
+  /*
+   * three bytes
+   *  0800-FFFF => T3 Tx Tx
+   */
+  str[0] = T3 |  (code >> 2*Bitx);
+  str[1] = Tx | ((code >> 1*Bitx) & Maskx);
+  str[2] = Tx |  (code & Maskx);
+  return 3;
+}
diff --git a/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/runetype.c b/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/runetype.c
new file mode 100644
index 0000000000..1dd8abdbd1
--- /dev/null
+++ b/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/runetype.c
@@ -0,0 +1,54 @@
+/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Forked from a library written by Rob Pike and Ken Thompson. Original
+// copyright message below.
+/*
+ * The authors of this software are Rob Pike and Ken Thompson.
+ *              Copyright (c) 2002 by Lucent Technologies.
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose without fee is hereby granted, provided that this entire notice
+ * is included in all copies of any software which is or includes a copy
+ * or modification of this software and in all copies of the supporting
+ * documentation for such software.
+ * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
+ * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
+ */
+#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/utf.h"
+
+static
+Rune*
+rbsearch(Rune c, Rune *t, int n, int ne)
+{
+  /*
+   * Halving search over n records of ne Runes each, keyed on the first
+   * Rune of a record.  Returns the record it ends on if key <= c, else 0.
+   */
+  while(n > 1) {
+    int half = n >> 1;
+    Rune *mid = t + half*ne;
+    if(c < mid[0]) {
+      n = half;  /* continue in the records before mid */
+    } else {
+      t = mid;  /* continue from mid onwards */
+      n -= half;
+    }
+  }
+  return (n && c >= t[0]) ? t : 0;
+}
+
+#define RUNETYPEBODY
+#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/runetypebody.h"
diff --git a/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/runetypebody.h b/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/runetypebody.h
new file mode 100644
index 0000000000..66d1dfc19c
--- /dev/null
+++ b/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/runetypebody.h
@@ -0,0 +1,212 @@
+/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef RUNETYPEBODY
+
+static Rune __isalphar[] = { /* inclusive [first, last] rune pairs; searched with stride 2 by utf_isalpharune */
+    0x0041,  0x005a,  0x0061,  0x007a,  0x00c0,  0x00d6,  0x00d8,  0x00f6,
+    0x00f8,  0x02c1,  0x02c6,  0x02d1,  0x02e0,  0x02e4,  0x0370,  0x0374,
+    0x0376,  0x0377,  0x037a,  0x037d,  0x0388,  0x038a,  0x038e,  0x03a1,
+    0x03a3,  0x03f5,  0x03f7,  0x0481,  0x048a,  0x0527,  0x0531,  0x0556,
+    0x0561,  0x0587,  0x05d0,  0x05ea,  0x05f0,  0x05f2,  0x0620,  0x064a,
+    0x066e,  0x066f,  0x0671,  0x06d3,  0x06e5,  0x06e6,  0x06ee,  0x06ef,
+    0x06fa,  0x06fc,  0x0712,  0x072f,  0x074d,  0x07a5,  0x07ca,  0x07ea,
+    0x07f4,  0x07f5,  0x0800,  0x0815,  0x0840,  0x0858,  0x08a2,  0x08ac,
+    0x0904,  0x0939,  0x0958,  0x0961,  0x0971,  0x0977,  0x0979,  0x097f,
+    0x0985,  0x098c,  0x098f,  0x0990,  0x0993,  0x09a8,  0x09aa,  0x09b0,
+    0x09b6,  0x09b9,  0x09dc,  0x09dd,  0x09df,  0x09e1,  0x09f0,  0x09f1,
+    0x0a05,  0x0a0a,  0x0a0f,  0x0a10,  0x0a13,  0x0a28,  0x0a2a,  0x0a30,
+    0x0a32,  0x0a33,  0x0a35,  0x0a36,  0x0a38,  0x0a39,  0x0a59,  0x0a5c,
+    0x0a72,  0x0a74,  0x0a85,  0x0a8d,  0x0a8f,  0x0a91,  0x0a93,  0x0aa8,
+    0x0aaa,  0x0ab0,  0x0ab2,  0x0ab3,  0x0ab5,  0x0ab9,  0x0ae0,  0x0ae1,
+    0x0b05,  0x0b0c,  0x0b0f,  0x0b10,  0x0b13,  0x0b28,  0x0b2a,  0x0b30,
+    0x0b32,  0x0b33,  0x0b35,  0x0b39,  0x0b5c,  0x0b5d,  0x0b5f,  0x0b61,
+    0x0b85,  0x0b8a,  0x0b8e,  0x0b90,  0x0b92,  0x0b95,  0x0b99,  0x0b9a,
+    0x0b9e,  0x0b9f,  0x0ba3,  0x0ba4,  0x0ba8,  0x0baa,  0x0bae,  0x0bb9,
+    0x0c05,  0x0c0c,  0x0c0e,  0x0c10,  0x0c12,  0x0c28,  0x0c2a,  0x0c33,
+    0x0c35,  0x0c39,  0x0c58,  0x0c59,  0x0c60,  0x0c61,  0x0c85,  0x0c8c,
+    0x0c8e,  0x0c90,  0x0c92,  0x0ca8,  0x0caa,  0x0cb3,  0x0cb5,  0x0cb9,
+    0x0ce0,  0x0ce1,  0x0cf1,  0x0cf2,  0x0d05,  0x0d0c,  0x0d0e,  0x0d10,
+    0x0d12,  0x0d3a,  0x0d60,  0x0d61,  0x0d7a,  0x0d7f,  0x0d85,  0x0d96,
+    0x0d9a,  0x0db1,  0x0db3,  0x0dbb,  0x0dc0,  0x0dc6,  0x0e01,  0x0e30,
+    0x0e32,  0x0e33,  0x0e40,  0x0e46,  0x0e81,  0x0e82,  0x0e87,  0x0e88,
+    0x0e94,  0x0e97,  0x0e99,  0x0e9f,  0x0ea1,  0x0ea3,  0x0eaa,  0x0eab,
+    0x0ead,  0x0eb0,  0x0eb2,  0x0eb3,  0x0ec0,  0x0ec4,  0x0edc,  0x0edf,
+    0x0f40,  0x0f47,  0x0f49,  0x0f6c,  0x0f88,  0x0f8c,  0x1000,  0x102a,
+    0x1050,  0x1055,  0x105a,  0x105d,  0x1065,  0x1066,  0x106e,  0x1070,
+    0x1075,  0x1081,  0x10a0,  0x10c5,  0x10d0,  0x10fa,  0x10fc,  0x1248,
+    0x124a,  0x124d,  0x1250,  0x1256,  0x125a,  0x125d,  0x1260,  0x1288,
+    0x128a,  0x128d,  0x1290,  0x12b0,  0x12b2,  0x12b5,  0x12b8,  0x12be,
+    0x12c2,  0x12c5,  0x12c8,  0x12d6,  0x12d8,  0x1310,  0x1312,  0x1315,
+    0x1318,  0x135a,  0x1380,  0x138f,  0x13a0,  0x13f4,  0x1401,  0x166c,
+    0x166f,  0x167f,  0x1681,  0x169a,  0x16a0,  0x16ea,  0x1700,  0x170c,
+    0x170e,  0x1711,  0x1720,  0x1731,  0x1740,  0x1751,  0x1760,  0x176c,
+    0x176e,  0x1770,  0x1780,  0x17b3,  0x1820,  0x1877,  0x1880,  0x18a8,
+    0x18b0,  0x18f5,  0x1900,  0x191c,  0x1950,  0x196d,  0x1970,  0x1974,
+    0x1980,  0x19ab,  0x19c1,  0x19c7,  0x1a00,  0x1a16,  0x1a20,  0x1a54,
+    0x1b05,  0x1b33,  0x1b45,  0x1b4b,  0x1b83,  0x1ba0,  0x1bae,  0x1baf,
+    0x1bba,  0x1be5,  0x1c00,  0x1c23,  0x1c4d,  0x1c4f,  0x1c5a,  0x1c7d,
+    0x1ce9,  0x1cec,  0x1cee,  0x1cf1,  0x1cf5,  0x1cf6,  0x1d00,  0x1dbf,
+    0x1e00,  0x1f15,  0x1f18,  0x1f1d,  0x1f20,  0x1f45,  0x1f48,  0x1f4d,
+    0x1f50,  0x1f57,  0x1f5f,  0x1f7d,  0x1f80,  0x1fb4,  0x1fb6,  0x1fbc,
+    0x1fc2,  0x1fc4,  0x1fc6,  0x1fcc,  0x1fd0,  0x1fd3,  0x1fd6,  0x1fdb,
+    0x1fe0,  0x1fec,  0x1ff2,  0x1ff4,  0x1ff6,  0x1ffc,  0x2090,  0x209c,
+    0x210a,  0x2113,  0x2119,  0x211d,  0x212a,  0x212d,  0x212f,  0x2139,
+    0x213c,  0x213f,  0x2145,  0x2149,  0x2183,  0x2184,  0x2c00,  0x2c2e,
+    0x2c30,  0x2c5e,  0x2c60,  0x2ce4,  0x2ceb,  0x2cee,  0x2cf2,  0x2cf3,
+    0x2d00,  0x2d25,  0x2d30,  0x2d67,  0x2d80,  0x2d96,  0x2da0,  0x2da6,
+    0x2da8,  0x2dae,  0x2db0,  0x2db6,  0x2db8,  0x2dbe,  0x2dc0,  0x2dc6,
+    0x2dc8,  0x2dce,  0x2dd0,  0x2dd6,  0x2dd8,  0x2dde,  0x3005,  0x3006,
+    0x3031,  0x3035,  0x303b,  0x303c,  0x3041,  0x3096,  0x309d,  0x309f,
+    0x30a1,  0x30fa,  0x30fc,  0x30ff,  0x3105,  0x312d,  0x3131,  0x318e,
+    0x31a0,  0x31ba,  0x31f0,  0x31ff,  0x3400,  0x4db5,  0x4e00,  0x9fcc,
+    0xa000,  0xa48c,  0xa4d0,  0xa4fd,  0xa500,  0xa60c,  0xa610,  0xa61f,
+    0xa62a,  0xa62b,  0xa640,  0xa66e,  0xa67f,  0xa697,  0xa6a0,  0xa6e5,
+    0xa717,  0xa71f,  0xa722,  0xa788,  0xa78b,  0xa78e,  0xa790,  0xa793,
+    0xa7a0,  0xa7aa,  0xa7f8,  0xa801,  0xa803,  0xa805,  0xa807,  0xa80a,
+    0xa80c,  0xa822,  0xa840,  0xa873,  0xa882,  0xa8b3,  0xa8f2,  0xa8f7,
+    0xa90a,  0xa925,  0xa930,  0xa946,  0xa960,  0xa97c,  0xa984,  0xa9b2,
+    0xaa00,  0xaa28,  0xaa40,  0xaa42,  0xaa44,  0xaa4b,  0xaa60,  0xaa76,
+    0xaa80,  0xaaaf,  0xaab5,  0xaab6,  0xaab9,  0xaabd,  0xaadb,  0xaadd,
+    0xaae0,  0xaaea,  0xaaf2,  0xaaf4,  0xab01,  0xab06,  0xab09,  0xab0e,
+    0xab11,  0xab16,  0xab20,  0xab26,  0xab28,  0xab2e,  0xabc0,  0xabe2,
+    0xac00,  0xd7a3,  0xd7b0,  0xd7c6,  0xd7cb,  0xd7fb,  0xf900,  0xfa6d,
+    0xfa70,  0xfad9,  0xfb00,  0xfb06,  0xfb13,  0xfb17,  0xfb1f,  0xfb28,
+    0xfb2a,  0xfb36,  0xfb38,  0xfb3c,  0xfb40,  0xfb41,  0xfb43,  0xfb44,
+    0xfb46,  0xfbb1,  0xfbd3,  0xfd3d,  0xfd50,  0xfd8f,  0xfd92,  0xfdc7,
+    0xfdf0,  0xfdfb,  0xfe70,  0xfe74,  0xfe76,  0xfefc,  0xff21,  0xff3a,
+    0xff41,  0xff5a,  0xff66,  0xffbe,  0xffc2,  0xffc7,  0xffca,  0xffcf,
+    0xffd2,  0xffd7,  0xffda,  0xffdc,  0x10000, 0x1000b, 0x1000d, 0x10026,
+    0x10028, 0x1003a, 0x1003c, 0x1003d, 0x1003f, 0x1004d, 0x10050, 0x1005d,
+    0x10080, 0x100fa, 0x10280, 0x1029c, 0x102a0, 0x102d0, 0x10300, 0x1031e,
+    0x10330, 0x10340, 0x10342, 0x10349, 0x10380, 0x1039d, 0x103a0, 0x103c3,
+    0x103c8, 0x103cf, 0x10400, 0x1049d, 0x10800, 0x10805, 0x1080a, 0x10835,
+    0x10837, 0x10838, 0x1083f, 0x10855, 0x10900, 0x10915, 0x10920, 0x10939,
+    0x10980, 0x109b7, 0x109be, 0x109bf, 0x10a10, 0x10a13, 0x10a15, 0x10a17,
+    0x10a19, 0x10a33, 0x10a60, 0x10a7c, 0x10b00, 0x10b35, 0x10b40, 0x10b55,
+    0x10b60, 0x10b72, 0x10c00, 0x10c48, 0x11003, 0x11037, 0x11083, 0x110af,
+    0x110d0, 0x110e8, 0x11103, 0x11126, 0x11183, 0x111b2, 0x111c1, 0x111c4,
+    0x11680, 0x116aa, 0x12000, 0x1236e, 0x13000, 0x1342e, 0x16800, 0x16a38,
+    0x16f00, 0x16f44, 0x16f93, 0x16f9f, 0x1b000, 0x1b001, 0x1d400, 0x1d454,
+    0x1d456, 0x1d49c, 0x1d49e, 0x1d49f, 0x1d4a5, 0x1d4a6, 0x1d4a9, 0x1d4ac,
+    0x1d4ae, 0x1d4b9, 0x1d4bd, 0x1d4c3, 0x1d4c5, 0x1d505, 0x1d507, 0x1d50a,
+    0x1d50d, 0x1d514, 0x1d516, 0x1d51c, 0x1d51e, 0x1d539, 0x1d53b, 0x1d53e,
+    0x1d540, 0x1d544, 0x1d54a, 0x1d550, 0x1d552, 0x1d6a5, 0x1d6a8, 0x1d6c0,
+    0x1d6c2, 0x1d6da, 0x1d6dc, 0x1d6fa, 0x1d6fc, 0x1d714, 0x1d716, 0x1d734,
+    0x1d736, 0x1d74e, 0x1d750, 0x1d76e, 0x1d770, 0x1d788, 0x1d78a, 0x1d7a8,
+    0x1d7aa, 0x1d7c2, 0x1d7c4, 0x1d7cb, 0x1ee00, 0x1ee03, 0x1ee05, 0x1ee1f,
+    0x1ee21, 0x1ee22, 0x1ee29, 0x1ee32, 0x1ee34, 0x1ee37, 0x1ee4d, 0x1ee4f,
+    0x1ee51, 0x1ee52, 0x1ee61, 0x1ee62, 0x1ee67, 0x1ee6a, 0x1ee6c, 0x1ee72,
+    0x1ee74, 0x1ee77, 0x1ee79, 0x1ee7c, 0x1ee80, 0x1ee89, 0x1ee8b, 0x1ee9b,
+    0x1eea1, 0x1eea3, 0x1eea5, 0x1eea9, 0x1eeab, 0x1eebb, 0x20000, 0x2a6d6,
+    0x2a700, 0x2b734, 0x2b740, 0x2b81d, 0x2f800, 0x2fa1d,
+};
+
+static Rune __isalphas[] = { /* stand-alone runes; utf_isalpharune requires an exact match (stride 1) */
+    0x00aa,  0x00b5,  0x00ba,  0x02ec,  0x02ee,  0x0386,  0x038c,  0x0559,
+    0x06d5,  0x06ff,  0x0710,  0x07b1,  0x07fa,  0x081a,  0x0824,  0x0828,
+    0x08a0,  0x093d,  0x0950,  0x09b2,  0x09bd,  0x09ce,  0x0a5e,  0x0abd,
+    0x0ad0,  0x0b3d,  0x0b71,  0x0b83,  0x0b9c,  0x0bd0,  0x0c3d,  0x0cbd,
+    0x0cde,  0x0d3d,  0x0d4e,  0x0dbd,  0x0e84,  0x0e8a,  0x0e8d,  0x0ea5,
+    0x0ea7,  0x0ebd,  0x0ec6,  0x0f00,  0x103f,  0x1061,  0x108e,  0x10c7,
+    0x10cd,  0x1258,  0x12c0,  0x17d7,  0x17dc,  0x18aa,  0x1aa7,  0x1f59,
+    0x1f5b,  0x1f5d,  0x1fbe,  0x2071,  0x207f,  0x2102,  0x2107,  0x2115,
+    0x2124,  0x2126,  0x2128,  0x214e,  0x2d27,  0x2d2d,  0x2d6f,  0x2e2f,
+    0xa8fb,  0xa9cf,  0xaa7a,  0xaab1,  0xaac0,  0xaac2,  0xfb1d,  0xfb3e,
+    0x10808, 0x1083c, 0x10a00, 0x16f50, 0x1d4a2, 0x1d4bb, 0x1d546, 0x1ee24,
+    0x1ee27, 0x1ee39, 0x1ee3b, 0x1ee42, 0x1ee47, 0x1ee49, 0x1ee4b, 0x1ee54,
+    0x1ee57, 0x1ee59, 0x1ee5b, 0x1ee5d, 0x1ee5f, 0x1ee64, 0x1ee7e,
+};
+
+int utf_isalpharune(Rune c) { /* 1 if c is listed in either alpha table, else 0 */
+  Rune *hit = rbsearch(c, __isalphar, nelem(__isalphar) / 2, 2);
+
+  if (hit != 0 && hit[0] <= c && c <= hit[1]) /* inside an inclusive range */
+    return 1;
+  hit = rbsearch(c, __isalphas, nelem(__isalphas), 1);
+  /* Otherwise c must match one of the stand-alone runes exactly. */
+  return hit != 0 && hit[0] == c;
+}
+
+static Rune __tolowerr[] = { /* triples {first, last, delta + 1048576}; delta applies to every rune in range */
+    0x0041, 0x005a, 1048608, 0x00c0,  0x00d6,  1048608, 0x00d8, 0x00de, 1048608,
+    0x0189, 0x018a, 1048781, 0x01b1,  0x01b2,  1048793, 0x0388, 0x038a, 1048613,
+    0x038e, 0x038f, 1048639, 0x0391,  0x03a1,  1048608, 0x03a3, 0x03ab, 1048608,
+    0x03fd, 0x03ff, 1048446, 0x0400,  0x040f,  1048656, 0x0410, 0x042f, 1048608,
+    0x0531, 0x0556, 1048624, 0x10a0,  0x10c5,  1055840, 0x1f08, 0x1f0f, 1048568,
+    0x1f18, 0x1f1d, 1048568, 0x1f28,  0x1f2f,  1048568, 0x1f38, 0x1f3f, 1048568,
+    0x1f48, 0x1f4d, 1048568, 0x1f68,  0x1f6f,  1048568, 0x1f88, 0x1f8f, 1048568,
+    0x1f98, 0x1f9f, 1048568, 0x1fa8,  0x1faf,  1048568, 0x1fb8, 0x1fb9, 1048568,
+    0x1fba, 0x1fbb, 1048502, 0x1fc8,  0x1fcb,  1048490, 0x1fd8, 0x1fd9, 1048568,
+    0x1fda, 0x1fdb, 1048476, 0x1fe8,  0x1fe9,  1048568, 0x1fea, 0x1feb, 1048464,
+    0x1ff8, 0x1ff9, 1048448, 0x1ffa,  0x1ffb,  1048450, 0x2160, 0x216f, 1048592,
+    0x24b6, 0x24cf, 1048602, 0x2c00,  0x2c2e,  1048624, 0x2c7e, 0x2c7f, 1037761,
+    0xff21, 0xff3a, 1048608, 0x10400, 0x10427, 1048616,
+};
+
+static Rune __tolowerp[] = { /* triples {first, last, delta + 1048576}; only runes at even offsets from first map */
+    0x0100, 0x012e, 1048577, 0x0132, 0x0136, 1048577, 0x0139, 0x0147, 1048577,
+    0x014a, 0x0176, 1048577, 0x017b, 0x017d, 1048577, 0x01a2, 0x01a4, 1048577,
+    0x01b3, 0x01b5, 1048577, 0x01cd, 0x01db, 1048577, 0x01de, 0x01ee, 1048577,
+    0x01f8, 0x021e, 1048577, 0x0222, 0x0232, 1048577, 0x0248, 0x024e, 1048577,
+    0x0370, 0x0372, 1048577, 0x03d8, 0x03ee, 1048577, 0x0460, 0x0480, 1048577,
+    0x048a, 0x04be, 1048577, 0x04c3, 0x04cd, 1048577, 0x04d0, 0x0526, 1048577,
+    0x1e00, 0x1e94, 1048577, 0x1ea0, 0x1efe, 1048577, 0x1f59, 0x1f5f, 1048568,
+    0x2c67, 0x2c6b, 1048577, 0x2c80, 0x2ce2, 1048577, 0x2ceb, 0x2ced, 1048577,
+    0xa640, 0xa66c, 1048577, 0xa680, 0xa696, 1048577, 0xa722, 0xa72e, 1048577,
+    0xa732, 0xa76e, 1048577, 0xa779, 0xa77b, 1048577, 0xa780, 0xa786, 1048577,
+    0xa790, 0xa792, 1048577, 0xa7a0, 0xa7a8, 1048577,
+};
+
+static Rune __tolowers[] = { /* pairs {rune, delta + 1048576}; utf_tolowerrune requires an exact rune match */
+    0x0130, 1048377, 0x0178, 1048455, 0x0179, 1048577, 0x0181, 1048786,
+    0x0182, 1048577, 0x0184, 1048577, 0x0186, 1048782, 0x0187, 1048577,
+    0x018b, 1048577, 0x018e, 1048655, 0x018f, 1048778, 0x0190, 1048779,
+    0x0191, 1048577, 0x0193, 1048781, 0x0194, 1048783, 0x0196, 1048787,
+    0x0197, 1048785, 0x0198, 1048577, 0x019c, 1048787, 0x019d, 1048789,
+    0x019f, 1048790, 0x01a0, 1048577, 0x01a6, 1048794, 0x01a7, 1048577,
+    0x01a9, 1048794, 0x01ac, 1048577, 0x01ae, 1048794, 0x01af, 1048577,
+    0x01b7, 1048795, 0x01b8, 1048577, 0x01bc, 1048577, 0x01c4, 1048578,
+    0x01c5, 1048577, 0x01c7, 1048578, 0x01c8, 1048577, 0x01ca, 1048578,
+    0x01cb, 1048577, 0x01f1, 1048578, 0x01f2, 1048577, 0x01f4, 1048577,
+    0x01f6, 1048479, 0x01f7, 1048520, 0x0220, 1048446, 0x023a, 1059371,
+    0x023b, 1048577, 0x023d, 1048413, 0x023e, 1059368, 0x0241, 1048577,
+    0x0243, 1048381, 0x0244, 1048645, 0x0245, 1048647, 0x0246, 1048577,
+    0x0376, 1048577, 0x0386, 1048614, 0x038c, 1048640, 0x03cf, 1048584,
+    0x03f4, 1048516, 0x03f7, 1048577, 0x03f9, 1048569, 0x03fa, 1048577,
+    0x04c0, 1048591, 0x04c1, 1048577, 0x10c7, 1055840, 0x10cd, 1055840,
+    0x1e9e, 1040961, 0x1fbc, 1048567, 0x1fcc, 1048567, 0x1fec, 1048569,
+    0x1ffc, 1048567, 0x2126, 1041059, 0x212a, 1040193, 0x212b, 1040314,
+    0x2132, 1048604, 0x2183, 1048577, 0x2c60, 1048577, 0x2c62, 1037833,
+    0x2c63, 1044762, 0x2c64, 1037849, 0x2c6d, 1037796, 0x2c6e, 1037827,
+    0x2c6f, 1037793, 0x2c70, 1037794, 0x2c72, 1048577, 0x2c75, 1048577,
+    0x2cf2, 1048577, 0xa77d, 1013244, 0xa77e, 1048577, 0xa78b, 1048577,
+    0xa78d, 1006296, 0xa7aa, 1006268,
+};
+
+Rune utf_tolowerrune(Rune c) { /* lowercase form of c, or c itself if unmapped */
+  enum { kDeltaBias = 1048576 }; /* table deltas are stored with this added */
+  Rune *hit = rbsearch(c, __tolowerr, nelem(__tolowerr) / 3, 3);
+  if (hit != 0 && hit[0] <= c && c <= hit[1]) /* contiguous range */
+    return c + hit[2] - kDeltaBias;
+  hit = rbsearch(c, __tolowerp, nelem(__tolowerp) / 3, 3);
+  if (hit != 0 && hit[0] <= c && c <= hit[1] && ((c - hit[0]) & 1) == 0)
+    return c + hit[2] - kDeltaBias; /* range where every other rune maps */
+  hit = rbsearch(c, __tolowers, nelem(__tolowers) / 2, 2);
+  if (hit != 0 && hit[0] == c) return c + hit[1] - kDeltaBias; /* single rune */
+  return c;
+}
+
+#endif
diff --git a/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/utf.h b/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/utf.h
new file mode 100644
index 0000000000..f3b14772ea
--- /dev/null
+++ b/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/utf.h
@@ -0,0 +1,98 @@
+/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Fork of several UTF utils originally written by Rob Pike and Ken Thompson.
+#ifndef MEDIAPIPE_TASKS_CC_TEXT_LANGUAGE_DETECTOR_CUSTOM_OPS_UTILS_UTF_UTF_H_
+#define MEDIAPIPE_TASKS_CC_TEXT_LANGUAGE_DETECTOR_CUSTOM_OPS_UTILS_UTF_UTF_H_ 1
+
+#include <stdint.h>
+
+// Code-point values in Unicode 4.0 are 21 bits wide.
+typedef signed int Rune;
+
+#define uchar _utfuchar  // the typedef below therefore declares _utfuchar; presumably avoids clashes with other uchar typedefs (confirm)
+
+typedef unsigned char uchar;
+
+#define nelem(x) (sizeof(x) / sizeof((x)[0]))  // element count of array x; x must be a real array, not a pointer
+
+enum {
+  UTFmax = 4,          // maximum bytes per rune
+  Runeerror = 0xFFFD,  // decoding error in UTF
+  Runemax = 0x10FFFF,  // maximum rune value
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * rune routines
+ */
+
+/*
+ * These routines were written by Rob Pike and Ken Thompson
+ * and first appeared in Plan 9.
+ * SEE ALSO
+ * utf (7)
+ * tcs (1)
+ */
+
+// utf_runetochar copies (encodes) one rune, pointed to by r, to at most
+// UTFmax bytes starting at s and returns the number of bytes generated.
+
+int utf_runetochar(char* s, const Rune* r);
+
+// utf_charntorune copies (decodes) at most UTFmax bytes starting at `str` to
+// one rune, pointed to by `rune`, accesses at most `length` bytes of `str`, and
+// returns the number of bytes consumed.
+// If the UTF sequence is incomplete within `length` bytes,
+// utf_charntorune will set *rune to Runeerror and return 0. If it is complete
+// but not in UTF format, it will set *rune to Runeerror and return 1.
+//
+// Added 2004-09-24 by Wei-Hwa Huang
+
+int utf_charntorune(Rune* rune, const char* str, int length);
+
+// Unicode defines some characters as letters and
+// specifies three cases: upper, lower, and title.  Mappings among the
+// cases are also defined, although they are not exhaustive: some
+// upper case letters have no lower case mapping, and so on.  Unicode
+// also defines several character properties, a subset of which are
+// checked by these routines.  These routines are based on Unicode
+// version 3.0.0.
+//
+// NOTE: The routines are implemented in C, so utf_isalpharune returns 0 for
+// false and 1 for true.
+//
+// utf_tolowerrune is the Unicode case mapping. It returns the character
+// unchanged if it has no defined mapping.
+
+Rune utf_tolowerrune(Rune r);
+
+// utf_isalpharune tests for Unicode letters; this includes ideographs in
+// addition to alphabetic characters.
+
+int utf_isalpharune(Rune r);
+
+// (The comments in this file were copied from the manpage files rune.3,
+// isalpharune.3, and runestrcat.3. Some formatting changes were also made
+// to conform to Google style. /JRM 11/11/05)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // MEDIAPIPE_TASKS_CC_TEXT_LANGUAGE_DETECTOR_CUSTOM_OPS_UTILS_UTF_UTF_H_