Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions extensions/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -661,6 +661,54 @@ cc_library(
],
)

# Library target for the regex extension: compiles regex_ext.cc and exposes
# regex_ext.h as its header.
cc_library(
name = "regex_ext",
srcs = ["regex_ext.cc"],
hdrs = ["regex_ext.h"],
deps = [
# In-repo targets (value types, function registries, runtime options).
"//common:value",
"//eval/public:cel_function_registry",
"//eval/public:cel_options",
"//internal:status_macros",
"//runtime:function_adapter",
"//runtime:function_registry",
"//runtime:runtime_options",
# External repositories: Abseil, protobuf and RE2.
"@com_google_absl//absl/base:nullability",
"@com_google_absl//absl/status",
"@com_google_absl//absl/status:statusor",
"@com_google_absl//absl/strings:str_format",
"@com_google_absl//absl/strings:string_view",
"@com_google_protobuf//:protobuf",
"@com_googlesource_code_re2//:re2",
],
)

# Test target built from regex_ext_test.cc; it depends on the :regex_ext
# library declared in this package.
cc_test(
name = "regex_ext_test",
srcs = ["regex_ext_test.cc"],
deps = [
# Library under test.
":regex_ext",
# In-repo targets.
"//common:value",
"//common:value_testing",
"//extensions/protobuf:runtime_adapter",
"//internal:status_macros",
"//internal:testing",
"//internal:testing_descriptor_pool",
"//parser",
"//runtime",
"//runtime:activation",
"//runtime:optional_types",
"//runtime:reference_resolver",
"//runtime:runtime_builder",
"//runtime:runtime_options",
"//runtime:standard_runtime_builder_factory",
# External repositories: Abseil and protobuf.
"@com_google_absl//absl/status",
"@com_google_absl//absl/status:status_matchers",
"@com_google_absl//absl/status:statusor",
"@com_google_protobuf//:protobuf",
],
)

cc_test(
name = "formatting_test",
srcs = ["formatting_test.cc"],
Expand Down
263 changes: 263 additions & 0 deletions extensions/regex_ext.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,263 @@
// Copyright 2025 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "extensions/regex_ext.h"

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include <utility>

#include "absl/base/nullability.h"
#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "absl/strings/str_format.h"
#include "absl/strings/string_view.h"
#include "common/value.h"
#include "eval/public/cel_function_registry.h"
#include "eval/public/cel_options.h"
#include "internal/status_macros.h"
#include "runtime/function_adapter.h"
#include "runtime/function_registry.h"
#include "runtime/runtime_options.h"
#include "google/protobuf/arena.h"
#include "google/protobuf/descriptor.h"
#include "google/protobuf/message.h"
#include "re2/re2.h"

namespace cel::extensions {
namespace {

// Implements `regex.extract(target, regex)`.
//
// Searches `target` for the first (unanchored) match of `regex` and returns:
//   * an ErrorValue (InvalidArgument) if `regex` does not compile or declares
//     more than one capturing group;
//   * an empty optional if nothing matches;
//   * otherwise an optional holding the single capturing group when the
//     pattern has one, or the whole match when it has none.
//
// `descriptor_pool` and `message_factory` are not used by this function; the
// resulting string is created with `arena`.
Value Extract(const StringValue& target, const StringValue& regex,
              const google::protobuf::DescriptorPool* ABSL_NONNULL descriptor_pool,
              google::protobuf::MessageFactory* ABSL_NONNULL message_factory,
              google::protobuf::Arena* ABSL_NONNULL arena) {
  // Scratch buffers back the views when the values are not stored flat.
  std::string subject_storage;
  std::string pattern_storage;
  absl::string_view subject = target.ToStringView(&subject_storage);
  absl::string_view pattern = regex.ToStringView(&pattern_storage);

  RE2 compiled(pattern);
  if (!compiled.ok()) {
    return ErrorValue(absl::InvalidArgumentError(
        absl::StrFormat("given regex is invalid: %s", compiled.error())));
  }

  const int capture_groups = compiled.NumberOfCapturingGroups();
  if (capture_groups > 1) {
    return ErrorValue(absl::InvalidArgumentError(absl::StrFormat(
        "regular expression has more than one capturing group: %s",
        pattern)));
  }

  // Slot 0 receives the full match (\0), slot 1 the first capture group (\1).
  absl::string_view groups[2];
  const bool found = compiled.Match(subject, 0, subject.length(),
                                    RE2::UNANCHORED, groups, 2);
  if (!found) {
    return OptionalValue::None();
  }

  // Prefer the capture group when the pattern declares one.
  const absl::string_view extracted =
      (capture_groups == 1) ? groups[1] : groups[0];
  return OptionalValue::Of(StringValue::From(extracted, arena), arena);
}

// Implements `regex.extractAll(target, regex)`.
//
// Collects every non-overlapping, non-empty match of `regex` in `target`, in
// left-to-right order, and returns them as a list of strings. When the pattern
// declares exactly one capturing group the list holds that group's text
// (matches where the group is empty are skipped); otherwise it holds the full
// match text.
//
// Returns an ErrorValue (InvalidArgument) if `regex` does not compile or
// declares more than one capturing group, or an ErrorValue wrapping the
// builder's status if appending to the list fails.
//
// `descriptor_pool` and `message_factory` are not used by this function.
Value ExtractAll(const StringValue& target, const StringValue& regex,
                 const google::protobuf::DescriptorPool* ABSL_NONNULL descriptor_pool,
                 google::protobuf::MessageFactory* ABSL_NONNULL message_factory,
                 google::protobuf::Arena* ABSL_NONNULL arena) {
  std::string target_scratch;
  std::string regex_scratch;
  absl::string_view target_view = target.ToStringView(&target_scratch);
  absl::string_view regex_view = regex.ToStringView(&regex_scratch);
  RE2 re2(regex_view);
  if (!re2.ok()) {
    return ErrorValue(absl::InvalidArgumentError(
        absl::StrFormat("given regex is invalid: %s", re2.error())));
  }
  const int group_count = re2.NumberOfCapturingGroups();
  if (group_count > 1) {
    return ErrorValue(absl::InvalidArgumentError(absl::StrFormat(
        "regular expression has more than one capturing group: %s",
        regex_view)));
  }

  auto builder = NewListValueBuilder(arena);

  // Space for the full match (\0) and the first capture group (\1).
  absl::string_view submatches[2];
  const int group_to_extract = (group_count == 1) ? 1 : 0;

  // Always search the original text and only move the start offset forward.
  // Re-running the match on a truncated copy of the text would discard the
  // left context, letting `^` and `\b` match at every truncation point
  // (e.g. "^a" would match three times in "aaa").
  const size_t target_size = target_view.size();
  size_t search_from = 0;
  while (re2.Match(target_view, search_from, target_size, RE2::UNANCHORED,
                   submatches, group_count + 1)) {
    const absl::string_view full_match = submatches[0];
    const absl::string_view desired_capture = submatches[group_to_extract];
    const size_t match_begin =
        static_cast<size_t>(full_match.data() - target_view.data());

    // Empty matches are never reported. Step past the position where the
    // empty match was found so the loop always makes progress.
    if (full_match.empty()) {
      if (match_begin >= target_size) {
        break;
      }
      search_from = match_begin + 1;
      // Keep the offset on a UTF-8 character boundary by skipping
      // continuation bytes (10xxxxxx).
      while (search_from < target_size &&
             (static_cast<unsigned char>(target_view[search_from]) & 0xC0) ==
                 0x80) {
        ++search_from;
      }
      continue;
    }

    // The next search resumes right after this match.
    search_from = match_begin + full_match.size();

    // A match whose single capture group is empty contributes nothing.
    if (group_count == 1 && desired_capture.empty()) {
      continue;
    }

    absl::Status status =
        builder->Add(StringValue::From(desired_capture, arena));
    if (!status.ok()) {
      return ErrorValue(status);
    }
  }

  return std::move(*builder).Build();
}

// Implements the three-argument `regex.replace(target, regex, replacement)`.
//
// Returns a copy of `target` in which every match of `regex` has been
// rewritten with `replacement` via RE2::GlobalReplace. Returns an ErrorValue
// (InvalidArgument) when `regex` does not compile or when `replacement` is
// rejected by RE2::CheckRewriteString for this pattern.
//
// `descriptor_pool` and `message_factory` are not used by this function; the
// resulting string is created with `arena`.
Value ReplaceAll(const StringValue& target, const StringValue& regex,
                 const StringValue& replacement,
                 const google::protobuf::DescriptorPool* ABSL_NONNULL descriptor_pool,
                 google::protobuf::MessageFactory* ABSL_NONNULL message_factory,
                 google::protobuf::Arena* ABSL_NONNULL arena) {
  // Scratch buffers back the views when the values are not stored flat.
  std::string subject_storage;
  std::string pattern_storage;
  std::string rewrite_storage;
  absl::string_view subject = target.ToStringView(&subject_storage);
  absl::string_view pattern = regex.ToStringView(&pattern_storage);
  absl::string_view rewrite = replacement.ToStringView(&rewrite_storage);

  RE2 compiled(pattern);
  if (!compiled.ok()) {
    return ErrorValue(absl::InvalidArgumentError(
        absl::StrFormat("given regex is invalid: %s", compiled.error())));
  }

  // Validate the rewrite template against the pattern before using it.
  std::string rewrite_error;
  const bool rewrite_ok = compiled.CheckRewriteString(rewrite, &rewrite_error);
  if (!rewrite_ok) {
    return ErrorValue(absl::InvalidArgumentError(
        absl::StrFormat("invalid replacement string: %s", rewrite_error)));
  }

  // GlobalReplace edits in place, so work on an owned copy of the subject.
  std::string result(subject);
  RE2::GlobalReplace(&result, compiled, rewrite);
  return StringValue::From(std::move(result), arena);
}

// Implements the four-argument
// `regex.replace(target, regex, replacement, count)`.
//
//   * `count == 0` returns `target` unchanged (the regex is not inspected).
//   * `count < 0` replaces every match by delegating to ReplaceAll.
//   * `count > 0` rewrites at most `count` leftmost, non-overlapping matches.
//
// Returns an ErrorValue (InvalidArgument) when `regex` does not compile or
// when `replacement` is rejected by RE2::CheckRewriteString, and an
// ErrorValue (Internal) if RE2::Rewrite fails after that validation.
//
// Matching follows the same rules as RE2::GlobalReplace so that both
// `regex.replace` overloads agree:
//   * the search always runs against the original text with an advancing
//     start offset, so `^` and `\b` keep their left context;
//   * an empty match directly after the previous match is not replaced;
//     instead one character is copied through and the search resumes after
//     it. This guarantees forward progress for patterns that can match the
//     empty string.
//
// `descriptor_pool` and `message_factory` are only forwarded to ReplaceAll.
Value ReplaceN(const StringValue& target, const StringValue& regex,
               const StringValue& replacement, int64_t count,
               const google::protobuf::DescriptorPool* ABSL_NONNULL descriptor_pool,
               google::protobuf::MessageFactory* ABSL_NONNULL message_factory,
               google::protobuf::Arena* ABSL_NONNULL arena) {
  if (count == 0) {
    return target;
  }
  if (count < 0) {
    return ReplaceAll(target, regex, replacement, descriptor_pool,
                      message_factory, arena);
  }

  std::string target_scratch;
  std::string regex_scratch;
  std::string replacement_scratch;
  absl::string_view target_view = target.ToStringView(&target_scratch);
  absl::string_view regex_view = regex.ToStringView(&regex_scratch);
  absl::string_view replacement_view =
      replacement.ToStringView(&replacement_scratch);
  RE2 re2(regex_view);
  if (!re2.ok()) {
    return ErrorValue(absl::InvalidArgumentError(
        absl::StrFormat("given regex is invalid: %s", re2.error())));
  }
  std::string error_string;
  if (!re2.CheckRewriteString(replacement_view, &error_string)) {
    return ErrorValue(absl::InvalidArgumentError(
        absl::StrFormat("invalid replacement string: %s", error_string)));
  }

  // Byte length of the UTF-8 sequence at the front of `text` (which must be
  // non-empty). Malformed or truncated sequences count as a single byte.
  const auto utf8_char_length = [](absl::string_view text) -> size_t {
    const unsigned char lead = static_cast<unsigned char>(text[0]);
    size_t length = 1;
    if (lead >= 0xC0 && lead < 0xE0) {
      length = 2;
    } else if (lead >= 0xE0 && lead < 0xF0) {
      length = 3;
    } else if (lead >= 0xF0 && lead < 0xF8) {
      length = 4;
    }
    if (length > text.size()) {
      return 1;
    }
    for (size_t i = 1; i < length; ++i) {
      if ((static_cast<unsigned char>(text[i]) & 0xC0) != 0x80) {
        return 1;
      }
    }
    return length;
  };

  std::string output;
  const size_t target_size = target_view.size();
  size_t cursor = 0;  // Everything before this offset is already in `output`.
  size_t previous_match_end = 0;
  bool has_previous_match = false;
  // 64-bit so that comparing against `count` can never overflow.
  int64_t replaced_count = 0;
  // RE2's Rewrite only supports substitutions for groups \0 through \9.
  absl::string_view match[10];
  const int nmatch = std::min(9, re2.NumberOfCapturingGroups()) + 1;

  while (replaced_count < count &&
         re2.Match(target_view, cursor, target_size, RE2::UNANCHORED, match,
                   nmatch)) {
    const absl::string_view full_match = match[0];
    const size_t match_begin =
        static_cast<size_t>(full_match.data() - target_view.data());
    const size_t match_end = match_begin + full_match.size();

    // Copy the unmatched text between the cursor and the match.
    if (match_begin > cursor) {
      output.append(target_view.data() + cursor, match_begin - cursor);
    }
    cursor = match_begin;

    // Disallow an empty match at the end of the previous match: copy one
    // character through unchanged and continue searching after it.
    if (full_match.empty() && has_previous_match &&
        match_begin == previous_match_end) {
      if (cursor >= target_size) {
        break;
      }
      const size_t step = utf8_char_length(target_view.substr(cursor));
      output.append(target_view.data() + cursor, step);
      cursor += step;
      continue;
    }

    if (!re2.Rewrite(&output, replacement_view, match, nmatch)) {
      // This should ideally not happen given CheckRewriteString passed
      return ErrorValue(absl::InternalError("rewrite failed unexpectedly"));
    }

    cursor = match_end;
    previous_match_end = match_end;
    has_previous_match = true;
    ++replaced_count;
  }

  // Append whatever follows the last replacement.
  if (cursor < target_size) {
    output.append(target_view.data() + cursor, target_size - cursor);
  }

  return StringValue::From(std::move(output), arena);
}

} // namespace

// Registers the regex extension overloads on `registry`:
//   * "regex.extract"    (string, string)
//   * "regex.extractAll" (string, string)
//   * "regex.replace"    (string, string, string)
//   * "regex.replace"    (string, string, string, int)
//
// Stops at the first registration that fails and returns its status;
// returns OkStatus once all four overloads are registered.
absl::Status RegisterRegexExtensionFunctions(FunctionRegistry& registry) {
  using TwoStringAdapter =
      BinaryFunctionAdapter<absl::StatusOr<Value>, StringValue, StringValue>;
  using ThreeStringAdapter =
      TernaryFunctionAdapter<absl::StatusOr<Value>, StringValue, StringValue,
                             StringValue>;
  using ThreeStringIntAdapter =
      QuaternaryFunctionAdapter<absl::StatusOr<Value>, StringValue,
                                StringValue, StringValue, int64_t>;

  CEL_RETURN_IF_ERROR(TwoStringAdapter::RegisterGlobalOverload(
      "regex.extract", &Extract, registry));
  CEL_RETURN_IF_ERROR(TwoStringAdapter::RegisterGlobalOverload(
      "regex.extractAll", &ExtractAll, registry));
  CEL_RETURN_IF_ERROR(ThreeStringAdapter::RegisterGlobalOverload(
      "regex.replace", &ReplaceAll, registry));
  CEL_RETURN_IF_ERROR(ThreeStringIntAdapter::RegisterGlobalOverload(
      "regex.replace", &ReplaceN, registry));
  return absl::OkStatus();
}

// Registers the regex extension functions on `registry` only when
// `options.enable_regex` is set; otherwise does nothing and returns OkStatus.
// Any registration error is returned to the caller.
absl::Status RegisterRegexExtensionFunctions(FunctionRegistry& registry,
                                             const RuntimeOptions& options) {
  if (!options.enable_regex) {
    return absl::OkStatus();
  }
  CEL_RETURN_IF_ERROR(RegisterRegexExtensionFunctions(registry));
  return absl::OkStatus();
}

// Overload taking the `google::api::expr::runtime` registry and options
// types: forwards to the (FunctionRegistry&, const RuntimeOptions&) overload
// using the registry obtained from `registry->InternalGetRegistry()` and the
// result of `ConvertToRuntimeOptions(options)`.
absl::Status RegisterRegexExtensionFunctions(
    google::api::expr::runtime::CelFunctionRegistry* registry,
    const google::api::expr::runtime::InterpreterOptions& options) {
  FunctionRegistry& function_registry = registry->InternalGetRegistry();
  const RuntimeOptions runtime_options =
      google::api::expr::runtime::ConvertToRuntimeOptions(options);
  return RegisterRegexExtensionFunctions(function_registry, runtime_options);
}

} // namespace cel::extensions
Loading