Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add RegEx support using RE2 (rebased #665) #1039

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ matrix:
apt:
packages:
- g++-5
- libre2-dev
sources: &sources
- llvm-toolchain-bionic-11
- sourceline: 'deb https://apt.llvm.org/bionic/ llvm-toolchain-bionic-11 main'
Expand All @@ -20,6 +21,7 @@ matrix:
apt:
packages:
- clang-11
- libre2-dev
sources: *sources
- os: osx
osx_image: xcode11.3
Expand Down
45 changes: 43 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,50 @@ endif()

set(CMAKE_CXX_STANDARD 11)

# Include external RE2 project. This runs a CMake sub-script
# (RE2CMakeLists.txt.in) that downloads googletest source. It's then built as part
# of the jsonnet project. The conventional way of handling CMake dependencies is
# to use a find_package script, which finds and installs the library from
# known locations on the local machine. Downloading the library ourselves
# allows us to pin to a specific version and makes things easier for users
# who don't have package managers.

# Generate and download RE2 project.
set(RE2_DIR ${GLOBAL_OUTPUT_PATH}/re2-download)
configure_file(RE2CMakeLists.txt.in ${RE2_DIR}/CMakeLists.txt)
execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
RESULT_VARIABLE result
WORKING_DIRECTORY ${RE2_DIR}
)
if(result)
message(FATAL_ERROR "RE2 download failed: ${result}")
endif()

# Build RE2.
execute_process(COMMAND ${CMAKE_COMMAND} --build .
RESULT_VARIABLE result
WORKING_DIRECTORY ${RE2_DIR})
if(result)
message(FATAL_ERROR "Build step for re2 failed: ${result}")
endif()

# Add RE2 directly to our build. This defines
# the re2 target.
add_subdirectory(${GLOBAL_OUTPUT_PATH}/re2-src
${GLOBAL_OUTPUT_PATH}/re2-build)

# Include RE2 headers.
include_directories("${RE2_SOURCE_DIR}/include")

# Allow linking into a shared library.
set_property(TARGET re2 PROPERTY POSITION_INDEPENDENT_CODE ON)

# RE2 requires pthreads
set_property(TARGET re2 PROPERTY INTERFACE_COMPILE_OPTIONS $<${UNIX}:-pthread>)
set_property(TARGET re2 PROPERTY INTERFACE_LINK_LIBRARIES $<${UNIX}:-pthread>)

# Include external googletest project. This runs a CMake sub-script
# (CMakeLists.txt.in) that downloads googletest source. It's then built as part
# (GoogleTestCMakeLists.txt.in) that downloads googletest source. It's then built as part
# of the jsonnet project. The conventional way of handling CMake dependencies is
# to use a find_package script, which finds and installs the library from
# known locations on the local machine. Downloading the library ourselves
Expand All @@ -58,7 +99,7 @@ if (BUILD_TESTS AND NOT USE_SYSTEM_GTEST)

# Generate and download googletest project.
set(GOOGLETEST_DIR ${GLOBAL_OUTPUT_PATH}/googletest-download)
configure_file(CMakeLists.txt.in ${GOOGLETEST_DIR}/CMakeLists.txt)
configure_file(GoogleTestCMakeLists.txt.in ${GOOGLETEST_DIR}/CMakeLists.txt)
execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
RESULT_VARIABLE result
WORKING_DIRECTORY ${GOOGLETEST_DIR}
Expand Down
File renamed without changes.
7 changes: 3 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,7 @@ CXXFLAGS += -Iinclude -Ithird_party/md5 -Ithird_party/json -Ithird_party/rapidya
CFLAGS ?= -g $(OPT) -Wall -Wextra -pedantic -std=c99 -fPIC
CFLAGS += -Iinclude
MAKEDEPENDFLAGS += -Iinclude -Ithird_party/md5 -Ithird_party/json -Ithird_party/rapidyaml/rapidyaml/src/ -Ithird_party/rapidyaml/rapidyaml/ext/c4core/src/
LDFLAGS ?=

LDFLAGS ?= -lre2
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the most controversial part. Introducing a dependency here may break existing users. The fix is easy for them, so maybe not a very big deal. Going from 0 to some dependencies might still be problematic in some contexts, but it's probably worth it for regexps.

We definitely need to add a README section for dependencies, though. It would also be good to check that the Python package works as expected and maybe mention the dependencies in the description.

@sparkprime What do you think?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll look into that when I do the release. I presume it will break existing windows build processes but whoever uses that ought to be able to figure it out.


SHARED_LDFLAGS ?= -shared

Expand Down Expand Up @@ -160,11 +159,11 @@ core/desugarer.cpp: core/std.jsonnet.h

# Commandline executable.
jsonnet: cmd/jsonnet.cpp cmd/utils.cpp $(LIB_OBJ)
$(CXX) $(CXXFLAGS) $(LDFLAGS) $< cmd/utils.cpp $(LIB_SRC:.cpp=.o) -o $@
$(CXX) $(CXXFLAGS) $< cmd/utils.cpp $(LIB_SRC:.cpp=.o) -o $@ $(LDFLAGS)

# Commandline executable (reformatter).
jsonnetfmt: cmd/jsonnetfmt.cpp cmd/utils.cpp $(LIB_OBJ)
$(CXX) $(CXXFLAGS) $(LDFLAGS) $< cmd/utils.cpp $(LIB_SRC:.cpp=.o) -o $@
$(CXX) $(CXXFLAGS) $< cmd/utils.cpp $(LIB_SRC:.cpp=.o) -o $@ $(LDFLAGS)

# C binding.
libjsonnet.so.$(VERSION): $(LIB_OBJ)
Expand Down
18 changes: 18 additions & 0 deletions RE2CMakeLists.txt.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# CMake script run a generation-time. This must be separate from the main
# CMakeLists.txt file to allow downloading and building googletest at generation
# time.
cmake_minimum_required(VERSION 2.8.2)

project(re2-download NONE)

include(ExternalProject)
ExternalProject_Add(re2
GIT_REPOSITORY https://github.com/google/re2.git
GIT_TAG 2022-06-01
SOURCE_DIR "${GLOBAL_OUTPUT_PATH}/re2-src"
BINARY_DIR "${GLOBAL_OUTPUT_PATH}/re2-build"
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
10 changes: 9 additions & 1 deletion WORKSPACE
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,19 @@ git_repository(
git_repository(
name = "com_google_googletest",
remote = "https://github.com/google/googletest.git",
# If updating googletest version, also update CMakeLists.txt.in.
# If updating googletest version, also update GoogleTestCMakeLists.txt.in.
commit = "2fe3bd994b3189899d93f1d5a881e725e046fdc2", # release: release-1.8.1
shallow_since = "1535728917 -0400",
)

git_repository(
name = "com_googlesource_code_re2",
remote = "https://github.com/google/re2.git",
# If updating RE2 version, also update RE2CMakeLists.txt.in.
commit = "5723bb8950318135ed9cf4fc76bed988a087f536", # release: 2022-06-01
shallow_since = "1558525654 +0000",
)

# This allows using py_test and py_library against python3.
register_toolchains("//platform_defs:default_python3_toolchain")

Expand Down
1 change: 1 addition & 0 deletions core/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ cc_library(
"//third_party/json",
"//third_party/md5:libmd5",
"//third_party/rapidyaml:ryml",
"@com_googlesource_code_re2//:re2",
],
)

Expand Down
9 changes: 4 additions & 5 deletions core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,14 @@ set(LIBJSONNET_SOURCE
vm.cpp)

add_library(libjsonnet SHARED ${LIBJSONNET_HEADERS} ${LIBJSONNET_SOURCE})
add_dependencies(libjsonnet md5 stdlib)
target_link_libraries(libjsonnet md5 nlohmann_json::nlohmann_json ryml)
add_dependencies(libjsonnet md5 re2 stdlib)
target_link_libraries(libjsonnet md5 nlohmann_json::nlohmann_json ryml re2)

file(STRINGS ${CMAKE_CURRENT_SOURCE_DIR}/../include/libjsonnet.h JSONNET_VERSION_DEF
REGEX "[#]define[ \t]+LIB_JSONNET_VERSION[ \t]+")
string(REGEX REPLACE ".*\"v([^\"]+)\".*" "\\1" JSONNET_VERSION ${JSONNET_VERSION_DEF})
message("Extracted Jsonnet version: " ${JSONNET_VERSION})


# CMake prepends CMAKE_SHARED_LIBRARY_PREFIX to shared libraries, so without
# this step the output would be |liblibjsonnet|.
set_target_properties(libjsonnet PROPERTIES OUTPUT_NAME jsonnet
Expand All @@ -54,8 +53,8 @@ target_include_directories(libjsonnet INTERFACE
if (BUILD_STATIC_LIBS)
# Static library for jsonnet command-line tool.
add_library(libjsonnet_static STATIC ${LIBJSONNET_SOURCE})
add_dependencies(libjsonnet_static md5 stdlib)
target_link_libraries(libjsonnet_static md5 nlohmann_json::nlohmann_json ryml)
add_dependencies(libjsonnet_static md5 re2 stdlib)
target_link_libraries(libjsonnet_static md5 nlohmann_json::nlohmann_json ryml re2)
set_target_properties(libjsonnet_static PROPERTIES OUTPUT_NAME jsonnet)
install(TARGETS libjsonnet_static DESTINATION "${CMAKE_INSTALL_LIBDIR}")
target_include_directories(libjsonnet_static INTERFACE
Expand Down
7 changes: 6 additions & 1 deletion core/desugarer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ struct BuiltinDecl {
std::vector<UString> params;
};

static unsigned long max_builtin = 38;
static unsigned long max_builtin = 43;
BuiltinDecl jsonnet_builtin_decl(unsigned long builtin)
{
switch (builtin) {
Expand Down Expand Up @@ -77,6 +77,11 @@ BuiltinDecl jsonnet_builtin_decl(unsigned long builtin)
case 36: return {U"parseYaml", {U"str"}};
case 37: return {U"encodeUTF8", {U"str"}};
case 38: return {U"decodeUTF8", {U"arr"}};
case 39: return {U"regexFullMatch", {U"pattern", U"str"}};
case 40: return {U"regexPartialMatch", {U"pattern", U"str"}};
case 41: return {U"regexQuoteMeta", {U"str"}};
case 42: return {U"regexReplace", {U"str", U"pattern", U"to"}};
case 43: return {U"regexGlobalReplace", {U"str", U"pattern", U"to"}};
default:
std::cerr << "INTERNAL ERROR: Unrecognized builtin function: " << builtin << std::endl;
std::abort();
Expand Down
133 changes: 133 additions & 0 deletions core/vm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ limitations under the License.
#include "parser.h"
#include "ryml_std.hpp" // include this before any other ryml header
#include "ryml.hpp"
#include "re2/re2.h"
#include "state.h"
#include "static_analysis.h"
#include "string_utils.h"
Expand All @@ -49,6 +50,10 @@ using json = nlohmann::json;

namespace {

static const Fodder EF; // Empty fodder.

static const LocationRange E; // Empty.

/** Turn a path e.g. "/a/b/c" into a dir, e.g. "/a/b/". If there is no path returns "".
*/
std::string dir_name(const std::string &path)
Expand Down Expand Up @@ -938,6 +943,11 @@ class Interpreter {
builtins["parseYaml"] = &Interpreter::builtinParseYaml;
builtins["encodeUTF8"] = &Interpreter::builtinEncodeUTF8;
builtins["decodeUTF8"] = &Interpreter::builtinDecodeUTF8;
builtins["regexFullMatch"] = &Interpreter::builtinRegexFullMatch;
builtins["regexPartialMatch"] = &Interpreter::builtinRegexPartialMatch;
builtins["regexQuoteMeta"] = &Interpreter::builtinRegexQuoteMeta;
builtins["regexReplace"] = &Interpreter::builtinRegexReplace;
builtins["regexGlobalReplace"] = &Interpreter::builtinRegexGlobalReplace;

DesugaredObject *stdlib = makeStdlibAST(alloc, "__internal__");
jsonnet_static_analysis(stdlib);
Expand Down Expand Up @@ -1440,6 +1450,129 @@ class Interpreter {
return decodeUTF8();
}

const AST *regexMatch(const std::string &pattern, const std::string &string, bool full)
{
RE2 re(pattern, RE2::CannedOptions::Quiet);
if (!re.ok()) {
std::stringstream ss;
ss << "Invalid regex '" << re.pattern() << "': " << re.error();
throw makeError(stack.top().location, ss.str());
}

int num_groups = re.NumberOfCapturingGroups();

std::vector<std::string> rcaptures(num_groups);
Copy link
Contributor

@CertainLach CertainLach Jun 15, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In case of optional group, h(.+)?o, should it be an empty string, or null?
std.regexFullMatch('h(.+)?o', 'ho').captures == ['']
std.regexFullMatch('h(.+)?o', 'ho').captures == [null]

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the underlying library returns an empty string then it's clearly "OK" to do that. Would we choose to differ? I'm not sure there's much point, because you have to do != null instead of != '' right? So it doesn't help the programmer to use null instead. On the other hand, it means there is a difference between the underlying library and the Jsonnet manifestation of it. That difference would have to be documented by us and remembered by everyone else.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could also make the argument that the only reason the RE2 C++ API returns the empty string is because it is much harder to return a sentinel in that language. I wonder what the Go library does? @rohitjangid

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, empty match is not the same as the optional group is not matched.
Howewer, looks like both C++ re2 and golang regex implementations don't care, and leave default type value for such matches, unlike Rust (https://docs.rs/regex/latest/regex/struct.Captures.html#method.get) and js regexps.

std::vector<RE2::Arg> rargv(num_groups);
std::vector<const RE2::Arg*> rargs(num_groups);
for (int i = 0; i < num_groups; ++i) {
rargs[i] = &rargv[i];
rargv[i] = &rcaptures[i];
}

if (full ? RE2::FullMatchN(string, re, rargs.data(), num_groups)
: RE2::PartialMatchN(string, re, rargs.data(), num_groups)) {
std::map<const Identifier *, HeapSimpleObject::Field> fields;

const Identifier *fid = alloc->makeIdentifier(U"string");
fields[fid].hide = ObjectField::VISIBLE;
fields[fid].body = alloc->make<LiteralString>(E, EF, decode_utf8(string), LiteralString::DOUBLE, "", "");

fid = alloc->makeIdentifier(U"captures");
fields[fid].hide = ObjectField::VISIBLE;
std::vector<Array::Element> captures;
for (int i = 0; i < num_groups; ++i) {
captures.push_back(Array::Element(
alloc->make<LiteralString>(E, EF, decode_utf8(rcaptures[i]), LiteralString::DOUBLE, "", ""),
EF));
}
fields[fid].body = alloc->make<Array>(E, EF, captures, false, EF);

fid = alloc->makeIdentifier(U"namedCaptures");
fields[fid].hide = ObjectField::VISIBLE;
DesugaredObject::Fields named_captures;
const std::map<std::string, int> &named_groups = re.NamedCapturingGroups();
for (auto it = named_groups.cbegin(); it != named_groups.cend(); ++it) {
named_captures.push_back(DesugaredObject::Field(
ObjectField::VISIBLE,
alloc->make<LiteralString>(E, EF, decode_utf8(it->first), LiteralString::DOUBLE, "", ""),
alloc->make<LiteralString>(E, EF, decode_utf8(rcaptures[it->second-1]), LiteralString::DOUBLE, "", "")));
}
fields[fid].body = alloc->make<DesugaredObject>(E, ASTs{}, named_captures);

scratch = makeObject<HeapSimpleObject>(BindingFrame{}, fields, ASTs{});
} else {
scratch = makeNull();
}
return nullptr;
}

const AST *builtinRegexFullMatch(const LocationRange &loc, const std::vector<Value> &args)
{
validateBuiltinArgs(loc, "regexFullMatch", args, {Value::STRING, Value::STRING});

std::string pattern = encode_utf8(static_cast<HeapString *>(args[0].v.h)->value);
std::string string = encode_utf8(static_cast<HeapString *>(args[1].v.h)->value);

return regexMatch(pattern, string, true);
}

const AST *builtinRegexPartialMatch(const LocationRange &loc, const std::vector<Value> &args)
{
validateBuiltinArgs(loc, "regexPartialMatch", args, {Value::STRING, Value::STRING});

std::string pattern = encode_utf8(static_cast<HeapString *>(args[0].v.h)->value);
std::string string = encode_utf8(static_cast<HeapString *>(args[1].v.h)->value);

return regexMatch(pattern, string, false);
}

const AST *builtinRegexQuoteMeta(const LocationRange &loc, const std::vector<Value> &args)
{
validateBuiltinArgs(loc, "regexQuoteMeta", args, {Value::STRING});
scratch = makeString(decode_utf8(RE2::QuoteMeta(encode_utf8(static_cast<HeapString *>(args[0].v.h)->value))));
return nullptr;
}

const AST *builtinRegexReplace(const LocationRange &loc, const std::vector<Value> &args)
{
validateBuiltinArgs(loc, "regexReplace", args, {Value::STRING, Value::STRING, Value::STRING});

std::string string = encode_utf8(static_cast<HeapString *>(args[0].v.h)->value);
std::string pattern = encode_utf8(static_cast<HeapString *>(args[1].v.h)->value);
std::string replace = encode_utf8(static_cast<HeapString *>(args[2].v.h)->value);

RE2 re(pattern, RE2::CannedOptions::Quiet);
if(!re.ok()) {
std::stringstream ss;
ss << "Invalid regex '" << re.pattern() << "': " << re.error();
throw makeError(stack.top().location, ss.str());
}

RE2::Replace(&string, re, replace);
scratch = makeString(decode_utf8(string));
return nullptr;
}

const AST *builtinRegexGlobalReplace(const LocationRange &loc, const std::vector<Value> &args)
{
validateBuiltinArgs(loc, "regexGlobalReplace", args, {Value::STRING, Value::STRING, Value::STRING});

std::string string = encode_utf8(static_cast<HeapString *>(args[0].v.h)->value);
std::string pattern = encode_utf8(static_cast<HeapString *>(args[1].v.h)->value);
std::string replace = encode_utf8(static_cast<HeapString *>(args[2].v.h)->value);

RE2 re(pattern, RE2::CannedOptions::Quiet);
if(!re.ok()) {
std::stringstream ss;
ss << "Invalid regex '" << re.pattern() << "': " << re.error();
throw makeError(stack.top().location, ss.str());
}

RE2::GlobalReplace(&string, re, replace);
scratch = makeString(decode_utf8(string));
return nullptr;
}

const AST *builtinTrace(const LocationRange &loc, const std::vector<Value> &args)
{
if(args[0].t != Value::STRING) {
Expand Down
Loading