From 2a67b6e5d00d6ac94d81bf2e87ebb83bfc6f7402 Mon Sep 17 00:00:00 2001 From: Masha Basmanova Date: Tue, 23 Apr 2024 16:13:06 -0700 Subject: [PATCH] Allow redundant dots in JSON paths accepted by json_extract[_scalar] Presto (#9585) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/9585 json_extract_scalar and json_extract Presto function allow paths like '$.[0]. [1].[2].foo' that include "redundant" dots before opening brackets. These paths are non-standard. They are enabled via use of Jayway engine. This change allows Velox to support these paths by ignoring dots that appear before opening brackets. See https://github.com/prestodb/presto/issues/22589 Part of https://github.com/facebookincubator/velox/issues/7049 Reviewed By: xiaoxmeng Differential Revision: D56467690 fbshipit-source-id: 42866815ff186510679a0ab877e2796d324309c7 --- velox/functions/prestosql/json/JsonPathTokenizer.cpp | 6 +++++- velox/functions/prestosql/json/JsonPathTokenizer.h | 8 ++++++++ .../prestosql/json/tests/JsonPathTokenizerTest.cpp | 8 ++++++++ velox/functions/prestosql/tests/JsonExtractScalarTest.cpp | 7 +++++++ velox/functions/prestosql/tests/JsonFunctionsTest.cpp | 8 ++++++++ 5 files changed, 36 insertions(+), 1 deletion(-) diff --git a/velox/functions/prestosql/json/JsonPathTokenizer.cpp b/velox/functions/prestosql/json/JsonPathTokenizer.cpp index 1c785b9df6a3..3c862403f39b 100644 --- a/velox/functions/prestosql/json/JsonPathTokenizer.cpp +++ b/velox/functions/prestosql/json/JsonPathTokenizer.cpp @@ -85,7 +85,11 @@ std::optional JsonPathTokenizer::getNext() { } if (match(DOT)) { - return matchDotKey(); + if (hasNext() && path_[index_] != OPEN_BRACKET) { + return matchDotKey(); + } + // Simply ignore the '.' followed by '['. This allows non-standard paths + // like '$.[0].[1].[2]' supported by Jayway / Presto. } if (match(OPEN_BRACKET)) { diff --git a/velox/functions/prestosql/json/JsonPathTokenizer.h b/velox/functions/prestosql/json/JsonPathTokenizer.h index dfba062a33b0..7efb0f7265e8 100644 --- a/velox/functions/prestosql/json/JsonPathTokenizer.h +++ b/velox/functions/prestosql/json/JsonPathTokenizer.h @@ -38,9 +38,17 @@ namespace facebook::velox::functions { /// equivalent. This is not part of JSONPath syntax, but this is the behavior of /// Jayway implementation used by Presto. /// +/// It is allowed to use dot-notation redundantly, e.g. non-standard path +/// '$.[0].foo' is allowed and is equivalent to '$[0].foo'. Similarly, paths +/// '$.[0].[1].[2]' and '$[0].[1][2]' are allowed and equivalent to +/// '$[0][1][2]'. +/// /// Examples: +/// "$" /// "$.store.book[0].author" /// "store.book[0].author" +/// "store.book.[0].author" +/// "$[0].foo.bar" class JsonPathTokenizer { public: /// Resets the tokenizer to a new path. This method must be called and return diff --git a/velox/functions/prestosql/json/tests/JsonPathTokenizerTest.cpp b/velox/functions/prestosql/json/tests/JsonPathTokenizerTest.cpp index 12dbb021607f..9ea038798e91 100644 --- a/velox/functions/prestosql/json/tests/JsonPathTokenizerTest.cpp +++ b/velox/functions/prestosql/json/tests/JsonPathTokenizerTest.cpp @@ -95,6 +95,14 @@ TEST(JsonPathTokenizerTest, validPaths) { assertValidPath("foo[12].bar", TokenList({"foo", "12", "bar"})); assertValidPath("foo.bar.baz", TokenList({"foo", "bar", "baz"})); + // Paths with redundant '.'s. + assertValidPath("$.[0].[1].[2]", TokenList({"0", "1", "2"})); + assertValidPath("$[0].[1].[2]", TokenList({"0", "1", "2"})); + assertValidPath("$[0].[1][2]", TokenList({"0", "1", "2"})); + assertValidPath("$.[0].[1].foo.[2]", TokenList({"0", "1", "foo", "2"})); + assertValidPath("$[0].[1].foo.[2]", TokenList({"0", "1", "foo", "2"})); + assertValidPath("$[0][1].foo.[2]", TokenList({"0", "1", "foo", "2"})); + assertQuotedToken("!@#$%^&*()[]{}/?'"s, TokenList{"!@#$%^&*()[]{}/?'"s}); assertQuotedToken("ab\u0001c"s, TokenList{"ab\u0001c"s}); assertQuotedToken("ab\0c"s, TokenList{"ab\0c"s}); diff --git a/velox/functions/prestosql/tests/JsonExtractScalarTest.cpp b/velox/functions/prestosql/tests/JsonExtractScalarTest.cpp index b62107330599..150880e2018a 100644 --- a/velox/functions/prestosql/tests/JsonExtractScalarTest.cpp +++ b/velox/functions/prestosql/tests/JsonExtractScalarTest.cpp @@ -108,6 +108,13 @@ TEST_F(JsonExtractScalarTest, simple) { // Paths without leading '$'. EXPECT_EQ(jsonExtractScalar(R"({"k1":"v1"})", "k1"), "v1"); EXPECT_EQ(jsonExtractScalar(R"({"k1":{"k2": 999}})", "k1.k2"), "999"); + + // Paths with redundant '.'s. + auto json = "[1, 2, [10, 20, [100, 200, 300]]]"; + EXPECT_EQ(jsonExtractScalar(json, "$[2][2][2]"), "300"); + EXPECT_EQ(jsonExtractScalar(json, "$.[2].[2].[2]"), "300"); + EXPECT_EQ(jsonExtractScalar(json, "$[2].[2].[2]"), "300"); + EXPECT_EQ(jsonExtractScalar(json, "$[2][2].[2]"), "300"); } TEST_F(JsonExtractScalarTest, jsonType) { diff --git a/velox/functions/prestosql/tests/JsonFunctionsTest.cpp b/velox/functions/prestosql/tests/JsonFunctionsTest.cpp index 575482bad953..6a4275389f03 100644 --- a/velox/functions/prestosql/tests/JsonFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/JsonFunctionsTest.cpp @@ -610,6 +610,14 @@ TEST_F(JsonFunctionsTest, jsonExtract) { EXPECT_EQ(std::nullopt, jsonExtract(json, "x.c")); EXPECT_EQ(std::nullopt, jsonExtract(json, "x.b[20]")); + // Paths with redundant '.'s. + json = R"([[[{"a": 1, "b": [1, 2, 3]}]]])"; + EXPECT_EQ("1", jsonExtract(json, "$.[0][0][0].a")); + EXPECT_EQ("[1, 2, 3]", jsonExtract(json, "$.[0].[0].[0].b")); + EXPECT_EQ("[1, 2, 3]", jsonExtract(json, "$[0][0].[0].b")); + EXPECT_EQ("3", jsonExtract(json, "$[0][0][0].b.[2]")); + EXPECT_EQ("3", jsonExtract(json, "$.[0].[0][0].b.[2]")); + // TODO The following paths are supported by Presto via Jayway, but do not // work in Velox yet. Figure out how to add support for these. VELOX_ASSERT_THROW(jsonExtract(kJson, "$..price"), "Invalid JSON path");