Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow quotes to be escaped in JSON path #12033

Merged
merged 4 commits into from
May 21, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 67 additions & 42 deletions extension/json/json_common.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "json_common.hpp"

#include "duckdb/common/exception/binder_exception.hpp"

namespace duckdb {
Expand Down Expand Up @@ -31,24 +32,65 @@ string ThrowPathError(const char *ptr, const char *end, const bool binder) {
}
}

static inline idx_t ReadString(const char *ptr, const char *const end, const bool escaped) {
struct JSONKeyReadResult {
public:
static inline JSONKeyReadResult Empty() {
return {idx_t(0), string()};
}

static inline JSONKeyReadResult WildCard() {
return {1, "*"};
}

inline bool IsValid() {
return chars_read != 0;
}

inline bool IsWildCard() {
return key == "*";
}

public:
idx_t chars_read;
string key;
lnkuiper marked this conversation as resolved.
Show resolved Hide resolved
};

static inline JSONKeyReadResult ReadString(const char *ptr, const char *const end, const bool escaped) {
const char *const before = ptr;
if (escaped) {
auto key = make_unsafe_uniq_array<char>(end - ptr);
idx_t key_len = 0;

bool backslash = false;
while (ptr != end) {
if (*ptr == '"') {
break;
if (!backslash) {
break;
}
backslash = false;
} else if (*ptr == '\\') {
if (!backslash) {
backslash = true;
Mytherin marked this conversation as resolved.
Show resolved Hide resolved
ptr++;
continue;
}
backslash = false;
}
ptr++;
key[key_len++] = *ptr++;
}
if (ptr == end || backslash) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this not be an error if backslash is true?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we ignore other backslashes as we do for regular (non quoted) JSON paths

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, but then returning an empty result here doesn't seem correct to me - shouldn't we match x\ as the string literal 'x\' in that case, instead of returning an empty result? Could we add such a test?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah right, I understand the confusion. If ptr == end || backslash, we return an empty result, which will cause a descriptive error, pointing to the location where the error occurred. So, if we have $."x\ as a path, you will get an error

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That seems a bit inconsistent, no? We accept a backslash as a regular backslash except when it is the last character? I would say we should turn it into a regular backslash in this scenario perhaps?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When you have the path $."x\, we need to throw an error, because the path is "escaped", i.e., the object key starts with a ", so it needs to end with a " too. If it ends with a backslash, it's wrong. Actually, I just realised that it can be removed from the check, since, ptr == end will be true

return JSONKeyReadResult::Empty();
} else {
return {idx_t(ptr - before), string(key.get(), key_len)};
}
return ptr == end ? 0 : ptr - before;
} else {
while (ptr != end) {
if (*ptr == '.' || *ptr == '[') {
break;
}
ptr++;
}
return ptr - before;
return {idx_t(ptr - before), string(before, ptr - before)};
}
}

Expand Down Expand Up @@ -79,28 +121,24 @@ static inline idx_t ReadInteger(const char *ptr, const char *const end, idx_t &i
return idx >= (idx_t)IDX_T_MAX ? 0 : ptr - before;
}

static inline bool ReadKey(const char *&ptr, const char *const end, const char *&key_ptr, idx_t &key_len) {
static inline JSONKeyReadResult ReadKey(const char *ptr, const char *const end) {
D_ASSERT(ptr != end);
if (*ptr == '*') { // Wildcard
ptr++;
key_len = DConstants::INVALID_INDEX;
return true;
return JSONKeyReadResult::WildCard();
}
bool escaped = false;
if (*ptr == '"') {
ptr++; // Skip past opening '"'
escaped = true;
}
key_ptr = ptr;
key_len = ReadString(ptr, end, escaped);
if (key_len == 0) {
return false;
auto result = ReadString(ptr, end, escaped);
if (!result.IsValid()) {
return result;
}
ptr += key_len;
if (escaped) {
ptr++; // Skip past closing '"'
result.chars_read += 2; // Account for surrounding quotes
}
return true;
return result;
}

static inline bool ReadArrayIndex(const char *&ptr, const char *const end, idx_t &array_index, bool &from_back) {
Expand Down Expand Up @@ -155,14 +193,13 @@ JSONPathType JSONCommon::ValidatePath(const char *ptr, const idx_t &len, const b
}
switch (c) {
case '.': { // Object field
const char *key_ptr;
idx_t key_len;
if (!ReadKey(ptr, end, key_ptr, key_len)) {
auto key = ReadKey(ptr, end);
if (!key.IsValid()) {
ThrowPathError(ptr, end, binder);
}
if (key_len == DConstants::INVALID_INDEX) {
} else if (key.IsWildCard()) {
path_type = JSONPathType::WILDCARD;
}
ptr += key.chars_read;
break;
}
case '[': { // Array index
Expand Down Expand Up @@ -195,16 +232,10 @@ yyjson_val *JSONCommon::GetPath(yyjson_val *val, const char *ptr, const idx_t &l
if (!unsafe_yyjson_is_obj(val)) {
return nullptr;
}
const char *key_ptr;
idx_t key_len;
#ifdef DEBUG
bool success =
#endif
ReadKey(ptr, end, key_ptr, key_len);
#ifdef DEBUG
D_ASSERT(success);
#endif
val = yyjson_obj_getn(val, key_ptr, key_len);
auto key_result = ReadKey(ptr, end);
D_ASSERT(key_result.IsValid());
ptr += key_result.chars_read;
val = yyjson_obj_getn(val, key_result.key.c_str(), key_result.key.size());
break;
}
case '[': { // Array index
Expand Down Expand Up @@ -243,24 +274,18 @@ void GetWildcardPathInternal(yyjson_val *val, const char *ptr, const char *const
if (!unsafe_yyjson_is_obj(val)) {
return;
}
const char *key_ptr;
idx_t key_len;
#ifdef DEBUG
bool success =
#endif
ReadKey(ptr, end, key_ptr, key_len);
#ifdef DEBUG
D_ASSERT(success);
#endif
if (key_len == DConstants::INVALID_INDEX) { // Wildcard
auto key_result = ReadKey(ptr, end);
D_ASSERT(key_result.IsValid());
ptr += key_result.chars_read;
if (key_result.IsWildCard()) { // Wildcard
size_t idx, max;
yyjson_val *key, *obj_val;
yyjson_obj_foreach(val, idx, max, key, obj_val) {
GetWildcardPathInternal(obj_val, ptr, end, vals);
}
return;
}
val = yyjson_obj_getn(val, key_ptr, key_len);
val = yyjson_obj_getn(val, key_result.key.c_str(), key_result.key.size());
break;
}
case '[': { // Array index
Expand Down
16 changes: 16 additions & 0 deletions test/sql/json/scalar/test_json_extract.test
Original file line number Diff line number Diff line change
Expand Up @@ -314,3 +314,19 @@ query T
execute q1('a')
----
1

# test issue 11997
query I
select json_extract_string(json('{"j[so]n_\"key": 67}'), '$."j[so]n_\"key"');
lnkuiper marked this conversation as resolved.
Show resolved Hide resolved
----
67

query I
select '{"\"duck\"": 42}'->'$."\"duck\""';
----
42

query I
select '{"\"du\\ck\"": 42}'->'$."\"du\\ck\""';
----
42
Loading