diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 85a9010a7d..ec26d14f33 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -36,6 +36,10 @@ import bigframes.series as series +# Array functions defined from +# https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions + + def array_length(series: series.Series) -> series.Series: """Compute the length of each array element in the Series. @@ -154,6 +158,56 @@ def array_to_string(series: series.Series, delimiter: str) -> series.Series: return series._apply_unary_op(ops.ArrayToStringOp(delimiter=delimiter)) +# JSON functions defined from +# https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions + + +def json_set( + series: series.Series, + json_path_value_pairs: typing.Sequence[typing.Tuple[str, typing.Any]], +) -> series.Series: + """Produces a new JSON value within a Series by inserting or replacing values at + specified paths. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"] + >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")]) + 0 {"a":100,"b":"hi"} + Name: data, dtype: string + + Args: + series (bigframes.series.Series): + The Series containing JSON data (as native JSON objects or JSON-formatted strings). + json_path_value_pairs (Sequence[Tuple[str, typing.Any]]): + Pairs of JSON path and the new value to insert/replace. + + Returns: + bigframes.series.Series: A new Series with the transformed JSON data. + + """ + # SQLGlot parser does not support the "create_if_missing => true" syntax, so + # create_if_missing is not currently implemented. 
+ + for json_path_value_pair in json_path_value_pairs: + if len(json_path_value_pair) != 2: + raise ValueError( + "Incorrect format: Expected (<json_path>, <json_value>), but found: " + + f"{json_path_value_pair}" + ) + + json_path, json_value = json_path_value_pair + series = series._apply_binary_op( + json_value, ops.JSONSet(json_path=json_path), alignment="left" + ) + return series + + def vector_search( base_table: str, column_to_search: str, diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 6b8e60434e..0bc9f2e370 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -894,6 +894,26 @@ def array_to_string_op_impl(x: ibis_types.Value, op: ops.ArrayToStringOp): return typing.cast(ibis_types.ArrayValue, x).join(op.delimiter) + + +# JSON Ops +@scalar_op_compiler.register_binary_op(ops.JSONSet, pass_op=True) +def json_set_op_impl(x: ibis_types.Value, y: ibis_types.Value, op: ops.JSONSet): + if x.type().is_json(): + return json_set( + json_obj=x, + json_path=op.json_path, + json_value=y, + ).to_expr() + else: + # Enabling JSON type eliminates the need for less efficient string conversions. 
+ return vendored_ibis_ops.ToJsonString( + json_set( + json_obj=parse_json(x), + json_path=op.json_path, + json_value=y, + ) + ).to_expr() + + ### Binary Ops def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None): """Wraps a binary operator to generate nulls of the expected type if either input is a null scalar.""" @@ -1469,3 +1489,15 @@ def float_floor(a: float) -> float: def float_ceil(a: float) -> float: """Convert string to timestamp.""" return 0 # pragma: NO COVER + + +@ibis.udf.scalar.builtin(name="parse_json") +def parse_json(a: str) -> ibis_dtypes.JSON: + """Converts a JSON-formatted STRING value to a JSON value.""" + + +@ibis.udf.scalar.builtin(name="json_set") +def json_set( + json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String, json_value +) -> ibis_dtypes.JSON: + """Produces a new SQL JSON value with the specified JSON data inserted or replaced.""" diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 5de8f896a9..160802ded9 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -240,6 +240,17 @@ def is_struct_like(type: ExpressionType) -> bool: ) + + +def is_json_like(type: ExpressionType) -> bool: + # TODO: Add JSON type support + return type == STRING_DTYPE + + +def is_json_encoding_type(type: ExpressionType) -> bool: + # Types can be converted into JSON. 
+ # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_encodings + return type != GEO_DTYPE + + def is_numeric(type: ExpressionType) -> bool: return type in NUMERIC_BIGFRAMES_TYPES_PERMISSIVE diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index c10b743631..145c415ca0 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -707,6 +707,30 @@ def output_type(self, *input_types): strconcat_op = StrConcatOp() +## JSON Ops +@dataclasses.dataclass(frozen=True) +class JSONSet(BinaryOp): + name: typing.ClassVar[str] = "json_set" + json_path: str + + def output_type(self, *input_types): + left_type = input_types[0] + right_type = input_types[1] + if not dtypes.is_json_like(left_type): + raise TypeError( + "Input type must be an valid JSON object or JSON-formatted string type." + + f" Received type: {left_type}" + ) + if not dtypes.is_json_encoding_type(right_type): + raise TypeError( + "The value to be assigned must be a type that can be encoded as JSON." + + f"Received type: {right_type}" + ) + + # After JSON type implementation, ONLY return JSON data. + return left_type + + # Ternary Ops @dataclasses.dataclass(frozen=True) class WhereOp(TernaryOp): diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py new file mode 100644 index 0000000000..ff759b8fda --- /dev/null +++ b/tests/system/small/bigquery/test_json.py @@ -0,0 +1,119 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import json + +import geopandas as gpd # type: ignore +import pandas as pd +import pytest + +import bigframes.bigquery as bbq +import bigframes.pandas as bpd + + +def _get_series_from_json(json_data): + sql = " UNION ALL ".join( + [ + f"SELECT {id} AS id, JSON '{json.dumps(data)}' AS data" + for id, data in enumerate(json_data) + ] + ) + df = bpd.read_gbq(sql).set_index("id").sort_index() + return df["data"] + + +@pytest.mark.parametrize( + ("json_path", "expected_json"), + [ + pytest.param("$.a", [{"a": 10}], id="simple"), + pytest.param("$.a.b.c", [{"a": {"b": {"c": 10, "d": []}}}], id="nested"), + ], +) +def test_json_set_at_json_path(json_path, expected_json): + s = _get_series_from_json([{"a": {"b": {"c": "tester", "d": []}}}]) + actual = bbq.json_set(s, json_path_value_pairs=[(json_path, 10)]) + + expected = _get_series_from_json(expected_json) + pd.testing.assert_series_equal( + actual.to_pandas(), + expected.to_pandas(), + ) + + +@pytest.mark.parametrize( + ("json_value", "expected_json"), + [ + pytest.param(10, [{"a": {"b": 10}}, {"a": {"b": 10}}], id="int"), + pytest.param(0.333, [{"a": {"b": 0.333}}, {"a": {"b": 0.333}}], id="float"), + pytest.param("eng", [{"a": {"b": "eng"}}, {"a": {"b": "eng"}}], id="string"), + pytest.param([1, 2], [{"a": {"b": 1}}, {"a": {"b": 2}}], id="series"), + ], +) +def test_json_set_at_json_value_type(json_value, expected_json): + s = _get_series_from_json([{"a": {"b": "dev"}}, {"a": {"b": [1, 2]}}]) + actual = bbq.json_set(s, json_path_value_pairs=[("$.a.b", json_value)]) + + expected = _get_series_from_json(expected_json) + pd.testing.assert_series_equal( + actual.to_pandas(), + expected.to_pandas(), + ) + + +def test_json_set_w_more_pairs(): + s = _get_series_from_json([{"a": 2}, {"b": 5}, {"c": 1}]) + actual = bbq.json_set( + s, json_path_value_pairs=[("$.a", 1), ("$.b", 2), ("$.a", [3, 4, 5])] + ) + expected = 
_get_series_from_json( + [{"a": 3, "b": 2}, {"a": 4, "b": 2}, {"a": 5, "b": 2, "c": 1}] + ) + pd.testing.assert_series_equal( + actual.to_pandas(), + expected.to_pandas(), + ) + + +@pytest.mark.parametrize( + ("series", "json_path_value_pairs"), + [ + pytest.param( + _get_series_from_json([{"a": 10}]), + [("$.a", 1, 100)], + id="invalid_json_path_value_pairs", + marks=pytest.mark.xfail(raises=ValueError), + ), + pytest.param( + _get_series_from_json([{"a": 10}]), + [ + ( + "$.a", + bpd.read_pandas( + gpd.GeoSeries.from_wkt(["POINT (1 2)", "POINT (2 1)"]) + ), + ) + ], + id="invalid_json_value_type", + marks=pytest.mark.xfail(raises=TypeError), + ), + pytest.param( + bpd.Series([1, 2]), + [("$.a", 1)], + id="invalid_series_type", + marks=pytest.mark.xfail(raises=TypeError), + ), + ], +) +def test_json_set_w_invalid(series, json_path_value_pairs): + bbq.json_set(series, json_path_value_pairs=json_path_value_pairs) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/json.py b/third_party/bigframes_vendored/ibis/expr/operations/json.py index 772c2e8ff4..1eb0554137 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/json.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/json.py @@ -2,8 +2,8 @@ from __future__ import annotations import ibis.expr.datatypes as dt -from ibis.expr.operations.core import Unary +import ibis.expr.operations.core as ibis_ops_core -class ToJsonString(Unary): +class ToJsonString(ibis_ops_core.Unary): dtype = dt.string