From f12c90611adb4741069ec32840ebbf2aea83a9f3 Mon Sep 17 00:00:00 2001
From: Chelsea Lin <124939984+chelsea-lin@users.noreply.github.com>
Date: Thu, 30 May 2024 10:31:02 -0700
Subject: [PATCH] feat: adds bigframes.bigquery.array_to_string to convert array elements to delimited strings (#731)

---
 bigframes/bigquery/__init__.py               | 35 ++++++++++++++++++--
 bigframes/core/compile/scalar_op_compiler.py |  6 ++++
 bigframes/dtypes.py                          |  8 +++++
 bigframes/operations/__init__.py             | 13 ++++++++
 tests/system/small/bigquery/test_array.py    | 13 ++++++++
 5 files changed, 72 insertions(+), 3 deletions(-)

diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py
index 6c9c04dca7..5808aa28bf 100644
--- a/bigframes/bigquery/__init__.py
+++ b/bigframes/bigquery/__init__.py
@@ -57,8 +57,7 @@ def array_length(series: series.Series) -> series.Series:
         dtype: Int64
 
     Args:
-        series (bigframes.series.Series):
-        A Series with array columns.
+        series (bigframes.series.Series): A Series with array columns.
 
     Returns:
         bigframes.series.Series: A Series of integer values indicating
@@ -104,7 +103,7 @@ def array_agg(
 
     Args:
         obj (groupby.SeriesGroupBy | groupby.DataFrameGroupBy):
-        A GroupBy object to be applied the function.
+            A GroupBy object to be applied the function.
 
     Returns:
         bigframes.series.Series | bigframes.dataframe.DataFrame: A Series or
@@ -119,3 +118,33 @@ def array_agg(
         raise ValueError(
             f"Unsupported type {type(obj)} to apply `array_agg` function. {constants.FEEDBACK_LINK}"
         )
+
+
+def array_to_string(series: series.Series, delimiter: str) -> series.Series:
+    """Converts array elements within a Series into delimited strings.
+
+    **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> import bigframes.bigquery as bbq
+        >>> import numpy as np
+        >>> bpd.options.display.progress_bar = None
+
+        >>> s = bpd.Series([["H", "i", "!"], ["Hello", "World"], np.nan, [], ["Hi"]])
+        >>> bbq.array_to_string(s, delimiter=", ")
+        0         H, i, !
+        1    Hello, World
+        2
+        3
+        4              Hi
+        dtype: string
+
+    Args:
+        series (bigframes.series.Series): A Series containing arrays.
+        delimiter (str): The string used to separate array elements.
+
+    Returns:
+        bigframes.series.Series: A Series containing delimited strings.
+
+    """
+    return series._apply_unary_op(ops.ArrayToStringOp(delimiter=delimiter))
diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py
index e1b497d0dd..000c4a4c09 100644
--- a/bigframes/core/compile/scalar_op_compiler.py
+++ b/bigframes/core/compile/scalar_op_compiler.py
@@ -885,6 +885,12 @@ def map_op_impl(x: ibis_types.Value, op: ops.MapOp):
     return case.else_(x).end()
 
 
+# Array Ops
+@scalar_op_compiler.register_unary_op(ops.ArrayToStringOp, pass_op=True)
+def array_to_string_op_impl(x: ibis_types.Value, op: ops.ArrayToStringOp):
+    return typing.cast(ibis_types.ArrayValue, x).join(op.delimiter)
+
+
 ### Binary Ops
 def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None):
     """Wraps a binary operator to generate nulls of the expected type if either input is a null scalar."""
diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py
index 0c32a81404..3df67ed9e4 100644
--- a/bigframes/dtypes.py
+++ b/bigframes/dtypes.py
@@ -134,6 +134,14 @@ def is_array_like(type: ExpressionType) -> bool:
     )
 
 
+def is_array_string_like(type: ExpressionType) -> bool:
+    return (
+        isinstance(type, pd.ArrowDtype)
+        and isinstance(type.pyarrow_dtype, pa.ListType)
+        and pa.types.is_string(type.pyarrow_dtype.value_type)
+    )
+
+
 def is_struct_like(type: ExpressionType) -> bool:
     return isinstance(type, pd.ArrowDtype) and isinstance(
         type.pyarrow_dtype, pa.StructType
diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py
index c1854b1b61..42f83913ee 100644
--- a/bigframes/operations/__init__.py
+++ b/bigframes/operations/__init__.py
@@ -580,6 +580,19 @@ def output_type(self, *input_types):
         return input_types[0]
 
 
+## Array Ops
+@dataclasses.dataclass(frozen=True)
+class ArrayToStringOp(UnaryOp):
+    name: typing.ClassVar[str] = "array_to_string"
+    delimiter: str
+
+    def output_type(self, *input_types):
+        input_type = input_types[0]
+        if not dtypes.is_array_string_like(input_type):
+            raise TypeError("Input type must be an array of string type.")
+        return dtypes.STRING_DTYPE
+
+
 # Binary Ops
 fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COERCE)
 maximum_op = create_binary_op(name="maximum", type_signature=op_typing.COERCE)
diff --git a/tests/system/small/bigquery/test_array.py b/tests/system/small/bigquery/test_array.py
index 0664c31a3c..d6823a3a54 100644
--- a/tests/system/small/bigquery/test_array.py
+++ b/tests/system/small/bigquery/test_array.py
@@ -139,3 +139,16 @@ def test_array_agg_matches_after_explode():
         result.to_pandas(),  # type: ignore
         df.to_pandas(),
     )
+
+
+@pytest.mark.parametrize(
+    ("data"),
+    [
+        pytest.param([[1, 2], [3, 4], [5]], id="int_array"),
+        pytest.param(["hello", "world"], id="string"),
+    ],
+)
+def test_array_to_string_w_type_checks(data):
+    series = bpd.Series(data)
+    with pytest.raises(TypeError):
+        bbq.array_to_string(series, delimiter=", ")