Skip to content

Commit

Permalink
feat: adds bigframes.bigquery.array_to_string to convert array elemen…
Browse files Browse the repository at this point in the history
…ts to delimited strings (#731)
  • Loading branch information
chelsea-lin committed May 30, 2024
1 parent 9f0406e commit f12c906
Show file tree
Hide file tree
Showing 5 changed files with 72 additions and 3 deletions.
35 changes: 32 additions & 3 deletions bigframes/bigquery/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,7 @@ def array_length(series: series.Series) -> series.Series:
dtype: Int64
Args:
series (bigframes.series.Series):
A Series with array columns.
series (bigframes.series.Series): A Series with array columns.
Returns:
bigframes.series.Series: A Series of integer values indicating
Expand Down Expand Up @@ -104,7 +103,7 @@ def array_agg(
Args:
obj (groupby.SeriesGroupBy | groupby.DataFrameGroupBy):
A GroupBy object to be applied the function.
A GroupBy object to be applied the function.
Returns:
bigframes.series.Series | bigframes.dataframe.DataFrame: A Series or
Expand All @@ -119,3 +118,33 @@ def array_agg(
raise ValueError(
f"Unsupported type {type(obj)} to apply `array_agg` function. {constants.FEEDBACK_LINK}"
)


def array_to_string(series: series.Series, delimiter: str) -> series.Series:
    """Joins the elements of each array in a Series into one delimited string.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> import numpy as np
        >>> bpd.options.display.progress_bar = None

        >>> s = bpd.Series([["H", "i", "!"], ["Hello", "World"], np.nan, [], ["Hi"]])
        >>> bbq.array_to_string(s, delimiter=", ")
        0         H, i, !
        1    Hello, World
        2
        3
        4              Hi
        dtype: string

    Args:
        series (bigframes.series.Series): A Series containing arrays.
        delimiter (str): The string used to separate array elements.

    Returns:
        bigframes.series.Series: A Series containing delimited strings.

    Raises:
        TypeError: If the input Series is not an array of string type.
    """
    # The delimiter travels with the op itself, so the conversion is a plain
    # unary operation applied over the Series.
    to_string_op = ops.ArrayToStringOp(delimiter=delimiter)
    return series._apply_unary_op(to_string_op)
6 changes: 6 additions & 0 deletions bigframes/core/compile/scalar_op_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -885,6 +885,12 @@ def map_op_impl(x: ibis_types.Value, op: ops.MapOp):
return case.else_(x).end()


# Array Ops
@scalar_op_compiler.register_unary_op(ops.ArrayToStringOp, pass_op=True)
def array_to_string_op_impl(x: ibis_types.Value, op: ops.ArrayToStringOp):
    """Compile ``ArrayToStringOp`` by joining the array's elements with ``op.delimiter``."""
    # typing.cast only narrows the static type; it performs no runtime check.
    array_value = typing.cast(ibis_types.ArrayValue, x)
    return array_value.join(op.delimiter)


### Binary Ops
def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None):
"""Wraps a binary operator to generate nulls of the expected type if either input is a null scalar."""
Expand Down
8 changes: 8 additions & 0 deletions bigframes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,14 @@ def is_array_like(type: ExpressionType) -> bool:
)


def is_array_string_like(type: ExpressionType) -> bool:
    """Return True if ``type`` is an Arrow-backed list dtype whose elements are strings.

    Args:
        type: The dtype to inspect.

    Returns:
        bool: True only when ``type`` is a ``pd.ArrowDtype`` wrapping a
        ``pa.ListType`` whose ``value_type`` satisfies ``pa.types.is_string``.
    """
    if not isinstance(type, pd.ArrowDtype):
        return False

    arrow_type = type.pyarrow_dtype
    if not isinstance(arrow_type, pa.ListType):
        return False

    return pa.types.is_string(arrow_type.value_type)


def is_struct_like(type: ExpressionType) -> bool:
return isinstance(type, pd.ArrowDtype) and isinstance(
type.pyarrow_dtype, pa.StructType
Expand Down
13 changes: 13 additions & 0 deletions bigframes/operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -580,6 +580,19 @@ def output_type(self, *input_types):
return input_types[0]


## Array Ops
@dataclasses.dataclass(frozen=True)
class ArrayToStringOp(UnaryOp):
    """Unary op that turns an array-of-strings value into a single string."""

    name: typing.ClassVar[str] = "array_to_string"
    # Separator carried by the op for joining the array elements.
    delimiter: str

    def output_type(self, *input_types):
        """Return the string dtype after validating the single input type.

        Raises:
            TypeError: If the first input type is not an array of strings.
        """
        if dtypes.is_array_string_like(input_types[0]):
            return dtypes.STRING_DTYPE
        raise TypeError("Input type must be an array of string type.")


# Binary Ops
# Each op below is built by create_binary_op from a name and a type signature.
# Both use op_typing.COERCE (presumably the operands are coerced to a common
# type; TODO confirm against op_typing).
fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COERCE)
maximum_op = create_binary_op(name="maximum", type_signature=op_typing.COERCE)
Expand Down
13 changes: 13 additions & 0 deletions tests/system/small/bigquery/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,16 @@ def test_array_agg_matches_after_explode():
result.to_pandas(), # type: ignore
df.to_pandas(),
)


@pytest.mark.parametrize(
    "data",
    [
        pytest.param([[1, 2], [3, 4], [5]], id="int_array"),
        pytest.param(["hello", "world"], id="string"),
    ],
)
def test_array_to_string_w_type_checks(data):
    """array_to_string must raise TypeError for inputs that are not arrays of strings."""
    unsupported_input = bpd.Series(data)
    with pytest.raises(TypeError):
        bbq.array_to_string(unsupported_input, delimiter=", ")

0 comments on commit f12c906

Please sign in to comment.