diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index c599a4b543..0650953fc7 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -40,6 +40,7 @@ st_intersection, st_isclosed, st_length, + st_regionstats, st_simplify, ) from bigframes.bigquery._operations.json import ( @@ -81,6 +82,7 @@ st_intersection, st_isclosed, st_length, + st_regionstats, st_simplify, # json ops json_extract, diff --git a/bigframes/bigquery/_operations/geo.py b/bigframes/bigquery/_operations/geo.py index 6b7e5d88a2..f0fda99a16 100644 --- a/bigframes/bigquery/_operations/geo.py +++ b/bigframes/bigquery/_operations/geo.py @@ -14,11 +14,13 @@ from __future__ import annotations -from typing import Union +import json +from typing import Mapping, Optional, Union import shapely # type: ignore from bigframes import operations as ops +import bigframes.dataframe import bigframes.geopandas import bigframes.series @@ -677,6 +679,65 @@ def st_length( return series +def st_regionstats( + geography: Union[bigframes.series.Series, bigframes.geopandas.GeoSeries], + raster_id: str, + band: Optional[str] = None, + include: Optional[str] = None, + options: Optional[Mapping[str, Union[str, int, float]]] = None, +) -> bigframes.series.Series: + """Returns statistics summarizing the pixel values of the raster image + referenced by raster_id that intersect with geography. + + The statistics include the count, minimum, maximum, sum, standard + deviation, mean, and area of the valid pixels of the raster band named + band_name. Google Earth Engine computes the results of the function call. + + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/geography_functions#st_regionstats + + Args: + geography (bigframes.series.Series | bigframes.geopandas.GeoSeries): + A series of geography objects to intersect with the raster image. + raster_id (str): + A string that identifies a raster image. The following formats are + supported. A URI from an image table provided by Google Earth Engine + in BigQuery sharing (formerly Analytics Hub). A URI for a readable + GeoTIFF raster file. A Google Earth Engine asset path that + references public catalog data or project-owned assets with read + access. + band (Optional[str]): + A string in one of the following formats: + A single band within the raster image specified by raster_id. A + formula to compute a value from the available bands in the raster + image. The formula uses the Google Earth Engine image expression + syntax. Bands can be referenced by their name, band_name, in + expressions. If you don't specify a band, the first band of the + image is used. + include (Optional[str]): + An optional string formula that uses the Google Earth Engine image + expression syntax to compute a pixel weight. The formula should + return values from 0 to 1. Values outside this range are set to the + nearest limit, either 0 or 1. A value of 0 means that the pixel is + invalid and it's excluded from analysis. A positive value means that + a pixel is valid. Values between 0 and 1 represent proportional + weights for calculations, such as weighted means. + options (Mapping[str, Union[str, int, float]], optional): + A dictionary of options to pass to the function. See the BigQuery + documentation for a list of available options. + + Returns: + bigframes.pandas.Series: + A STRUCT Series containing the computed statistics. + """ + op = ops.GeoStRegionStatsOp( + raster_id=raster_id, + band=band, + include=include, + options=json.dumps(options) if options else None, + ) + return geography._apply_unary_op(op) + + def st_simplify( geography: "bigframes.series.Series", tolerance_meters: float, diff --git a/bigframes/core/compile/ibis_compiler/operations/geo_ops.py b/bigframes/core/compile/ibis_compiler/operations/geo_ops.py index 2f06c76768..0ca69726ff 100644 --- a/bigframes/core/compile/ibis_compiler/operations/geo_ops.py +++ b/bigframes/core/compile/ibis_compiler/operations/geo_ops.py @@ -16,8 +16,10 @@ from typing import cast +from bigframes_vendored import ibis from bigframes_vendored.ibis.expr import types as ibis_types import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes +import bigframes_vendored.ibis.expr.operations.geospatial as ibis_geo import bigframes_vendored.ibis.expr.operations.udf as ibis_udf from bigframes.core.compile.ibis_compiler import scalar_op_compiler @@ -101,6 +103,35 @@ def geo_st_isclosed_op_impl(x: ibis_types.Value): return st_isclosed(x) +@register_unary_op(ops.GeoStRegionStatsOp, pass_op=True) +def geo_st_regionstats_op_impl( + geography: ibis_types.Value, + op: ops.GeoStRegionStatsOp, +): + if op.band: + band = ibis.literal(op.band, type=ibis_dtypes.string()) + else: + band = None + + if op.include: + include = ibis.literal(op.include, type=ibis_dtypes.string()) + else: + include = None + + if op.options: + options = ibis.literal(op.options, type=ibis_dtypes.json()) + else: + options = None + + return ibis_geo.GeoRegionStats( + arg=geography, # type: ignore + raster_id=ibis.literal(op.raster_id, type=ibis_dtypes.string()), # type: ignore + band=band, # type: ignore + include=include, # type: ignore + options=options, # type: ignore + ).to_expr() + + @register_unary_op(ops.GeoStSimplifyOp, pass_op=True) def st_simplify_op_impl(x: ibis_types.Value, op: ops.GeoStSimplifyOp): x = cast(ibis_types.GeoSpatialValue, x) diff --git a/bigframes/core/compile/sqlglot/expressions/geo_ops.py b/bigframes/core/compile/sqlglot/expressions/geo_ops.py index 53a50fab47..4585a7f073 100644 --- a/bigframes/core/compile/sqlglot/expressions/geo_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/geo_ops.py @@ -74,6 +74,32 @@ def _(expr: TypedExpr, op: ops.GeoStLengthOp) -> sge.Expression: return sge.func("ST_LENGTH", expr.expr) +@register_unary_op(ops.GeoStRegionStatsOp, pass_op=True) +def _( + geography: TypedExpr, + op: ops.GeoStRegionStatsOp, +): + args = [geography.expr, sge.convert(op.raster_id)] + if op.band: + args.append(sge.Kwarg(this="band", expression=sge.convert(op.band))) + if op.include: + args.append(sge.Kwarg(this="include", expression=sge.convert(op.include))) + if op.options: + args.append( + sge.Kwarg(this="options", expression=sge.JSON(this=sge.convert(op.options))) + ) + return sge.func("ST_REGIONSTATS", *args) + + +@register_unary_op(ops.GeoStSimplifyOp, pass_op=True) +def _(expr: TypedExpr, op: ops.GeoStSimplifyOp) -> sge.Expression: + return sge.func( + "ST_SIMPLIFY", + expr.expr, + sge.convert(op.tolerance_meters), + ) + + @register_unary_op(ops.geo_x_op) def _(expr: TypedExpr) -> sge.Expression: return sge.func("SAFE.ST_X", expr.expr) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index cb03943ada..2a0beb3fb3 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -121,6 +121,7 @@ GeoStBufferOp, GeoStDistanceOp, GeoStLengthOp, + GeoStRegionStatsOp, GeoStSimplifyOp, ) from bigframes.operations.json_ops import ( @@ -415,12 +416,13 @@ "geo_st_geogpoint_op", "geo_st_intersection_op", "geo_st_isclosed_op", - "GeoStBufferOp", - "GeoStLengthOp", - "GeoStSimplifyOp", "geo_x_op", "geo_y_op", + "GeoStBufferOp", "GeoStDistanceOp", + "GeoStLengthOp", + "GeoStRegionStatsOp", + "GeoStSimplifyOp", # AI ops "AIClassify", "AIGenerate", diff --git a/bigframes/operations/geo_ops.py b/bigframes/operations/geo_ops.py index 86e913d543..75fef1b832 100644 --- a/bigframes/operations/geo_ops.py +++ b/bigframes/operations/geo_ops.py @@ -13,6 +13,7 @@ # limitations under the License. import dataclasses +from typing import Optional from bigframes import dtypes from bigframes.operations import base_ops @@ -135,6 +136,29 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT return dtypes.FLOAT_DTYPE +@dataclasses.dataclass(frozen=True) +class GeoStRegionStatsOp(base_ops.UnaryOp): + """See: https://cloud.google.com/bigquery/docs/reference/standard-sql/geography_functions#st_regionstats""" + + name = "geo_st_regionstats" + raster_id: str + band: Optional[str] + include: Optional[str] + options: Optional[str] + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return dtypes.struct_type( + [ + ("min", dtypes.FLOAT_DTYPE), + ("max", dtypes.FLOAT_DTYPE), + ("sum", dtypes.FLOAT_DTYPE), + ("count", dtypes.INT_DTYPE), + ("mean", dtypes.FLOAT_DTYPE), + ("area", dtypes.FLOAT_DTYPE), + ] + ) + + @dataclasses.dataclass(frozen=True) class GeoStSimplifyOp(base_ops.UnaryOp): name = "st_simplify" diff --git a/samples/snippets/st_regionstats_test.py b/samples/snippets/st_regionstats_test.py new file mode 100644 index 0000000000..f0f4963a82 --- /dev/null +++ b/samples/snippets/st_regionstats_test.py @@ -0,0 +1,80 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Code sample for https://docs.cloud.google.com/bigquery/docs/raster-data#analytics-hub-source""" + + +def test_st_regionstats() -> None: + project_id = "bigframes-dev" + + # [START bigquery_dataframes_st_regionstats] + import datetime + from typing import cast + + import bigframes.bigquery as bbq + import bigframes.pandas as bpd + + # TODO: Set the project_id to your Google Cloud project ID. + # project_id = "your-project-id" + bpd.options.bigquery.project = project_id + + # TODO: Set the dataset_id to the ID of the dataset that contains the + # `climate` table. This is likely a linked dataset to Earth Engine. + # See: https://cloud.google.com/bigquery/docs/link-earth-engine + linked_dataset = "era5_land_daily_aggregated" + + # For the best efficiency, use partial ordering mode. + bpd.options.bigquery.ordering_mode = "partial" + + # Load the table of country boundaries. + countries = bpd.read_gbq("bigquery-public-data.overture_maps.division_area") + + # Filter to just the countries. + countries = countries[countries["subtype"] == "country"].copy() + countries["name"] = countries["names"].struct.field("primary") + countries["simplified_geometry"] = bbq.st_simplify( + countries["geometry"], + tolerance_meters=10_000, + ) + + # Get the reference to the temperature data from a linked dataset. + # Note: This sample assumes you have a linked dataset to Earth Engine. + image_href = ( + bpd.read_gbq(f"{project_id}.{linked_dataset}.climate") + .set_index("start_datetime") + .loc[[datetime.datetime(2025, 1, 1, tzinfo=datetime.timezone.utc)], :] + ) + raster_id = image_href["assets"].struct.field("image").struct.field("href") + raster_id = raster_id.item() + stats = bbq.st_regionstats( + countries["simplified_geometry"], + raster_id=cast(str, raster_id), + band="temperature_2m", + ) + + # Extract the mean and convert from Kelvin to Celsius. + countries["mean_temperature"] = stats.struct.field("mean") - 273.15 + + # Sort by the mean temperature to find the warmest countries. + result = countries[["name", "mean_temperature"]].sort_values( + "mean_temperature", ascending=False + ) + print(result.head(10)) + # [END bigquery_dataframes_st_regionstats] + + assert len(result) > 0 + + +if __name__ == "__main__": + test_st_regionstats() diff --git a/setup.py b/setup.py index abc760b691..fa663f66d5 100644 --- a/setup.py +++ b/setup.py @@ -54,7 +54,8 @@ "pydata-google-auth >=1.8.2", "requests >=2.27.1", "shapely >=1.8.5", - "sqlglot >=23.6.3", + # 25.20.0 introduces this fix https://github.com/TobikoData/sqlmesh/issues/3095 for rtrim/ltrim. + "sqlglot >=25.20.0", "tabulate >=0.9", "ipywidgets >=7.7.1", "humanize >=4.6.0", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index eceec07dc4..b8dc8697d6 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -21,7 +21,7 @@ pydata-google-auth==1.8.2 requests==2.27.1 scikit-learn==1.2.2 shapely==1.8.5 -sqlglot==23.6.3 +sqlglot==25.20.0 tabulate==0.9 ipywidgets==7.7.1 humanize==4.6.0 diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_geo/test_st_regionstats/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_geo/test_st_regionstats/out.sql new file mode 100644 index 0000000000..63076077cf --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_geo/test_st_regionstats/out.sql @@ -0,0 +1,36 @@ +WITH `bfcte_0` AS ( + SELECT + * + FROM UNNEST(ARRAY>[STRUCT('POINT(1 1)', 0)]) +), `bfcte_1` AS ( + SELECT + *, + ST_REGIONSTATS( + `bfcol_0`, + 'ee://some/raster/uri', + band => 'band1', + include => 'some equation', + options => JSON '{"scale": 100}' + ) AS `bfcol_2` + FROM `bfcte_0` +), `bfcte_2` AS ( + SELECT + *, + `bfcol_2`.`min` AS `bfcol_5`, + `bfcol_2`.`max` AS `bfcol_6`, + `bfcol_2`.`sum` AS `bfcol_7`, + `bfcol_2`.`count` AS `bfcol_8`, + `bfcol_2`.`mean` AS `bfcol_9`, + `bfcol_2`.`area` AS `bfcol_10` + FROM `bfcte_1` +) +SELECT + `bfcol_5` AS `min`, + `bfcol_6` AS `max`, + `bfcol_7` AS `sum`, + `bfcol_8` AS `count`, + `bfcol_9` AS `mean`, + `bfcol_10` AS `area` +FROM `bfcte_2` +ORDER BY + `bfcol_1` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_geo/test_st_regionstats_without_optional_args/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_geo/test_st_regionstats_without_optional_args/out.sql new file mode 100644 index 0000000000..f794711961 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_geo/test_st_regionstats_without_optional_args/out.sql @@ -0,0 +1,30 @@ +WITH `bfcte_0` AS ( + SELECT + * + FROM UNNEST(ARRAY>[STRUCT('POINT(1 1)', 0)]) +), `bfcte_1` AS ( + SELECT + *, + ST_REGIONSTATS(`bfcol_0`, 'ee://some/raster/uri') AS `bfcol_2` + FROM `bfcte_0` +), `bfcte_2` AS ( + SELECT + *, + `bfcol_2`.`min` AS `bfcol_5`, + `bfcol_2`.`max` AS `bfcol_6`, + `bfcol_2`.`sum` AS `bfcol_7`, + `bfcol_2`.`count` AS `bfcol_8`, + `bfcol_2`.`mean` AS `bfcol_9`, + `bfcol_2`.`area` AS `bfcol_10` + FROM `bfcte_1` +) +SELECT + `bfcol_5` AS `min`, + `bfcol_6` AS `max`, + `bfcol_7` AS `sum`, + `bfcol_8` AS `count`, + `bfcol_9` AS `mean`, + `bfcol_10` AS `area` +FROM `bfcte_2` +ORDER BY + `bfcol_1` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_geo/test_st_simplify/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_geo/test_st_simplify/out.sql new file mode 100644 index 0000000000..b8dd1587a8 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_geo/test_st_simplify/out.sql @@ -0,0 +1,15 @@ +WITH `bfcte_0` AS ( + SELECT + * + FROM UNNEST(ARRAY>[STRUCT('POINT(1 1)', 0)]) +), `bfcte_1` AS ( + SELECT + *, + ST_SIMPLIFY(`bfcol_0`, 123.125) AS `bfcol_2` + FROM `bfcte_0` +) +SELECT + `bfcol_2` AS `0` +FROM `bfcte_1` +ORDER BY + `bfcol_1` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/test_compile_geo.py b/tests/unit/core/compile/sqlglot/test_compile_geo.py new file mode 100644 index 0000000000..50de1488e6 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/test_compile_geo.py @@ -0,0 +1,52 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +import bigframes.bigquery as bbq +import bigframes.geopandas as gpd + +pytest.importorskip("pytest_snapshot") + + +def test_st_regionstats(compiler_session, snapshot): + geos = gpd.GeoSeries(["POINT(1 1)"], session=compiler_session) + result = bbq.st_regionstats( + geos, + "ee://some/raster/uri", + band="band1", + include="some equation", + options={"scale": 100}, + ) + assert "area" in result.struct.dtypes.index + snapshot.assert_match(result.struct.explode().sql, "out.sql") + + +def test_st_regionstats_without_optional_args(compiler_session, snapshot): + geos = gpd.GeoSeries(["POINT(1 1)"], session=compiler_session) + result = bbq.st_regionstats( + geos, + "ee://some/raster/uri", + ) + assert "area" in result.struct.dtypes.index + snapshot.assert_match(result.struct.explode().sql, "out.sql") + + +def test_st_simplify(compiler_session, snapshot): + geos = gpd.GeoSeries(["POINT(1 1)"], session=compiler_session) + result = bbq.st_simplify( + geos, + tolerance_meters=123.125, + ) + snapshot.assert_match(result.to_frame().sql, "out.sql") diff --git a/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py b/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py index cbc51e59d6..c01d87fb28 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py @@ -811,7 +811,7 @@ def visit_DefaultLiteral(self, op, *, value, dtype): elif dtype.is_uuid(): return self.cast(str(value), dtype) elif dtype.is_json(): - return sge.ParseJSON(this=sge.convert(str(value))) + return sge.JSON(this=sge.convert(str(value))) elif dtype.is_geospatial(): wkt = value if isinstance(value, str) else value.wkt return self.f.st_geogfromtext(wkt) diff --git a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py index cf205b69d6..95d28991a9 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py @@ -261,6 +261,16 @@ def visit_BoundingBox(self, op, *, arg): visit_GeoXMax = visit_GeoXMin = visit_GeoYMax = visit_GeoYMin = visit_BoundingBox + def visit_GeoRegionStats(self, op, *, arg, raster_id, band, include, options): + args = [arg, raster_id] + if op.band: + args.append(sge.Kwarg(this="band", expression=band)) + if op.include: + args.append(sge.Kwarg(this="include", expression=include)) + if op.options: + args.append(sge.Kwarg(this="options", expression=options)) + return sge.func("ST_REGIONSTATS", *args) + def visit_GeoSimplify(self, op, *, arg, tolerance, preserve_collapsed): if ( not isinstance(op.preserve_collapsed, ops.Literal) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/geospatial.py b/third_party/bigframes_vendored/ibis/expr/operations/geospatial.py index 0be832af78..efe038599a 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/geospatial.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/geospatial.py @@ -343,6 +343,28 @@ class GeoNRings(GeoSpatialUnOp): dtype = dt.int64 +@public +class GeoRegionStats(GeoSpatialUnOp): + """Returns results of ST_REGIONSTATS.""" + + raster_id: Value[dt.String] + band: Value[dt.String] + include: Value[dt.String] + options: Value[dt.JSON] + + dtype = dt.Struct( + fields={ + "count": dt.int64, + "min": dt.float64, + "max": dt.float64, + "stdDev": dt.float64, + "sum": dt.float64, + "mean": dt.float64, + "area": dt.float64, + } + ) + + @public class GeoSRID(GeoSpatialUnOp): """Returns the spatial reference identifier for the ST_Geometry."""