Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions bigframes/bigquery/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
st_intersection,
st_isclosed,
st_length,
st_regionstats,
st_simplify,
)
from bigframes.bigquery._operations.json import (
Expand Down Expand Up @@ -81,6 +82,7 @@
st_intersection,
st_isclosed,
st_length,
st_regionstats,
st_simplify,
# json ops
json_extract,
Expand Down
63 changes: 62 additions & 1 deletion bigframes/bigquery/_operations/geo.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,13 @@

from __future__ import annotations

from typing import Union
import json
from typing import Mapping, Optional, Union

import shapely # type: ignore

from bigframes import operations as ops
import bigframes.dataframe
import bigframes.geopandas
import bigframes.series

Expand Down Expand Up @@ -677,6 +679,65 @@ def st_length(
return series


def st_regionstats(
    geography: Union[bigframes.series.Series, bigframes.geopandas.GeoSeries],
    raster_id: str,
    band: Optional[str] = None,
    include: Optional[str] = None,
    options: Optional[Mapping[str, Union[str, int, float]]] = None,
) -> bigframes.series.Series:
    """Returns statistics summarizing the pixel values of the raster image
    referenced by raster_id that intersect with geography.

    The statistics include the count, minimum, maximum, sum, standard
    deviation, mean, and area of the valid pixels of the raster band selected
    by ``band``. Google Earth Engine computes the results of the function call.

    See: https://cloud.google.com/bigquery/docs/reference/standard-sql/geography_functions#st_regionstats

    Args:
        geography (bigframes.series.Series | bigframes.geopandas.GeoSeries):
            A series of geography objects to intersect with the raster image.
        raster_id (str):
            A string that identifies a raster image. The following formats are
            supported:

            * A URI from an image table provided by Google Earth Engine in
              BigQuery sharing (formerly Analytics Hub).
            * A URI for a readable GeoTIFF raster file.
            * A Google Earth Engine asset path that references public catalog
              data or project-owned assets with read access.
        band (Optional[str]):
            A string in one of the following formats:

            * A single band within the raster image specified by raster_id.
            * A formula to compute a value from the available bands in the
              raster image. The formula uses the Google Earth Engine image
              expression syntax. Bands can be referenced by their name in
              expressions.

            If you don't specify a band, the first band of the image is used.
        include (Optional[str]):
            An optional string formula that uses the Google Earth Engine image
            expression syntax to compute a pixel weight. The formula should
            return values from 0 to 1. Values outside this range are set to the
            nearest limit, either 0 or 1. A value of 0 means that the pixel is
            invalid and it's excluded from analysis. A positive value means that
            a pixel is valid. Values between 0 and 1 represent proportional
            weights for calculations, such as weighted means.
        options (Mapping[str, Union[str, int, float]], optional):
            A mapping of options to pass to the function. It is serialized to
            a JSON string before being handed to the operation; an empty
            mapping is treated the same as ``None``. See the BigQuery
            documentation for a list of available options.

    Returns:
        bigframes.pandas.Series:
            A STRUCT Series containing the computed statistics.
    """
    # NOTE(review): the optional arguments are positional-or-keyword here,
    # unlike the keyword-only ``use_spheroid`` of ``st_length``. Per the review
    # discussion, BigQuery SQL reportedly accepts the optional ST_REGIONSTATS
    # parameters either positionally or by keyword, so the plain Python default
    # was kept.

    # json.dumps only serializes real dict instances, so copy the mapping first;
    # otherwise a read-only or custom Mapping (which the annotation allows)
    # would raise TypeError.
    serialized_options = json.dumps(dict(options)) if options else None

    op = ops.GeoStRegionStatsOp(
        raster_id=raster_id,
        band=band,
        include=include,
        options=serialized_options,
    )
    return geography._apply_unary_op(op)


def st_simplify(
geography: "bigframes.series.Series",
tolerance_meters: float,
Expand Down
31 changes: 31 additions & 0 deletions bigframes/core/compile/ibis_compiler/operations/geo_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@

from typing import cast

from bigframes_vendored import ibis
from bigframes_vendored.ibis.expr import types as ibis_types
import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes
import bigframes_vendored.ibis.expr.operations.geospatial as ibis_geo
import bigframes_vendored.ibis.expr.operations.udf as ibis_udf

from bigframes.core.compile.ibis_compiler import scalar_op_compiler
Expand Down Expand Up @@ -101,6 +103,35 @@ def geo_st_isclosed_op_impl(x: ibis_types.Value):
return st_isclosed(x)


@register_unary_op(ops.GeoStRegionStatsOp, pass_op=True)
def geo_st_regionstats_op_impl(
    geography: ibis_types.Value,
    op: ops.GeoStRegionStatsOp,
):
    """Build the ibis ``GeoRegionStats`` expression for ``GeoStRegionStatsOp``.

    ``raster_id`` is always passed as a string literal. ``band`` and
    ``include`` become string literals and ``options`` a JSON literal only
    when they are truthy; otherwise ``None`` is passed for that argument.
    """
    band_literal = (
        ibis.literal(op.band, type=ibis_dtypes.string()) if op.band else None
    )
    include_literal = (
        ibis.literal(op.include, type=ibis_dtypes.string()) if op.include else None
    )
    options_literal = (
        ibis.literal(op.options, type=ibis_dtypes.json()) if op.options else None
    )
    raster_literal = ibis.literal(op.raster_id, type=ibis_dtypes.string())

    region_stats = ibis_geo.GeoRegionStats(
        arg=geography,  # type: ignore
        raster_id=raster_literal,  # type: ignore
        band=band_literal,  # type: ignore
        include=include_literal,  # type: ignore
        options=options_literal,  # type: ignore
    )
    return region_stats.to_expr()


@register_unary_op(ops.GeoStSimplifyOp, pass_op=True)
def st_simplify_op_impl(x: ibis_types.Value, op: ops.GeoStSimplifyOp):
x = cast(ibis_types.GeoSpatialValue, x)
Expand Down
26 changes: 26 additions & 0 deletions bigframes/core/compile/sqlglot/expressions/geo_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,32 @@ def _(expr: TypedExpr, op: ops.GeoStLengthOp) -> sge.Expression:
return sge.func("ST_LENGTH", expr.expr)


@register_unary_op(ops.GeoStRegionStatsOp, pass_op=True)
def _(
    geography: TypedExpr,
    op: ops.GeoStRegionStatsOp,
) -> sge.Expression:
    """Compile ``GeoStRegionStatsOp`` into an ``ST_REGIONSTATS`` function call.

    The geography expression and the raster id are passed positionally.
    ``band``, ``include`` and ``options`` are appended as keyword arguments
    only when they are truthy; ``options`` is additionally wrapped in a
    ``JSON`` expression.
    """
    args = [geography.expr, sge.convert(op.raster_id)]

    # The two optional string arguments are handled identically.
    for keyword, value in (("band", op.band), ("include", op.include)):
        if value:
            args.append(sge.Kwarg(this=keyword, expression=sge.convert(value)))

    if op.options:
        options_json = sge.JSON(this=sge.convert(op.options))
        args.append(sge.Kwarg(this="options", expression=options_json))

    return sge.func("ST_REGIONSTATS", *args)


@register_unary_op(ops.GeoStSimplifyOp, pass_op=True)
def _(expr: TypedExpr, op: ops.GeoStSimplifyOp) -> sge.Expression:
    """Compile ``GeoStSimplifyOp`` into ``ST_SIMPLIFY(<expr>, <tolerance_meters>)``."""
    geography = expr.expr
    tolerance = sge.convert(op.tolerance_meters)
    return sge.func("ST_SIMPLIFY", geography, tolerance)


@register_unary_op(ops.geo_x_op)
def _(expr: TypedExpr) -> sge.Expression:
    """Compile ``geo_x_op`` into a ``SAFE.ST_X`` call on the operand."""
    operand = expr.expr
    return sge.func("SAFE.ST_X", operand)
Expand Down
8 changes: 5 additions & 3 deletions bigframes/operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@
GeoStBufferOp,
GeoStDistanceOp,
GeoStLengthOp,
GeoStRegionStatsOp,
GeoStSimplifyOp,
)
from bigframes.operations.json_ops import (
Expand Down Expand Up @@ -415,12 +416,13 @@
"geo_st_geogpoint_op",
"geo_st_intersection_op",
"geo_st_isclosed_op",
"GeoStBufferOp",
"GeoStLengthOp",
"GeoStSimplifyOp",
"geo_x_op",
"geo_y_op",
"GeoStBufferOp",
"GeoStDistanceOp",
"GeoStLengthOp",
"GeoStRegionStatsOp",
"GeoStSimplifyOp",
# AI ops
"AIClassify",
"AIGenerate",
Expand Down
24 changes: 24 additions & 0 deletions bigframes/operations/geo_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

import dataclasses
from typing import Optional

from bigframes import dtypes
from bigframes.operations import base_ops
Expand Down Expand Up @@ -135,6 +136,29 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
return dtypes.FLOAT_DTYPE


@dataclasses.dataclass(frozen=True)
class GeoStRegionStatsOp(base_ops.UnaryOp):
    """Unary operation carrying the arguments of BigQuery's ``ST_REGIONSTATS``.

    See: https://cloud.google.com/bigquery/docs/reference/standard-sql/geography_functions#st_regionstats
    """

    name = "geo_st_regionstats"
    # Identifier of the raster image to summarize.
    raster_id: str
    # Optional band selector; None leaves the choice to the function default.
    band: Optional[str]
    # Optional pixel-weight formula.
    include: Optional[str]
    # Optional function options, already serialized to a JSON string.
    options: Optional[str]

    def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
        """Return the STRUCT dtype of the result, regardless of the input types.

        Every statistic is a float except ``count``, which is an integer.
        """
        # NOTE(review): the public st_regionstats docstring also mentions a
        # standard-deviation statistic, which has no field here -- confirm
        # whether the omission is intentional.
        stat_fields = [
            (stat, dtypes.INT_DTYPE if stat == "count" else dtypes.FLOAT_DTYPE)
            for stat in ("min", "max", "sum", "count", "mean", "area")
        ]
        return dtypes.struct_type(stat_fields)


@dataclasses.dataclass(frozen=True)
class GeoStSimplifyOp(base_ops.UnaryOp):
name = "st_simplify"
Expand Down
80 changes: 80 additions & 0 deletions samples/snippets/st_regionstats_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Code sample for https://docs.cloud.google.com/bigquery/docs/raster-data#analytics-hub-source"""


def test_st_regionstats() -> None:
    """Run the ``st_regionstats`` code sample and check it returns rows.

    The statements between the ``[START ...]`` and ``[END ...]`` tags form the
    sample itself; the lines around them only supply a project ID and assert
    that the final result is not empty.
    """
    project_id = "bigframes-dev"

    # [START bigquery_dataframes_st_regionstats]
    import datetime
    from typing import cast

    import bigframes.bigquery as bbq
    import bigframes.pandas as bpd

    # TODO: Set the project_id to your Google Cloud project ID.
    # project_id = "your-project-id"
    bpd.options.bigquery.project = project_id

    # TODO: Set linked_dataset to the ID of the dataset that contains the
    # `climate` table. This is likely a linked dataset to Earth Engine.
    # See: https://cloud.google.com/bigquery/docs/link-earth-engine
    linked_dataset = "era5_land_daily_aggregated"

    # For the best efficiency, use partial ordering mode.
    bpd.options.bigquery.ordering_mode = "partial"

    # Load the table of country boundaries.
    countries = bpd.read_gbq("bigquery-public-data.overture_maps.division_area")

    # Filter to just the countries.
    countries = countries[countries["subtype"] == "country"].copy()
    countries["name"] = countries["names"].struct.field("primary")
    # Simplify the boundaries with a 10,000 meter tolerance before passing
    # them to st_regionstats.
    countries["simplified_geometry"] = bbq.st_simplify(
        countries["geometry"],
        tolerance_meters=10_000,
    )

    # Get the reference to the temperature data from a linked dataset.
    # Note: This sample assumes you have a linked dataset to Earth Engine.
    # Only the row whose start_datetime is 2025-01-01 00:00 UTC is selected.
    image_href = (
        bpd.read_gbq(f"{project_id}.{linked_dataset}.climate")
        .set_index("start_datetime")
        .loc[[datetime.datetime(2025, 1, 1, tzinfo=datetime.timezone.utc)], :]
    )
    # Drill into the nested assets.image.href field to get the raster URI.
    raster_id = image_href["assets"].struct.field("image").struct.field("href")
    # Presumably the filter leaves exactly one row, so .item() yields that
    # single href as a plain Python value; the cast below tells the type
    # checker it is a str.
    raster_id = raster_id.item()
    stats = bbq.st_regionstats(
        countries["simplified_geometry"],
        raster_id=cast(str, raster_id),
        band="temperature_2m",
    )

    # Extract the mean and convert from Kelvin to Celsius.
    countries["mean_temperature"] = stats.struct.field("mean") - 273.15

    # Sort by the mean temperature to find the warmest countries.
    result = countries[["name", "mean_temperature"]].sort_values(
        "mean_temperature", ascending=False
    )
    print(result.head(10))
    # [END bigquery_dataframes_st_regionstats]

    assert len(result) > 0


# Allow running the sample directly as a script.
if __name__ == "__main__":
    test_st_regionstats()
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@
"pydata-google-auth >=1.8.2",
"requests >=2.27.1",
"shapely >=1.8.5",
"sqlglot >=23.6.3",
# 25.20.0 introduces this fix https://github.com/TobikoData/sqlmesh/issues/3095 for rtrim/ltrim.
"sqlglot >=25.20.0",
"tabulate >=0.9",
"ipywidgets >=7.7.1",
"humanize >=4.6.0",
Expand Down
2 changes: 1 addition & 1 deletion testing/constraints-3.9.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ pydata-google-auth==1.8.2
requests==2.27.1
scikit-learn==1.2.2
shapely==1.8.5
sqlglot==23.6.3
sqlglot==25.20.0
tabulate==0.9
ipywidgets==7.7.1
humanize==4.6.0
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
WITH `bfcte_0` AS (
SELECT
*
FROM UNNEST(ARRAY<STRUCT<`bfcol_0` STRING, `bfcol_1` INT64>>[STRUCT('POINT(1 1)', 0)])
), `bfcte_1` AS (
SELECT
*,
ST_REGIONSTATS(
`bfcol_0`,
'ee://some/raster/uri',
band => 'band1',
include => 'some equation',
options => JSON '{"scale": 100}'
) AS `bfcol_2`
FROM `bfcte_0`
), `bfcte_2` AS (
SELECT
*,
`bfcol_2`.`min` AS `bfcol_5`,
`bfcol_2`.`max` AS `bfcol_6`,
`bfcol_2`.`sum` AS `bfcol_7`,
`bfcol_2`.`count` AS `bfcol_8`,
`bfcol_2`.`mean` AS `bfcol_9`,
`bfcol_2`.`area` AS `bfcol_10`
FROM `bfcte_1`
)
SELECT
`bfcol_5` AS `min`,
`bfcol_6` AS `max`,
`bfcol_7` AS `sum`,
`bfcol_8` AS `count`,
`bfcol_9` AS `mean`,
`bfcol_10` AS `area`
FROM `bfcte_2`
ORDER BY
`bfcol_1` ASC NULLS LAST
Loading