From 8cfaca6e6b222f1140c25222ffffee9a40a2a229 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 7 Oct 2025 16:24:28 +0000 Subject: [PATCH 01/63] docs: remove import bigframes.pandas as bpd boilerplate from many samples Also, fixes several constructors that didn't take a session for compatibility with multi-session applications. --- bigframes/bigquery/_operations/ai.py | 21 +- bigframes/bigquery/_operations/approx_agg.py | 1 - bigframes/bigquery/_operations/array.py | 6 - bigframes/bigquery/_operations/datetime.py | 10 +- bigframes/bigquery/_operations/geo.py | 13 - bigframes/bigquery/_operations/json.py | 12 - bigframes/bigquery/_operations/search.py | 1 - bigframes/bigquery/_operations/sql.py | 3 - bigframes/bigquery/_operations/struct.py | 1 - {tests/unit => bigframes}/conftest.py | 21 + bigframes/core/compile/polars/compiler.py | 4 +- bigframes/core/indexes/base.py | 11 +- bigframes/core/log_adapter.py | 4 +- bigframes/core/reshape/tile.py | 7 +- bigframes/core/tools/datetimes.py | 10 +- bigframes/dataframe.py | 6 +- bigframes/ml/compose.py | 1 - bigframes/operations/ai.py | 17 +- bigframes/operations/base.py | 13 +- bigframes/operations/semantics.py | 20 +- bigframes/operations/strings.py | 1 - bigframes/pandas/__init__.py | 18 +- bigframes/series.py | 5 - bigframes/session/__init__.py | 102 +++- dummy.pkl | Bin 0 -> 1150 bytes .../bigframes_vendored/geopandas/geoseries.py | 9 - .../bigframes_vendored/pandas/AUTHORS.md | 1 - .../bigframes_vendored/pandas/README.md | 2 - .../bigframes_vendored/pandas/conftest.py | 45 ++ .../pandas/core/arrays/arrow/accessors.py | 25 +- .../pandas/core/arrays/datetimelike.py | 7 +- .../pandas/core/computation/eval.py | 3 - .../pandas/core/computation/expr.py | 3 - .../pandas/core/computation/ops.py | 1 - .../bigframes_vendored/pandas/core/frame.py | 436 ++--------------- .../bigframes_vendored/pandas/core/generic.py | 45 +- .../pandas/core/groupby/__init__.py | 127 +---- .../pandas/core/indexes/accessor.py | 
47 -- .../pandas/core/indexes/base.py | 108 +---- .../pandas/core/indexes/datetimes.py | 24 - .../pandas/core/indexes/multi.py | 4 - .../pandas/core/reshape/tile.py | 3 - .../bigframes_vendored/pandas/core/series.py | 439 ++---------------- .../pandas/core/strings/accessor.py | 104 +---- .../pandas/core/tools/datetimes.py | 4 +- .../pandas/core/tools/timedeltas.py | 1 - .../bigframes_vendored/pandas/io/gbq.py | 1 - .../bigframes_vendored/pandas/io/parquet.py | 2 - .../pandas/io/parsers/readers.py | 4 - .../bigframes_vendored/pandas/io/pickle.py | 2 - .../pandas/pandas/_typing.py | 2 - .../pandas/plotting/_core.py | 12 +- .../sklearn/cluster/_kmeans.py | 1 - .../sklearn/decomposition/_mf.py | 1 - .../sklearn/decomposition/_pca.py | 1 - .../sklearn/impute/_base.py | 1 - .../sklearn/linear_model/_base.py | 1 - .../sklearn/linear_model/_logistic.py | 1 - .../sklearn/metrics/_classification.py | 5 - .../sklearn/metrics/_ranking.py | 3 - .../sklearn/metrics/_regression.py | 3 - .../sklearn/model_selection/_split.py | 2 - .../sklearn/model_selection/_validation.py | 1 - .../sklearn/preprocessing/_encoder.py | 1 - 64 files changed, 321 insertions(+), 1469 deletions(-) rename {tests/unit => bigframes}/conftest.py (57%) create mode 100644 dummy.pkl create mode 100644 third_party/bigframes_vendored/pandas/conftest.py diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index a789310683..7698c2c95c 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -52,14 +52,13 @@ def generate( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> country = bpd.Series(["Japan", "Canada"]) - >>> bbq.ai.generate(("What's the capital city of ", country, " one word only")) + >>> bbq.ai.generate(("What's the capital city of ", country, " one word only")) # doctest: +SKIP 0 {'result': 'Tokyo\\n', 'full_response': '{"cand... 
1 {'result': 'Ottawa\\n', 'full_response': '{"can... dtype: struct>, status: string>[pyarrow] - >>> bbq.ai.generate(("What's the capital city of ", country, " one word only")).struct.field("result") + >>> bbq.ai.generate(("What's the capital city of ", country, " one word only")).struct.field("result") # doctest: +SKIP 0 Tokyo\\n 1 Ottawa\\n Name: result, dtype: string @@ -125,7 +124,6 @@ def generate_bool( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... "col_1": ["apple", "bear", "pear"], ... "col_2": ["fruit", "animal", "animal"] @@ -203,8 +201,7 @@ def generate_int( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - >>> animal = bpd.Series(["Kangaroo", "Rabbit", "Spider"]) + >>> animal = bpd.Series(["Kangaroo", "Rabbit", "Spider"]) >>> bbq.ai.generate_int(("How many legs does a ", animal, " have?")) 0 {'result': 2, 'full_response': '{"candidates":... 1 {'result': 4, 'full_response': '{"candidates":... @@ -278,8 +275,7 @@ def generate_double( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - >>> animal = bpd.Series(["Kangaroo", "Rabbit", "Spider"]) + >>> animal = bpd.Series(["Kangaroo", "Rabbit", "Spider"]) >>> bbq.ai.generate_double(("How many legs does a ", animal, " have?")) 0 {'result': 2.0, 'full_response': '{"candidates... 1 {'result': 4.0, 'full_response': '{"candidates... 
@@ -350,8 +346,7 @@ def if_( **Examples:** >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - >>> us_state = bpd.Series(["Massachusetts", "Illinois", "Hawaii"]) + >>> us_state = bpd.Series(["Massachusetts", "Illinois", "Hawaii"]) >>> bbq.ai.if_((us_state, " has a city called Springfield")) 0 True 1 True @@ -400,8 +395,7 @@ def classify( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'creature': ['Cat', 'Salmon']}) + >>> df = bpd.DataFrame({'creature': ['Cat', 'Salmon']}) >>> df['type'] = bbq.ai.classify(df['creature'], ['Mammal', 'Fish']) >>> df creature type @@ -451,8 +445,7 @@ def score( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - >>> animal = bpd.Series(["Tiger", "Rabbit", "Blue Whale"]) + >>> animal = bpd.Series(["Tiger", "Rabbit", "Blue Whale"]) >>> bbq.ai.score(("Rank the relative weights of ", animal, " on the scale from 1 to 3")) # doctest: +SKIP 0 2.0 1 1.0 diff --git a/bigframes/bigquery/_operations/approx_agg.py b/bigframes/bigquery/_operations/approx_agg.py index 696f8f5a66..73b6fdbb73 100644 --- a/bigframes/bigquery/_operations/approx_agg.py +++ b/bigframes/bigquery/_operations/approx_agg.py @@ -40,7 +40,6 @@ def approx_top_count( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["apple", "apple", "pear", "pear", "pear", "banana"]) >>> bbq.approx_top_count(s, number=2) [{'value': 'pear', 'count': 3}, {'value': 'apple', 'count': 2}] diff --git a/bigframes/bigquery/_operations/array.py b/bigframes/bigquery/_operations/array.py index 4af1416127..239bc9566a 100644 --- a/bigframes/bigquery/_operations/array.py +++ b/bigframes/bigquery/_operations/array.py @@ -40,8 +40,6 @@ def array_length(series: series.Series) -> series.Series: >>> 
import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([[1, 2, 8, 3], [], [3, 4]]) >>> bbq.array_length(s) 0 4 @@ -78,8 +76,6 @@ def array_agg( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> import numpy as np - >>> bpd.options.display.progress_bar = None For a SeriesGroupBy object: @@ -128,8 +124,6 @@ def array_to_string(series: series.Series, delimiter: str) -> series.Series: >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([["H", "i", "!"], ["Hello", "World"], np.nan, [], ["Hi"]]) >>> bbq.array_to_string(s, delimiter=", ") diff --git a/bigframes/bigquery/_operations/datetime.py b/bigframes/bigquery/_operations/datetime.py index f8767336dd..c4aba91a29 100644 --- a/bigframes/bigquery/_operations/datetime.py +++ b/bigframes/bigquery/_operations/datetime.py @@ -21,11 +21,8 @@ def unix_seconds(input: series.Series) -> series.Series: **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([pd.Timestamp("1970-01-02", tz="UTC"), pd.Timestamp("1970-01-03", tz="UTC")]) >>> bbq.unix_seconds(s) 0 86400 @@ -48,11 +45,8 @@ def unix_millis(input: series.Series) -> series.Series: **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([pd.Timestamp("1970-01-02", tz="UTC"), pd.Timestamp("1970-01-03", tz="UTC")]) >>> bbq.unix_millis(s) 0 86400000 @@ -75,10 +69,8 @@ def unix_micros(input: series.Series) -> series.Series: **Examples:** - >>> import pandas as pd - >>> import bigframes.pandas as bpd + >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s 
= bpd.Series([pd.Timestamp("1970-01-02", tz="UTC"), pd.Timestamp("1970-01-03", tz="UTC")]) >>> bbq.unix_micros(s) diff --git a/bigframes/bigquery/_operations/geo.py b/bigframes/bigquery/_operations/geo.py index 9a92a8960d..e5aa383779 100644 --- a/bigframes/bigquery/_operations/geo.py +++ b/bigframes/bigquery/_operations/geo.py @@ -53,8 +53,6 @@ def st_area( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Polygon, LineString, Point - >>> bpd.options.display.progress_bar = None - >>> series = bigframes.geopandas.GeoSeries( ... [ ... Polygon([(0.0, 0.0), (0.1, 0.1), (0.0, 0.1)]), @@ -125,8 +123,6 @@ def st_buffer( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Point - >>> bpd.options.display.progress_bar = None - >>> series = bigframes.geopandas.GeoSeries( ... [ ... Point(0, 0), @@ -195,8 +191,6 @@ def st_centroid( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Polygon, LineString, Point - >>> bpd.options.display.progress_bar = None - >>> series = bigframes.geopandas.GeoSeries( ... [ ... Polygon([(0.0, 0.0), (0.1, 0.1), (0.0, 0.1)]), @@ -250,8 +244,6 @@ def st_convexhull( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Polygon, LineString, Point - >>> bpd.options.display.progress_bar = None - >>> series = bigframes.geopandas.GeoSeries( ... [ ... 
Polygon([(0.0, 0.0), (0.1, 0.1), (0.0, 0.1)]), @@ -312,7 +304,6 @@ def st_difference( >>> import bigframes.bigquery as bbq >>> import bigframes.geopandas >>> from shapely.geometry import Polygon, LineString, Point - >>> bpd.options.display.progress_bar = None We can check two GeoSeries against each other, row by row: @@ -407,7 +398,6 @@ def st_distance( >>> import bigframes.bigquery as bbq >>> import bigframes.geopandas >>> from shapely.geometry import Polygon, LineString, Point - >>> bpd.options.display.progress_bar = None We can check two GeoSeries against each other, row by row. @@ -489,7 +479,6 @@ def st_intersection( >>> import bigframes.bigquery as bbq >>> import bigframes.geopandas >>> from shapely.geometry import Polygon, LineString, Point - >>> bpd.options.display.progress_bar = None We can check two GeoSeries against each other, row by row. @@ -583,7 +572,6 @@ def st_isclosed( >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Point, LineString, Polygon - >>> bpd.options.display.progress_bar = None >>> series = bigframes.geopandas.GeoSeries( ... [ @@ -650,7 +638,6 @@ def st_length( >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Polygon, LineString, Point, GeometryCollection - >>> bpd.options.display.progress_bar = None >>> series = bigframes.geopandas.GeoSeries( ... 
[ diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py index 656e59af0d..4e1f43aab0 100644 --- a/bigframes/bigquery/_operations/json.py +++ b/bigframes/bigquery/_operations/json.py @@ -49,8 +49,6 @@ def json_set( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"] >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")]) @@ -101,7 +99,6 @@ def json_extract( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}']) >>> bbq.json_extract(s, json_path="$.class") @@ -141,7 +138,6 @@ def json_extract_array( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) >>> bbq.json_extract_array(s) @@ -204,7 +200,6 @@ def json_extract_string_array( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) >>> bbq.json_extract_string_array(s) @@ -272,7 +267,6 @@ def json_query( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}']) >>> bbq.json_query(s, json_path="$.class") @@ -303,7 +297,6 @@ def json_query_array( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) >>> bbq.json_query_array(s) @@ -355,7 +348,6 @@ def json_value( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['{"name": "Jakob", "age": 
"6"}', '{"name": "Jakob", "age": []}']) >>> bbq.json_value(s, json_path="$.age") @@ -392,7 +384,6 @@ def json_value_array( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) >>> bbq.json_value_array(s) @@ -439,7 +430,6 @@ def to_json( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3]) >>> bbq.to_json(s) @@ -473,7 +463,6 @@ def to_json_string( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3]) >>> bbq.to_json_string(s) @@ -512,7 +501,6 @@ def parse_json( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}']) >>> s diff --git a/bigframes/bigquery/_operations/search.py b/bigframes/bigquery/_operations/search.py index c16c2af1a9..b65eed2475 100644 --- a/bigframes/bigquery/_operations/search.py +++ b/bigframes/bigquery/_operations/search.py @@ -111,7 +111,6 @@ def vector_search( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None DataFrame embeddings for which to find nearest neighbors. 
The ``ARRAY`` column is used as the search query: diff --git a/bigframes/bigquery/_operations/sql.py b/bigframes/bigquery/_operations/sql.py index a2de61fc21..295412fd75 100644 --- a/bigframes/bigquery/_operations/sql.py +++ b/bigframes/bigquery/_operations/sql.py @@ -36,9 +36,6 @@ def sql_scalar( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> import pandas as pd - >>> import pyarrow as pa - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1.5", "2.5", "3.5"]) >>> s = s.astype(pd.ArrowDtype(pa.decimal128(38, 9))) diff --git a/bigframes/bigquery/_operations/struct.py b/bigframes/bigquery/_operations/struct.py index 7cb826351c..a6304677ef 100644 --- a/bigframes/bigquery/_operations/struct.py +++ b/bigframes/bigquery/_operations/struct.py @@ -39,7 +39,6 @@ def struct(value: dataframe.DataFrame) -> series.Series: >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> import bigframes.series as series - >>> bpd.options.display.progress_bar = None >>> srs = series.Series([{"version": 1, "project": "pandas"}, {"version": 2, "project": "numpy"},]) >>> df = srs.struct.explode() diff --git a/tests/unit/conftest.py b/bigframes/conftest.py similarity index 57% rename from tests/unit/conftest.py rename to bigframes/conftest.py index a9b26afeef..e1f3f6d84c 100644 --- a/tests/unit/conftest.py +++ b/bigframes/conftest.py @@ -12,8 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations + +import numpy as np +import pandas as pd +import pyarrow as pa import pytest +import bigframes._config + @pytest.fixture(scope="session") def polars_session(): @@ -22,3 +29,17 @@ def polars_session(): from bigframes.testing import polars_session return polars_session.TestSession() + + +@pytest.fixture(autouse=True) +def default_doctest_imports(doctest_namespace, polars_session): + """ + Avoid some boilerplate in pandas-inspired tests. 
+ + See: https://docs.pytest.org/en/stable/how-to/doctest.html#doctest-namespace-fixture + """ + doctest_namespace["np"] = np + doctest_namespace["pd"] = pd + doctest_namespace["pa"] = pa + doctest_namespace["bpd"] = polars_session + bigframes._config.options.display.progress_bar = None diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index f7c742e852..059ec72076 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -493,9 +493,9 @@ def compile_agg_op( if isinstance(op, agg_ops.MedianOp): return pl.median(*inputs) if isinstance(op, agg_ops.AllOp): - return pl.all(*inputs) + return pl.col(inputs).cast(pl.Boolean).all() if isinstance(op, agg_ops.AnyOp): - return pl.any(*inputs) # type: ignore + return pl.col(inputs).cast(pl.Boolean).any() if isinstance(op, agg_ops.NuniqueOp): return pl.col(*inputs).drop_nulls().n_unique() if isinstance(op, agg_ops.MinOp): diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index a6b18fcb43..b79363aa0a 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -383,9 +383,16 @@ def to_series( name = self.name if name is None else name if index is None: - return bigframes.series.Series(data=self, index=self, name=name) + return bigframes.series.Series( + data=self, index=self, name=name, session=self._session + ) else: - return bigframes.series.Series(data=self, index=Index(index), name=name) + return bigframes.series.Series( + data=self, + index=Index(index, session=self._session), + name=name, + session=self._session, + ) def get_level_values(self, level) -> Index: level_n = level if isinstance(level, int) else self.names.index(level) diff --git a/bigframes/core/log_adapter.py b/bigframes/core/log_adapter.py index 3ec1e86dc7..8179ffbeed 100644 --- a/bigframes/core/log_adapter.py +++ b/bigframes/core/log_adapter.py @@ -155,7 +155,9 @@ def method_logger(method=None, /, *, 
custom_base_name: Optional[str] = None): def outer_wrapper(method): @functools.wraps(method) def wrapper(*args, **kwargs): - api_method_name = getattr(method, LOG_OVERRIDE_NAME, method.__name__) + api_method_name = getattr( + method, LOG_OVERRIDE_NAME, method.__name__ + ).lower() if custom_base_name is None: qualname_parts = getattr(method, "__qualname__", method.__name__).split( "." diff --git a/bigframes/core/reshape/tile.py b/bigframes/core/reshape/tile.py index 74a941be54..a2efa8f927 100644 --- a/bigframes/core/reshape/tile.py +++ b/bigframes/core/reshape/tile.py @@ -15,6 +15,7 @@ from __future__ import annotations import typing +from typing import Optional, TYPE_CHECKING import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile @@ -31,6 +32,9 @@ import bigframes.operations.aggregations as agg_ops import bigframes.series +if TYPE_CHECKING: + import bigframes.session + def cut( x, @@ -42,6 +46,7 @@ def cut( *, right: typing.Optional[bool] = True, labels: typing.Union[typing.Iterable[str], bool, None] = None, + session: Optional[bigframes.session.Session] = None, ) -> bigframes.series.Series: if ( labels is not None @@ -65,7 +70,7 @@ def cut( raise ValueError("Cannot cut empty array.") if not isinstance(x, bigframes.series.Series): - x = bigframes.series.Series(x) + x = bigframes.series.Series(x, session=session) if isinstance(bins, int): if bins <= 0: diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py index 7edf2fa2e4..fd7561f4b4 100644 --- a/bigframes/core/tools/datetimes.py +++ b/bigframes/core/tools/datetimes.py @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import annotations + from collections.abc import Mapping from datetime import date, datetime -from typing import Optional, Union +from typing import Optional, TYPE_CHECKING, Union import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.tools.datetimes as vendored_pandas_datetimes @@ -25,6 +27,9 @@ import bigframes.operations as ops import bigframes.series +if TYPE_CHECKING: + import bigframes.session + def to_datetime( arg: Union[ @@ -37,6 +42,7 @@ def to_datetime( utc: bool = False, format: Optional[str] = None, unit: Optional[str] = None, + session: Optional[bigframes.session.Session], ) -> Union[pd.Timestamp, datetime, bigframes.series.Series]: if isinstance(arg, (int, float, str, datetime, date)): return pd.to_datetime( @@ -52,7 +58,7 @@ def to_datetime( f"to datetime is not implemented. {constants.FEEDBACK_LINK}" ) - arg = bigframes.series.Series(arg) + arg = bigframes.series.Series(arg, session=session) if format and unit and arg.dtype in (bigframes.dtypes.INT_DTYPE, bigframes.dtypes.FLOAT_DTYPE): # type: ignore raise ValueError("cannot specify both format and unit") diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 1bde29506d..49ec2fced3 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -688,7 +688,7 @@ def _getitem_label(self, key: blocks.Label): return DataFrame(block) if len(col_ids) == 1: - return bigframes.series.Series(block) + return bigframes.series.Series(block, name=key) return DataFrame(block) # Bool Series selects rows @@ -1771,7 +1771,6 @@ def to_pandas( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col': [4, 2, 2]}) Download the data from BigQuery and convert it into an in-memory pandas DataFrame. 
@@ -1893,7 +1892,6 @@ def to_pandas_batches( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col': [4, 3, 2, 2, 3]}) Iterate through the results in batches, limiting the total rows yielded @@ -4252,8 +4250,6 @@ def _resample( **Examples:** >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> bpd.options.display.progress_bar = None >>> data = { ... "timestamp_col": pd.date_range( diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 92c98695cd..54ce7066cb 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -69,7 +69,6 @@ class SQLScalarColumnTransformer: >>> from bigframes.ml.compose import ColumnTransformer, SQLScalarColumnTransformer >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'name': ["James", None, "Mary"], 'city': ["New York", "Boston", None]}) >>> col_trans = ColumnTransformer([ diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py index ac294b0fbd..dbbf16afc3 100644 --- a/bigframes/operations/ai.py +++ b/bigframes/operations/ai.py @@ -45,7 +45,6 @@ def filter( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 @@ -115,8 +114,7 @@ def map( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -134,8 +132,7 @@ def map( >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm 
@@ -266,8 +263,7 @@ def classify( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -356,8 +352,7 @@ def join( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -496,7 +491,6 @@ def search( ** Examples: ** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> import bigframes >>> bigframes.options.experiments.ai_operators = True @@ -608,8 +602,7 @@ def sim_join( ** Examples: ** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index f2bbcb3320..ebb5767264 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -115,8 +115,6 @@ def __init__( idx_cols = idx_block.index_columns block, _ = idx_block.join(block, how="left") block = block.with_index_labels(bf_index.names) - if name: - block = block.with_column_labels([name]) if dtype: bf_dtype = bigframes.dtypes.bigframes_type(dtype) block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype)) @@ -132,6 +130,13 @@ def __init__( block = read_pandas_func(pd_series)._get_block() # type:ignore assert block is not None + + # If we didn't get a block make sure the name is what the user + # explicitly chose even if it is None. 
This is important for the + # polars backend where the implicit column labels are integers. + if not isinstance(data, blocks.Block): + block = block.with_column_labels([name]) + self._block: blocks.Block = block @property @@ -160,7 +165,9 @@ def _apply_unary_op( block, result_id = self._block.apply_unary_op( self._value_column, op, result_label=self._name ) - return series.Series(block.select_column(result_id)) + result = series.Series(block.select_column(result_id)) + result.name = getattr(self, "name", None) + return result def _apply_binary_op( self, diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index 9fa5450748..b4f7af1aca 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -52,7 +52,6 @@ def agg( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 @@ -247,8 +246,7 @@ def cluster_by( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -321,8 +319,7 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -435,8 +432,7 @@ def map( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.experiments.semantic_operators = True >>> 
bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -558,8 +554,7 @@ def join( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -697,7 +692,6 @@ def search( ** Examples: ** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> import bigframes >>> bigframes.options.experiments.semantic_operators = True @@ -800,8 +794,7 @@ def top_k( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -1001,8 +994,7 @@ def sim_join( ** Examples: ** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index 4743483954..c69993849a 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -68,7 +68,6 @@ def reverse(self) -> series.Series: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["apple", "banana", "", bpd.NA]) >>> s.str.reverse() diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 2ea10132bc..19ea282762 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -16,8 +16,8 @@ from __future__ import annotations -from collections import 
namedtuple -from datetime import date, datetime +import collections +import datetime import inspect import sys import typing @@ -198,18 +198,18 @@ def to_datetime( @typing.overload def to_datetime( - arg: Union[int, float, str, datetime, date], + arg: Union[int, float, str, datetime.datetime, datetime.date], *, utc: bool = False, format: Optional[str] = None, unit: Optional[str] = None, -) -> Union[pandas.Timestamp, datetime]: +) -> Union[pandas.Timestamp, datetime.datetime]: ... def to_datetime( arg: Union[ - Union[int, float, str, datetime, date], + Union[int, float, str, datetime.datetime, datetime.date], vendored_pandas_datetimes.local_iterables, bigframes.series.Series, bigframes.dataframe.DataFrame, @@ -218,13 +218,15 @@ def to_datetime( utc: bool = False, format: Optional[str] = None, unit: Optional[str] = None, -) -> Union[pandas.Timestamp, datetime, bigframes.series.Series]: - return bigframes.core.tools.to_datetime( +) -> Union[pandas.Timestamp, datetime.datetime, bigframes.series.Series]: + return global_session.with_default_session( + bigframes.session.Session.to_datetime, arg, utc=utc, format=format, unit=unit, ) + return bigframes.core.tools.to_datetime() to_datetime.__doc__ = vendored_pandas_datetimes.to_datetime.__doc__ @@ -321,7 +323,7 @@ def clean_up_by_session_id( __version__ = bigframes.version.__version__ # Other public pandas attributes -NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"]) +NamedAgg = collections.namedtuple("NamedAgg", ["column", "aggfunc"]) options = config.options """Global :class:`~bigframes._config.Options` to configure BigQuery DataFrames.""" diff --git a/bigframes/series.py b/bigframes/series.py index 490298d8dd..337a796739 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -533,7 +533,6 @@ def to_pandas( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([4, 3, 2]) Download the data from BigQuery and convert it into an in-memory pandas Series. 
@@ -661,7 +660,6 @@ def to_pandas_batches( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([4, 3, 2, 2, 3]) Iterate through the results in batches, limiting the total rows yielded @@ -2421,9 +2419,6 @@ def _resample( **Examples:** >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> bpd.options.display.progress_bar = None - >>> data = { ... "timestamp_col": pd.date_range( ... start="2021-01-01 13:00:00", periods=30, freq="1s" diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index f0cec864b4..11621e8ea7 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -67,6 +67,7 @@ import bigframes.constants import bigframes.core from bigframes.core import blocks, log_adapter, utils +import bigframes.core.indexes import bigframes.core.pyformat # Even though the ibis.backends.bigquery import is unused, it's needed @@ -83,7 +84,6 @@ # Avoid circular imports. if typing.TYPE_CHECKING: - import bigframes.core.indexes import bigframes.dataframe as dataframe import bigframes.series import bigframes.streaming.dataframe as streaming_dataframe @@ -315,6 +315,15 @@ def bqconnectionmanager(self): ) return self._bq_connection_manager + @property + def options(self) -> bigframes._config.Options: + """Options for configuring BigQuery DataFrames. + + Included for compatibility between bpd and Session. + """ + # TODO(tswast): Consider making a separate session-level options object. + return bigframes._config.options + @property def session_id(self): return self._session_id @@ -597,7 +606,6 @@ def read_gbq_query( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Simple query input: @@ -753,7 +761,6 @@ def read_gbq_table( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Read a whole table, with arbitrary ordering or ordering corresponding to the primary key(s). 
@@ -832,7 +839,6 @@ def read_gbq_table_streaming( >>> import bigframes.streaming as bst >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> sdf = bst.read_gbq_table("bigquery-public-data.ml_datasets.penguins") @@ -861,7 +867,6 @@ def read_gbq_model(self, model_name: str): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Read an existing BigQuery ML model. @@ -931,8 +936,6 @@ def read_pandas( **Examples:** >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> bpd.options.display.progress_bar = None >>> d = {'col1': [1, 2], 'col2': [3, 4]} >>> pandas_df = pd.DataFrame(data=d) @@ -1810,7 +1813,6 @@ def udf( >>> import bigframes.pandas as bpd >>> import datetime - >>> bpd.options.display.progress_bar = None Turning an arbitrary python function into a BigQuery managed python udf: @@ -1973,7 +1975,6 @@ def read_gbq_function( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Use the [cw_lower_case_ascii_only](https://github.com/GoogleCloudPlatform/bigquery-utils/blob/master/udfs/community/README.md#cw_lower_case_ascii_onlystr-string) function from Community UDFs. @@ -2283,6 +2284,89 @@ def read_gbq_object_table( s = self._loader.read_gbq_table(object_table)["uri"].str.to_blob(connection) return s.rename(name).to_frame() + # ========================================================================= + # bigframes.pandas attributes + # + # These are included so that Session and bigframes.pandas can be used + # interchangeably. + # ========================================================================= + def cut(self, *args, **kwargs) -> bigframes.series.Series: + import bigframes.core.reshape.tile + + return bigframes.core.reshape.tile.cut( + *args, + session=self, + **kwargs, + ) + + def DataFrame(self, *args, **kwargs) -> bigframes.dataframe.DataFrame: + """Constructs a DataFrame. + + Included for compatibility between bpd and Session. 
+ + See :class:`bigframes.pandas.DataFrame` for full documentation. + """ + import bigframes.dataframe + + return bigframes.dataframe.DataFrame(*args, session=self, **kwargs) + + def MultiIndex(self, *args, **kwargs) -> bigframes.core.indexes.MultiIndex: + """Constructs a MultiIndex. + + Included for compatibility between bpd and Session. + + See :class:`bigframes.pandas.MultiIndex` for full documentation. + """ + import bigframes.core.indexes + + return bigframes.core.indexes.MultiIndex(*args, session=self, **kwargs) + + MultiIndex.from_tuples = bigframes.core.indexes.MultiIndex.from_tuples  # type: ignore + MultiIndex.from_frame = bigframes.core.indexes.MultiIndex.from_frame  # type: ignore + MultiIndex.from_arrays = bigframes.core.indexes.MultiIndex.from_arrays  # type: ignore + + def Index(self, *args, **kwargs) -> bigframes.core.indexes.Index: + """Constructs an Index. + + Included for compatibility between bpd and Session. + + See :class:`bigframes.pandas.Index` for full documentation. + """ + import bigframes.core.indexes + + return bigframes.core.indexes.Index(*args, session=self, **kwargs) + + def Series(self, *args, **kwargs) -> bigframes.series.Series: + """Constructs a Series. + + Included for compatibility between bpd and Session. + + See :class:`bigframes.pandas.Series` for full documentation. 
+ """ + import bigframes.series + + return bigframes.series.Series(*args, session=self, **kwargs) + + def to_datetime( + self, *args, **kwargs + ) -> Union[pandas.Timestamp, datetime.datetime, bigframes.series.Series]: + import bigframes.core.tools + + return bigframes.core.tools.to_datetime( + *args, + session=self, + **kwargs, + ) + + def to_timedelta(self, *args, **kwargs): + import bigframes.pandas.core.tools.timedeltas + + return bigframes.pandas.core.tools.timedeltas.to_timedelta( + *args, + session=self, + **kwargs, + ) + def connect(context: Optional[bigquery_options.BigQueryOptions] = None) -> Session: return Session(context) diff --git a/dummy.pkl b/dummy.pkl new file mode 100644 index 0000000000000000000000000000000000000000..76a409b1ded309cfc7b30cccd49d85a710e737bd GIT binary patch literal 1150 zcmbVMU2D`p6is$F+m9_uQPyHX!3T{XfkORwP()jV8eBodw=zt!lbx{nSTYmpf`UG@ zwlHsd?cdcqli4LgMKBM!H_4ql=bU>c-@Koq=a@@v&uB5GB8bb11xZD725RGwO8Um+ z3wZb)zJjlMB%f5E?zGF(Lb9r$nFw-P&UF%7emK7)(%AMgSEu&dnXFdB{C z{=&=LLPtUrdZ(b=1CUsxJd#r}1%A`N^i~^V0(?hxqP=#nFMsL9@0w1164RJ79LW+6 z-%^_>%#-~?i_XZxAL*eD(qha$lQ^RSN3+uw*L-0jh^WAcdq=uZQyUnvq@l`pRd0%w z$RveYM52z=dQ_*GObcx2i7bt^AfXewp{w->JNnKC{8}>|zO6|wHD8kNTM^c5T(@z< zM&P?zzlJlvcZF{ETi=l?5BMH}<1Y|Lr;X$cj=|@T)g~#}30bn_PmQG;-wfNn$!p|h zjE8T1B0d9+UahmU2&fpLJ^u>> import bigframes.geopandas >>> import bigframes.pandas as bpd >>> from shapely.geometry import Point - >>> bpd.options.display.progress_bar = None >>> s = bigframes.geopandas.GeoSeries([Point(1, 1), Point(2, 2), Point(3, 3)]) >>> s @@ -73,7 +72,6 @@ def x(self) -> bigframes.series.Series: >>> import bigframes.pandas as bpd >>> import geopandas.array >>> import shapely.geometry - >>> bpd.options.display.progress_bar = None >>> series = bpd.Series( ... 
[shapely.geometry.Point(1, 2), shapely.geometry.Point(2, 3), shapely.geometry.Point(3, 4)], @@ -100,7 +98,6 @@ def y(self) -> bigframes.series.Series: >>> import bigframes.pandas as bpd >>> import geopandas.array >>> import shapely.geometry - >>> bpd.options.display.progress_bar = None >>> series = bpd.Series( ... [shapely.geometry.Point(1, 2), shapely.geometry.Point(2, 3), shapely.geometry.Point(3, 4)], @@ -129,7 +126,6 @@ def boundary(self) -> bigframes.geopandas.GeoSeries: >>> import bigframes.pandas as bpd >>> import geopandas.array >>> import shapely.geometry - >>> bpd.options.display.progress_bar = None >>> from shapely.geometry import Polygon, LineString, Point >>> s = geopandas.GeoSeries( @@ -171,7 +167,6 @@ def from_xy(cls, x, y, index=None, **kwargs) -> bigframes.geopandas.GeoSeries: >>> import bigframes.pandas as bpd >>> import bigframes.geopandas - >>> bpd.options.display.progress_bar = None >>> x = [2.5, 5, -3.0] >>> y = [0.5, 1, 1.5] @@ -210,7 +205,6 @@ def from_wkt(cls, data, index=None) -> bigframes.geopandas.GeoSeries: >>> import bigframes as bpd >>> import bigframes.geopandas - >>> bpd.options.display.progress_bar = None >>> wkts = [ ... 
'POINT (1 1)', @@ -246,7 +240,6 @@ def to_wkt(self) -> bigframes.series.Series: >>> import bigframes as bpd >>> import bigframes.geopandas >>> from shapely.geometry import Point - >>> bpd.options.display.progress_bar = None >>> s = bigframes.geopandas.GeoSeries([Point(1, 1), Point(2, 2), Point(3, 3)]) >>> s @@ -279,7 +272,6 @@ def difference(self: GeoSeries, other: GeoSeries) -> GeoSeries: # type: ignore >>> import bigframes as bpd >>> import bigframes.geopandas >>> from shapely.geometry import Polygon, LineString, Point - >>> bpd.options.display.progress_bar = None We can check two GeoSeries against each other, row by row: @@ -411,7 +403,6 @@ def intersection(self: GeoSeries, other: GeoSeries) -> GeoSeries: # type: ignor >>> import bigframes as bpd >>> import bigframes.geopandas >>> from shapely.geometry import Polygon, LineString, Point - >>> bpd.options.display.progress_bar = None We can check two GeoSeries against each other, row by row. diff --git a/third_party/bigframes_vendored/pandas/AUTHORS.md b/third_party/bigframes_vendored/pandas/AUTHORS.md index 84fcfe05e3..396bcbf9dd 100644 --- a/third_party/bigframes_vendored/pandas/AUTHORS.md +++ b/third_party/bigframes_vendored/pandas/AUTHORS.md @@ -47,7 +47,6 @@ file to indicate the copyright and license terms: Other licenses can be found in the LICENSES directory. 
-License ======= pandas is distributed under a 3-clause ("Simplified" or "New") BSD diff --git a/third_party/bigframes_vendored/pandas/README.md b/third_party/bigframes_vendored/pandas/README.md index 1aa5068d5e..f92a629a4c 100644 --- a/third_party/bigframes_vendored/pandas/README.md +++ b/third_party/bigframes_vendored/pandas/README.md @@ -60,7 +60,6 @@ Here are just a few of the things that pandas does well: generation and frequency conversion, moving window statistics, date shifting and lagging - [missing-data]: https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html [insertion-deletion]: https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html#column-selection-addition-deletion [alignment]: https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html?highlight=alignment#intro-to-data-structures @@ -120,7 +119,6 @@ python setup.py install or for installing in [development mode](https://pip.pypa.io/en/latest/cli/pip_install/#install-editable): - ```sh python -m pip install -e . --no-build-isolation --no-use-pep517 ``` diff --git a/third_party/bigframes_vendored/pandas/conftest.py b/third_party/bigframes_vendored/pandas/conftest.py new file mode 100644 index 0000000000..e1f3f6d84c --- /dev/null +++ b/third_party/bigframes_vendored/pandas/conftest.py @@ -0,0 +1,45 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import numpy as np +import pandas as pd +import pyarrow as pa +import pytest + +import bigframes._config + + +@pytest.fixture(scope="session") +def polars_session(): + pytest.importorskip("polars") + + from bigframes.testing import polars_session + + return polars_session.TestSession() + + +@pytest.fixture(autouse=True) +def default_doctest_imports(doctest_namespace, polars_session): + """ + Avoid some boilerplate in pandas-inspired tests. + + See: https://docs.pytest.org/en/stable/how-to/doctest.html#doctest-namespace-fixture + """ + doctest_namespace["np"] = np + doctest_namespace["pd"] = pd + doctest_namespace["pa"] = pa + doctest_namespace["bpd"] = polars_session + bigframes._config.options.display.progress_bar = None diff --git a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py index fe15e7b40d..9f6dfc1c74 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py @@ -19,14 +19,12 @@ def len(self): **Examples:** >>> import bigframes.pandas as bpd - >>> import pyarrow as pa - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ ... [1, 2, 3], ... [3], ... ], - ... dtype=bpd.ArrowDtype(pa.list_(pa.int64())), + ... dtype=pd.ArrowDtype(pa.list_(pa.int64())), ... ) >>> s.list.len() 0 3 @@ -45,14 +43,12 @@ def __getitem__(self, key: int | slice): **Examples:** >>> import bigframes.pandas as bpd - >>> import pyarrow as pa - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ ... [1, 2, 3], ... [3], ... ], - ... dtype=bpd.ArrowDtype(pa.list_(pa.int64())), + ... dtype=pd.ArrowDtype(pa.list_(pa.int64())), ... 
) >>> s.list[0] 0 1 @@ -83,15 +79,13 @@ def field(self, name_or_index: str | int): **Examples:** >>> import bigframes.pandas as bpd - >>> import pyarrow as pa - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ ... {"version": 1, "project": "pandas"}, ... {"version": 2, "project": "pandas"}, ... {"version": 1, "project": "numpy"}, ... ], - ... dtype=bpd.ArrowDtype(pa.struct( + ... dtype=pd.ArrowDtype(pa.struct( ... [("version", pa.int64()), ("project", pa.string())] ... )) ... ) @@ -129,15 +123,13 @@ def explode(self): **Examples:** >>> import bigframes.pandas as bpd - >>> import pyarrow as pa - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ ... {"version": 1, "project": "pandas"}, ... {"version": 2, "project": "pandas"}, ... {"version": 1, "project": "numpy"}, ... ], - ... dtype=bpd.ArrowDtype(pa.struct( + ... dtype=pd.ArrowDtype(pa.struct( ... [("version", pa.int64()), ("project", pa.string())] ... )) ... ) @@ -165,15 +157,13 @@ def dtypes(self): **Examples:** >>> import bigframes.pandas as bpd - >>> import pyarrow as pa - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ ... {"version": 1, "project": "pandas"}, ... {"version": 2, "project": "pandas"}, ... {"version": 1, "project": "numpy"}, ... ], - ... dtype=bpd.ArrowDtype(pa.struct( + ... dtype=pd.ArrowDtype(pa.struct( ... [("version", pa.int64()), ("project", pa.string())] ... )) ... ) @@ -200,8 +190,6 @@ def explode(self, column, *, separator: str = "."): **Examples:** >>> import bigframes.pandas as bpd - >>> import pyarrow as pa - >>> bpd.options.display.progress_bar = None >>> countries = bpd.Series(["cn", "es", "us"]) >>> files = bpd.Series( ... [ @@ -209,7 +197,7 @@ def explode(self, column, *, separator: str = "."): ... {"version": 2, "project": "pandas"}, ... {"version": 1, "project": "numpy"}, ... ], - ... dtype=bpd.ArrowDtype(pa.struct( + ... dtype=pd.ArrowDtype(pa.struct( ... [("version", pa.int64()), ("project", pa.string())] ... )) ... 
) @@ -233,7 +221,6 @@ def explode(self, column, *, separator: str = "."): Separator/delimiter to use to separate the original column name from the sub-field column name. - Returns: DataFrame: Original DataFrame with exploded struct column(s). diff --git a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py index 1736a7f9ef..eeffbbdb7f 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py @@ -15,8 +15,6 @@ def strftime(self, date_format: str): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.to_datetime( ... ['2014-08-15 08:15:12', '2012-02-29 08:15:12+06:00', '2015-08-15 08:15:12+05:00'], ... utc=True @@ -36,6 +34,7 @@ def strftime(self, date_format: str): bigframes.pandas.Series: Series of formatted strings. """ + # TODO(tswast): remove bpd boilerplate when normalize is implemented in polars session. raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def normalize(self): @@ -51,7 +50,6 @@ def normalize(self): **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd >>> s = bpd.Series(pd.date_range( ... start='2014-08-01 10:00', @@ -68,6 +66,7 @@ def normalize(self): bigframes.pandas.Series: Series of the same dtype as the data. """ + # TODO(tswast): remove bpd boilerplate when normalize is implemented in polars session. 
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def floor(self, freq: str): @@ -85,8 +84,6 @@ def floor(self, freq: str): **Examples:** - >>> import pandas as pd - >>> import bigframes.pandas as bpd >>> rng = pd.date_range('1/1/2018 11:59:00', periods=3, freq='min') >>> bpd.Series(rng).dt.floor("h") 0 2018-01-01 11:00:00 diff --git a/third_party/bigframes_vendored/pandas/core/computation/eval.py b/third_party/bigframes_vendored/pandas/core/computation/eval.py index d3d11a9c2a..2f01b7edfc 100644 --- a/third_party/bigframes_vendored/pandas/core/computation/eval.py +++ b/third_party/bigframes_vendored/pandas/core/computation/eval.py @@ -172,9 +172,6 @@ def eval( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"animal": ["dog", "pig"], "age": [10, 20]}) >>> df animal age diff --git a/third_party/bigframes_vendored/pandas/core/computation/expr.py b/third_party/bigframes_vendored/pandas/core/computation/expr.py index 44f649e59d..ca9e6a60ce 100644 --- a/third_party/bigframes_vendored/pandas/core/computation/expr.py +++ b/third_party/bigframes_vendored/pandas/core/computation/expr.py @@ -165,7 +165,6 @@ def _is_type(t): _is_list = _is_type(list) _is_str = _is_type(str) - # partition all AST nodes _all_nodes = frozenset( node @@ -197,11 +196,9 @@ def _filter_nodes(superclass, all_nodes=_all_nodes): _keyword_nodes = _filter_nodes(ast.keyword) _alias_nodes = _filter_nodes(ast.alias) - # nodes that we don't support directly but are needed for parsing _hacked_nodes = frozenset(["Assign", "Module", "Expr"]) - _unsupported_expr_nodes = frozenset( [ "Yield", diff --git a/third_party/bigframes_vendored/pandas/core/computation/ops.py b/third_party/bigframes_vendored/pandas/core/computation/ops.py index 75b914c876..a15972fc4c 100644 --- a/third_party/bigframes_vendored/pandas/core/computation/ops.py +++ b/third_party/bigframes_vendored/pandas/core/computation/ops.py @@ -52,7 +52,6 @@ MATHOPS 
= _unary_math_ops + _binary_math_ops - LOCAL_TAG = "__pd_eval_local_" diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 557c332797..b433c739cc 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -39,9 +39,6 @@ def shape(self) -> tuple[int, int]: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2, 3], ... 'col2': [4, 5, 6]}) >>> df.shape @@ -63,9 +60,6 @@ def axes(self) -> list: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.axes[1:] [Index(['col1', 'col2'], dtype='object')] @@ -78,9 +72,6 @@ def values(self) -> np.ndarray: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.values array([[1, 3], @@ -110,8 +101,6 @@ def T(self) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df col1 col2 @@ -146,9 +135,6 @@ def transpose(self) -> DataFrame: **Square DataFrame with homogeneous dtype** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> d1 = {'col1': [1, 2], 'col2': [3, 4]} >>> df1 = bpd.DataFrame(data=d1) >>> df1 @@ -256,9 +242,6 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': ["hello", "world"], 'col3': [True, False]}) >>> df.select_dtypes(include=['Int64']) col1 @@ -274,7 +257,6 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: [2 rows x 2 columns] - Args: include 
(scalar or list-like): A selection of dtypes or strings to be included. @@ -380,9 +362,6 @@ def to_numpy( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.to_numpy() array([[1, 3], @@ -419,11 +398,9 @@ def to_gbq( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Write a DataFrame to a BigQuery table. + >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> # destination_table = PROJECT_ID + "." + DATASET_ID + "." + TABLE_NAME >>> df.to_gbq("bigframes-dev.birds.test-numbers", if_exists="replace") @@ -510,7 +487,6 @@ def to_gbq( If an invalid value is provided for ``if_exists`` that is not one of ``fail``, ``replace``, or ``append``. - """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -530,8 +506,6 @@ def to_parquet( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> gcs_bucket = "gs://bigframes-dev-testing/sample_parquet*.parquet" >>> df.to_parquet(path=gcs_bucket) @@ -586,9 +560,6 @@ def to_dict( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.to_dict() {'col1': {np.int64(0): 1, np.int64(1): 2}, 'col2': {np.int64(0): 3, np.int64(1): 4}} @@ -666,12 +637,17 @@ def to_excel( **Examples:** - >>> import bigframes.pandas as bpd >>> import tempfile - >>> bpd.options.display.progress_bar = None + >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) - >>> df.to_excel(tempfile.TemporaryFile()) + + >>> try: + ... import openpyxl + ... df.to_excel(tempfile.TemporaryFile()) + ... + ... except ImportError: + ... pass # openpyxl is required. 
Args: excel_writer (path-like, file-like, or ExcelWriter object): @@ -703,9 +679,6 @@ def to_latex( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> print(df.to_latex()) \begin{tabular}{lrr} @@ -754,9 +727,6 @@ def to_records( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.to_records() rec.array([(0, 1, 3), (1, 2, 4)], @@ -814,9 +784,6 @@ def to_string( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> print(df.to_string()) col1 col2 @@ -914,9 +881,6 @@ def to_html( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> print(df.to_html()) @@ -1024,9 +988,6 @@ def to_markdown( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> print(df.to_markdown()) | | col1 | col2 | @@ -1058,9 +1019,6 @@ def to_pickle(self, path, *, allow_large_results, **kwargs) -> None: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> gcs_bucket = "gs://bigframes-dev-testing/sample_pickle_gcs.pkl" >>> df.to_pickle(path=gcs_bucket) @@ -1080,9 +1038,6 @@ def to_orc(self, path=None, *, allow_large_results=None, **kwargs) -> bytes | No **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> import tempfile >>> df.to_orc(tempfile.TemporaryFile()) @@ -1190,9 +1145,6 @@ def insert(self, loc, column, value, allow_duplicates=False): **Examples:** - >>> 
import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) Insert a new column named 'col3' between 'col1' and 'col2' with all entries set to 5. @@ -1243,9 +1195,6 @@ def drop( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame(np.arange(12).reshape(3, 4), ... columns=['A', 'B', 'C', 'D']) >>> df @@ -1284,7 +1233,6 @@ def drop( Drop columns and/or rows of MultiIndex DataFrame: - >>> import pandas as pd >>> midx = pd.MultiIndex(levels=[['llama', 'cow', 'falcon'], ... ['speed', 'weight', 'length']], ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], @@ -1369,7 +1317,6 @@ def align( Join method is specified for each axis Index. - Args: other (DataFrame or Series): join ({'outer', 'inner', 'left', 'right'}, default 'outer'): @@ -1402,9 +1349,6 @@ def rename( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) >>> df A B @@ -1474,9 +1418,6 @@ def set_index( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'month': [1, 4, 7, 10], ... 'year': [2012, 2014, 2013, 2014], ... 'sale': [55, 40, 84, 31]}) @@ -1616,10 +1557,6 @@ def reset_index( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - - >>> import numpy as np >>> df = bpd.DataFrame([('bird', 389.0), ... ('bird', 24.0), ... ('mammal', 80.5), @@ -1659,7 +1596,6 @@ class max_speed You can also use ``reset_index`` with ``MultiIndex``. - >>> import pandas as pd >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'), ... ('bird', 'parrot'), ... ('mammal', 'lion'), @@ -1700,7 +1636,6 @@ class name speed max [4 rows x 2 columns] - Args: level (int, str, tuple, or list, default None): Only remove the given levels from the index. 
Removes all levels by @@ -1795,12 +1730,9 @@ def dropna( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], ... "toy": [np.nan, 'Batmobile', 'Bullwhip'], - ... "born": [bpd.NA, "1940-04-25", bpd.NA]}) + ... "born": [pd.NA, "1940-04-25", pd.NA]}) >>> df name toy born 0 Alfred @@ -1889,7 +1821,6 @@ def dropna( ignore_index (bool, default ``False``): If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. - Returns: bigframes.pandas.DataFrame: DataFrame with NA entries dropped from it. @@ -1908,9 +1839,6 @@ def isin(self, values): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]}, ... index=['falcon', 'dog']) >>> df @@ -1964,9 +1892,6 @@ def keys(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -1985,8 +1910,6 @@ def iterrows(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -2011,8 +1934,6 @@ def itertuples(self, index: bool = True, name: str | None = "Pandas"): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -2044,9 +1965,6 @@ def items(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'species': ['bear', 'bear', 'marsupial'], ... 'population': [1864, 22000, 80000]}, ... 
index=['panda', 'polar', 'koala']) @@ -2085,9 +2003,6 @@ def where(self, cond, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'a': [20, 10, 0], 'b': [0, 10, 20]}) >>> df a b @@ -2177,9 +2092,6 @@ def mask(self, cond, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'a': [20, 10, 0], 'b': [0, 10, 20]}) >>> df a b @@ -2280,11 +2192,8 @@ def sort_values( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ - ... 'col1': ['A', 'A', 'B', bpd.NA, 'D', 'C'], + ... 'col1': ['A', 'A', 'B', pd.NA, 'D', 'C'], ... 'col2': [2, 1, 9, 8, 7, 4], ... 'col3': [0, 1, 9, 4, 2, 3], ... 'col4': ['a', 'B', 'c', 'D', 'e', 'F'] @@ -2424,9 +2333,6 @@ def eq(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can use method name: >>> df = bpd.DataFrame({'angles': [0, 3, 4], @@ -2467,9 +2373,6 @@ def __eq__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'a': [0, 3, 4], ... 'b': [360, 0, 180] @@ -2498,9 +2401,6 @@ def __invert__(self) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'a':[True, False, True], 'b':[-1, 0, 1]}) >>> ~df a b @@ -2527,9 +2427,6 @@ def ne(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can use method name: >>> df = bpd.DataFrame({'angles': [0, 3, 4], @@ -2569,9 +2466,6 @@ def __ne__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'a': [0, 3, 4], ... 
'b': [360, 0, 180] @@ -2609,9 +2503,6 @@ def le(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can use method name: >>> df = bpd.DataFrame({'angles': [0, 3, 4], @@ -2652,9 +2543,6 @@ def __le__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'a': [0, -1, 1], ... 'b': [1, 0, -1] @@ -2692,9 +2580,6 @@ def lt(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can use method name: >>> df = bpd.DataFrame({'angles': [0, 3, 4], @@ -2735,9 +2620,6 @@ def __lt__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'a': [0, -1, 1], ... 'b': [1, 0, -1] @@ -2775,9 +2657,6 @@ def ge(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can use method name: >>> df = bpd.DataFrame({'angles': [0, 3, 4], @@ -2818,9 +2697,6 @@ def __ge__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'a': [0, -1, 1], ... 'b': [1, 0, -1] @@ -2858,9 +2734,6 @@ def gt(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'angles': [0, 3, 4], ... 'degrees': [360, 180, 360]}, ... index=['circle', 'triangle', 'rectangle']) @@ -2899,9 +2772,6 @@ def __gt__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'a': [0, -1, 1], ... 
'b': [1, 0, -1] @@ -2936,9 +2806,6 @@ def add(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -2980,9 +2847,6 @@ def __add__(self, other) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'height': [1.5, 2.6], ... 'weight': [500, 800] @@ -3055,9 +2919,6 @@ def radd(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3118,9 +2979,6 @@ def sub(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3162,9 +3020,6 @@ def __sub__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can subtract a scalar: >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -3210,9 +3065,6 @@ def rsub(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3271,9 +3123,6 @@ def mul(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 
'B': [4, 5, 6], @@ -3315,9 +3164,6 @@ def __mul__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can multiply with a scalar: >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -3363,9 +3209,6 @@ def rmul(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3407,9 +3250,6 @@ def __rmul__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can multiply with a scalar: >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -3455,9 +3295,6 @@ def truediv(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3499,9 +3336,6 @@ def __truediv__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can multiply with a scalar: >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -3547,9 +3381,6 @@ def rtruediv(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3608,9 +3439,6 @@ def floordiv(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 
'B': [4, 5, 6], @@ -3652,9 +3480,6 @@ def __floordiv__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can divide by a scalar: >>> df = bpd.DataFrame({"a": [15, 15, 15], "b": [30, 30, 30]}) @@ -3700,9 +3525,6 @@ def rfloordiv(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3761,9 +3583,6 @@ def mod(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3805,9 +3624,6 @@ def __mod__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can modulo with a scalar: >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -3853,9 +3669,6 @@ def rmod(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3915,9 +3728,6 @@ def pow(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3960,9 +3770,6 @@ def __pow__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can exponentiate with a scalar: >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -4009,9 +3816,6 @@ def rpow(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 
'B': [4, 5, 6], @@ -4105,9 +3909,6 @@ def combine( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df1 = bpd.DataFrame({'A': [0, 0], 'B': [4, 4]}) >>> df2 = bpd.DataFrame({'A': [1, 1], 'B': [3, 3]}) >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2 @@ -4155,9 +3956,6 @@ def combine_first(self, other) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df1 = bpd.DataFrame({'A': [None, 0], 'B': [None, 4]}) >>> df2 = bpd.DataFrame({'A': [1, 1], 'B': [3, 3]}) >>> df1.combine_first(df2) @@ -4185,10 +3983,6 @@ def explode( **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'A': [[0, 1, 2], [], [], [3, 4]], ... 'B': 1, ... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]}) @@ -4244,9 +4038,6 @@ def corr(self, method, min_periods, numeric_only) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'A': [1, 2, 3], ... 'B': [400, 500, 600], ... 'C': [0.8, 0.4, 0.9]}) @@ -4278,9 +4069,6 @@ def cov(self, *, numeric_only) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'A': [1, 2, 3], ... 'B': [400, 500, 600], ... 'C': [0.8, 0.4, 0.9]}) @@ -4317,9 +4105,6 @@ def corrwith( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> index = ["a", "b", "c", "d", "e"] >>> columns = ["one", "two", "three", "four"] >>> df1 = bpd.DataFrame(np.arange(20).reshape(5, 4), index=index, columns=columns) @@ -4353,9 +4138,6 @@ def update( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'A': [1, 2, 3], ... 
'B': [400, 500, 600]}) >>> new_df = bpd.DataFrame({'B': [4, 5, 6], @@ -4418,9 +4200,6 @@ def groupby( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'Animal': ['Falcon', 'Falcon', ... 'Parrot', 'Parrot'], ... 'Max Speed': [380., 370., 24., 26.]}) @@ -4515,17 +4294,18 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Let's use ``reuse=False`` flag to make sure a new ``remote_function`` is created every time we run the following code, but you can skip it to potentially reuse a previously deployed ``remote_function`` from the same user defined function. - >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") - ... def minutes_to_hours(x: int) -> float: - ... return x/60 + >>> def minutes_to_hours(x: int) -> float: + ... return x / 60 + >>> minutes_to_hours = bpd.deploy_remote_function( + ... minutes_to_hours, + ... reuse=False, + ... cloud_function_service_account="default", + ... ) # doctest: +SKIP >>> df_minutes = bpd.DataFrame( ... {"system_minutes" : [0, 30, 60, 90, 120], @@ -4540,8 +4320,8 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: [5 rows x 2 columns] - >>> df_hours = df_minutes.map(minutes_to_hours) - >>> df_hours + >>> df_hours = df_minutes.map(minutes_to_hours) # doctest: +SKIP + >>> df_hours # doctest: +SKIP system_minutes user_minutes 0 0.0 0.0 1 0.5 0.25 @@ -4557,11 +4337,11 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: >>> df_minutes = bpd.DataFrame( ... { - ... "system_minutes" : [0, 30, 60, None, 90, 120, bpd.NA], - ... "user_minutes" : [0, 15, 75, 90, 6, None, bpd.NA] + ... "system_minutes" : [0, 30, 60, None, 90, 120, pd.NA], + ... "user_minutes" : [0, 15, 75, 90, 6, None, pd.NA] ... 
}, dtype="Int64") - >>> df_hours = df_minutes.map(minutes_to_hours, na_action='ignore') - >>> df_hours + >>> df_hours = df_minutes.map(minutes_to_hours, na_action='ignore') # doctest: +SKIP + >>> df_hours # doctest: +SKIP system_minutes user_minutes 0 0.0 0.0 1 0.5 0.25 @@ -4612,9 +4392,6 @@ def join( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Join two DataFrames by specifying how to handle the operation: >>> df1 = bpd.DataFrame({'col1': ['foo', 'bar'], 'col2': [1, 2]}, index=[10, 11]) @@ -4668,7 +4445,6 @@ def join( [1 rows x 4 columns] - Another option to join using the key columns is to use the on parameter: >>> df1.join(df2, on="col2", how="right") @@ -4764,9 +4540,6 @@ def merge( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Merge DataFrames df1 and df2 by specifying type of merge: >>> df1 = bpd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) @@ -4897,7 +4670,6 @@ def round(self, decimals): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)], ... columns=['dogs', 'cats']) >>> df @@ -4980,10 +4752,6 @@ def apply(self, func, *, axis=0, args=(), **kwargs): **Examples:** - >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df col1 col2 @@ -5008,14 +4776,14 @@ def apply(self, func, *, axis=0, args=(), **kwargs): to select only the necessary columns before calling `apply()`. Note: This feature is currently in **preview**. - >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") # doctest: +SKIP ... def foo(row: pd.Series) -> int: ... result = 1 ... result += row["col1"] ... result += row["col2"]*row["col2"] ... 
return result - >>> df[["col1", "col2"]].apply(foo, axis=1) + >>> df[["col1", "col2"]].apply(foo, axis=1) # doctest: +SKIP 0 11 1 19 dtype: Int64 @@ -5023,7 +4791,7 @@ def apply(self, func, *, axis=0, args=(), **kwargs): You could return an array output for every input row from the remote function. - >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") # doctest: +SKIP ... def marks_analyzer(marks: pd.Series) -> list[float]: ... import statistics ... average = marks.mean() @@ -5040,8 +4808,8 @@ def apply(self, func, *, axis=0, args=(), **kwargs): ... "chemistry": [88, 56, 72], ... "algebra": [78, 91, 79] ... }, index=["Alice", "Bob", "Charlie"]) - >>> stats = df.apply(marks_analyzer, axis=1) - >>> stats + >>> stats = df.apply(marks_analyzer, axis=1) # doctest: +SKIP + >>> stats # doctest: +SKIP Alice [77.67 78. 77.19 76.71] Bob [75.67 80. 74.15 72.56] Charlie [75.33 75. 75.28 75.22] @@ -5064,14 +4832,14 @@ def apply(self, func, *, axis=0, args=(), **kwargs): [2 rows x 3 columns] - >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") # doctest: +SKIP ... def foo(x: int, y: int, z: int) -> float: ... result = 1 ... result += x ... result += y/z ... 
return result - >>> df.apply(foo, axis=1) + >>> df.apply(foo, axis=1) # doctest: +SKIP 0 2.6 1 3.8 dtype: Float64 @@ -5131,9 +4899,6 @@ def any(self, *, axis=0, bool_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [True, True], "B": [False, False]}) >>> df A B @@ -5178,9 +4943,6 @@ def all(self, axis=0, *, bool_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [True, True], "B": [False, False]}) >>> df A B @@ -5222,8 +4984,6 @@ def prod(self, axis=0, *, numeric_only: bool = False): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [1, 2, 3], "B": [4.5, 5.5, 6.5]}) >>> df A B @@ -5268,9 +5028,6 @@ def min(self, axis=0, *, numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df A B @@ -5313,9 +5070,6 @@ def max(self, axis=0, *, numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df A B @@ -5357,9 +5111,6 @@ def sum(self, axis=0, *, numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df A B @@ -5399,9 +5150,6 @@ def mean(self, axis=0, *, numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df A B @@ -5442,8 +5190,6 @@ def median(self, *, numeric_only: bool = False, exact: bool = True): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> 
df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df A B @@ -5480,7 +5226,6 @@ def quantile( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), ... columns=['a', 'b']) >>> df.quantile(.1) @@ -5517,9 +5262,6 @@ def var(self, axis=0, *, numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df A B @@ -5542,7 +5284,6 @@ def var(self, axis=0, *, numeric_only: bool = False): 1 0.5 dtype: Float64 - Args: axis ({index (0), columns (1)}): Axis for the function to be applied on. @@ -5562,9 +5303,6 @@ def skew(self, *, numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'A': [1, 2, 3, 4, 5], ... 'B': [5, 4, 3, 2, 1], ... 'C': [2, 2, 3, 2, 2]}) @@ -5603,9 +5341,6 @@ def kurt(self, *, numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [1, 2, 3, 4, 5], ... "B": [3, 4, 3, 2, 1], ... "C": [2, 2, 3, 2, 2]}) @@ -5643,9 +5378,6 @@ def std(self, *, numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [1, 2, 3, 4, 5], ... "B": [3, 4, 3, 2, 1], ... "C": [2, 2, 3, 2, 2]}) @@ -5685,9 +5417,6 @@ def count(self, *, numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [1, None, 3, 4, 5], ... "B": [1, 2, 3, 4, 5], ... 
"C": [None, 3.5, None, 4.5, 5.0]}) @@ -5739,8 +5468,6 @@ def nlargest(self, n: int, columns, keep: str = "first"): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [1, 1, 3, 3, 5, 5], ... "B": [5, 6, 3, 4, 1, 2], ... "C": ['a', 'b', 'a', 'b', 'a', 'b']}) @@ -5831,8 +5558,6 @@ def nsmallest(self, n: int, columns, keep: str = "first"): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [1, 1, 3, 3, 5, 5], ... "B": [5, 6, 3, 4, 1, 2], ... "C": ['a', 'b', 'a', 'b', 'a', 'b']}) @@ -5880,7 +5605,6 @@ def nsmallest(self, n: int, columns, keep: str = "first"): [1 rows x 3 columns] - Args: n (int): Number of rows to return. @@ -5912,9 +5636,6 @@ def idxmin(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -5942,9 +5663,6 @@ def idxmax(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -5976,9 +5694,6 @@ def melt(self, id_vars, value_vars, var_name, value_name): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [1, None, 3, 4, 5], ... "B": [1, 2, 3, 4, 5], ... "C": [None, 3.5, None, 4.5, 5.0]}) @@ -6027,7 +5742,6 @@ def melt(self, id_vars, value_vars, var_name, value_name): [10 rows x 3 columns] - Args: id_vars (tuple, list, or ndarray, optional): Column(s) to use as identifier variables. 
@@ -6051,9 +5765,6 @@ def nunique(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 2]}) >>> df A B @@ -6080,9 +5791,6 @@ def cummin(self) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -6112,9 +5820,6 @@ def cummax(self) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -6144,9 +5849,6 @@ def cumsum(self) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -6181,9 +5883,6 @@ def cumprod(self) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -6222,9 +5921,6 @@ def diff( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -6270,9 +5966,6 @@ def agg(self, func): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -6335,8 +6028,6 @@ def describe(self, include: None | Literal["all"] = None): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [0, 2, 8], "C": ["cat", "cat", "dog"]}) >>> df A B C @@ -6359,7 +6050,6 @@ def describe(self, include: None | Literal["all"] = None): [8 rows x 2 columns] - Using describe with include = "all": >>> df.describe(include="all") A B C @@ -6406,9 +6096,6 @@ def pivot(self, *, 
columns, index=None, values=None): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... "foo": ["one", "one", "one", "two", "two"], ... "bar": ["A", "B", "C", "A", "B"], @@ -6477,8 +6164,6 @@ def pivot_table(self, values=None, index=None, columns=None, aggfunc="mean"): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'Product': ['Product A', 'Product B', 'Product A', 'Product B', 'Product A', 'Product B'], ... 'Region': ['East', 'West', 'East', 'West', 'West', 'East'], @@ -6569,9 +6254,6 @@ def stack(self, level=-1): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'A': [1, 3], 'B': [2, 4]}, index=['foo', 'bar']) >>> df A B @@ -6608,9 +6290,6 @@ def unstack(self, level=-1): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'A': [1, 3], 'B': [2, 4]}, index=['foo', 'bar']) >>> df A B @@ -6649,9 +6328,6 @@ def index(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can access the index of a DataFrame via ``index`` property. >>> df = bpd.DataFrame({'Name': ['Alice', 'Bob', 'Aritra'], @@ -6702,9 +6378,6 @@ def columns(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can access the column labels of a DataFrame via ``columns`` property. >>> df = bpd.DataFrame({'Name': ['Alice', 'Bob', 'Aritra'], @@ -6750,11 +6423,8 @@ def value_counts( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'num_legs': [2, 4, 4, 6, 7], - ... 'num_wings': [2, 0, 0, 0, bpd.NA]}, + ... 'num_wings': [2, 0, 0, 0, pd.NA]}, ... index=['falcon', 'dog', 'cat', 'ant', 'octopus'], ... 
dtype='Int64') >>> df @@ -6831,9 +6501,6 @@ def eval(self, expr: str) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) >>> df A B @@ -6891,7 +6558,6 @@ def eval(self, expr: str) -> DataFrame: [5 rows x 4 columns] - Args: expr (str): The expression string to evaluate. @@ -6907,9 +6573,6 @@ def query(self, expr: str) -> DataFrame | None: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'A': range(1, 6), ... 'B': range(10, 0, -2), ... 'C C': range(10, 5, -1)}) @@ -6982,9 +6645,6 @@ def interpolate(self, method: str = "linear"): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3, None, None, 6], ... 'B': [None, 6, None, 2, None, 3], @@ -7032,9 +6692,6 @@ def fillna(self, value): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame([[np.nan, 2, np.nan, 0], ... [3, 4, np.nan, 1], ... [np.nan, np.nan, np.nan, np.nan], @@ -7110,8 +6767,6 @@ def replace( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'int_col': [1, 1, 2, 3], ... 'string_col': ["a", "b", "c", "b"], @@ -7150,7 +6805,6 @@ def replace( [4 rows x 2 columns] - Args: to_replace (str, regex, list, int, float or None): How to find the values that will be replaced. @@ -7206,9 +6860,6 @@ def iat(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], ... 
columns=['A', 'B', 'C']) >>> df @@ -7240,9 +6891,6 @@ def at(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], ... index=[4, 5, 6], columns=['A', 'B', 'C']) >>> df @@ -7289,9 +6937,6 @@ def dot(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> left = bpd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]]) >>> left 0 1 2 3 @@ -7383,9 +7028,6 @@ def __matmul__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> left = bpd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]]) >>> left 0 1 2 3 @@ -7443,9 +7085,6 @@ def __len__(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'a': [0, 1, 2], ... 'b': [3, 4, 5] @@ -7466,10 +7105,6 @@ def __array__(self, dtype=None, copy: Optional[bool] = None): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> import numpy as np - >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [11, 22, 33]}) >>> np.array(df) @@ -7501,9 +7136,6 @@ def __getitem__(self, key): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... "name" : ["alpha", "beta", "gamma"], ... "age": [20, 30, 40], @@ -7547,7 +7179,6 @@ def __getitem__(self, key): You can specify a pandas Index with desired column labels. - >>> import pandas as pd >>> df[pd.Index(["age", "location"])] age location 0 20 WA @@ -7576,9 +7207,6 @@ def __setitem__(self, key, value): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... "name" : ["alpha", "beta", "gamma"], ... 
"age": [20, 30, 40], diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 273339efcf..e8079e573b 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -38,9 +38,6 @@ def size(self) -> int: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series({'a': 1, 'b': 2, 'c': 3}) >>> s.size 3 @@ -65,9 +62,6 @@ def __iter__(self) -> Iterator: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -106,9 +100,6 @@ def astype(self, dtype): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Create a DataFrame: >>> d = {'col1': [1, 2], 'col2': [3, 4]} @@ -152,7 +143,7 @@ def astype(self, dtype): Note that this is equivalent of using ``to_datetime`` with ``unit='us'``: - >>> bpd.to_datetime(ser, unit='us', utc=True) + >>> bpd.to_datetime(ser, unit='us', utc=True) # doctest: +SKIP 0 2034-02-08 11:13:20.246789+00:00 1 2021-06-19 17:20:44.123101+00:00 2 2003-06-05 17:30:34.120101+00:00 @@ -350,9 +341,6 @@ def get(self, key, default=None): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame( ... [ ... [24.3, 75.7, "high"], @@ -461,9 +449,6 @@ def head(self, n: int = 5): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion', ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']}) >>> df @@ -562,8 +547,6 @@ def sample( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'num_legs': [2, 4, 8, 0], ... 'num_wings': [2, 0, 0, 0], ... 
'num_specimen_seen': [10, 2, 1, 8]}, @@ -643,9 +626,6 @@ def dtypes(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'float': [1.0], 'int': [1], 'string': ['foo']}) >>> df.dtypes float Float64 @@ -668,9 +648,6 @@ def copy(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Modification in the original Series will not affect the copy Series: >>> s = bpd.Series([1, 2], index=["a", "b"]) @@ -741,10 +718,6 @@ def ffill(self, *, limit: Optional[int] = None): **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame([[np.nan, 2, np.nan, 0], ... [3, 4, np.nan, 1], ... [np.nan, np.nan, np.nan, np.nan], @@ -770,7 +743,6 @@ def ffill(self, *, limit: Optional[int] = None): [4 rows x 4 columns] - Fill NA/NaN values in Series: >>> series = bpd.Series([1, np.nan, 2, 3]) @@ -790,7 +762,6 @@ def ffill(self, *, limit: Optional[int] = None): maximum number of entries along the entire axis where NaNs will be filled. Must be greater than 0 if not None. - Returns: bigframes.pandas.DataFrame or bigframes.pandas.Series or None: Object with missing values filled. @@ -825,13 +796,9 @@ def isna(self) -> NDFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> import numpy as np - >>> df = bpd.DataFrame(dict( ... age=[5, 6, np.nan], - ... born=[bpd.NA, "1940-04-25", "1940-04-25"], + ... born=[pd.NA, "1940-04-25", "1940-04-25"], ... name=['Alfred', 'Batman', ''], ... toy=[None, 'Batmobile', 'Joker'], ... 
)) @@ -863,7 +830,7 @@ def isna(self) -> NDFrame: Show which entries in a Series are NA: - >>> ser = bpd.Series([5, None, 6, np.nan, bpd.NA]) + >>> ser = bpd.Series([5, None, 6, np.nan, pd.NA]) >>> ser 0 5 1 @@ -1068,8 +1035,6 @@ def rolling( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([0,1,2,3,4]) >>> s.rolling(window=3).min() 0 @@ -1154,10 +1119,6 @@ def pipe( Constructing a income DataFrame from a dictionary. - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> data = [[8000, 1000], [9500, np.nan], [5000, 2000]] >>> df = bpd.DataFrame(data, columns=['Salary', 'Others']) >>> df diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index 1e39ec8f94..8dba97ff07 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -45,8 +45,6 @@ def describe(self, include: None | Literal["all"] = None): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [1, 1, 1, 2, 2], "B": [0, 2, 8, 2, 7], "C": ["cat", "cat", "dog", "mouse", "cat"]}) >>> df A B C @@ -86,8 +84,6 @@ def any(self): For SeriesGroupBy: >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([1, 2, 0], index=lst) >>> ser.groupby(level=0).any() @@ -125,8 +121,6 @@ def all(self): For SeriesGroupBy: >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([1, 2, 0], index=lst) >>> ser.groupby(level=0).all() @@ -163,10 +157,6 @@ def count(self): For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'b'] 
>>> ser = bpd.Series([1, 2, np.nan], index=lst) >>> ser.groupby(level=0).count() @@ -202,9 +192,6 @@ def mean( **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'A': [1, 1, 2, 1, 2], ... 'B': [np.nan, 2, 3, 4, 5], ... 'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C']) @@ -263,9 +250,6 @@ def median( For SeriesGroupBy: >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 4, 3, 3], index=lst) >>> ser.groupby(level=0).median() @@ -304,7 +288,6 @@ def quantile(self, q=0.5, *, numeric_only: bool = False): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame([ ... ['a', 1], ['a', 2], ['a', 3], ... ['b', 1], ['b', 3], ['b', 5] @@ -343,10 +326,6 @@ def std( For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 4, 3, 3], index=lst) >>> ser.groupby(level=0).std() @@ -390,10 +369,6 @@ def var( For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 4, 3, 3], index=lst) >>> ser.groupby(level=0).var() @@ -435,9 +410,6 @@ def rank( **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame( ... { ... "group": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], @@ -510,10 +482,6 @@ def skew( For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> ser = bpd.Series([390., 350., 357., np.nan, 22., 20., 30.], ... 
index=['Falcon', 'Falcon', 'Falcon', 'Falcon', ... 'Parrot', 'Parrot', 'Parrot'], @@ -546,9 +514,6 @@ def kurt( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'] >>> ser = bpd.Series([0, 1, 1, 0, 0, 1, 2, 4, 5], index=lst) >>> ser.groupby(level=0).kurt() @@ -579,9 +544,6 @@ def kurtosis( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'] >>> ser = bpd.Series([0, 1, 1, 0, 0, 1, 2, 4, 5], index=lst) >>> ser.groupby(level=0).kurtosis() @@ -606,9 +568,8 @@ def first(self, numeric_only: bool = False, min_count: int = -1): Defaults to skipping NA elements. **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None + >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame(dict(A=[1, 1, 3], B=[None, 5, 6], C=[1, 2, 3])) >>> df.groupby("A").first() B C @@ -647,8 +608,6 @@ def last(self, numeric_only: bool = False, min_count: int = -1): Defaults to skipping NA elements. 
**Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame(dict(A=[1, 1, 3], B=[5, None, 6], C=[1, 2, 3])) >>> df.groupby("A").last() @@ -685,9 +644,6 @@ def sum( For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) >>> ser.groupby(level=0).sum() @@ -730,10 +686,6 @@ def prod(self, numeric_only: bool = False, min_count: int = 0): For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) >>> ser.groupby(level=0).prod() @@ -766,10 +718,6 @@ def min( For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) >>> ser.groupby(level=0).min() @@ -815,9 +763,6 @@ def max( For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) >>> ser.groupby(level=0).max() @@ -859,9 +804,6 @@ def cumcount(self, ascending: bool = True): For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'b', 'b', 'c'] >>> ser = bpd.Series([5, 1, 2, 3, 4], index=lst) >>> ser.groupby(level=0).cumcount() @@ -897,10 +839,6 @@ def cumprod(self, *args, **kwargs): For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([6, 2, 0], index=lst) >>> ser.groupby(level=0).cumprod() @@ -936,10 +874,6 @@ def cumsum(self, *args, **kwargs): For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> 
bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([6, 2, 0], index=lst) >>> ser.groupby(level=0).cumsum() @@ -975,10 +909,6 @@ def cummin(self, *args, numeric_only: bool = False, **kwargs): For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([6, 2, 0], index=lst) >>> ser.groupby(level=0).cummin() @@ -1014,10 +944,6 @@ def cummax(self, *args, numeric_only: bool = False, **kwargs): For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([6, 2, 0], index=lst) >>> ser.groupby(level=0).cummax() @@ -1055,10 +981,6 @@ def diff(self): For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 4, 3, 3], index=lst) >>> ser.groupby(level=0).diff() @@ -1101,10 +1023,6 @@ def shift(self, periods: int = 1): For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) >>> ser.groupby(level=0).shift(1) @@ -1145,9 +1063,6 @@ def rolling(self, *args, **kwargs): **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'a', 'a', 'e'] >>> ser = bpd.Series([1, 0, -2, -1, 2], index=lst) >>> ser.groupby(level=0).rolling(2).min() @@ -1204,9 +1119,6 @@ def expanding(self, *args, **kwargs): **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'c', 'c', 'e'] >>> ser = bpd.Series([1, 0, -2, -1, 2], index=lst) >>> ser.groupby(level=0).expanding().min() @@ -1230,9 
+1142,6 @@ def head(self, n: int = 5): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame([[1, 2], [1, 4], [5, 6]], ... columns=['A', 'B']) >>> df.groupby('A').head(1) @@ -1259,9 +1168,6 @@ def size(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - For SeriesGroupBy: >>> lst = ['a', 'a', 'b'] @@ -1313,9 +1219,6 @@ def __iter__(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - For SeriesGroupBy: >>> lst = ["a", "a", "b"] @@ -1377,10 +1280,6 @@ def agg(self, func): **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3, 4], index=[1, 1, 2, 2]) >>> s.groupby(level=0).agg(['min', 'max']) min max @@ -1410,10 +1309,6 @@ def aggregate(self, func): **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3, 4], index=[1, 1, 2, 2]) >>> s.groupby(level=0).aggregate(['min', 'max']) min max @@ -1443,10 +1338,6 @@ def nunique(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 3], index=lst) >>> ser.groupby(level=0).nunique() @@ -1494,10 +1385,6 @@ def agg(self, func, **kwargs): **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> data = {"A": [1, 1, 2, 2], ... "B": [1, 2, 3, 4], ... "C": [0.362838, 0.227877, 1.267767, -0.562860]} @@ -1554,10 +1441,6 @@ def aggregate(self, func, **kwargs): **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> data = {"A": [1, 1, 2, 2], ... "B": [1, 2, 3, 4], ... 
"C": [0.362838, 0.227877, 1.267767, -0.562860]} @@ -1614,10 +1497,6 @@ def nunique(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam', ... 'ham', 'ham'], ... 'value1': [1, 5, 5, 2, 5, 5], @@ -1650,10 +1529,6 @@ def value_counts( **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], diff --git a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py index 0dd487d056..0e74b3e178 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py @@ -12,9 +12,6 @@ def day(self): **Examples:** - >>> import pandas as pd - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="D") ... ) @@ -42,9 +39,6 @@ def dayofweek(self): **Examples:** - >>> import pandas as pd - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range('2016-12-31', '2017-01-08', freq='D').to_series() ... ) @@ -76,9 +70,6 @@ def day_of_week(self): **Examples:** - >>> import pandas as pd - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range('2016-12-31', '2017-01-08', freq='D').to_series() ... ) @@ -106,9 +97,7 @@ def dayofyear(self): **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range('2016-12-28', '2017-01-03', freq='D').to_series() ... 
) @@ -134,9 +123,7 @@ def day_of_year(self): **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range('2016-12-28', '2017-01-03', freq='D').to_series() ... ) @@ -168,7 +155,6 @@ def date(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%d/%m/%Y %H:%M:%S%Ez") >>> s @@ -189,9 +175,7 @@ def hour(self): **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="h") ... ) @@ -215,9 +199,7 @@ def minute(self): **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="min") ... ) @@ -241,9 +223,6 @@ def month(self): **Examples:** - >>> import pandas as pd - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="M") ... ) @@ -267,9 +246,6 @@ def isocalendar(self): **Examples:** - >>> import pandas as pd - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range('2009-12-27', '2010-01-04', freq='d').to_series() ... ) @@ -287,11 +263,9 @@ def isocalendar(self): [9 rows x 3 columns] - Returns: DataFrame With columns year, week and day. - """ @property @@ -300,9 +274,7 @@ def second(self): **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="s") ... 
) @@ -331,7 +303,6 @@ def time(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") >>> s @@ -353,7 +324,6 @@ def quarter(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "4/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") >>> s @@ -374,9 +344,6 @@ def year(self): **Examples:** - >>> import pandas as pd - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="Y") ... ) @@ -400,9 +367,6 @@ def days(self): **Examples:** - >>> import pandas as pd - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([pd.Timedelta("4d3m2s1us")]) >>> s 0 4 days 00:03:02.000001 @@ -418,9 +382,6 @@ def seconds(self): **Examples:** - >>> import pandas as pd - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([pd.Timedelta("4d3m2s1us")]) >>> s 0 4 days 00:03:02.000001 @@ -436,9 +397,6 @@ def microseconds(self): **Examples:** - >>> import pandas as pd - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([pd.Timedelta("4d3m2s1us")]) >>> s 0 4 days 00:03:02.000001 @@ -453,9 +411,6 @@ def total_seconds(self): **Examples:** - >>> import pandas as pd - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([pd.Timedelta("1d1m1s1us")]) >>> s 0 1 days 00:01:01.000001 @@ -472,7 +427,6 @@ def tz(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = 
bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") >>> s @@ -495,7 +449,6 @@ def unit(self) -> str: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") >>> s diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index eba47fc1f9..04f7f5938d 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -32,9 +32,6 @@ def name(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([1, 2, 3], name='x') >>> idx Index([1, 2, 3], dtype='Int64', name='x') @@ -63,9 +60,6 @@ def values(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([1, 2, 3]) >>> idx Index([1, 2, 3], dtype='Int64') @@ -86,9 +80,6 @@ def ndim(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) >>> s 0 Ant @@ -121,9 +112,6 @@ def size(self) -> int: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - For Series: >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) @@ -156,9 +144,6 @@ def is_monotonic_increasing(self) -> bool: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bool(bpd.Index([1, 2, 3]).is_monotonic_increasing) True @@ -181,9 +166,6 @@ def is_monotonic_decreasing(self) -> bool: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bool(bpd.Index([3, 2, 1]).is_monotonic_decreasing) True @@ -206,9 +188,6 @@ def from_frame(cls, frame) -> Index: **Examples:** - >>> import 
bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame([['HI', 'Temp'], ['HI', 'Precip'], ... ['NJ', 'Temp'], ['NJ', 'Precip']], ... columns=['a', 'b']) @@ -246,9 +225,6 @@ def shape(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([1, 2, 3]) >>> idx Index([1, 2, 3], dtype='Int64') @@ -268,9 +244,6 @@ def nlevels(self) -> int: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> mi = bpd.MultiIndex.from_arrays([['a'], ['b'], ['c']]) >>> mi MultiIndex([('a', 'b', 'c')], @@ -290,9 +263,6 @@ def is_unique(self) -> bool: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([1, 5, 7, 7]) >>> idx.is_unique False @@ -313,9 +283,6 @@ def has_duplicates(self) -> bool: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([1, 5, 7, 7]) >>> bool(idx.has_duplicates) True @@ -336,9 +303,6 @@ def dtype(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([1, 2, 3]) >>> idx Index([1, 2, 3], dtype='Int64') @@ -364,9 +328,6 @@ def T(self) -> Index: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) >>> s 0 Ant @@ -403,9 +364,6 @@ def copy( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index(['a', 'b', 'c']) >>> new_idx = idx.copy() >>> idx is new_idx @@ -438,14 +396,10 @@ def astype(self, dtype): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([1, 2, 3]) >>> idx Index([1, 2, 3], dtype='Int64') - Args: dtype (str, data type, or pandas.ExtensionDtype): A dtype supported by BigQuery DataFrame 
include ``'boolean'``, @@ -487,9 +441,6 @@ def get_level_values(self, level) -> Index: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index(list('abc')) >>> idx Index(['a', 'b', 'c'], dtype='string') @@ -517,9 +468,6 @@ def to_series(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index(['Ant', 'Bear', 'Cow'], name='animal') By default, the original index and original name is reused. @@ -571,9 +519,6 @@ def isin(self, values): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([1,2,3]) >>> idx Index([1, 2, 3], dtype='Int64') @@ -611,9 +556,6 @@ def all(self) -> bool: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - True, because nonzero integers are considered True. >>> bool(bpd.Index([1, 2, 3]).all()) @@ -639,9 +581,6 @@ def any(self) -> bool: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> index = bpd.Index([0, 1, 2]) >>> bool(index.any()) True @@ -665,9 +604,6 @@ def min(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([3, 2, 1]) >>> int(idx.min()) 1 @@ -687,9 +623,6 @@ def max(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([3, 2, 1]) >>> int(idx.max()) 3 @@ -713,9 +646,6 @@ def argmin(self) -> int: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Consider dataset containing cereal calories >>> s = bpd.Series({'Corn Flakes': 100.0, 'Almond Delight': 110.0, @@ -750,9 +680,6 @@ def get_loc( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> unique_index = bpd.Index(list('abc')) >>> 
unique_index.get_loc('b') 1 @@ -794,9 +721,6 @@ def argmax(self) -> int: Consider dataset containing cereal calories - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series({'Corn Flakes': 100.0, 'Almond Delight': 110.0, ... 'Cinnamon Toast Crunch': 120.0, 'Cocoa Puff': 110.0}) >>> s @@ -828,9 +752,6 @@ def nunique(self) -> int: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 3, 5, 7, 7]) >>> s 0 1 @@ -860,9 +781,6 @@ def sort_values( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([10, 100, 1, 1000]) >>> idx Index([10, 100, 1, 1000], dtype='Int64') @@ -904,10 +822,6 @@ def value_counts( **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> index = bpd.Index([3, 1, 2, 3, 4, np.nan]) >>> index.value_counts() 3.0 2 @@ -961,10 +875,6 @@ def fillna(self, value) -> Index: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([np.nan, np.nan, 3]) >>> idx.fillna(0) Index([0.0, 0.0, 3.0], dtype='Float64') @@ -992,9 +902,6 @@ def rename(self, name, *, inplace): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index(['A', 'C', 'A', 'B'], name='score') >>> idx.rename('grade') Index(['A', 'C', 'A', 'B'], dtype='string', name='grade') @@ -1022,9 +929,6 @@ def drop(self, labels) -> Index: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index(['a', 'b', 'c']) >>> idx.drop(['a']) Index(['b', 'c'], dtype='string') @@ -1042,10 +946,6 @@ def dropna(self, how: typing.Literal["all", "any"] = "any"): **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> 
bpd.options.display.progress_bar = None - >>> idx = bpd.Index([1, np.nan, 3]) >>> idx.dropna() Index([1.0, 3.0], dtype='Float64') @@ -1070,11 +970,9 @@ def drop_duplicates(self, *, keep: str = "first"): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Generate an pandas.Index with duplicate values. + >>> import bigframes.pandas as bpd >>> idx = bpd.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo']) The keep parameter controls which duplicate values are removed. @@ -1113,8 +1011,6 @@ def unique(self, level: Hashable | int | None = None): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> idx = bpd.Index([1, 1, 2, 3, 3]) >>> idx.unique() Index([1, 2, 3], dtype='Int64') @@ -1134,8 +1030,6 @@ def item(self, *args, **kwargs): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1], index=['a']) >>> s.index.item() 'a' diff --git a/third_party/bigframes_vendored/pandas/core/indexes/datetimes.py b/third_party/bigframes_vendored/pandas/core/indexes/datetimes.py index 105a376728..973d5c763a 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/datetimes.py @@ -15,10 +15,6 @@ def year(self) -> base.Index: **Examples:** - >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.year Index([2025], dtype='Int64') @@ -31,10 +27,6 @@ def month(self) -> base.Index: **Examples:** - >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.month Index([2], dtype='Int64') @@ -47,10 +39,6 @@ def day(self) -> base.Index: **Examples:** - >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> 
bpd.options.display.progress_bar = None - >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.day Index([15], dtype='Int64') @@ -63,10 +51,6 @@ def day_of_week(self) -> base.Index: **Examples:** - >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.day_of_week Index([5], dtype='Int64') @@ -79,10 +63,6 @@ def dayofweek(self) -> base.Index: **Examples:** - >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.dayofweek Index([5], dtype='Int64') @@ -95,10 +75,6 @@ def weekday(self) -> base.Index: **Examples:** - >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.weekday Index([5], dtype='Int64') diff --git a/third_party/bigframes_vendored/pandas/core/indexes/multi.py b/third_party/bigframes_vendored/pandas/core/indexes/multi.py index a882aa40e3..018e638de3 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/multi.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/multi.py @@ -25,8 +25,6 @@ def from_tuples( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> tuples = [(1, 'red'), (1, 'blue'), ... 
(2, 'red'), (2, 'blue')] >>> bpd.MultiIndex.from_tuples(tuples, names=('number', 'color')) @@ -62,8 +60,6 @@ def from_arrays( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] >>> bpd.MultiIndex.from_arrays(arrays, names=('number', 'color')) MultiIndex([(1, 'red'), diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index 697c17f23c..0f42433384 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -34,8 +34,6 @@ def cut( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([0, 1, 5, 10]) >>> s 0 0 @@ -73,7 +71,6 @@ def cut( Cut with pd.IntervalIndex, requires importing pandas for IntervalIndex: - >>> import pandas as pd >>> interval_index = pd.IntervalIndex.from_tuples([(0, 1), (1, 5), (5, 20)]) >>> bpd.cut(s, bins=interval_index) 0 diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 932959a826..c6ec5dfaf1 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -38,9 +38,6 @@ def dt(self): **Examples:** >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> bpd.options.display.progress_bar = None - >>> seconds_series = bpd.Series(pd.date_range("2000-01-01", periods=3, freq="s")) >>> seconds_series 0 2000-01-01 00:00:00 @@ -110,9 +107,6 @@ def index(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can access the index of a Series via ``index`` property. 
>>> df = bpd.DataFrame({'Name': ['Alice', 'Bob', 'Aritra'], @@ -161,13 +155,10 @@ def shape(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 4, 9, 16]) >>> s.shape (4,) - >>> s = bpd.Series(['Alice', 'Bob', bpd.NA]) + >>> s = bpd.Series(['Alice', 'Bob', pd.NA]) >>> s.shape (3,) """ @@ -180,9 +171,6 @@ def dtype(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3]) >>> s.dtype Int64Dtype() @@ -200,9 +188,6 @@ def name(self) -> Hashable: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - For a Series: >>> s = bpd.Series([1, 2, 3], dtype="Int64", name='Numbers') @@ -248,9 +233,6 @@ def hasnans(self) -> bool: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3, None]) >>> s 0 1.0 @@ -272,9 +254,6 @@ def T(self) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) >>> s 0 Ant @@ -297,9 +276,6 @@ def transpose(self) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) >>> s 0 Ant @@ -337,10 +313,6 @@ def reset_index( **Examples:** - >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3, 4], name='foo', ... 
index=['a', 'b', 'c', 'd']) >>> s.index.name = "idx" @@ -440,9 +412,6 @@ def keys(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3], index=[0, 1, 2]) >>> s.keys() Index([0, 1, 2], dtype='Int64') @@ -522,9 +491,6 @@ def to_markdown( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(["elk", "pig", "dog", "quetzal"], name="animal") >>> print(s.to_markdown()) | | animal | @@ -577,16 +543,14 @@ def to_dict( **Examples:** - >>> import bigframes.pandas as bpd >>> from collections import OrderedDict, defaultdict - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3, 4]) >>> s.to_dict() {np.int64(0): 1, np.int64(1): 2, np.int64(2): 3, np.int64(3): 4} >>> s.to_dict(into=OrderedDict) - OrderedDict({np.int64(0): 1, np.int64(1): 2, np.int64(2): 3, np.int64(3): 4}) + OrderedDict([(np.int64(0), 1), (np.int64(1), 2), (np.int64(2), 3), (np.int64(3), 4)]) >>> dd = defaultdict(list) >>> s.to_dict(into=dd) @@ -617,9 +581,6 @@ def to_frame(self, name=None) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(["a", "b", "c"], ... 
name="vals") >>> s.to_frame() @@ -714,9 +675,6 @@ def tolist(self, *, allow_large_results: Optional[bool] = None) -> list: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3]) >>> s 0 1 @@ -748,10 +706,6 @@ def to_numpy( **Examples:** - >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> bpd.options.display.progress_bar = None - >>> ser = bpd.Series(pd.Categorical(['a', 'b', 'a'])) >>> ser.to_numpy() array(['a', 'b', 'a'], dtype=object) @@ -803,9 +757,6 @@ def to_pickle(self, path, *, allow_large_results=None, **kwargs): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> original_df = bpd.DataFrame({"foo": range(5), "bar": range(5, 10)}) >>> original_df foo bar @@ -865,9 +816,6 @@ def agg(self, func): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3, 4]) >>> s 0 1 @@ -902,10 +850,7 @@ def count(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.Series([0.0, 1.0, bpd.NA]) + >>> s = bpd.Series([0.0, 1.0, pd.NA]) >>> s 0 0.0 1 1.0 @@ -928,9 +873,6 @@ def nunique(self) -> int: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 3, 5, 7, 7]) >>> s 0 1 @@ -963,9 +905,6 @@ def unique(self, keep_order=True) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([2, 1, 3, 3], name='A') >>> s 0 2 @@ -1006,9 +945,6 @@ def mode(self) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([2, 4, 8, 2, 4, None]) >>> s.mode() 0 2.0 @@ -1031,11 +967,9 @@ def drop_duplicates( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Generate a 
Series with duplicated entries. + >>> import bigframes.pandas as bpd >>> s = bpd.Series(['llama', 'cow', 'llama', 'beetle', 'llama', 'hippo'], ... name='animal') >>> s @@ -1101,7 +1035,6 @@ def duplicated(self, keep="first") -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None By default, for each set of duplicated values, the first occurrence is set on False and all others on True: @@ -1172,9 +1105,6 @@ def idxmin(self) -> Hashable: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(data=[1, None, 4, 1], ... index=['A', 'B', 'C', 'D']) >>> s @@ -1201,9 +1131,6 @@ def idxmax(self) -> Hashable: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(data=[1, None, 4, 3, 4], ... index=['A', 'B', 'C', 'D', 'E']) >>> s @@ -1229,8 +1156,6 @@ def round(self, decimals: int = 0) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([0.1, 1.3, 2.7]) >>> s.round() 0 0.0 @@ -1262,9 +1187,6 @@ def explode(self, *, ignore_index: Optional[bool] = False) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([[1, 2, 3], [], [3, 4]]) >>> s 0 [1 2 3] @@ -1301,9 +1223,6 @@ def corr(self, other, method="pearson", min_periods=None) -> float: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s1 = bpd.Series([.2, .0, .6, .2]) >>> s2 = bpd.Series([.3, .6, .0, .1]) >>> s1.corr(s2) @@ -1340,8 +1259,6 @@ def autocorr(self, lag: int = 1) -> float: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([0.25, 0.5, 0.2, -0.05]) >>> s.autocorr() # doctest: +ELLIPSIS np.float64(0.10355263309024067) @@ -1377,9 +1294,6 @@ def cov( **Examples:** - >>> import 
bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s1 = bpd.Series([0.90010907, 0.13484424, 0.62036035]) >>> s2 = bpd.Series([0.12528585, 0.26962463, 0.51111198]) >>> s1.cov(s2) @@ -1403,12 +1317,8 @@ def diff(self) -> Series: Calculates the difference of a Series element compared with another element in the Series (default is element in previous row). - **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Difference with previous row >>> s = bpd.Series([1, 1, 2, 3, 5, 8]) @@ -1472,9 +1382,6 @@ def dot(self, other) -> Series | np.ndarray: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([0, 1, 2, 3]) >>> other = bpd.Series([-1, 2, -3, 4]) >>> s.dot(other) @@ -1496,7 +1403,6 @@ def dot(self, other) -> Series | np.ndarray: Series and each rows of other if other is a DataFrame or a numpy.ndarray between the Series and each columns of the numpy array. - """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1529,10 +1435,6 @@ def sort_values( **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([np.nan, 1, 3, 10, 5]) >>> s 0 @@ -1628,10 +1530,6 @@ def sort_index( **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, 4]) >>> s.sort_index() 1 c @@ -1690,8 +1588,6 @@ def nlargest( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> countries_population = {"Italy": 59000000, "France": 65000000, ... "Malta": 434000, "Maldives": 434000, ... 
"Brunei": 434000, "Iceland": 337000, @@ -1776,8 +1672,6 @@ def nsmallest(self, n: int = 5, keep: str = "first") -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> countries_population = {"Italy": 59000000, "France": 65000000, ... "Malta": 434000, "Maldives": 434000, ... "Brunei": 434000, "Iceland": 337000, @@ -1864,7 +1758,6 @@ def apply( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None For applying arbitrary python function a `remote_function` is recommended. Let's use ``reuse=False`` flag to make sure a new `remote_function` @@ -1872,9 +1765,13 @@ def apply( to potentially reuse a previously deployed `remote_function` from the same user defined function. - >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") - ... def minutes_to_hours(x: int) -> float: + >>> def minutes_to_hours(x: int) -> float: ... return x/60 + >>> bpd.deploy_remote_function( # doctest: +SKIP + ... minutes_to_hours, + ... reuse=False, + ... cloud_function_service_account="default", + ... ) >>> minutes = bpd.Series([0, 30, 60, 90, 120]) >>> minutes @@ -1885,7 +1782,7 @@ def apply( 4 120 dtype: Int64 - >>> hours = minutes.apply(minutes_to_hours) + >>> hours = minutes.apply(minutes_to_hours) # doctest: +SKIP >>> hours 0 0.0 1 0.5 @@ -1898,7 +1795,7 @@ def apply( a `remote_function`, you would provide the names of the packages via `packages` param. - >>> @bpd.remote_function( + >>> @bpd.remote_function( # doctest: +SKIP ... reuse=False, ... packages=["cryptography"], ... cloud_function_service_account="default" @@ -1915,11 +1812,11 @@ def apply( ... return f.encrypt(input.encode()).decode() >>> names = bpd.Series(["Alice", "Bob"]) - >>> hashes = names.apply(get_hash) + >>> hashes = names.apply(get_hash) # doctest: +SKIP You could return an array output from the remote function. 
- >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") # doctest: +SKIP ... def text_analyzer(text: str) -> list[int]: ... words = text.count(" ") + 1 ... periods = text.count(".") @@ -1932,8 +1829,8 @@ def apply( ... "I love this product! It's amazing.", ... "Hungry? Wanna eat? Lets go!" ... ]) - >>> features = texts.apply(text_analyzer) - >>> features + >>> features = texts.apply(text_analyzer) # doctest: +SKIP + >>> features # doctest: +SKIP 0 [9 1 0 0] 1 [6 1 1 0] 2 [5 0 1 2] @@ -2006,8 +1903,6 @@ def combine( **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None Consider 2 Datasets ``s1`` and ``s2`` containing highest clocked speeds of different birds. @@ -2065,9 +1960,6 @@ def groupby( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can group by a named index level. >>> s = bpd.Series([380, 370., 24., 26.], @@ -2089,7 +1981,6 @@ def groupby( You can also group by more than one index levels. - >>> import pandas as pd >>> s = bpd.Series([380, 370., 24., 26.], ... index=pd.MultiIndex.from_tuples( ... [("Falcon", "Clear"), @@ -2238,9 +2129,6 @@ def drop( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(data=np.arange(3), index=['A', 'B', 'C']) >>> s A 0 @@ -2256,7 +2144,6 @@ def drop( Drop 2nd level label in MultiIndex Series: - >>> import pandas as pd >>> midx = pd.MultiIndex(levels=[['llama', 'cow', 'falcon'], ... ['speed', 'weight', 'length']], ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], @@ -2322,7 +2209,6 @@ def reorder_levels(self, order: Sequence, axis) -> Series: axis ({0 or 'index', 1 or 'columns'}, default 0): For `Series` this parameter is unused and defaults to 0. 
- Returns: type of caller (new object) """ @@ -2369,10 +2255,6 @@ def interpolate(self, method: str = "linear"): **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - Filling in NaN in a Series via linear interpolation. >>> s = bpd.Series([0, 1, np.nan, 3]) @@ -2414,10 +2296,6 @@ def fillna( **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([np.nan, 2, np.nan, -1]) >>> s 0 @@ -2470,8 +2348,6 @@ def replace( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3, 4, 5]) >>> s 0 1 @@ -2596,10 +2472,6 @@ def dropna(self, *, axis=0, inplace: bool = False, how=None) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - Drop NA values from a Series: >>> ser = bpd.Series([1., 2., np.nan]) @@ -2616,7 +2488,7 @@ def dropna(self, *, axis=0, inplace: bool = False, how=None) -> Series: Empty strings are not considered NA values. ``None`` is considered an NA value. 
- >>> ser = bpd.Series(['2', bpd.NA, '', None, 'I stay'], dtype='object') + >>> ser = bpd.Series(['2', pd.NA, '', None, 'I stay'], dtype='object') >>> ser 0 2 1 @@ -2660,10 +2532,6 @@ def between( **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - Boundary values are included by default: >>> s = bpd.Series([2, 0, 4, 8, np.nan]) @@ -2719,10 +2587,6 @@ def case_when( **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> c = bpd.Series([6, 7, 8, 9], name="c") >>> a = bpd.Series([0, 0, 1, 2]) >>> b = bpd.Series([0, 3, 4, 5]) @@ -2789,9 +2653,6 @@ def cumprod(self): **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s 0 2.0 @@ -2826,10 +2687,6 @@ def cumsum(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s 0 2.0 @@ -2869,10 +2726,6 @@ def cummax(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s 0 2.0 @@ -2892,7 +2745,6 @@ def cummax(self): 4 5.0 dtype: Float64 - Returns: bigframes.pandas.Series: Return cumulative maximum of scalar or Series. 
@@ -2908,10 +2760,6 @@ def cummin(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s 0 2.0 @@ -2945,10 +2793,6 @@ def eq(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -2990,10 +2834,6 @@ def ne(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3037,10 +2877,6 @@ def le(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3083,10 +2919,6 @@ def lt(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3130,10 +2962,6 @@ def ge(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3177,10 +3005,6 @@ def gt(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3223,10 +3047,7 @@ def add(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - - >>> a = bpd.Series([1, 2, 3, bpd.NA]) + >>> a = bpd.Series([1, 2, 3, pd.NA]) >>> a 0 1 1 
2 @@ -3287,9 +3108,6 @@ def __add__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1.5, 2.6], index=['elk', 'moose']) >>> s elk 1.5 @@ -3339,10 +3157,6 @@ def radd(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3404,10 +3218,6 @@ def sub( **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3449,9 +3259,6 @@ def __sub__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1.5, 2.6], index=['elk', 'moose']) >>> s elk 1.5 @@ -3501,10 +3308,6 @@ def rsub(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3563,10 +3366,6 @@ def mul(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3609,9 +3408,6 @@ def __mul__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can multiply with a scalar: >>> s = bpd.Series([1, 2, 3]) @@ -3649,10 +3445,6 @@ def rmul(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3710,10 +3502,6 @@ def truediv(self, other) -> Series: **Examples:** - >>> import 
bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3756,9 +3544,6 @@ def __truediv__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can multiply with a scalar: >>> s = bpd.Series([1, 2, 3]) @@ -3796,10 +3581,6 @@ def rtruediv(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3858,10 +3639,6 @@ def floordiv(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3904,9 +3681,6 @@ def __floordiv__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can divide by a scalar: >>> s = bpd.Series([15, 30, 45]) @@ -3944,10 +3718,6 @@ def rfloordiv(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -4006,10 +3776,6 @@ def mod(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -4052,9 +3818,6 @@ def __mod__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can modulo with a scalar: >>> s = bpd.Series([1, 2, 3]) @@ -4091,10 +3854,6 @@ def rmod(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> 
bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -4155,9 +3914,6 @@ def pow(self, other) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -4201,9 +3957,6 @@ def __pow__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can exponentiate with a scalar: >>> s = bpd.Series([1, 2, 3]) @@ -4242,9 +3995,6 @@ def rpow(self, other) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -4304,10 +4054,6 @@ def divmod(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -4356,10 +4102,6 @@ def rdivmod(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -4411,10 +4153,6 @@ def combine_first(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> s1 = bpd.Series([1, np.nan]) >>> s2 = bpd.Series([3, 4, 5]) >>> s1.combine_first(s2) @@ -4453,11 +4191,6 @@ def update(self, other) -> None: **Examples:** - >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3]) >>> s.update(bpd.Series([4, 5, 6])) >>> s @@ -4547,10 +4280,6 @@ def any( **Examples:** - >>> import bigframes.pandas as 
bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - For Series input, the output is a scalar indicating whether any element is True. >>> bpd.Series([False, False]).any() @@ -4583,9 +4312,6 @@ def max( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Calculating the max of a Series: >>> s = bpd.Series([1, 3]) @@ -4599,7 +4325,7 @@ def max( Calculating the max of a Series containing ``NA`` values: - >>> s = bpd.Series([1, 3, bpd.NA]) + >>> s = bpd.Series([1, 3, pd.NA]) >>> s 0 1 1 3 @@ -4625,9 +4351,6 @@ def min( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Calculating the min of a Series: >>> s = bpd.Series([1, 3]) @@ -4641,7 +4364,7 @@ def min( Calculating the min of a Series containing ``NA`` values: - >>> s = bpd.Series([1, 3, bpd.NA]) + >>> s = bpd.Series([1, 3, pd.NA]) >>> s 0 1 1 3 @@ -4666,9 +4389,6 @@ def std( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'person_id': [0, 1, 2, 3], ... 'age': [21, 25, 62, 43], ... 
'height': [1.61, 1.87, 1.49, 2.01]} @@ -4714,9 +4434,6 @@ def sum(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Calculating the sum of a Series: >>> s = bpd.Series([1, 3]) @@ -4730,7 +4447,7 @@ def sum(self): Calculating the sum of a Series containing ``NA`` values: - >>> s = bpd.Series([1, 3, bpd.NA]) + >>> s = bpd.Series([1, 3, pd.NA]) >>> s 0 1 1 3 @@ -4750,9 +4467,6 @@ def mean(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Calculating the mean of a Series: >>> s = bpd.Series([1, 3]) @@ -4766,7 +4480,7 @@ def mean(self): Calculating the mean of a Series containing ``NA`` values: - >>> s = bpd.Series([1, 3, bpd.NA]) + >>> s = bpd.Series([1, 3, pd.NA]) >>> s 0 1 1 3 @@ -4787,8 +4501,6 @@ def median(self, *, exact: bool = True): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3]) >>> s.median() np.float64(2.0) @@ -4828,8 +4540,6 @@ def quantile( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3, 4]) >>> s.quantile(.5) np.float64(2.5) @@ -4880,9 +4590,6 @@ def describe(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['A', 'A', 'B']) >>> s 0 A @@ -4908,9 +4615,6 @@ def skew(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3]) >>> s.skew() np.float64(0.0) @@ -4946,9 +4650,6 @@ def kurt(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 2, 3], index=['cat', 'dog', 'dog', 'mouse']) >>> s cat 1 @@ -4989,9 +4690,6 @@ def item(self: Series, *args, **kwargs): **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar 
= None >>> s = bpd.Series([1]) >>> s.item() np.int64(1) @@ -5013,9 +4711,6 @@ def items(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['A', 'B', 'C']) >>> for index, value in s.items(): ... print(f"Index : {index}, Value : {value}") @@ -5035,9 +4730,6 @@ def where(self, cond, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([10, 11, 12, 13, 14]) >>> s 0 10 @@ -5103,9 +4795,6 @@ def mask(self, cond, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([10, 11, 12, 13, 14]) >>> s 0 10 @@ -5149,7 +4838,7 @@ def mask(self, cond, other): condition is evaluated based on a complicated business logic which cannot be expressed in form of a Series. - >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") # doctest: +SKIP ... def should_mask(name: str) -> bool: ... hash = 0 ... for char_ in name: @@ -5162,12 +4851,12 @@ def mask(self, cond, other): 1 Bob 2 Caroline dtype: string - >>> s.mask(should_mask) + >>> s.mask(should_mask) # doctest: +SKIP 0 1 Bob 2 Caroline dtype: string - >>> s.mask(should_mask, "REDACTED") + >>> s.mask(should_mask, "REDACTED") # doctest: +SKIP 0 REDACTED 1 Bob 2 Caroline @@ -5261,9 +4950,6 @@ def argmax(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Consider dataset containing cereal calories. >>> s = bpd.Series({'Corn Flakes': 100.0, 'Almond Delight': 110.0, @@ -5299,9 +4985,6 @@ def argmin(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Consider dataset containing cereal calories. 
>>> s = bpd.Series({'Corn Flakes': 100.0, 'Almond Delight': 110.0, @@ -5340,9 +5023,6 @@ def rename(self, index, *, inplace, **kwargs): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3]) >>> s 0 1 @@ -5392,9 +5072,6 @@ def rename_axis(self, mapper, *, inplace, **kwargs): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Series >>> s = bpd.Series(["dog", "cat", "monkey"]) @@ -5457,10 +5134,7 @@ def value_counts( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.Series([3, 1, 2, 3, 4, bpd.NA], dtype="Int64") + >>> s = bpd.Series([3, 1, 2, 3, 4, pd.NA], dtype="Int64") >>> s 0 3 @@ -5536,8 +5210,6 @@ def str(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(["A_Str_Series"]) >>> s 0 A_Str_Series @@ -5565,8 +5237,6 @@ def plot(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> ser = bpd.Series([1, 2, 3, 3]) >>> plot = ser.plot(kind='hist', title="My plot") >>> plot @@ -5592,9 +5262,6 @@ def isin(self, values): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['llama', 'cow', 'llama', 'beetle', 'llama', ... 
'hippo'], name='animal') >>> s @@ -5658,9 +5325,6 @@ def is_monotonic_increasing(self) -> bool: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 2]) >>> s.is_monotonic_increasing np.True_ @@ -5682,9 +5346,6 @@ def is_monotonic_decreasing(self) -> bool: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([3, 2, 2, 1]) >>> s.is_monotonic_decreasing np.True_ @@ -5725,9 +5386,7 @@ def map( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.Series(['cat', 'dog', bpd.NA, 'rabbit']) + >>> s = bpd.Series(['cat', 'dog', pd.NA, 'rabbit']) >>> s 0 cat 1 dog @@ -5747,7 +5406,7 @@ def map( It also accepts a remote function: - >>> @bpd.remote_function(cloud_function_service_account="default") + >>> @bpd.remote_function(cloud_function_service_account="default") # doctest: +SKIP ... def my_mapper(val: str) -> str: ... vowels = ["a", "e", "i", "o", "u"] ... if val: @@ -5756,7 +5415,7 @@ def map( ... ]) ... return "N/A" - >>> s.map(my_mapper) + >>> s.map(my_mapper) # doctest: +SKIP 0 cAt 1 dOg 2 N/A @@ -5790,9 +5449,6 @@ def iloc(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4}, ... {'a': 100, 'b': 200, 'c': 300, 'd': 400}, ... {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000}] @@ -5870,9 +5526,6 @@ def loc(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame([[1, 2], [4, 5], [7, 8]], ... index=['cobra', 'viper', 'sidewinder'], ... columns=['max_speed', 'shield']) @@ -5957,9 +5610,6 @@ def iat(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], ... 
columns=['A', 'B', 'C']) >>> df @@ -5992,9 +5642,6 @@ def at(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], ... index=[4, 5, 6], columns=['A', 'B', 'C']) >>> df @@ -6028,9 +5675,6 @@ def values(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bpd.Series([1, 2, 3]).values array([1, 2, 3]) @@ -6050,9 +5694,6 @@ def size(self) -> int: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - For Series: >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) @@ -6087,10 +5728,6 @@ def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> import numpy as np - >>> ser = bpd.Series([1, 2, 3]) >>> np.asarray(ser) @@ -6115,9 +5752,6 @@ def __len__(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3]) >>> len(s) 3 @@ -6131,9 +5765,6 @@ def __invert__(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> ser = bpd.Series([True, False, True]) >>> ~ser 0 False @@ -6152,9 +5783,6 @@ def __and__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([0, 1, 2, 3]) You can operate with a scalar. @@ -6191,9 +5819,6 @@ def __or__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([0, 1, 2, 3]) You can operate with a scalar. @@ -6230,9 +5855,6 @@ def __xor__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([0, 1, 2, 3]) You can operate with a scalar. 
@@ -6269,9 +5891,6 @@ def __getitem__(self, indexer): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([15, 30, 45]) >>> s[1] np.int64(30) diff --git a/third_party/bigframes_vendored/pandas/core/strings/accessor.py b/third_party/bigframes_vendored/pandas/core/strings/accessor.py index fe94bf3049..7a37eba341 100644 --- a/third_party/bigframes_vendored/pandas/core/strings/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/strings/accessor.py @@ -20,8 +20,6 @@ def __getitem__(self, key: typing.Union[int, slice]): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['Alice', 'Bob', 'Charlie']) >>> s.str[0] 0 A @@ -53,12 +51,10 @@ def extract(self, pat: str, flags: int = 0): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - A pattern with two groups will return a DataFrame with two columns. Non-matches will be `NaN`. + >>> import bigframes.pandas as bpd >>> s = bpd.Series(['a1', 'b2', 'c3']) >>> s.str.extract(r'([ab])(\\d)') 0 1 @@ -115,8 +111,6 @@ def find(self, sub, start: int = 0, end=None): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> ser = bpd.Series(["cow_", "duck_", "do_ve"]) >>> ser.str.find("_") 0 3 @@ -145,12 +139,10 @@ def len(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Returns the length (number of characters) in a string. - >>> s = bpd.Series(['dog', '', bpd.NA]) + >>> import bigframes.pandas as bpd + >>> s = bpd.Series(['dog', '', pd.NA]) >>> s.str.len() 0 3 1 0 @@ -172,8 +164,6 @@ def lower(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['lower', ... 'CAPITALS', ... 
'this is a sentence', @@ -197,8 +187,6 @@ def slice(self, start=None, stop=None): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(["koala", "dog", "chameleon"]) >>> s 0 koala @@ -250,13 +238,11 @@ def strip(self, to_strip: typing.Optional[str] = None): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([ ... '1. Ant.', ... ' 2. Bee? ', ... '\\t3. Cat!\\n', - ... bpd.NA, + ... pd.NA, ... ]) >>> s.str.strip() 0 1. Ant. @@ -293,8 +279,6 @@ def upper(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['lower', ... 'CAPITALS', ... 'this is a sentence', @@ -322,8 +306,6 @@ def isnumeric(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s1 = bpd.Series(['one', 'one1', '1', '']) >>> s1.str.isnumeric() 0 False @@ -349,8 +331,6 @@ def isalpha(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s1 = bpd.Series(['one', 'one1', '1', '']) >>> s1.str.isalpha() 0 True @@ -375,8 +355,6 @@ def isdigit(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['23', '1a', '1/5', '']) >>> s.str.isdigit() 0 True @@ -401,8 +379,6 @@ def isalnum(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s1 = bpd.Series(['one', 'one1', '1', '']) >>> s1.str.isalnum() 0 True @@ -439,8 +415,6 @@ def isspace(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([' ', '\\t\\r\\n ', '']) >>> s.str.isspace() 0 True @@ -465,8 +439,6 @@ def islower(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['leopard', 'Golden Eagle', 'SNAKE', 
'']) >>> s.str.islower() 0 True @@ -492,8 +464,6 @@ def isupper(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) >>> s.str.isupper() 0 False @@ -518,12 +488,10 @@ def isdecimal(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - The `isdecimal` method checks for characters used to form numbers in base 10. + >>> import bigframes.pandas as bpd >>> s = bpd.Series(['23', '³', '⅕', '']) >>> s.str.isdecimal() 0 True @@ -550,9 +518,7 @@ def rstrip(self, to_strip: typing.Optional[str] = None): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.Series(['Ant', ' Bee ', '\tCat\n', bpd.NA]) + >>> s = bpd.Series(['Ant', ' Bee ', '\tCat\n', pd.NA]) >>> s.str.rstrip() 0 Ant 1 Bee @@ -583,9 +549,7 @@ def lstrip(self, to_strip: typing.Optional[str] = None): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.Series(['Ant', ' Bee ', '\tCat\n', bpd.NA]) + >>> s = bpd.Series(['Ant', ' Bee ', '\tCat\n', pd.NA]) >>> s.str.lstrip() 0 Ant 1 Bee @@ -611,8 +575,6 @@ def repeat(self, repeats: int): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['a', 'b', 'c']) >>> s 0 a @@ -645,8 +607,6 @@ def capitalize(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['lower', ... 'CAPITALS', ... 'this is a sentence', @@ -672,11 +632,9 @@ def cat(self, others, *, join): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can concatenate each string in a Series to another string. 
+ >>> import bigframes.pandas as bpd >>> s = bpd.Series(['Jane', 'John']) >>> s.str.cat(" Doe") 0 Jane Doe @@ -729,11 +687,9 @@ def contains(self, pat, case: bool = True, flags: int = 0, *, regex: bool = True **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Returning a Series of booleans using only a literal pattern. + >>> import bigframes.pandas as bpd >>> s1 = bpd.Series(['Mouse', 'dog', 'house and parrot', '23', None]) >>> s1.str.contains('og') 0 False @@ -833,14 +789,12 @@ def replace( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - When *pat* is a string and *regex* is True, the given *pat* is compiled as a regex. When *repl* is a string, it replaces matching regex patterns as with `re.sub()`. NaN value(s) in the Series are left as is: - >>> s = bpd.Series(['foo', 'fuz', bpd.NA]) + >>> import bigframes.pandas as bpd + >>> s = bpd.Series(['foo', 'fuz', pd.NA]) >>> s.str.replace('f.', 'ba', regex=True) 0 bao 1 baz @@ -850,7 +804,7 @@ def replace( When *pat* is a string and *regex* is False, every *pat* is replaced with *repl* as with `str.replace()`: - >>> s = bpd.Series(['f.o', 'fuz', bpd.NA]) + >>> s = bpd.Series(['f.o', 'fuz', pd.NA]) >>> s.str.replace('f.', 'ba', regex=False) 0 bao 1 fuz @@ -896,9 +850,7 @@ def startswith( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.Series(['bat', 'Bear', 'caT', bpd.NA]) + >>> s = bpd.Series(['bat', 'Bear', 'caT', pd.NA]) >>> s 0 bat 1 Bear @@ -941,9 +893,7 @@ def endswith( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.Series(['bat', 'bear', 'caT', bpd.NA]) + >>> s = bpd.Series(['bat', 'bear', 'caT', pd.NA]) >>> s 0 bat 1 bear @@ -987,9 +937,6 @@ def split( **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series( 
... [ ... "a regular sentence", @@ -1031,8 +978,6 @@ def match(self, pat: str, case: bool = True, flags: int = 0): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> ser = bpd.Series(["horse", "eagle", "donkey"]) >>> ser.str.match("e") 0 False @@ -1060,8 +1005,6 @@ def fullmatch(self, pat: str, case: bool = True, flags: int = 0): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> ser = bpd.Series(["cat", "duck", "dove"]) >>> ser.str.fullmatch(r'd.+') 0 False @@ -1092,8 +1035,6 @@ def get(self, i: int): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(["apple", "banana", "fig"]) >>> s.str.get(3) 0 l @@ -1122,8 +1063,6 @@ def pad( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(["caribou", "tiger"]) >>> s 0 caribou @@ -1170,8 +1109,6 @@ def ljust( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> ser = bpd.Series(['dog', 'bird', 'mouse']) >>> ser.str.ljust(8, fillchar='.') 0 dog..... @@ -1202,8 +1139,6 @@ def rjust( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> ser = bpd.Series(['dog', 'bird', 'mouse']) >>> ser.str.rjust(8, fillchar='.') 0 .....dog @@ -1238,9 +1173,7 @@ def zfill( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.Series(['-1', '1', '1000', bpd.NA]) + >>> s = bpd.Series(['-1', '1', '1000', pd.NA]) >>> s 0 -1 1 1 @@ -1278,8 +1211,6 @@ def center( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> ser = bpd.Series(['dog', 'bird', 'mouse']) >>> ser.str.center(8, fillchar='.') 0 ..dog... 
@@ -1309,12 +1240,9 @@ def join(self, sep: str): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> import pandas as pd - Example with a list that contains non-string elements. + >>> import bigframes.pandas as bpd >>> s = bpd.Series([['lion', 'elephant', 'zebra'], ... ['dragon'], ... ['duck', 'swan', 'fish', 'guppy']]) diff --git a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py index 9c17b9632e..189dabcf24 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py @@ -37,11 +37,9 @@ def to_datetime( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Converting a Scalar to datetime: + >>> import bigframes.pandas as bpd >>> scalar = 123456.789 >>> bpd.to_datetime(scalar, unit = 's') Timestamp('1970-01-02 10:17:36.789000') diff --git a/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py b/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py index 9442e965fa..220b15f56e 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py +++ b/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py @@ -55,7 +55,6 @@ def to_timedelta( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Converting a Scalar to timedelta diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 0fdca4dde1..3190c92b92 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -61,7 +61,6 @@ def read_gbq( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None If the input is a table ID: diff --git a/third_party/bigframes_vendored/pandas/io/parquet.py 
b/third_party/bigframes_vendored/pandas/io/parquet.py index aec911d2fe..7d5c108f93 100644 --- a/third_party/bigframes_vendored/pandas/io/parquet.py +++ b/third_party/bigframes_vendored/pandas/io/parquet.py @@ -27,8 +27,6 @@ def read_parquet( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> gcs_path = "gs://cloud-samples-data/bigquery/us-states/us-states.parquet" >>> df = bpd.read_parquet(path=gcs_path, engine="bigquery") diff --git a/third_party/bigframes_vendored/pandas/io/parsers/readers.py b/third_party/bigframes_vendored/pandas/io/parsers/readers.py index 4757f5ed9d..9dc7b39873 100644 --- a/third_party/bigframes_vendored/pandas/io/parsers/readers.py +++ b/third_party/bigframes_vendored/pandas/io/parsers/readers.py @@ -71,8 +71,6 @@ def read_csv( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> gcs_path = "gs://cloud-samples-data/bigquery/us-states/us-states.csv" >>> df = bpd.read_csv(filepath_or_buffer=gcs_path) >>> df.head(2) @@ -192,8 +190,6 @@ def read_json( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> gcs_path = "gs://bigframes-dev-testing/sample1.json" >>> df = bpd.read_json(path_or_buf=gcs_path, lines=True, orient="records") >>> df.head(2) diff --git a/third_party/bigframes_vendored/pandas/io/pickle.py b/third_party/bigframes_vendored/pandas/io/pickle.py index 33088dc019..2950cf422a 100644 --- a/third_party/bigframes_vendored/pandas/io/pickle.py +++ b/third_party/bigframes_vendored/pandas/io/pickle.py @@ -35,8 +35,6 @@ def read_pickle( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> gcs_path = "gs://bigframes-dev-testing/test_pickle.pkl" >>> df = bpd.read_pickle(filepath_or_buffer=gcs_path) diff --git a/third_party/bigframes_vendored/pandas/pandas/_typing.py b/third_party/bigframes_vendored/pandas/pandas/_typing.py index e665339fc8..76e984a173 
100644 --- a/third_party/bigframes_vendored/pandas/pandas/_typing.py +++ b/third_party/bigframes_vendored/pandas/pandas/_typing.py @@ -100,7 +100,6 @@ Scalar = Union[PythonScalar, PandasScalar, np.datetime64, np.timedelta64, datetime] IntStrT = TypeVar("IntStrT", int, str) - # timestamp and timedelta convertible types TimestampConvertibleTypes = Union[ @@ -267,7 +266,6 @@ def closed(self) -> bool: # for arbitrary kwargs passed during reading/writing files StorageOptions = Optional[Dict[str, Any]] - # compression keywords and compression CompressionDict = Dict[str, Any] CompressionOptions = Optional[ diff --git a/third_party/bigframes_vendored/pandas/plotting/_core.py b/third_party/bigframes_vendored/pandas/plotting/_core.py index 4ed5c8eb0b..a7cd2c0cc9 100644 --- a/third_party/bigframes_vendored/pandas/plotting/_core.py +++ b/third_party/bigframes_vendored/pandas/plotting/_core.py @@ -8,10 +8,11 @@ class PlotAccessor: Make plots of Series or DataFrame with the `matplotlib` backend. **Examples:** - For Series: >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None + + For Series: + >>> ser = bpd.Series([1, 2, 3, 3]) >>> plot = ser.plot(kind='hist', title="My plot") @@ -57,9 +58,6 @@ def hist( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame(np.random.randint(1, 7, 6000), columns=['one']) >>> df['two'] = np.random.randint(1, 7, 6000) + np.random.randint(1, 7, 6000) >>> ax = df.plot.hist(bins=12, alpha=0.5) @@ -96,7 +94,6 @@ def line( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame( ... { ... 'one': [1, 2, 3, 4], @@ -164,7 +161,6 @@ def area( Draw an area plot based on basic business metrics: >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame( ... { ... 
'sales': [3, 2, 3, 9, 10, 6], @@ -233,7 +229,6 @@ def bar( Basic plot. >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'lab':['A', 'B', 'C'], 'val':[10, 30, 20]}) >>> ax = df.plot.bar(x='lab', y='val', rot=0) @@ -296,7 +291,6 @@ def scatter( in a DataFrame's columns. >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame([[5.1, 3.5, 0], [4.9, 3.0, 0], [7.0, 3.2, 1], ... [6.4, 3.2, 1], [5.9, 3.0, 2]], ... columns=['length', 'width', 'species']) diff --git a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py index a7344d49d4..44eefeddd7 100644 --- a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py +++ b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py @@ -30,7 +30,6 @@ class KMeans(_BaseKMeans): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> from bigframes.ml.cluster import KMeans >>> X = bpd.DataFrame({"feat0": [1, 1, 1, 10, 10, 10], "feat1": [2, 4, 0, 2, 4, 0]}) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index c3c3a77b71..e487a2e7c1 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -24,7 +24,6 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): >>> import bigframes.pandas as bpd >>> from bigframes.ml.decomposition import MatrixFactorization - >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({ ... "row": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6], ... 
"column": [0,1] * 7, diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py index f13c52bfb6..3535edc8f9 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py @@ -24,7 +24,6 @@ class PCA(BaseEstimator, metaclass=ABCMeta): >>> import bigframes.pandas as bpd >>> from bigframes.ml.decomposition import PCA - >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({"feat0": [-1, -2, -3, 1, 2, 3], "feat1": [-1, -1, -2, 1, 1, 2]}) >>> pca = PCA(n_components=2).fit(X) >>> pca.predict(X) # doctest:+SKIP diff --git a/third_party/bigframes_vendored/sklearn/impute/_base.py b/third_party/bigframes_vendored/sklearn/impute/_base.py index 42eab24c82..175ad86b21 100644 --- a/third_party/bigframes_vendored/sklearn/impute/_base.py +++ b/third_party/bigframes_vendored/sklearn/impute/_base.py @@ -22,7 +22,6 @@ class SimpleImputer(_BaseImputer): >>> import bigframes.pandas as bpd >>> from bigframes.ml.impute import SimpleImputer - >>> bpd.options.display.progress_bar = None >>> X_train = bpd.DataFrame({"feat0": [7.0, 4.0, 10.0], "feat1": [2.0, None, 5.0], "feat2": [3.0, 6.0, 9.0]}) >>> imp_mean = SimpleImputer().fit(X_train) >>> X_test = bpd.DataFrame({"feat0": [None, 4.0, 10.0], "feat1": [2.0, None, None], "feat2": [3.0, 6.0, 9.0]}) diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_base.py b/third_party/bigframes_vendored/sklearn/linear_model/_base.py index 21ba5a3bf8..7543edd10b 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_base.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_base.py @@ -66,7 +66,6 @@ class LinearRegression(RegressorMixin, LinearModel): >>> from bigframes.ml.linear_model import LinearRegression >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({ \ "feature0": [20, 21, 19, 18], \ "feature1": 
[0, 1, 1, 0], \ diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py index a85c6fae8d..d449a1040c 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py @@ -25,7 +25,6 @@ class LogisticRegression(LinearClassifierMixin, BaseEstimator): >>> from bigframes.ml.linear_model import LogisticRegression >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({ \ "feature0": [20, 21, 19, 18], \ "feature1": [0, 1, 1, 0], \ diff --git a/third_party/bigframes_vendored/sklearn/metrics/_classification.py b/third_party/bigframes_vendored/sklearn/metrics/_classification.py index fd6e8678ea..e60cc8cec4 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_classification.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_classification.py @@ -30,7 +30,6 @@ def accuracy_score(y_true, y_pred, normalize=True) -> float: >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([0, 2, 1, 3]) >>> y_pred = bpd.DataFrame([0, 1, 2, 3]) @@ -80,7 +79,6 @@ def confusion_matrix( >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([2, 0, 2, 2, 0, 1]) >>> y_pred = bpd.DataFrame([0, 0, 2, 2, 0, 2]) @@ -132,7 +130,6 @@ def recall_score( >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([0, 1, 2, 0, 1, 2]) >>> y_pred = bpd.DataFrame([0, 2, 1, 0, 0, 1]) @@ -181,7 +178,6 @@ def precision_score( >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([0, 1, 2, 0, 1, 2]) >>> y_pred = bpd.DataFrame([0, 2, 1, 0, 0, 1]) @@ -232,7 
+228,6 @@ def f1_score( >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([0, 1, 2, 0, 1, 2]) >>> y_pred = bpd.DataFrame([0, 2, 1, 0, 0, 1]) diff --git a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py index 9262ffbd3d..cd5bd2cbcd 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py @@ -33,7 +33,6 @@ def auc(x, y) -> float: >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> x = bpd.DataFrame([1, 1, 2, 2]) >>> y = bpd.DataFrame([2, 3, 4, 5]) @@ -89,7 +88,6 @@ def roc_auc_score(y_true, y_score) -> float: >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([0, 0, 1, 1, 0, 1, 0, 1, 1, 1]) >>> y_score = bpd.DataFrame([0.1, 0.4, 0.35, 0.8, 0.65, 0.9, 0.5, 0.3, 0.6, 0.45]) @@ -139,7 +137,6 @@ def roc_curve( >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([1, 1, 2, 2]) >>> y_score = bpd.DataFrame([0.1, 0.4, 0.35, 0.8]) diff --git a/third_party/bigframes_vendored/sklearn/metrics/_regression.py b/third_party/bigframes_vendored/sklearn/metrics/_regression.py index 1c14e8068b..85f0c1ecf9 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_regression.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_regression.py @@ -46,7 +46,6 @@ def r2_score(y_true, y_pred, force_finite=True) -> float: >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([3, -0.5, 2, 7]) >>> y_pred = bpd.DataFrame([2.5, 0.0, 2, 8]) @@ -73,7 +72,6 @@ def mean_squared_error(y_true, y_pred) -> float: >>> import bigframes.pandas 
as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([3, -0.5, 2, 7]) >>> y_pred = bpd.DataFrame([2.5, 0.0, 2, 8]) @@ -100,7 +98,6 @@ def mean_absolute_error(y_true, y_pred) -> float: >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([3, -0.5, 2, 7]) >>> y_pred = bpd.DataFrame([2.5, 0.0, 2, 8]) diff --git a/third_party/bigframes_vendored/sklearn/model_selection/_split.py b/third_party/bigframes_vendored/sklearn/model_selection/_split.py index ec16fa8cf9..326589be7d 100644 --- a/third_party/bigframes_vendored/sklearn/model_selection/_split.py +++ b/third_party/bigframes_vendored/sklearn/model_selection/_split.py @@ -69,7 +69,6 @@ class KFold(_BaseKFold): >>> import bigframes.pandas as bpd >>> from bigframes.ml.model_selection import KFold - >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({"feat0": [1, 3, 5], "feat1": [2, 4, 6]}) >>> y = bpd.DataFrame({"label": [1, 2, 3]}) >>> kf = KFold(n_splits=3, random_state=42) @@ -162,7 +161,6 @@ def train_test_split( >>> import bigframes.pandas as bpd >>> from bigframes.ml.model_selection import train_test_split - >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({"feat0": [0, 2, 4, 6, 8], "feat1": [1, 3, 5, 7, 9]}) >>> y = bpd.DataFrame({"label": [0, 1, 2, 3, 4]}) >>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) diff --git a/third_party/bigframes_vendored/sklearn/model_selection/_validation.py b/third_party/bigframes_vendored/sklearn/model_selection/_validation.py index b93c47ea04..6f84018853 100644 --- a/third_party/bigframes_vendored/sklearn/model_selection/_validation.py +++ b/third_party/bigframes_vendored/sklearn/model_selection/_validation.py @@ -19,7 +19,6 @@ def cross_validate(estimator, X, y=None, *, cv=None): >>> import bigframes.pandas as bpd >>> from bigframes.ml.model_selection import 
cross_validate, KFold >>> from bigframes.ml.linear_model import LinearRegression - >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({"feat0": [1, 3, 5], "feat1": [2, 4, 6]}) >>> y = bpd.DataFrame({"label": [1, 2, 3]}) >>> model = LinearRegression() diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py index 5476a9fb3c..64a5786f17 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py @@ -25,7 +25,6 @@ class OneHotEncoder(BaseEstimator): >>> from bigframes.ml.preprocessing import OneHotEncoder >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> enc = OneHotEncoder() >>> X = bpd.DataFrame({"a": ["Male", "Female", "Female"], "b": ["1", "3", "2"]}) From c85d47fad87bfbaacaf6bdc33c285d29aae4369c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 7 Oct 2025 21:59:14 +0000 Subject: [PATCH 02/63] fix docs --- bigframes/bigquery/_operations/datetime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/bigquery/_operations/datetime.py b/bigframes/bigquery/_operations/datetime.py index c4aba91a29..e27a3de0c8 100644 --- a/bigframes/bigquery/_operations/datetime.py +++ b/bigframes/bigquery/_operations/datetime.py @@ -69,7 +69,7 @@ def unix_micros(input: series.Series) -> series.Series: **Examples:** - >>> import bigframes.pandas as bpd + >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> s = bpd.Series([pd.Timestamp("1970-01-02", tz="UTC"), pd.Timestamp("1970-01-03", tz="UTC")]) From 78bfccffa58d2a260f235170e86c2c94ae321c0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 7 Oct 2025 23:45:28 +0000 Subject: [PATCH 03/63] fix unit tests --- bigframes/bigquery/_operations/ai.py | 10 +++---- bigframes/core/tools/datetimes.py | 2 +- bigframes/operations/ai.py | 
10 +++---- bigframes/operations/base.py | 2 +- bigframes/operations/semantics.py | 12 ++++----- bigframes/pandas/__init__.py | 1 - bigframes/session/__init__.py | 26 ++++++++++++++++--- scripts/publish_api_coverage.py | 1 + tests/unit/conftest.py | 24 +++++++++++++++++ tests/unit/test_pandas.py | 26 ++++++++++++------- .../pandas/core/tools/datetimes.py | 1 + 11 files changed, 82 insertions(+), 33 deletions(-) create mode 100644 tests/unit/conftest.py diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index 7698c2c95c..3a9c7b130e 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -201,7 +201,7 @@ def generate_int( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> animal = bpd.Series(["Kangaroo", "Rabbit", "Spider"]) + >>> animal = bpd.Series(["Kangaroo", "Rabbit", "Spider"]) >>> bbq.ai.generate_int(("How many legs does a ", animal, " have?")) 0 {'result': 2, 'full_response': '{"candidates":... 1 {'result': 4, 'full_response': '{"candidates":... @@ -275,7 +275,7 @@ def generate_double( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> animal = bpd.Series(["Kangaroo", "Rabbit", "Spider"]) + >>> animal = bpd.Series(["Kangaroo", "Rabbit", "Spider"]) >>> bbq.ai.generate_double(("How many legs does a ", animal, " have?")) 0 {'result': 2.0, 'full_response': '{"candidates... 1 {'result': 4.0, 'full_response': '{"candidates... 
@@ -346,7 +346,7 @@ def if_( **Examples:** >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> us_state = bpd.Series(["Massachusetts", "Illinois", "Hawaii"]) + >>> us_state = bpd.Series(["Massachusetts", "Illinois", "Hawaii"]) >>> bbq.ai.if_((us_state, " has a city called Springfield")) 0 True 1 True @@ -395,7 +395,7 @@ def classify( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> df = bpd.DataFrame({'creature': ['Cat', 'Salmon']}) + >>> df = bpd.DataFrame({'creature': ['Cat', 'Salmon']}) >>> df['type'] = bbq.ai.classify(df['creature'], ['Mammal', 'Fish']) >>> df creature type @@ -445,7 +445,7 @@ def score( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> animal = bpd.Series(["Tiger", "Rabbit", "Blue Whale"]) + >>> animal = bpd.Series(["Tiger", "Rabbit", "Blue Whale"]) >>> bbq.ai.score(("Rank the relative weights of ", animal, " on the scale from 1 to 3")) # doctest: +SKIP 0 2.0 1 1.0 diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py index fd7561f4b4..0e5594d498 100644 --- a/bigframes/core/tools/datetimes.py +++ b/bigframes/core/tools/datetimes.py @@ -42,7 +42,7 @@ def to_datetime( utc: bool = False, format: Optional[str] = None, unit: Optional[str] = None, - session: Optional[bigframes.session.Session], + session: Optional[bigframes.session.Session] = None, ) -> Union[pd.Timestamp, datetime, bigframes.series.Series]: if isinstance(arg, (int, float, str, datetime, date)): return pd.to_datetime( diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py index dbbf16afc3..253b838e90 100644 --- a/bigframes/operations/ai.py +++ b/bigframes/operations/ai.py @@ -114,7 +114,7 @@ def map( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -132,7 +132,7 @@ 
def map( >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -263,7 +263,7 @@ def classify( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -352,7 +352,7 @@ def join( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -602,7 +602,7 @@ def sim_join( ** Examples: ** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index ebb5767264..7d4c996ea5 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -135,7 +135,7 @@ def __init__( # explicitly chose even if it is None. This is important for the # polars backend where the implicit column labels are integers. 
if not isinstance(data, blocks.Block): - block = block.with_column_labels([name]) + block = block.with_column_labels([name or getattr(data, "name", None)]) self._block: blocks.Block = block diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index b4f7af1aca..176e0ad83a 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -246,7 +246,7 @@ def cluster_by( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -319,7 +319,7 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -432,7 +432,7 @@ def map( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -554,7 +554,7 @@ def join( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -794,7 +794,7 @@ def top_k( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -994,7 +994,7 @@ def sim_join( ** Examples: ** >>> import bigframes.pandas 
as bpd - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 19ea282762..0193dc629d 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -226,7 +226,6 @@ def to_datetime( format=format, unit=unit, ) - return bigframes.core.tools.to_datetime() to_datetime.__doc__ = vendored_pandas_datetimes.to_datetime.__doc__ diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 11621e8ea7..54755482f3 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -2291,6 +2291,12 @@ def read_gbq_object_table( # interchangeably. # ========================================================================= def cut(self, *args, **kwargs) -> bigframes.series.Series: + """Cuts a BigQuery DataFrames object. + + Included for compatibility between bpd and Session. + + See :func:`bigframes.pandas.cut` for full documentation. + """ import bigframes.core.reshape.tile return bigframes.core.reshape.tile.cut( @@ -2299,7 +2305,7 @@ def cut(self, *args, **kwargs) -> bigframes.series.Series: **kwargs, ) - def DataFrame(self, *args, **kwargs) -> bigframes.dataframe.DataFrame: + def DataFrame(self, *args, **kwargs): """Constructs a DataFrame. Included for compatibility between bpd and Session. @@ -2310,7 +2316,7 @@ def DataFrame(self, *args, **kwargs) -> bigframes.dataframe.DataFrame: return bigframes.dataframe.DataFrame(*args, session=self, **kwargs) - def MultiIndex(self, *args, **kwargs) -> bigframes.core.indexes.MultiIndex: + def MultiIndex(self, *args, **kwargs): """Constructs a MultiIndex. Included for compatibility between bpd and Session. 
@@ -2325,7 +2331,7 @@ def MultiIndex(self, *args, **kwargs) -> bigframes.core.indexes.MultiIndex: MultiIndex.from_frame = bigframes.core.indexes.MultiIndex.from_frame # type: ignore MultiIndex.from_arrays = bigframes.core.indexes.MultiIndex.from_arrays # type: ignore - def Index(self, *args, **kwargs) -> bigframes.core.indexes.Index: + def Index(self, *args, **kwargs): """Constructs a Index. Included for compatibility between bpd and Session. @@ -2336,7 +2342,7 @@ def Index(self, *args, **kwargs) -> bigframes.core.indexes.Index: return bigframes.core.indexes.Index(*args, session=self, **kwargs) - def Series(self, *args, **kwargs) -> bigframes.series.Series: + def Series(self, *args, **kwargs): """Constructs a Series. Included for compatibility between bpd and Session. @@ -2350,6 +2356,12 @@ def Series(self, *args, **kwargs) -> bigframes.series.Series: def to_datetime( self, *args, **kwargs ) -> Union[pandas.Timestamp, datetime.datetime, bigframes.series.Series]: + """Converts a BigQuery DataFrames object to datetime dtype. + + Included for compatibility between bpd and Session. + + See :func:`bigframes.pandas.to_datetime` for full documentation. + """ import bigframes.core.tools return bigframes.core.tools.to_datetime( @@ -2359,6 +2371,12 @@ def to_datetime( ) def to_timedelta(self, *args, **kwargs): + """Converts a BigQuery DataFrames object to timedelta/duration dtype. + + Included for compatibility between bpd and Session. + + See :func:`bigframes.pandas.to_timedelta` for full documentation. 
+ """ import bigframes.pandas.core.tools.timedeltas return bigframes.pandas.core.tools.timedeltas.to_timedelta( diff --git a/scripts/publish_api_coverage.py b/scripts/publish_api_coverage.py index 8f305bcc0f..6c94c06456 100644 --- a/scripts/publish_api_coverage.py +++ b/scripts/publish_api_coverage.py @@ -25,6 +25,7 @@ import pandas.core.indexes.accessors import pandas.core.strings.accessor import pandas.core.window.rolling +import sklearn # noqa import bigframes import bigframes.core.groupby diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py new file mode 100644 index 0000000000..a9b26afeef --- /dev/null +++ b/tests/unit/conftest.py @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + + +@pytest.fixture(scope="session") +def polars_session(): + pytest.importorskip("polars") + + from bigframes.testing import polars_session + + return polars_session.TestSession() diff --git a/tests/unit/test_pandas.py b/tests/unit/test_pandas.py index 73e0b7f2d6..5e75e6b20f 100644 --- a/tests/unit/test_pandas.py +++ b/tests/unit/test_pandas.py @@ -64,8 +64,12 @@ def test_method_matches_session(method_name: str): pandas_method = getattr(bigframes.pandas, method_name) pandas_doc = inspect.getdoc(pandas_method) assert pandas_doc is not None, "docstrings are required" - assert re.sub(leading_whitespace, "", pandas_doc) == re.sub( - leading_whitespace, "", session_doc + + pandas_doc_stripped = re.sub(leading_whitespace, "", pandas_doc) + session_doc_stripped = re.sub(leading_whitespace, "", session_doc) + assert ( + pandas_doc_stripped == session_doc_stripped + or ":`bigframes.pandas" in session_doc_stripped ) # Add `eval_str = True` so that deferred annotations are turned into their @@ -75,18 +79,20 @@ def test_method_matches_session(method_name: str): eval_str=True, globals={**vars(bigframes.session), **{"dataframe": bigframes.dataframe}}, ) - pandas_signature = inspect.signature(pandas_method, eval_str=True) - assert [ - # Kind includes position, which will be an offset. - parameter.replace(kind=inspect.Parameter.POSITIONAL_ONLY) - for parameter in pandas_signature.parameters.values() - ] == [ + session_args = [ # Kind includes position, which will be an offset. parameter.replace(kind=inspect.Parameter.POSITIONAL_ONLY) for parameter in session_signature.parameters.values() # Don't include the first parameter, which is `self: Session` - ][ - 1: + ][1:] + pandas_signature = inspect.signature(pandas_method, eval_str=True) + pandas_args = [ + # Kind includes position, which will be an offset. 
+ parameter.replace(kind=inspect.Parameter.POSITIONAL_ONLY) + for parameter in pandas_signature.parameters.values() + ] + assert session_args == pandas_args or ["args", "kwargs"] == [ + parameter.name for parameter in session_args ] assert pandas_signature.return_annotation == session_signature.return_annotation diff --git a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py index 189dabcf24..105277dbf0 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py @@ -21,6 +21,7 @@ def to_datetime( utc=False, format=None, unit=None, + session=None, ) -> Union[pd.Timestamp, datetime, series.Series]: """ This function converts a scalar, array-like or Series to a datetime object. From 210dc9abdebca934183659d6580cafdcbc1a99f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 7 Oct 2025 23:51:29 +0000 Subject: [PATCH 04/63] skip sklearn test --- scripts/publish_api_coverage.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/publish_api_coverage.py b/scripts/publish_api_coverage.py index 6c94c06456..181b8c3365 100644 --- a/scripts/publish_api_coverage.py +++ b/scripts/publish_api_coverage.py @@ -25,7 +25,6 @@ import pandas.core.indexes.accessors import pandas.core.strings.accessor import pandas.core.window.rolling -import sklearn # noqa import bigframes import bigframes.core.groupby @@ -205,6 +204,9 @@ def generate_pandas_api_coverage(): def generate_sklearn_api_coverage(): """Explore all SKLearn modules, and for each item contained generate a regex to detect it being imported, and record whether we implement it""" + + import sklearn # noqa + sklearn_modules = [ "sklearn", "sklearn.model_selection", From bed4069f29f1caf01094aad3b174de452a7a9736 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 7 Oct 2025 23:55:37 +0000 Subject: [PATCH 05/63] fix 
snapshot --- .../snapshots/test_blob_ops/test_obj_get_access_url/out.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_get_access_url/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_get_access_url/out.sql index 4a963b4972..25004c424d 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_get_access_url/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_get_access_url/out.sql @@ -21,5 +21,5 @@ WITH `bfcte_0` AS ( ) SELECT `bfcol_0` AS `rowindex`, - `bfcol_10` AS `string_col` + `bfcol_10` AS `0` FROM `bfcte_3` \ No newline at end of file From 20cae2d5370c193469cb60f476dbecd68dcc02aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 15:00:39 +0000 Subject: [PATCH 06/63] plumb through session for from_tuples and from_arrays --- bigframes/core/indexes/multi.py | 41 ++++++++++++++++++++++++++++++--- bigframes/session/__init__.py | 12 ++++------ 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/bigframes/core/indexes/multi.py b/bigframes/core/indexes/multi.py index 182d1f101c..dc81f812b5 100644 --- a/bigframes/core/indexes/multi.py +++ b/bigframes/core/indexes/multi.py @@ -14,13 +14,16 @@ from __future__ import annotations -from typing import cast, Hashable, Iterable, Sequence +from typing import cast, Hashable, Iterable, Optional, Sequence, TYPE_CHECKING import bigframes_vendored.pandas.core.indexes.multi as vendored_pandas_multindex import pandas from bigframes.core.indexes.base import Index +if TYPE_CHECKING: + import bigframes.session + class MultiIndex(Index, vendored_pandas_multindex.MultiIndex): __doc__ = vendored_pandas_multindex.MultiIndex.__doc__ @@ -31,10 +34,12 @@ def from_tuples( tuples: Iterable[tuple[Hashable, ...]], sortorder: int | None = None, names: Sequence[Hashable] | Hashable | None = None, + *, + 
session: Optional[bigframes.session.Session] = None, ) -> MultiIndex: pd_index = pandas.MultiIndex.from_tuples(tuples, sortorder, names) # Index.__new__ should detect multiple levels and properly create a multiindex - return cast(MultiIndex, Index(pd_index)) + return cast(MultiIndex, Index(pd_index, session=session)) @classmethod def from_arrays( @@ -42,7 +47,37 @@ def from_arrays( arrays, sortorder: int | None = None, names=None, + *, + session: Optional[bigframes.session.Session] = None, ) -> MultiIndex: pd_index = pandas.MultiIndex.from_arrays(arrays, sortorder, names) # Index.__new__ should detect multiple levels and properly create a multiindex - return cast(MultiIndex, Index(pd_index)) + return cast(MultiIndex, Index(pd_index, session=session)) + + +class MultiIndexAccessor: + """Proxy to MultiIndex constructors to allow a session to be passed in.""" + + def __init__(self, session: bigframes.session.Session): + self._session = session + + def __call__(self, *args, **kwargs) -> MultiIndex: + """Construct a MultiIndex using the associated Session. + + See :class:`bigframes.pandas.MultiIndex`. + """ + return MultiIndex(*args, session=self._session, **kwargs) + + def from_arrays(self, *args, **kwargs) -> MultiIndex: + """Construct a MultiIndex using the associated Session. + + See :func:`bigframes.pandas.MultiIndex.from_arrays`. + """ + return MultiIndex.from_arrays(*args, session=self._session, **kwargs) + + def from_tuples(self, *args, **kwargs) -> MultiIndex: + """Construct a MultiIndex using the associated Session. + + See :func:`bigframes.pandas.MultiIndex.from_tuples`. 
+ """ + return MultiIndex.from_tuples(*args, session=self._session, **kwargs) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 54755482f3..1250cfa9e8 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -68,6 +68,7 @@ import bigframes.core from bigframes.core import blocks, log_adapter, utils import bigframes.core.indexes +import bigframes.core.indexes.multi import bigframes.core.pyformat # Even though the ibis.backends.bigquery import is unused, it's needed @@ -2316,20 +2317,17 @@ def DataFrame(self, *args, **kwargs): return bigframes.dataframe.DataFrame(*args, session=self, **kwargs) - def MultiIndex(self, *args, **kwargs): + @property + def MultiIndex(self) -> bigframes.core.indexes.multi.MultiIndexAccessor: """Constructs a MultiIndex. Included for compatibility between bpd and Session. See :class:`bigframes.pandas.MulitIndex` for full documentation. """ - import bigframes.core.indexes - - return bigframes.core.indexes.MultiIndex(*args, session=self, **kwargs) + import bigframes.core.indexes.multi - MultiIndex.from_tuples = bigframes.core.indexes.MultiIndex.from_tuples # type: ignore - MultiIndex.from_frame = bigframes.core.indexes.MultiIndex.from_frame # type: ignore - MultiIndex.from_arrays = bigframes.core.indexes.MultiIndex.from_arrays # type: ignore + return bigframes.core.indexes.multi.MultiIndexAccessor(self) def Index(self, *args, **kwargs): """Constructs a Index. 
From 1dc648b8a33d90648f5ffe1bec7a9ab6ad8f3b06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 15:10:53 +0000 Subject: [PATCH 07/63] add from_frame --- bigframes/core/indexes/multi.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bigframes/core/indexes/multi.py b/bigframes/core/indexes/multi.py index a4133927bc..a611442b88 100644 --- a/bigframes/core/indexes/multi.py +++ b/bigframes/core/indexes/multi.py @@ -100,6 +100,13 @@ def from_arrays(self, *args, **kwargs) -> MultiIndex: """ return MultiIndex.from_arrays(*args, session=self._session, **kwargs) + def from_frame(self, *args, **kwargs) -> MultiIndex: + """Construct a MultiIndex using the associated Session. + + See :func:`bigframes.pandas.MultiIndex.from_frame`. + """ + return cast(MultiIndex, MultiIndex.from_frame(*args, **kwargs)) + def from_tuples(self, *args, **kwargs) -> MultiIndex: """Construct a MultiIndex using the associated Session. From 9de6f9fbf2244fdd886e4a2d762b1cc70cedebdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 16:08:00 +0000 Subject: [PATCH 08/63] make sure polars session isnt skipped on Kokoro --- noxfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index a46dc36b3e..099d17f631 100644 --- a/noxfile.py +++ b/noxfile.py @@ -115,7 +115,7 @@ # Make sure we leave some versions without "extras" so we know those # dependencies are actually optional. 
"3.10": ["tests", "scikit-learn", "anywidget"], - "3.11": ["tests", "scikit-learn", "polars", "anywidget"], + LATEST_FULLY_SUPPORTED_PYTHON: ["tests", "scikit-learn", "polars", "anywidget"], "3.13": ["tests", "polars", "anywidget"], } From 5d23dee5fab501399bd3998436b1ba7e21e7080d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 16:09:26 +0000 Subject: [PATCH 09/63] fix apply doctest --- third_party/bigframes_vendored/pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index c6ec5dfaf1..5e9c9e0113 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -1783,7 +1783,7 @@ def apply( dtype: Int64 >>> hours = minutes.apply(minutes_to_hours) # doctest: +SKIP - >>> hours + >>> hours # doctest: +SKIP 0 0.0 1 0.5 2 1.0 From 20d7c27543e51dc2a1e1bb5d796026b07bfc9343 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 16:18:55 +0000 Subject: [PATCH 10/63] make doctest conftest available everywhere --- bigframes/conftest.py | 21 +++++++++---------- .../{pandas => }/conftest.py | 21 +++++++++---------- 2 files changed, 20 insertions(+), 22 deletions(-) rename third_party/bigframes_vendored/{pandas => }/conftest.py (77%) diff --git a/bigframes/conftest.py b/bigframes/conftest.py index e1f3f6d84c..f418c9feba 100644 --- a/bigframes/conftest.py +++ b/bigframes/conftest.py @@ -22,24 +22,23 @@ import bigframes._config -@pytest.fixture(scope="session") -def polars_session(): - pytest.importorskip("polars") - - from bigframes.testing import polars_session - - return polars_session.TestSession() - - @pytest.fixture(autouse=True) -def default_doctest_imports(doctest_namespace, polars_session): +def default_doctest_imports(doctest_namespace): """ Avoid some boilerplate in pandas-inspired tests. 
See: https://docs.pytest.org/en/stable/how-to/doctest.html#doctest-namespace-fixture """ + try: + from bigframes.testing import polars_session + + bpd = polars_session.TestSession() + except ImportError: + # Don't skip doctest if polars isn't available. + import bigframes.pandas as bpd # type: ignore + doctest_namespace["np"] = np doctest_namespace["pd"] = pd doctest_namespace["pa"] = pa - doctest_namespace["bpd"] = polars_session + doctest_namespace["bpd"] = bpd bigframes._config.options.display.progress_bar = None diff --git a/third_party/bigframes_vendored/pandas/conftest.py b/third_party/bigframes_vendored/conftest.py similarity index 77% rename from third_party/bigframes_vendored/pandas/conftest.py rename to third_party/bigframes_vendored/conftest.py index e1f3f6d84c..cafd6a1b7c 100644 --- a/third_party/bigframes_vendored/pandas/conftest.py +++ b/third_party/bigframes_vendored/conftest.py @@ -22,24 +22,23 @@ import bigframes._config -@pytest.fixture(scope="session") -def polars_session(): - pytest.importorskip("polars") - - from bigframes.testing import polars_session - - return polars_session.TestSession() - - @pytest.fixture(autouse=True) -def default_doctest_imports(doctest_namespace, polars_session): +def default_doctest_imports(doctest_namespace): """ Avoid some boilerplate in pandas-inspired tests. See: https://docs.pytest.org/en/stable/how-to/doctest.html#doctest-namespace-fixture """ + try: + from bigframes.testing import polars_session + + bpd = polars_session.TestSession() + except ImportError: + # Don't skip doctest if polars isn't available. 
+ import bigframes.pandas as bpd + doctest_namespace["np"] = np doctest_namespace["pd"] = pd doctest_namespace["pa"] = pa - doctest_namespace["bpd"] = polars_session + doctest_namespace["bpd"] = bpd bigframes._config.options.display.progress_bar = None From fbe606e0c61d94344cbb3ab45541eb620874df18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 16:44:56 +0000 Subject: [PATCH 11/63] add python version flexibility for to_dict --- third_party/bigframes_vendored/pandas/core/series.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 5e9c9e0113..b089c65d3b 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -549,8 +549,8 @@ def to_dict( >>> s.to_dict() {np.int64(0): 1, np.int64(1): 2, np.int64(2): 3, np.int64(3): 4} - >>> s.to_dict(into=OrderedDict) - OrderedDict([(np.int64(0), 1), (np.int64(1), 2), (np.int64(2), 3), (np.int64(3), 4)]) + >>> s.to_dict(into=OrderedDict) # doctest:+ELLIPSIS + OrderedDict(...) 
>>> dd = defaultdict(list) >>> s.to_dict(into=dd) From 171f3ece378e77993e630cedbc343e987ea1ccd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 18:54:17 +0000 Subject: [PATCH 12/63] disambiguate explicit names --- bigframes/operations/base.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 7d4c996ea5..38aa1f4b9b 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -15,7 +15,7 @@ from __future__ import annotations import typing -from typing import List, Sequence, Union +from typing import Any, List, Sequence, Union import bigframes_vendored.constants as constants import bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing @@ -34,6 +34,8 @@ import bigframes.series as series import bigframes.session +_NO_NAME_SENTINEL = object() + class SeriesMethods: def __init__( @@ -134,8 +136,17 @@ def __init__( # If we didn't get a block make sure the name is what the user # explicitly chose even if it is None. This is important for the # polars backend where the implicit column labels are integers. 
- if not isinstance(data, blocks.Block): - block = block.with_column_labels([name or getattr(data, "name", None)]) + if name: + default_name: Any = name + elif hasattr(data, "name"): + default_name = getattr(data, "name", None) + elif hasattr(data, "_name"): + default_name = getattr(data, "_name", None) + else: + default_name = _NO_NAME_SENTINEL + + if default_name is not _NO_NAME_SENTINEL: + block = block.with_column_labels([default_name]) self._block: blocks.Block = block @@ -165,8 +176,7 @@ def _apply_unary_op( block, result_id = self._block.apply_unary_op( self._value_column, op, result_label=self._name ) - result = series.Series(block.select_column(result_id)) - result.name = getattr(self, "name", None) + result = series.Series(block.select_column(result_id), name=self._name) return result def _apply_binary_op( From ded5c1e548629b106c811af20ca79ed509a7989e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 19:17:27 +0000 Subject: [PATCH 13/63] disambiguate explicit name none versus no name --- bigframes/operations/base.py | 46 ++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 38aa1f4b9b..91226ac7b6 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -14,8 +14,9 @@ from __future__ import annotations +import enum import typing -from typing import Any, List, Sequence, Union +from typing import List, Sequence, Union import bigframes_vendored.constants as constants import bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing @@ -34,7 +35,17 @@ import bigframes.series as series import bigframes.session -_NO_NAME_SENTINEL = object() + +class Default(enum.Enum): + """Sentinel that can disambiguate explicit None from missing. 
+ + See https://stackoverflow.com/a/76606310/101923 + """ + + token = 0 + + +DEFAULT = Default.token class SeriesMethods: @@ -45,7 +56,7 @@ def __init__( dtype: typing.Optional[ bigframes.dtypes.DtypeString | bigframes.dtypes.Dtype ] = None, - name: str | None = None, + name: str | None | Default = DEFAULT, copy: typing.Optional[bool] = None, *, session: typing.Optional[bigframes.session.Session] = None, @@ -73,6 +84,16 @@ def __init__( f"Series constructor only supports copy=True. {constants.FEEDBACK_LINK}" ) + if name is DEFAULT: + if isinstance(data, blocks.Block): + name = data.column_labels[0] + elif hasattr(data, "name"): + name = getattr(data, "name") + elif hasattr(data, "_name"): + name = getattr(data, "_name") + else: + name = None + if isinstance(data, blocks.Block): block = data elif isinstance(data, SeriesMethods): @@ -109,6 +130,7 @@ def __init__( block = data_block if block: + # Data was a bigframes object. assert len(block.value_columns) == 1 assert len(block.column_labels) == 1 if index is not None: # reindexing operation @@ -121,6 +143,7 @@ def __init__( bf_dtype = bigframes.dtypes.bigframes_type(dtype) block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype)) else: + # Data was local. if isinstance(dtype, str) and dtype.lower() == "json": dtype = bigframes.dtypes.JSON_DTYPE pd_series = pd.Series( @@ -129,25 +152,12 @@ def __init__( dtype=dtype, # type:ignore name=name, ) + name = pd_series.name # type: ignore block = read_pandas_func(pd_series)._get_block() # type:ignore assert block is not None - # If we didn't get a block make sure the name is what the user - # explicitly chose even if it is None. This is important for the - # polars backend where the implicit column labels are integers. 
- if name: - default_name: Any = name - elif hasattr(data, "name"): - default_name = getattr(data, "name", None) - elif hasattr(data, "_name"): - default_name = getattr(data, "_name", None) - else: - default_name = _NO_NAME_SENTINEL - - if default_name is not _NO_NAME_SENTINEL: - block = block.with_column_labels([default_name]) - + block = block.with_column_labels([name]) self._block: blocks.Block = block @property From 841bc64dad7304fbabf8fbab5ae0a8799a836a6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 19:39:32 +0000 Subject: [PATCH 14/63] fix for column name comparison in pandas bin op --- bigframes/core/blocks.py | 2 +- noxfile.py | 4 +--- .../snapshots/test_blob_ops/test_obj_get_access_url/out.sql | 2 +- tests/unit/test_dataframe_polars.py | 3 ++- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index f9896784bb..cf3518ff29 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -2471,7 +2471,7 @@ def _align_series_block_axis_1( def _align_pd_series_axis_1( self, other: pd.Series, how: str ) -> Tuple[Block, pd.Index, Sequence[Tuple[ex.RefOrConstant, ex.RefOrConstant]]]: - if self.column_labels.equals(other.index): + if self.column_labels.astype("object").equals(other.index.astype("object")): columns, lcol_indexer, rcol_indexer = self.column_labels, None, None else: if not (self.column_labels.is_unique and other.index.is_unique): diff --git a/noxfile.py b/noxfile.py index 099d17f631..703937d453 100644 --- a/noxfile.py +++ b/noxfile.py @@ -46,9 +46,7 @@ "3.11", ] -# pytest-retry is not yet compatible with pytest 8.x. 
-# https://github.com/str0zzapreti/pytest-retry/issues/32 -PYTEST_VERSION = "pytest<8.0.0dev" +PYTEST_VERSION = "pytest==8.4.2" SPHINX_VERSION = "sphinx==4.5.0" LINT_PATHS = [ "docs", diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_get_access_url/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_get_access_url/out.sql index 25004c424d..4a963b4972 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_get_access_url/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_get_access_url/out.sql @@ -21,5 +21,5 @@ WITH `bfcte_0` AS ( ) SELECT `bfcol_0` AS `rowindex`, - `bfcol_10` AS `0` + `bfcol_10` AS `string_col` FROM `bfcte_3` \ No newline at end of file diff --git a/tests/unit/test_dataframe_polars.py b/tests/unit/test_dataframe_polars.py index a6f5c3d1ef..c95c647fa8 100644 --- a/tests/unit/test_dataframe_polars.py +++ b/tests/unit/test_dataframe_polars.py @@ -2319,7 +2319,8 @@ def test_binop_with_self_aggregate(session, scalars_dfs): df_columns = ["int64_col", "float64_col", "int64_too"] bf_df = scalars_df[df_columns] - bf_result = (bf_df - bf_df.mean()).to_pandas() + bf_deviation = bf_df - bf_df.mean() + bf_result = bf_deviation.to_pandas() pd_df = scalars_pandas_df[df_columns] pd_result = pd_df - pd_df.mean() From 81f49a6cce0622ffbcf739dc374392bc6fae74da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 20:07:39 +0000 Subject: [PATCH 15/63] avoid setting column labels in special case of Series(block) --- bigframes/operations/base.py | 16 ++++------------ bigframes/session/__init__.py | 14 +++++++------- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 91226ac7b6..7d6a1c3b68 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -84,16 +84,6 @@ def __init__( f"Series constructor 
only supports copy=True. {constants.FEEDBACK_LINK}" ) - if name is DEFAULT: - if isinstance(data, blocks.Block): - name = data.column_labels[0] - elif hasattr(data, "name"): - name = getattr(data, "name") - elif hasattr(data, "_name"): - name = getattr(data, "_name") - else: - name = None - if isinstance(data, blocks.Block): block = data elif isinstance(data, SeriesMethods): @@ -139,6 +129,8 @@ def __init__( idx_cols = idx_block.index_columns block, _ = idx_block.join(block, how="left") block = block.with_index_labels(bf_index.names) + if name is not DEFAULT: + block = block.with_column_labels([name]) if dtype: bf_dtype = bigframes.dtypes.bigframes_type(dtype) block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype)) @@ -150,14 +142,14 @@ def __init__( data=data, index=index, # type:ignore dtype=dtype, # type:ignore - name=name, + name=name if name is not DEFAULT else None, ) name = pd_series.name # type: ignore block = read_pandas_func(pd_series)._get_block() # type:ignore + block = block.with_column_labels([name]) assert block is not None - block = block.with_column_labels([name]) self._block: blocks.Block = block @property diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index d12117dd73..0490152003 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1829,7 +1829,7 @@ def udf( Turning an arbitrary python function into a BigQuery managed python udf: >>> bq_name = datetime.datetime.now().strftime("bigframes_%Y%m%d%H%M%S%f") - >>> @bpd.udf(dataset="bigfranes_testing", name=bq_name) + >>> @bpd.udf(dataset="bigfranes_testing", name=bq_name) # doctest: +SKIP ... def minutes_to_hours(x: int) -> float: ... 
return x/60 @@ -1842,8 +1842,8 @@ def udf( 4 120 dtype: Int64 - >>> hours = minutes.apply(minutes_to_hours) - >>> hours + >>> hours = minutes.apply(minutes_to_hours) # doctest: +SKIP + >>> hours # doctest: +SKIP 0 0.0 1 0.5 2 1.0 @@ -1856,7 +1856,7 @@ def udf( packages (optionally with the package version) via `packages` param. >>> bq_name = datetime.datetime.now().strftime("bigframes_%Y%m%d%H%M%S%f") - >>> @bpd.udf( + >>> @bpd.udf( # doctest: +SKIP ... dataset="bigfranes_testing", ... name=bq_name, ... packages=["cryptography"] @@ -1873,14 +1873,14 @@ def udf( ... return f.encrypt(input.encode()).decode() >>> names = bpd.Series(["Alice", "Bob"]) - >>> hashes = names.apply(get_hash) + >>> hashes = names.apply(get_hash) # doctest: +SKIP You can clean-up the BigQuery functions created above using the BigQuery client from the BigQuery DataFrames session: >>> session = bpd.get_global_session() - >>> session.bqclient.delete_routine(minutes_to_hours.bigframes_bigquery_function) - >>> session.bqclient.delete_routine(get_hash.bigframes_bigquery_function) + >>> session.bqclient.delete_routine(minutes_to_hours.bigframes_bigquery_function) # doctest: +SKIP + >>> session.bqclient.delete_routine(get_hash.bigframes_bigquery_function) # doctest: +SKIP Args: input_types (type or sequence(type), Optional): From 5b605054b439e5a910340df444979adbf5cb7c5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 20:30:18 +0000 Subject: [PATCH 16/63] revert doctest changes --- bigframes/bigquery/_operations/ai.py | 11 +- bigframes/bigquery/_operations/approx_agg.py | 1 + bigframes/conftest.py | 44 -- bigframes/operations/semantics.py | 20 +- bigframes/operations/strings.py | 1 + .../bigframes_vendored/geopandas/geoseries.py | 9 + .../bigframes_vendored/pandas/AUTHORS.md | 1 + .../bigframes_vendored/pandas/README.md | 2 + .../pandas/core/arrays/arrow/accessors.py | 25 +- .../pandas/core/arrays/datetimelike.py | 7 +- .../pandas/core/computation/eval.py | 3 + 
.../pandas/core/computation/expr.py | 3 + .../pandas/core/computation/ops.py | 1 + .../bigframes_vendored/pandas/core/frame.py | 436 +++++++++++++++-- .../bigframes_vendored/pandas/core/generic.py | 45 +- .../pandas/core/groupby/__init__.py | 127 ++++- .../pandas/core/indexes/accessor.py | 47 ++ .../pandas/core/indexes/base.py | 108 ++++- .../pandas/core/indexes/datetimes.py | 24 + .../pandas/core/indexes/multi.py | 4 + .../pandas/core/reshape/tile.py | 3 + .../bigframes_vendored/pandas/core/series.py | 443 ++++++++++++++++-- .../pandas/core/strings/accessor.py | 104 +++- .../pandas/core/tools/datetimes.py | 5 +- .../pandas/core/tools/timedeltas.py | 1 + .../bigframes_vendored/pandas/io/gbq.py | 1 + .../bigframes_vendored/pandas/io/parquet.py | 2 + .../pandas/io/parsers/readers.py | 4 + .../bigframes_vendored/pandas/io/pickle.py | 2 + .../pandas/pandas/_typing.py | 2 + .../pandas/plotting/_core.py | 12 +- .../sklearn/cluster/_kmeans.py | 1 + .../sklearn/decomposition/_mf.py | 1 + .../sklearn/decomposition/_pca.py | 1 + .../sklearn/impute/_base.py | 1 + .../sklearn/linear_model/_base.py | 1 + .../sklearn/linear_model/_logistic.py | 1 + .../sklearn/metrics/_classification.py | 5 + .../sklearn/metrics/_ranking.py | 3 + .../sklearn/metrics/_regression.py | 3 + .../sklearn/model_selection/_split.py | 2 + .../sklearn/model_selection/_validation.py | 1 + .../sklearn/preprocessing/_encoder.py | 1 + 43 files changed, 1370 insertions(+), 149 deletions(-) delete mode 100644 bigframes/conftest.py diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index 0213e81658..0c5eba9496 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -52,13 +52,14 @@ def generate( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> country = bpd.Series(["Japan", "Canada"]) - >>> bbq.ai.generate(("What's the capital city of ", country, " one word 
only")) # doctest: +SKIP + >>> bbq.ai.generate(("What's the capital city of ", country, " one word only")) 0 {'result': 'Tokyo\\n', 'full_response': '{"cand... 1 {'result': 'Ottawa\\n', 'full_response': '{"can... dtype: struct>, status: string>[pyarrow] - >>> bbq.ai.generate(("What's the capital city of ", country, " one word only")).struct.field("result") # doctest: +SKIP + >>> bbq.ai.generate(("What's the capital city of ", country, " one word only")).struct.field("result") 0 Tokyo\\n 1 Ottawa\\n Name: result, dtype: string @@ -146,6 +147,7 @@ def generate_bool( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... "col_1": ["apple", "bear", "pear"], ... "col_2": ["fruit", "animal", "animal"] @@ -223,6 +225,7 @@ def generate_int( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> animal = bpd.Series(["Kangaroo", "Rabbit", "Spider"]) >>> bbq.ai.generate_int(("How many legs does a ", animal, " have?")) 0 {'result': 2, 'full_response': '{"candidates":... @@ -297,6 +300,7 @@ def generate_double( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> animal = bpd.Series(["Kangaroo", "Rabbit", "Spider"]) >>> bbq.ai.generate_double(("How many legs does a ", animal, " have?")) 0 {'result': 2.0, 'full_response': '{"candidates... 
@@ -368,6 +372,7 @@ def if_( **Examples:** >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> us_state = bpd.Series(["Massachusetts", "Illinois", "Hawaii"]) >>> bbq.ai.if_((us_state, " has a city called Springfield")) 0 True @@ -417,6 +422,7 @@ def classify( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'creature': ['Cat', 'Salmon']}) >>> df['type'] = bbq.ai.classify(df['creature'], ['Mammal', 'Fish']) >>> df @@ -467,6 +473,7 @@ def score( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> animal = bpd.Series(["Tiger", "Rabbit", "Blue Whale"]) >>> bbq.ai.score(("Rank the relative weights of ", animal, " on the scale from 1 to 3")) # doctest: +SKIP 0 2.0 diff --git a/bigframes/bigquery/_operations/approx_agg.py b/bigframes/bigquery/_operations/approx_agg.py index 73b6fdbb73..696f8f5a66 100644 --- a/bigframes/bigquery/_operations/approx_agg.py +++ b/bigframes/bigquery/_operations/approx_agg.py @@ -40,6 +40,7 @@ def approx_top_count( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["apple", "apple", "pear", "pear", "pear", "banana"]) >>> bbq.approx_top_count(s, number=2) [{'value': 'pear', 'count': 3}, {'value': 'apple', 'count': 2}] diff --git a/bigframes/conftest.py b/bigframes/conftest.py deleted file mode 100644 index f418c9feba..0000000000 --- a/bigframes/conftest.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest - -import bigframes._config - - -@pytest.fixture(autouse=True) -def default_doctest_imports(doctest_namespace): - """ - Avoid some boilerplate in pandas-inspired tests. - - See: https://docs.pytest.org/en/stable/how-to/doctest.html#doctest-namespace-fixture - """ - try: - from bigframes.testing import polars_session - - bpd = polars_session.TestSession() - except ImportError: - # Don't skip doctest if polars isn't available. - import bigframes.pandas as bpd # type: ignore - - doctest_namespace["np"] = np - doctest_namespace["pd"] = pd - doctest_namespace["pa"] = pa - doctest_namespace["bpd"] = bpd - bigframes._config.options.display.progress_bar = None diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index 176e0ad83a..9fa5450748 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -52,6 +52,7 @@ def agg( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 @@ -246,7 +247,8 @@ def cluster_by( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -319,7 +321,8 @@ 
def filter(self, instruction: str, model, ground_with_google_search: bool = Fals **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -432,7 +435,8 @@ def map( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -554,7 +558,8 @@ def join( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -692,6 +697,7 @@ def search( ** Examples: ** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> import bigframes >>> bigframes.options.experiments.semantic_operators = True @@ -794,7 +800,8 @@ def top_k( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -994,7 +1001,8 @@ def sim_join( ** Examples: ** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm diff --git a/bigframes/operations/strings.py 
b/bigframes/operations/strings.py index c69993849a..4743483954 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -68,6 +68,7 @@ def reverse(self) -> series.Series: **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["apple", "banana", "", bpd.NA]) >>> s.str.reverse() diff --git a/third_party/bigframes_vendored/geopandas/geoseries.py b/third_party/bigframes_vendored/geopandas/geoseries.py index 20587b4d57..92a58b3dc6 100644 --- a/third_party/bigframes_vendored/geopandas/geoseries.py +++ b/third_party/bigframes_vendored/geopandas/geoseries.py @@ -18,6 +18,7 @@ class GeoSeries: >>> import bigframes.geopandas >>> import bigframes.pandas as bpd >>> from shapely.geometry import Point + >>> bpd.options.display.progress_bar = None >>> s = bigframes.geopandas.GeoSeries([Point(1, 1), Point(2, 2), Point(3, 3)]) >>> s @@ -72,6 +73,7 @@ def x(self) -> bigframes.series.Series: >>> import bigframes.pandas as bpd >>> import geopandas.array >>> import shapely.geometry + >>> bpd.options.display.progress_bar = None >>> series = bpd.Series( ... [shapely.geometry.Point(1, 2), shapely.geometry.Point(2, 3), shapely.geometry.Point(3, 4)], @@ -98,6 +100,7 @@ def y(self) -> bigframes.series.Series: >>> import bigframes.pandas as bpd >>> import geopandas.array >>> import shapely.geometry + >>> bpd.options.display.progress_bar = None >>> series = bpd.Series( ... 
[shapely.geometry.Point(1, 2), shapely.geometry.Point(2, 3), shapely.geometry.Point(3, 4)], @@ -126,6 +129,7 @@ def boundary(self) -> bigframes.geopandas.GeoSeries: >>> import bigframes.pandas as bpd >>> import geopandas.array >>> import shapely.geometry + >>> bpd.options.display.progress_bar = None >>> from shapely.geometry import Polygon, LineString, Point >>> s = geopandas.GeoSeries( @@ -167,6 +171,7 @@ def from_xy(cls, x, y, index=None, **kwargs) -> bigframes.geopandas.GeoSeries: >>> import bigframes.pandas as bpd >>> import bigframes.geopandas + >>> bpd.options.display.progress_bar = None >>> x = [2.5, 5, -3.0] >>> y = [0.5, 1, 1.5] @@ -205,6 +210,7 @@ def from_wkt(cls, data, index=None) -> bigframes.geopandas.GeoSeries: >>> import bigframes as bpd >>> import bigframes.geopandas + >>> bpd.options.display.progress_bar = None >>> wkts = [ ... 'POINT (1 1)', @@ -240,6 +246,7 @@ def to_wkt(self) -> bigframes.series.Series: >>> import bigframes as bpd >>> import bigframes.geopandas >>> from shapely.geometry import Point + >>> bpd.options.display.progress_bar = None >>> s = bigframes.geopandas.GeoSeries([Point(1, 1), Point(2, 2), Point(3, 3)]) >>> s @@ -272,6 +279,7 @@ def difference(self: GeoSeries, other: GeoSeries) -> GeoSeries: # type: ignore >>> import bigframes as bpd >>> import bigframes.geopandas >>> from shapely.geometry import Polygon, LineString, Point + >>> bpd.options.display.progress_bar = None We can check two GeoSeries against each other, row by row: @@ -403,6 +411,7 @@ def intersection(self: GeoSeries, other: GeoSeries) -> GeoSeries: # type: ignor >>> import bigframes as bpd >>> import bigframes.geopandas >>> from shapely.geometry import Polygon, LineString, Point + >>> bpd.options.display.progress_bar = None We can check two GeoSeries against each other, row by row. 
diff --git a/third_party/bigframes_vendored/pandas/AUTHORS.md b/third_party/bigframes_vendored/pandas/AUTHORS.md index 396bcbf9dd..84fcfe05e3 100644 --- a/third_party/bigframes_vendored/pandas/AUTHORS.md +++ b/third_party/bigframes_vendored/pandas/AUTHORS.md @@ -47,6 +47,7 @@ file to indicate the copyright and license terms: Other licenses can be found in the LICENSES directory. +License ======= pandas is distributed under a 3-clause ("Simplified" or "New") BSD diff --git a/third_party/bigframes_vendored/pandas/README.md b/third_party/bigframes_vendored/pandas/README.md index f92a629a4c..1aa5068d5e 100644 --- a/third_party/bigframes_vendored/pandas/README.md +++ b/third_party/bigframes_vendored/pandas/README.md @@ -60,6 +60,7 @@ Here are just a few of the things that pandas does well: generation and frequency conversion, moving window statistics, date shifting and lagging + [missing-data]: https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html [insertion-deletion]: https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html#column-selection-addition-deletion [alignment]: https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html?highlight=alignment#intro-to-data-structures @@ -119,6 +120,7 @@ python setup.py install or for installing in [development mode](https://pip.pypa.io/en/latest/cli/pip_install/#install-editable): + ```sh python -m pip install -e . --no-build-isolation --no-use-pep517 ``` diff --git a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py index 9f6dfc1c74..fe15e7b40d 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py @@ -19,12 +19,14 @@ def len(self): **Examples:** >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ ... [1, 2, 3], ... 
[3], ... ], - ... dtype=pd.ArrowDtype(pa.list_(pa.int64())), + ... dtype=bpd.ArrowDtype(pa.list_(pa.int64())), ... ) >>> s.list.len() 0 3 @@ -43,12 +45,14 @@ def __getitem__(self, key: int | slice): **Examples:** >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ ... [1, 2, 3], ... [3], ... ], - ... dtype=pd.ArrowDtype(pa.list_(pa.int64())), + ... dtype=bpd.ArrowDtype(pa.list_(pa.int64())), ... ) >>> s.list[0] 0 1 @@ -79,13 +83,15 @@ def field(self, name_or_index: str | int): **Examples:** >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ ... {"version": 1, "project": "pandas"}, ... {"version": 2, "project": "pandas"}, ... {"version": 1, "project": "numpy"}, ... ], - ... dtype=pd.ArrowDtype(pa.struct( + ... dtype=bpd.ArrowDtype(pa.struct( ... [("version", pa.int64()), ("project", pa.string())] ... )) ... ) @@ -123,13 +129,15 @@ def explode(self): **Examples:** >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ ... {"version": 1, "project": "pandas"}, ... {"version": 2, "project": "pandas"}, ... {"version": 1, "project": "numpy"}, ... ], - ... dtype=pd.ArrowDtype(pa.struct( + ... dtype=bpd.ArrowDtype(pa.struct( ... [("version", pa.int64()), ("project", pa.string())] ... )) ... ) @@ -157,13 +165,15 @@ def dtypes(self): **Examples:** >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ ... {"version": 1, "project": "pandas"}, ... {"version": 2, "project": "pandas"}, ... {"version": 1, "project": "numpy"}, ... ], - ... dtype=pd.ArrowDtype(pa.struct( + ... dtype=bpd.ArrowDtype(pa.struct( ... [("version", pa.int64()), ("project", pa.string())] ... )) ... 
) @@ -190,6 +200,8 @@ def explode(self, column, *, separator: str = "."): **Examples:** >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> bpd.options.display.progress_bar = None >>> countries = bpd.Series(["cn", "es", "us"]) >>> files = bpd.Series( ... [ @@ -197,7 +209,7 @@ def explode(self, column, *, separator: str = "."): ... {"version": 2, "project": "pandas"}, ... {"version": 1, "project": "numpy"}, ... ], - ... dtype=pd.ArrowDtype(pa.struct( + ... dtype=bpd.ArrowDtype(pa.struct( ... [("version", pa.int64()), ("project", pa.string())] ... )) ... ) @@ -221,6 +233,7 @@ def explode(self, column, *, separator: str = "."): Separator/delimiter to use to separate the original column name from the sub-field column name. + Returns: DataFrame: Original DataFrame with exploded struct column(s). diff --git a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py index eeffbbdb7f..1736a7f9ef 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py @@ -15,6 +15,8 @@ def strftime(self, date_format: str): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.to_datetime( ... ['2014-08-15 08:15:12', '2012-02-29 08:15:12+06:00', '2015-08-15 08:15:12+05:00'], ... utc=True @@ -34,7 +36,6 @@ def strftime(self, date_format: str): bigframes.pandas.Series: Series of formatted strings. """ - # TODO(tswast): remove bpd boilerplate when normalize is implemented in polars session. raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def normalize(self): @@ -50,6 +51,7 @@ def normalize(self): **Examples:** + >>> import pandas as pd >>> import bigframes.pandas as bpd >>> s = bpd.Series(pd.date_range( ... start='2014-08-01 10:00', @@ -66,7 +68,6 @@ def normalize(self): bigframes.pandas.Series: Series of the same dtype as the data. 
""" - # TODO(tswast): remove bpd boilerplate when normalize is implemented in polars session. raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def floor(self, freq: str): @@ -84,6 +85,8 @@ def floor(self, freq: str): **Examples:** + >>> import pandas as pd + >>> import bigframes.pandas as bpd >>> rng = pd.date_range('1/1/2018 11:59:00', periods=3, freq='min') >>> bpd.Series(rng).dt.floor("h") 0 2018-01-01 11:00:00 diff --git a/third_party/bigframes_vendored/pandas/core/computation/eval.py b/third_party/bigframes_vendored/pandas/core/computation/eval.py index 2f01b7edfc..d3d11a9c2a 100644 --- a/third_party/bigframes_vendored/pandas/core/computation/eval.py +++ b/third_party/bigframes_vendored/pandas/core/computation/eval.py @@ -172,6 +172,9 @@ def eval( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"animal": ["dog", "pig"], "age": [10, 20]}) >>> df animal age diff --git a/third_party/bigframes_vendored/pandas/core/computation/expr.py b/third_party/bigframes_vendored/pandas/core/computation/expr.py index ca9e6a60ce..44f649e59d 100644 --- a/third_party/bigframes_vendored/pandas/core/computation/expr.py +++ b/third_party/bigframes_vendored/pandas/core/computation/expr.py @@ -165,6 +165,7 @@ def _is_type(t): _is_list = _is_type(list) _is_str = _is_type(str) + # partition all AST nodes _all_nodes = frozenset( node @@ -196,9 +197,11 @@ def _filter_nodes(superclass, all_nodes=_all_nodes): _keyword_nodes = _filter_nodes(ast.keyword) _alias_nodes = _filter_nodes(ast.alias) + # nodes that we don't support directly but are needed for parsing _hacked_nodes = frozenset(["Assign", "Module", "Expr"]) + _unsupported_expr_nodes = frozenset( [ "Yield", diff --git a/third_party/bigframes_vendored/pandas/core/computation/ops.py b/third_party/bigframes_vendored/pandas/core/computation/ops.py index a15972fc4c..75b914c876 100644 --- a/third_party/bigframes_vendored/pandas/core/computation/ops.py 
+++ b/third_party/bigframes_vendored/pandas/core/computation/ops.py @@ -52,6 +52,7 @@ MATHOPS = _unary_math_ops + _binary_math_ops + LOCAL_TAG = "__pd_eval_local_" diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index b433c739cc..557c332797 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -39,6 +39,9 @@ def shape(self) -> tuple[int, int]: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2, 3], ... 'col2': [4, 5, 6]}) >>> df.shape @@ -60,6 +63,9 @@ def axes(self) -> list: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.axes[1:] [Index(['col1', 'col2'], dtype='object')] @@ -72,6 +78,9 @@ def values(self) -> np.ndarray: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.values array([[1, 3], @@ -101,6 +110,8 @@ def T(self) -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df col1 col2 @@ -135,6 +146,9 @@ def transpose(self) -> DataFrame: **Square DataFrame with homogeneous dtype** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> d1 = {'col1': [1, 2], 'col2': [3, 4]} >>> df1 = bpd.DataFrame(data=d1) >>> df1 @@ -242,6 +256,9 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': ["hello", "world"], 'col3': [True, False]}) >>> df.select_dtypes(include=['Int64']) col1 @@ -257,6 +274,7 @@ def 
select_dtypes(self, include=None, exclude=None) -> DataFrame: [2 rows x 2 columns] + Args: include (scalar or list-like): A selection of dtypes or strings to be included. @@ -362,6 +380,9 @@ def to_numpy( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.to_numpy() array([[1, 3], @@ -398,9 +419,11 @@ def to_gbq( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Write a DataFrame to a BigQuery table. - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> # destination_table = PROJECT_ID + "." + DATASET_ID + "." + TABLE_NAME >>> df.to_gbq("bigframes-dev.birds.test-numbers", if_exists="replace") @@ -487,6 +510,7 @@ def to_gbq( If an invalid value is provided for ``if_exists`` that is not one of ``fail``, ``replace``, or ``append``. + """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -506,6 +530,8 @@ def to_parquet( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> gcs_bucket = "gs://bigframes-dev-testing/sample_parquet*.parquet" >>> df.to_parquet(path=gcs_bucket) @@ -560,6 +586,9 @@ def to_dict( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.to_dict() {'col1': {np.int64(0): 1, np.int64(1): 2}, 'col2': {np.int64(0): 3, np.int64(1): 4}} @@ -637,17 +666,12 @@ def to_excel( **Examples:** - >>> import tempfile >>> import bigframes.pandas as bpd + >>> import tempfile + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) - - >>> try: - ... import openpyxl - ... df.to_excel(tempfile.TemporaryFile()) - ... - ... except ImportError: - ... pass # openpyxl is required. 
+ >>> df.to_excel(tempfile.TemporaryFile()) Args: excel_writer (path-like, file-like, or ExcelWriter object): @@ -679,6 +703,9 @@ def to_latex( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> print(df.to_latex()) \begin{tabular}{lrr} @@ -727,6 +754,9 @@ def to_records( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.to_records() rec.array([(0, 1, 3), (1, 2, 4)], @@ -784,6 +814,9 @@ def to_string( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> print(df.to_string()) col1 col2 @@ -881,6 +914,9 @@ def to_html( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> print(df.to_html())
@@ -988,6 +1024,9 @@ def to_markdown( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> print(df.to_markdown()) | | col1 | col2 | @@ -1019,6 +1058,9 @@ def to_pickle(self, path, *, allow_large_results, **kwargs) -> None: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> gcs_bucket = "gs://bigframes-dev-testing/sample_pickle_gcs.pkl" >>> df.to_pickle(path=gcs_bucket) @@ -1038,6 +1080,9 @@ def to_orc(self, path=None, *, allow_large_results=None, **kwargs) -> bytes | No **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> import tempfile >>> df.to_orc(tempfile.TemporaryFile()) @@ -1145,6 +1190,9 @@ def insert(self, loc, column, value, allow_duplicates=False): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) Insert a new column named 'col3' between 'col1' and 'col2' with all entries set to 5. @@ -1195,6 +1243,9 @@ def drop( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame(np.arange(12).reshape(3, 4), ... columns=['A', 'B', 'C', 'D']) >>> df @@ -1233,6 +1284,7 @@ def drop( Drop columns and/or rows of MultiIndex DataFrame: + >>> import pandas as pd >>> midx = pd.MultiIndex(levels=[['llama', 'cow', 'falcon'], ... ['speed', 'weight', 'length']], ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], @@ -1317,6 +1369,7 @@ def align( Join method is specified for each axis Index. 
+ Args: other (DataFrame or Series): join ({'outer', 'inner', 'left', 'right'}, default 'outer'): @@ -1349,6 +1402,9 @@ def rename( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) >>> df A B @@ -1418,6 +1474,9 @@ def set_index( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'month': [1, 4, 7, 10], ... 'year': [2012, 2014, 2013, 2014], ... 'sale': [55, 40, 84, 31]}) @@ -1557,6 +1616,10 @@ def reset_index( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> import numpy as np >>> df = bpd.DataFrame([('bird', 389.0), ... ('bird', 24.0), ... ('mammal', 80.5), @@ -1596,6 +1659,7 @@ class max_speed You can also use ``reset_index`` with ``MultiIndex``. + >>> import pandas as pd >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'), ... ('bird', 'parrot'), ... ('mammal', 'lion'), @@ -1636,6 +1700,7 @@ class name speed max [4 rows x 2 columns] + Args: level (int, str, tuple, or list, default None): Only remove the given levels from the index. Removes all levels by @@ -1730,9 +1795,12 @@ def dropna( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], ... "toy": [np.nan, 'Batmobile', 'Bullwhip'], - ... "born": [pd.NA, "1940-04-25", pd.NA]}) + ... "born": [bpd.NA, "1940-04-25", bpd.NA]}) >>> df name toy born 0 Alfred @@ -1821,6 +1889,7 @@ def dropna( ignore_index (bool, default ``False``): If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. + Returns: bigframes.pandas.DataFrame: DataFrame with NA entries dropped from it. 
@@ -1839,6 +1908,9 @@ def isin(self, values): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]}, ... index=['falcon', 'dog']) >>> df @@ -1892,6 +1964,9 @@ def keys(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -1910,6 +1985,8 @@ def iterrows(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -1934,6 +2011,8 @@ def itertuples(self, index: bool = True, name: str | None = "Pandas"): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -1965,6 +2044,9 @@ def items(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'species': ['bear', 'bear', 'marsupial'], ... 'population': [1864, 22000, 80000]}, ... index=['panda', 'polar', 'koala']) @@ -2003,6 +2085,9 @@ def where(self, cond, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'a': [20, 10, 0], 'b': [0, 10, 20]}) >>> df a b @@ -2092,6 +2177,9 @@ def mask(self, cond, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'a': [20, 10, 0], 'b': [0, 10, 20]}) >>> df a b @@ -2192,8 +2280,11 @@ def sort_values( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ - ... 'col1': ['A', 'A', 'B', pd.NA, 'D', 'C'], + ... 'col1': ['A', 'A', 'B', bpd.NA, 'D', 'C'], ... 'col2': [2, 1, 9, 8, 7, 4], ... 'col3': [0, 1, 9, 4, 2, 3], ... 
'col4': ['a', 'B', 'c', 'D', 'e', 'F'] @@ -2333,6 +2424,9 @@ def eq(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can use method name: >>> df = bpd.DataFrame({'angles': [0, 3, 4], @@ -2373,6 +2467,9 @@ def __eq__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'a': [0, 3, 4], ... 'b': [360, 0, 180] @@ -2401,6 +2498,9 @@ def __invert__(self) -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'a':[True, False, True], 'b':[-1, 0, 1]}) >>> ~df a b @@ -2427,6 +2527,9 @@ def ne(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can use method name: >>> df = bpd.DataFrame({'angles': [0, 3, 4], @@ -2466,6 +2569,9 @@ def __ne__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'a': [0, 3, 4], ... 'b': [360, 0, 180] @@ -2503,6 +2609,9 @@ def le(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can use method name: >>> df = bpd.DataFrame({'angles': [0, 3, 4], @@ -2543,6 +2652,9 @@ def __le__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'a': [0, -1, 1], ... 
'b': [1, 0, -1] @@ -2580,6 +2692,9 @@ def lt(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can use method name: >>> df = bpd.DataFrame({'angles': [0, 3, 4], @@ -2620,6 +2735,9 @@ def __lt__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'a': [0, -1, 1], ... 'b': [1, 0, -1] @@ -2657,6 +2775,9 @@ def ge(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can use method name: >>> df = bpd.DataFrame({'angles': [0, 3, 4], @@ -2697,6 +2818,9 @@ def __ge__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'a': [0, -1, 1], ... 'b': [1, 0, -1] @@ -2734,6 +2858,9 @@ def gt(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'angles': [0, 3, 4], ... 'degrees': [360, 180, 360]}, ... index=['circle', 'triangle', 'rectangle']) @@ -2772,6 +2899,9 @@ def __gt__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'a': [0, -1, 1], ... 'b': [1, 0, -1] @@ -2806,6 +2936,9 @@ def add(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -2847,6 +2980,9 @@ def __add__(self, other) -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'height': [1.5, 2.6], ... 
'weight': [500, 800] @@ -2919,6 +3055,9 @@ def radd(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -2979,6 +3118,9 @@ def sub(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3020,6 +3162,9 @@ def __sub__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can subtract a scalar: >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -3065,6 +3210,9 @@ def rsub(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3123,6 +3271,9 @@ def mul(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3164,6 +3315,9 @@ def __mul__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can multiply with a scalar: >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -3209,6 +3363,9 @@ def rmul(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 
'B': [4, 5, 6], @@ -3250,6 +3407,9 @@ def __rmul__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can multiply with a scalar: >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -3295,6 +3455,9 @@ def truediv(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3336,6 +3499,9 @@ def __truediv__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can multiply with a scalar: >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -3381,6 +3547,9 @@ def rtruediv(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3439,6 +3608,9 @@ def floordiv(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3480,6 +3652,9 @@ def __floordiv__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can divide by a scalar: >>> df = bpd.DataFrame({"a": [15, 15, 15], "b": [30, 30, 30]}) @@ -3525,6 +3700,9 @@ def rfloordiv(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3583,6 +3761,9 @@ def mod(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 
'B': [4, 5, 6], @@ -3624,6 +3805,9 @@ def __mod__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can modulo with a scalar: >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -3669,6 +3853,9 @@ def rmod(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3728,6 +3915,9 @@ def pow(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3770,6 +3960,9 @@ def __pow__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can exponentiate with a scalar: >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -3816,6 +4009,9 @@ def rpow(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 
'B': [4, 5, 6], @@ -3909,6 +4105,9 @@ def combine( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df1 = bpd.DataFrame({'A': [0, 0], 'B': [4, 4]}) >>> df2 = bpd.DataFrame({'A': [1, 1], 'B': [3, 3]}) >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2 @@ -3956,6 +4155,9 @@ def combine_first(self, other) -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df1 = bpd.DataFrame({'A': [None, 0], 'B': [None, 4]}) >>> df2 = bpd.DataFrame({'A': [1, 1], 'B': [3, 3]}) >>> df1.combine_first(df2) @@ -3983,6 +4185,10 @@ def explode( **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'A': [[0, 1, 2], [], [], [3, 4]], ... 'B': 1, ... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]}) @@ -4038,6 +4244,9 @@ def corr(self, method, min_periods, numeric_only) -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'A': [1, 2, 3], ... 'B': [400, 500, 600], ... 'C': [0.8, 0.4, 0.9]}) @@ -4069,6 +4278,9 @@ def cov(self, *, numeric_only) -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'A': [1, 2, 3], ... 'B': [400, 500, 600], ... 'C': [0.8, 0.4, 0.9]}) @@ -4105,6 +4317,9 @@ def corrwith( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> index = ["a", "b", "c", "d", "e"] >>> columns = ["one", "two", "three", "four"] >>> df1 = bpd.DataFrame(np.arange(20).reshape(5, 4), index=index, columns=columns) @@ -4138,6 +4353,9 @@ def update( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'A': [1, 2, 3], ... 
'B': [400, 500, 600]}) >>> new_df = bpd.DataFrame({'B': [4, 5, 6], @@ -4200,6 +4418,9 @@ def groupby( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'Animal': ['Falcon', 'Falcon', ... 'Parrot', 'Parrot'], ... 'Max Speed': [380., 370., 24., 26.]}) @@ -4294,18 +4515,17 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Let's use ``reuse=False`` flag to make sure a new ``remote_function`` is created every time we run the following code, but you can skip it to potentially reuse a previously deployed ``remote_function`` from the same user defined function. - >>> def minutes_to_hours(x: int) -> float: - ... return x / 60 - >>> minutes_to_hours = bpd.deploy_remote_function( - ... minutes_to_hours, - ... reuse=False, - ... cloud_function_service_account="default", - ... ) # doctest: +SKIP + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") + ... def minutes_to_hours(x: int) -> float: + ... return x/60 >>> df_minutes = bpd.DataFrame( ... {"system_minutes" : [0, 30, 60, 90, 120], @@ -4320,8 +4540,8 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: [5 rows x 2 columns] - >>> df_hours = df_minutes.map(minutes_to_hours) # doctest: +SKIP - >>> df_hours # doctest: +SKIP + >>> df_hours = df_minutes.map(minutes_to_hours) + >>> df_hours system_minutes user_minutes 0 0.0 0.0 1 0.5 0.25 @@ -4337,11 +4557,11 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: >>> df_minutes = bpd.DataFrame( ... { - ... "system_minutes" : [0, 30, 60, None, 90, 120, pd.NA], - ... "user_minutes" : [0, 15, 75, 90, 6, None, pd.NA] + ... "system_minutes" : [0, 30, 60, None, 90, 120, bpd.NA], + ... "user_minutes" : [0, 15, 75, 90, 6, None, bpd.NA] ... 
}, dtype="Int64") - >>> df_hours = df_minutes.map(minutes_to_hours, na_action='ignore') # doctest: +SKIP - >>> df_hours # doctest: +SKIP + >>> df_hours = df_minutes.map(minutes_to_hours, na_action='ignore') + >>> df_hours system_minutes user_minutes 0 0.0 0.0 1 0.5 0.25 @@ -4392,6 +4612,9 @@ def join( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Join two DataFrames by specifying how to handle the operation: >>> df1 = bpd.DataFrame({'col1': ['foo', 'bar'], 'col2': [1, 2]}, index=[10, 11]) @@ -4445,6 +4668,7 @@ def join( [1 rows x 4 columns] + Another option to join using the key columns is to use the on parameter: >>> df1.join(df2, on="col2", how="right") @@ -4540,6 +4764,9 @@ def merge( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Merge DataFrames df1 and df2 by specifying type of merge: >>> df1 = bpd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) @@ -4670,6 +4897,7 @@ def round(self, decimals): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)], ... columns=['dogs', 'cats']) >>> df @@ -4752,6 +4980,10 @@ def apply(self, func, *, axis=0, args=(), **kwargs): **Examples:** + >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df col1 col2 @@ -4776,14 +5008,14 @@ def apply(self, func, *, axis=0, args=(), **kwargs): to select only the necessary columns before calling `apply()`. Note: This feature is currently in **preview**. - >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") # doctest: +SKIP + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") ... def foo(row: pd.Series) -> int: ... result = 1 ... result += row["col1"] ... result += row["col2"]*row["col2"] ... 
return result - >>> df[["col1", "col2"]].apply(foo, axis=1) # doctest: +SKIP + >>> df[["col1", "col2"]].apply(foo, axis=1) 0 11 1 19 dtype: Int64 @@ -4791,7 +5023,7 @@ def apply(self, func, *, axis=0, args=(), **kwargs): You could return an array output for every input row from the remote function. - >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") # doctest: +SKIP + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") ... def marks_analyzer(marks: pd.Series) -> list[float]: ... import statistics ... average = marks.mean() @@ -4808,8 +5040,8 @@ def apply(self, func, *, axis=0, args=(), **kwargs): ... "chemistry": [88, 56, 72], ... "algebra": [78, 91, 79] ... }, index=["Alice", "Bob", "Charlie"]) - >>> stats = df.apply(marks_analyzer, axis=1) # doctest: +SKIP - >>> stats # doctest: +SKIP + >>> stats = df.apply(marks_analyzer, axis=1) + >>> stats Alice [77.67 78. 77.19 76.71] Bob [75.67 80. 74.15 72.56] Charlie [75.33 75. 75.28 75.22] @@ -4832,14 +5064,14 @@ def apply(self, func, *, axis=0, args=(), **kwargs): [2 rows x 3 columns] - >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") # doctest: +SKIP + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") ... def foo(x: int, y: int, z: int) -> float: ... result = 1 ... result += x ... result += y/z ... 
return result - >>> df.apply(foo, axis=1) # doctest: +SKIP + >>> df.apply(foo, axis=1) 0 2.6 1 3.8 dtype: Float64 @@ -4899,6 +5131,9 @@ def any(self, *, axis=0, bool_only: bool = False): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [True, True], "B": [False, False]}) >>> df A B @@ -4943,6 +5178,9 @@ def all(self, axis=0, *, bool_only: bool = False): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [True, True], "B": [False, False]}) >>> df A B @@ -4984,6 +5222,8 @@ def prod(self, axis=0, *, numeric_only: bool = False): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [1, 2, 3], "B": [4.5, 5.5, 6.5]}) >>> df A B @@ -5028,6 +5268,9 @@ def min(self, axis=0, *, numeric_only: bool = False): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df A B @@ -5070,6 +5313,9 @@ def max(self, axis=0, *, numeric_only: bool = False): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df A B @@ -5111,6 +5357,9 @@ def sum(self, axis=0, *, numeric_only: bool = False): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df A B @@ -5150,6 +5399,9 @@ def mean(self, axis=0, *, numeric_only: bool = False): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df A B @@ -5190,6 +5442,8 @@ def median(self, *, numeric_only: bool = False, exact: bool = True): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> 
df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df A B @@ -5226,6 +5480,7 @@ def quantile( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), ... columns=['a', 'b']) >>> df.quantile(.1) @@ -5262,6 +5517,9 @@ def var(self, axis=0, *, numeric_only: bool = False): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df A B @@ -5284,6 +5542,7 @@ def var(self, axis=0, *, numeric_only: bool = False): 1 0.5 dtype: Float64 + Args: axis ({index (0), columns (1)}): Axis for the function to be applied on. @@ -5303,6 +5562,9 @@ def skew(self, *, numeric_only: bool = False): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'A': [1, 2, 3, 4, 5], ... 'B': [5, 4, 3, 2, 1], ... 'C': [2, 2, 3, 2, 2]}) @@ -5341,6 +5603,9 @@ def kurt(self, *, numeric_only: bool = False): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [1, 2, 3, 4, 5], ... "B": [3, 4, 3, 2, 1], ... "C": [2, 2, 3, 2, 2]}) @@ -5378,6 +5643,9 @@ def std(self, *, numeric_only: bool = False): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [1, 2, 3, 4, 5], ... "B": [3, 4, 3, 2, 1], ... "C": [2, 2, 3, 2, 2]}) @@ -5417,6 +5685,9 @@ def count(self, *, numeric_only: bool = False): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [1, None, 3, 4, 5], ... "B": [1, 2, 3, 4, 5], ... 
"C": [None, 3.5, None, 4.5, 5.0]}) @@ -5468,6 +5739,8 @@ def nlargest(self, n: int, columns, keep: str = "first"): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [1, 1, 3, 3, 5, 5], ... "B": [5, 6, 3, 4, 1, 2], ... "C": ['a', 'b', 'a', 'b', 'a', 'b']}) @@ -5558,6 +5831,8 @@ def nsmallest(self, n: int, columns, keep: str = "first"): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [1, 1, 3, 3, 5, 5], ... "B": [5, 6, 3, 4, 1, 2], ... "C": ['a', 'b', 'a', 'b', 'a', 'b']}) @@ -5605,6 +5880,7 @@ def nsmallest(self, n: int, columns, keep: str = "first"): [1 rows x 3 columns] + Args: n (int): Number of rows to return. @@ -5636,6 +5912,9 @@ def idxmin(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -5663,6 +5942,9 @@ def idxmax(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -5694,6 +5976,9 @@ def melt(self, id_vars, value_vars, var_name, value_name): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [1, None, 3, 4, 5], ... "B": [1, 2, 3, 4, 5], ... "C": [None, 3.5, None, 4.5, 5.0]}) @@ -5742,6 +6027,7 @@ def melt(self, id_vars, value_vars, var_name, value_name): [10 rows x 3 columns] + Args: id_vars (tuple, list, or ndarray, optional): Column(s) to use as identifier variables. 
@@ -5765,6 +6051,9 @@ def nunique(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 2]}) >>> df A B @@ -5791,6 +6080,9 @@ def cummin(self) -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -5820,6 +6112,9 @@ def cummax(self) -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -5849,6 +6144,9 @@ def cumsum(self) -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -5883,6 +6181,9 @@ def cumprod(self) -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -5921,6 +6222,9 @@ def diff( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -5966,6 +6270,9 @@ def agg(self, func): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -6028,6 +6335,8 @@ def describe(self, include: None | Literal["all"] = None): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [0, 2, 8], "C": ["cat", "cat", "dog"]}) >>> df A B C @@ -6050,6 +6359,7 @@ def describe(self, include: None | Literal["all"] = None): [8 rows x 2 columns] + Using describe with include = "all": >>> df.describe(include="all") A B C @@ -6096,6 +6406,9 @@ def pivot(self, *, 
columns, index=None, values=None): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... "foo": ["one", "one", "one", "two", "two"], ... "bar": ["A", "B", "C", "A", "B"], @@ -6164,6 +6477,8 @@ def pivot_table(self, values=None, index=None, columns=None, aggfunc="mean"): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'Product': ['Product A', 'Product B', 'Product A', 'Product B', 'Product A', 'Product B'], ... 'Region': ['East', 'West', 'East', 'West', 'West', 'East'], @@ -6254,6 +6569,9 @@ def stack(self, level=-1): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'A': [1, 3], 'B': [2, 4]}, index=['foo', 'bar']) >>> df A B @@ -6290,6 +6608,9 @@ def unstack(self, level=-1): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'A': [1, 3], 'B': [2, 4]}, index=['foo', 'bar']) >>> df A B @@ -6328,6 +6649,9 @@ def index(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can access the index of a DataFrame via ``index`` property. >>> df = bpd.DataFrame({'Name': ['Alice', 'Bob', 'Aritra'], @@ -6378,6 +6702,9 @@ def columns(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can access the column labels of a DataFrame via ``columns`` property. >>> df = bpd.DataFrame({'Name': ['Alice', 'Bob', 'Aritra'], @@ -6423,8 +6750,11 @@ def value_counts( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'num_legs': [2, 4, 4, 6, 7], - ... 'num_wings': [2, 0, 0, 0, pd.NA]}, + ... 'num_wings': [2, 0, 0, 0, bpd.NA]}, ... index=['falcon', 'dog', 'cat', 'ant', 'octopus'], ... 
dtype='Int64') >>> df @@ -6501,6 +6831,9 @@ def eval(self, expr: str) -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) >>> df A B @@ -6558,6 +6891,7 @@ def eval(self, expr: str) -> DataFrame: [5 rows x 4 columns] + Args: expr (str): The expression string to evaluate. @@ -6573,6 +6907,9 @@ def query(self, expr: str) -> DataFrame | None: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'A': range(1, 6), ... 'B': range(10, 0, -2), ... 'C C': range(10, 5, -1)}) @@ -6645,6 +6982,9 @@ def interpolate(self, method: str = "linear"): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3, None, None, 6], ... 'B': [None, 6, None, 2, None, 3], @@ -6692,6 +7032,9 @@ def fillna(self, value): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame([[np.nan, 2, np.nan, 0], ... [3, 4, np.nan, 1], ... [np.nan, np.nan, np.nan, np.nan], @@ -6767,6 +7110,8 @@ def replace( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'int_col': [1, 1, 2, 3], ... 'string_col': ["a", "b", "c", "b"], @@ -6805,6 +7150,7 @@ def replace( [4 rows x 2 columns] + Args: to_replace (str, regex, list, int, float or None): How to find the values that will be replaced. @@ -6860,6 +7206,9 @@ def iat(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], ... 
columns=['A', 'B', 'C']) >>> df @@ -6891,6 +7240,9 @@ def at(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], ... index=[4, 5, 6], columns=['A', 'B', 'C']) >>> df @@ -6937,6 +7289,9 @@ def dot(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> left = bpd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]]) >>> left 0 1 2 3 @@ -7028,6 +7383,9 @@ def __matmul__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> left = bpd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]]) >>> left 0 1 2 3 @@ -7085,6 +7443,9 @@ def __len__(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'a': [0, 1, 2], ... 'b': [3, 4, 5] @@ -7105,6 +7466,10 @@ def __array__(self, dtype=None, copy: Optional[bool] = None): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> import numpy as np + >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [11, 22, 33]}) >>> np.array(df) @@ -7136,6 +7501,9 @@ def __getitem__(self, key): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... "name" : ["alpha", "beta", "gamma"], ... "age": [20, 30, 40], @@ -7179,6 +7547,7 @@ def __getitem__(self, key): You can specify a pandas Index with desired column labels. + >>> import pandas as pd >>> df[pd.Index(["age", "location"])] age location 0 20 WA @@ -7207,6 +7576,9 @@ def __setitem__(self, key, value): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... "name" : ["alpha", "beta", "gamma"], ... 
"age": [20, 30, 40], diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index e8079e573b..273339efcf 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -38,6 +38,9 @@ def size(self) -> int: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series({'a': 1, 'b': 2, 'c': 3}) >>> s.size 3 @@ -62,6 +65,9 @@ def __iter__(self) -> Iterator: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -100,6 +106,9 @@ def astype(self, dtype): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Create a DataFrame: >>> d = {'col1': [1, 2], 'col2': [3, 4]} @@ -143,7 +152,7 @@ def astype(self, dtype): Note that this is equivalent of using ``to_datetime`` with ``unit='us'``: - >>> bpd.to_datetime(ser, unit='us', utc=True) # doctest: +SKIP + >>> bpd.to_datetime(ser, unit='us', utc=True) 0 2034-02-08 11:13:20.246789+00:00 1 2021-06-19 17:20:44.123101+00:00 2 2003-06-05 17:30:34.120101+00:00 @@ -341,6 +350,9 @@ def get(self, key, default=None): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame( ... [ ... [24.3, 75.7, "high"], @@ -449,6 +461,9 @@ def head(self, n: int = 5): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion', ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']}) >>> df @@ -547,6 +562,8 @@ def sample( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'num_legs': [2, 4, 8, 0], ... 'num_wings': [2, 0, 0, 0], ... 
'num_specimen_seen': [10, 2, 1, 8]}, @@ -626,6 +643,9 @@ def dtypes(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'float': [1.0], 'int': [1], 'string': ['foo']}) >>> df.dtypes float Float64 @@ -648,6 +668,9 @@ def copy(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Modification in the original Series will not affect the copy Series: >>> s = bpd.Series([1, 2], index=["a", "b"]) @@ -718,6 +741,10 @@ def ffill(self, *, limit: Optional[int] = None): **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame([[np.nan, 2, np.nan, 0], ... [3, 4, np.nan, 1], ... [np.nan, np.nan, np.nan, np.nan], @@ -743,6 +770,7 @@ def ffill(self, *, limit: Optional[int] = None): [4 rows x 4 columns] + Fill NA/NaN values in Series: >>> series = bpd.Series([1, np.nan, 2, 3]) @@ -762,6 +790,7 @@ def ffill(self, *, limit: Optional[int] = None): maximum number of entries along the entire axis where NaNs will be filled. Must be greater than 0 if not None. + Returns: bigframes.pandas.DataFrame or bigframes.pandas.Series or None: Object with missing values filled. @@ -796,9 +825,13 @@ def isna(self) -> NDFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> import numpy as np + >>> df = bpd.DataFrame(dict( ... age=[5, 6, np.nan], - ... born=[pd.NA, "1940-04-25", "1940-04-25"], + ... born=[bpd.NA, "1940-04-25", "1940-04-25"], ... name=['Alfred', 'Batman', ''], ... toy=[None, 'Batmobile', 'Joker'], ... 
)) @@ -830,7 +863,7 @@ def isna(self) -> NDFrame: Show which entries in a Series are NA: - >>> ser = bpd.Series([5, None, 6, np.nan, pd.NA]) + >>> ser = bpd.Series([5, None, 6, np.nan, bpd.NA]) >>> ser 0 5 1 @@ -1035,6 +1068,8 @@ def rolling( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([0,1,2,3,4]) >>> s.rolling(window=3).min() 0 @@ -1119,6 +1154,10 @@ def pipe( Constructing a income DataFrame from a dictionary. + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> data = [[8000, 1000], [9500, np.nan], [5000, 2000]] >>> df = bpd.DataFrame(data, columns=['Salary', 'Others']) >>> df diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index 8dba97ff07..1e39ec8f94 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -45,6 +45,8 @@ def describe(self, include: None | Literal["all"] = None): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [1, 1, 1, 2, 2], "B": [0, 2, 8, 2, 7], "C": ["cat", "cat", "dog", "mouse", "cat"]}) >>> df A B C @@ -84,6 +86,8 @@ def any(self): For SeriesGroupBy: >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([1, 2, 0], index=lst) >>> ser.groupby(level=0).any() @@ -121,6 +125,8 @@ def all(self): For SeriesGroupBy: >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([1, 2, 0], index=lst) >>> ser.groupby(level=0).all() @@ -157,6 +163,10 @@ def count(self): For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'b'] 
>>> ser = bpd.Series([1, 2, np.nan], index=lst) >>> ser.groupby(level=0).count() @@ -192,6 +202,9 @@ def mean( **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'A': [1, 1, 2, 1, 2], ... 'B': [np.nan, 2, 3, 4, 5], ... 'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C']) @@ -250,6 +263,9 @@ def median( For SeriesGroupBy: >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 4, 3, 3], index=lst) >>> ser.groupby(level=0).median() @@ -288,6 +304,7 @@ def quantile(self, q=0.5, *, numeric_only: bool = False): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame([ ... ['a', 1], ['a', 2], ['a', 3], ... ['b', 1], ['b', 3], ['b', 5] @@ -326,6 +343,10 @@ def std( For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 4, 3, 3], index=lst) >>> ser.groupby(level=0).std() @@ -369,6 +390,10 @@ def var( For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 4, 3, 3], index=lst) >>> ser.groupby(level=0).var() @@ -410,6 +435,9 @@ def rank( **Examples:** >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame( ... { ... "group": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], @@ -482,6 +510,10 @@ def skew( For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> ser = bpd.Series([390., 350., 357., np.nan, 22., 20., 30.], ... 
index=['Falcon', 'Falcon', 'Falcon', 'Falcon', ... 'Parrot', 'Parrot', 'Parrot'], @@ -514,6 +546,9 @@ def kurt( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'] >>> ser = bpd.Series([0, 1, 1, 0, 0, 1, 2, 4, 5], index=lst) >>> ser.groupby(level=0).kurt() @@ -544,6 +579,9 @@ def kurtosis( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'] >>> ser = bpd.Series([0, 1, 1, 0, 0, 1, 2, 4, 5], index=lst) >>> ser.groupby(level=0).kurtosis() @@ -568,8 +606,9 @@ def first(self, numeric_only: bool = False, min_count: int = -1): Defaults to skipping NA elements. **Examples:** - >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame(dict(A=[1, 1, 3], B=[None, 5, 6], C=[1, 2, 3])) >>> df.groupby("A").first() B C @@ -608,6 +647,8 @@ def last(self, numeric_only: bool = False, min_count: int = -1): Defaults to skipping NA elements. 
**Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame(dict(A=[1, 1, 3], B=[5, None, 6], C=[1, 2, 3])) >>> df.groupby("A").last() @@ -644,6 +685,9 @@ def sum( For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) >>> ser.groupby(level=0).sum() @@ -686,6 +730,10 @@ def prod(self, numeric_only: bool = False, min_count: int = 0): For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) >>> ser.groupby(level=0).prod() @@ -718,6 +766,10 @@ def min( For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) >>> ser.groupby(level=0).min() @@ -763,6 +815,9 @@ def max( For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) >>> ser.groupby(level=0).max() @@ -804,6 +859,9 @@ def cumcount(self, ascending: bool = True): For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'b', 'b', 'c'] >>> ser = bpd.Series([5, 1, 2, 3, 4], index=lst) >>> ser.groupby(level=0).cumcount() @@ -839,6 +897,10 @@ def cumprod(self, *args, **kwargs): For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([6, 2, 0], index=lst) >>> ser.groupby(level=0).cumprod() @@ -874,6 +936,10 @@ def cumsum(self, *args, **kwargs): For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> 
bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([6, 2, 0], index=lst) >>> ser.groupby(level=0).cumsum() @@ -909,6 +975,10 @@ def cummin(self, *args, numeric_only: bool = False, **kwargs): For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([6, 2, 0], index=lst) >>> ser.groupby(level=0).cummin() @@ -944,6 +1014,10 @@ def cummax(self, *args, numeric_only: bool = False, **kwargs): For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([6, 2, 0], index=lst) >>> ser.groupby(level=0).cummax() @@ -981,6 +1055,10 @@ def diff(self): For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 4, 3, 3], index=lst) >>> ser.groupby(level=0).diff() @@ -1023,6 +1101,10 @@ def shift(self, periods: int = 1): For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) >>> ser.groupby(level=0).shift(1) @@ -1063,6 +1145,9 @@ def rolling(self, *args, **kwargs): **Examples:** >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'a', 'a', 'e'] >>> ser = bpd.Series([1, 0, -2, -1, 2], index=lst) >>> ser.groupby(level=0).rolling(2).min() @@ -1119,6 +1204,9 @@ def expanding(self, *args, **kwargs): **Examples:** >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'c', 'c', 'e'] >>> ser = bpd.Series([1, 0, -2, -1, 2], index=lst) >>> ser.groupby(level=0).expanding().min() @@ -1142,6 
+1230,9 @@ def head(self, n: int = 5): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame([[1, 2], [1, 4], [5, 6]], ... columns=['A', 'B']) >>> df.groupby('A').head(1) @@ -1168,6 +1259,9 @@ def size(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + For SeriesGroupBy: >>> lst = ['a', 'a', 'b'] @@ -1219,6 +1313,9 @@ def __iter__(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + For SeriesGroupBy: >>> lst = ["a", "a", "b"] @@ -1280,6 +1377,10 @@ def agg(self, func): **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3, 4], index=[1, 1, 2, 2]) >>> s.groupby(level=0).agg(['min', 'max']) min max @@ -1309,6 +1410,10 @@ def aggregate(self, func): **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3, 4], index=[1, 1, 2, 2]) >>> s.groupby(level=0).aggregate(['min', 'max']) min max @@ -1338,6 +1443,10 @@ def nunique(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 3], index=lst) >>> ser.groupby(level=0).nunique() @@ -1385,6 +1494,10 @@ def agg(self, func, **kwargs): **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> data = {"A": [1, 1, 2, 2], ... "B": [1, 2, 3, 4], ... "C": [0.362838, 0.227877, 1.267767, -0.562860]} @@ -1441,6 +1554,10 @@ def aggregate(self, func, **kwargs): **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> data = {"A": [1, 1, 2, 2], ... "B": [1, 2, 3, 4], ... 
"C": [0.362838, 0.227877, 1.267767, -0.562860]} @@ -1497,6 +1614,10 @@ def nunique(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam', ... 'ham', 'ham'], ... 'value1': [1, 5, 5, 2, 5, 5], @@ -1529,6 +1650,10 @@ def value_counts( **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], diff --git a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py index 0e74b3e178..0dd487d056 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py @@ -12,6 +12,9 @@ def day(self): **Examples:** + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="D") ... ) @@ -39,6 +42,9 @@ def dayofweek(self): **Examples:** + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range('2016-12-31', '2017-01-08', freq='D').to_series() ... ) @@ -70,6 +76,9 @@ def day_of_week(self): **Examples:** + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range('2016-12-31', '2017-01-08', freq='D').to_series() ... ) @@ -97,7 +106,9 @@ def dayofyear(self): **Examples:** + >>> import pandas as pd >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range('2016-12-28', '2017-01-03', freq='D').to_series() ... 
) @@ -123,7 +134,9 @@ def day_of_year(self): **Examples:** + >>> import pandas as pd >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range('2016-12-28', '2017-01-03', freq='D').to_series() ... ) @@ -155,6 +168,7 @@ def date(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%d/%m/%Y %H:%M:%S%Ez") >>> s @@ -175,7 +189,9 @@ def hour(self): **Examples:** + >>> import pandas as pd >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="h") ... ) @@ -199,7 +215,9 @@ def minute(self): **Examples:** + >>> import pandas as pd >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="min") ... ) @@ -223,6 +241,9 @@ def month(self): **Examples:** + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="M") ... ) @@ -246,6 +267,9 @@ def isocalendar(self): **Examples:** + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range('2009-12-27', '2010-01-04', freq='d').to_series() ... ) @@ -263,9 +287,11 @@ def isocalendar(self): [9 rows x 3 columns] + Returns: DataFrame With columns year, week and day. + """ @property @@ -274,7 +300,9 @@ def second(self): **Examples:** + >>> import pandas as pd >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="s") ... 
) @@ -303,6 +331,7 @@ def time(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") >>> s @@ -324,6 +353,7 @@ def quarter(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "4/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") >>> s @@ -344,6 +374,9 @@ def year(self): **Examples:** + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="Y") ... ) @@ -367,6 +400,9 @@ def days(self): **Examples:** + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([pd.Timedelta("4d3m2s1us")]) >>> s 0 4 days 00:03:02.000001 @@ -382,6 +418,9 @@ def seconds(self): **Examples:** + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([pd.Timedelta("4d3m2s1us")]) >>> s 0 4 days 00:03:02.000001 @@ -397,6 +436,9 @@ def microseconds(self): **Examples:** + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([pd.Timedelta("4d3m2s1us")]) >>> s 0 4 days 00:03:02.000001 @@ -411,6 +453,9 @@ def total_seconds(self): **Examples:** + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([pd.Timedelta("1d1m1s1us")]) >>> s 0 1 days 00:01:01.000001 @@ -427,6 +472,7 @@ def tz(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = 
bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") >>> s @@ -449,6 +495,7 @@ def unit(self) -> str: **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") >>> s diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index 04f7f5938d..eba47fc1f9 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -32,6 +32,9 @@ def name(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([1, 2, 3], name='x') >>> idx Index([1, 2, 3], dtype='Int64', name='x') @@ -60,6 +63,9 @@ def values(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([1, 2, 3]) >>> idx Index([1, 2, 3], dtype='Int64') @@ -80,6 +86,9 @@ def ndim(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) >>> s 0 Ant @@ -112,6 +121,9 @@ def size(self) -> int: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + For Series: >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) @@ -144,6 +156,9 @@ def is_monotonic_increasing(self) -> bool: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> bool(bpd.Index([1, 2, 3]).is_monotonic_increasing) True @@ -166,6 +181,9 @@ def is_monotonic_decreasing(self) -> bool: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> bool(bpd.Index([3, 2, 1]).is_monotonic_decreasing) True @@ -188,6 +206,9 @@ def from_frame(cls, frame) -> Index: **Examples:** + >>> import 
bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame([['HI', 'Temp'], ['HI', 'Precip'], ... ['NJ', 'Temp'], ['NJ', 'Precip']], ... columns=['a', 'b']) @@ -225,6 +246,9 @@ def shape(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([1, 2, 3]) >>> idx Index([1, 2, 3], dtype='Int64') @@ -244,6 +268,9 @@ def nlevels(self) -> int: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> mi = bpd.MultiIndex.from_arrays([['a'], ['b'], ['c']]) >>> mi MultiIndex([('a', 'b', 'c')], @@ -263,6 +290,9 @@ def is_unique(self) -> bool: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([1, 5, 7, 7]) >>> idx.is_unique False @@ -283,6 +313,9 @@ def has_duplicates(self) -> bool: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([1, 5, 7, 7]) >>> bool(idx.has_duplicates) True @@ -303,6 +336,9 @@ def dtype(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([1, 2, 3]) >>> idx Index([1, 2, 3], dtype='Int64') @@ -328,6 +364,9 @@ def T(self) -> Index: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) >>> s 0 Ant @@ -364,6 +403,9 @@ def copy( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index(['a', 'b', 'c']) >>> new_idx = idx.copy() >>> idx is new_idx @@ -396,10 +438,14 @@ def astype(self, dtype): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([1, 2, 3]) >>> idx Index([1, 2, 3], dtype='Int64') + Args: dtype (str, data type, or pandas.ExtensionDtype): A dtype supported by BigQuery DataFrame 
include ``'boolean'``, @@ -441,6 +487,9 @@ def get_level_values(self, level) -> Index: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index(list('abc')) >>> idx Index(['a', 'b', 'c'], dtype='string') @@ -468,6 +517,9 @@ def to_series(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index(['Ant', 'Bear', 'Cow'], name='animal') By default, the original index and original name is reused. @@ -519,6 +571,9 @@ def isin(self, values): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([1,2,3]) >>> idx Index([1, 2, 3], dtype='Int64') @@ -556,6 +611,9 @@ def all(self) -> bool: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + True, because nonzero integers are considered True. >>> bool(bpd.Index([1, 2, 3]).all()) @@ -581,6 +639,9 @@ def any(self) -> bool: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> index = bpd.Index([0, 1, 2]) >>> bool(index.any()) True @@ -604,6 +665,9 @@ def min(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([3, 2, 1]) >>> int(idx.min()) 1 @@ -623,6 +687,9 @@ def max(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([3, 2, 1]) >>> int(idx.max()) 3 @@ -646,6 +713,9 @@ def argmin(self) -> int: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Consider dataset containing cereal calories >>> s = bpd.Series({'Corn Flakes': 100.0, 'Almond Delight': 110.0, @@ -680,6 +750,9 @@ def get_loc( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> unique_index = bpd.Index(list('abc')) >>> 
unique_index.get_loc('b') 1 @@ -721,6 +794,9 @@ def argmax(self) -> int: Consider dataset containing cereal calories + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series({'Corn Flakes': 100.0, 'Almond Delight': 110.0, ... 'Cinnamon Toast Crunch': 120.0, 'Cocoa Puff': 110.0}) >>> s @@ -752,6 +828,9 @@ def nunique(self) -> int: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 3, 5, 7, 7]) >>> s 0 1 @@ -781,6 +860,9 @@ def sort_values( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([10, 100, 1, 1000]) >>> idx Index([10, 100, 1, 1000], dtype='Int64') @@ -822,6 +904,10 @@ def value_counts( **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> index = bpd.Index([3, 1, 2, 3, 4, np.nan]) >>> index.value_counts() 3.0 2 @@ -875,6 +961,10 @@ def fillna(self, value) -> Index: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([np.nan, np.nan, 3]) >>> idx.fillna(0) Index([0.0, 0.0, 3.0], dtype='Float64') @@ -902,6 +992,9 @@ def rename(self, name, *, inplace): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index(['A', 'C', 'A', 'B'], name='score') >>> idx.rename('grade') Index(['A', 'C', 'A', 'B'], dtype='string', name='grade') @@ -929,6 +1022,9 @@ def drop(self, labels) -> Index: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index(['a', 'b', 'c']) >>> idx.drop(['a']) Index(['b', 'c'], dtype='string') @@ -946,6 +1042,10 @@ def dropna(self, how: typing.Literal["all", "any"] = "any"): **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> 
bpd.options.display.progress_bar = None + >>> idx = bpd.Index([1, np.nan, 3]) >>> idx.dropna() Index([1.0, 3.0], dtype='Float64') @@ -970,9 +1070,11 @@ def drop_duplicates(self, *, keep: str = "first"): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Generate an pandas.Index with duplicate values. - >>> import bigframes.pandas as bpd >>> idx = bpd.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo']) The keep parameter controls which duplicate values are removed. @@ -1011,6 +1113,8 @@ def unique(self, level: Hashable | int | None = None): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> idx = bpd.Index([1, 1, 2, 3, 3]) >>> idx.unique() Index([1, 2, 3], dtype='Int64') @@ -1030,6 +1134,8 @@ def item(self, *args, **kwargs): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1], index=['a']) >>> s.index.item() 'a' diff --git a/third_party/bigframes_vendored/pandas/core/indexes/datetimes.py b/third_party/bigframes_vendored/pandas/core/indexes/datetimes.py index 973d5c763a..105a376728 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/datetimes.py @@ -15,6 +15,10 @@ def year(self) -> base.Index: **Examples:** + >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.year Index([2025], dtype='Int64') @@ -27,6 +31,10 @@ def month(self) -> base.Index: **Examples:** + >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.month Index([2], dtype='Int64') @@ -39,6 +47,10 @@ def day(self) -> base.Index: **Examples:** + >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> 
bpd.options.display.progress_bar = None + >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.day Index([15], dtype='Int64') @@ -51,6 +63,10 @@ def day_of_week(self) -> base.Index: **Examples:** + >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.day_of_week Index([5], dtype='Int64') @@ -63,6 +79,10 @@ def dayofweek(self) -> base.Index: **Examples:** + >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.dayofweek Index([5], dtype='Int64') @@ -75,6 +95,10 @@ def weekday(self) -> base.Index: **Examples:** + >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.weekday Index([5], dtype='Int64') diff --git a/third_party/bigframes_vendored/pandas/core/indexes/multi.py b/third_party/bigframes_vendored/pandas/core/indexes/multi.py index 018e638de3..a882aa40e3 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/multi.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/multi.py @@ -25,6 +25,8 @@ def from_tuples( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> tuples = [(1, 'red'), (1, 'blue'), ... 
(2, 'red'), (2, 'blue')] >>> bpd.MultiIndex.from_tuples(tuples, names=('number', 'color')) @@ -60,6 +62,8 @@ def from_arrays( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] >>> bpd.MultiIndex.from_arrays(arrays, names=('number', 'color')) MultiIndex([(1, 'red'), diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index 0f42433384..697c17f23c 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -34,6 +34,8 @@ def cut( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([0, 1, 5, 10]) >>> s 0 0 @@ -71,6 +73,7 @@ def cut( Cut with pd.IntervalIndex, requires importing pandas for IntervalIndex: + >>> import pandas as pd >>> interval_index = pd.IntervalIndex.from_tuples([(0, 1), (1, 5), (5, 20)]) >>> bpd.cut(s, bins=interval_index) 0 diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index b089c65d3b..932959a826 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -38,6 +38,9 @@ def dt(self): **Examples:** >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None + >>> seconds_series = bpd.Series(pd.date_range("2000-01-01", periods=3, freq="s")) >>> seconds_series 0 2000-01-01 00:00:00 @@ -107,6 +110,9 @@ def index(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can access the index of a Series via ``index`` property. 
>>> df = bpd.DataFrame({'Name': ['Alice', 'Bob', 'Aritra'], @@ -155,10 +161,13 @@ def shape(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 4, 9, 16]) >>> s.shape (4,) - >>> s = bpd.Series(['Alice', 'Bob', pd.NA]) + >>> s = bpd.Series(['Alice', 'Bob', bpd.NA]) >>> s.shape (3,) """ @@ -171,6 +180,9 @@ def dtype(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3]) >>> s.dtype Int64Dtype() @@ -188,6 +200,9 @@ def name(self) -> Hashable: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + For a Series: >>> s = bpd.Series([1, 2, 3], dtype="Int64", name='Numbers') @@ -233,6 +248,9 @@ def hasnans(self) -> bool: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3, None]) >>> s 0 1.0 @@ -254,6 +272,9 @@ def T(self) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) >>> s 0 Ant @@ -276,6 +297,9 @@ def transpose(self) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) >>> s 0 Ant @@ -313,6 +337,10 @@ def reset_index( **Examples:** + >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3, 4], name='foo', ... 
index=['a', 'b', 'c', 'd']) >>> s.index.name = "idx" @@ -412,6 +440,9 @@ def keys(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3], index=[0, 1, 2]) >>> s.keys() Index([0, 1, 2], dtype='Int64') @@ -491,6 +522,9 @@ def to_markdown( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(["elk", "pig", "dog", "quetzal"], name="animal") >>> print(s.to_markdown()) | | animal | @@ -543,14 +577,16 @@ def to_dict( **Examples:** + >>> import bigframes.pandas as bpd >>> from collections import OrderedDict, defaultdict + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3, 4]) >>> s.to_dict() {np.int64(0): 1, np.int64(1): 2, np.int64(2): 3, np.int64(3): 4} - >>> s.to_dict(into=OrderedDict) # doctest:+ELLIPSIS - OrderedDict(...) + >>> s.to_dict(into=OrderedDict) + OrderedDict({np.int64(0): 1, np.int64(1): 2, np.int64(2): 3, np.int64(3): 4}) >>> dd = defaultdict(list) >>> s.to_dict(into=dd) @@ -581,6 +617,9 @@ def to_frame(self, name=None) -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(["a", "b", "c"], ... 
name="vals") >>> s.to_frame() @@ -675,6 +714,9 @@ def tolist(self, *, allow_large_results: Optional[bool] = None) -> list: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3]) >>> s 0 1 @@ -706,6 +748,10 @@ def to_numpy( **Examples:** + >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None + >>> ser = bpd.Series(pd.Categorical(['a', 'b', 'a'])) >>> ser.to_numpy() array(['a', 'b', 'a'], dtype=object) @@ -757,6 +803,9 @@ def to_pickle(self, path, *, allow_large_results=None, **kwargs): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> original_df = bpd.DataFrame({"foo": range(5), "bar": range(5, 10)}) >>> original_df foo bar @@ -816,6 +865,9 @@ def agg(self, func): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3, 4]) >>> s 0 1 @@ -850,7 +902,10 @@ def count(self): **Examples:** - >>> s = bpd.Series([0.0, 1.0, pd.NA]) + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([0.0, 1.0, bpd.NA]) >>> s 0 0.0 1 1.0 @@ -873,6 +928,9 @@ def nunique(self) -> int: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 3, 5, 7, 7]) >>> s 0 1 @@ -905,6 +963,9 @@ def unique(self, keep_order=True) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([2, 1, 3, 3], name='A') >>> s 0 2 @@ -945,6 +1006,9 @@ def mode(self) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([2, 4, 8, 2, 4, None]) >>> s.mode() 0 2.0 @@ -967,9 +1031,11 @@ def drop_duplicates( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Generate a 
Series with duplicated entries. - >>> import bigframes.pandas as bpd >>> s = bpd.Series(['llama', 'cow', 'llama', 'beetle', 'llama', 'hippo'], ... name='animal') >>> s @@ -1035,6 +1101,7 @@ def duplicated(self, keep="first") -> Series: **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None By default, for each set of duplicated values, the first occurrence is set on False and all others on True: @@ -1105,6 +1172,9 @@ def idxmin(self) -> Hashable: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(data=[1, None, 4, 1], ... index=['A', 'B', 'C', 'D']) >>> s @@ -1131,6 +1201,9 @@ def idxmax(self) -> Hashable: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(data=[1, None, 4, 3, 4], ... index=['A', 'B', 'C', 'D', 'E']) >>> s @@ -1156,6 +1229,8 @@ def round(self, decimals: int = 0) -> Series: **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([0.1, 1.3, 2.7]) >>> s.round() 0 0.0 @@ -1187,6 +1262,9 @@ def explode(self, *, ignore_index: Optional[bool] = False) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([[1, 2, 3], [], [3, 4]]) >>> s 0 [1 2 3] @@ -1223,6 +1301,9 @@ def corr(self, other, method="pearson", min_periods=None) -> float: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s1 = bpd.Series([.2, .0, .6, .2]) >>> s2 = bpd.Series([.3, .6, .0, .1]) >>> s1.corr(s2) @@ -1259,6 +1340,8 @@ def autocorr(self, lag: int = 1) -> float: **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([0.25, 0.5, 0.2, -0.05]) >>> s.autocorr() # doctest: +ELLIPSIS np.float64(0.10355263309024067) @@ -1294,6 +1377,9 @@ def cov( **Examples:** + >>> import 
bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s1 = bpd.Series([0.90010907, 0.13484424, 0.62036035]) >>> s2 = bpd.Series([0.12528585, 0.26962463, 0.51111198]) >>> s1.cov(s2) @@ -1317,8 +1403,12 @@ def diff(self) -> Series: Calculates the difference of a Series element compared with another element in the Series (default is element in previous row). + **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Difference with previous row >>> s = bpd.Series([1, 1, 2, 3, 5, 8]) @@ -1382,6 +1472,9 @@ def dot(self, other) -> Series | np.ndarray: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([0, 1, 2, 3]) >>> other = bpd.Series([-1, 2, -3, 4]) >>> s.dot(other) @@ -1403,6 +1496,7 @@ def dot(self, other) -> Series | np.ndarray: Series and each rows of other if other is a DataFrame or a numpy.ndarray between the Series and each columns of the numpy array. + """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1435,6 +1529,10 @@ def sort_values( **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([np.nan, 1, 3, 10, 5]) >>> s 0 @@ -1530,6 +1628,10 @@ def sort_index( **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, 4]) >>> s.sort_index() 1 c @@ -1588,6 +1690,8 @@ def nlargest( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> countries_population = {"Italy": 59000000, "France": 65000000, ... "Malta": 434000, "Maldives": 434000, ... 
"Brunei": 434000, "Iceland": 337000, @@ -1672,6 +1776,8 @@ def nsmallest(self, n: int = 5, keep: str = "first") -> Series: **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> countries_population = {"Italy": 59000000, "France": 65000000, ... "Malta": 434000, "Maldives": 434000, ... "Brunei": 434000, "Iceland": 337000, @@ -1758,6 +1864,7 @@ def apply( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None For applying arbitrary python function a `remote_function` is recommended. Let's use ``reuse=False`` flag to make sure a new `remote_function` @@ -1765,13 +1872,9 @@ def apply( to potentially reuse a previously deployed `remote_function` from the same user defined function. - >>> def minutes_to_hours(x: int) -> float: + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") + ... def minutes_to_hours(x: int) -> float: ... return x/60 - >>> bpd.deploy_remote_function( # doctest: +SKIP - ... minutes_to_hours, - ... reuse=False, - ... cloud_function_service_account="default", - ... ) >>> minutes = bpd.Series([0, 30, 60, 90, 120]) >>> minutes @@ -1782,8 +1885,8 @@ def apply( 4 120 dtype: Int64 - >>> hours = minutes.apply(minutes_to_hours) # doctest: +SKIP - >>> hours # doctest: +SKIP + >>> hours = minutes.apply(minutes_to_hours) + >>> hours 0 0.0 1 0.5 2 1.0 @@ -1795,7 +1898,7 @@ def apply( a `remote_function`, you would provide the names of the packages via `packages` param. - >>> @bpd.remote_function( # doctest: +SKIP + >>> @bpd.remote_function( ... reuse=False, ... packages=["cryptography"], ... cloud_function_service_account="default" @@ -1812,11 +1915,11 @@ def apply( ... return f.encrypt(input.encode()).decode() >>> names = bpd.Series(["Alice", "Bob"]) - >>> hashes = names.apply(get_hash) # doctest: +SKIP + >>> hashes = names.apply(get_hash) You could return an array output from the remote function. 
- >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") # doctest: +SKIP + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") ... def text_analyzer(text: str) -> list[int]: ... words = text.count(" ") + 1 ... periods = text.count(".") @@ -1829,8 +1932,8 @@ def apply( ... "I love this product! It's amazing.", ... "Hungry? Wanna eat? Lets go!" ... ]) - >>> features = texts.apply(text_analyzer) # doctest: +SKIP - >>> features # doctest: +SKIP + >>> features = texts.apply(text_analyzer) + >>> features 0 [9 1 0 0] 1 [6 1 1 0] 2 [5 0 1 2] @@ -1903,6 +2006,8 @@ def combine( **Examples:** >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None Consider 2 Datasets ``s1`` and ``s2`` containing highest clocked speeds of different birds. @@ -1960,6 +2065,9 @@ def groupby( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can group by a named index level. >>> s = bpd.Series([380, 370., 24., 26.], @@ -1981,6 +2089,7 @@ def groupby( You can also group by more than one index levels. + >>> import pandas as pd >>> s = bpd.Series([380, 370., 24., 26.], ... index=pd.MultiIndex.from_tuples( ... [("Falcon", "Clear"), @@ -2129,6 +2238,9 @@ def drop( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(data=np.arange(3), index=['A', 'B', 'C']) >>> s A 0 @@ -2144,6 +2256,7 @@ def drop( Drop 2nd level label in MultiIndex Series: + >>> import pandas as pd >>> midx = pd.MultiIndex(levels=[['llama', 'cow', 'falcon'], ... ['speed', 'weight', 'length']], ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], @@ -2209,6 +2322,7 @@ def reorder_levels(self, order: Sequence, axis) -> Series: axis ({0 or 'index', 1 or 'columns'}, default 0): For `Series` this parameter is unused and defaults to 0. 
+ Returns: type of caller (new object) """ @@ -2255,6 +2369,10 @@ def interpolate(self, method: str = "linear"): **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + Filling in NaN in a Series via linear interpolation. >>> s = bpd.Series([0, 1, np.nan, 3]) @@ -2296,6 +2414,10 @@ def fillna( **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([np.nan, 2, np.nan, -1]) >>> s 0 @@ -2348,6 +2470,8 @@ def replace( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3, 4, 5]) >>> s 0 1 @@ -2472,6 +2596,10 @@ def dropna(self, *, axis=0, inplace: bool = False, how=None) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + Drop NA values from a Series: >>> ser = bpd.Series([1., 2., np.nan]) @@ -2488,7 +2616,7 @@ def dropna(self, *, axis=0, inplace: bool = False, how=None) -> Series: Empty strings are not considered NA values. ``None`` is considered an NA value. 
- >>> ser = bpd.Series(['2', pd.NA, '', None, 'I stay'], dtype='object') + >>> ser = bpd.Series(['2', bpd.NA, '', None, 'I stay'], dtype='object') >>> ser 0 2 1 @@ -2532,6 +2660,10 @@ def between( **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + Boundary values are included by default: >>> s = bpd.Series([2, 0, 4, 8, np.nan]) @@ -2587,6 +2719,10 @@ def case_when( **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> c = bpd.Series([6, 7, 8, 9], name="c") >>> a = bpd.Series([0, 0, 1, 2]) >>> b = bpd.Series([0, 3, 4, 5]) @@ -2653,6 +2789,9 @@ def cumprod(self): **Examples:** >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s 0 2.0 @@ -2687,6 +2826,10 @@ def cumsum(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s 0 2.0 @@ -2726,6 +2869,10 @@ def cummax(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s 0 2.0 @@ -2745,6 +2892,7 @@ def cummax(self): 4 5.0 dtype: Float64 + Returns: bigframes.pandas.Series: Return cumulative maximum of scalar or Series. 
@@ -2760,6 +2908,10 @@ def cummin(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s 0 2.0 @@ -2793,6 +2945,10 @@ def eq(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -2834,6 +2990,10 @@ def ne(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -2877,6 +3037,10 @@ def le(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -2919,6 +3083,10 @@ def lt(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -2962,6 +3130,10 @@ def ge(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3005,6 +3177,10 @@ def gt(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3047,7 +3223,10 @@ def add(self, other) -> Series: **Examples:** - >>> a = bpd.Series([1, 2, 3, pd.NA]) + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> a = bpd.Series([1, 2, 3, bpd.NA]) >>> a 0 1 1 
2 @@ -3108,6 +3287,9 @@ def __add__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1.5, 2.6], index=['elk', 'moose']) >>> s elk 1.5 @@ -3157,6 +3339,10 @@ def radd(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3218,6 +3404,10 @@ def sub( **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3259,6 +3449,9 @@ def __sub__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1.5, 2.6], index=['elk', 'moose']) >>> s elk 1.5 @@ -3308,6 +3501,10 @@ def rsub(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3366,6 +3563,10 @@ def mul(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3408,6 +3609,9 @@ def __mul__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can multiply with a scalar: >>> s = bpd.Series([1, 2, 3]) @@ -3445,6 +3649,10 @@ def rmul(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3502,6 +3710,10 @@ def truediv(self, other) -> Series: **Examples:** + >>> import 
bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3544,6 +3756,9 @@ def __truediv__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can multiply with a scalar: >>> s = bpd.Series([1, 2, 3]) @@ -3581,6 +3796,10 @@ def rtruediv(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3639,6 +3858,10 @@ def floordiv(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3681,6 +3904,9 @@ def __floordiv__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can divide by a scalar: >>> s = bpd.Series([15, 30, 45]) @@ -3718,6 +3944,10 @@ def rfloordiv(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3776,6 +4006,10 @@ def mod(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3818,6 +4052,9 @@ def __mod__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can modulo with a scalar: >>> s = bpd.Series([1, 2, 3]) @@ -3854,6 +4091,10 @@ def rmod(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> 
bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3914,6 +4155,9 @@ def pow(self, other) -> Series: **Examples:** >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3957,6 +4201,9 @@ def __pow__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can exponentiate with a scalar: >>> s = bpd.Series([1, 2, 3]) @@ -3995,6 +4242,9 @@ def rpow(self, other) -> Series: **Examples:** >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -4054,6 +4304,10 @@ def divmod(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -4102,6 +4356,10 @@ def rdivmod(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -4153,6 +4411,10 @@ def combine_first(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> s1 = bpd.Series([1, np.nan]) >>> s2 = bpd.Series([3, 4, 5]) >>> s1.combine_first(s2) @@ -4191,6 +4453,11 @@ def update(self, other) -> None: **Examples:** + >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3]) >>> s.update(bpd.Series([4, 5, 6])) >>> s @@ -4280,6 +4547,10 @@ def any( **Examples:** + >>> import bigframes.pandas as 
bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + For Series input, the output is a scalar indicating whether any element is True. >>> bpd.Series([False, False]).any() @@ -4312,6 +4583,9 @@ def max( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Calculating the max of a Series: >>> s = bpd.Series([1, 3]) @@ -4325,7 +4599,7 @@ def max( Calculating the max of a Series containing ``NA`` values: - >>> s = bpd.Series([1, 3, pd.NA]) + >>> s = bpd.Series([1, 3, bpd.NA]) >>> s 0 1 1 3 @@ -4351,6 +4625,9 @@ def min( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Calculating the min of a Series: >>> s = bpd.Series([1, 3]) @@ -4364,7 +4641,7 @@ def min( Calculating the min of a Series containing ``NA`` values: - >>> s = bpd.Series([1, 3, pd.NA]) + >>> s = bpd.Series([1, 3, bpd.NA]) >>> s 0 1 1 3 @@ -4389,6 +4666,9 @@ def std( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'person_id': [0, 1, 2, 3], ... 'age': [21, 25, 62, 43], ... 
'height': [1.61, 1.87, 1.49, 2.01]} @@ -4434,6 +4714,9 @@ def sum(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Calculating the sum of a Series: >>> s = bpd.Series([1, 3]) @@ -4447,7 +4730,7 @@ def sum(self): Calculating the sum of a Series containing ``NA`` values: - >>> s = bpd.Series([1, 3, pd.NA]) + >>> s = bpd.Series([1, 3, bpd.NA]) >>> s 0 1 1 3 @@ -4467,6 +4750,9 @@ def mean(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Calculating the mean of a Series: >>> s = bpd.Series([1, 3]) @@ -4480,7 +4766,7 @@ def mean(self): Calculating the mean of a Series containing ``NA`` values: - >>> s = bpd.Series([1, 3, pd.NA]) + >>> s = bpd.Series([1, 3, bpd.NA]) >>> s 0 1 1 3 @@ -4501,6 +4787,8 @@ def median(self, *, exact: bool = True): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3]) >>> s.median() np.float64(2.0) @@ -4540,6 +4828,8 @@ def quantile( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3, 4]) >>> s.quantile(.5) np.float64(2.5) @@ -4590,6 +4880,9 @@ def describe(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['A', 'A', 'B']) >>> s 0 A @@ -4615,6 +4908,9 @@ def skew(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3]) >>> s.skew() np.float64(0.0) @@ -4650,6 +4946,9 @@ def kurt(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 2, 3], index=['cat', 'dog', 'dog', 'mouse']) >>> s cat 1 @@ -4690,6 +4989,9 @@ def item(self: Series, *args, **kwargs): **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar 
= None >>> s = bpd.Series([1]) >>> s.item() np.int64(1) @@ -4711,6 +5013,9 @@ def items(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['A', 'B', 'C']) >>> for index, value in s.items(): ... print(f"Index : {index}, Value : {value}") @@ -4730,6 +5035,9 @@ def where(self, cond, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([10, 11, 12, 13, 14]) >>> s 0 10 @@ -4795,6 +5103,9 @@ def mask(self, cond, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([10, 11, 12, 13, 14]) >>> s 0 10 @@ -4838,7 +5149,7 @@ def mask(self, cond, other): condition is evaluated based on a complicated business logic which cannot be expressed in form of a Series. - >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") # doctest: +SKIP + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") ... def should_mask(name: str) -> bool: ... hash = 0 ... for char_ in name: @@ -4851,12 +5162,12 @@ def mask(self, cond, other): 1 Bob 2 Caroline dtype: string - >>> s.mask(should_mask) # doctest: +SKIP + >>> s.mask(should_mask) 0 1 Bob 2 Caroline dtype: string - >>> s.mask(should_mask, "REDACTED") # doctest: +SKIP + >>> s.mask(should_mask, "REDACTED") 0 REDACTED 1 Bob 2 Caroline @@ -4950,6 +5261,9 @@ def argmax(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Consider dataset containing cereal calories. >>> s = bpd.Series({'Corn Flakes': 100.0, 'Almond Delight': 110.0, @@ -4985,6 +5299,9 @@ def argmin(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Consider dataset containing cereal calories. 
>>> s = bpd.Series({'Corn Flakes': 100.0, 'Almond Delight': 110.0, @@ -5023,6 +5340,9 @@ def rename(self, index, *, inplace, **kwargs): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3]) >>> s 0 1 @@ -5072,6 +5392,9 @@ def rename_axis(self, mapper, *, inplace, **kwargs): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Series >>> s = bpd.Series(["dog", "cat", "monkey"]) @@ -5134,7 +5457,10 @@ def value_counts( **Examples:** - >>> s = bpd.Series([3, 1, 2, 3, 4, pd.NA], dtype="Int64") + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([3, 1, 2, 3, 4, bpd.NA], dtype="Int64") >>> s 0 3 @@ -5210,6 +5536,8 @@ def str(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(["A_Str_Series"]) >>> s 0 A_Str_Series @@ -5237,6 +5565,8 @@ def plot(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> ser = bpd.Series([1, 2, 3, 3]) >>> plot = ser.plot(kind='hist', title="My plot") >>> plot @@ -5262,6 +5592,9 @@ def isin(self, values): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['llama', 'cow', 'llama', 'beetle', 'llama', ... 
'hippo'], name='animal') >>> s @@ -5325,6 +5658,9 @@ def is_monotonic_increasing(self) -> bool: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 2]) >>> s.is_monotonic_increasing np.True_ @@ -5346,6 +5682,9 @@ def is_monotonic_decreasing(self) -> bool: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([3, 2, 2, 1]) >>> s.is_monotonic_decreasing np.True_ @@ -5386,7 +5725,9 @@ def map( **Examples:** >>> import bigframes.pandas as bpd - >>> s = bpd.Series(['cat', 'dog', pd.NA, 'rabbit']) + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['cat', 'dog', bpd.NA, 'rabbit']) >>> s 0 cat 1 dog @@ -5406,7 +5747,7 @@ def map( It also accepts a remote function: - >>> @bpd.remote_function(cloud_function_service_account="default") # doctest: +SKIP + >>> @bpd.remote_function(cloud_function_service_account="default") ... def my_mapper(val: str) -> str: ... vowels = ["a", "e", "i", "o", "u"] ... if val: @@ -5415,7 +5756,7 @@ def map( ... ]) ... return "N/A" - >>> s.map(my_mapper) # doctest: +SKIP + >>> s.map(my_mapper) 0 cAt 1 dOg 2 N/A @@ -5449,6 +5790,9 @@ def iloc(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4}, ... {'a': 100, 'b': 200, 'c': 300, 'd': 400}, ... {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000}] @@ -5526,6 +5870,9 @@ def loc(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame([[1, 2], [4, 5], [7, 8]], ... index=['cobra', 'viper', 'sidewinder'], ... columns=['max_speed', 'shield']) @@ -5610,6 +5957,9 @@ def iat(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], ... 
columns=['A', 'B', 'C']) >>> df @@ -5642,6 +5992,9 @@ def at(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], ... index=[4, 5, 6], columns=['A', 'B', 'C']) >>> df @@ -5675,6 +6028,9 @@ def values(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> bpd.Series([1, 2, 3]).values array([1, 2, 3]) @@ -5694,6 +6050,9 @@ def size(self) -> int: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + For Series: >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) @@ -5728,6 +6087,10 @@ def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> import numpy as np + >>> ser = bpd.Series([1, 2, 3]) >>> np.asarray(ser) @@ -5752,6 +6115,9 @@ def __len__(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3]) >>> len(s) 3 @@ -5765,6 +6131,9 @@ def __invert__(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> ser = bpd.Series([True, False, True]) >>> ~ser 0 False @@ -5783,6 +6152,9 @@ def __and__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([0, 1, 2, 3]) You can operate with a scalar. @@ -5819,6 +6191,9 @@ def __or__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([0, 1, 2, 3]) You can operate with a scalar. @@ -5855,6 +6230,9 @@ def __xor__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([0, 1, 2, 3]) You can operate with a scalar. 
@@ -5891,6 +6269,9 @@ def __getitem__(self, indexer): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([15, 30, 45]) >>> s[1] np.int64(30) diff --git a/third_party/bigframes_vendored/pandas/core/strings/accessor.py b/third_party/bigframes_vendored/pandas/core/strings/accessor.py index 7a37eba341..fe94bf3049 100644 --- a/third_party/bigframes_vendored/pandas/core/strings/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/strings/accessor.py @@ -20,6 +20,8 @@ def __getitem__(self, key: typing.Union[int, slice]): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['Alice', 'Bob', 'Charlie']) >>> s.str[0] 0 A @@ -51,10 +53,12 @@ def extract(self, pat: str, flags: int = 0): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + A pattern with two groups will return a DataFrame with two columns. Non-matches will be `NaN`. - >>> import bigframes.pandas as bpd >>> s = bpd.Series(['a1', 'b2', 'c3']) >>> s.str.extract(r'([ab])(\\d)') 0 1 @@ -111,6 +115,8 @@ def find(self, sub, start: int = 0, end=None): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> ser = bpd.Series(["cow_", "duck_", "do_ve"]) >>> ser.str.find("_") 0 3 @@ -139,10 +145,12 @@ def len(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Returns the length (number of characters) in a string. - >>> import bigframes.pandas as bpd - >>> s = bpd.Series(['dog', '', pd.NA]) + >>> s = bpd.Series(['dog', '', bpd.NA]) >>> s.str.len() 0 3 1 0 @@ -164,6 +172,8 @@ def lower(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['lower', ... 'CAPITALS', ... 
'this is a sentence', @@ -187,6 +197,8 @@ def slice(self, start=None, stop=None): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(["koala", "dog", "chameleon"]) >>> s 0 koala @@ -238,11 +250,13 @@ def strip(self, to_strip: typing.Optional[str] = None): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([ ... '1. Ant.', ... ' 2. Bee? ', ... '\\t3. Cat!\\n', - ... pd.NA, + ... bpd.NA, ... ]) >>> s.str.strip() 0 1. Ant. @@ -279,6 +293,8 @@ def upper(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['lower', ... 'CAPITALS', ... 'this is a sentence', @@ -306,6 +322,8 @@ def isnumeric(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s1 = bpd.Series(['one', 'one1', '1', '']) >>> s1.str.isnumeric() 0 False @@ -331,6 +349,8 @@ def isalpha(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s1 = bpd.Series(['one', 'one1', '1', '']) >>> s1.str.isalpha() 0 True @@ -355,6 +375,8 @@ def isdigit(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['23', '1a', '1/5', '']) >>> s.str.isdigit() 0 True @@ -379,6 +401,8 @@ def isalnum(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s1 = bpd.Series(['one', 'one1', '1', '']) >>> s1.str.isalnum() 0 True @@ -415,6 +439,8 @@ def isspace(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([' ', '\\t\\r\\n ', '']) >>> s.str.isspace() 0 True @@ -439,6 +465,8 @@ def islower(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['leopard', 'Golden Eagle', 'SNAKE', 
'']) >>> s.str.islower() 0 True @@ -464,6 +492,8 @@ def isupper(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) >>> s.str.isupper() 0 False @@ -488,10 +518,12 @@ def isdecimal(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + The `isdecimal` method checks for characters used to form numbers in base 10. - >>> import bigframes.pandas as bpd >>> s = bpd.Series(['23', '³', '⅕', '']) >>> s.str.isdecimal() 0 True @@ -518,7 +550,9 @@ def rstrip(self, to_strip: typing.Optional[str] = None): **Examples:** >>> import bigframes.pandas as bpd - >>> s = bpd.Series(['Ant', ' Bee ', '\tCat\n', pd.NA]) + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['Ant', ' Bee ', '\tCat\n', bpd.NA]) >>> s.str.rstrip() 0 Ant 1 Bee @@ -549,7 +583,9 @@ def lstrip(self, to_strip: typing.Optional[str] = None): **Examples:** >>> import bigframes.pandas as bpd - >>> s = bpd.Series(['Ant', ' Bee ', '\tCat\n', pd.NA]) + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['Ant', ' Bee ', '\tCat\n', bpd.NA]) >>> s.str.lstrip() 0 Ant 1 Bee @@ -575,6 +611,8 @@ def repeat(self, repeats: int): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['a', 'b', 'c']) >>> s 0 a @@ -607,6 +645,8 @@ def capitalize(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['lower', ... 'CAPITALS', ... 'this is a sentence', @@ -632,9 +672,11 @@ def cat(self, others, *, join): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can concatenate each string in a Series to another string. 
- >>> import bigframes.pandas as bpd >>> s = bpd.Series(['Jane', 'John']) >>> s.str.cat(" Doe") 0 Jane Doe @@ -687,9 +729,11 @@ def contains(self, pat, case: bool = True, flags: int = 0, *, regex: bool = True **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Returning a Series of booleans using only a literal pattern. - >>> import bigframes.pandas as bpd >>> s1 = bpd.Series(['Mouse', 'dog', 'house and parrot', '23', None]) >>> s1.str.contains('og') 0 False @@ -789,12 +833,14 @@ def replace( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + When *pat* is a string and *regex* is True, the given *pat* is compiled as a regex. When *repl* is a string, it replaces matching regex patterns as with `re.sub()`. NaN value(s) in the Series are left as is: - >>> import bigframes.pandas as bpd - >>> s = bpd.Series(['foo', 'fuz', pd.NA]) + >>> s = bpd.Series(['foo', 'fuz', bpd.NA]) >>> s.str.replace('f.', 'ba', regex=True) 0 bao 1 baz @@ -804,7 +850,7 @@ def replace( When *pat* is a string and *regex* is False, every *pat* is replaced with *repl* as with `str.replace()`: - >>> s = bpd.Series(['f.o', 'fuz', pd.NA]) + >>> s = bpd.Series(['f.o', 'fuz', bpd.NA]) >>> s.str.replace('f.', 'ba', regex=False) 0 bao 1 fuz @@ -850,7 +896,9 @@ def startswith( **Examples:** >>> import bigframes.pandas as bpd - >>> s = bpd.Series(['bat', 'Bear', 'caT', pd.NA]) + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['bat', 'Bear', 'caT', bpd.NA]) >>> s 0 bat 1 Bear @@ -893,7 +941,9 @@ def endswith( **Examples:** >>> import bigframes.pandas as bpd - >>> s = bpd.Series(['bat', 'bear', 'caT', pd.NA]) + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['bat', 'bear', 'caT', bpd.NA]) >>> s 0 bat 1 bear @@ -937,6 +987,9 @@ def split( **Examples:** >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( 
... [ ... "a regular sentence", @@ -978,6 +1031,8 @@ def match(self, pat: str, case: bool = True, flags: int = 0): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> ser = bpd.Series(["horse", "eagle", "donkey"]) >>> ser.str.match("e") 0 False @@ -1005,6 +1060,8 @@ def fullmatch(self, pat: str, case: bool = True, flags: int = 0): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> ser = bpd.Series(["cat", "duck", "dove"]) >>> ser.str.fullmatch(r'd.+') 0 False @@ -1035,6 +1092,8 @@ def get(self, i: int): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(["apple", "banana", "fig"]) >>> s.str.get(3) 0 l @@ -1063,6 +1122,8 @@ def pad( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(["caribou", "tiger"]) >>> s 0 caribou @@ -1109,6 +1170,8 @@ def ljust( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> ser = bpd.Series(['dog', 'bird', 'mouse']) >>> ser.str.ljust(8, fillchar='.') 0 dog..... @@ -1139,6 +1202,8 @@ def rjust( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> ser = bpd.Series(['dog', 'bird', 'mouse']) >>> ser.str.rjust(8, fillchar='.') 0 .....dog @@ -1173,7 +1238,9 @@ def zfill( **Examples:** >>> import bigframes.pandas as bpd - >>> s = bpd.Series(['-1', '1', '1000', pd.NA]) + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['-1', '1', '1000', bpd.NA]) >>> s 0 -1 1 1 @@ -1211,6 +1278,8 @@ def center( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> ser = bpd.Series(['dog', 'bird', 'mouse']) >>> ser.str.center(8, fillchar='.') 0 ..dog... 
@@ -1240,9 +1309,12 @@ def join(self, sep: str): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> import pandas as pd + Example with a list that contains non-string elements. - >>> import bigframes.pandas as bpd >>> s = bpd.Series([['lion', 'elephant', 'zebra'], ... ['dragon'], ... ['duck', 'swan', 'fish', 'guppy']]) diff --git a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py index 105277dbf0..9c17b9632e 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py @@ -21,7 +21,6 @@ def to_datetime( utc=False, format=None, unit=None, - session=None, ) -> Union[pd.Timestamp, datetime, series.Series]: """ This function converts a scalar, array-like or Series to a datetime object. @@ -38,9 +37,11 @@ def to_datetime( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Converting a Scalar to datetime: - >>> import bigframes.pandas as bpd >>> scalar = 123456.789 >>> bpd.to_datetime(scalar, unit = 's') Timestamp('1970-01-02 10:17:36.789000') diff --git a/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py b/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py index 220b15f56e..9442e965fa 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py +++ b/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py @@ -55,6 +55,7 @@ def to_timedelta( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None Converting a Scalar to timedelta diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 3190c92b92..0fdca4dde1 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -61,6 +61,7 @@ def read_gbq( **Examples:** >>> 
import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None If the input is a table ID: diff --git a/third_party/bigframes_vendored/pandas/io/parquet.py b/third_party/bigframes_vendored/pandas/io/parquet.py index 7d5c108f93..aec911d2fe 100644 --- a/third_party/bigframes_vendored/pandas/io/parquet.py +++ b/third_party/bigframes_vendored/pandas/io/parquet.py @@ -27,6 +27,8 @@ def read_parquet( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> gcs_path = "gs://cloud-samples-data/bigquery/us-states/us-states.parquet" >>> df = bpd.read_parquet(path=gcs_path, engine="bigquery") diff --git a/third_party/bigframes_vendored/pandas/io/parsers/readers.py b/third_party/bigframes_vendored/pandas/io/parsers/readers.py index 9dc7b39873..4757f5ed9d 100644 --- a/third_party/bigframes_vendored/pandas/io/parsers/readers.py +++ b/third_party/bigframes_vendored/pandas/io/parsers/readers.py @@ -71,6 +71,8 @@ def read_csv( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> gcs_path = "gs://cloud-samples-data/bigquery/us-states/us-states.csv" >>> df = bpd.read_csv(filepath_or_buffer=gcs_path) >>> df.head(2) @@ -190,6 +192,8 @@ def read_json( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> gcs_path = "gs://bigframes-dev-testing/sample1.json" >>> df = bpd.read_json(path_or_buf=gcs_path, lines=True, orient="records") >>> df.head(2) diff --git a/third_party/bigframes_vendored/pandas/io/pickle.py b/third_party/bigframes_vendored/pandas/io/pickle.py index 2950cf422a..33088dc019 100644 --- a/third_party/bigframes_vendored/pandas/io/pickle.py +++ b/third_party/bigframes_vendored/pandas/io/pickle.py @@ -35,6 +35,8 @@ def read_pickle( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> gcs_path = "gs://bigframes-dev-testing/test_pickle.pkl" >>> df = 
bpd.read_pickle(filepath_or_buffer=gcs_path) diff --git a/third_party/bigframes_vendored/pandas/pandas/_typing.py b/third_party/bigframes_vendored/pandas/pandas/_typing.py index 76e984a173..e665339fc8 100644 --- a/third_party/bigframes_vendored/pandas/pandas/_typing.py +++ b/third_party/bigframes_vendored/pandas/pandas/_typing.py @@ -100,6 +100,7 @@ Scalar = Union[PythonScalar, PandasScalar, np.datetime64, np.timedelta64, datetime] IntStrT = TypeVar("IntStrT", int, str) + # timestamp and timedelta convertible types TimestampConvertibleTypes = Union[ @@ -266,6 +267,7 @@ def closed(self) -> bool: # for arbitrary kwargs passed during reading/writing files StorageOptions = Optional[Dict[str, Any]] + # compression keywords and compression CompressionDict = Dict[str, Any] CompressionOptions = Optional[ diff --git a/third_party/bigframes_vendored/pandas/plotting/_core.py b/third_party/bigframes_vendored/pandas/plotting/_core.py index a7cd2c0cc9..4ed5c8eb0b 100644 --- a/third_party/bigframes_vendored/pandas/plotting/_core.py +++ b/third_party/bigframes_vendored/pandas/plotting/_core.py @@ -8,11 +8,10 @@ class PlotAccessor: Make plots of Series or DataFrame with the `matplotlib` backend. **Examples:** - - >>> import bigframes.pandas as bpd - For Series: + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> ser = bpd.Series([1, 2, 3, 3]) >>> plot = ser.plot(kind='hist', title="My plot") @@ -58,6 +57,9 @@ def hist( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> import numpy as np + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame(np.random.randint(1, 7, 6000), columns=['one']) >>> df['two'] = np.random.randint(1, 7, 6000) + np.random.randint(1, 7, 6000) >>> ax = df.plot.hist(bins=12, alpha=0.5) @@ -94,6 +96,7 @@ def line( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame( ... { ... 
'one': [1, 2, 3, 4], @@ -161,6 +164,7 @@ def area( Draw an area plot based on basic business metrics: >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame( ... { ... 'sales': [3, 2, 3, 9, 10, 6], @@ -229,6 +233,7 @@ def bar( Basic plot. >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'lab':['A', 'B', 'C'], 'val':[10, 30, 20]}) >>> ax = df.plot.bar(x='lab', y='val', rot=0) @@ -291,6 +296,7 @@ def scatter( in a DataFrame's columns. >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame([[5.1, 3.5, 0], [4.9, 3.0, 0], [7.0, 3.2, 1], ... [6.4, 3.2, 1], [5.9, 3.0, 2]], ... columns=['length', 'width', 'species']) diff --git a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py index 44eefeddd7..a7344d49d4 100644 --- a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py +++ b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py @@ -30,6 +30,7 @@ class KMeans(_BaseKMeans): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> from bigframes.ml.cluster import KMeans >>> X = bpd.DataFrame({"feat0": [1, 1, 1, 10, 10, 10], "feat1": [2, 4, 0, 2, 4, 0]}) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index e487a2e7c1..c3c3a77b71 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -24,6 +24,7 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): >>> import bigframes.pandas as bpd >>> from bigframes.ml.decomposition import MatrixFactorization + >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({ ... "row": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6], ... 
"column": [0,1] * 7, diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py index 3535edc8f9..f13c52bfb6 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py @@ -24,6 +24,7 @@ class PCA(BaseEstimator, metaclass=ABCMeta): >>> import bigframes.pandas as bpd >>> from bigframes.ml.decomposition import PCA + >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({"feat0": [-1, -2, -3, 1, 2, 3], "feat1": [-1, -1, -2, 1, 1, 2]}) >>> pca = PCA(n_components=2).fit(X) >>> pca.predict(X) # doctest:+SKIP diff --git a/third_party/bigframes_vendored/sklearn/impute/_base.py b/third_party/bigframes_vendored/sklearn/impute/_base.py index 175ad86b21..42eab24c82 100644 --- a/third_party/bigframes_vendored/sklearn/impute/_base.py +++ b/third_party/bigframes_vendored/sklearn/impute/_base.py @@ -22,6 +22,7 @@ class SimpleImputer(_BaseImputer): >>> import bigframes.pandas as bpd >>> from bigframes.ml.impute import SimpleImputer + >>> bpd.options.display.progress_bar = None >>> X_train = bpd.DataFrame({"feat0": [7.0, 4.0, 10.0], "feat1": [2.0, None, 5.0], "feat2": [3.0, 6.0, 9.0]}) >>> imp_mean = SimpleImputer().fit(X_train) >>> X_test = bpd.DataFrame({"feat0": [None, 4.0, 10.0], "feat1": [2.0, None, None], "feat2": [3.0, 6.0, 9.0]}) diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_base.py b/third_party/bigframes_vendored/sklearn/linear_model/_base.py index 7543edd10b..21ba5a3bf8 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_base.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_base.py @@ -66,6 +66,7 @@ class LinearRegression(RegressorMixin, LinearModel): >>> from bigframes.ml.linear_model import LinearRegression >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({ \ "feature0": [20, 21, 19, 18], \ "feature1": 
[0, 1, 1, 0], \ diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py index d449a1040c..a85c6fae8d 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py @@ -25,6 +25,7 @@ class LogisticRegression(LinearClassifierMixin, BaseEstimator): >>> from bigframes.ml.linear_model import LogisticRegression >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({ \ "feature0": [20, 21, 19, 18], \ "feature1": [0, 1, 1, 0], \ diff --git a/third_party/bigframes_vendored/sklearn/metrics/_classification.py b/third_party/bigframes_vendored/sklearn/metrics/_classification.py index e60cc8cec4..fd6e8678ea 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_classification.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_classification.py @@ -30,6 +30,7 @@ def accuracy_score(y_true, y_pred, normalize=True) -> float: >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([0, 2, 1, 3]) >>> y_pred = bpd.DataFrame([0, 1, 2, 3]) @@ -79,6 +80,7 @@ def confusion_matrix( >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([2, 0, 2, 2, 0, 1]) >>> y_pred = bpd.DataFrame([0, 0, 2, 2, 0, 2]) @@ -130,6 +132,7 @@ def recall_score( >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([0, 1, 2, 0, 1, 2]) >>> y_pred = bpd.DataFrame([0, 2, 1, 0, 0, 1]) @@ -178,6 +181,7 @@ def precision_score( >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([0, 1, 2, 0, 1, 2]) >>> y_pred = bpd.DataFrame([0, 2, 1, 0, 0, 1]) @@ -228,6 
+232,7 @@ def f1_score( >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([0, 1, 2, 0, 1, 2]) >>> y_pred = bpd.DataFrame([0, 2, 1, 0, 0, 1]) diff --git a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py index cd5bd2cbcd..9262ffbd3d 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py @@ -33,6 +33,7 @@ def auc(x, y) -> float: >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None >>> x = bpd.DataFrame([1, 1, 2, 2]) >>> y = bpd.DataFrame([2, 3, 4, 5]) @@ -88,6 +89,7 @@ def roc_auc_score(y_true, y_score) -> float: >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([0, 0, 1, 1, 0, 1, 0, 1, 1, 1]) >>> y_score = bpd.DataFrame([0.1, 0.4, 0.35, 0.8, 0.65, 0.9, 0.5, 0.3, 0.6, 0.45]) @@ -137,6 +139,7 @@ def roc_curve( >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([1, 1, 2, 2]) >>> y_score = bpd.DataFrame([0.1, 0.4, 0.35, 0.8]) diff --git a/third_party/bigframes_vendored/sklearn/metrics/_regression.py b/third_party/bigframes_vendored/sklearn/metrics/_regression.py index 85f0c1ecf9..1c14e8068b 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_regression.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_regression.py @@ -46,6 +46,7 @@ def r2_score(y_true, y_pred, force_finite=True) -> float: >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([3, -0.5, 2, 7]) >>> y_pred = bpd.DataFrame([2.5, 0.0, 2, 8]) @@ -72,6 +73,7 @@ def mean_squared_error(y_true, y_pred) -> float: >>> import bigframes.pandas 
as bpd >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([3, -0.5, 2, 7]) >>> y_pred = bpd.DataFrame([2.5, 0.0, 2, 8]) @@ -98,6 +100,7 @@ def mean_absolute_error(y_true, y_pred) -> float: >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([3, -0.5, 2, 7]) >>> y_pred = bpd.DataFrame([2.5, 0.0, 2, 8]) diff --git a/third_party/bigframes_vendored/sklearn/model_selection/_split.py b/third_party/bigframes_vendored/sklearn/model_selection/_split.py index 326589be7d..ec16fa8cf9 100644 --- a/third_party/bigframes_vendored/sklearn/model_selection/_split.py +++ b/third_party/bigframes_vendored/sklearn/model_selection/_split.py @@ -69,6 +69,7 @@ class KFold(_BaseKFold): >>> import bigframes.pandas as bpd >>> from bigframes.ml.model_selection import KFold + >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({"feat0": [1, 3, 5], "feat1": [2, 4, 6]}) >>> y = bpd.DataFrame({"label": [1, 2, 3]}) >>> kf = KFold(n_splits=3, random_state=42) @@ -161,6 +162,7 @@ def train_test_split( >>> import bigframes.pandas as bpd >>> from bigframes.ml.model_selection import train_test_split + >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({"feat0": [0, 2, 4, 6, 8], "feat1": [1, 3, 5, 7, 9]}) >>> y = bpd.DataFrame({"label": [0, 1, 2, 3, 4]}) >>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) diff --git a/third_party/bigframes_vendored/sklearn/model_selection/_validation.py b/third_party/bigframes_vendored/sklearn/model_selection/_validation.py index 6f84018853..b93c47ea04 100644 --- a/third_party/bigframes_vendored/sklearn/model_selection/_validation.py +++ b/third_party/bigframes_vendored/sklearn/model_selection/_validation.py @@ -19,6 +19,7 @@ def cross_validate(estimator, X, y=None, *, cv=None): >>> import bigframes.pandas as bpd >>> from bigframes.ml.model_selection import 
cross_validate, KFold >>> from bigframes.ml.linear_model import LinearRegression + >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({"feat0": [1, 3, 5], "feat1": [2, 4, 6]}) >>> y = bpd.DataFrame({"label": [1, 2, 3]}) >>> model = LinearRegression() diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py index 64a5786f17..5476a9fb3c 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py @@ -25,6 +25,7 @@ class OneHotEncoder(BaseEstimator): >>> from bigframes.ml.preprocessing import OneHotEncoder >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> enc = OneHotEncoder() >>> X = bpd.DataFrame({"a": ["Male", "Female", "Female"], "b": ["1", "3", "2"]}) From a97cc937ce8802778d4db4cc305ce516b60017e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 20:31:17 +0000 Subject: [PATCH 17/63] revert doctest changes --- dummy.pkl | Bin 1150 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 dummy.pkl diff --git a/dummy.pkl b/dummy.pkl deleted file mode 100644 index 76a409b1ded309cfc7b30cccd49d85a710e737bd..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1150 zcmbVMU2D`p6is$F+m9_uQPyHX!3T{XfkORwP()jV8eBodw=zt!lbx{nSTYmpf`UG@ zwlHsd?cdcqli4LgMKBM!H_4ql=bU>c-@Koq=a@@v&uB5GB8bb11xZD725RGwO8Um+ z3wZb)zJjlMB%f5E?zGF(Lb9r$nFw-P&UF%7emK7)(%AMgSEu&dnXFdB{C z{=&=LLPtUrdZ(b=1CUsxJd#r}1%A`N^i~^V0(?hxqP=#nFMsL9@0w1164RJ79LW+6 z-%^_>%#-~?i_XZxAL*eD(qha$lQ^RSN3+uw*L-0jh^WAcdq=uZQyUnvq@l`pRd0%w z$RveYM52z=dQ_*GObcx2i7bt^AfXewp{w->JNnKC{8}>|zO6|wHD8kNTM^c5T(@z< zM&P?zzlJlvcZF{ETi=l?5BMH}<1Y|Lr;X$cj=|@T)g~#}30bn_PmQG;-wfNn$!p|h zjE8T1B0d9+UahmU2&fpLJ^u Date: Wed, 8 Oct 2025 20:33:23 +0000 Subject: [PATCH 18/63] revert df docstrings --- bigframes/dataframe.py | 4 ++++ 1 
file changed, 4 insertions(+) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 0146287e15..3527b225e2 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1771,6 +1771,7 @@ def to_pandas( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col': [4, 2, 2]}) Download the data from BigQuery and convert it into an in-memory pandas DataFrame. @@ -1892,6 +1893,7 @@ def to_pandas_batches( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col': [4, 3, 2, 2, 3]}) Iterate through the results in batches, limiting the total rows yielded @@ -4250,6 +4252,8 @@ def _resample( **Examples:** >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None >>> data = { ... "timestamp_col": pd.date_range( From 922bbf45ac2f9b26409c425f0b8446579422eb11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 20:57:53 +0000 Subject: [PATCH 19/63] add polars series unit tests --- bigframes/testing/polars_session.py | 11 +- tests/unit/test_series_polars.py | 4897 +++++++++++++++++++++++++++ 2 files changed, 4906 insertions(+), 2 deletions(-) create mode 100644 tests/unit/test_series_polars.py diff --git a/bigframes/testing/polars_session.py b/bigframes/testing/polars_session.py index 29eae20b7a..4d3e6862b9 100644 --- a/bigframes/testing/polars_session.py +++ b/bigframes/testing/polars_session.py @@ -95,10 +95,17 @@ def __init__(self): def read_pandas(self, pandas_dataframe, write_engine="default"): # override read_pandas to always keep data local-only - if isinstance(pandas_dataframe, pandas.Series): + if isinstance(pandas_dataframe, (pandas.Series, pandas.Index)): pandas_dataframe = pandas_dataframe.to_frame() local_block = bigframes.core.blocks.Block.from_local(pandas_dataframe, self) - return bigframes.dataframe.DataFrame(local_block) + bf_df = 
bigframes.dataframe.DataFrame(local_block) + if isinstance(pandas_dataframe, pandas.Series): + series = bf_df[bf_df.columns[0]] + series.name = pandas_dataframe.name + return series + if isinstance(pandas_dataframe, pandas.Index): + return bf_df.index + return bf_df @property def bqclient(self): diff --git a/tests/unit/test_series_polars.py b/tests/unit/test_series_polars.py new file mode 100644 index 0000000000..8c24a28f43 --- /dev/null +++ b/tests/unit/test_series_polars.py @@ -0,0 +1,4897 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import datetime as dt +import json +import math +import pathlib +import re +import tempfile +from typing import Generator + +import db_dtypes # type: ignore +import geopandas as gpd # type: ignore +import google.api_core.exceptions +import numpy +from packaging.version import Version +import pandas as pd +import pyarrow as pa # type: ignore +import pytest +import shapely.geometry # type: ignore + +import bigframes +import bigframes.dtypes as dtypes +import bigframes.features +import bigframes.pandas +import bigframes.pandas as bpd +import bigframes.series as series +from bigframes.testing.utils import ( + assert_pandas_df_equal, + assert_series_equal, + convert_pandas_dtypes, + get_first_file_from_wildcard, +) + +pytest.importorskip("polars") +pytest.importorskip("pandas", minversion="2.0.0") + +CURRENT_DIR = pathlib.Path(__file__).parent +DATA_DIR = CURRENT_DIR.parent / "data" + + +@pytest.fixture(scope="module", autouse=True) +def session() -> Generator[bigframes.Session, None, None]: + import bigframes.core.global_session + from bigframes.testing import polars_session + + session = polars_session.TestSession() + with bigframes.core.global_session._GlobalSessionContext(session): + yield session + + +@pytest.fixture(scope="module") +def scalars_pandas_df_index() -> pd.DataFrame: + """pd.DataFrame pointing at test data.""" + + df = pd.read_json( + DATA_DIR / "scalars.jsonl", + lines=True, + ) + convert_pandas_dtypes(df, bytes_col=True) + + df = df.set_index("rowindex", drop=False) + df.index.name = None + return df.set_index("rowindex").sort_index() + + +@pytest.fixture(scope="module") +def scalars_df_index( + session: bigframes.Session, scalars_pandas_df_index +) -> bpd.DataFrame: + return session.read_pandas(scalars_pandas_df_index) + + +@pytest.fixture(scope="module") +def scalars_df_2_index( + session: bigframes.Session, scalars_pandas_df_index +) -> bpd.DataFrame: + return session.read_pandas(scalars_pandas_df_index) + + +@pytest.fixture(scope="module") 
+def scalars_dfs( + scalars_df_index, + scalars_pandas_df_index, +): + return scalars_df_index, scalars_pandas_df_index + + +def test_series_construct_copy(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = series.Series( + scalars_df["int64_col"], name="test_series", dtype="Float64" + ).to_pandas() + pd_result = pd.Series( + scalars_pandas_df["int64_col"], name="test_series", dtype="Float64" + ) + pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_series_construct_nullable_ints(): + bf_result = series.Series( + [1, 3, bigframes.pandas.NA], index=[0, 4, bigframes.pandas.NA] + ).to_pandas() + + # TODO(b/340885567): fix type error + expected_index = pd.Index( # type: ignore + [0, 4, None], + dtype=pd.Int64Dtype(), + ) + expected = pd.Series([1, 3, pd.NA], dtype=pd.Int64Dtype(), index=expected_index) + + pd.testing.assert_series_equal(bf_result, expected) + + +def test_series_construct_timestamps(): + datetimes = [ + dt.datetime(2020, 1, 20, 20, 20, 20, 20), + dt.datetime(2019, 1, 20, 20, 20, 20, 20), + None, + ] + bf_result = series.Series(datetimes).to_pandas() + pd_result = pd.Series(datetimes, dtype=pd.ArrowDtype(pa.timestamp("us"))) + + pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + +def test_series_construct_copy_with_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = series.Series( + scalars_df["int64_col"], + name="test_series", + dtype="Float64", + index=scalars_df["int64_too"], + ).to_pandas() + pd_result = pd.Series( + scalars_pandas_df["int64_col"], + name="test_series", + dtype="Float64", + index=scalars_pandas_df["int64_too"], + ) + pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_series_construct_copy_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = series.Series( + scalars_df.index, + name="test_series", + dtype="Float64", + index=scalars_df["int64_too"], + ).to_pandas() + pd_result = pd.Series( + 
scalars_pandas_df.index, + name="test_series", + dtype="Float64", + index=scalars_pandas_df["int64_too"], + ) + pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_series_construct_pandas(scalars_dfs): + _, scalars_pandas_df = scalars_dfs + bf_result = series.Series( + scalars_pandas_df["int64_col"], name="test_series", dtype="Float64" + ) + pd_result = pd.Series( + scalars_pandas_df["int64_col"], name="test_series", dtype="Float64" + ) + assert bf_result.shape == pd_result.shape + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + + +def test_series_construct_from_list(): + bf_result = series.Series([1, 1, 2, 3, 5, 8, 13], dtype="Int64").to_pandas() + pd_result = pd.Series([1, 1, 2, 3, 5, 8, 13], dtype="Int64") + + # BigQuery DataFrame default indices use nullable Int64 always + pd_result.index = pd_result.index.astype("Int64") + + pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_series_construct_reindex(): + bf_result = series.Series( + series.Series({1: 10, 2: 30, 3: 30}), index=[3, 2], dtype="Int64" + ).to_pandas() + pd_result = pd.Series(pd.Series({1: 10, 2: 30, 3: 30}), index=[3, 2], dtype="Int64") + + # BigQuery DataFrame default indices use nullable Int64 always + pd_result.index = pd_result.index.astype("Int64") + pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_series_construct_from_list_w_index(): + bf_result = series.Series( + [1, 1, 2, 3, 5, 8, 13], index=[10, 20, 30, 40, 50, 60, 70], dtype="Int64" + ).to_pandas() + pd_result = pd.Series( + [1, 1, 2, 3, 5, 8, 13], index=[10, 20, 30, 40, 50, 60, 70], dtype="Int64" + ) + + # BigQuery DataFrame default indices use nullable Int64 always + pd_result.index = pd_result.index.astype("Int64") + + pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_series_construct_empty(session: bigframes.Session): + bf_series: series.Series = series.Series(session=session) + pd_series: pd.Series = pd.Series() + + bf_result = bf_series.empty + 
pd_result = pd_series.empty + + assert pd_result + assert bf_result == pd_result + + +def test_series_construct_scalar_no_index(): + bf_result = series.Series("hello world", dtype="string[pyarrow]").to_pandas() + pd_result = pd.Series("hello world", dtype="string[pyarrow]") + + # BigQuery DataFrame default indices use nullable Int64 always + pd_result.index = pd_result.index.astype("Int64") + + pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_series_construct_scalar_w_index(): + bf_result = series.Series( + "hello world", dtype="string[pyarrow]", index=[0, 2, 1] + ).to_pandas() + pd_result = pd.Series("hello world", dtype="string[pyarrow]", index=[0, 2, 1]) + + # BigQuery DataFrame default indices use nullable Int64 always + pd_result.index = pd_result.index.astype("Int64") + + pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_series_construct_nan(): + bf_result = series.Series(numpy.nan).to_pandas() + pd_result = pd.Series(numpy.nan) + + pd_result.index = pd_result.index.astype("Int64") + pd_result = pd_result.astype("Float64") + + pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_series_construct_scalar_w_bf_index(): + bf_result = series.Series( + "hello", index=bigframes.pandas.Index([1, 2, 3]) + ).to_pandas() + pd_result = pd.Series("hello", index=pd.Index([1, 2, 3], dtype="Int64")) + + pd_result = pd_result.astype("string[pyarrow]") + + pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_series_construct_from_list_escaped_strings(): + """Check that special characters are supported.""" + strings = [ + "string\nwith\nnewline", + "string\twith\ttabs", + "string\\with\\backslashes", + ] + bf_result = series.Series(strings, name="test_series", dtype="string[pyarrow]") + pd_result = pd.Series(strings, name="test_series", dtype="string[pyarrow]") + + # BigQuery DataFrame default indices use nullable Int64 always + pd_result.index = pd_result.index.astype("Int64") + + 
pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + + +def test_series_construct_geodata(): + pd_series = pd.Series( + [ + shapely.geometry.Point(1, 1), + shapely.geometry.Point(2, 2), + shapely.geometry.Point(3, 3), + ], + dtype=gpd.array.GeometryDtype(), + ) + + series = bigframes.pandas.Series(pd_series) + + pd.testing.assert_series_equal( + pd_series, series.to_pandas(), check_index_type=False + ) + + +@pytest.mark.parametrize( + ("dtype"), + [ + pytest.param(pd.Int64Dtype(), id="int"), + pytest.param(pd.Float64Dtype(), id="float"), + pytest.param(pd.StringDtype(storage="pyarrow"), id="string"), + ], +) +def test_series_construct_w_dtype(dtype): + data = [1, 2, 3] + expected = pd.Series(data, dtype=dtype) + expected.index = expected.index.astype("Int64") + series = bigframes.pandas.Series(data, dtype=dtype) + pd.testing.assert_series_equal(series.to_pandas(), expected) + + +def test_series_construct_w_dtype_for_struct(): + # The data shows the struct fields are disordered and correctly handled during + # construction. + data = [ + {"a": 1, "c": "pandas", "b": dt.datetime(2020, 1, 20, 20, 20, 20, 20)}, + {"a": 2, "c": "pandas", "b": dt.datetime(2019, 1, 20, 20, 20, 20, 20)}, + {"a": 1, "c": "numpy", "b": None}, + ] + dtype = pd.ArrowDtype( + pa.struct([("a", pa.int64()), ("c", pa.string()), ("b", pa.timestamp("us"))]) + ) + series = bigframes.pandas.Series(data, dtype=dtype) + expected = pd.Series(data, dtype=dtype) + expected.index = expected.index.astype("Int64") + pd.testing.assert_series_equal(series.to_pandas(), expected) + + +def test_series_construct_w_dtype_for_array_string(): + data = [["1", "2", "3"], [], ["4", "5"]] + dtype = pd.ArrowDtype(pa.list_(pa.string())) + series = bigframes.pandas.Series(data, dtype=dtype) + expected = pd.Series(data, dtype=dtype) + expected.index = expected.index.astype("Int64") + + # Skip dtype check due to internal issue b/321013333. 
This issue causes array types + # to be converted to the `object` dtype when calling `to_pandas()`, resulting in + # a mismatch with the expected Pandas type. + if bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable: + check_dtype = True + else: + check_dtype = False + + pd.testing.assert_series_equal( + series.to_pandas(), expected, check_dtype=check_dtype + ) + + +def test_series_construct_w_dtype_for_array_struct(): + data = [[{"a": 1, "c": "aa"}, {"a": 2, "c": "bb"}], [], [{"a": 3, "c": "cc"}]] + dtype = pd.ArrowDtype(pa.list_(pa.struct([("a", pa.int64()), ("c", pa.string())]))) + series = bigframes.pandas.Series(data, dtype=dtype) + expected = pd.Series(data, dtype=dtype) + expected.index = expected.index.astype("Int64") + + # Skip dtype check due to internal issue b/321013333. This issue causes array types + # to be converted to the `object` dtype when calling `to_pandas()`, resulting in + # a mismatch with the expected Pandas type. + if bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable: + check_dtype = True + else: + check_dtype = False + + pd.testing.assert_series_equal( + series.to_pandas(), expected, check_dtype=check_dtype + ) + + +def test_series_construct_local_unordered_has_sequential_index(unordered_session): + series = bigframes.pandas.Series( + ["Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"], session=unordered_session + ) + expected: pd.Index = pd.Index([0, 1, 2, 3, 4, 5, 6], dtype=pd.Int64Dtype()) + pd.testing.assert_index_equal(series.index.to_pandas(), expected) + + +@pytest.mark.parametrize( + ("json_type"), + [ + pytest.param(dtypes.JSON_DTYPE), + pytest.param("json"), + ], +) +def test_series_construct_w_json_dtype(json_type): + data = [ + "1", + '"str"', + "false", + '["a", {"b": 1}, null]', + None, + '{"a": {"b": [1, 2, 3], "c": true}}', + ] + s = bigframes.pandas.Series(data, dtype=json_type) + + assert s.dtype == dtypes.JSON_DTYPE + assert s[0] == "1" + assert s[1] == '"str"' + assert s[2] == "false" + assert 
s[3] == '["a",{"b":1},null]' + assert pd.isna(s[4]) + assert s[5] == '{"a":{"b":[1,2,3],"c":true}}' + + +def test_series_keys(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["int64_col"].keys().to_pandas() + pd_result = scalars_pandas_df["int64_col"].keys() + pd.testing.assert_index_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ["data", "index"], + [ + (["a", "b", "c"], None), + ([1, 2, 3], ["a", "b", "c"]), + ([1, 2, None], ["a", "b", "c"]), + ([1, 2, 3], [pd.NA, "b", "c"]), + ([numpy.nan, 2, 3], ["a", "b", "c"]), + ], +) +def test_series_items(data, index): + bf_series = series.Series(data, index=index) + pd_series = pd.Series(data, index=index) + + for (bf_index, bf_value), (pd_index, pd_value) in zip( + bf_series.items(), pd_series.items() + ): + # TODO(jialuo): Remove the if conditions after b/373699458 is addressed. + if not pd.isna(bf_index) or not pd.isna(pd_index): + assert bf_index == pd_index + if not pd.isna(bf_value) or not pd.isna(pd_value): + assert bf_value == pd_value + + +@pytest.mark.parametrize( + ["col_name", "expected_dtype"], + [ + ("bool_col", pd.BooleanDtype()), + # TODO(swast): Use a more efficient type. + ("bytes_col", pd.ArrowDtype(pa.binary())), + ("date_col", pd.ArrowDtype(pa.date32())), + ("datetime_col", pd.ArrowDtype(pa.timestamp("us"))), + ("float64_col", pd.Float64Dtype()), + ("geography_col", gpd.array.GeometryDtype()), + ("int64_col", pd.Int64Dtype()), + # TODO(swast): Use a more efficient type. 
+ ("numeric_col", pd.ArrowDtype(pa.decimal128(38, 9))), + ("int64_too", pd.Int64Dtype()), + ("string_col", pd.StringDtype(storage="pyarrow")), + ("time_col", pd.ArrowDtype(pa.time64("us"))), + ("timestamp_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))), + ], +) +def test_get_column(scalars_dfs, col_name, expected_dtype): + scalars_df, scalars_pandas_df = scalars_dfs + series = scalars_df[col_name] + series_pandas = series.to_pandas() + assert series_pandas.dtype == expected_dtype + assert series_pandas.shape[0] == scalars_pandas_df.shape[0] + + +def test_get_column_w_json(json_df, json_pandas_df): + series = json_df["json_col"] + series_pandas = series.to_pandas() + assert series.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType()) + assert series_pandas.shape[0] == json_pandas_df.shape[0] + + +def test_series_get_column_default(scalars_dfs): + scalars_df, _ = scalars_dfs + result = scalars_df.get(123123123123123, "default_val") + assert result == "default_val" + + +@pytest.mark.parametrize( + ("key",), + [ + ("hello",), + (2,), + ("int64_col",), + (None,), + ], +) +def test_series_contains(scalars_df_index, scalars_pandas_df_index, key): + bf_result = key in scalars_df_index["int64_col"] + pd_result = key in scalars_pandas_df_index["int64_col"] + + assert bf_result == pd_result + + +def test_series_equals_identical(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.int64_col.equals(scalars_df_index.int64_col) + pd_result = scalars_pandas_df_index.int64_col.equals( + scalars_pandas_df_index.int64_col + ) + + assert pd_result == bf_result + + +def test_series_equals_df(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index["int64_col"].equals(scalars_df_index[["int64_col"]]) + pd_result = scalars_pandas_df_index["int64_col"].equals( + scalars_pandas_df_index[["int64_col"]] + ) + + assert pd_result == bf_result + + +def test_series_equals_different_dtype(scalars_df_index, scalars_pandas_df_index): + bf_series = 
scalars_df_index["int64_col"] + pd_series = scalars_pandas_df_index["int64_col"] + + bf_result = bf_series.equals(bf_series.astype("Float64")) + pd_result = pd_series.equals(pd_series.astype("Float64")) + + assert pd_result == bf_result + + +def test_series_equals_different_values(scalars_df_index, scalars_pandas_df_index): + bf_series = scalars_df_index["int64_col"] + pd_series = scalars_pandas_df_index["int64_col"] + + bf_result = bf_series.equals(bf_series + 1) + pd_result = pd_series.equals(pd_series + 1) + + assert pd_result == bf_result + + +def test_series_get_with_default_index(scalars_dfs): + col_name = "float64_col" + key = 2 + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[col_name].get(key) + pd_result = scalars_pandas_df[col_name].get(key) + assert bf_result == pd_result + + +@pytest.mark.parametrize( + ("index_col", "key"), + ( + ("int64_too", 2), + ("string_col", "Hello, World!"), + ("int64_too", slice(2, 6)), + ), +) +def test_series___getitem__(scalars_dfs, index_col, key): + col_name = "float64_col" + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.set_index(index_col, drop=False) + scalars_pandas_df = scalars_pandas_df.set_index(index_col, drop=False) + bf_result = scalars_df[col_name][key] + pd_result = scalars_pandas_df[col_name][key] + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + + +@pytest.mark.parametrize( + ("key",), + ( + (-2,), + (-1,), + (0,), + (1,), + ), +) +def test_series___getitem___with_int_key(scalars_dfs, key): + col_name = "int64_too" + index_col = "string_col" + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.set_index(index_col, drop=False) + scalars_pandas_df = scalars_pandas_df.set_index(index_col, drop=False) + bf_result = scalars_df[col_name][key] + pd_result = scalars_pandas_df[col_name][key] + assert bf_result == pd_result + + +def test_series___getitem___with_default_index(scalars_dfs): + col_name = "float64_col" + key = 2 + 
scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[col_name][key] + pd_result = scalars_pandas_df[col_name][key] + assert bf_result == pd_result + + +@pytest.mark.parametrize( + ("index_col", "key", "value"), + ( + ("int64_too", 2, "new_string_value"), + ("string_col", "Hello, World!", "updated_value"), + ("int64_too", 0, None), + ), +) +def test_series___setitem__(scalars_dfs, index_col, key, value): + col_name = "string_col" + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.set_index(index_col, drop=False) + scalars_pandas_df = scalars_pandas_df.set_index(index_col, drop=False) + + bf_series = scalars_df[col_name] + pd_series = scalars_pandas_df[col_name].copy() + + bf_series[key] = value + pd_series[key] = value + + pd.testing.assert_series_equal(bf_series.to_pandas(), pd_series) + + +@pytest.mark.parametrize( + ("key", "value"), + ( + (0, 999), + (1, 888), + (0, None), + (-2345, 777), + ), +) +def test_series___setitem___with_int_key_numeric(scalars_dfs, key, value): + col_name = "int64_col" + index_col = "int64_too" + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.set_index(index_col, drop=False) + scalars_pandas_df = scalars_pandas_df.set_index(index_col, drop=False) + + bf_series = scalars_df[col_name] + pd_series = scalars_pandas_df[col_name].copy() + + bf_series[key] = value + pd_series[key] = value + + pd.testing.assert_series_equal(bf_series.to_pandas(), pd_series) + + +def test_series___setitem___with_default_index(scalars_dfs): + col_name = "float64_col" + key = 2 + value = 123.456 + scalars_df, scalars_pandas_df = scalars_dfs + + bf_series = scalars_df[col_name] + pd_series = scalars_pandas_df[col_name].copy() + + bf_series[key] = value + pd_series[key] = value + + assert bf_series.to_pandas().iloc[key] == pd_series.iloc[key] + + +@pytest.mark.parametrize( + ("col_name",), + ( + ("float64_col",), + ("int64_too",), + ), +) +def test_abs(scalars_dfs, col_name): + scalars_df, 
scalars_pandas_df = scalars_dfs + bf_result = scalars_df[col_name].abs().to_pandas() + pd_result = scalars_pandas_df[col_name].abs() + + assert_series_equal(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("col_name",), + ( + ("float64_col",), + ("int64_too",), + ), +) +def test_series_pos(scalars_dfs, col_name): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = (+scalars_df[col_name]).to_pandas() + pd_result = +scalars_pandas_df[col_name] + + assert_series_equal(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("col_name",), + ( + ("float64_col",), + ("int64_too",), + ), +) +def test_series_neg(scalars_dfs, col_name): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = (-scalars_df[col_name]).to_pandas() + pd_result = -scalars_pandas_df[col_name] + + assert_series_equal(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("col_name",), + ( + ("bool_col",), + ("int64_col",), + ), +) +def test_series_invert(scalars_dfs, col_name): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = (~scalars_df[col_name]).to_pandas() + pd_result = ~scalars_pandas_df[col_name] + + assert_series_equal(pd_result, bf_result) + + +def test_fillna(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_result = scalars_df[col_name].fillna("Missing").to_pandas() + pd_result = scalars_pandas_df[col_name].fillna("Missing") + assert_series_equal( + pd_result, + bf_result, + ) + + +def test_series_replace_scalar_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_result = ( + scalars_df[col_name].replace("Hello, World!", "Howdy, Planet!").to_pandas() + ) + pd_result = scalars_pandas_df[col_name].replace("Hello, World!", "Howdy, Planet!") + + pd.testing.assert_series_equal( + pd_result, + bf_result, + ) + + +def test_series_replace_list_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_result = ( + scalars_df[col_name] + 
.replace(["Hello, World!", "T"], "Howdy, Planet!") + .to_pandas() + ) + pd_result = scalars_pandas_df[col_name].replace( + ["Hello, World!", "T"], "Howdy, Planet!" + ) + + pd.testing.assert_series_equal( + pd_result, + bf_result, + ) + + +@pytest.mark.parametrize( + ("replacement_dict",), + (({},),), + ids=[ + "empty", + ], +) +def test_series_replace_dict(scalars_dfs, replacement_dict): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_result = scalars_df[col_name].replace(replacement_dict).to_pandas() + pd_result = scalars_pandas_df[col_name].replace(replacement_dict) + + pd.testing.assert_series_equal( + pd_result, + bf_result, + ) + + +@pytest.mark.parametrize( + ("method",), + ( + ("linear",), + ("values",), + ("slinear",), + ("nearest",), + ("zero",), + ("pad",), + ), +) +def test_series_interpolate(method): + pytest.importorskip("scipy") + + values = [None, 1, 2, None, None, 16, None] + index = [-3.2, 11.4, 3.56, 4, 4.32, 5.55, 76.8] + pd_series = pd.Series(values, index) + bf_series = series.Series(pd_series) + + # Pandas can only interpolate on "float64" columns + # https://github.com/pandas-dev/pandas/issues/40252 + pd_result = pd_series.astype("float64").interpolate(method=method) + bf_result = bf_series.interpolate(method=method).to_pandas() + + # pd uses non-null types, while bf uses nullable types + pd.testing.assert_series_equal( + pd_result, + bf_result, + check_index_type=False, + check_dtype=False, + ) + + +@pytest.mark.parametrize( + ("ignore_index",), + ( + (True,), + (False,), + ), +) +def test_series_dropna(scalars_dfs, ignore_index): + if pd.__version__.startswith("1."): + pytest.skip("ignore_index parameter not supported in pandas 1.x.") + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_result = scalars_df[col_name].dropna(ignore_index=ignore_index).to_pandas() + pd_result = scalars_pandas_df[col_name].dropna(ignore_index=ignore_index) + pd.testing.assert_series_equal(pd_result, 
bf_result, check_index_type=False) + + +@pytest.mark.parametrize( + ("agg",), + ( + ("sum",), + ("size",), + ), +) +def test_series_agg_single_string(scalars_dfs, agg): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["int64_col"].agg(agg) + pd_result = scalars_pandas_df["int64_col"].agg(agg) + assert math.isclose(pd_result, bf_result) + + +def test_series_agg_multi_string(scalars_dfs): + aggregations = [ + "sum", + "mean", + "std", + "var", + "min", + "max", + "nunique", + "count", + "size", + ] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["int64_col"].agg(aggregations).to_pandas() + pd_result = scalars_pandas_df["int64_col"].agg(aggregations) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + + +@pytest.mark.parametrize( + ("col_name",), + ( + ("string_col",), + ("int64_col",), + ), +) +def test_max(scalars_dfs, col_name): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[col_name].max() + pd_result = scalars_pandas_df[col_name].max() + assert pd_result == bf_result + + +@pytest.mark.parametrize( + ("col_name",), + ( + ("string_col",), + ("int64_col",), + ), +) +def test_min(scalars_dfs, col_name): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[col_name].min() + pd_result = scalars_pandas_df[col_name].min() + assert pd_result == bf_result + + +@pytest.mark.parametrize( + ("col_name",), + ( + ("float64_col",), + ("int64_col",), + ), +) +def test_std(scalars_dfs, col_name): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[col_name].std() + pd_result = scalars_pandas_df[col_name].std() + assert math.isclose(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("col_name",), + ( + ("float64_col",), + ("int64_col",), + ), +) +def test_kurt(scalars_dfs, col_name): + scalars_df, scalars_pandas_df 
= scalars_dfs + bf_result = scalars_df[col_name].kurt() + pd_result = scalars_pandas_df[col_name].kurt() + assert math.isclose(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("col_name",), + ( + ("float64_col",), + ("int64_col",), + ), +) +def test_skew(scalars_dfs, col_name): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[col_name].skew() + pd_result = scalars_pandas_df[col_name].skew() + assert math.isclose(pd_result, bf_result) + + +def test_skew_undefined(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["int64_col"].iloc[:2].skew() + pd_result = scalars_pandas_df["int64_col"].iloc[:2].skew() + # both should be pd.NA + assert pd_result is bf_result + + +def test_kurt_undefined(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["int64_col"].iloc[:3].kurt() + pd_result = scalars_pandas_df["int64_col"].iloc[:3].kurt() + # both should be pd.NA + assert pd_result is bf_result + + +@pytest.mark.parametrize( + ("col_name",), + ( + ("float64_col",), + ("int64_col",), + ), +) +def test_var(scalars_dfs, col_name): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[col_name].var() + pd_result = scalars_pandas_df[col_name].var() + assert math.isclose(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("col_name",), + ( + ("bool_col",), + ("int64_col",), + ), +) +def test_mode_stat(scalars_df_index, scalars_pandas_df_index, col_name): + bf_result = scalars_df_index[col_name].mode().to_pandas() + pd_result = scalars_pandas_df_index[col_name].mode() + + ## Mode implicitly resets index, and bigframes default indices use nullable Int64 + pd_result.index = pd_result.index.astype("Int64") + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("operator"), + [ + (lambda x, y: x + y), + (lambda x, y: x - y), + (lambda x, y: x * y), + (lambda x, y: x / y), + (lambda x, y: x // y), + (lambda x, y: x < y), + (lambda 
x, y: x > y), + (lambda x, y: x <= y), + (lambda x, y: x >= y), + ], + ids=[ + "add", + "subtract", + "multiply", + "divide", + "floordivide", + "less_than", + "greater_than", + "less_than_equal", + "greater_than_equal", + ], +) +@pytest.mark.parametrize( + ("other_scalar"), + [ + -1, + 0, + 14, + # TODO(tswast): Support pd.NA, + ], +) +@pytest.mark.parametrize(("reverse_operands"), [True, False]) +def test_series_int_int_operators_scalar( + scalars_dfs, operator, other_scalar, reverse_operands +): + scalars_df, scalars_pandas_df = scalars_dfs + + maybe_reversed_op = (lambda x, y: operator(y, x)) if reverse_operands else operator + + bf_result = maybe_reversed_op(scalars_df["int64_col"], other_scalar).to_pandas() + pd_result = maybe_reversed_op(scalars_pandas_df["int64_col"], other_scalar) + + assert_series_equal(pd_result, bf_result) + + +def test_series_pow_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = (scalars_df["int64_col"] ** 2).to_pandas() + pd_result = scalars_pandas_df["int64_col"] ** 2 + + assert_series_equal(pd_result, bf_result) + + +def test_series_pow_scalar_reverse(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = (0.8 ** scalars_df["int64_col"]).to_pandas() + pd_result = 0.8 ** scalars_pandas_df["int64_col"] + + assert_series_equal(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("operator"), + [ + (lambda x, y: x & y), + (lambda x, y: x | y), + (lambda x, y: x ^ y), + ], + ids=[ + "and", + "or", + "xor", + ], +) +@pytest.mark.parametrize(("other_scalar"), [True, False, pd.NA]) +@pytest.mark.parametrize(("reverse_operands"), [True, False]) +def test_series_bool_bool_operators_scalar( + scalars_dfs, operator, other_scalar, reverse_operands +): + scalars_df, scalars_pandas_df = scalars_dfs + + maybe_reversed_op = (lambda x, y: operator(y, x)) if reverse_operands else operator + + bf_result = maybe_reversed_op(scalars_df["bool_col"], other_scalar).to_pandas() + pd_result = 
maybe_reversed_op(scalars_pandas_df["bool_col"], other_scalar) + + assert_series_equal(pd_result.astype(pd.BooleanDtype()), bf_result) + + +@pytest.mark.parametrize( + ("operator"), + [ + (lambda x, y: x + y), + (lambda x, y: x - y), + (lambda x, y: x * y), + (lambda x, y: x / y), + (lambda x, y: x < y), + (lambda x, y: x > y), + (lambda x, y: x <= y), + (lambda x, y: x >= y), + (lambda x, y: x % y), + (lambda x, y: x // y), + (lambda x, y: x & y), + (lambda x, y: x | y), + (lambda x, y: x ^ y), + ], + ids=[ + "add", + "subtract", + "multiply", + "divide", + "less_than", + "greater_than", + "less_than_equal", + "greater_than_equal", + "modulo", + "floordivide", + "bitwise_and", + "bitwise_or", + "bitwise_xor", + ], +) +def test_series_int_int_operators_series(scalars_dfs, operator): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = operator(scalars_df["int64_col"], scalars_df["int64_too"]).to_pandas() + pd_result = operator(scalars_pandas_df["int64_col"], scalars_pandas_df["int64_too"]) + assert_series_equal(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("col_x",), + [ + ("int64_col",), + ("int64_too",), + ("float64_col",), + ], +) +@pytest.mark.parametrize( + ("col_y",), + [ + ("int64_col",), + ("int64_too",), + ("float64_col",), + ], +) +@pytest.mark.parametrize( + ("method",), + [ + ("mod",), + ("rmod",), + ], +) +def test_mods(scalars_dfs, col_x, col_y, method): + scalars_df, scalars_pandas_df = scalars_dfs + x_bf = scalars_df[col_x] + y_bf = scalars_df[col_y] + bf_series = getattr(x_bf, method)(y_bf) + # BigQuery's mod functions return [BIG]NUMERIC values unless both arguments are integers. 
+ # https://cloud.google.com/bigquery/docs/reference/standard-sql/mathematical_functions#mod + if x_bf.dtype == pd.Int64Dtype() and y_bf.dtype == pd.Int64Dtype(): + bf_result = bf_series.to_pandas() + else: + bf_result = bf_series.astype("Float64").to_pandas() + pd_result = getattr(scalars_pandas_df[col_x], method)(scalars_pandas_df[col_y]) + pd.testing.assert_series_equal(pd_result, bf_result) + + +# We work around a pandas bug that doesn't handle correlating nullable dtypes by doing this +# manually with dumb self-correlation instead of parameterized as test_mods is above. +def test_series_corr(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["int64_too"].corr(scalars_df["int64_too"]) + pd_result = ( + scalars_pandas_df["int64_too"] + .astype("int64") + .corr(scalars_pandas_df["int64_too"].astype("int64")) + ) + assert math.isclose(pd_result, bf_result) + + +def test_series_autocorr(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["float64_col"].autocorr(2) + pd_result = scalars_pandas_df["float64_col"].autocorr(2) + assert math.isclose(pd_result, bf_result) + + +def test_series_cov(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["int64_too"].cov(scalars_df["int64_too"]) + pd_result = ( + scalars_pandas_df["int64_too"] + .astype("int64") + .cov(scalars_pandas_df["int64_too"].astype("int64")) + ) + assert math.isclose(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("col_x",), + [ + ("int64_col",), + ("float64_col",), + ], +) +@pytest.mark.parametrize( + ("col_y",), + [ + ("int64_col",), + ("float64_col",), + ], +) +@pytest.mark.parametrize( + ("method",), + [ + ("divmod",), + ("rdivmod",), + ], +) +def test_divmods_series(scalars_dfs, col_x, col_y, method): + scalars_df, scalars_pandas_df = scalars_dfs + bf_div_result, 
bf_mod_result = getattr(scalars_df[col_x], method)(scalars_df[col_y]) + pd_div_result, pd_mod_result = getattr(scalars_pandas_df[col_x], method)( + scalars_pandas_df[col_y] + ) + # BigQuery's mod functions return NUMERIC values for non-INT64 inputs. + if bf_div_result.dtype == pd.Int64Dtype(): + pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) + else: + pd.testing.assert_series_equal( + pd_div_result, bf_div_result.astype("Float64").to_pandas() + ) + + if bf_mod_result.dtype == pd.Int64Dtype(): + pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) + else: + pd.testing.assert_series_equal( + pd_mod_result, bf_mod_result.astype("Float64").to_pandas() + ) + + +@pytest.mark.parametrize( + ("col_x",), + [ + ("int64_col",), + ("float64_col",), + ], +) +@pytest.mark.parametrize( + ("other",), + [ + (-1000,), + (678,), + ], +) +@pytest.mark.parametrize( + ("method",), + [ + ("divmod",), + ("rdivmod",), + ], +) +def test_divmods_scalars(scalars_dfs, col_x, other, method): + scalars_df, scalars_pandas_df = scalars_dfs + bf_div_result, bf_mod_result = getattr(scalars_df[col_x], method)(other) + pd_div_result, pd_mod_result = getattr(scalars_pandas_df[col_x], method)(other) + # BigQuery's mod functions return NUMERIC values for non-INT64 inputs. 
+ if bf_div_result.dtype == pd.Int64Dtype(): + pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) + else: + pd.testing.assert_series_equal( + pd_div_result, bf_div_result.astype("Float64").to_pandas() + ) + + if bf_mod_result.dtype == pd.Int64Dtype(): + pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) + else: + pd.testing.assert_series_equal( + pd_mod_result, bf_mod_result.astype("Float64").to_pandas() + ) + + +@pytest.mark.parametrize( + ("other",), + [ + (3,), + (-6.2,), + ], +) +def test_series_add_scalar(scalars_dfs, other): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = (scalars_df["float64_col"] + other).to_pandas() + pd_result = scalars_pandas_df["float64_col"] + other + + assert_series_equal(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("left_col", "right_col"), + [ + ("float64_col", "float64_col"), + ("int64_col", "float64_col"), + ("int64_col", "int64_too"), + ], +) +def test_series_add_bigframes_series(scalars_dfs, left_col, right_col): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = (scalars_df[left_col] + scalars_df[right_col]).to_pandas() + pd_result = scalars_pandas_df[left_col] + scalars_pandas_df[right_col] + + assert_series_equal(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("left_col", "right_col", "righter_col"), + [ + ("float64_col", "float64_col", "float64_col"), + ("int64_col", "int64_col", "int64_col"), + ], +) +def test_series_add_bigframes_series_nested( + scalars_dfs, left_col, right_col, righter_col +): + """Test that we can correctly add multiple times.""" + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = ( + (scalars_df[left_col] + scalars_df[right_col]) + scalars_df[righter_col] + ).to_pandas() + pd_result = ( + scalars_pandas_df[left_col] + scalars_pandas_df[right_col] + ) + scalars_pandas_df[righter_col] + + assert_series_equal(pd_result, bf_result) + + +def test_series_add_different_table_default_index( + scalars_df_default_index, + 
scalars_df_2_default_index, +): + bf_result = ( + scalars_df_default_index["float64_col"] + + scalars_df_2_default_index["float64_col"] + ).to_pandas() + pd_result = ( + # Default index may not have a well defined order, but it should at + # least be consistent across to_pandas() calls. + scalars_df_default_index["float64_col"].to_pandas() + + scalars_df_2_default_index["float64_col"].to_pandas() + ) + # TODO(swast): Can remove sort_index() when there's default ordering. + pd.testing.assert_series_equal(bf_result.sort_index(), pd_result.sort_index()) + + +def test_series_add_different_table_with_index( + scalars_df_index, scalars_df_2_index, scalars_pandas_df_index +): + scalars_pandas_df = scalars_pandas_df_index + bf_result = scalars_df_index["float64_col"] + scalars_df_2_index["int64_col"] + # When index values are unique, we can emulate with values from the same + # DataFrame. + pd_result = scalars_pandas_df["float64_col"] + scalars_pandas_df["int64_col"] + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + + +def test_reset_index_drop(scalars_df_index, scalars_pandas_df_index): + scalars_pandas_df = scalars_pandas_df_index + bf_result = ( + scalars_df_index["float64_col"] + .sort_index(ascending=False) + .reset_index(drop=True) + ).iloc[::2] + pd_result = ( + scalars_pandas_df["float64_col"] + .sort_index(ascending=False) + .reset_index(drop=True) + ).iloc[::2] + + # BigQuery DataFrames default indices use nullable Int64 always + pd_result.index = pd_result.index.astype("Int64") + + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + + +def test_series_reset_index_allow_duplicates(scalars_df_index, scalars_pandas_df_index): + bf_series = scalars_df_index["int64_col"].copy() + bf_series.index.name = "int64_col" + df = bf_series.reset_index(allow_duplicates=True, drop=False) + assert df.index.name is None + + bf_result = df.to_pandas() + + pd_series = scalars_pandas_df_index["int64_col"].copy() + pd_series.index.name = "int64_col" 
+ pd_result = pd_series.reset_index(allow_duplicates=True, drop=False) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + # reset_index should maintain the original ordering. + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_series_reset_index_duplicates_error(scalars_df_index): + scalars_df_index = scalars_df_index["int64_col"].copy() + scalars_df_index.index.name = "int64_col" + with pytest.raises(ValueError): + scalars_df_index.reset_index(allow_duplicates=False, drop=False) + + +def test_series_reset_index_inplace(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.sort_index(ascending=False)["float64_col"] + bf_result.reset_index(drop=True, inplace=True) + pd_result = scalars_pandas_df_index.sort_index(ascending=False)["float64_col"] + pd_result.reset_index(drop=True, inplace=True) + + # BigQuery DataFrames default indices use nullable Int64 always + pd_result.index = pd_result.index.astype("Int64") + + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + + +@pytest.mark.parametrize( + ("name",), + [ + ("some_name",), + (None,), + ], +) +def test_reset_index_no_drop(scalars_df_index, scalars_pandas_df_index, name): + scalars_pandas_df = scalars_pandas_df_index + kw_args = {"name": name} if name else {} + bf_result = ( + scalars_df_index["float64_col"] + .sort_index(ascending=False) + .reset_index(drop=False, **kw_args) + ) + pd_result = ( + scalars_pandas_df["float64_col"] + .sort_index(ascending=False) + .reset_index(drop=False, **kw_args) + ) + + # BigQuery DataFrames default indices use nullable Int64 always + pd_result.index = pd_result.index.astype("Int64") + + pd.testing.assert_frame_equal(bf_result.to_pandas(), pd_result) + + +def test_copy(scalars_df_index, scalars_pandas_df_index): + col_name = "float64_col" + # Expect mutation on original not to effect_copy + bf_series = scalars_df_index[col_name].copy() + bf_copy = 
bf_series.copy() + bf_copy.loc[0] = 5.6 + bf_series.loc[0] = 3.4 + + pd_series = scalars_pandas_df_index[col_name].copy() + pd_copy = pd_series.copy() + pd_copy.loc[0] = 5.6 + pd_series.loc[0] = 3.4 + + assert bf_copy.to_pandas().loc[0] != bf_series.to_pandas().loc[0] + pd.testing.assert_series_equal(bf_copy.to_pandas(), pd_copy) + + +def test_isin_raise_error(scalars_df_index, scalars_pandas_df_index): + col_name = "int64_too" + with pytest.raises(TypeError): + scalars_df_index[col_name].isin("whatever").to_pandas() + + +@pytest.mark.parametrize( + ( + "col_name", + "test_set", + ), + [ + ( + "int64_col", + [314159, 2.0, 3, pd.NA], + ), + ( + "int64_col", + [2, 55555, 4], + ), + ( + "float64_col", + [-123.456, 1.25, pd.NA], + ), + ( + "int64_too", + [1, 2, pd.NA], + ), + ( + "string_col", + ["Hello, World!", "Hi", "こんにちは"], + ), + ], +) +def test_isin(scalars_dfs, col_name, test_set): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[col_name].isin(test_set).to_pandas() + pd_result = scalars_pandas_df[col_name].isin(test_set).astype("boolean") + pd.testing.assert_series_equal( + pd_result, + bf_result, + ) + + +@pytest.mark.parametrize( + ( + "col_name", + "test_set", + ), + [ + ( + "int64_col", + [314159, 2.0, 3, pd.NA], + ), + ( + "int64_col", + [2, 55555, 4], + ), + ( + "float64_col", + [-123.456, 1.25, pd.NA], + ), + ( + "int64_too", + [1, 2, pd.NA], + ), + ( + "string_col", + ["Hello, World!", "Hi", "こんにちは"], + ), + ], +) +def test_isin_bigframes_values(scalars_dfs, col_name, test_set, session): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = ( + scalars_df[col_name].isin(series.Series(test_set, session=session)).to_pandas() + ) + pd_result = scalars_pandas_df[col_name].isin(test_set).astype("boolean") + pd.testing.assert_series_equal( + pd_result, + bf_result, + ) + + +def test_isin_bigframes_index(scalars_dfs, session): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = ( + scalars_df["string_col"] + 
.isin(bigframes.pandas.Index(["Hello, World!", "Hi", "こんにちは"], session=session)) + .to_pandas() + ) + pd_result = ( + scalars_pandas_df["string_col"] + .isin(pd.Index(["Hello, World!", "Hi", "こんにちは"])) + .astype("boolean") + ) + pd.testing.assert_series_equal( + pd_result, + bf_result, + ) + + +@pytest.mark.parametrize( + ( + "col_name", + "test_set", + ), + [ + ( + "int64_col", + [314159, 2.0, 3, pd.NA], + ), + ( + "int64_col", + [2, 55555, 4], + ), + ( + "float64_col", + [-123.456, 1.25, pd.NA], + ), + ( + "int64_too", + [1, 2, pd.NA], + ), + ( + "string_col", + ["Hello, World!", "Hi", "こんにちは"], + ), + ], +) +def test_isin_bigframes_values_as_predicate( + scalars_dfs_maybe_ordered, col_name, test_set +): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + bf_predicate = scalars_df[col_name].isin( + series.Series(test_set, session=scalars_df._session) + ) + bf_result = scalars_df[bf_predicate].to_pandas() + pd_predicate = scalars_pandas_df[col_name].isin(test_set) + pd_result = scalars_pandas_df[pd_predicate] + + pd.testing.assert_frame_equal( + pd_result.reset_index(), + bf_result.reset_index(), + ) + + +def test_isnull(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "float64_col" + bf_series = scalars_df[col_name].isnull().to_pandas() + pd_series = scalars_pandas_df[col_name].isnull() + + # One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but + # the `pd_series.dtype` is `bool`. + assert_series_equal(pd_series.astype(pd.BooleanDtype()), bf_series) + + +def test_notnull(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_series = scalars_df[col_name].notnull().to_pandas() + pd_series = scalars_pandas_df[col_name].notnull() + + # One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but + # the `pd_series.dtype` is `bool`. 
+ assert_series_equal(pd_series.astype(pd.BooleanDtype()), bf_series) + + +def test_eq_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + bf_result = scalars_df[col_name].eq(0).to_pandas() + pd_result = scalars_pandas_df[col_name].eq(0) + + assert_series_equal(pd_result, bf_result) + + +def test_eq_wider_type_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + bf_result = scalars_df[col_name].eq(1.0).to_pandas() + pd_result = scalars_pandas_df[col_name].eq(1.0) + + assert_series_equal(pd_result, bf_result) + + +def test_ne_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + bf_result = (scalars_df[col_name] != 0).to_pandas() + pd_result = scalars_pandas_df[col_name] != 0 + + assert_series_equal(pd_result, bf_result) + + +def test_eq_int_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + bf_result = (scalars_df[col_name] == 0).to_pandas() + pd_result = scalars_pandas_df[col_name] == 0 + + assert_series_equal(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("col_name",), + ( + ("string_col",), + ("float64_col",), + ("int64_too",), + ), +) +def test_eq_same_type_series(scalars_dfs, col_name): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_result = (scalars_df[col_name] == scalars_df[col_name]).to_pandas() + pd_result = scalars_pandas_df[col_name] == scalars_pandas_df[col_name] + + # One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but + # the `pd_series.dtype` is `bool`. 
+ assert_series_equal(pd_result.astype(pd.BooleanDtype()), bf_result) + + +def test_loc_setitem_cell(scalars_df_index, scalars_pandas_df_index): + bf_original = scalars_df_index["string_col"] + bf_series = scalars_df_index["string_col"] + pd_original = scalars_pandas_df_index["string_col"] + pd_series = scalars_pandas_df_index["string_col"].copy() + bf_series.loc[2] = "This value isn't in the test data." + pd_series.loc[2] = "This value isn't in the test data." + bf_result = bf_series.to_pandas() + pd_result = pd_series + pd.testing.assert_series_equal(bf_result, pd_result) + # Per Copy-on-Write semantics, other references to the original DataFrame + # should remain unchanged. + pd.testing.assert_series_equal(bf_original.to_pandas(), pd_original) + + +def test_at_setitem_row_label_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_series = scalars_df["int64_col"] + pd_series = scalars_pandas_df["int64_col"].copy() + bf_series.at[1] = 1000 + pd_series.at[1] = 1000 + bf_result = bf_series.to_pandas() + pd_result = pd_series.astype("Int64") + pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_ne_obj_series(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_result = (scalars_df[col_name] != scalars_df[col_name]).to_pandas() + pd_result = scalars_pandas_df[col_name] != scalars_pandas_df[col_name] + + # One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but + # the `pd_series.dtype` is `bool`. 
+ assert_series_equal(pd_result.astype(pd.BooleanDtype()), bf_result) + + +def test_indexing_using_unselected_series(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_result = scalars_df[col_name][scalars_df["int64_too"].eq(0)].to_pandas() + pd_result = scalars_pandas_df[col_name][scalars_pandas_df["int64_too"].eq(0)] + + assert_series_equal( + pd_result, + bf_result, + ) + + +def test_indexing_using_selected_series(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_result = scalars_df[col_name][ + scalars_df["string_col"].eq("Hello, World!") + ].to_pandas() + pd_result = scalars_pandas_df[col_name][ + scalars_pandas_df["string_col"].eq("Hello, World!") + ] + + assert_series_equal( + pd_result, + bf_result, + ) + + +@pytest.mark.parametrize( + ("indices"), + [ + ([1, 3, 5]), + ([5, -3, -5, -6]), + ([-2, -4, -6]), + ], +) +def test_take(scalars_dfs, indices): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.take(indices).to_pandas() + pd_result = scalars_pandas_df.take(indices) + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_nested_filter(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + string_col = scalars_df["string_col"] + int64_too = scalars_df["int64_too"] + bool_col = scalars_df["bool_col"] == bool( + True + ) # Convert from nullable bool to nonnullable bool usable as indexer + bf_result = string_col[int64_too == 0][~bool_col].to_pandas() + + pd_string_col = scalars_pandas_df["string_col"] + pd_int64_too = scalars_pandas_df["int64_too"] + pd_bool_col = scalars_pandas_df["bool_col"] == bool( + True + ) # Convert from nullable bool to nonnullable bool usable as indexer + pd_result = pd_string_col[pd_int64_too == 0][~pd_bool_col] + + assert_series_equal( + pd_result, + bf_result, + ) + + +def test_binop_opposite_filters(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + int64_col1 = scalars_df["int64_col"] + int64_col2 = 
scalars_df["int64_col"] + bool_col = scalars_df["bool_col"] + bf_result = (int64_col1[bool_col] + int64_col2[bool_col.__invert__()]).to_pandas() + + pd_int64_col1 = scalars_pandas_df["int64_col"] + pd_int64_col2 = scalars_pandas_df["int64_col"] + pd_bool_col = scalars_pandas_df["bool_col"] + pd_result = pd_int64_col1[pd_bool_col] + pd_int64_col2[pd_bool_col.__invert__()] + + # Passes with ignore_order=False only with some dependency sets + # TODO: Determine desired behavior and make test more strict + assert_series_equal(bf_result, pd_result, ignore_order=True) + + +def test_binop_left_filtered(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + int64_col = scalars_df["int64_col"] + float64_col = scalars_df["float64_col"] + bool_col = scalars_df["bool_col"] + bf_result = (int64_col[bool_col] + float64_col).to_pandas() + + pd_int64_col = scalars_pandas_df["int64_col"] + pd_float64_col = scalars_pandas_df["float64_col"] + pd_bool_col = scalars_pandas_df["bool_col"] + pd_result = pd_int64_col[pd_bool_col] + pd_float64_col + + # Passes with ignore_order=False only with some dependency sets + # TODO: Determine desired behavior and make test more strict + assert_series_equal(bf_result, pd_result, ignore_order=True) + + +def test_binop_right_filtered(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + int64_col = scalars_df["int64_col"] + float64_col = scalars_df["float64_col"] + bool_col = scalars_df["bool_col"] + bf_result = (float64_col + int64_col[bool_col]).to_pandas() + + pd_int64_col = scalars_pandas_df["int64_col"] + pd_float64_col = scalars_pandas_df["float64_col"] + pd_bool_col = scalars_pandas_df["bool_col"] + pd_result = pd_float64_col + pd_int64_col[pd_bool_col] + + assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("other",), + [ + ([-1.4, 2.3, None],), + (pd.Index([-1.4, 2.3, None]),), + (pd.Series([-1.4, 2.3, None], index=[44, 2, 1]),), + ], +) +def test_series_binop_w_other_types(scalars_dfs, other): + 
# TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = (scalars_df["int64_col"].head(3) + other).to_pandas() + pd_result = scalars_pandas_df["int64_col"].head(3) + other + + assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("other",), + [ + ([-1.4, 2.3, None],), + (pd.Index([-1.4, 2.3, None]),), + (pd.Series([-1.4, 2.3, None], index=[44, 2, 1]),), + ], +) +def test_series_reverse_binop_w_other_types(scalars_dfs, other): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = (other + scalars_df["int64_col"].head(3)).to_pandas() + pd_result = other + scalars_pandas_df["int64_col"].head(3) + + assert_series_equal( + bf_result, + pd_result, + ) + + +def test_series_combine_first(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + int64_col = scalars_df["int64_col"].head(7) + float64_col = scalars_df["float64_col"].tail(7) + bf_result = int64_col.combine_first(float64_col).to_pandas() + + pd_int64_col = scalars_pandas_df["int64_col"].head(7) + pd_float64_col = scalars_pandas_df["float64_col"].tail(7) + pd_result = pd_int64_col.combine_first(pd_float64_col) + + assert_series_equal( + bf_result, + pd_result, + ) + + +def test_series_update(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + int64_col = scalars_df["int64_col"].head(7) + float64_col = scalars_df["float64_col"].tail(7).copy() + float64_col.update(int64_col) + + pd_int64_col = scalars_pandas_df["int64_col"].head(7) + pd_float64_col = scalars_pandas_df["float64_col"].tail(7).copy() + pd_float64_col.update(pd_int64_col) + + assert_series_equal( + float64_col.to_pandas(), + pd_float64_col, + ) 
+ + +def test_mean(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + bf_result = scalars_df[col_name].mean() + pd_result = scalars_pandas_df[col_name].mean() + assert math.isclose(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("col_name"), + [ + "int64_col", + # Non-numeric column + "bytes_col", + "date_col", + "datetime_col", + "time_col", + "timestamp_col", + "string_col", + ], +) +def test_median(scalars_dfs, col_name): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[col_name].median(exact=False) + pd_max = scalars_pandas_df[col_name].max() + pd_min = scalars_pandas_df[col_name].min() + # Median is approximate, so just check for plausibility. + assert pd_min < bf_result < pd_max + + +def test_numeric_literal(scalars_dfs): + scalars_df, _ = scalars_dfs + col_name = "numeric_col" + assert scalars_df[col_name].dtype == pd.ArrowDtype(pa.decimal128(38, 9)) + bf_result = scalars_df[col_name] + 42 + assert bf_result.size == scalars_df[col_name].size + assert bf_result.dtype == pd.ArrowDtype(pa.decimal128(38, 9)) + + +def test_series_small_repr(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + col_name = "int64_col" + bf_series = scalars_df[col_name] + pd_series = scalars_pandas_df[col_name] + assert repr(bf_series) == pd_series.to_string(length=False, dtype=True, name=True) + + +def test_sum(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + bf_result = scalars_df[col_name].sum() + pd_result = scalars_pandas_df[col_name].sum() + assert pd_result == bf_result + + +def test_product(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "float64_col" + bf_result = scalars_df[col_name].product() + pd_result = scalars_pandas_df[col_name].product() + assert math.isclose(pd_result, bf_result) + + +def test_cumprod(scalars_dfs): + if pd.__version__.startswith("1."): + pytest.skip("Series.cumprod NA mask are different in pandas 1.x.") + 
scalars_df, scalars_pandas_df = scalars_dfs + col_name = "float64_col" + bf_result = scalars_df[col_name].cumprod() + pd_result = scalars_pandas_df[col_name].cumprod() + pd.testing.assert_series_equal( + pd_result, + bf_result.to_pandas(), + ) + + +def test_count(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + bf_result = scalars_df[col_name].count() + pd_result = scalars_pandas_df[col_name].count() + assert pd_result == bf_result + + +def test_nunique(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + bf_result = (scalars_df[col_name] % 3).nunique() + pd_result = (scalars_pandas_df[col_name] % 3).nunique() + assert pd_result == bf_result + + +def test_all(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + bf_result = scalars_df[col_name].all() + pd_result = scalars_pandas_df[col_name].all() + assert pd_result == bf_result + + +def test_any(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + bf_result = scalars_df[col_name].any() + pd_result = scalars_pandas_df[col_name].any() + assert pd_result == bf_result + + +def test_groupby_sum(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + bf_series = ( + scalars_df[col_name] + .groupby([scalars_df["bool_col"], ~scalars_df["bool_col"]]) + .sum() + ) + pd_series = ( + scalars_pandas_df[col_name] + .groupby([scalars_pandas_df["bool_col"], ~scalars_pandas_df["bool_col"]]) + .sum() + ) + # TODO(swast): Update groupby to use index based on group by key(s). 
+ bf_result = bf_series.to_pandas() + assert_series_equal( + pd_series, + bf_result, + check_exact=False, + ) + + +def test_groupby_std(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + bf_series = scalars_df[col_name].groupby(scalars_df["string_col"]).std() + pd_series = ( + scalars_pandas_df[col_name] + .groupby(scalars_pandas_df["string_col"]) + .std() + .astype(pd.Float64Dtype()) + ) + bf_result = bf_series.to_pandas() + assert_series_equal( + pd_series, + bf_result, + check_exact=False, + ) + + +def test_groupby_var(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + bf_series = scalars_df[col_name].groupby(scalars_df["string_col"]).var() + pd_series = ( + scalars_pandas_df[col_name].groupby(scalars_pandas_df["string_col"]).var() + ) + bf_result = bf_series.to_pandas() + assert_series_equal( + pd_series, + bf_result, + check_exact=False, + ) + + +def test_groupby_level_sum(scalars_dfs): + # TODO(tbergeron): Use a non-unique index once that becomes possible in tests + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + + bf_series = scalars_df[col_name].groupby(level=0).sum() + pd_series = scalars_pandas_df[col_name].groupby(level=0).sum() + # TODO(swast): Update groupby to use index based on group by key(s). + pd.testing.assert_series_equal( + pd_series.sort_index(), + bf_series.to_pandas().sort_index(), + ) + + +def test_groupby_level_list_sum(scalars_dfs): + # TODO(tbergeron): Use a non-unique index once that becomes possible in tests + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + + bf_series = scalars_df[col_name].groupby(level=["rowindex"]).sum() + pd_series = scalars_pandas_df[col_name].groupby(level=["rowindex"]).sum() + # TODO(swast): Update groupby to use index based on group by key(s). 
+ pd.testing.assert_series_equal( + pd_series.sort_index(), + bf_series.to_pandas().sort_index(), + ) + + +def test_groupby_mean(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + bf_series = ( + scalars_df[col_name].groupby(scalars_df["string_col"], dropna=False).mean() + ) + pd_series = ( + scalars_pandas_df[col_name] + .groupby(scalars_pandas_df["string_col"], dropna=False) + .mean() + ) + # TODO(swast): Update groupby to use index based on group by key(s). + bf_result = bf_series.to_pandas() + assert_series_equal( + pd_series, + bf_result, + ) + + +def test_groupby_median_exact(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + bf_result = ( + scalars_df[col_name].groupby(scalars_df["string_col"], dropna=False).median() + ) + pd_result = ( + scalars_pandas_df[col_name] + .groupby(scalars_pandas_df["string_col"], dropna=False) + .median() + ) + + assert_series_equal( + pd_result, + bf_result.to_pandas(), + ) + + +def test_groupby_median_inexact(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + bf_series = ( + scalars_df[col_name] + .groupby(scalars_df["string_col"], dropna=False) + .median(exact=False) + ) + pd_max = ( + scalars_pandas_df[col_name] + .groupby(scalars_pandas_df["string_col"], dropna=False) + .max() + ) + pd_min = ( + scalars_pandas_df[col_name] + .groupby(scalars_pandas_df["string_col"], dropna=False) + .min() + ) + # TODO(swast): Update groupby to use index based on group by key(s). + bf_result = bf_series.to_pandas() + + # Median is approximate, so just check that it's plausible. 
+ assert ((pd_min <= bf_result) & (bf_result <= pd_max)).all() + + +def test_groupby_prod(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + bf_series = scalars_df[col_name].groupby(scalars_df["int64_col"]).prod() + pd_series = ( + scalars_pandas_df[col_name].groupby(scalars_pandas_df["int64_col"]).prod() + ).astype(pd.Float64Dtype()) + # TODO(swast): Update groupby to use index based on group by key(s). + bf_result = bf_series.to_pandas() + assert_series_equal( + pd_series, + bf_result, + ) + + +@pytest.mark.parametrize( + ("operator"), + [ + (lambda x: x.cumsum()), + (lambda x: x.cumcount()), + (lambda x: x.cummin()), + (lambda x: x.cummax()), + # Pandas 2.2 casts to cumprod to float. + (lambda x: x.cumprod().astype("Float64")), + (lambda x: x.diff()), + (lambda x: x.shift(2)), + (lambda x: x.shift(-2)), + ], + ids=[ + "cumsum", + "cumcount", + "cummin", + "cummax", + "cumprod", + "diff", + "shiftpostive", + "shiftnegative", + ], +) +def test_groupby_window_ops(scalars_df_index, scalars_pandas_df_index, operator): + col_name = "int64_col" + group_key = "int64_too" # has some duplicates values, good for grouping + bf_series = ( + operator(scalars_df_index[col_name].groupby(scalars_df_index[group_key])) + ).to_pandas() + pd_series = operator( + scalars_pandas_df_index[col_name].groupby(scalars_pandas_df_index[group_key]) + ).astype(bf_series.dtype) + + pd.testing.assert_series_equal( + pd_series, + bf_series, + ) + + +@pytest.mark.parametrize( + ("label", "col_name"), + [ + (0, "bool_col"), + (1, "int64_col"), + ], +) +def test_drop_label(scalars_df_index, scalars_pandas_df_index, label, col_name): + bf_series = scalars_df_index[col_name].drop(label).to_pandas() + pd_series = scalars_pandas_df_index[col_name].drop(label) + pd.testing.assert_series_equal( + pd_series, + bf_series, + ) + + +def test_drop_label_list(scalars_df_index, scalars_pandas_df_index): + col_name = "int64_col" + bf_series = scalars_df_index[col_name].drop([1, 
3]).to_pandas() + pd_series = scalars_pandas_df_index[col_name].drop([1, 3]) + pd.testing.assert_series_equal( + pd_series, + bf_series, + ) + + +@pytest.mark.parametrize( + ("col_name",), + [ + ("bool_col",), + ("int64_too",), + ], +) +@pytest.mark.parametrize( + ("keep",), + [ + ("first",), + ("last",), + (False,), + ], +) +def test_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, col_name): + bf_series = scalars_df_index[col_name].drop_duplicates(keep=keep).to_pandas() + pd_series = scalars_pandas_df_index[col_name].drop_duplicates(keep=keep) + pd.testing.assert_series_equal( + pd_series, + bf_series, + ) + + +@pytest.mark.parametrize( + ("col_name",), + [ + ("bool_col",), + ("int64_too",), + ], +) +def test_unique(scalars_df_index, scalars_pandas_df_index, col_name): + bf_uniq = scalars_df_index[col_name].unique().to_numpy(na_value=None) + pd_uniq = scalars_pandas_df_index[col_name].unique() + numpy.array_equal(pd_uniq, bf_uniq) + + +@pytest.mark.parametrize( + ("col_name",), + [ + ("bool_col",), + ("int64_too",), + ], +) +@pytest.mark.parametrize( + ("keep",), + [ + ("first",), + ("last",), + (False,), + ], +) +def test_duplicated(scalars_df_index, scalars_pandas_df_index, keep, col_name): + bf_series = scalars_df_index[col_name].duplicated(keep=keep).to_pandas() + pd_series = scalars_pandas_df_index[col_name].duplicated(keep=keep) + pd.testing.assert_series_equal(pd_series, bf_series, check_dtype=False) + + +def test_shape(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["string_col"].shape + pd_result = scalars_pandas_df["string_col"].shape + + assert pd_result == bf_result + + +def test_len(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = len(scalars_df["string_col"]) + pd_result = len(scalars_pandas_df["string_col"]) + + assert pd_result == bf_result + + +def test_size(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["string_col"].size + 
pd_result = scalars_pandas_df["string_col"].size + + assert pd_result == bf_result + + +def test_series_hasnans_true(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["string_col"].hasnans + pd_result = scalars_pandas_df["string_col"].hasnans + + assert pd_result == bf_result + + +def test_series_hasnans_false(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["string_col"].dropna().hasnans + pd_result = scalars_pandas_df["string_col"].dropna().hasnans + + assert pd_result == bf_result + + +def test_empty_false(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["string_col"].empty + pd_result = scalars_pandas_df["string_col"].empty + + assert pd_result == bf_result + + +def test_empty_true_row_filter(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["string_col"][ + scalars_df["string_col"] == "won't find this" + ].empty + pd_result = scalars_pandas_df["string_col"][ + scalars_pandas_df["string_col"] == "won't find this" + ].empty + + assert pd_result + assert pd_result == bf_result + + +def test_series_names(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["string_col"].copy() + bf_result.index.name = "new index name" + bf_result.name = "new series name" + + pd_result = scalars_pandas_df["string_col"].copy() + pd_result.index.name = "new index name" + pd_result.name = "new series name" + + assert pd_result.name == bf_result.name + assert pd_result.index.name == bf_result.index.name + + +def test_dtype(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["string_col"].dtype + pd_result = scalars_pandas_df["string_col"].dtype + + assert pd_result == bf_result + + +def test_dtypes(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["int64_col"].dtypes + pd_result = scalars_pandas_df["int64_col"].dtypes + + assert 
pd_result == bf_result + + +def test_head(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["string_col"].head(2).to_pandas() + pd_result = scalars_pandas_df["string_col"].head(2) + + assert_series_equal( + pd_result, + bf_result, + ) + + +def test_tail(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["string_col"].tail(2).to_pandas() + pd_result = scalars_pandas_df["string_col"].tail(2) + + assert_series_equal( + pd_result, + bf_result, + ) + + +def test_head_then_scalar_operation(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = (scalars_df["float64_col"].head(1) + 4).to_pandas() + pd_result = scalars_pandas_df["float64_col"].head(1) + 4 + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_head_then_series_operation(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = ( + scalars_df["float64_col"].head(4) + scalars_df["float64_col"].head(2) + ).to_pandas() + pd_result = scalars_pandas_df["float64_col"].head(4) + scalars_pandas_df[ + "float64_col" + ].head(2) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_series_peek(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + peek_result = scalars_df["float64_col"].peek(n=3, force=False) + + pd.testing.assert_series_equal( + peek_result, + scalars_pandas_df["float64_col"].reindex_like(peek_result), + ) + assert len(peek_result) == 3 + + +def test_series_peek_with_large_results_not_allowed(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + session = scalars_df._block.session + slot_millis_sum = session.slot_millis_sum + peek_result = scalars_df["float64_col"].peek( + n=3, force=False, allow_large_results=False + ) + + # The metrics won't be fully updated when we call query_and_wait. 
+ print(session.slot_millis_sum - slot_millis_sum) + assert session.slot_millis_sum - slot_millis_sum < 500 + pd.testing.assert_series_equal( + peek_result, + scalars_pandas_df["float64_col"].reindex_like(peek_result), + ) + assert len(peek_result) == 3 + + +def test_series_peek_multi_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_series = scalars_df.set_index(["string_col", "bool_col"])["float64_col"] + bf_series.name = ("2-part", "name") + pd_series = scalars_pandas_df.set_index(["string_col", "bool_col"])["float64_col"] + pd_series.name = ("2-part", "name") + peek_result = bf_series.peek(n=3, force=False) + pd.testing.assert_series_equal( + peek_result, + pd_series.reindex_like(peek_result), + ) + + +def test_series_peek_filtered(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + peek_result = scalars_df[scalars_df.int64_col > 0]["float64_col"].peek( + n=3, force=False + ) + pd_result = scalars_pandas_df[scalars_pandas_df.int64_col > 0]["float64_col"] + pd.testing.assert_series_equal( + peek_result, + pd_result.reindex_like(peek_result), + ) + + +def test_series_peek_force(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + + cumsum_df = scalars_df[["int64_col", "int64_too"]].cumsum() + df_filtered = cumsum_df[cumsum_df.int64_col > 0]["int64_too"] + peek_result = df_filtered.peek(n=3, force=True) + pd_cumsum_df = scalars_pandas_df[["int64_col", "int64_too"]].cumsum() + pd_result = pd_cumsum_df[pd_cumsum_df.int64_col > 0]["int64_too"] + pd.testing.assert_series_equal( + peek_result, + pd_result.reindex_like(peek_result), + ) + + +def test_series_peek_force_float(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + + cumsum_df = scalars_df[["int64_col", 
"float64_col"]].cumsum() + df_filtered = cumsum_df[cumsum_df.float64_col > 0]["float64_col"] + peek_result = df_filtered.peek(n=3, force=True) + pd_cumsum_df = scalars_pandas_df[["int64_col", "float64_col"]].cumsum() + pd_result = pd_cumsum_df[pd_cumsum_df.float64_col > 0]["float64_col"] + pd.testing.assert_series_equal( + peek_result, + pd_result.reindex_like(peek_result), + ) + + +def test_shift(scalars_df_index, scalars_pandas_df_index): + col_name = "int64_col" + bf_result = scalars_df_index[col_name].shift().to_pandas() + # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA + pd_result = scalars_pandas_df_index[col_name].shift().astype(pd.Int64Dtype()) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_series_ffill(scalars_df_index, scalars_pandas_df_index): + col_name = "numeric_col" + bf_result = scalars_df_index[col_name].ffill(limit=1).to_pandas() + pd_result = scalars_pandas_df_index[col_name].ffill(limit=1) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_series_bfill(scalars_df_index, scalars_pandas_df_index): + col_name = "numeric_col" + bf_result = scalars_df_index[col_name].bfill(limit=2).to_pandas() + pd_result = scalars_pandas_df_index[col_name].bfill(limit=2) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_cumsum_int(scalars_df_index, scalars_pandas_df_index): + if pd.__version__.startswith("1."): + pytest.skip("Series.cumsum NA mask are different in pandas 1.x.") + + col_name = "int64_col" + bf_result = scalars_df_index[col_name].cumsum().to_pandas() + # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA + pd_result = scalars_pandas_df_index[col_name].cumsum().astype(pd.Int64Dtype()) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_cumsum_int_ordered(scalars_df_index, scalars_pandas_df_index): + if pd.__version__.startswith("1."): + 
pytest.skip("Series.cumsum NA mask are different in pandas 1.x.") + + col_name = "int64_col" + bf_result = ( + scalars_df_index.sort_values(by="rowindex_2")[col_name].cumsum().to_pandas() + ) + # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA + pd_result = ( + scalars_pandas_df_index.sort_values(by="rowindex_2")[col_name] + .cumsum() + .astype(pd.Int64Dtype()) + ) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("keep",), + [ + ("first",), + ("last",), + ("all",), + ], +) +def test_series_nlargest(scalars_df_index, scalars_pandas_df_index, keep): + col_name = "bool_col" + bf_result = scalars_df_index[col_name].nlargest(4, keep=keep).to_pandas() + pd_result = scalars_pandas_df_index[col_name].nlargest(4, keep=keep) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("periods",), + [ + (1,), + (2,), + (-1,), + ], +) +def test_diff(scalars_df_index, scalars_pandas_df_index, periods): + bf_result = scalars_df_index["int64_col"].diff(periods=periods).to_pandas() + # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA + pd_result = ( + scalars_pandas_df_index["int64_col"] + .diff(periods=periods) + .astype(pd.Int64Dtype()) + ) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("periods",), + [ + (1,), + (2,), + (-1,), + ], +) +def test_series_pct_change(scalars_df_index, scalars_pandas_df_index, periods): + bf_result = scalars_df_index["int64_col"].pct_change(periods=periods).to_pandas() + # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA + pd_result = scalars_pandas_df_index["int64_col"].pct_change(periods=periods) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("keep",), + [ + ("first",), + ("last",), + ("all",), + ], +) +def 
test_series_nsmallest(scalars_df_index, scalars_pandas_df_index, keep): + col_name = "bool_col" + bf_result = scalars_df_index[col_name].nsmallest(2, keep=keep).to_pandas() + pd_result = scalars_pandas_df_index[col_name].nsmallest(2, keep=keep) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("na_option", "method", "ascending", "numeric_only", "pct"), + [ + ("keep", "average", True, True, False), + ("top", "min", False, False, True), + ("bottom", "max", False, False, False), + ("top", "first", False, False, True), + ("bottom", "dense", False, False, False), + ], +) +def test_series_rank( + scalars_df_index, + scalars_pandas_df_index, + na_option, + method, + ascending, + numeric_only, + pct, +): + col_name = "int64_too" + bf_result = ( + scalars_df_index[col_name] + .rank( + na_option=na_option, + method=method, + ascending=ascending, + numeric_only=numeric_only, + pct=pct, + ) + .to_pandas() + ) + pd_result = ( + scalars_pandas_df_index[col_name] + .rank( + na_option=na_option, + method=method, + ascending=ascending, + numeric_only=numeric_only, + pct=pct, + ) + .astype(pd.Float64Dtype()) + ) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_cast_float_to_int(scalars_df_index, scalars_pandas_df_index): + col_name = "float64_col" + bf_result = scalars_df_index[col_name].astype(pd.Int64Dtype()).to_pandas() + # cumsum does not behave well on nullable floats in pandas, produces object type and never ignores NA + pd_result = scalars_pandas_df_index[col_name].astype(pd.Int64Dtype()) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_cast_float_to_bool(scalars_df_index, scalars_pandas_df_index): + col_name = "float64_col" + bf_result = scalars_df_index[col_name].astype(pd.BooleanDtype()).to_pandas() + # cumsum does not behave well on nullable floats in pandas, produces object type and never ignores NA + pd_result = 
scalars_pandas_df_index[col_name].astype(pd.BooleanDtype()) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_cumsum_nested(scalars_df_index, scalars_pandas_df_index): + col_name = "float64_col" + bf_result = scalars_df_index[col_name].cumsum().cumsum().cumsum().to_pandas() + # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA + pd_result = ( + scalars_pandas_df_index[col_name] + .cumsum() + .cumsum() + .cumsum() + .astype(pd.Float64Dtype()) + ) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_nested_analytic_ops_align(scalars_df_index, scalars_pandas_df_index): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + col_name = "float64_col" + # set non-unique index to check implicit alignment + bf_series = scalars_df_index.set_index("bool_col")[col_name].fillna(0.0) + pd_series = scalars_pandas_df_index.set_index("bool_col")[col_name].fillna(0.0) + + bf_result = ( + (bf_series + 5) + + (bf_series.cumsum().cumsum().cumsum() + bf_series.rolling(window=3).mean()) + + bf_series.expanding().max() + ).to_pandas() + # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA + pd_result = ( + (pd_series + 5) + + ( + pd_series.cumsum().cumsum().cumsum().astype(pd.Float64Dtype()) + + pd_series.rolling(window=3).mean() + ) + + pd_series.expanding().max() + ) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_cumsum_int_filtered(scalars_df_index, scalars_pandas_df_index): + col_name = "int64_col" + + bf_col = scalars_df_index[col_name] + bf_result = bf_col[bf_col > -2].cumsum().to_pandas() + + pd_col = scalars_pandas_df_index[col_name] + # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA + pd_result = pd_col[pd_col > -2].cumsum().astype(pd.Int64Dtype()) + + 
pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_cumsum_float(scalars_df_index, scalars_pandas_df_index): + col_name = "float64_col" + bf_result = scalars_df_index[col_name].cumsum().to_pandas() + # cumsum does not behave well on nullable floats in pandas, produces object type and never ignores NA + pd_result = scalars_pandas_df_index[col_name].cumsum().astype(pd.Float64Dtype()) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_cummin_int(scalars_df_index, scalars_pandas_df_index): + col_name = "int64_col" + bf_result = scalars_df_index[col_name].cummin().to_pandas() + pd_result = scalars_pandas_df_index[col_name].cummin() + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_cummax_int(scalars_df_index, scalars_pandas_df_index): + col_name = "int64_col" + bf_result = scalars_df_index[col_name].cummax().to_pandas() + pd_result = scalars_pandas_df_index[col_name].cummax() + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("kwargs"), + [ + {}, + {"normalize": True}, + {"ascending": True}, + ], + ids=[ + "default", + "normalize", + "ascending", + ], +) +def test_value_counts(scalars_dfs, kwargs): + if pd.__version__.startswith("1."): + pytest.skip("pandas 1.x produces different column labels.") + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + + # Pandas `value_counts` can produce non-deterministic results with tied counts. + # Remove duplicates to enforce a consistent output. 
+ s = scalars_df[col_name].drop(0) + pd_s = scalars_pandas_df[col_name].drop(0) + + bf_result = s.value_counts(**kwargs).to_pandas() + pd_result = pd_s.value_counts(**kwargs) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_value_counts_with_na(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + + bf_result = scalars_df[col_name].value_counts(dropna=False).to_pandas() + pd_result = scalars_pandas_df[col_name].value_counts(dropna=False) + + # Older pandas version may not have these values, bigframes tries to emulate 2.0+ + pd_result.name = "count" + pd_result.index.name = col_name + + assert_series_equal( + bf_result, + pd_result, + # bigframes values_counts does not honor ordering in the original data + ignore_order=True, + ) + + +def test_value_counts_w_cut(scalars_dfs): + if pd.__version__.startswith("1."): + pytest.skip("value_counts results different in pandas 1.x.") + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + + bf_cut = bigframes.pandas.cut(scalars_df[col_name], 3, labels=False) + pd_cut = pd.cut(scalars_pandas_df[col_name], 3, labels=False) + + bf_result = bf_cut.value_counts().to_pandas() + pd_result = pd_cut.value_counts() + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + pd.testing.assert_series_equal( + bf_result, + pd_result.astype(pd.Int64Dtype()), + ) + + +def test_iloc_nested(scalars_df_index, scalars_pandas_df_index): + + bf_result = scalars_df_index["string_col"].iloc[1:].iloc[1:].to_pandas() + pd_result = scalars_pandas_df_index["string_col"].iloc[1:].iloc[1:] + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("start", "stop", "step"), + [ + (1, None, None), + (None, 4, None), + (None, None, 2), + (None, 50000000000, 1), + (5, 4, None), + (3, None, 2), + (1, 7, 2), + (1, 7, 50000000000), + (-1, -7, -2), + (None, -7, -2), + (-1, None, -2), + (-7, -1, 2), + (-7, -1, None), + (-7, 7, None), + (7, 
-7, -2), + ], +) +def test_series_iloc(scalars_df_index, scalars_pandas_df_index, start, stop, step): + bf_result = scalars_df_index["string_col"].iloc[start:stop:step].to_pandas() + pd_result = scalars_pandas_df_index["string_col"].iloc[start:stop:step] + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_at(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.set_index("int64_too", drop=False) + scalars_pandas_df_index = scalars_pandas_df_index.set_index("int64_too", drop=False) + index = -2345 + bf_result = scalars_df_index["string_col"].at[index] + pd_result = scalars_pandas_df_index["string_col"].at[index] + + assert bf_result == pd_result + + +def test_iat(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index["int64_too"].iat[3] + pd_result = scalars_pandas_df_index["int64_too"].iat[3] + + assert bf_result == pd_result + + +def test_iat_error(scalars_df_index, scalars_pandas_df_index): + with pytest.raises(ValueError): + scalars_pandas_df_index["int64_too"].iat["asd"] + with pytest.raises(ValueError): + scalars_df_index["int64_too"].iat["asd"] + + +def test_series_add_prefix(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index["int64_too"].add_prefix("prefix_").to_pandas() + + pd_result = scalars_pandas_df_index["int64_too"].add_prefix("prefix_") + + # Index will be object type in pandas, string type in bigframes, but same values + pd.testing.assert_series_equal( + bf_result, + pd_result, + check_index_type=False, + ) + + +def test_series_add_suffix(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index["int64_too"].add_suffix("_suffix").to_pandas() + + pd_result = scalars_pandas_df_index["int64_too"].add_suffix("_suffix") + + # Index will be object type in pandas, string type in bigframes, but same values + pd.testing.assert_series_equal( + bf_result, + pd_result, + check_index_type=False, + ) + + +def 
test_series_filter_items(scalars_df_index, scalars_pandas_df_index): + if pd.__version__.startswith("2.0") or pd.__version__.startswith("1."): + pytest.skip("pandas filter items behavior different pre-2.1") + bf_result = scalars_df_index["float64_col"].filter(items=[5, 1, 3]).to_pandas() + + pd_result = scalars_pandas_df_index["float64_col"].filter(items=[5, 1, 3]) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + # Ignore ordering as pandas order differently depending on version + assert_series_equal(bf_result, pd_result, check_names=False, ignore_order=True) + + +def test_series_filter_like(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.copy().set_index("string_col") + scalars_pandas_df_index = scalars_pandas_df_index.copy().set_index("string_col") + + bf_result = scalars_df_index["float64_col"].filter(like="ello").to_pandas() + + pd_result = scalars_pandas_df_index["float64_col"].filter(like="ello") + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_series_filter_regex(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.copy().set_index("string_col") + scalars_pandas_df_index = scalars_pandas_df_index.copy().set_index("string_col") + + bf_result = scalars_df_index["float64_col"].filter(regex="^[GH].*").to_pandas() + + pd_result = scalars_pandas_df_index["float64_col"].filter(regex="^[GH].*") + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_series_reindex(scalars_df_index, scalars_pandas_df_index): + bf_result = ( + scalars_df_index["float64_col"].reindex(index=[5, 1, 3, 99, 1]).to_pandas() + ) + + pd_result = scalars_pandas_df_index["float64_col"].reindex(index=[5, 1, 3, 99, 1]) + + # Pandas uses int64 instead of Int64 (nullable) dtype. 
+ pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_series_reindex_nonunique(scalars_df_index): + with pytest.raises(ValueError): + # int64_too is non-unique + scalars_df_index.set_index("int64_too")["float64_col"].reindex( + index=[5, 1, 3, 99, 1], validate=True + ) + + +def test_series_reindex_like(scalars_df_index, scalars_pandas_df_index): + bf_reindex_target = scalars_df_index["float64_col"].reindex(index=[5, 1, 3, 99, 1]) + bf_result = ( + scalars_df_index["int64_too"].reindex_like(bf_reindex_target).to_pandas() + ) + + pd_reindex_target = scalars_pandas_df_index["float64_col"].reindex( + index=[5, 1, 3, 99, 1] + ) + pd_result = scalars_pandas_df_index["int64_too"].reindex_like(pd_reindex_target) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_where_with_series(scalars_df_index, scalars_pandas_df_index): + bf_result = ( + scalars_df_index["int64_col"] + .where(scalars_df_index["bool_col"], scalars_df_index["int64_too"]) + .to_pandas() + ) + pd_result = scalars_pandas_df_index["int64_col"].where( + scalars_pandas_df_index["bool_col"], scalars_pandas_df_index["int64_too"] + ) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_where_with_different_indices(scalars_df_index, scalars_pandas_df_index): + bf_result = ( + scalars_df_index["int64_col"] + .iloc[::2] + .where( + scalars_df_index["bool_col"].iloc[2:], + scalars_df_index["int64_too"].iloc[:5], + ) + .to_pandas() + ) + pd_result = ( + scalars_pandas_df_index["int64_col"] + .iloc[::2] + .where( + scalars_pandas_df_index["bool_col"].iloc[2:], + scalars_pandas_df_index["int64_too"].iloc[:5], + ) + ) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_where_with_default(scalars_df_index, scalars_pandas_df_index): + bf_result 
= ( + scalars_df_index["int64_col"].where(scalars_df_index["bool_col"]).to_pandas() + ) + pd_result = scalars_pandas_df_index["int64_col"].where( + scalars_pandas_df_index["bool_col"] + ) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_where_with_callable(scalars_df_index, scalars_pandas_df_index): + def _is_positive(x): + return x > 0 + + # Both cond and other are callable. + bf_result = ( + scalars_df_index["int64_col"] + .where(cond=_is_positive, other=lambda x: x * 10) + .to_pandas() + ) + pd_result = scalars_pandas_df_index["int64_col"].where( + cond=_is_positive, other=lambda x: x * 10 + ) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_clip(scalars_df_index, scalars_pandas_df_index, ordered): + col_bf = scalars_df_index["int64_col"] + lower_bf = scalars_df_index["int64_too"] - 1 + upper_bf = scalars_df_index["int64_too"] + 1 + bf_result = col_bf.clip(lower_bf, upper_bf).to_pandas(ordered=ordered) + + col_pd = scalars_pandas_df_index["int64_col"] + lower_pd = scalars_pandas_df_index["int64_too"] - 1 + upper_pd = scalars_pandas_df_index["int64_too"] + 1 + pd_result = col_pd.clip(lower_pd, upper_pd) + + assert_series_equal(bf_result, pd_result, ignore_order=not ordered) + + +def test_clip_int_with_float_bounds(scalars_df_index, scalars_pandas_df_index): + col_bf = scalars_df_index["int64_too"] + bf_result = col_bf.clip(-100, 3.14151593).to_pandas() + + col_pd = scalars_pandas_df_index["int64_too"] + # pandas doesn't work with Int64 and clip with floats + pd_result = col_pd.astype("int64").clip(-100, 3.14151593).astype("Float64") + + assert_series_equal(bf_result, pd_result) + + +def test_clip_filtered_two_sided(scalars_df_index, scalars_pandas_df_index): + col_bf = scalars_df_index["int64_col"].iloc[::2] + lower_bf = scalars_df_index["int64_too"].iloc[2:] - 1 + upper_bf = scalars_df_index["int64_too"].iloc[:5] + 1 + 
bf_result = col_bf.clip(lower_bf, upper_bf).to_pandas() + + col_pd = scalars_pandas_df_index["int64_col"].iloc[::2] + lower_pd = scalars_pandas_df_index["int64_too"].iloc[2:] - 1 + upper_pd = scalars_pandas_df_index["int64_too"].iloc[:5] + 1 + pd_result = col_pd.clip(lower_pd, upper_pd) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_clip_filtered_one_sided(scalars_df_index, scalars_pandas_df_index): + col_bf = scalars_df_index["int64_col"].iloc[::2] + lower_bf = scalars_df_index["int64_too"].iloc[2:] - 1 + bf_result = col_bf.clip(lower_bf, None).to_pandas() + + col_pd = scalars_pandas_df_index["int64_col"].iloc[::2] + lower_pd = scalars_pandas_df_index["int64_too"].iloc[2:] - 1 + pd_result = col_pd.clip(lower_pd, None) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_dot(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["int64_too"] @ scalars_df["int64_too"] + + pd_result = scalars_pandas_df["int64_too"] @ scalars_pandas_df["int64_too"] + + assert bf_result == pd_result + + +@pytest.mark.parametrize( + ("left", "right", "inclusive"), + [ + (-234892, 55555, "left"), + (-234892, 55555, "both"), + (-234892, 55555, "neither"), + (-234892, 55555, "right"), + ], +) +def test_between(scalars_df_index, scalars_pandas_df_index, left, right, inclusive): + bf_result = ( + scalars_df_index["int64_col"].between(left, right, inclusive).to_pandas() + ) + pd_result = scalars_pandas_df_index["int64_col"].between(left, right, inclusive) + + pd.testing.assert_series_equal( + bf_result, + pd_result.astype(pd.BooleanDtype()), + ) + + +def test_series_case_when(scalars_dfs_maybe_ordered): + pytest.importorskip( + "pandas", + minversion="2.2.0", + reason="case_when added in pandas 2.2.0", + ) + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + + bf_series = scalars_df["int64_col"] + pd_series = scalars_pandas_df["int64_col"] + + # TODO(tswast): pandas case_when appears to assume 
True when a value is + # null. I suspect this should be considered a bug in pandas. + + # Generate 150 conditions to test case_when with a large number of conditions + bf_conditions = ( + [((bf_series > 645).fillna(True), bf_series - 1)] + + [((bf_series > (-100 + i * 5)).fillna(True), i) for i in range(148, 0, -1)] + + [((bf_series <= -100).fillna(True), pd.NA)] + ) + + pd_conditions = ( + [((pd_series > 645), pd_series - 1)] + + [((pd_series > (-100 + i * 5)), i) for i in range(148, 0, -1)] + + [(pd_series <= -100, pd.NA)] + ) + + assert len(bf_conditions) == 150 + + bf_result = bf_series.case_when(bf_conditions).to_pandas() + pd_result = pd_series.case_when(pd_conditions) + + pd.testing.assert_series_equal( + bf_result, + pd_result.astype(pd.Int64Dtype()), + ) + + +def test_series_case_when_change_type(scalars_dfs_maybe_ordered): + pytest.importorskip( + "pandas", + minversion="2.2.0", + reason="case_when added in pandas 2.2.0", + ) + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + + bf_series = scalars_df["int64_col"] + pd_series = scalars_pandas_df["int64_col"] + + # TODO(tswast): pandas case_when appears to assume True when a value is + # null. I suspect this should be considered a bug in pandas. + + bf_conditions = [ + ((bf_series > 645).fillna(True), scalars_df["string_col"]), + ((bf_series <= -100).fillna(True), pd.NA), + (True, "not_found"), + ] + + pd_conditions = [ + ((pd_series > 645).fillna(True), scalars_pandas_df["string_col"]), + ((pd_series <= -100).fillna(True), pd.NA), + # pandas currently fails if both the condition and the value are literals. 
+ ([True] * len(pd_series), ["not_found"] * len(pd_series)), + ] + + bf_result = bf_series.case_when(bf_conditions).to_pandas() + pd_result = pd_series.case_when(pd_conditions) + + pd.testing.assert_series_equal( + bf_result, + pd_result.astype("string[pyarrow]"), + ) + + +def test_to_frame(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["int64_col"].to_frame().to_pandas() + pd_result = scalars_pandas_df["int64_col"].to_frame() + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_to_frame_no_name(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["int64_col"].rename(None).to_frame().to_pandas() + pd_result = scalars_pandas_df["int64_col"].rename(None).to_frame() + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_to_json(gcs_folder, scalars_df_index, scalars_pandas_df_index): + path = gcs_folder + "test_series_to_json*.jsonl" + scalars_df_index["int64_col"].to_json(path, lines=True, orient="records") + gcs_df = pd.read_json(get_first_file_from_wildcard(path), lines=True) + + pd.testing.assert_series_equal( + gcs_df["int64_col"].astype(pd.Int64Dtype()), + scalars_pandas_df_index["int64_col"], + check_dtype=False, + check_index=False, + ) + + +def test_to_csv(gcs_folder, scalars_df_index, scalars_pandas_df_index): + path = gcs_folder + "test_series_to_csv*.csv" + scalars_df_index["int64_col"].to_csv(path) + gcs_df = pd.read_csv(get_first_file_from_wildcard(path)) + + pd.testing.assert_series_equal( + gcs_df["int64_col"].astype(pd.Int64Dtype()), + scalars_pandas_df_index["int64_col"], + check_dtype=False, + check_index=False, + ) + + +def test_to_latex(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index["int64_col"].to_latex() + pd_result = scalars_pandas_df_index["int64_col"].to_latex() + + assert bf_result == pd_result + + +def test_series_to_json_local_str(scalars_df_index, scalars_pandas_df_index): + bf_result = 
scalars_df_index.int64_col.to_json()
+ pd_result = scalars_pandas_df_index.int64_col.to_json()
+
+ assert bf_result == pd_result
+
+
+ def test_series_to_json_local_file(scalars_df_index, scalars_pandas_df_index):
+ # TODO: supply a reason why this isn't compatible with pandas 1.x
+ pytest.importorskip("pandas", minversion="2.0.0")
+ with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file:
+ scalars_df_index.int64_col.to_json(bf_result_file)
+ scalars_pandas_df_index.int64_col.to_json(pd_result_file)
+
+ bf_result = bf_result_file.read()
+ pd_result = pd_result_file.read()
+
+ assert bf_result == pd_result
+
+
+ def test_series_to_csv_local_str(scalars_df_index, scalars_pandas_df_index):
+ bf_result = scalars_df_index.int64_col.to_csv()
+ # default_handler for arrow types that have no default conversion
+ pd_result = scalars_pandas_df_index.int64_col.to_csv()
+
+ assert bf_result == pd_result
+
+
+ def test_series_to_csv_local_file(scalars_df_index, scalars_pandas_df_index):
+ with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file:
+ scalars_df_index.int64_col.to_csv(bf_result_file)
+ scalars_pandas_df_index.int64_col.to_csv(pd_result_file)
+
+ bf_result = bf_result_file.read()
+ pd_result = pd_result_file.read()
+
+ assert bf_result == pd_result
+
+
+ def test_to_dict(scalars_df_index, scalars_pandas_df_index):
+ bf_result = scalars_df_index["int64_too"].to_dict()
+
+ pd_result = scalars_pandas_df_index["int64_too"].to_dict()
+
+ assert bf_result == pd_result
+
+
+ def test_to_excel(scalars_df_index, scalars_pandas_df_index):
+ pytest.importorskip("openpyxl")
+ bf_result_file = tempfile.TemporaryFile()
+ pd_result_file = tempfile.TemporaryFile()
+ scalars_df_index["int64_too"].to_excel(bf_result_file)
+ scalars_pandas_df_index["int64_too"].to_excel(pd_result_file)
+ bf_result = bf_result_file.read()
+ pd_result = pd_result_file.read()
+
+ assert bf_result == pd_result
+
+
+ def
test_to_pickle(scalars_df_index, scalars_pandas_df_index):
+ bf_result_file = tempfile.TemporaryFile()
+ pd_result_file = tempfile.TemporaryFile()
+ scalars_df_index["int64_too"].to_pickle(bf_result_file)
+ scalars_pandas_df_index["int64_too"].to_pickle(pd_result_file)
+ bf_result = bf_result_file.read()
+ pd_result = pd_result_file.read()
+
+ assert bf_result == pd_result
+
+
+ def test_to_string(scalars_df_index, scalars_pandas_df_index):
+ bf_result = scalars_df_index["int64_too"].to_string()
+
+ pd_result = scalars_pandas_df_index["int64_too"].to_string()
+
+ assert bf_result == pd_result
+
+
+ def test_to_list(scalars_df_index, scalars_pandas_df_index):
+ bf_result = scalars_df_index["int64_too"].to_list()
+
+ pd_result = scalars_pandas_df_index["int64_too"].to_list()
+
+ assert bf_result == pd_result
+
+
+ def test_to_numpy(scalars_df_index, scalars_pandas_df_index):
+ bf_result = scalars_df_index["int64_too"].to_numpy()
+
+ pd_result = scalars_pandas_df_index["int64_too"].to_numpy()
+
+ assert (bf_result == pd_result).all()
+
+
+ def test_to_xarray(scalars_df_index, scalars_pandas_df_index):
+ pytest.importorskip("xarray")
+ bf_result = scalars_df_index["int64_too"].to_xarray()
+
+ pd_result = scalars_pandas_df_index["int64_too"].to_xarray()
+
+ assert bf_result.equals(pd_result)
+
+
+ def test_to_markdown(scalars_df_index, scalars_pandas_df_index):
+ bf_result = scalars_df_index["int64_too"].to_markdown()
+
+ pd_result = scalars_pandas_df_index["int64_too"].to_markdown()
+
+ assert bf_result == pd_result
+
+
+ def test_series_values(scalars_df_index, scalars_pandas_df_index):
+ bf_result = scalars_df_index["int64_too"].values
+
+ pd_result = scalars_pandas_df_index["int64_too"].values
+ # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe
+ pd.testing.assert_series_equal(
+ pd.Series(bf_result), pd.Series(pd_result), check_dtype=False
+ )
+
+
+ def test_series___array__(scalars_df_index, scalars_pandas_df_index):
+ bf_result =
scalars_df_index["float64_col"].__array__() + + pd_result = scalars_pandas_df_index["float64_col"].__array__() + # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe + numpy.array_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("ascending", "na_position"), + [ + (True, "first"), + (True, "last"), + (False, "first"), + (False, "last"), + ], +) +def test_sort_values(scalars_df_index, scalars_pandas_df_index, ascending, na_position): + # Test needs values to be unique + bf_result = ( + scalars_df_index["int64_col"] + .sort_values(ascending=ascending, na_position=na_position) + .to_pandas() + ) + pd_result = scalars_pandas_df_index["int64_col"].sort_values( + ascending=ascending, na_position=na_position + ) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_series_sort_values_inplace(scalars_df_index, scalars_pandas_df_index): + # Test needs values to be unique + bf_series = scalars_df_index["int64_col"].copy() + bf_series.sort_values(ascending=False, inplace=True) + bf_result = bf_series.to_pandas() + pd_result = scalars_pandas_df_index["int64_col"].sort_values(ascending=False) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("ascending"), + [ + (True,), + (False,), + ], +) +def test_sort_index(scalars_df_index, scalars_pandas_df_index, ascending): + bf_result = ( + scalars_df_index["int64_too"].sort_index(ascending=ascending).to_pandas() + ) + pd_result = scalars_pandas_df_index["int64_too"].sort_index(ascending=ascending) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_series_sort_index_inplace(scalars_df_index, scalars_pandas_df_index): + bf_series = scalars_df_index["int64_too"].copy() + bf_series.sort_index(ascending=False, inplace=True) + bf_result = bf_series.to_pandas() + pd_result = scalars_pandas_df_index["int64_too"].sort_index(ascending=False) + + pd.testing.assert_series_equal( + bf_result, + 
pd_result,
+ )
+
+
+ def test_mask_default_value(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+
+ bf_col = scalars_df["int64_col"]
+ bf_col_masked = bf_col.mask(bf_col % 2 == 1)
+ bf_result = bf_col.to_frame().assign(int64_col_masked=bf_col_masked).to_pandas()
+
+ pd_col = scalars_pandas_df["int64_col"]
+ pd_col_masked = pd_col.mask(pd_col % 2 == 1)
+ pd_result = pd_col.to_frame().assign(int64_col_masked=pd_col_masked)
+
+ assert_pandas_df_equal(bf_result, pd_result)
+
+
+ def test_mask_custom_value(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+
+ bf_col = scalars_df["int64_col"]
+ bf_col_masked = bf_col.mask(bf_col % 2 == 1, -1)
+ bf_result = bf_col.to_frame().assign(int64_col_masked=bf_col_masked).to_pandas()
+
+ pd_col = scalars_pandas_df["int64_col"]
+ pd_col_masked = pd_col.mask(pd_col % 2 == 1, -1)
+ pd_result = pd_col.to_frame().assign(int64_col_masked=pd_col_masked)
+
+ # TODO(shobs): There is a pd.NA value in the original series, which is not
+ # odd so should be left as is, but it is being masked in pandas.
+ # Accidentally the bigframes behavior matches, but it should be updated
+ # after the resolution of https://github.com/pandas-dev/pandas/issues/52955
+ assert_pandas_df_equal(bf_result, pd_result)
+
+
+ def test_mask_with_callable(scalars_df_index, scalars_pandas_df_index):
+ def _ten_times(x):
+ return x * 10
+
+ # Both cond and other are callable.
+ bf_result = ( + scalars_df_index["int64_col"] + .mask(cond=lambda x: x > 0, other=_ten_times) + .to_pandas() + ) + pd_result = scalars_pandas_df_index["int64_col"].mask( + cond=lambda x: x > 0, other=_ten_times + ) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("lambda_",), + [ + pytest.param(lambda x: x > 0), + pytest.param( + lambda x: True if x > 0 else False, + marks=pytest.mark.xfail( + raises=ValueError, + ), + ), + ], + ids=[ + "lambda_arithmatic", + "lambda_arbitrary", + ], +) +def test_mask_lambda(scalars_dfs, lambda_): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_col = scalars_df["int64_col"] + bf_result = bf_col.mask(lambda_).to_pandas() + + pd_col = scalars_pandas_df["int64_col"] + pd_result = pd_col.mask(lambda_) + + # ignore dtype check, which are Int64 and object respectively + assert_series_equal(bf_result, pd_result, check_dtype=False) + + +def test_mask_simple_udf(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + def foo(x): + return x < 1000000 + + bf_col = scalars_df["int64_col"] + bf_result = bf_col.mask(foo).to_pandas() + + pd_col = scalars_pandas_df["int64_col"] + pd_result = pd_col.mask(foo) + + # ignore dtype check, which are Int64 and object respectively + assert_series_equal(bf_result, pd_result, check_dtype=False) + + +@pytest.mark.parametrize("errors", ["raise", "null"]) +@pytest.mark.parametrize( + ("column", "to_type"), + [ + ("int64_col", "Float64"), + ("int64_col", "Int64"), # No-op + ("int64_col", pd.Float64Dtype()), + ("int64_col", "string[pyarrow]"), + ("int64_col", "boolean"), + ("int64_col", pd.ArrowDtype(pa.decimal128(38, 9))), + ("int64_col", pd.ArrowDtype(pa.decimal256(76, 38))), + ("int64_col", pd.ArrowDtype(pa.timestamp("us"))), + ("int64_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))), + ("int64_col", "time64[us][pyarrow]"), + ("int64_col", pd.ArrowDtype(db_dtypes.JSONArrowType())), + ("bool_col", "Int64"), + ("bool_col", "string[pyarrow]"), + 
("bool_col", "Float64"), + ("bool_col", pd.ArrowDtype(db_dtypes.JSONArrowType())), + ("string_col", "binary[pyarrow]"), + ("bytes_col", "string[pyarrow]"), + # pandas actually doesn't let folks convert to/from naive timestamp and + # raises a deprecation warning to use tz_localize/tz_convert instead, + # but BigQuery always stores values as UTC and doesn't have to deal + # with timezone conversions, so we'll allow it. + ("timestamp_col", "date32[day][pyarrow]"), + ("timestamp_col", "time64[us][pyarrow]"), + ("timestamp_col", pd.ArrowDtype(pa.timestamp("us"))), + ("datetime_col", "date32[day][pyarrow]"), + pytest.param( + "datetime_col", + "string[pyarrow]", + marks=pytest.mark.skipif( + pd.__version__.startswith("2.2"), + reason="pandas 2.2 uses T as date/time separator whereas earlier versions use space", + ), + ), + ("datetime_col", "time64[us][pyarrow]"), + ("datetime_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))), + ("date_col", "string[pyarrow]"), + ("date_col", pd.ArrowDtype(pa.timestamp("us"))), + ("date_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))), + ("time_col", "string[pyarrow]"), + # TODO(bmil): fix Ibis bug: BigQuery backend rounds to nearest int + # ("float64_col", "Int64"), + # TODO(bmil): decide whether to fix Ibis bug: BigQuery backend + # formats floats with no decimal places if they have no fractional + # part, and does not switch to scientific notation for > 10^15 + # ("float64_col", "string[pyarrow]") + # TODO(bmil): add any other compatible conversions per + # https://cloud.google.com/bigquery/docs/reference/standard-sql/conversion_functions + ], +) +def test_astype(scalars_df_index, scalars_pandas_df_index, column, to_type, errors): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + bf_result = scalars_df_index[column].astype(to_type, errors=errors).to_pandas() + pd_result = scalars_pandas_df_index[column].astype(to_type) + 
pd.testing.assert_series_equal(bf_result, pd_result)
+
+
+ def test_series_astype_python(session):
+ input = pd.Series(["hello", "world", "3.11", "4000"])
+ expected = pd.Series(
+ [None, None, 3.11, 4000],
+ dtype="Float64",
+ index=pd.Index([0, 1, 2, 3], dtype="Int64"),
+ )
+ result = session.read_pandas(input).astype(float, errors="null").to_pandas()
+ pd.testing.assert_series_equal(result, expected)
+
+
+ def test_astype_safe(session):
+ input = pd.Series(["hello", "world", "3.11", "4000"])
+ expected = pd.Series(
+ [None, None, 3.11, 4000],
+ dtype="Float64",
+ index=pd.Index([0, 1, 2, 3], dtype="Int64"),
+ )
+ result = session.read_pandas(input).astype("Float64", errors="null").to_pandas()
+ pd.testing.assert_series_equal(result, expected)
+
+
+ def test_series_astype_w_invalid_error(session):
+ input = pd.Series(["hello", "world", "3.11", "4000"])
+ with pytest.raises(ValueError):
+ session.read_pandas(input).astype("Float64", errors="bad_value")
+
+
+ def test_astype_numeric_to_int(scalars_df_index, scalars_pandas_df_index):
+ # TODO: supply a reason why this isn't compatible with pandas 1.x
+ pytest.importorskip("pandas", minversion="2.0.0")
+ column = "numeric_col"
+ to_type = "Int64"
+ bf_result = scalars_df_index[column].astype(to_type).to_pandas()
+ # Truncate to int to avoid TypeError
+ pd_result = (
+ scalars_pandas_df_index[column]
+ .apply(lambda x: None if pd.isna(x) else math.trunc(x))
+ .astype(to_type)
+ )
+ pd.testing.assert_series_equal(bf_result, pd_result)
+
+
+ @pytest.mark.parametrize(
+ ("column", "to_type"),
+ [
+ ("timestamp_col", "int64[pyarrow]"),
+ ("datetime_col", "int64[pyarrow]"),
+ ("time_col", "int64[pyarrow]"),
+ ],
+ )
+ def test_date_time_astype_int(
+ scalars_df_index, scalars_pandas_df_index, column, to_type
+ ):
+ # TODO: supply a reason why this isn't compatible with pandas 1.x
+ pytest.importorskip("pandas", minversion="2.0.0")
+ bf_result = scalars_df_index[column].astype(to_type).to_pandas()
+ pd_result =
scalars_pandas_df_index[column].astype(to_type) + pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + assert bf_result.dtype == "Int64" + + +def test_string_astype_int(): + pd_series = pd.Series(["4", "-7", "0", " -03"]) + bf_series = series.Series(pd_series) + + pd_result = pd_series.astype("Int64") + bf_result = bf_series.astype("Int64").to_pandas() + + pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + +def test_string_astype_float(): + pd_series = pd.Series( + ["1", "-1", "-0", "000", " -03.235", "naN", "-inf", "INf", ".33", "7.235e-8"] + ) + + bf_series = series.Series(pd_series) + + pd_result = pd_series.astype("Float64") + bf_result = bf_series.astype("Float64").to_pandas() + + pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + +def test_string_astype_date(): + if int(pa.__version__.split(".")[0]) < 15: + pytest.skip( + "Avoid pyarrow.lib.ArrowNotImplementedError: " + "Unsupported cast from string to date32 using function cast_date32." 
+ ) + + pd_series = pd.Series(["2014-08-15", "2215-08-15", "2016-02-29"]).astype( + pd.ArrowDtype(pa.string()) + ) + + bf_series = series.Series(pd_series) + + # TODO(b/340885567): fix type error + pd_result = pd_series.astype("date32[day][pyarrow]") # type: ignore + bf_result = bf_series.astype("date32[day][pyarrow]").to_pandas() + + pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + +def test_string_astype_datetime(): + pd_series = pd.Series( + ["2014-08-15 08:15:12", "2015-08-15 08:15:12.654754", "2016-02-29 00:00:00"] + ).astype(pd.ArrowDtype(pa.string())) + + bf_series = series.Series(pd_series) + + pd_result = pd_series.astype(pd.ArrowDtype(pa.timestamp("us"))) + bf_result = bf_series.astype(pd.ArrowDtype(pa.timestamp("us"))).to_pandas() + + pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + +def test_string_astype_timestamp(): + pd_series = pd.Series( + [ + "2014-08-15 08:15:12+00:00", + "2015-08-15 08:15:12.654754+05:00", + "2016-02-29 00:00:00+08:00", + ] + ).astype(pd.ArrowDtype(pa.string())) + + bf_series = series.Series(pd_series) + + pd_result = pd_series.astype(pd.ArrowDtype(pa.timestamp("us", tz="UTC"))) + bf_result = bf_series.astype( + pd.ArrowDtype(pa.timestamp("us", tz="UTC")) + ).to_pandas() + + pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + +def test_timestamp_astype_string(): + bf_series = series.Series( + [ + "2014-08-15 08:15:12+00:00", + "2015-08-15 08:15:12.654754+05:00", + "2016-02-29 00:00:00+08:00", + ] + ).astype(pd.ArrowDtype(pa.timestamp("us", tz="UTC"))) + + expected_result = pd.Series( + [ + "2014-08-15 08:15:12+00", + "2015-08-15 03:15:12.654754+00", + "2016-02-28 16:00:00+00", + ] + ) + bf_result = bf_series.astype(pa.string()).to_pandas() + + pd.testing.assert_series_equal( + bf_result, expected_result, check_index_type=False, check_dtype=False + ) + assert bf_result.dtype == "string[pyarrow]" + + +@pytest.mark.parametrize("errors", 
["raise", "null"]) +def test_float_astype_json(errors): + data = ["1.25", "2500000000", None, "-12323.24"] + bf_series = series.Series(data, dtype=dtypes.FLOAT_DTYPE) + + bf_result = bf_series.astype(dtypes.JSON_DTYPE, errors=errors) + assert bf_result.dtype == dtypes.JSON_DTYPE + + expected_result = pd.Series(data, dtype=dtypes.JSON_DTYPE) + expected_result.index = expected_result.index.astype("Int64") + pd.testing.assert_series_equal(bf_result.to_pandas(), expected_result) + + +def test_float_astype_json_str(): + data = ["1.25", "2500000000", None, "-12323.24"] + bf_series = series.Series(data, dtype=dtypes.FLOAT_DTYPE) + + bf_result = bf_series.astype("json") + assert bf_result.dtype == dtypes.JSON_DTYPE + + expected_result = pd.Series(data, dtype=dtypes.JSON_DTYPE) + expected_result.index = expected_result.index.astype("Int64") + pd.testing.assert_series_equal(bf_result.to_pandas(), expected_result) + + +@pytest.mark.parametrize("errors", ["raise", "null"]) +def test_string_astype_json(errors): + data = [ + "1", + None, + '["1","3","5"]', + '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}', + ] + bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE) + + bf_result = bf_series.astype(dtypes.JSON_DTYPE, errors=errors) + assert bf_result.dtype == dtypes.JSON_DTYPE + + pd_result = bf_series.to_pandas().astype(dtypes.JSON_DTYPE) + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + + +def test_string_astype_json_in_safe_mode(): + data = ["this is not a valid json string"] + bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE) + bf_result = bf_series.astype(dtypes.JSON_DTYPE, errors="null") + assert bf_result.dtype == dtypes.JSON_DTYPE + + expected = pd.Series([None], dtype=dtypes.JSON_DTYPE) + expected.index = expected.index.astype("Int64") + pd.testing.assert_series_equal(bf_result.to_pandas(), expected) + + +def test_string_astype_json_raise_error(): + data = ["this is not a valid json string"] + bf_series = series.Series(data, 
dtype=dtypes.STRING_DTYPE) + with pytest.raises( + google.api_core.exceptions.BadRequest, + match="syntax error while parsing value", + ): + bf_series.astype(dtypes.JSON_DTYPE, errors="raise").to_pandas() + + +@pytest.mark.parametrize("errors", ["raise", "null"]) +@pytest.mark.parametrize( + ("data", "to_type"), + [ + pytest.param(["1", "10.0", None], dtypes.INT_DTYPE, id="to_int"), + pytest.param(["0.0001", "2500000000", None], dtypes.FLOAT_DTYPE, id="to_float"), + pytest.param(["true", "false", None], dtypes.BOOL_DTYPE, id="to_bool"), + pytest.param(['"str"', None], dtypes.STRING_DTYPE, id="to_string"), + pytest.param( + ['"str"', None], + dtypes.TIME_DTYPE, + id="invalid", + marks=pytest.mark.xfail(raises=TypeError), + ), + ], +) +def test_json_astype_others(data, to_type, errors): + bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE) + + bf_result = bf_series.astype(to_type, errors=errors) + assert bf_result.dtype == to_type + + load_data = [json.loads(item) if item is not None else None for item in data] + expected = pd.Series(load_data, dtype=to_type) + expected.index = expected.index.astype("Int64") + pd.testing.assert_series_equal(bf_result.to_pandas(), expected) + + +@pytest.mark.parametrize( + ("data", "to_type"), + [ + pytest.param(["10.2", None], dtypes.INT_DTYPE, id="to_int"), + pytest.param(["false", None], dtypes.FLOAT_DTYPE, id="to_float"), + pytest.param(["10.2", None], dtypes.BOOL_DTYPE, id="to_bool"), + pytest.param(["true", None], dtypes.STRING_DTYPE, id="to_string"), + ], +) +def test_json_astype_others_raise_error(data, to_type): + bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE) + with pytest.raises(google.api_core.exceptions.BadRequest): + bf_series.astype(to_type, errors="raise").to_pandas() + + +@pytest.mark.parametrize( + ("data", "to_type"), + [ + pytest.param(["10.2", None], dtypes.INT_DTYPE, id="to_int"), + pytest.param(["false", None], dtypes.FLOAT_DTYPE, id="to_float"), + pytest.param(["10.2", None], dtypes.BOOL_DTYPE, 
id="to_bool"), + pytest.param(["true", None], dtypes.STRING_DTYPE, id="to_string"), + ], +) +def test_json_astype_others_in_safe_mode(data, to_type): + bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE) + bf_result = bf_series.astype(to_type, errors="null") + assert bf_result.dtype == to_type + + expected = pd.Series([None, None], dtype=to_type) + expected.index = expected.index.astype("Int64") + pd.testing.assert_series_equal(bf_result.to_pandas(), expected) + + +@pytest.mark.parametrize( + "index", + [0, 5, -2], +) +def test_iloc_single_integer(scalars_df_index, scalars_pandas_df_index, index): + bf_result = scalars_df_index.string_col.iloc[index] + pd_result = scalars_pandas_df_index.string_col.iloc[index] + + assert bf_result == pd_result + + +def test_iloc_single_integer_out_of_bound_error(scalars_df_index): + with pytest.raises(IndexError, match="single positional indexer is out-of-bounds"): + scalars_df_index.string_col.iloc[99] + + +def test_loc_bool_series_explicit_index(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.string_col.loc[scalars_df_index.bool_col].to_pandas() + pd_result = scalars_pandas_df_index.string_col.loc[scalars_pandas_df_index.bool_col] + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_loc_bool_series_default_index( + scalars_df_default_index, scalars_pandas_df_default_index +): + bf_result = scalars_df_default_index.string_col.loc[ + scalars_df_default_index.bool_col + ].to_pandas() + pd_result = scalars_pandas_df_default_index.string_col.loc[ + scalars_pandas_df_default_index.bool_col + ] + + assert_pandas_df_equal( + bf_result.to_frame(), + pd_result.to_frame(), + ) + + +def test_argmin(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.string_col.argmin() + pd_result = scalars_pandas_df_index.string_col.argmin() + assert bf_result == pd_result + + +def test_argmax(scalars_df_index, scalars_pandas_df_index): + bf_result = 
scalars_df_index.int64_too.argmax() + pd_result = scalars_pandas_df_index.int64_too.argmax() + assert bf_result == pd_result + + +def test_series_idxmin(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.string_col.idxmin() + pd_result = scalars_pandas_df_index.string_col.idxmin() + assert bf_result == pd_result + + +def test_series_idxmax(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.int64_too.idxmax() + pd_result = scalars_pandas_df_index.int64_too.idxmax() + assert bf_result == pd_result + + +def test_getattr_attribute_error_when_pandas_has(scalars_df_index): + # asof is implemented in pandas but not in bigframes + with pytest.raises(AttributeError): + scalars_df_index.string_col.asof() + + +def test_getattr_attribute_error(scalars_df_index): + with pytest.raises(AttributeError): + scalars_df_index.string_col.not_a_method() + + +def test_rename(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.string_col.rename("newname") + pd_result = scalars_pandas_df_index.string_col.rename("newname") + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_rename_nonstring(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.string_col.rename((4, 2)) + pd_result = scalars_pandas_df_index.string_col.rename((4, 2)) + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_rename_dict_same_type(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.string_col.rename({1: 100, 2: 200}) + pd_result = scalars_pandas_df_index.string_col.rename({1: 100, 2: 200}) + + pd_result.index = pd_result.index.astype("Int64") + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_rename_axis(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.string_col.rename_axis("newindexname") + pd_result = 
scalars_pandas_df_index.string_col.rename_axis("newindexname") + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_list_string_index(scalars_df_index, scalars_pandas_df_index): + index_list = scalars_pandas_df_index.string_col.iloc[[0, 1, 1, 5]].values + + scalars_df_index = scalars_df_index.set_index("string_col", drop=False) + scalars_pandas_df_index = scalars_pandas_df_index.set_index( + "string_col", drop=False + ) + + bf_result = scalars_df_index.string_col.loc[index_list] + pd_result = scalars_pandas_df_index.string_col.loc[index_list] + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_list_integer_index(scalars_df_index, scalars_pandas_df_index): + index_list = [3, 2, 1, 3, 2, 1] + + bf_result = scalars_df_index.bool_col.loc[index_list] + pd_result = scalars_pandas_df_index.bool_col.loc[index_list] + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_list_multiindex(scalars_df_index, scalars_pandas_df_index): + scalars_df_multiindex = scalars_df_index.set_index(["string_col", "int64_col"]) + scalars_pandas_df_multiindex = scalars_pandas_df_index.set_index( + ["string_col", "int64_col"] + ) + index_list = [("Hello, World!", -234892), ("Hello, World!", 123456789)] + + bf_result = scalars_df_multiindex.int64_too.loc[index_list] + pd_result = scalars_pandas_df_multiindex.int64_too.loc[index_list] + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_iloc_list(scalars_df_index, scalars_pandas_df_index): + index_list = [0, 0, 0, 5, 4, 7] + + bf_result = scalars_df_index.string_col.iloc[index_list] + pd_result = scalars_pandas_df_index.string_col.iloc[index_list] + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_iloc_list_nameless(scalars_df_index, scalars_pandas_df_index): + index_list = [0, 0, 0, 5, 4, 7] + + bf_series = 
scalars_df_index.string_col.rename(None) + bf_result = bf_series.iloc[index_list] + pd_series = scalars_pandas_df_index.string_col.rename(None) + pd_result = pd_series.iloc[index_list] + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_list_nameless(scalars_df_index, scalars_pandas_df_index): + index_list = [0, 0, 0, 5, 4, 7] + + bf_series = scalars_df_index.string_col.rename(None) + bf_result = bf_series.loc[index_list] + + pd_series = scalars_pandas_df_index.string_col.rename(None) + pd_result = pd_series.loc[index_list] + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_bf_series_string_index(scalars_df_index, scalars_pandas_df_index): + pd_string_series = scalars_pandas_df_index.string_col.iloc[[0, 5, 1, 1, 5]] + bf_string_series = scalars_df_index.string_col.iloc[[0, 5, 1, 1, 5]] + + scalars_df_index = scalars_df_index.set_index("string_col") + scalars_pandas_df_index = scalars_pandas_df_index.set_index("string_col") + + bf_result = scalars_df_index.date_col.loc[bf_string_series] + pd_result = scalars_pandas_df_index.date_col.loc[pd_string_series] + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_bf_series_multiindex(scalars_df_index, scalars_pandas_df_index): + pd_string_series = scalars_pandas_df_index.string_col.iloc[[0, 5, 1, 1, 5]] + bf_string_series = scalars_df_index.string_col.iloc[[0, 5, 1, 1, 5]] + + scalars_df_multiindex = scalars_df_index.set_index(["string_col", "int64_col"]) + scalars_pandas_df_multiindex = scalars_pandas_df_index.set_index( + ["string_col", "int64_col"] + ) + + bf_result = scalars_df_multiindex.int64_too.loc[bf_string_series] + pd_result = scalars_pandas_df_multiindex.int64_too.loc[pd_string_series] + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_bf_index_integer_index(scalars_df_index, scalars_pandas_df_index): + pd_index = 
scalars_pandas_df_index.iloc[[0, 5, 1, 1, 5]].index + bf_index = scalars_df_index.iloc[[0, 5, 1, 1, 5]].index + + bf_result = scalars_df_index.date_col.loc[bf_index] + pd_result = scalars_pandas_df_index.date_col.loc[pd_index] + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_single_index_with_duplicate(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.set_index("string_col", drop=False) + scalars_pandas_df_index = scalars_pandas_df_index.set_index( + "string_col", drop=False + ) + index = "Hello, World!" + bf_result = scalars_df_index.date_col.loc[index] + pd_result = scalars_pandas_df_index.date_col.loc[index] + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_single_index_no_duplicate(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.set_index("int64_too", drop=False) + scalars_pandas_df_index = scalars_pandas_df_index.set_index("int64_too", drop=False) + index = -2345 + bf_result = scalars_df_index.date_col.loc[index] + pd_result = scalars_pandas_df_index.date_col.loc[index] + assert bf_result == pd_result + + +def test_series_bool_interpretation_error(scalars_df_index): + with pytest.raises(ValueError): + True if scalars_df_index["string_col"] else False + + +def test_query_job_setters(scalars_dfs): + # if allow_large_results=False, might not create query job + with bigframes.option_context("compute.allow_large_results", True): + job_ids = set() + df, _ = scalars_dfs + series = df["int64_col"] + assert series.query_job is not None + repr(series) + job_ids.add(series.query_job.job_id) + series.to_pandas() + job_ids.add(series.query_job.job_id) + assert len(job_ids) == 2 + + +@pytest.mark.parametrize( + ("series_input",), + [ + ([1, 2, 3, 4, 5],), + ([1, 1, 3, 5, 5],), + ([1, pd.NA, 4, 5, 5],), + ([1, 3, 2, 5, 4],), + ([pd.NA, pd.NA],), + ([1, 1, 1, 1, 1],), + ], +) +def 
test_is_monotonic_increasing(series_input): + scalars_df = series.Series(series_input, dtype=pd.Int64Dtype()) + scalars_pandas_df = pd.Series(series_input, dtype=pd.Int64Dtype()) + assert ( + scalars_df.is_monotonic_increasing == scalars_pandas_df.is_monotonic_increasing + ) + + +@pytest.mark.parametrize( + ("series_input",), + [ + ([1],), + ([5, 4, 3, 2, 1],), + ([5, 5, 3, 1, 1],), + ([1, pd.NA, 4, 5, 5],), + ([5, pd.NA, 4, 2, 1],), + ([1, 1, 1, 1, 1],), + ], +) +def test_is_monotonic_decreasing(series_input): + scalars_df = series.Series(series_input) + scalars_pandas_df = pd.Series(series_input) + assert ( + scalars_df.is_monotonic_decreasing == scalars_pandas_df.is_monotonic_decreasing + ) + + +def test_map_dict_input(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + local_map = dict() + # construct a local map, incomplete to cover behavior + for s in scalars_pandas_df.string_col[:-3]: + if isinstance(s, str): + local_map[s] = ord(s[0]) + + pd_result = scalars_pandas_df.string_col.map(local_map) + pd_result = pd_result.astype("Int64") # pandas type differences + bf_result = scalars_df.string_col.map(local_map) + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_map_series_input(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + new_index = scalars_pandas_df.int64_too.drop_duplicates() + pd_map_series = scalars_pandas_df.string_col.iloc[0 : len(new_index)] + pd_map_series.index = new_index + bf_map_series = series.Series( + pd_map_series, session=scalars_df._get_block().expr.session + ) + + pd_result = scalars_pandas_df.int64_too.map(pd_map_series) + bf_result = scalars_df.int64_too.map(bf_map_series) + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_map_series_input_duplicates_error(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + new_index = scalars_pandas_df.int64_too + pd_map_series = scalars_pandas_df.string_col.iloc[0 : 
len(new_index)] + pd_map_series.index = new_index + bf_map_series = series.Series( + pd_map_series, session=scalars_df._get_block().expr.session + ) + + with pytest.raises(pd.errors.InvalidIndexError): + scalars_pandas_df.int64_too.map(pd_map_series) + with pytest.raises(pd.errors.InvalidIndexError): + scalars_df.int64_too.map(bf_map_series, verify_integrity=True) + + +@pytest.mark.parametrize( + ("frac", "n", "random_state"), + [ + (None, 4, None), + (0.5, None, None), + (None, 4, 10), + (0.5, None, 10), + (None, None, None), + ], + ids=[ + "n_wo_random_state", + "frac_wo_random_state", + "n_w_random_state", + "frac_w_random_state", + "n_default", + ], +) +def test_sample(scalars_dfs, frac, n, random_state): + scalars_df, _ = scalars_dfs + df = scalars_df.int64_col.sample(frac=frac, n=n, random_state=random_state) + bf_result = df.to_pandas() + + n = 1 if n is None else n + expected_sample_size = round(frac * scalars_df.shape[0]) if frac is not None else n + assert bf_result.shape[0] == expected_sample_size + + +def test_series_iter( + scalars_df_index, + scalars_pandas_df_index, +): + for bf_i, pd_i in zip( + scalars_df_index["int64_too"], scalars_pandas_df_index["int64_too"] + ): + assert bf_i == pd_i + + +@pytest.mark.parametrize( + ( + "col", + "lambda_", + ), + [ + pytest.param("int64_col", lambda x: x * x + x + 1), + pytest.param("int64_col", lambda x: x % 2 == 1), + pytest.param("string_col", lambda x: x + "_suffix"), + ], + ids=[ + "lambda_int_int", + "lambda_int_bool", + "lambda_str_str", + ], +) +def test_apply_lambda(scalars_dfs, col, lambda_): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_col = scalars_df[col] + + # Can't be applied to BigFrames Series without by_row=False + with pytest.raises(ValueError, match="by_row=False"): + bf_col.apply(lambda_) + + bf_result = bf_col.apply(lambda_, by_row=False).to_pandas() + + pd_col = scalars_pandas_df[col] + if pd.__version__[:3] in ("2.2", "2.3"): + pd_result = pd_col.apply(lambda_, by_row=False) + 
else: + pd_result = pd_col.apply(lambda_) + + # ignore dtype check, which are Int64 and object respectively + # Some columns implicitly convert to floating point. Use check_exact=False to ensure we're "close enough" + assert_series_equal( + bf_result, pd_result, check_dtype=False, check_exact=False, rtol=0.001 + ) + + +@pytest.mark.parametrize( + ("ufunc",), + [ + pytest.param(numpy.log), + pytest.param(numpy.sqrt), + pytest.param(numpy.sin), + ], + ids=[ + "log", + "sqrt", + "sin", + ], +) +def test_apply_numpy_ufunc(scalars_dfs, ufunc): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_col = scalars_df["int64_col"] + + # Can't be applied to BigFrames Series without by_row=False + with pytest.raises(ValueError, match="by_row=False"): + bf_col.apply(ufunc) + + bf_result = bf_col.apply(ufunc, by_row=False).to_pandas() + + pd_col = scalars_pandas_df["int64_col"] + pd_result = pd_col.apply(ufunc) + + assert_series_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("ufunc",), + [ + pytest.param(numpy.add), + pytest.param(numpy.divide), + ], + ids=[ + "add", + "divide", + ], +) +def test_combine_series_ufunc(scalars_dfs, ufunc): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_col = scalars_df["int64_col"].dropna() + bf_result = bf_col.combine(bf_col, ufunc).to_pandas() + + pd_col = scalars_pandas_df["int64_col"].dropna() + pd_result = pd_col.combine(pd_col, ufunc) + + assert_series_equal(bf_result, pd_result, check_dtype=False) + + +def test_combine_scalar_ufunc(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_col = scalars_df["int64_col"].dropna() + bf_result = bf_col.combine(2.5, numpy.add).to_pandas() + + pd_col = scalars_pandas_df["int64_col"].dropna() + pd_result = pd_col.combine(2.5, numpy.add) + + assert_series_equal(bf_result, pd_result, check_dtype=False) + + +def test_apply_simple_udf(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + def foo(x): + return x * x + 2 * x + 3 + + bf_col = scalars_df["int64_col"] + 
+ # Can't be applied to BigFrames Series without by_row=False + with pytest.raises(ValueError, match="by_row=False"): + bf_col.apply(foo) + + bf_result = bf_col.apply(foo, by_row=False).to_pandas() + + pd_col = scalars_pandas_df["int64_col"] + + if pd.__version__[:3] in ("2.2", "2.3"): + pd_result = pd_col.apply(foo, by_row=False) + else: + pd_result = pd_col.apply(foo) + + # ignore dtype check, which are Int64 and object respectively + # Some columns implicitly convert to floating point. Use check_exact=False to ensure we're "close enough" + assert_series_equal( + bf_result, pd_result, check_dtype=False, check_exact=False, rtol=0.001 + ) + + +@pytest.mark.parametrize( + ("col", "lambda_", "exception"), + [ + pytest.param("int64_col", {1: 2, 3: 4}, ValueError), + pytest.param("int64_col", numpy.square, TypeError), + pytest.param("string_col", lambda x: x.capitalize(), AttributeError), + ], + ids=[ + "not_callable", + "numpy_ufunc", + "custom_lambda", + ], +) +def test_apply_not_supported(scalars_dfs, col, lambda_, exception): + scalars_df, _ = scalars_dfs + + bf_col = scalars_df[col] + with pytest.raises(exception): + bf_col.apply(lambda_, by_row=False) + + +def test_series_pipe( + scalars_df_index, + scalars_pandas_df_index, +): + column = "int64_too" + + def foo(x: int, y: int, df): + return (df + x) % y + + bf_result = ( + scalars_df_index[column] + .pipe((foo, "df"), x=7, y=9) + .pipe(lambda x: x**2) + .to_pandas() + ) + + pd_result = ( + scalars_pandas_df_index[column] + .pipe((foo, "df"), x=7, y=9) + .pipe(lambda x: x**2) + ) + + assert_series_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("data"), + [ + pytest.param([1, 2, 3], id="int"), + pytest.param([[1, 2, 3], [], numpy.nan, [3, 4]], id="int_array"), + pytest.param( + [["A", "AA", "AAA"], ["BB", "B"], numpy.nan, [], ["C"]], id="string_array" + ), + pytest.param( + [ + {"A": {"x": 1.0}, "B": "b"}, + {"A": {"y": 2.0}, "B": "bb"}, + {"A": {"z": 4.0}}, + {}, + numpy.nan, + ], + 
id="struct_array", + ), + ], +) +def test_series_explode(data): + s = bigframes.pandas.Series(data) + pd_s = s.to_pandas() + pd.testing.assert_series_equal( + s.explode().to_pandas(), + pd_s.explode(), + check_index_type=False, + check_dtype=False, + ) + + +@pytest.mark.parametrize( + ("index", "ignore_index"), + [ + pytest.param(None, True, id="default_index"), + pytest.param(None, False, id="ignore_default_index"), + pytest.param([5, 1, 3, 2], True, id="unordered_index"), + pytest.param([5, 1, 3, 2], False, id="ignore_unordered_index"), + pytest.param(["z", "x", "a", "b"], True, id="str_index"), + pytest.param(["z", "x", "a", "b"], False, id="ignore_str_index"), + pytest.param( + pd.Index(["z", "x", "a", "b"], name="idx"), True, id="str_named_index" + ), + pytest.param( + pd.Index(["z", "x", "a", "b"], name="idx"), + False, + id="ignore_str_named_index", + ), + pytest.param( + pd.MultiIndex.from_frame( + pd.DataFrame({"idx0": [5, 1, 3, 2], "idx1": ["z", "x", "a", "b"]}) + ), + True, + id="multi_index", + ), + pytest.param( + pd.MultiIndex.from_frame( + pd.DataFrame({"idx0": [5, 1, 3, 2], "idx1": ["z", "x", "a", "b"]}) + ), + False, + id="ignore_multi_index", + ), + ], +) +def test_series_explode_w_index(index, ignore_index): + data = [[], [200.0, 23.12], [4.5, -9.0], [1.0]] + s = bigframes.pandas.Series(data, index=index) + pd_s = pd.Series(data, index=index) + # TODO(b/340885567): fix type error + pd.testing.assert_series_equal( + s.explode(ignore_index=ignore_index).to_pandas(), # type: ignore + pd_s.explode(ignore_index=ignore_index).astype(pd.Float64Dtype()), # type: ignore + check_index_type=False, + ) + + +@pytest.mark.parametrize( + ("ignore_index", "ordered"), + [ + pytest.param(True, True, id="include_index_ordered"), + pytest.param(True, False, id="include_index_unordered"), + pytest.param(False, True, id="ignore_index_ordered"), + ], +) +def test_series_explode_reserve_order(ignore_index, ordered): + data = [numpy.random.randint(0, 10, 10) for _ in 
range(10)] + s = bigframes.pandas.Series(data) + pd_s = pd.Series(data) + + # TODO(b/340885567): fix type error + res = s.explode(ignore_index=ignore_index).to_pandas(ordered=ordered) # type: ignore + # TODO(b/340885567): fix type error + pd_res = pd_s.explode(ignore_index=ignore_index).astype(pd.Int64Dtype()) # type: ignore + pd_res.index = pd_res.index.astype(pd.Int64Dtype()) + pd.testing.assert_series_equal( + res if ordered else res.sort_index(), + pd_res, + ) + + +def test_series_explode_w_aggregate(): + data = [[1, 2, 3], [], numpy.nan, [3, 4]] + s = bigframes.pandas.Series(data) + pd_s = pd.Series(data) + assert s.explode().sum() == pd_s.explode().sum() + + +def test_series_construct_empty_array(): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + s = bigframes.pandas.Series([[]]) + expected = pd.Series( + [[]], + dtype=pd.ArrowDtype(pa.list_(pa.float64())), + index=pd.Index([0], dtype=pd.Int64Dtype()), + ) + pd.testing.assert_series_equal( + expected, + s.to_pandas(), + ) + + +@pytest.mark.parametrize( + ("data"), + [ + pytest.param(numpy.nan, id="null"), + pytest.param([numpy.nan], id="null_array"), + pytest.param([[]], id="empty_array"), + pytest.param([numpy.nan, []], id="null_and_empty_array"), + ], +) +def test_series_explode_null(data): + s = bigframes.pandas.Series(data) + pd.testing.assert_series_equal( + s.explode().to_pandas(), + s.to_pandas().explode(), + check_dtype=False, + ) + + +@pytest.mark.parametrize( + ("append", "level", "col", "rule"), + [ + pytest.param(False, None, "timestamp_col", "75D"), + pytest.param(True, 1, "timestamp_col", "25W"), + pytest.param(False, None, "datetime_col", "3ME"), + pytest.param(True, "timestamp_col", "timestamp_col", "1YE"), + ], +) +def test__resample(scalars_df_index, scalars_pandas_df_index, append, level, col, rule): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") 
+ scalars_df_index = scalars_df_index.set_index(col, append=append)["int64_col"] + scalars_pandas_df_index = scalars_pandas_df_index.set_index(col, append=append)[ + "int64_col" + ] + bf_result = scalars_df_index._resample(rule=rule, level=level).min().to_pandas() + pd_result = scalars_pandas_df_index.resample(rule=rule, level=level).min() + pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_series_struct_get_field_by_attribute( + nested_structs_df, nested_structs_pandas_df +): + if Version(pd.__version__) < Version("2.2.0"): + pytest.skip("struct accessor is not supported before pandas 2.2") + + bf_series = nested_structs_df["person"] + df_series = nested_structs_pandas_df["person"] + + pd.testing.assert_series_equal( + bf_series.address.city.to_pandas(), + df_series.struct.field("address").struct.field("city"), + check_dtype=False, + check_index=False, + ) + pd.testing.assert_series_equal( + bf_series.address.country.to_pandas(), + df_series.struct.field("address").struct.field("country"), + check_dtype=False, + check_index=False, + ) + + +def test_series_struct_fields_in_dir(nested_structs_df): + series = nested_structs_df["person"] + + assert "age" in dir(series) + assert "address" in dir(series) + assert "city" in dir(series.address) + assert "country" in dir(series.address) + + +def test_series_struct_class_attributes_shadow_struct_fields(nested_structs_df): + series = nested_structs_df["person"] + + assert series.name == "person" + + +def test_series_to_pandas_dry_run(scalars_df_index): + bf_series = scalars_df_index["int64_col"] + + result = bf_series.to_pandas(dry_run=True) + + assert isinstance(result, pd.Series) + assert len(result) > 0 + + +def test_series_item(session): + # Test with a single item + bf_s_single = bigframes.pandas.Series([42], session=session) + pd_s_single = pd.Series([42]) + assert bf_s_single.item() == pd_s_single.item() + + +def test_series_item_with_multiple(session): + # Test with multiple items + bf_s_multiple = 
bigframes.pandas.Series([1, 2, 3], session=session) + pd_s_multiple = pd.Series([1, 2, 3]) + + try: + pd_s_multiple.item() + except ValueError as e: + expected_message = str(e) + else: + raise AssertionError("Expected ValueError from pandas, but didn't get one") + + with pytest.raises(ValueError, match=re.escape(expected_message)): + bf_s_multiple.item() + + +def test_series_item_with_empty(session): + # Test with an empty Series + bf_s_empty = bigframes.pandas.Series([], dtype="Int64", session=session) + pd_s_empty = pd.Series([], dtype="Int64") + + try: + pd_s_empty.item() + except ValueError as e: + expected_message = str(e) + else: + raise AssertionError("Expected ValueError from pandas, but didn't get one") + + with pytest.raises(ValueError, match=re.escape(expected_message)): + bf_s_empty.item() From 765b678b34a7976aef1017d2a1fdb34d7a4cfbe4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 21:38:20 +0000 Subject: [PATCH 20/63] restore a test --- tests/unit/test_series_polars.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_series_polars.py b/tests/unit/test_series_polars.py index 8c24a28f43..d26bdd93d2 100644 --- a/tests/unit/test_series_polars.py +++ b/tests/unit/test_series_polars.py @@ -376,9 +376,9 @@ def test_series_construct_w_dtype_for_array_struct(): ) -def test_series_construct_local_unordered_has_sequential_index(unordered_session): +def test_series_construct_local_unordered_has_sequential_index(session): series = bigframes.pandas.Series( - ["Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"], session=unordered_session + ["Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"], session=session ) expected: pd.Index = pd.Index([0, 1, 2, 3, 4, 5, 6], dtype=pd.Int64Dtype()) pd.testing.assert_index_equal(series.index.to_pandas(), expected) From 4aa47a865899292e930c33e015ee92d5c35919f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 21:39:19 +0000 
Subject: [PATCH 21/63] Revert "restore a test" This reverts commit 765b678b34a7976aef1017d2a1fdb34d7a4cfbe4. --- tests/unit/test_series_polars.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_series_polars.py b/tests/unit/test_series_polars.py index d26bdd93d2..8c24a28f43 100644 --- a/tests/unit/test_series_polars.py +++ b/tests/unit/test_series_polars.py @@ -376,9 +376,9 @@ def test_series_construct_w_dtype_for_array_struct(): ) -def test_series_construct_local_unordered_has_sequential_index(session): +def test_series_construct_local_unordered_has_sequential_index(unordered_session): series = bigframes.pandas.Series( - ["Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"], session=session + ["Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"], session=unordered_session ) expected: pd.Index = pd.Index([0, 1, 2, 3, 4, 5, 6], dtype=pd.Int64Dtype()) pd.testing.assert_index_equal(series.index.to_pandas(), expected) From f75f5bf46c10b9da4d89f763d0a0a0c9b749084b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 22:10:43 +0000 Subject: [PATCH 22/63] skip null --- tests/unit/test_series_polars.py | 42 ++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/tests/unit/test_series_polars.py b/tests/unit/test_series_polars.py index 8c24a28f43..b7a2d17022 100644 --- a/tests/unit/test_series_polars.py +++ b/tests/unit/test_series_polars.py @@ -75,6 +75,20 @@ def scalars_pandas_df_index() -> pd.DataFrame: return df.set_index("rowindex").sort_index() +@pytest.fixture(scope="module") +def scalars_df_default_index( + session: bigframes.Session, scalars_pandas_df_index +) -> bpd.DataFrame: + return session.read_pandas(scalars_pandas_df_index).reset_index(drop=False) + + +@pytest.fixture(scope="module") +def scalars_df_2_default_index( + session: bigframes.Session, scalars_pandas_df_index +) -> bpd.DataFrame: + return 
session.read_pandas(scalars_pandas_df_index).reset_index(drop=False) + + @pytest.fixture(scope="module") def scalars_df_index( session: bigframes.Session, scalars_pandas_df_index @@ -376,9 +390,9 @@ def test_series_construct_w_dtype_for_array_struct(): ) -def test_series_construct_local_unordered_has_sequential_index(unordered_session): +def test_series_construct_local_unordered_has_sequential_index(session): series = bigframes.pandas.Series( - ["Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"], session=unordered_session + ["Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"], session=session ) expected: pd.Index = pd.Index([0, 1, 2, 3, 4, 5, 6], dtype=pd.Int64Dtype()) pd.testing.assert_index_equal(series.index.to_pandas(), expected) @@ -469,13 +483,6 @@ def test_get_column(scalars_dfs, col_name, expected_dtype): assert series_pandas.shape[0] == scalars_pandas_df.shape[0] -def test_get_column_w_json(json_df, json_pandas_df): - series = json_df["json_col"] - series_pandas = series.to_pandas() - assert series.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType()) - assert series_pandas.shape[0] == json_pandas_df.shape[0] - - def test_series_get_column_default(scalars_dfs): scalars_df, _ = scalars_dfs result = scalars_df.get(123123123123123, "default_val") @@ -1062,7 +1069,22 @@ def test_series_pow_scalar_reverse(scalars_dfs): "xor", ], ) -@pytest.mark.parametrize(("other_scalar"), [True, False, pd.NA]) +@pytest.mark.parametrize( + ("other_scalar"), + [ + True, + False, + pytest.param( + pd.NA, + marks=[ + pytest.mark.skip( + reason="https://github.com/pola-rs/polars/issues/24809" + ) + ], + id="NULL", + ), + ], +) @pytest.mark.parametrize(("reverse_operands"), [True, False]) def test_series_bool_bool_operators_scalar( scalars_dfs, operator, other_scalar, reverse_operands From a7058acefe8abb6927a6ca59a42b86f4149ce70f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 22:29:43 +0000 Subject: [PATCH 23/63] skip unsupported tests --- 
tests/unit/test_series_polars.py | 123 ++++++++++++++++++++++++++++++- 1 file changed, 119 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_series_polars.py b/tests/unit/test_series_polars.py index b7a2d17022..3eb1a3c095 100644 --- a/tests/unit/test_series_polars.py +++ b/tests/unit/test_series_polars.py @@ -1579,6 +1579,7 @@ def test_isin_bigframes_index(scalars_dfs, session): ) +@pytest.mark.skip(reason="fixture 'scalars_dfs_maybe_ordered' not found") @pytest.mark.parametrize( ( "col_name", @@ -1946,14 +1947,42 @@ def test_mean(scalars_dfs): @pytest.mark.parametrize( ("col_name"), [ - "int64_col", + pytest.param( + "int64_col", + marks=[ + pytest.mark.skip( + reason="pyarrow.lib.ArrowInvalid: Float value 27778.500000 was truncated converting to int64" + ) + ], + ), # Non-numeric column - "bytes_col", + pytest.param( + "bytes_col", + marks=[ + pytest.mark.skip( + reason="polars.exceptions.InvalidOperationError: `median` operation not supported for dtype `binary`" + ) + ], + ), "date_col", "datetime_col", - "time_col", + pytest.param( + "time_col", + marks=[ + pytest.mark.skip( + reason="pyarrow.lib.ArrowInvalid: Casting from time64[ns] to time64[us] would lose data: 42651538080500" + ) + ], + ), "timestamp_col", - "string_col", + pytest.param( + "string_col", + marks=[ + pytest.mark.skip( + reason="polars.exceptions.InvalidOperationError: `median` operation not supported for dtype `str`" + ) + ], + ), ], ) def test_median(scalars_dfs, col_name): @@ -2146,6 +2175,9 @@ def test_groupby_mean(scalars_dfs): ) +@pytest.mark.skip( + reason="Aggregate op QuantileOp(q=0.5, should_floor_result=False) not yet supported in polars engine." 
+) def test_groupby_median_exact(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_too" @@ -2164,6 +2196,9 @@ def test_groupby_median_exact(scalars_dfs): ) +@pytest.mark.skip( + reason="pyarrow.lib.ArrowInvalid: Float value -1172.500000 was truncated converting to int64" +) def test_groupby_median_inexact(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_too" @@ -2204,6 +2239,7 @@ def test_groupby_prod(scalars_dfs): ) +@pytest.mark.skip(reason="AssertionError: Series are different") @pytest.mark.parametrize( ("operator"), [ @@ -2270,6 +2306,7 @@ def test_drop_label_list(scalars_df_index, scalars_pandas_df_index): ) +@pytest.mark.skip(reason="AssertionError: Series.index are different") @pytest.mark.parametrize( ("col_name",), [ @@ -2294,6 +2331,7 @@ def test_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, col_na ) +@pytest.mark.skip(reason="TypeError: boolean value of NA is ambiguous") @pytest.mark.parametrize( ("col_name",), [ @@ -2307,6 +2345,7 @@ def test_unique(scalars_df_index, scalars_pandas_df_index, col_name): numpy.array_equal(pd_uniq, bf_uniq) +@pytest.mark.skip(reason="AssertionError: Series are different") @pytest.mark.parametrize( ("col_name",), [ @@ -2639,6 +2678,9 @@ def test_cumsum_int_ordered(scalars_df_index, scalars_pandas_df_index): ) +@pytest.mark.skip( + reason="NotImplementedError: Aggregate op RankOp() not yet supported in polars engine." +) @pytest.mark.parametrize( ("keep",), [ @@ -2700,6 +2742,9 @@ def test_series_pct_change(scalars_df_index, scalars_pandas_df_index, periods): ) +@pytest.mark.skip( + reason="NotImplementedError: Aggregate op RankOp() not yet supported in polars engine." +) @pytest.mark.parametrize( ("keep",), [ @@ -2719,6 +2764,9 @@ def test_series_nsmallest(scalars_df_index, scalars_pandas_df_index, keep): ) +@pytest.mark.skip( + reason="NotImplementedError: Aggregate op DenseRankOp() not yet supported in polars engine." 
+) @pytest.mark.parametrize( ("na_option", "method", "ascending", "numeric_only", "pct"), [ @@ -2810,6 +2858,9 @@ def test_cumsum_nested(scalars_df_index, scalars_pandas_df_index): ) +@pytest.mark.skip( + reason="NotImplementedError: min_period not yet supported for polars engine" +) def test_nested_analytic_ops_align(scalars_df_index, scalars_pandas_df_index): # TODO: supply a reason why this isn't compatible with pandas 1.x pytest.importorskip("pandas", minversion="2.0.0") @@ -2941,6 +2992,9 @@ def test_value_counts_with_na(scalars_dfs): ) +@pytest.mark.skip( + reason="NotImplementedError: Aggregate op CutOp(bins=3, right=True, labels=False) not yet supported in polars engine." +) def test_value_counts_w_cut(scalars_dfs): if pd.__version__.startswith("1."): pytest.skip("value_counts results different in pandas 1.x.") @@ -3208,6 +3262,9 @@ def _is_positive(x): ) +@pytest.mark.skip( + reason="NotImplementedError: Polars compiler hasn't implemented ClipOp()" +) @pytest.mark.parametrize( ("ordered"), [ @@ -3229,6 +3286,9 @@ def test_clip(scalars_df_index, scalars_pandas_df_index, ordered): assert_series_equal(bf_result, pd_result, ignore_order=not ordered) +@pytest.mark.skip( + reason="NotImplementedError: Polars compiler hasn't implemented ClipOp()" +) def test_clip_int_with_float_bounds(scalars_df_index, scalars_pandas_df_index): col_bf = scalars_df_index["int64_too"] bf_result = col_bf.clip(-100, 3.14151593).to_pandas() @@ -3240,6 +3300,9 @@ def test_clip_int_with_float_bounds(scalars_df_index, scalars_pandas_df_index): assert_series_equal(bf_result, pd_result) +@pytest.mark.skip( + reason="NotImplementedError: Polars compiler hasn't implemented ClipOp()" +) def test_clip_filtered_two_sided(scalars_df_index, scalars_pandas_df_index): col_bf = scalars_df_index["int64_col"].iloc[::2] lower_bf = scalars_df_index["int64_too"].iloc[2:] - 1 @@ -3257,6 +3320,9 @@ def test_clip_filtered_two_sided(scalars_df_index, scalars_pandas_df_index): ) +@pytest.mark.skip( + 
reason="NotImplementedError: Polars compiler hasn't implemented maximum()" +) def test_clip_filtered_one_sided(scalars_df_index, scalars_pandas_df_index): col_bf = scalars_df_index["int64_col"].iloc[::2] lower_bf = scalars_df_index["int64_too"].iloc[2:] - 1 @@ -3302,6 +3368,7 @@ def test_between(scalars_df_index, scalars_pandas_df_index, left, right, inclusi ) +@pytest.mark.skip(reason="fixture 'scalars_dfs_maybe_ordered' not found") def test_series_case_when(scalars_dfs_maybe_ordered): pytest.importorskip( "pandas", @@ -3340,6 +3407,7 @@ def test_series_case_when(scalars_dfs_maybe_ordered): ) +@pytest.mark.skip(reason="fixture 'scalars_dfs_maybe_ordered' not found") def test_series_case_when_change_type(scalars_dfs_maybe_ordered): pytest.importorskip( "pandas", @@ -3394,6 +3462,7 @@ def test_to_frame_no_name(scalars_dfs): assert_pandas_df_equal(bf_result, pd_result) +@pytest.mark.skip(reason="fixture 'gcs_folder' not found") def test_to_json(gcs_folder, scalars_df_index, scalars_pandas_df_index): path = gcs_folder + "test_series_to_json*.jsonl" scalars_df_index["int64_col"].to_json(path, lines=True, orient="records") @@ -3407,6 +3476,7 @@ def test_to_json(gcs_folder, scalars_df_index, scalars_pandas_df_index): ) +@pytest.mark.skip(reason="fixture 'gcs_folder' not found") def test_to_csv(gcs_folder, scalars_df_index, scalars_pandas_df_index): path = gcs_folder + "test_series_to_csv*.csv" scalars_df_index["int64_col"].to_csv(path) @@ -3723,6 +3793,9 @@ def foo(x): assert_series_equal(bf_result, pd_result, check_dtype=False) +@pytest.mark.skip( + reason="polars.exceptions.InvalidOperationError: decimal precision should be <= 38 & >= 1" +) @pytest.mark.parametrize("errors", ["raise", "null"]) @pytest.mark.parametrize( ("column", "to_type"), @@ -3784,6 +3857,9 @@ def test_astype(scalars_df_index, scalars_pandas_df_index, column, to_type, erro pd.testing.assert_series_equal(bf_result, pd_result) +@pytest.mark.skip( + reason="AttributeError: 'DataFrame' object has no 
attribute 'dtype'. Did you mean: 'dtypes'?" +) def test_series_astype_python(session): input = pd.Series(["hello", "world", "3.11", "4000"]) exepcted = pd.Series( @@ -3795,6 +3871,9 @@ def test_series_astype_python(session): pd.testing.assert_series_equal(result, exepcted) +@pytest.mark.skip( + reason="AttributeError: 'DataFrame' object has no attribute 'dtype'. Did you mean: 'dtypes'?" +) def test_astype_safe(session): input = pd.Series(["hello", "world", "3.11", "4000"]) exepcted = pd.Series( @@ -3846,6 +3925,9 @@ def test_date_time_astype_int( assert bf_result.dtype == "Int64" +@pytest.mark.skip( + reason="polars.exceptions.InvalidOperationError: conversion from `str` to `i64` failed in column 'column_0' for 1 out of 4 values: [' -03']" +) def test_string_astype_int(): pd_series = pd.Series(["4", "-7", "0", " -03"]) bf_series = series.Series(pd_series) @@ -3856,6 +3938,9 @@ def test_string_astype_int(): pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) +@pytest.mark.skip( + reason="polars.exceptions.InvalidOperationError: conversion from `str` to `f64` failed in column 'column_0' for 1 out of 10 values: [' -03.235']" +) def test_string_astype_float(): pd_series = pd.Series( ["1", "-1", "-0", "000", " -03.235", "naN", "-inf", "INf", ".33", "7.235e-8"] @@ -3921,6 +4006,7 @@ def test_string_astype_timestamp(): pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) +@pytest.mark.skip(reason="AssertionError: Series are different") def test_timestamp_astype_string(): bf_series = series.Series( [ @@ -3945,6 +4031,7 @@ def test_timestamp_astype_string(): assert bf_result.dtype == "string[pyarrow]" +@pytest.mark.skip(reason="AssertionError: Series are different") @pytest.mark.parametrize("errors", ["raise", "null"]) def test_float_astype_json(errors): data = ["1.25", "2500000000", None, "-12323.24"] @@ -3958,6 +4045,7 @@ def test_float_astype_json(errors): pd.testing.assert_series_equal(bf_result.to_pandas(), 
expected_result) +@pytest.mark.skip(reason="AssertionError: Series are different") def test_float_astype_json_str(): data = ["1.25", "2500000000", None, "-12323.24"] bf_series = series.Series(data, dtype=dtypes.FLOAT_DTYPE) @@ -3987,6 +4075,7 @@ def test_string_astype_json(errors): pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) +@pytest.mark.skip(reason="AssertionError: Series NA mask are different") def test_string_astype_json_in_safe_mode(): data = ["this is not a valid json string"] bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE) @@ -3998,6 +4087,9 @@ def test_string_astype_json_in_safe_mode(): pd.testing.assert_series_equal(bf_result.to_pandas(), expected) +@pytest.mark.skip( + reason="Failed: DID NOT RAISE " +) def test_string_astype_json_raise_error(): data = ["this is not a valid json string"] bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE) @@ -4036,6 +4128,9 @@ def test_json_astype_others(data, to_type, errors): pd.testing.assert_series_equal(bf_result.to_pandas(), expected) +@pytest.mark.skip( + reason="Failed: DID NOT RAISE " +) @pytest.mark.parametrize( ("data", "to_type"), [ @@ -4051,6 +4146,7 @@ def test_json_astype_others_raise_error(data, to_type): bf_series.astype(to_type, errors="raise").to_pandas() +@pytest.mark.skip(reason="AssertionError: Series NA mask are different") @pytest.mark.parametrize( ("data", "to_type"), [ @@ -4096,6 +4192,7 @@ def test_loc_bool_series_explicit_index(scalars_df_index, scalars_pandas_df_inde ) +@pytest.mark.skip(reason="fixture 'scalars_pandas_df_default_index' not found") def test_loc_bool_series_default_index( scalars_df_default_index, scalars_pandas_df_default_index ): @@ -4350,6 +4447,9 @@ def test_series_bool_interpretation_error(scalars_df_index): True if scalars_df_index["string_col"] else False +@pytest.mark.skip( + reason="NotImplementedError: dry_run not implemented for this executor" +) def test_query_job_setters(scalars_dfs): # if allow_large_results=False, might not 
create query job with bigframes.option_context("compute.allow_large_results", True): @@ -4456,6 +4556,9 @@ def test_map_series_input_duplicates_error(scalars_dfs): scalars_df.int64_too.map(bf_map_series, verify_integrity=True) +@pytest.mark.skip( + reason="NotImplementedError: Polars compiler hasn't implemented hash()" +) @pytest.mark.parametrize( ("frac", "n", "random_state"), [ @@ -4533,6 +4636,9 @@ def test_apply_lambda(scalars_dfs, col, lambda_): ) +@pytest.mark.skip( + reason="NotImplementedError: Polars compiler hasn't implemented log()" +) @pytest.mark.parametrize( ("ufunc",), [ @@ -4812,6 +4918,9 @@ def test_series_explode_null(data): ) +@pytest.mark.skip( + reason="NotImplementedError: Polars compiler hasn't implemented IntegerLabelToDatetimeOp(freq=<75 * Days>, label=None, origin='start_day')" +) @pytest.mark.parametrize( ("append", "level", "col", "rule"), [ @@ -4833,6 +4942,7 @@ def test__resample(scalars_df_index, scalars_pandas_df_index, append, level, col pd.testing.assert_series_equal(bf_result, pd_result) +@pytest.mark.skip(reason="fixture 'nested_structs_df' not found") def test_series_struct_get_field_by_attribute( nested_structs_df, nested_structs_pandas_df ): @@ -4856,6 +4966,7 @@ def test_series_struct_get_field_by_attribute( ) +@pytest.mark.skip(reason="fixture 'nested_structs_df' not found") def test_series_struct_fields_in_dir(nested_structs_df): series = nested_structs_df["person"] @@ -4865,12 +4976,16 @@ def test_series_struct_fields_in_dir(nested_structs_df): assert "country" in dir(series.address) +@pytest.mark.skip(reason="fixture 'nested_structs_df' not found") def test_series_struct_class_attributes_shadow_struct_fields(nested_structs_df): series = nested_structs_df["person"] assert series.name == "person" +@pytest.mark.skip( + reason="NotImplementedError: dry_run not implemented for this executor" +) def test_series_to_pandas_dry_run(scalars_df_index): bf_series = scalars_df_index["int64_col"] From 
62d591130d9696e58e2f7fd8db662afcbf45cd67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 22:31:42 +0000 Subject: [PATCH 24/63] revert more docs changes --- bigframes/bigquery/_operations/array.py | 6 +++ bigframes/bigquery/_operations/datetime.py | 8 ++++ bigframes/bigquery/_operations/geo.py | 13 +++++++ bigframes/bigquery/_operations/json.py | 12 ++++++ bigframes/bigquery/_operations/search.py | 1 + bigframes/bigquery/_operations/sql.py | 3 ++ bigframes/bigquery/_operations/struct.py | 1 + third_party/bigframes_vendored/conftest.py | 44 ---------------------- 8 files changed, 44 insertions(+), 44 deletions(-) delete mode 100644 third_party/bigframes_vendored/conftest.py diff --git a/bigframes/bigquery/_operations/array.py b/bigframes/bigquery/_operations/array.py index 239bc9566a..4af1416127 100644 --- a/bigframes/bigquery/_operations/array.py +++ b/bigframes/bigquery/_operations/array.py @@ -40,6 +40,8 @@ def array_length(series: series.Series) -> series.Series: >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([[1, 2, 8, 3], [], [3, 4]]) >>> bbq.array_length(s) 0 4 @@ -76,6 +78,8 @@ def array_agg( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> import numpy as np + >>> bpd.options.display.progress_bar = None For a SeriesGroupBy object: @@ -124,6 +128,8 @@ def array_to_string(series: series.Series, delimiter: str) -> series.Series: >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> import numpy as np + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([["H", "i", "!"], ["Hello", "World"], np.nan, [], ["Hi"]]) >>> bbq.array_to_string(s, delimiter=", ") diff --git a/bigframes/bigquery/_operations/datetime.py b/bigframes/bigquery/_operations/datetime.py index e27a3de0c8..f8767336dd 100644 --- a/bigframes/bigquery/_operations/datetime.py +++ 
b/bigframes/bigquery/_operations/datetime.py @@ -21,8 +21,11 @@ def unix_seconds(input: series.Series) -> series.Series: **Examples:** + >>> import pandas as pd >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([pd.Timestamp("1970-01-02", tz="UTC"), pd.Timestamp("1970-01-03", tz="UTC")]) >>> bbq.unix_seconds(s) 0 86400 @@ -45,8 +48,11 @@ def unix_millis(input: series.Series) -> series.Series: **Examples:** + >>> import pandas as pd >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([pd.Timestamp("1970-01-02", tz="UTC"), pd.Timestamp("1970-01-03", tz="UTC")]) >>> bbq.unix_millis(s) 0 86400000 @@ -69,8 +75,10 @@ def unix_micros(input: series.Series) -> series.Series: **Examples:** + >>> import pandas as pd >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([pd.Timestamp("1970-01-02", tz="UTC"), pd.Timestamp("1970-01-03", tz="UTC")]) >>> bbq.unix_micros(s) diff --git a/bigframes/bigquery/_operations/geo.py b/bigframes/bigquery/_operations/geo.py index e5aa383779..9a92a8960d 100644 --- a/bigframes/bigquery/_operations/geo.py +++ b/bigframes/bigquery/_operations/geo.py @@ -53,6 +53,8 @@ def st_area( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Polygon, LineString, Point + >>> bpd.options.display.progress_bar = None + >>> series = bigframes.geopandas.GeoSeries( ... [ ... Polygon([(0.0, 0.0), (0.1, 0.1), (0.0, 0.1)]), @@ -123,6 +125,8 @@ def st_buffer( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Point + >>> bpd.options.display.progress_bar = None + >>> series = bigframes.geopandas.GeoSeries( ... [ ... 
Point(0, 0), @@ -191,6 +195,8 @@ def st_centroid( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Polygon, LineString, Point + >>> bpd.options.display.progress_bar = None + >>> series = bigframes.geopandas.GeoSeries( ... [ ... Polygon([(0.0, 0.0), (0.1, 0.1), (0.0, 0.1)]), @@ -244,6 +250,8 @@ def st_convexhull( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Polygon, LineString, Point + >>> bpd.options.display.progress_bar = None + >>> series = bigframes.geopandas.GeoSeries( ... [ ... Polygon([(0.0, 0.0), (0.1, 0.1), (0.0, 0.1)]), @@ -304,6 +312,7 @@ def st_difference( >>> import bigframes.bigquery as bbq >>> import bigframes.geopandas >>> from shapely.geometry import Polygon, LineString, Point + >>> bpd.options.display.progress_bar = None We can check two GeoSeries against each other, row by row: @@ -398,6 +407,7 @@ def st_distance( >>> import bigframes.bigquery as bbq >>> import bigframes.geopandas >>> from shapely.geometry import Polygon, LineString, Point + >>> bpd.options.display.progress_bar = None We can check two GeoSeries against each other, row by row. @@ -479,6 +489,7 @@ def st_intersection( >>> import bigframes.bigquery as bbq >>> import bigframes.geopandas >>> from shapely.geometry import Polygon, LineString, Point + >>> bpd.options.display.progress_bar = None We can check two GeoSeries against each other, row by row. @@ -572,6 +583,7 @@ def st_isclosed( >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Point, LineString, Polygon + >>> bpd.options.display.progress_bar = None >>> series = bigframes.geopandas.GeoSeries( ... [ @@ -638,6 +650,7 @@ def st_length( >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Polygon, LineString, Point, GeometryCollection + >>> bpd.options.display.progress_bar = None >>> series = bigframes.geopandas.GeoSeries( ... 
[ diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py index 4e1f43aab0..656e59af0d 100644 --- a/bigframes/bigquery/_operations/json.py +++ b/bigframes/bigquery/_operations/json.py @@ -49,6 +49,8 @@ def json_set( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> import numpy as np + >>> bpd.options.display.progress_bar = None >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"] >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")]) @@ -99,6 +101,7 @@ def json_extract( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}']) >>> bbq.json_extract(s, json_path="$.class") @@ -138,6 +141,7 @@ def json_extract_array( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) >>> bbq.json_extract_array(s) @@ -200,6 +204,7 @@ def json_extract_string_array( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) >>> bbq.json_extract_string_array(s) @@ -267,6 +272,7 @@ def json_query( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}']) >>> bbq.json_query(s, json_path="$.class") @@ -297,6 +303,7 @@ def json_query_array( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) >>> bbq.json_query_array(s) @@ -348,6 +355,7 @@ def json_value( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['{"name": "Jakob", "age": 
"6"}', '{"name": "Jakob", "age": []}']) >>> bbq.json_value(s, json_path="$.age") @@ -384,6 +392,7 @@ def json_value_array( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) >>> bbq.json_value_array(s) @@ -430,6 +439,7 @@ def to_json( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3]) >>> bbq.to_json(s) @@ -463,6 +473,7 @@ def to_json_string( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3]) >>> bbq.to_json_string(s) @@ -501,6 +512,7 @@ def parse_json( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}']) >>> s diff --git a/bigframes/bigquery/_operations/search.py b/bigframes/bigquery/_operations/search.py index b65eed2475..c16c2af1a9 100644 --- a/bigframes/bigquery/_operations/search.py +++ b/bigframes/bigquery/_operations/search.py @@ -111,6 +111,7 @@ def vector_search( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None DataFrame embeddings for which to find nearest neighbors. 
The ``ARRAY`` column is used as the search query: diff --git a/bigframes/bigquery/_operations/sql.py b/bigframes/bigquery/_operations/sql.py index 295412fd75..a2de61fc21 100644 --- a/bigframes/bigquery/_operations/sql.py +++ b/bigframes/bigquery/_operations/sql.py @@ -36,6 +36,9 @@ def sql_scalar( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> import pandas as pd + >>> import pyarrow as pa + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1.5", "2.5", "3.5"]) >>> s = s.astype(pd.ArrowDtype(pa.decimal128(38, 9))) diff --git a/bigframes/bigquery/_operations/struct.py b/bigframes/bigquery/_operations/struct.py index a6304677ef..7cb826351c 100644 --- a/bigframes/bigquery/_operations/struct.py +++ b/bigframes/bigquery/_operations/struct.py @@ -39,6 +39,7 @@ def struct(value: dataframe.DataFrame) -> series.Series: >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> import bigframes.series as series + >>> bpd.options.display.progress_bar = None >>> srs = series.Series([{"version": 1, "project": "pandas"}, {"version": 2, "project": "numpy"},]) >>> df = srs.struct.explode() diff --git a/third_party/bigframes_vendored/conftest.py b/third_party/bigframes_vendored/conftest.py deleted file mode 100644 index cafd6a1b7c..0000000000 --- a/third_party/bigframes_vendored/conftest.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import annotations - -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest - -import bigframes._config - - -@pytest.fixture(autouse=True) -def default_doctest_imports(doctest_namespace): - """ - Avoid some boilerplate in pandas-inspired tests. - - See: https://docs.pytest.org/en/stable/how-to/doctest.html#doctest-namespace-fixture - """ - try: - from bigframes.testing import polars_session - - bpd = polars_session.TestSession() - except ImportError: - # Don't skip doctest if polars isn't available. - import bigframes.pandas as bpd - - doctest_namespace["np"] = np - doctest_namespace["pd"] = pd - doctest_namespace["pa"] = pa - doctest_namespace["bpd"] = bpd - bigframes._config.options.display.progress_bar = None From 70021f3b1e61b5bf11407a1eda9a755f3475f577 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 22:35:58 +0000 Subject: [PATCH 25/63] revert more docs --- bigframes/ml/compose.py | 1 + bigframes/operations/ai.py | 17 ++++++++++++----- bigframes/session/__init__.py | 8 ++++++++ 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 54ce7066cb..92c98695cd 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -69,6 +69,7 @@ class SQLScalarColumnTransformer: >>> from bigframes.ml.compose import ColumnTransformer, SQLScalarColumnTransformer >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'name': ["James", None, "Mary"], 'city': ["New York", "Boston", None]}) >>> col_trans = ColumnTransformer([ diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py index 253b838e90..ac294b0fbd 100644 --- a/bigframes/operations/ai.py +++ b/bigframes/operations/ai.py @@ -45,6 +45,7 @@ def filter( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.ai_operators = True >>> 
bpd.options.compute.ai_ops_confirmation_threshold = 25 @@ -114,7 +115,8 @@ def map( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -132,7 +134,8 @@ def map( >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -263,7 +266,8 @@ def classify( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -352,7 +356,8 @@ def join( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -491,6 +496,7 @@ def search( ** Examples: ** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> import bigframes >>> bigframes.options.experiments.ai_operators = True @@ -602,7 +608,8 @@ def sim_join( ** Examples: ** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 0490152003..886072b884 100644 --- a/bigframes/session/__init__.py +++ 
b/bigframes/session/__init__.py @@ -618,6 +618,7 @@ def read_gbq_query( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None Simple query input: @@ -773,6 +774,7 @@ def read_gbq_table( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None Read a whole table, with arbitrary ordering or ordering corresponding to the primary key(s). @@ -851,6 +853,7 @@ def read_gbq_table_streaming( >>> import bigframes.streaming as bst >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> sdf = bst.read_gbq_table("bigquery-public-data.ml_datasets.penguins") @@ -879,6 +882,7 @@ def read_gbq_model(self, model_name: str): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None Read an existing BigQuery ML model. @@ -948,6 +952,8 @@ def read_pandas( **Examples:** >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None >>> d = {'col1': [1, 2], 'col2': [3, 4]} >>> pandas_df = pd.DataFrame(data=d) @@ -1825,6 +1831,7 @@ def udf( >>> import bigframes.pandas as bpd >>> import datetime + >>> bpd.options.display.progress_bar = None Turning an arbitrary python function into a BigQuery managed python udf: @@ -1987,6 +1994,7 @@ def read_gbq_function( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None Use the [cw_lower_case_ascii_only](https://github.com/GoogleCloudPlatform/bigquery-utils/blob/master/udfs/community/README.md#cw_lower_case_ascii_onlystr-string) function from Community UDFs. 
From 93502094fa94de7f0fcca17f5b1cb3aa6e1aa7cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 22:36:57 +0000 Subject: [PATCH 26/63] revert more docs --- bigframes/series.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bigframes/series.py b/bigframes/series.py index 337a796739..490298d8dd 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -533,6 +533,7 @@ def to_pandas( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([4, 3, 2]) Download the data from BigQuery and convert it into an in-memory pandas Series. @@ -660,6 +661,7 @@ def to_pandas_batches( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([4, 3, 2, 2, 3]) Iterate through the results in batches, limiting the total rows yielded @@ -2419,6 +2421,9 @@ def _resample( **Examples:** >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None + >>> data = { ... "timestamp_col": pd.date_range( ... 
start="2021-01-01 13:00:00", periods=30, freq="1s" From 23346b0947c9f38a29429df80c8502c1f155fb35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 9 Oct 2025 00:06:48 +0000 Subject: [PATCH 27/63] fix unit tests python 3.13 --- tests/unit/test_series_polars.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/tests/unit/test_series_polars.py b/tests/unit/test_series_polars.py index 3eb1a3c095..64814126ea 100644 --- a/tests/unit/test_series_polars.py +++ b/tests/unit/test_series_polars.py @@ -3491,6 +3491,7 @@ def test_to_csv(gcs_folder, scalars_df_index, scalars_pandas_df_index): def test_to_latex(scalars_df_index, scalars_pandas_df_index): + pytest.importorskip("jinja2") bf_result = scalars_df_index["int64_col"].to_latex() pd_result = scalars_pandas_df_index["int64_col"].to_latex() @@ -3891,21 +3892,6 @@ def test_series_astype_w_invalid_error(session): session.read_pandas(input).astype("Float64", errors="bad_value") -def test_astype_numeric_to_int(scalars_df_index, scalars_pandas_df_index): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - column = "numeric_col" - to_type = "Int64" - bf_result = scalars_df_index[column].astype(to_type).to_pandas() - # Truncate to int to avoid TypeError - pd_result = ( - scalars_pandas_df_index[column] - .apply(lambda x: None if pd.isna(x) else math.trunc(x)) - .astype(to_type) - ) - pd.testing.assert_series_equal(bf_result, pd_result) - - @pytest.mark.parametrize( ("column", "to_type"), [ From 03822d7bf761fd4e98f3b113c498a79bb8ad35dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 9 Oct 2025 00:20:52 +0000 Subject: [PATCH 28/63] add test to reproduce name error --- tests/unit/test_local_engine.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/unit/test_local_engine.py b/tests/unit/test_local_engine.py index 7d3d532d88..8c8c2dcf0d 100644 --- 
a/tests/unit/test_local_engine.py +++ b/tests/unit/test_local_engine.py @@ -42,6 +42,14 @@ def small_inline_frame() -> pd.DataFrame: return df +def test_polars_local_engine_series(polars_session: bigframes.Session): + bf_series = bpd.Series([1, 2, 3], session=polars_session) + pd_series = pd.Series([1, 2, 3], dtype=bf_series.dtype) + bf_result = bf_series.to_pandas() + pd_result = pd_series + pandas.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + def test_polars_local_engine_add( small_inline_frame: pd.DataFrame, polars_session: bigframes.Session ): From 5df828b675b340a03fc8a69408c0ee57acfab316 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 9 Oct 2025 15:42:13 +0000 Subject: [PATCH 29/63] add tests for session scoped methods --- bigframes/core/blocks.py | 2 +- bigframes/core/compile/polars/compiler.py | 4 +- bigframes/dataframe.py | 2 +- bigframes/operations/base.py | 27 +- bigframes/pandas/core/tools/timedeltas.py | 2 +- bigframes/testing/polars_session.py | 11 +- noxfile.py | 6 +- scripts/publish_api_coverage.py | 3 - tests/system/small/test_session_as_bpd.py | 155 + tests/unit/test_dataframe_polars.py | 3 +- tests/unit/test_local_engine.py | 8 - tests/unit/test_series_polars.py | 5020 --------------------- 12 files changed, 171 insertions(+), 5072 deletions(-) create mode 100644 tests/system/small/test_session_as_bpd.py delete mode 100644 tests/unit/test_series_polars.py diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index cf3518ff29..f9896784bb 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -2471,7 +2471,7 @@ def _align_series_block_axis_1( def _align_pd_series_axis_1( self, other: pd.Series, how: str ) -> Tuple[Block, pd.Index, Sequence[Tuple[ex.RefOrConstant, ex.RefOrConstant]]]: - if self.column_labels.astype("object").equals(other.index.astype("object")): + if self.column_labels.equals(other.index): columns, lcol_indexer, rcol_indexer = self.column_labels, None, 
None else: if not (self.column_labels.is_unique and other.index.is_unique): diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index 059ec72076..f7c742e852 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -493,9 +493,9 @@ def compile_agg_op( if isinstance(op, agg_ops.MedianOp): return pl.median(*inputs) if isinstance(op, agg_ops.AllOp): - return pl.col(inputs).cast(pl.Boolean).all() + return pl.all(*inputs) if isinstance(op, agg_ops.AnyOp): - return pl.col(inputs).cast(pl.Boolean).any() + return pl.any(*inputs) # type: ignore if isinstance(op, agg_ops.NuniqueOp): return pl.col(*inputs).drop_nulls().n_unique() if isinstance(op, agg_ops.MinOp): diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 3527b225e2..bc2bbb963b 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -688,7 +688,7 @@ def _getitem_label(self, key: blocks.Label): return DataFrame(block) if len(col_ids) == 1: - return bigframes.series.Series(block, name=key) + return bigframes.series.Series(block) return DataFrame(block) # Bool Series selects rows diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 7d6a1c3b68..f2bbcb3320 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -14,7 +14,6 @@ from __future__ import annotations -import enum import typing from typing import List, Sequence, Union @@ -36,18 +35,6 @@ import bigframes.session -class Default(enum.Enum): - """Sentinel that can disambiguate explicit None from missing. 
- - See https://stackoverflow.com/a/76606310/101923 - """ - - token = 0 - - -DEFAULT = Default.token - - class SeriesMethods: def __init__( self, @@ -56,7 +43,7 @@ def __init__( dtype: typing.Optional[ bigframes.dtypes.DtypeString | bigframes.dtypes.Dtype ] = None, - name: str | None | Default = DEFAULT, + name: str | None = None, copy: typing.Optional[bool] = None, *, session: typing.Optional[bigframes.session.Session] = None, @@ -120,7 +107,6 @@ def __init__( block = data_block if block: - # Data was a bigframes object. assert len(block.value_columns) == 1 assert len(block.column_labels) == 1 if index is not None: # reindexing operation @@ -129,27 +115,23 @@ def __init__( idx_cols = idx_block.index_columns block, _ = idx_block.join(block, how="left") block = block.with_index_labels(bf_index.names) - if name is not DEFAULT: + if name: block = block.with_column_labels([name]) if dtype: bf_dtype = bigframes.dtypes.bigframes_type(dtype) block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype)) else: - # Data was local. 
if isinstance(dtype, str) and dtype.lower() == "json": dtype = bigframes.dtypes.JSON_DTYPE pd_series = pd.Series( data=data, index=index, # type:ignore dtype=dtype, # type:ignore - name=name if name is not DEFAULT else None, + name=name, ) - name = pd_series.name # type: ignore block = read_pandas_func(pd_series)._get_block() # type:ignore - block = block.with_column_labels([name]) assert block is not None - self._block: blocks.Block = block @property @@ -178,8 +160,7 @@ def _apply_unary_op( block, result_id = self._block.apply_unary_op( self._value_column, op, result_label=self._name ) - result = series.Series(block.select_column(result_id), name=self._name) - return result + return series.Series(block.select_column(result_id)) def _apply_binary_op( self, diff --git a/bigframes/pandas/core/tools/timedeltas.py b/bigframes/pandas/core/tools/timedeltas.py index 070a41d62d..eb01f9f846 100644 --- a/bigframes/pandas/core/tools/timedeltas.py +++ b/bigframes/pandas/core/tools/timedeltas.py @@ -35,7 +35,7 @@ def to_timedelta( return arg._apply_unary_op(ops.ToTimedeltaOp(canonical_unit)) if pdtypes.is_list_like(arg): - return to_timedelta(series.Series(arg), unit, session=session) + return to_timedelta(series.Series(arg, session=session), unit, session=session) return pd.to_timedelta(arg, unit) diff --git a/bigframes/testing/polars_session.py b/bigframes/testing/polars_session.py index 4d3e6862b9..29eae20b7a 100644 --- a/bigframes/testing/polars_session.py +++ b/bigframes/testing/polars_session.py @@ -95,17 +95,10 @@ def __init__(self): def read_pandas(self, pandas_dataframe, write_engine="default"): # override read_pandas to always keep data local-only - if isinstance(pandas_dataframe, (pandas.Series, pandas.Index)): + if isinstance(pandas_dataframe, pandas.Series): pandas_dataframe = pandas_dataframe.to_frame() local_block = bigframes.core.blocks.Block.from_local(pandas_dataframe, self) - bf_df = bigframes.dataframe.DataFrame(local_block) - if isinstance(pandas_dataframe, 
pandas.Series): - series = bf_df[bf_df.columns[0]] - series.name = pandas_dataframe.name - return series - if isinstance(pandas_dataframe, pandas.Index): - return bf_df.index - return bf_df + return bigframes.dataframe.DataFrame(local_block) @property def bqclient(self): diff --git a/noxfile.py b/noxfile.py index 703937d453..a46dc36b3e 100644 --- a/noxfile.py +++ b/noxfile.py @@ -46,7 +46,9 @@ "3.11", ] -PYTEST_VERSION = "pytest==8.4.2" +# pytest-retry is not yet compatible with pytest 8.x. +# https://github.com/str0zzapreti/pytest-retry/issues/32 +PYTEST_VERSION = "pytest<8.0.0dev" SPHINX_VERSION = "sphinx==4.5.0" LINT_PATHS = [ "docs", @@ -113,7 +115,7 @@ # Make sure we leave some versions without "extras" so we know those # dependencies are actually optional. "3.10": ["tests", "scikit-learn", "anywidget"], - LATEST_FULLY_SUPPORTED_PYTHON: ["tests", "scikit-learn", "polars", "anywidget"], + "3.11": ["tests", "scikit-learn", "polars", "anywidget"], "3.13": ["tests", "polars", "anywidget"], } diff --git a/scripts/publish_api_coverage.py b/scripts/publish_api_coverage.py index 181b8c3365..8f305bcc0f 100644 --- a/scripts/publish_api_coverage.py +++ b/scripts/publish_api_coverage.py @@ -204,9 +204,6 @@ def generate_pandas_api_coverage(): def generate_sklearn_api_coverage(): """Explore all SKLearn modules, and for each item contained generate a regex to detect it being imported, and record whether we implement it""" - - import sklearn # noqa - sklearn_modules = [ "sklearn", "sklearn.model_selection", diff --git a/tests/system/small/test_session_as_bpd.py b/tests/system/small/test_session_as_bpd.py new file mode 100644 index 0000000000..12d2be43bd --- /dev/null +++ b/tests/system/small/test_session_as_bpd.py @@ -0,0 +1,155 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Check that bpd and Session can be used interchangeably.""" + +from __future__ import annotations + +from typing import cast + +import numpy as np +import pandas.testing + +import bigframes.pandas as bpd +import bigframes.session + + +def test_cut(session: bigframes.session.Session): + sc = [30, 80, 40, 90, 60, 45, 95, 75, 55, 100, 65, 85] + x = [20, 40, 60, 80, 100] + + bpd_result = bpd.cut(sc, x) + session_result = session.cut(sc, x) + + global_session = bpd.get_global_session() + assert global_session is not session + assert bpd_result._session is global_session + assert session_result._session is session + + bpd_pd = bpd_result.to_pandas() + session_pd = session_result.to_pandas() + pandas.testing.assert_series_equal(bpd_pd, session_pd) + + +def test_dataframe(session: bigframes.session.Session): + data = {"col": ["local", None, "data"]} + + bpd_result = bpd.DataFrame(data) + session_result = session.DataFrame(data) + + global_session = bpd.get_global_session() + assert global_session is not session + assert bpd_result._session is global_session + assert session_result._session is session + + bpd_pd = bpd_result.to_pandas() + session_pd = session_result.to_pandas() + pandas.testing.assert_frame_equal(bpd_pd, session_pd) + + +def test_multiindex_from_arrays(session: bigframes.session.Session): + arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]] + + bpd_result = bpd.MultiIndex.from_arrays(arrays, names=("number", "color")) + session_result = session.MultiIndex.from_arrays(arrays, names=("number", "color")) + + global_session = 
bpd.get_global_session() + assert global_session is not session + assert bpd_result._session is global_session + assert session_result._session is session + + bpd_pd = bpd_result.to_pandas() + session_pd = session_result.to_pandas() + pandas.testing.assert_index_equal(bpd_pd, session_pd) + + +def test_multiindex_from_tuples(session: bigframes.session.Session): + tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "blue")] + + bpd_result = bpd.MultiIndex.from_tuples(tuples, names=("number", "color")) + session_result = session.MultiIndex.from_tuples(tuples, names=("number", "color")) + + global_session = bpd.get_global_session() + assert global_session is not session + assert bpd_result._session is global_session + assert session_result._session is session + + bpd_pd = bpd_result.to_pandas() + session_pd = session_result.to_pandas() + pandas.testing.assert_index_equal(bpd_pd, session_pd) + + +def test_index(session: bigframes.session.Session): + index = [1, 2, 3] + + bpd_result = bpd.Index(index) + session_result = session.Index(index) + + global_session = bpd.get_global_session() + assert global_session is not session + assert bpd_result._session is global_session + assert session_result._session is session + + bpd_pd = bpd_result.to_pandas() + session_pd = session_result.to_pandas() + pandas.testing.assert_index_equal(bpd_pd, session_pd) + + +def test_series(session: bigframes.session.Session): + series = [1, 2, 3] + + bpd_result = bpd.Series(series) + session_result = session.Series(series) + + global_session = bpd.get_global_session() + assert global_session is not session + assert bpd_result._session is global_session + assert session_result._session is session + + bpd_pd = bpd_result.to_pandas() + session_pd = session_result.to_pandas() + pandas.testing.assert_series_equal(bpd_pd, session_pd) + + +def test_to_datetime(session: bigframes.session.Session): + datetimes = ["2018-10-26 12:00:00", "2018-10-26 13:00:15"] + + bpd_result = bpd.to_datetime(datetimes) + 
session_result = cast(bpd.Series, session.to_datetime(datetimes)) + + global_session = bpd.get_global_session() + assert global_session is not session + assert bpd_result._session is global_session + assert session_result._session is session + + bpd_pd = bpd_result.to_pandas() + session_pd = session_result.to_pandas() + pandas.testing.assert_series_equal(bpd_pd, session_pd) + + +def test_to_timedelta(session: bigframes.session.Session): + offsets = np.arange(5) + unit = "s" + + bpd_result = bpd.to_timedelta(offsets, unit=unit) + session_result = session.to_timedelta(offsets, unit=unit) + + global_session = bpd.get_global_session() + assert global_session is not session + assert bpd_result._session is global_session + assert session_result._session is session + + bpd_pd = bpd_result.to_pandas() + session_pd = session_result.to_pandas() + pandas.testing.assert_series_equal(bpd_pd, session_pd) diff --git a/tests/unit/test_dataframe_polars.py b/tests/unit/test_dataframe_polars.py index c95c647fa8..a6f5c3d1ef 100644 --- a/tests/unit/test_dataframe_polars.py +++ b/tests/unit/test_dataframe_polars.py @@ -2319,8 +2319,7 @@ def test_binop_with_self_aggregate(session, scalars_dfs): df_columns = ["int64_col", "float64_col", "int64_too"] bf_df = scalars_df[df_columns] - bf_deviation = bf_df - bf_df.mean() - bf_result = bf_deviation.to_pandas() + bf_result = (bf_df - bf_df.mean()).to_pandas() pd_df = scalars_pandas_df[df_columns] pd_result = pd_df - pd_df.mean() diff --git a/tests/unit/test_local_engine.py b/tests/unit/test_local_engine.py index 8c8c2dcf0d..7d3d532d88 100644 --- a/tests/unit/test_local_engine.py +++ b/tests/unit/test_local_engine.py @@ -42,14 +42,6 @@ def small_inline_frame() -> pd.DataFrame: return df -def test_polars_local_engine_series(polars_session: bigframes.Session): - bf_series = bpd.Series([1, 2, 3], session=polars_session) - pd_series = pd.Series([1, 2, 3], dtype=bf_series.dtype) - bf_result = bf_series.to_pandas() - pd_result = pd_series - 
pandas.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) - - def test_polars_local_engine_add( small_inline_frame: pd.DataFrame, polars_session: bigframes.Session ): diff --git a/tests/unit/test_series_polars.py b/tests/unit/test_series_polars.py deleted file mode 100644 index 64814126ea..0000000000 --- a/tests/unit/test_series_polars.py +++ /dev/null @@ -1,5020 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import datetime as dt -import json -import math -import pathlib -import re -import tempfile -from typing import Generator - -import db_dtypes # type: ignore -import geopandas as gpd # type: ignore -import google.api_core.exceptions -import numpy -from packaging.version import Version -import pandas as pd -import pyarrow as pa # type: ignore -import pytest -import shapely.geometry # type: ignore - -import bigframes -import bigframes.dtypes as dtypes -import bigframes.features -import bigframes.pandas -import bigframes.pandas as bpd -import bigframes.series as series -from bigframes.testing.utils import ( - assert_pandas_df_equal, - assert_series_equal, - convert_pandas_dtypes, - get_first_file_from_wildcard, -) - -pytest.importorskip("polars") -pytest.importorskip("pandas", minversion="2.0.0") - -CURRENT_DIR = pathlib.Path(__file__).parent -DATA_DIR = CURRENT_DIR.parent / "data" - - -@pytest.fixture(scope="module", autouse=True) -def session() -> Generator[bigframes.Session, None, None]: - import 
bigframes.core.global_session - from bigframes.testing import polars_session - - session = polars_session.TestSession() - with bigframes.core.global_session._GlobalSessionContext(session): - yield session - - -@pytest.fixture(scope="module") -def scalars_pandas_df_index() -> pd.DataFrame: - """pd.DataFrame pointing at test data.""" - - df = pd.read_json( - DATA_DIR / "scalars.jsonl", - lines=True, - ) - convert_pandas_dtypes(df, bytes_col=True) - - df = df.set_index("rowindex", drop=False) - df.index.name = None - return df.set_index("rowindex").sort_index() - - -@pytest.fixture(scope="module") -def scalars_df_default_index( - session: bigframes.Session, scalars_pandas_df_index -) -> bpd.DataFrame: - return session.read_pandas(scalars_pandas_df_index).reset_index(drop=False) - - -@pytest.fixture(scope="module") -def scalars_df_2_default_index( - session: bigframes.Session, scalars_pandas_df_index -) -> bpd.DataFrame: - return session.read_pandas(scalars_pandas_df_index).reset_index(drop=False) - - -@pytest.fixture(scope="module") -def scalars_df_index( - session: bigframes.Session, scalars_pandas_df_index -) -> bpd.DataFrame: - return session.read_pandas(scalars_pandas_df_index) - - -@pytest.fixture(scope="module") -def scalars_df_2_index( - session: bigframes.Session, scalars_pandas_df_index -) -> bpd.DataFrame: - return session.read_pandas(scalars_pandas_df_index) - - -@pytest.fixture(scope="module") -def scalars_dfs( - scalars_df_index, - scalars_pandas_df_index, -): - return scalars_df_index, scalars_pandas_df_index - - -def test_series_construct_copy(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = series.Series( - scalars_df["int64_col"], name="test_series", dtype="Float64" - ).to_pandas() - pd_result = pd.Series( - scalars_pandas_df["int64_col"], name="test_series", dtype="Float64" - ) - pd.testing.assert_series_equal(bf_result, pd_result) - - -def test_series_construct_nullable_ints(): - bf_result = series.Series( - [1, 3, 
bigframes.pandas.NA], index=[0, 4, bigframes.pandas.NA] - ).to_pandas() - - # TODO(b/340885567): fix type error - expected_index = pd.Index( # type: ignore - [0, 4, None], - dtype=pd.Int64Dtype(), - ) - expected = pd.Series([1, 3, pd.NA], dtype=pd.Int64Dtype(), index=expected_index) - - pd.testing.assert_series_equal(bf_result, expected) - - -def test_series_construct_timestamps(): - datetimes = [ - dt.datetime(2020, 1, 20, 20, 20, 20, 20), - dt.datetime(2019, 1, 20, 20, 20, 20, 20), - None, - ] - bf_result = series.Series(datetimes).to_pandas() - pd_result = pd.Series(datetimes, dtype=pd.ArrowDtype(pa.timestamp("us"))) - - pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) - - -def test_series_construct_copy_with_index(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = series.Series( - scalars_df["int64_col"], - name="test_series", - dtype="Float64", - index=scalars_df["int64_too"], - ).to_pandas() - pd_result = pd.Series( - scalars_pandas_df["int64_col"], - name="test_series", - dtype="Float64", - index=scalars_pandas_df["int64_too"], - ) - pd.testing.assert_series_equal(bf_result, pd_result) - - -def test_series_construct_copy_index(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = series.Series( - scalars_df.index, - name="test_series", - dtype="Float64", - index=scalars_df["int64_too"], - ).to_pandas() - pd_result = pd.Series( - scalars_pandas_df.index, - name="test_series", - dtype="Float64", - index=scalars_pandas_df["int64_too"], - ) - pd.testing.assert_series_equal(bf_result, pd_result) - - -def test_series_construct_pandas(scalars_dfs): - _, scalars_pandas_df = scalars_dfs - bf_result = series.Series( - scalars_pandas_df["int64_col"], name="test_series", dtype="Float64" - ) - pd_result = pd.Series( - scalars_pandas_df["int64_col"], name="test_series", dtype="Float64" - ) - assert bf_result.shape == pd_result.shape - pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) - - 
-def test_series_construct_from_list(): - bf_result = series.Series([1, 1, 2, 3, 5, 8, 13], dtype="Int64").to_pandas() - pd_result = pd.Series([1, 1, 2, 3, 5, 8, 13], dtype="Int64") - - # BigQuery DataFrame default indices use nullable Int64 always - pd_result.index = pd_result.index.astype("Int64") - - pd.testing.assert_series_equal(bf_result, pd_result) - - -def test_series_construct_reindex(): - bf_result = series.Series( - series.Series({1: 10, 2: 30, 3: 30}), index=[3, 2], dtype="Int64" - ).to_pandas() - pd_result = pd.Series(pd.Series({1: 10, 2: 30, 3: 30}), index=[3, 2], dtype="Int64") - - # BigQuery DataFrame default indices use nullable Int64 always - pd_result.index = pd_result.index.astype("Int64") - pd.testing.assert_series_equal(bf_result, pd_result) - - -def test_series_construct_from_list_w_index(): - bf_result = series.Series( - [1, 1, 2, 3, 5, 8, 13], index=[10, 20, 30, 40, 50, 60, 70], dtype="Int64" - ).to_pandas() - pd_result = pd.Series( - [1, 1, 2, 3, 5, 8, 13], index=[10, 20, 30, 40, 50, 60, 70], dtype="Int64" - ) - - # BigQuery DataFrame default indices use nullable Int64 always - pd_result.index = pd_result.index.astype("Int64") - - pd.testing.assert_series_equal(bf_result, pd_result) - - -def test_series_construct_empty(session: bigframes.Session): - bf_series: series.Series = series.Series(session=session) - pd_series: pd.Series = pd.Series() - - bf_result = bf_series.empty - pd_result = pd_series.empty - - assert pd_result - assert bf_result == pd_result - - -def test_series_construct_scalar_no_index(): - bf_result = series.Series("hello world", dtype="string[pyarrow]").to_pandas() - pd_result = pd.Series("hello world", dtype="string[pyarrow]") - - # BigQuery DataFrame default indices use nullable Int64 always - pd_result.index = pd_result.index.astype("Int64") - - pd.testing.assert_series_equal(bf_result, pd_result) - - -def test_series_construct_scalar_w_index(): - bf_result = series.Series( - "hello world", dtype="string[pyarrow]", 
index=[0, 2, 1] - ).to_pandas() - pd_result = pd.Series("hello world", dtype="string[pyarrow]", index=[0, 2, 1]) - - # BigQuery DataFrame default indices use nullable Int64 always - pd_result.index = pd_result.index.astype("Int64") - - pd.testing.assert_series_equal(bf_result, pd_result) - - -def test_series_construct_nan(): - bf_result = series.Series(numpy.nan).to_pandas() - pd_result = pd.Series(numpy.nan) - - pd_result.index = pd_result.index.astype("Int64") - pd_result = pd_result.astype("Float64") - - pd.testing.assert_series_equal(bf_result, pd_result) - - -def test_series_construct_scalar_w_bf_index(): - bf_result = series.Series( - "hello", index=bigframes.pandas.Index([1, 2, 3]) - ).to_pandas() - pd_result = pd.Series("hello", index=pd.Index([1, 2, 3], dtype="Int64")) - - pd_result = pd_result.astype("string[pyarrow]") - - pd.testing.assert_series_equal(bf_result, pd_result) - - -def test_series_construct_from_list_escaped_strings(): - """Check that special characters are supported.""" - strings = [ - "string\nwith\nnewline", - "string\twith\ttabs", - "string\\with\\backslashes", - ] - bf_result = series.Series(strings, name="test_series", dtype="string[pyarrow]") - pd_result = pd.Series(strings, name="test_series", dtype="string[pyarrow]") - - # BigQuery DataFrame default indices use nullable Int64 always - pd_result.index = pd_result.index.astype("Int64") - - pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) - - -def test_series_construct_geodata(): - pd_series = pd.Series( - [ - shapely.geometry.Point(1, 1), - shapely.geometry.Point(2, 2), - shapely.geometry.Point(3, 3), - ], - dtype=gpd.array.GeometryDtype(), - ) - - series = bigframes.pandas.Series(pd_series) - - pd.testing.assert_series_equal( - pd_series, series.to_pandas(), check_index_type=False - ) - - -@pytest.mark.parametrize( - ("dtype"), - [ - pytest.param(pd.Int64Dtype(), id="int"), - pytest.param(pd.Float64Dtype(), id="float"), - 
pytest.param(pd.StringDtype(storage="pyarrow"), id="string"), - ], -) -def test_series_construct_w_dtype(dtype): - data = [1, 2, 3] - expected = pd.Series(data, dtype=dtype) - expected.index = expected.index.astype("Int64") - series = bigframes.pandas.Series(data, dtype=dtype) - pd.testing.assert_series_equal(series.to_pandas(), expected) - - -def test_series_construct_w_dtype_for_struct(): - # The data shows the struct fields are disordered and correctly handled during - # construction. - data = [ - {"a": 1, "c": "pandas", "b": dt.datetime(2020, 1, 20, 20, 20, 20, 20)}, - {"a": 2, "c": "pandas", "b": dt.datetime(2019, 1, 20, 20, 20, 20, 20)}, - {"a": 1, "c": "numpy", "b": None}, - ] - dtype = pd.ArrowDtype( - pa.struct([("a", pa.int64()), ("c", pa.string()), ("b", pa.timestamp("us"))]) - ) - series = bigframes.pandas.Series(data, dtype=dtype) - expected = pd.Series(data, dtype=dtype) - expected.index = expected.index.astype("Int64") - pd.testing.assert_series_equal(series.to_pandas(), expected) - - -def test_series_construct_w_dtype_for_array_string(): - data = [["1", "2", "3"], [], ["4", "5"]] - dtype = pd.ArrowDtype(pa.list_(pa.string())) - series = bigframes.pandas.Series(data, dtype=dtype) - expected = pd.Series(data, dtype=dtype) - expected.index = expected.index.astype("Int64") - - # Skip dtype check due to internal issue b/321013333. This issue causes array types - # to be converted to the `object` dtype when calling `to_pandas()`, resulting in - # a mismatch with the expected Pandas type. 
- if bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable: - check_dtype = True - else: - check_dtype = False - - pd.testing.assert_series_equal( - series.to_pandas(), expected, check_dtype=check_dtype - ) - - -def test_series_construct_w_dtype_for_array_struct(): - data = [[{"a": 1, "c": "aa"}, {"a": 2, "c": "bb"}], [], [{"a": 3, "c": "cc"}]] - dtype = pd.ArrowDtype(pa.list_(pa.struct([("a", pa.int64()), ("c", pa.string())]))) - series = bigframes.pandas.Series(data, dtype=dtype) - expected = pd.Series(data, dtype=dtype) - expected.index = expected.index.astype("Int64") - - # Skip dtype check due to internal issue b/321013333. This issue causes array types - # to be converted to the `object` dtype when calling `to_pandas()`, resulting in - # a mismatch with the expected Pandas type. - if bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable: - check_dtype = True - else: - check_dtype = False - - pd.testing.assert_series_equal( - series.to_pandas(), expected, check_dtype=check_dtype - ) - - -def test_series_construct_local_unordered_has_sequential_index(session): - series = bigframes.pandas.Series( - ["Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"], session=session - ) - expected: pd.Index = pd.Index([0, 1, 2, 3, 4, 5, 6], dtype=pd.Int64Dtype()) - pd.testing.assert_index_equal(series.index.to_pandas(), expected) - - -@pytest.mark.parametrize( - ("json_type"), - [ - pytest.param(dtypes.JSON_DTYPE), - pytest.param("json"), - ], -) -def test_series_construct_w_json_dtype(json_type): - data = [ - "1", - '"str"', - "false", - '["a", {"b": 1}, null]', - None, - '{"a": {"b": [1, 2, 3], "c": true}}', - ] - s = bigframes.pandas.Series(data, dtype=json_type) - - assert s.dtype == dtypes.JSON_DTYPE - assert s[0] == "1" - assert s[1] == '"str"' - assert s[2] == "false" - assert s[3] == '["a",{"b":1},null]' - assert pd.isna(s[4]) - assert s[5] == '{"a":{"b":[1,2,3],"c":true}}' - - -def test_series_keys(scalars_dfs): - scalars_df, scalars_pandas_df = 
scalars_dfs - bf_result = scalars_df["int64_col"].keys().to_pandas() - pd_result = scalars_pandas_df["int64_col"].keys() - pd.testing.assert_index_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ["data", "index"], - [ - (["a", "b", "c"], None), - ([1, 2, 3], ["a", "b", "c"]), - ([1, 2, None], ["a", "b", "c"]), - ([1, 2, 3], [pd.NA, "b", "c"]), - ([numpy.nan, 2, 3], ["a", "b", "c"]), - ], -) -def test_series_items(data, index): - bf_series = series.Series(data, index=index) - pd_series = pd.Series(data, index=index) - - for (bf_index, bf_value), (pd_index, pd_value) in zip( - bf_series.items(), pd_series.items() - ): - # TODO(jialuo): Remove the if conditions after b/373699458 is addressed. - if not pd.isna(bf_index) or not pd.isna(pd_index): - assert bf_index == pd_index - if not pd.isna(bf_value) or not pd.isna(pd_value): - assert bf_value == pd_value - - -@pytest.mark.parametrize( - ["col_name", "expected_dtype"], - [ - ("bool_col", pd.BooleanDtype()), - # TODO(swast): Use a more efficient type. - ("bytes_col", pd.ArrowDtype(pa.binary())), - ("date_col", pd.ArrowDtype(pa.date32())), - ("datetime_col", pd.ArrowDtype(pa.timestamp("us"))), - ("float64_col", pd.Float64Dtype()), - ("geography_col", gpd.array.GeometryDtype()), - ("int64_col", pd.Int64Dtype()), - # TODO(swast): Use a more efficient type. 
- ("numeric_col", pd.ArrowDtype(pa.decimal128(38, 9))), - ("int64_too", pd.Int64Dtype()), - ("string_col", pd.StringDtype(storage="pyarrow")), - ("time_col", pd.ArrowDtype(pa.time64("us"))), - ("timestamp_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))), - ], -) -def test_get_column(scalars_dfs, col_name, expected_dtype): - scalars_df, scalars_pandas_df = scalars_dfs - series = scalars_df[col_name] - series_pandas = series.to_pandas() - assert series_pandas.dtype == expected_dtype - assert series_pandas.shape[0] == scalars_pandas_df.shape[0] - - -def test_series_get_column_default(scalars_dfs): - scalars_df, _ = scalars_dfs - result = scalars_df.get(123123123123123, "default_val") - assert result == "default_val" - - -@pytest.mark.parametrize( - ("key",), - [ - ("hello",), - (2,), - ("int64_col",), - (None,), - ], -) -def test_series_contains(scalars_df_index, scalars_pandas_df_index, key): - bf_result = key in scalars_df_index["int64_col"] - pd_result = key in scalars_pandas_df_index["int64_col"] - - assert bf_result == pd_result - - -def test_series_equals_identical(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.int64_col.equals(scalars_df_index.int64_col) - pd_result = scalars_pandas_df_index.int64_col.equals( - scalars_pandas_df_index.int64_col - ) - - assert pd_result == bf_result - - -def test_series_equals_df(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index["int64_col"].equals(scalars_df_index[["int64_col"]]) - pd_result = scalars_pandas_df_index["int64_col"].equals( - scalars_pandas_df_index[["int64_col"]] - ) - - assert pd_result == bf_result - - -def test_series_equals_different_dtype(scalars_df_index, scalars_pandas_df_index): - bf_series = scalars_df_index["int64_col"] - pd_series = scalars_pandas_df_index["int64_col"] - - bf_result = bf_series.equals(bf_series.astype("Float64")) - pd_result = pd_series.equals(pd_series.astype("Float64")) - - assert pd_result == bf_result - - -def 
test_series_equals_different_values(scalars_df_index, scalars_pandas_df_index): - bf_series = scalars_df_index["int64_col"] - pd_series = scalars_pandas_df_index["int64_col"] - - bf_result = bf_series.equals(bf_series + 1) - pd_result = pd_series.equals(pd_series + 1) - - assert pd_result == bf_result - - -def test_series_get_with_default_index(scalars_dfs): - col_name = "float64_col" - key = 2 - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[col_name].get(key) - pd_result = scalars_pandas_df[col_name].get(key) - assert bf_result == pd_result - - -@pytest.mark.parametrize( - ("index_col", "key"), - ( - ("int64_too", 2), - ("string_col", "Hello, World!"), - ("int64_too", slice(2, 6)), - ), -) -def test_series___getitem__(scalars_dfs, index_col, key): - col_name = "float64_col" - scalars_df, scalars_pandas_df = scalars_dfs - scalars_df = scalars_df.set_index(index_col, drop=False) - scalars_pandas_df = scalars_pandas_df.set_index(index_col, drop=False) - bf_result = scalars_df[col_name][key] - pd_result = scalars_pandas_df[col_name][key] - pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) - - -@pytest.mark.parametrize( - ("key",), - ( - (-2,), - (-1,), - (0,), - (1,), - ), -) -def test_series___getitem___with_int_key(scalars_dfs, key): - col_name = "int64_too" - index_col = "string_col" - scalars_df, scalars_pandas_df = scalars_dfs - scalars_df = scalars_df.set_index(index_col, drop=False) - scalars_pandas_df = scalars_pandas_df.set_index(index_col, drop=False) - bf_result = scalars_df[col_name][key] - pd_result = scalars_pandas_df[col_name][key] - assert bf_result == pd_result - - -def test_series___getitem___with_default_index(scalars_dfs): - col_name = "float64_col" - key = 2 - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[col_name][key] - pd_result = scalars_pandas_df[col_name][key] - assert bf_result == pd_result - - -@pytest.mark.parametrize( - ("index_col", "key", "value"), - ( - ("int64_too", 2, 
"new_string_value"), - ("string_col", "Hello, World!", "updated_value"), - ("int64_too", 0, None), - ), -) -def test_series___setitem__(scalars_dfs, index_col, key, value): - col_name = "string_col" - scalars_df, scalars_pandas_df = scalars_dfs - scalars_df = scalars_df.set_index(index_col, drop=False) - scalars_pandas_df = scalars_pandas_df.set_index(index_col, drop=False) - - bf_series = scalars_df[col_name] - pd_series = scalars_pandas_df[col_name].copy() - - bf_series[key] = value - pd_series[key] = value - - pd.testing.assert_series_equal(bf_series.to_pandas(), pd_series) - - -@pytest.mark.parametrize( - ("key", "value"), - ( - (0, 999), - (1, 888), - (0, None), - (-2345, 777), - ), -) -def test_series___setitem___with_int_key_numeric(scalars_dfs, key, value): - col_name = "int64_col" - index_col = "int64_too" - scalars_df, scalars_pandas_df = scalars_dfs - scalars_df = scalars_df.set_index(index_col, drop=False) - scalars_pandas_df = scalars_pandas_df.set_index(index_col, drop=False) - - bf_series = scalars_df[col_name] - pd_series = scalars_pandas_df[col_name].copy() - - bf_series[key] = value - pd_series[key] = value - - pd.testing.assert_series_equal(bf_series.to_pandas(), pd_series) - - -def test_series___setitem___with_default_index(scalars_dfs): - col_name = "float64_col" - key = 2 - value = 123.456 - scalars_df, scalars_pandas_df = scalars_dfs - - bf_series = scalars_df[col_name] - pd_series = scalars_pandas_df[col_name].copy() - - bf_series[key] = value - pd_series[key] = value - - assert bf_series.to_pandas().iloc[key] == pd_series.iloc[key] - - -@pytest.mark.parametrize( - ("col_name",), - ( - ("float64_col",), - ("int64_too",), - ), -) -def test_abs(scalars_dfs, col_name): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[col_name].abs().to_pandas() - pd_result = scalars_pandas_df[col_name].abs() - - assert_series_equal(pd_result, bf_result) - - -@pytest.mark.parametrize( - ("col_name",), - ( - ("float64_col",), - 
("int64_too",), - ), -) -def test_series_pos(scalars_dfs, col_name): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = (+scalars_df[col_name]).to_pandas() - pd_result = +scalars_pandas_df[col_name] - - assert_series_equal(pd_result, bf_result) - - -@pytest.mark.parametrize( - ("col_name",), - ( - ("float64_col",), - ("int64_too",), - ), -) -def test_series_neg(scalars_dfs, col_name): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = (-scalars_df[col_name]).to_pandas() - pd_result = -scalars_pandas_df[col_name] - - assert_series_equal(pd_result, bf_result) - - -@pytest.mark.parametrize( - ("col_name",), - ( - ("bool_col",), - ("int64_col",), - ), -) -def test_series_invert(scalars_dfs, col_name): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = (~scalars_df[col_name]).to_pandas() - pd_result = ~scalars_pandas_df[col_name] - - assert_series_equal(pd_result, bf_result) - - -def test_fillna(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "string_col" - bf_result = scalars_df[col_name].fillna("Missing").to_pandas() - pd_result = scalars_pandas_df[col_name].fillna("Missing") - assert_series_equal( - pd_result, - bf_result, - ) - - -def test_series_replace_scalar_scalar(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "string_col" - bf_result = ( - scalars_df[col_name].replace("Hello, World!", "Howdy, Planet!").to_pandas() - ) - pd_result = scalars_pandas_df[col_name].replace("Hello, World!", "Howdy, Planet!") - - pd.testing.assert_series_equal( - pd_result, - bf_result, - ) - - -def test_series_replace_list_scalar(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "string_col" - bf_result = ( - scalars_df[col_name] - .replace(["Hello, World!", "T"], "Howdy, Planet!") - .to_pandas() - ) - pd_result = scalars_pandas_df[col_name].replace( - ["Hello, World!", "T"], "Howdy, Planet!" 
- ) - - pd.testing.assert_series_equal( - pd_result, - bf_result, - ) - - -@pytest.mark.parametrize( - ("replacement_dict",), - (({},),), - ids=[ - "empty", - ], -) -def test_series_replace_dict(scalars_dfs, replacement_dict): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "string_col" - bf_result = scalars_df[col_name].replace(replacement_dict).to_pandas() - pd_result = scalars_pandas_df[col_name].replace(replacement_dict) - - pd.testing.assert_series_equal( - pd_result, - bf_result, - ) - - -@pytest.mark.parametrize( - ("method",), - ( - ("linear",), - ("values",), - ("slinear",), - ("nearest",), - ("zero",), - ("pad",), - ), -) -def test_series_interpolate(method): - pytest.importorskip("scipy") - - values = [None, 1, 2, None, None, 16, None] - index = [-3.2, 11.4, 3.56, 4, 4.32, 5.55, 76.8] - pd_series = pd.Series(values, index) - bf_series = series.Series(pd_series) - - # Pandas can only interpolate on "float64" columns - # https://github.com/pandas-dev/pandas/issues/40252 - pd_result = pd_series.astype("float64").interpolate(method=method) - bf_result = bf_series.interpolate(method=method).to_pandas() - - # pd uses non-null types, while bf uses nullable types - pd.testing.assert_series_equal( - pd_result, - bf_result, - check_index_type=False, - check_dtype=False, - ) - - -@pytest.mark.parametrize( - ("ignore_index",), - ( - (True,), - (False,), - ), -) -def test_series_dropna(scalars_dfs, ignore_index): - if pd.__version__.startswith("1."): - pytest.skip("ignore_index parameter not supported in pandas 1.x.") - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "string_col" - bf_result = scalars_df[col_name].dropna(ignore_index=ignore_index).to_pandas() - pd_result = scalars_pandas_df[col_name].dropna(ignore_index=ignore_index) - pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) - - -@pytest.mark.parametrize( - ("agg",), - ( - ("sum",), - ("size",), - ), -) -def test_series_agg_single_string(scalars_dfs, agg): - 
scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df["int64_col"].agg(agg) - pd_result = scalars_pandas_df["int64_col"].agg(agg) - assert math.isclose(pd_result, bf_result) - - -def test_series_agg_multi_string(scalars_dfs): - aggregations = [ - "sum", - "mean", - "std", - "var", - "min", - "max", - "nunique", - "count", - "size", - ] - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df["int64_col"].agg(aggregations).to_pandas() - pd_result = scalars_pandas_df["int64_col"].agg(aggregations) - - # Pandas may produce narrower numeric types, but bigframes always produces Float64 - pd_result = pd_result.astype("Float64") - - pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) - - -@pytest.mark.parametrize( - ("col_name",), - ( - ("string_col",), - ("int64_col",), - ), -) -def test_max(scalars_dfs, col_name): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[col_name].max() - pd_result = scalars_pandas_df[col_name].max() - assert pd_result == bf_result - - -@pytest.mark.parametrize( - ("col_name",), - ( - ("string_col",), - ("int64_col",), - ), -) -def test_min(scalars_dfs, col_name): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[col_name].min() - pd_result = scalars_pandas_df[col_name].min() - assert pd_result == bf_result - - -@pytest.mark.parametrize( - ("col_name",), - ( - ("float64_col",), - ("int64_col",), - ), -) -def test_std(scalars_dfs, col_name): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[col_name].std() - pd_result = scalars_pandas_df[col_name].std() - assert math.isclose(pd_result, bf_result) - - -@pytest.mark.parametrize( - ("col_name",), - ( - ("float64_col",), - ("int64_col",), - ), -) -def test_kurt(scalars_dfs, col_name): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[col_name].kurt() - pd_result = scalars_pandas_df[col_name].kurt() - assert math.isclose(pd_result, bf_result) - - 
-@pytest.mark.parametrize( - ("col_name",), - ( - ("float64_col",), - ("int64_col",), - ), -) -def test_skew(scalars_dfs, col_name): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[col_name].skew() - pd_result = scalars_pandas_df[col_name].skew() - assert math.isclose(pd_result, bf_result) - - -def test_skew_undefined(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df["int64_col"].iloc[:2].skew() - pd_result = scalars_pandas_df["int64_col"].iloc[:2].skew() - # both should be pd.NA - assert pd_result is bf_result - - -def test_kurt_undefined(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df["int64_col"].iloc[:3].kurt() - pd_result = scalars_pandas_df["int64_col"].iloc[:3].kurt() - # both should be pd.NA - assert pd_result is bf_result - - -@pytest.mark.parametrize( - ("col_name",), - ( - ("float64_col",), - ("int64_col",), - ), -) -def test_var(scalars_dfs, col_name): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[col_name].var() - pd_result = scalars_pandas_df[col_name].var() - assert math.isclose(pd_result, bf_result) - - -@pytest.mark.parametrize( - ("col_name",), - ( - ("bool_col",), - ("int64_col",), - ), -) -def test_mode_stat(scalars_df_index, scalars_pandas_df_index, col_name): - bf_result = scalars_df_index[col_name].mode().to_pandas() - pd_result = scalars_pandas_df_index[col_name].mode() - - ## Mode implicitly resets index, and bigframes default indices use nullable Int64 - pd_result.index = pd_result.index.astype("Int64") - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -@pytest.mark.parametrize( - ("operator"), - [ - (lambda x, y: x + y), - (lambda x, y: x - y), - (lambda x, y: x * y), - (lambda x, y: x / y), - (lambda x, y: x // y), - (lambda x, y: x < y), - (lambda x, y: x > y), - (lambda x, y: x <= y), - (lambda x, y: x >= y), - ], - ids=[ - "add", - "subtract", - "multiply", - "divide", - "floordivide", - 
"less_than", - "greater_than", - "less_than_equal", - "greater_than_equal", - ], -) -@pytest.mark.parametrize( - ("other_scalar"), - [ - -1, - 0, - 14, - # TODO(tswast): Support pd.NA, - ], -) -@pytest.mark.parametrize(("reverse_operands"), [True, False]) -def test_series_int_int_operators_scalar( - scalars_dfs, operator, other_scalar, reverse_operands -): - scalars_df, scalars_pandas_df = scalars_dfs - - maybe_reversed_op = (lambda x, y: operator(y, x)) if reverse_operands else operator - - bf_result = maybe_reversed_op(scalars_df["int64_col"], other_scalar).to_pandas() - pd_result = maybe_reversed_op(scalars_pandas_df["int64_col"], other_scalar) - - assert_series_equal(pd_result, bf_result) - - -def test_series_pow_scalar(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = (scalars_df["int64_col"] ** 2).to_pandas() - pd_result = scalars_pandas_df["int64_col"] ** 2 - - assert_series_equal(pd_result, bf_result) - - -def test_series_pow_scalar_reverse(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = (0.8 ** scalars_df["int64_col"]).to_pandas() - pd_result = 0.8 ** scalars_pandas_df["int64_col"] - - assert_series_equal(pd_result, bf_result) - - -@pytest.mark.parametrize( - ("operator"), - [ - (lambda x, y: x & y), - (lambda x, y: x | y), - (lambda x, y: x ^ y), - ], - ids=[ - "and", - "or", - "xor", - ], -) -@pytest.mark.parametrize( - ("other_scalar"), - [ - True, - False, - pytest.param( - pd.NA, - marks=[ - pytest.mark.skip( - reason="https://github.com/pola-rs/polars/issues/24809" - ) - ], - id="NULL", - ), - ], -) -@pytest.mark.parametrize(("reverse_operands"), [True, False]) -def test_series_bool_bool_operators_scalar( - scalars_dfs, operator, other_scalar, reverse_operands -): - scalars_df, scalars_pandas_df = scalars_dfs - - maybe_reversed_op = (lambda x, y: operator(y, x)) if reverse_operands else operator - - bf_result = maybe_reversed_op(scalars_df["bool_col"], other_scalar).to_pandas() - pd_result = 
maybe_reversed_op(scalars_pandas_df["bool_col"], other_scalar) - - assert_series_equal(pd_result.astype(pd.BooleanDtype()), bf_result) - - -@pytest.mark.parametrize( - ("operator"), - [ - (lambda x, y: x + y), - (lambda x, y: x - y), - (lambda x, y: x * y), - (lambda x, y: x / y), - (lambda x, y: x < y), - (lambda x, y: x > y), - (lambda x, y: x <= y), - (lambda x, y: x >= y), - (lambda x, y: x % y), - (lambda x, y: x // y), - (lambda x, y: x & y), - (lambda x, y: x | y), - (lambda x, y: x ^ y), - ], - ids=[ - "add", - "subtract", - "multiply", - "divide", - "less_than", - "greater_than", - "less_than_equal", - "greater_than_equal", - "modulo", - "floordivide", - "bitwise_and", - "bitwise_or", - "bitwise_xor", - ], -) -def test_series_int_int_operators_series(scalars_dfs, operator): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = operator(scalars_df["int64_col"], scalars_df["int64_too"]).to_pandas() - pd_result = operator(scalars_pandas_df["int64_col"], scalars_pandas_df["int64_too"]) - assert_series_equal(pd_result, bf_result) - - -@pytest.mark.parametrize( - ("col_x",), - [ - ("int64_col",), - ("int64_too",), - ("float64_col",), - ], -) -@pytest.mark.parametrize( - ("col_y",), - [ - ("int64_col",), - ("int64_too",), - ("float64_col",), - ], -) -@pytest.mark.parametrize( - ("method",), - [ - ("mod",), - ("rmod",), - ], -) -def test_mods(scalars_dfs, col_x, col_y, method): - scalars_df, scalars_pandas_df = scalars_dfs - x_bf = scalars_df[col_x] - y_bf = scalars_df[col_y] - bf_series = getattr(x_bf, method)(y_bf) - # BigQuery's mod functions return [BIG]NUMERIC values unless both arguments are integers. 
- # https://cloud.google.com/bigquery/docs/reference/standard-sql/mathematical_functions#mod - if x_bf.dtype == pd.Int64Dtype() and y_bf.dtype == pd.Int64Dtype(): - bf_result = bf_series.to_pandas() - else: - bf_result = bf_series.astype("Float64").to_pandas() - pd_result = getattr(scalars_pandas_df[col_x], method)(scalars_pandas_df[col_y]) - pd.testing.assert_series_equal(pd_result, bf_result) - - -# We work around a pandas bug that doesn't handle correlating nullable dtypes by doing this -# manually with dumb self-correlation instead of parameterized as test_mods is above. -def test_series_corr(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df["int64_too"].corr(scalars_df["int64_too"]) - pd_result = ( - scalars_pandas_df["int64_too"] - .astype("int64") - .corr(scalars_pandas_df["int64_too"].astype("int64")) - ) - assert math.isclose(pd_result, bf_result) - - -def test_series_autocorr(scalars_dfs): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df["float64_col"].autocorr(2) - pd_result = scalars_pandas_df["float64_col"].autocorr(2) - assert math.isclose(pd_result, bf_result) - - -def test_series_cov(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df["int64_too"].cov(scalars_df["int64_too"]) - pd_result = ( - scalars_pandas_df["int64_too"] - .astype("int64") - .cov(scalars_pandas_df["int64_too"].astype("int64")) - ) - assert math.isclose(pd_result, bf_result) - - -@pytest.mark.parametrize( - ("col_x",), - [ - ("int64_col",), - ("float64_col",), - ], -) -@pytest.mark.parametrize( - ("col_y",), - [ - ("int64_col",), - ("float64_col",), - ], -) -@pytest.mark.parametrize( - ("method",), - [ - ("divmod",), - ("rdivmod",), - ], -) -def test_divmods_series(scalars_dfs, col_x, col_y, method): - scalars_df, scalars_pandas_df = scalars_dfs - bf_div_result, 
bf_mod_result = getattr(scalars_df[col_x], method)(scalars_df[col_y]) - pd_div_result, pd_mod_result = getattr(scalars_pandas_df[col_x], method)( - scalars_pandas_df[col_y] - ) - # BigQuery's mod functions return NUMERIC values for non-INT64 inputs. - if bf_div_result.dtype == pd.Int64Dtype(): - pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) - else: - pd.testing.assert_series_equal( - pd_div_result, bf_div_result.astype("Float64").to_pandas() - ) - - if bf_mod_result.dtype == pd.Int64Dtype(): - pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) - else: - pd.testing.assert_series_equal( - pd_mod_result, bf_mod_result.astype("Float64").to_pandas() - ) - - -@pytest.mark.parametrize( - ("col_x",), - [ - ("int64_col",), - ("float64_col",), - ], -) -@pytest.mark.parametrize( - ("other",), - [ - (-1000,), - (678,), - ], -) -@pytest.mark.parametrize( - ("method",), - [ - ("divmod",), - ("rdivmod",), - ], -) -def test_divmods_scalars(scalars_dfs, col_x, other, method): - scalars_df, scalars_pandas_df = scalars_dfs - bf_div_result, bf_mod_result = getattr(scalars_df[col_x], method)(other) - pd_div_result, pd_mod_result = getattr(scalars_pandas_df[col_x], method)(other) - # BigQuery's mod functions return NUMERIC values for non-INT64 inputs. 
- if bf_div_result.dtype == pd.Int64Dtype(): - pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) - else: - pd.testing.assert_series_equal( - pd_div_result, bf_div_result.astype("Float64").to_pandas() - ) - - if bf_mod_result.dtype == pd.Int64Dtype(): - pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) - else: - pd.testing.assert_series_equal( - pd_mod_result, bf_mod_result.astype("Float64").to_pandas() - ) - - -@pytest.mark.parametrize( - ("other",), - [ - (3,), - (-6.2,), - ], -) -def test_series_add_scalar(scalars_dfs, other): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = (scalars_df["float64_col"] + other).to_pandas() - pd_result = scalars_pandas_df["float64_col"] + other - - assert_series_equal(pd_result, bf_result) - - -@pytest.mark.parametrize( - ("left_col", "right_col"), - [ - ("float64_col", "float64_col"), - ("int64_col", "float64_col"), - ("int64_col", "int64_too"), - ], -) -def test_series_add_bigframes_series(scalars_dfs, left_col, right_col): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = (scalars_df[left_col] + scalars_df[right_col]).to_pandas() - pd_result = scalars_pandas_df[left_col] + scalars_pandas_df[right_col] - - assert_series_equal(pd_result, bf_result) - - -@pytest.mark.parametrize( - ("left_col", "right_col", "righter_col"), - [ - ("float64_col", "float64_col", "float64_col"), - ("int64_col", "int64_col", "int64_col"), - ], -) -def test_series_add_bigframes_series_nested( - scalars_dfs, left_col, right_col, righter_col -): - """Test that we can correctly add multiple times.""" - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = ( - (scalars_df[left_col] + scalars_df[right_col]) + scalars_df[righter_col] - ).to_pandas() - pd_result = ( - scalars_pandas_df[left_col] + scalars_pandas_df[right_col] - ) + scalars_pandas_df[righter_col] - - assert_series_equal(pd_result, bf_result) - - -def test_series_add_different_table_default_index( - scalars_df_default_index, - 
scalars_df_2_default_index, -): - bf_result = ( - scalars_df_default_index["float64_col"] - + scalars_df_2_default_index["float64_col"] - ).to_pandas() - pd_result = ( - # Default index may not have a well defined order, but it should at - # least be consistent across to_pandas() calls. - scalars_df_default_index["float64_col"].to_pandas() - + scalars_df_2_default_index["float64_col"].to_pandas() - ) - # TODO(swast): Can remove sort_index() when there's default ordering. - pd.testing.assert_series_equal(bf_result.sort_index(), pd_result.sort_index()) - - -def test_series_add_different_table_with_index( - scalars_df_index, scalars_df_2_index, scalars_pandas_df_index -): - scalars_pandas_df = scalars_pandas_df_index - bf_result = scalars_df_index["float64_col"] + scalars_df_2_index["int64_col"] - # When index values are unique, we can emulate with values from the same - # DataFrame. - pd_result = scalars_pandas_df["float64_col"] + scalars_pandas_df["int64_col"] - pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) - - -def test_reset_index_drop(scalars_df_index, scalars_pandas_df_index): - scalars_pandas_df = scalars_pandas_df_index - bf_result = ( - scalars_df_index["float64_col"] - .sort_index(ascending=False) - .reset_index(drop=True) - ).iloc[::2] - pd_result = ( - scalars_pandas_df["float64_col"] - .sort_index(ascending=False) - .reset_index(drop=True) - ).iloc[::2] - - # BigQuery DataFrames default indices use nullable Int64 always - pd_result.index = pd_result.index.astype("Int64") - - pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) - - -def test_series_reset_index_allow_duplicates(scalars_df_index, scalars_pandas_df_index): - bf_series = scalars_df_index["int64_col"].copy() - bf_series.index.name = "int64_col" - df = bf_series.reset_index(allow_duplicates=True, drop=False) - assert df.index.name is None - - bf_result = df.to_pandas() - - pd_series = scalars_pandas_df_index["int64_col"].copy() - pd_series.index.name = "int64_col" 
- pd_result = pd_series.reset_index(allow_duplicates=True, drop=False) - - # Pandas uses int64 instead of Int64 (nullable) dtype. - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - - # reset_index should maintain the original ordering. - pd.testing.assert_frame_equal(bf_result, pd_result) - - -def test_series_reset_index_duplicates_error(scalars_df_index): - scalars_df_index = scalars_df_index["int64_col"].copy() - scalars_df_index.index.name = "int64_col" - with pytest.raises(ValueError): - scalars_df_index.reset_index(allow_duplicates=False, drop=False) - - -def test_series_reset_index_inplace(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.sort_index(ascending=False)["float64_col"] - bf_result.reset_index(drop=True, inplace=True) - pd_result = scalars_pandas_df_index.sort_index(ascending=False)["float64_col"] - pd_result.reset_index(drop=True, inplace=True) - - # BigQuery DataFrames default indices use nullable Int64 always - pd_result.index = pd_result.index.astype("Int64") - - pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) - - -@pytest.mark.parametrize( - ("name",), - [ - ("some_name",), - (None,), - ], -) -def test_reset_index_no_drop(scalars_df_index, scalars_pandas_df_index, name): - scalars_pandas_df = scalars_pandas_df_index - kw_args = {"name": name} if name else {} - bf_result = ( - scalars_df_index["float64_col"] - .sort_index(ascending=False) - .reset_index(drop=False, **kw_args) - ) - pd_result = ( - scalars_pandas_df["float64_col"] - .sort_index(ascending=False) - .reset_index(drop=False, **kw_args) - ) - - # BigQuery DataFrames default indices use nullable Int64 always - pd_result.index = pd_result.index.astype("Int64") - - pd.testing.assert_frame_equal(bf_result.to_pandas(), pd_result) - - -def test_copy(scalars_df_index, scalars_pandas_df_index): - col_name = "float64_col" - # Expect mutation on original not to effect_copy - bf_series = scalars_df_index[col_name].copy() - bf_copy = 
bf_series.copy() - bf_copy.loc[0] = 5.6 - bf_series.loc[0] = 3.4 - - pd_series = scalars_pandas_df_index[col_name].copy() - pd_copy = pd_series.copy() - pd_copy.loc[0] = 5.6 - pd_series.loc[0] = 3.4 - - assert bf_copy.to_pandas().loc[0] != bf_series.to_pandas().loc[0] - pd.testing.assert_series_equal(bf_copy.to_pandas(), pd_copy) - - -def test_isin_raise_error(scalars_df_index, scalars_pandas_df_index): - col_name = "int64_too" - with pytest.raises(TypeError): - scalars_df_index[col_name].isin("whatever").to_pandas() - - -@pytest.mark.parametrize( - ( - "col_name", - "test_set", - ), - [ - ( - "int64_col", - [314159, 2.0, 3, pd.NA], - ), - ( - "int64_col", - [2, 55555, 4], - ), - ( - "float64_col", - [-123.456, 1.25, pd.NA], - ), - ( - "int64_too", - [1, 2, pd.NA], - ), - ( - "string_col", - ["Hello, World!", "Hi", "こんにちは"], - ), - ], -) -def test_isin(scalars_dfs, col_name, test_set): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[col_name].isin(test_set).to_pandas() - pd_result = scalars_pandas_df[col_name].isin(test_set).astype("boolean") - pd.testing.assert_series_equal( - pd_result, - bf_result, - ) - - -@pytest.mark.parametrize( - ( - "col_name", - "test_set", - ), - [ - ( - "int64_col", - [314159, 2.0, 3, pd.NA], - ), - ( - "int64_col", - [2, 55555, 4], - ), - ( - "float64_col", - [-123.456, 1.25, pd.NA], - ), - ( - "int64_too", - [1, 2, pd.NA], - ), - ( - "string_col", - ["Hello, World!", "Hi", "こんにちは"], - ), - ], -) -def test_isin_bigframes_values(scalars_dfs, col_name, test_set, session): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = ( - scalars_df[col_name].isin(series.Series(test_set, session=session)).to_pandas() - ) - pd_result = scalars_pandas_df[col_name].isin(test_set).astype("boolean") - pd.testing.assert_series_equal( - pd_result, - bf_result, - ) - - -def test_isin_bigframes_index(scalars_dfs, session): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = ( - scalars_df["string_col"] - 
.isin(bigframes.pandas.Index(["Hello, World!", "Hi", "こんにちは"], session=session)) - .to_pandas() - ) - pd_result = ( - scalars_pandas_df["string_col"] - .isin(pd.Index(["Hello, World!", "Hi", "こんにちは"])) - .astype("boolean") - ) - pd.testing.assert_series_equal( - pd_result, - bf_result, - ) - - -@pytest.mark.skip(reason="fixture 'scalars_dfs_maybe_ordered' not found") -@pytest.mark.parametrize( - ( - "col_name", - "test_set", - ), - [ - ( - "int64_col", - [314159, 2.0, 3, pd.NA], - ), - ( - "int64_col", - [2, 55555, 4], - ), - ( - "float64_col", - [-123.456, 1.25, pd.NA], - ), - ( - "int64_too", - [1, 2, pd.NA], - ), - ( - "string_col", - ["Hello, World!", "Hi", "こんにちは"], - ), - ], -) -def test_isin_bigframes_values_as_predicate( - scalars_dfs_maybe_ordered, col_name, test_set -): - scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered - bf_predicate = scalars_df[col_name].isin( - series.Series(test_set, session=scalars_df._session) - ) - bf_result = scalars_df[bf_predicate].to_pandas() - pd_predicate = scalars_pandas_df[col_name].isin(test_set) - pd_result = scalars_pandas_df[pd_predicate] - - pd.testing.assert_frame_equal( - pd_result.reset_index(), - bf_result.reset_index(), - ) - - -def test_isnull(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "float64_col" - bf_series = scalars_df[col_name].isnull().to_pandas() - pd_series = scalars_pandas_df[col_name].isnull() - - # One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but - # the `pd_series.dtype` is `bool`. - assert_series_equal(pd_series.astype(pd.BooleanDtype()), bf_series) - - -def test_notnull(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "string_col" - bf_series = scalars_df[col_name].notnull().to_pandas() - pd_series = scalars_pandas_df[col_name].notnull() - - # One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but - # the `pd_series.dtype` is `bool`. 
- assert_series_equal(pd_series.astype(pd.BooleanDtype()), bf_series) - - -def test_eq_scalar(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "int64_too" - bf_result = scalars_df[col_name].eq(0).to_pandas() - pd_result = scalars_pandas_df[col_name].eq(0) - - assert_series_equal(pd_result, bf_result) - - -def test_eq_wider_type_scalar(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "int64_too" - bf_result = scalars_df[col_name].eq(1.0).to_pandas() - pd_result = scalars_pandas_df[col_name].eq(1.0) - - assert_series_equal(pd_result, bf_result) - - -def test_ne_scalar(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "int64_too" - bf_result = (scalars_df[col_name] != 0).to_pandas() - pd_result = scalars_pandas_df[col_name] != 0 - - assert_series_equal(pd_result, bf_result) - - -def test_eq_int_scalar(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "int64_too" - bf_result = (scalars_df[col_name] == 0).to_pandas() - pd_result = scalars_pandas_df[col_name] == 0 - - assert_series_equal(pd_result, bf_result) - - -@pytest.mark.parametrize( - ("col_name",), - ( - ("string_col",), - ("float64_col",), - ("int64_too",), - ), -) -def test_eq_same_type_series(scalars_dfs, col_name): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "string_col" - bf_result = (scalars_df[col_name] == scalars_df[col_name]).to_pandas() - pd_result = scalars_pandas_df[col_name] == scalars_pandas_df[col_name] - - # One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but - # the `pd_series.dtype` is `bool`. 
- assert_series_equal(pd_result.astype(pd.BooleanDtype()), bf_result) - - -def test_loc_setitem_cell(scalars_df_index, scalars_pandas_df_index): - bf_original = scalars_df_index["string_col"] - bf_series = scalars_df_index["string_col"] - pd_original = scalars_pandas_df_index["string_col"] - pd_series = scalars_pandas_df_index["string_col"].copy() - bf_series.loc[2] = "This value isn't in the test data." - pd_series.loc[2] = "This value isn't in the test data." - bf_result = bf_series.to_pandas() - pd_result = pd_series - pd.testing.assert_series_equal(bf_result, pd_result) - # Per Copy-on-Write semantics, other references to the original DataFrame - # should remain unchanged. - pd.testing.assert_series_equal(bf_original.to_pandas(), pd_original) - - -def test_at_setitem_row_label_scalar(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_series = scalars_df["int64_col"] - pd_series = scalars_pandas_df["int64_col"].copy() - bf_series.at[1] = 1000 - pd_series.at[1] = 1000 - bf_result = bf_series.to_pandas() - pd_result = pd_series.astype("Int64") - pd.testing.assert_series_equal(bf_result, pd_result) - - -def test_ne_obj_series(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "string_col" - bf_result = (scalars_df[col_name] != scalars_df[col_name]).to_pandas() - pd_result = scalars_pandas_df[col_name] != scalars_pandas_df[col_name] - - # One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but - # the `pd_series.dtype` is `bool`. 
- assert_series_equal(pd_result.astype(pd.BooleanDtype()), bf_result) - - -def test_indexing_using_unselected_series(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "string_col" - bf_result = scalars_df[col_name][scalars_df["int64_too"].eq(0)].to_pandas() - pd_result = scalars_pandas_df[col_name][scalars_pandas_df["int64_too"].eq(0)] - - assert_series_equal( - pd_result, - bf_result, - ) - - -def test_indexing_using_selected_series(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "string_col" - bf_result = scalars_df[col_name][ - scalars_df["string_col"].eq("Hello, World!") - ].to_pandas() - pd_result = scalars_pandas_df[col_name][ - scalars_pandas_df["string_col"].eq("Hello, World!") - ] - - assert_series_equal( - pd_result, - bf_result, - ) - - -@pytest.mark.parametrize( - ("indices"), - [ - ([1, 3, 5]), - ([5, -3, -5, -6]), - ([-2, -4, -6]), - ], -) -def test_take(scalars_dfs, indices): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df.take(indices).to_pandas() - pd_result = scalars_pandas_df.take(indices) - - assert_pandas_df_equal(bf_result, pd_result) - - -def test_nested_filter(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - string_col = scalars_df["string_col"] - int64_too = scalars_df["int64_too"] - bool_col = scalars_df["bool_col"] == bool( - True - ) # Convert from nullable bool to nonnullable bool usable as indexer - bf_result = string_col[int64_too == 0][~bool_col].to_pandas() - - pd_string_col = scalars_pandas_df["string_col"] - pd_int64_too = scalars_pandas_df["int64_too"] - pd_bool_col = scalars_pandas_df["bool_col"] == bool( - True - ) # Convert from nullable bool to nonnullable bool usable as indexer - pd_result = pd_string_col[pd_int64_too == 0][~pd_bool_col] - - assert_series_equal( - pd_result, - bf_result, - ) - - -def test_binop_opposite_filters(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - int64_col1 = scalars_df["int64_col"] - int64_col2 = 
scalars_df["int64_col"] - bool_col = scalars_df["bool_col"] - bf_result = (int64_col1[bool_col] + int64_col2[bool_col.__invert__()]).to_pandas() - - pd_int64_col1 = scalars_pandas_df["int64_col"] - pd_int64_col2 = scalars_pandas_df["int64_col"] - pd_bool_col = scalars_pandas_df["bool_col"] - pd_result = pd_int64_col1[pd_bool_col] + pd_int64_col2[pd_bool_col.__invert__()] - - # Passes with ignore_order=False only with some dependency sets - # TODO: Determine desired behavior and make test more strict - assert_series_equal(bf_result, pd_result, ignore_order=True) - - -def test_binop_left_filtered(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - int64_col = scalars_df["int64_col"] - float64_col = scalars_df["float64_col"] - bool_col = scalars_df["bool_col"] - bf_result = (int64_col[bool_col] + float64_col).to_pandas() - - pd_int64_col = scalars_pandas_df["int64_col"] - pd_float64_col = scalars_pandas_df["float64_col"] - pd_bool_col = scalars_pandas_df["bool_col"] - pd_result = pd_int64_col[pd_bool_col] + pd_float64_col - - # Passes with ignore_order=False only with some dependency sets - # TODO: Determine desired behavior and make test more strict - assert_series_equal(bf_result, pd_result, ignore_order=True) - - -def test_binop_right_filtered(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - int64_col = scalars_df["int64_col"] - float64_col = scalars_df["float64_col"] - bool_col = scalars_df["bool_col"] - bf_result = (float64_col + int64_col[bool_col]).to_pandas() - - pd_int64_col = scalars_pandas_df["int64_col"] - pd_float64_col = scalars_pandas_df["float64_col"] - pd_bool_col = scalars_pandas_df["bool_col"] - pd_result = pd_float64_col + pd_int64_col[pd_bool_col] - - assert_series_equal( - bf_result, - pd_result, - ) - - -@pytest.mark.parametrize( - ("other",), - [ - ([-1.4, 2.3, None],), - (pd.Index([-1.4, 2.3, None]),), - (pd.Series([-1.4, 2.3, None], index=[44, 2, 1]),), - ], -) -def test_series_binop_w_other_types(scalars_dfs, other): - 
# TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = (scalars_df["int64_col"].head(3) + other).to_pandas() - pd_result = scalars_pandas_df["int64_col"].head(3) + other - - assert_series_equal( - bf_result, - pd_result, - ) - - -@pytest.mark.parametrize( - ("other",), - [ - ([-1.4, 2.3, None],), - (pd.Index([-1.4, 2.3, None]),), - (pd.Series([-1.4, 2.3, None], index=[44, 2, 1]),), - ], -) -def test_series_reverse_binop_w_other_types(scalars_dfs, other): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = (other + scalars_df["int64_col"].head(3)).to_pandas() - pd_result = other + scalars_pandas_df["int64_col"].head(3) - - assert_series_equal( - bf_result, - pd_result, - ) - - -def test_series_combine_first(scalars_dfs): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df, scalars_pandas_df = scalars_dfs - int64_col = scalars_df["int64_col"].head(7) - float64_col = scalars_df["float64_col"].tail(7) - bf_result = int64_col.combine_first(float64_col).to_pandas() - - pd_int64_col = scalars_pandas_df["int64_col"].head(7) - pd_float64_col = scalars_pandas_df["float64_col"].tail(7) - pd_result = pd_int64_col.combine_first(pd_float64_col) - - assert_series_equal( - bf_result, - pd_result, - ) - - -def test_series_update(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - int64_col = scalars_df["int64_col"].head(7) - float64_col = scalars_df["float64_col"].tail(7).copy() - float64_col.update(int64_col) - - pd_int64_col = scalars_pandas_df["int64_col"].head(7) - pd_float64_col = scalars_pandas_df["float64_col"].tail(7).copy() - pd_float64_col.update(pd_int64_col) - - assert_series_equal( - float64_col.to_pandas(), - pd_float64_col, - ) 
- - -def test_mean(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "int64_col" - bf_result = scalars_df[col_name].mean() - pd_result = scalars_pandas_df[col_name].mean() - assert math.isclose(pd_result, bf_result) - - -@pytest.mark.parametrize( - ("col_name"), - [ - pytest.param( - "int64_col", - marks=[ - pytest.mark.skip( - reason="pyarrow.lib.ArrowInvalid: Float value 27778.500000 was truncated converting to int64" - ) - ], - ), - # Non-numeric column - pytest.param( - "bytes_col", - marks=[ - pytest.mark.skip( - reason="polars.exceptions.InvalidOperationError: `median` operation not supported for dtype `binary`" - ) - ], - ), - "date_col", - "datetime_col", - pytest.param( - "time_col", - marks=[ - pytest.mark.skip( - reason="pyarrow.lib.ArrowInvalid: Casting from time64[ns] to time64[us] would lose data: 42651538080500" - ) - ], - ), - "timestamp_col", - pytest.param( - "string_col", - marks=[ - pytest.mark.skip( - reason="polars.exceptions.InvalidOperationError: `median` operation not supported for dtype `str`" - ) - ], - ), - ], -) -def test_median(scalars_dfs, col_name): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[col_name].median(exact=False) - pd_max = scalars_pandas_df[col_name].max() - pd_min = scalars_pandas_df[col_name].min() - # Median is approximate, so just check for plausibility. 
- assert pd_min < bf_result < pd_max - - -def test_numeric_literal(scalars_dfs): - scalars_df, _ = scalars_dfs - col_name = "numeric_col" - assert scalars_df[col_name].dtype == pd.ArrowDtype(pa.decimal128(38, 9)) - bf_result = scalars_df[col_name] + 42 - assert bf_result.size == scalars_df[col_name].size - assert bf_result.dtype == pd.ArrowDtype(pa.decimal128(38, 9)) - - -def test_series_small_repr(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - col_name = "int64_col" - bf_series = scalars_df[col_name] - pd_series = scalars_pandas_df[col_name] - assert repr(bf_series) == pd_series.to_string(length=False, dtype=True, name=True) - - -def test_sum(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "int64_col" - bf_result = scalars_df[col_name].sum() - pd_result = scalars_pandas_df[col_name].sum() - assert pd_result == bf_result - - -def test_product(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "float64_col" - bf_result = scalars_df[col_name].product() - pd_result = scalars_pandas_df[col_name].product() - assert math.isclose(pd_result, bf_result) - - -def test_cumprod(scalars_dfs): - if pd.__version__.startswith("1."): - pytest.skip("Series.cumprod NA mask are different in pandas 1.x.") - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "float64_col" - bf_result = scalars_df[col_name].cumprod() - pd_result = scalars_pandas_df[col_name].cumprod() - pd.testing.assert_series_equal( - pd_result, - bf_result.to_pandas(), - ) - - -def test_count(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "int64_col" - bf_result = scalars_df[col_name].count() - pd_result = scalars_pandas_df[col_name].count() - assert pd_result == bf_result - - -def test_nunique(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "int64_col" - bf_result = (scalars_df[col_name] % 3).nunique() - pd_result = (scalars_pandas_df[col_name] % 3).nunique() - assert pd_result == bf_result - - 
-def test_all(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "int64_col" - bf_result = scalars_df[col_name].all() - pd_result = scalars_pandas_df[col_name].all() - assert pd_result == bf_result - - -def test_any(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "int64_col" - bf_result = scalars_df[col_name].any() - pd_result = scalars_pandas_df[col_name].any() - assert pd_result == bf_result - - -def test_groupby_sum(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "int64_too" - bf_series = ( - scalars_df[col_name] - .groupby([scalars_df["bool_col"], ~scalars_df["bool_col"]]) - .sum() - ) - pd_series = ( - scalars_pandas_df[col_name] - .groupby([scalars_pandas_df["bool_col"], ~scalars_pandas_df["bool_col"]]) - .sum() - ) - # TODO(swast): Update groupby to use index based on group by key(s). - bf_result = bf_series.to_pandas() - assert_series_equal( - pd_series, - bf_result, - check_exact=False, - ) - - -def test_groupby_std(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "int64_too" - bf_series = scalars_df[col_name].groupby(scalars_df["string_col"]).std() - pd_series = ( - scalars_pandas_df[col_name] - .groupby(scalars_pandas_df["string_col"]) - .std() - .astype(pd.Float64Dtype()) - ) - bf_result = bf_series.to_pandas() - assert_series_equal( - pd_series, - bf_result, - check_exact=False, - ) - - -def test_groupby_var(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "int64_too" - bf_series = scalars_df[col_name].groupby(scalars_df["string_col"]).var() - pd_series = ( - scalars_pandas_df[col_name].groupby(scalars_pandas_df["string_col"]).var() - ) - bf_result = bf_series.to_pandas() - assert_series_equal( - pd_series, - bf_result, - check_exact=False, - ) - - -def test_groupby_level_sum(scalars_dfs): - # TODO(tbergeron): Use a non-unique index once that becomes possible in tests - scalars_df, scalars_pandas_df = scalars_dfs - col_name = 
"int64_too" - - bf_series = scalars_df[col_name].groupby(level=0).sum() - pd_series = scalars_pandas_df[col_name].groupby(level=0).sum() - # TODO(swast): Update groupby to use index based on group by key(s). - pd.testing.assert_series_equal( - pd_series.sort_index(), - bf_series.to_pandas().sort_index(), - ) - - -def test_groupby_level_list_sum(scalars_dfs): - # TODO(tbergeron): Use a non-unique index once that becomes possible in tests - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "int64_too" - - bf_series = scalars_df[col_name].groupby(level=["rowindex"]).sum() - pd_series = scalars_pandas_df[col_name].groupby(level=["rowindex"]).sum() - # TODO(swast): Update groupby to use index based on group by key(s). - pd.testing.assert_series_equal( - pd_series.sort_index(), - bf_series.to_pandas().sort_index(), - ) - - -def test_groupby_mean(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "int64_too" - bf_series = ( - scalars_df[col_name].groupby(scalars_df["string_col"], dropna=False).mean() - ) - pd_series = ( - scalars_pandas_df[col_name] - .groupby(scalars_pandas_df["string_col"], dropna=False) - .mean() - ) - # TODO(swast): Update groupby to use index based on group by key(s). - bf_result = bf_series.to_pandas() - assert_series_equal( - pd_series, - bf_result, - ) - - -@pytest.mark.skip( - reason="Aggregate op QuantileOp(q=0.5, should_floor_result=False) not yet supported in polars engine." 
-) -def test_groupby_median_exact(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "int64_too" - bf_result = ( - scalars_df[col_name].groupby(scalars_df["string_col"], dropna=False).median() - ) - pd_result = ( - scalars_pandas_df[col_name] - .groupby(scalars_pandas_df["string_col"], dropna=False) - .median() - ) - - assert_series_equal( - pd_result, - bf_result.to_pandas(), - ) - - -@pytest.mark.skip( - reason="pyarrow.lib.ArrowInvalid: Float value -1172.500000 was truncated converting to int64" -) -def test_groupby_median_inexact(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "int64_too" - bf_series = ( - scalars_df[col_name] - .groupby(scalars_df["string_col"], dropna=False) - .median(exact=False) - ) - pd_max = ( - scalars_pandas_df[col_name] - .groupby(scalars_pandas_df["string_col"], dropna=False) - .max() - ) - pd_min = ( - scalars_pandas_df[col_name] - .groupby(scalars_pandas_df["string_col"], dropna=False) - .min() - ) - # TODO(swast): Update groupby to use index based on group by key(s). - bf_result = bf_series.to_pandas() - - # Median is approximate, so just check that it's plausible. - assert ((pd_min <= bf_result) & (bf_result <= pd_max)).all() - - -def test_groupby_prod(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "int64_too" - bf_series = scalars_df[col_name].groupby(scalars_df["int64_col"]).prod() - pd_series = ( - scalars_pandas_df[col_name].groupby(scalars_pandas_df["int64_col"]).prod() - ).astype(pd.Float64Dtype()) - # TODO(swast): Update groupby to use index based on group by key(s). - bf_result = bf_series.to_pandas() - assert_series_equal( - pd_series, - bf_result, - ) - - -@pytest.mark.skip(reason="AssertionError: Series are different") -@pytest.mark.parametrize( - ("operator"), - [ - (lambda x: x.cumsum()), - (lambda x: x.cumcount()), - (lambda x: x.cummin()), - (lambda x: x.cummax()), - # Pandas 2.2 casts to cumprod to float. 
- (lambda x: x.cumprod().astype("Float64")), - (lambda x: x.diff()), - (lambda x: x.shift(2)), - (lambda x: x.shift(-2)), - ], - ids=[ - "cumsum", - "cumcount", - "cummin", - "cummax", - "cumprod", - "diff", - "shiftpostive", - "shiftnegative", - ], -) -def test_groupby_window_ops(scalars_df_index, scalars_pandas_df_index, operator): - col_name = "int64_col" - group_key = "int64_too" # has some duplicates values, good for grouping - bf_series = ( - operator(scalars_df_index[col_name].groupby(scalars_df_index[group_key])) - ).to_pandas() - pd_series = operator( - scalars_pandas_df_index[col_name].groupby(scalars_pandas_df_index[group_key]) - ).astype(bf_series.dtype) - - pd.testing.assert_series_equal( - pd_series, - bf_series, - ) - - -@pytest.mark.parametrize( - ("label", "col_name"), - [ - (0, "bool_col"), - (1, "int64_col"), - ], -) -def test_drop_label(scalars_df_index, scalars_pandas_df_index, label, col_name): - bf_series = scalars_df_index[col_name].drop(label).to_pandas() - pd_series = scalars_pandas_df_index[col_name].drop(label) - pd.testing.assert_series_equal( - pd_series, - bf_series, - ) - - -def test_drop_label_list(scalars_df_index, scalars_pandas_df_index): - col_name = "int64_col" - bf_series = scalars_df_index[col_name].drop([1, 3]).to_pandas() - pd_series = scalars_pandas_df_index[col_name].drop([1, 3]) - pd.testing.assert_series_equal( - pd_series, - bf_series, - ) - - -@pytest.mark.skip(reason="AssertionError: Series.index are different") -@pytest.mark.parametrize( - ("col_name",), - [ - ("bool_col",), - ("int64_too",), - ], -) -@pytest.mark.parametrize( - ("keep",), - [ - ("first",), - ("last",), - (False,), - ], -) -def test_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, col_name): - bf_series = scalars_df_index[col_name].drop_duplicates(keep=keep).to_pandas() - pd_series = scalars_pandas_df_index[col_name].drop_duplicates(keep=keep) - pd.testing.assert_series_equal( - pd_series, - bf_series, - ) - - 
-@pytest.mark.skip(reason="TypeError: boolean value of NA is ambiguous") -@pytest.mark.parametrize( - ("col_name",), - [ - ("bool_col",), - ("int64_too",), - ], -) -def test_unique(scalars_df_index, scalars_pandas_df_index, col_name): - bf_uniq = scalars_df_index[col_name].unique().to_numpy(na_value=None) - pd_uniq = scalars_pandas_df_index[col_name].unique() - numpy.array_equal(pd_uniq, bf_uniq) - - -@pytest.mark.skip(reason="AssertionError: Series are different") -@pytest.mark.parametrize( - ("col_name",), - [ - ("bool_col",), - ("int64_too",), - ], -) -@pytest.mark.parametrize( - ("keep",), - [ - ("first",), - ("last",), - (False,), - ], -) -def test_duplicated(scalars_df_index, scalars_pandas_df_index, keep, col_name): - bf_series = scalars_df_index[col_name].duplicated(keep=keep).to_pandas() - pd_series = scalars_pandas_df_index[col_name].duplicated(keep=keep) - pd.testing.assert_series_equal(pd_series, bf_series, check_dtype=False) - - -def test_shape(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df["string_col"].shape - pd_result = scalars_pandas_df["string_col"].shape - - assert pd_result == bf_result - - -def test_len(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = len(scalars_df["string_col"]) - pd_result = len(scalars_pandas_df["string_col"]) - - assert pd_result == bf_result - - -def test_size(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df["string_col"].size - pd_result = scalars_pandas_df["string_col"].size - - assert pd_result == bf_result - - -def test_series_hasnans_true(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df["string_col"].hasnans - pd_result = scalars_pandas_df["string_col"].hasnans - - assert pd_result == bf_result - - -def test_series_hasnans_false(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df["string_col"].dropna().hasnans - pd_result = 
scalars_pandas_df["string_col"].dropna().hasnans - - assert pd_result == bf_result - - -def test_empty_false(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df["string_col"].empty - pd_result = scalars_pandas_df["string_col"].empty - - assert pd_result == bf_result - - -def test_empty_true_row_filter(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df["string_col"][ - scalars_df["string_col"] == "won't find this" - ].empty - pd_result = scalars_pandas_df["string_col"][ - scalars_pandas_df["string_col"] == "won't find this" - ].empty - - assert pd_result - assert pd_result == bf_result - - -def test_series_names(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df["string_col"].copy() - bf_result.index.name = "new index name" - bf_result.name = "new series name" - - pd_result = scalars_pandas_df["string_col"].copy() - pd_result.index.name = "new index name" - pd_result.name = "new series name" - - assert pd_result.name == bf_result.name - assert pd_result.index.name == bf_result.index.name - - -def test_dtype(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df["string_col"].dtype - pd_result = scalars_pandas_df["string_col"].dtype - - assert pd_result == bf_result - - -def test_dtypes(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df["int64_col"].dtypes - pd_result = scalars_pandas_df["int64_col"].dtypes - - assert pd_result == bf_result - - -def test_head(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df["string_col"].head(2).to_pandas() - pd_result = scalars_pandas_df["string_col"].head(2) - - assert_series_equal( - pd_result, - bf_result, - ) - - -def test_tail(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df["string_col"].tail(2).to_pandas() - pd_result = scalars_pandas_df["string_col"].tail(2) - - 
assert_series_equal( - pd_result, - bf_result, - ) - - -def test_head_then_scalar_operation(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = (scalars_df["float64_col"].head(1) + 4).to_pandas() - pd_result = scalars_pandas_df["float64_col"].head(1) + 4 - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_head_then_series_operation(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = ( - scalars_df["float64_col"].head(4) + scalars_df["float64_col"].head(2) - ).to_pandas() - pd_result = scalars_pandas_df["float64_col"].head(4) + scalars_pandas_df[ - "float64_col" - ].head(2) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_series_peek(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - peek_result = scalars_df["float64_col"].peek(n=3, force=False) - - pd.testing.assert_series_equal( - peek_result, - scalars_pandas_df["float64_col"].reindex_like(peek_result), - ) - assert len(peek_result) == 3 - - -def test_series_peek_with_large_results_not_allowed(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - session = scalars_df._block.session - slot_millis_sum = session.slot_millis_sum - peek_result = scalars_df["float64_col"].peek( - n=3, force=False, allow_large_results=False - ) - - # The metrics won't be fully updated when we call query_and_wait. 
- print(session.slot_millis_sum - slot_millis_sum) - assert session.slot_millis_sum - slot_millis_sum < 500 - pd.testing.assert_series_equal( - peek_result, - scalars_pandas_df["float64_col"].reindex_like(peek_result), - ) - assert len(peek_result) == 3 - - -def test_series_peek_multi_index(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_series = scalars_df.set_index(["string_col", "bool_col"])["float64_col"] - bf_series.name = ("2-part", "name") - pd_series = scalars_pandas_df.set_index(["string_col", "bool_col"])["float64_col"] - pd_series.name = ("2-part", "name") - peek_result = bf_series.peek(n=3, force=False) - pd.testing.assert_series_equal( - peek_result, - pd_series.reindex_like(peek_result), - ) - - -def test_series_peek_filtered(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - peek_result = scalars_df[scalars_df.int64_col > 0]["float64_col"].peek( - n=3, force=False - ) - pd_result = scalars_pandas_df[scalars_pandas_df.int64_col > 0]["float64_col"] - pd.testing.assert_series_equal( - peek_result, - pd_result.reindex_like(peek_result), - ) - - -def test_series_peek_force(scalars_dfs): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df, scalars_pandas_df = scalars_dfs - - cumsum_df = scalars_df[["int64_col", "int64_too"]].cumsum() - df_filtered = cumsum_df[cumsum_df.int64_col > 0]["int64_too"] - peek_result = df_filtered.peek(n=3, force=True) - pd_cumsum_df = scalars_pandas_df[["int64_col", "int64_too"]].cumsum() - pd_result = pd_cumsum_df[pd_cumsum_df.int64_col > 0]["int64_too"] - pd.testing.assert_series_equal( - peek_result, - pd_result.reindex_like(peek_result), - ) - - -def test_series_peek_force_float(scalars_dfs): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df, scalars_pandas_df = scalars_dfs - - cumsum_df = scalars_df[["int64_col", 
"float64_col"]].cumsum() - df_filtered = cumsum_df[cumsum_df.float64_col > 0]["float64_col"] - peek_result = df_filtered.peek(n=3, force=True) - pd_cumsum_df = scalars_pandas_df[["int64_col", "float64_col"]].cumsum() - pd_result = pd_cumsum_df[pd_cumsum_df.float64_col > 0]["float64_col"] - pd.testing.assert_series_equal( - peek_result, - pd_result.reindex_like(peek_result), - ) - - -def test_shift(scalars_df_index, scalars_pandas_df_index): - col_name = "int64_col" - bf_result = scalars_df_index[col_name].shift().to_pandas() - # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA - pd_result = scalars_pandas_df_index[col_name].shift().astype(pd.Int64Dtype()) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_series_ffill(scalars_df_index, scalars_pandas_df_index): - col_name = "numeric_col" - bf_result = scalars_df_index[col_name].ffill(limit=1).to_pandas() - pd_result = scalars_pandas_df_index[col_name].ffill(limit=1) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_series_bfill(scalars_df_index, scalars_pandas_df_index): - col_name = "numeric_col" - bf_result = scalars_df_index[col_name].bfill(limit=2).to_pandas() - pd_result = scalars_pandas_df_index[col_name].bfill(limit=2) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_cumsum_int(scalars_df_index, scalars_pandas_df_index): - if pd.__version__.startswith("1."): - pytest.skip("Series.cumsum NA mask are different in pandas 1.x.") - - col_name = "int64_col" - bf_result = scalars_df_index[col_name].cumsum().to_pandas() - # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA - pd_result = scalars_pandas_df_index[col_name].cumsum().astype(pd.Int64Dtype()) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_cumsum_int_ordered(scalars_df_index, scalars_pandas_df_index): - if pd.__version__.startswith("1."): - 
pytest.skip("Series.cumsum NA mask are different in pandas 1.x.") - - col_name = "int64_col" - bf_result = ( - scalars_df_index.sort_values(by="rowindex_2")[col_name].cumsum().to_pandas() - ) - # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA - pd_result = ( - scalars_pandas_df_index.sort_values(by="rowindex_2")[col_name] - .cumsum() - .astype(pd.Int64Dtype()) - ) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -@pytest.mark.skip( - reason="NotImplementedError: Aggregate op RankOp() not yet supported in polars engine." -) -@pytest.mark.parametrize( - ("keep",), - [ - ("first",), - ("last",), - ("all",), - ], -) -def test_series_nlargest(scalars_df_index, scalars_pandas_df_index, keep): - col_name = "bool_col" - bf_result = scalars_df_index[col_name].nlargest(4, keep=keep).to_pandas() - pd_result = scalars_pandas_df_index[col_name].nlargest(4, keep=keep) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -@pytest.mark.parametrize( - ("periods",), - [ - (1,), - (2,), - (-1,), - ], -) -def test_diff(scalars_df_index, scalars_pandas_df_index, periods): - bf_result = scalars_df_index["int64_col"].diff(periods=periods).to_pandas() - # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA - pd_result = ( - scalars_pandas_df_index["int64_col"] - .diff(periods=periods) - .astype(pd.Int64Dtype()) - ) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -@pytest.mark.parametrize( - ("periods",), - [ - (1,), - (2,), - (-1,), - ], -) -def test_series_pct_change(scalars_df_index, scalars_pandas_df_index, periods): - bf_result = scalars_df_index["int64_col"].pct_change(periods=periods).to_pandas() - # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA - pd_result = scalars_pandas_df_index["int64_col"].pct_change(periods=periods) - - pd.testing.assert_series_equal( - bf_result, - 
pd_result, - ) - - -@pytest.mark.skip( - reason="NotImplementedError: Aggregate op RankOp() not yet supported in polars engine." -) -@pytest.mark.parametrize( - ("keep",), - [ - ("first",), - ("last",), - ("all",), - ], -) -def test_series_nsmallest(scalars_df_index, scalars_pandas_df_index, keep): - col_name = "bool_col" - bf_result = scalars_df_index[col_name].nsmallest(2, keep=keep).to_pandas() - pd_result = scalars_pandas_df_index[col_name].nsmallest(2, keep=keep) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -@pytest.mark.skip( - reason="NotImplementedError: Aggregate op DenseRankOp() not yet supported in polars engine." -) -@pytest.mark.parametrize( - ("na_option", "method", "ascending", "numeric_only", "pct"), - [ - ("keep", "average", True, True, False), - ("top", "min", False, False, True), - ("bottom", "max", False, False, False), - ("top", "first", False, False, True), - ("bottom", "dense", False, False, False), - ], -) -def test_series_rank( - scalars_df_index, - scalars_pandas_df_index, - na_option, - method, - ascending, - numeric_only, - pct, -): - col_name = "int64_too" - bf_result = ( - scalars_df_index[col_name] - .rank( - na_option=na_option, - method=method, - ascending=ascending, - numeric_only=numeric_only, - pct=pct, - ) - .to_pandas() - ) - pd_result = ( - scalars_pandas_df_index[col_name] - .rank( - na_option=na_option, - method=method, - ascending=ascending, - numeric_only=numeric_only, - pct=pct, - ) - .astype(pd.Float64Dtype()) - ) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_cast_float_to_int(scalars_df_index, scalars_pandas_df_index): - col_name = "float64_col" - bf_result = scalars_df_index[col_name].astype(pd.Int64Dtype()).to_pandas() - # cumsum does not behave well on nullable floats in pandas, produces object type and never ignores NA - pd_result = scalars_pandas_df_index[col_name].astype(pd.Int64Dtype()) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - 
-def test_cast_float_to_bool(scalars_df_index, scalars_pandas_df_index): - col_name = "float64_col" - bf_result = scalars_df_index[col_name].astype(pd.BooleanDtype()).to_pandas() - # cumsum does not behave well on nullable floats in pandas, produces object type and never ignores NA - pd_result = scalars_pandas_df_index[col_name].astype(pd.BooleanDtype()) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_cumsum_nested(scalars_df_index, scalars_pandas_df_index): - col_name = "float64_col" - bf_result = scalars_df_index[col_name].cumsum().cumsum().cumsum().to_pandas() - # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA - pd_result = ( - scalars_pandas_df_index[col_name] - .cumsum() - .cumsum() - .cumsum() - .astype(pd.Float64Dtype()) - ) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -@pytest.mark.skip( - reason="NotImplementedError: min_period not yet supported for polars engine" -) -def test_nested_analytic_ops_align(scalars_df_index, scalars_pandas_df_index): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - col_name = "float64_col" - # set non-unique index to check implicit alignment - bf_series = scalars_df_index.set_index("bool_col")[col_name].fillna(0.0) - pd_series = scalars_pandas_df_index.set_index("bool_col")[col_name].fillna(0.0) - - bf_result = ( - (bf_series + 5) - + (bf_series.cumsum().cumsum().cumsum() + bf_series.rolling(window=3).mean()) - + bf_series.expanding().max() - ).to_pandas() - # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA - pd_result = ( - (pd_series + 5) - + ( - pd_series.cumsum().cumsum().cumsum().astype(pd.Float64Dtype()) - + pd_series.rolling(window=3).mean() - ) - + pd_series.expanding().max() - ) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def 
test_cumsum_int_filtered(scalars_df_index, scalars_pandas_df_index): - col_name = "int64_col" - - bf_col = scalars_df_index[col_name] - bf_result = bf_col[bf_col > -2].cumsum().to_pandas() - - pd_col = scalars_pandas_df_index[col_name] - # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA - pd_result = pd_col[pd_col > -2].cumsum().astype(pd.Int64Dtype()) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_cumsum_float(scalars_df_index, scalars_pandas_df_index): - col_name = "float64_col" - bf_result = scalars_df_index[col_name].cumsum().to_pandas() - # cumsum does not behave well on nullable floats in pandas, produces object type and never ignores NA - pd_result = scalars_pandas_df_index[col_name].cumsum().astype(pd.Float64Dtype()) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_cummin_int(scalars_df_index, scalars_pandas_df_index): - col_name = "int64_col" - bf_result = scalars_df_index[col_name].cummin().to_pandas() - pd_result = scalars_pandas_df_index[col_name].cummin() - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_cummax_int(scalars_df_index, scalars_pandas_df_index): - col_name = "int64_col" - bf_result = scalars_df_index[col_name].cummax().to_pandas() - pd_result = scalars_pandas_df_index[col_name].cummax() - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -@pytest.mark.parametrize( - ("kwargs"), - [ - {}, - {"normalize": True}, - {"ascending": True}, - ], - ids=[ - "default", - "normalize", - "ascending", - ], -) -def test_value_counts(scalars_dfs, kwargs): - if pd.__version__.startswith("1."): - pytest.skip("pandas 1.x produces different column labels.") - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "int64_too" - - # Pandas `value_counts` can produce non-deterministic results with tied counts. - # Remove duplicates to enforce a consistent output. 
- s = scalars_df[col_name].drop(0) - pd_s = scalars_pandas_df[col_name].drop(0) - - bf_result = s.value_counts(**kwargs).to_pandas() - pd_result = pd_s.value_counts(**kwargs) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_value_counts_with_na(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "int64_col" - - bf_result = scalars_df[col_name].value_counts(dropna=False).to_pandas() - pd_result = scalars_pandas_df[col_name].value_counts(dropna=False) - - # Older pandas version may not have these values, bigframes tries to emulate 2.0+ - pd_result.name = "count" - pd_result.index.name = col_name - - assert_series_equal( - bf_result, - pd_result, - # bigframes values_counts does not honor ordering in the original data - ignore_order=True, - ) - - -@pytest.mark.skip( - reason="NotImplementedError: Aggregate op CutOp(bins=3, right=True, labels=False) not yet supported in polars engine." -) -def test_value_counts_w_cut(scalars_dfs): - if pd.__version__.startswith("1."): - pytest.skip("value_counts results different in pandas 1.x.") - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "int64_col" - - bf_cut = bigframes.pandas.cut(scalars_df[col_name], 3, labels=False) - pd_cut = pd.cut(scalars_pandas_df[col_name], 3, labels=False) - - bf_result = bf_cut.value_counts().to_pandas() - pd_result = pd_cut.value_counts() - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - - pd.testing.assert_series_equal( - bf_result, - pd_result.astype(pd.Int64Dtype()), - ) - - -def test_iloc_nested(scalars_df_index, scalars_pandas_df_index): - - bf_result = scalars_df_index["string_col"].iloc[1:].iloc[1:].to_pandas() - pd_result = scalars_pandas_df_index["string_col"].iloc[1:].iloc[1:] - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -@pytest.mark.parametrize( - ("start", "stop", "step"), - [ - (1, None, None), - (None, 4, None), - (None, None, 2), - (None, 50000000000, 1), - (5, 4, None), - (3, None, 2), 
- (1, 7, 2), - (1, 7, 50000000000), - (-1, -7, -2), - (None, -7, -2), - (-1, None, -2), - (-7, -1, 2), - (-7, -1, None), - (-7, 7, None), - (7, -7, -2), - ], -) -def test_series_iloc(scalars_df_index, scalars_pandas_df_index, start, stop, step): - bf_result = scalars_df_index["string_col"].iloc[start:stop:step].to_pandas() - pd_result = scalars_pandas_df_index["string_col"].iloc[start:stop:step] - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_at(scalars_df_index, scalars_pandas_df_index): - scalars_df_index = scalars_df_index.set_index("int64_too", drop=False) - scalars_pandas_df_index = scalars_pandas_df_index.set_index("int64_too", drop=False) - index = -2345 - bf_result = scalars_df_index["string_col"].at[index] - pd_result = scalars_pandas_df_index["string_col"].at[index] - - assert bf_result == pd_result - - -def test_iat(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index["int64_too"].iat[3] - pd_result = scalars_pandas_df_index["int64_too"].iat[3] - - assert bf_result == pd_result - - -def test_iat_error(scalars_df_index, scalars_pandas_df_index): - with pytest.raises(ValueError): - scalars_pandas_df_index["int64_too"].iat["asd"] - with pytest.raises(ValueError): - scalars_df_index["int64_too"].iat["asd"] - - -def test_series_add_prefix(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index["int64_too"].add_prefix("prefix_").to_pandas() - - pd_result = scalars_pandas_df_index["int64_too"].add_prefix("prefix_") - - # Index will be object type in pandas, string type in bigframes, but same values - pd.testing.assert_series_equal( - bf_result, - pd_result, - check_index_type=False, - ) - - -def test_series_add_suffix(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index["int64_too"].add_suffix("_suffix").to_pandas() - - pd_result = scalars_pandas_df_index["int64_too"].add_suffix("_suffix") - - # Index will be object type in pandas, string type in bigframes, but same 
values - pd.testing.assert_series_equal( - bf_result, - pd_result, - check_index_type=False, - ) - - -def test_series_filter_items(scalars_df_index, scalars_pandas_df_index): - if pd.__version__.startswith("2.0") or pd.__version__.startswith("1."): - pytest.skip("pandas filter items behavior different pre-2.1") - bf_result = scalars_df_index["float64_col"].filter(items=[5, 1, 3]).to_pandas() - - pd_result = scalars_pandas_df_index["float64_col"].filter(items=[5, 1, 3]) - - # Pandas uses int64 instead of Int64 (nullable) dtype. - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - # Ignore ordering as pandas order differently depending on version - assert_series_equal(bf_result, pd_result, check_names=False, ignore_order=True) - - -def test_series_filter_like(scalars_df_index, scalars_pandas_df_index): - scalars_df_index = scalars_df_index.copy().set_index("string_col") - scalars_pandas_df_index = scalars_pandas_df_index.copy().set_index("string_col") - - bf_result = scalars_df_index["float64_col"].filter(like="ello").to_pandas() - - pd_result = scalars_pandas_df_index["float64_col"].filter(like="ello") - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_series_filter_regex(scalars_df_index, scalars_pandas_df_index): - scalars_df_index = scalars_df_index.copy().set_index("string_col") - scalars_pandas_df_index = scalars_pandas_df_index.copy().set_index("string_col") - - bf_result = scalars_df_index["float64_col"].filter(regex="^[GH].*").to_pandas() - - pd_result = scalars_pandas_df_index["float64_col"].filter(regex="^[GH].*") - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_series_reindex(scalars_df_index, scalars_pandas_df_index): - bf_result = ( - scalars_df_index["float64_col"].reindex(index=[5, 1, 3, 99, 1]).to_pandas() - ) - - pd_result = scalars_pandas_df_index["float64_col"].reindex(index=[5, 1, 3, 99, 1]) - - # Pandas uses int64 instead of Int64 (nullable) dtype. 
- pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_series_reindex_nonunique(scalars_df_index): - with pytest.raises(ValueError): - # int64_too is non-unique - scalars_df_index.set_index("int64_too")["float64_col"].reindex( - index=[5, 1, 3, 99, 1], validate=True - ) - - -def test_series_reindex_like(scalars_df_index, scalars_pandas_df_index): - bf_reindex_target = scalars_df_index["float64_col"].reindex(index=[5, 1, 3, 99, 1]) - bf_result = ( - scalars_df_index["int64_too"].reindex_like(bf_reindex_target).to_pandas() - ) - - pd_reindex_target = scalars_pandas_df_index["float64_col"].reindex( - index=[5, 1, 3, 99, 1] - ) - pd_result = scalars_pandas_df_index["int64_too"].reindex_like(pd_reindex_target) - - # Pandas uses int64 instead of Int64 (nullable) dtype. - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_where_with_series(scalars_df_index, scalars_pandas_df_index): - bf_result = ( - scalars_df_index["int64_col"] - .where(scalars_df_index["bool_col"], scalars_df_index["int64_too"]) - .to_pandas() - ) - pd_result = scalars_pandas_df_index["int64_col"].where( - scalars_pandas_df_index["bool_col"], scalars_pandas_df_index["int64_too"] - ) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_where_with_different_indices(scalars_df_index, scalars_pandas_df_index): - bf_result = ( - scalars_df_index["int64_col"] - .iloc[::2] - .where( - scalars_df_index["bool_col"].iloc[2:], - scalars_df_index["int64_too"].iloc[:5], - ) - .to_pandas() - ) - pd_result = ( - scalars_pandas_df_index["int64_col"] - .iloc[::2] - .where( - scalars_pandas_df_index["bool_col"].iloc[2:], - scalars_pandas_df_index["int64_too"].iloc[:5], - ) - ) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_where_with_default(scalars_df_index, scalars_pandas_df_index): - bf_result 
= ( - scalars_df_index["int64_col"].where(scalars_df_index["bool_col"]).to_pandas() - ) - pd_result = scalars_pandas_df_index["int64_col"].where( - scalars_pandas_df_index["bool_col"] - ) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_where_with_callable(scalars_df_index, scalars_pandas_df_index): - def _is_positive(x): - return x > 0 - - # Both cond and other are callable. - bf_result = ( - scalars_df_index["int64_col"] - .where(cond=_is_positive, other=lambda x: x * 10) - .to_pandas() - ) - pd_result = scalars_pandas_df_index["int64_col"].where( - cond=_is_positive, other=lambda x: x * 10 - ) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -@pytest.mark.skip( - reason="NotImplementedError: Polars compiler hasn't implemented ClipOp()" -) -@pytest.mark.parametrize( - ("ordered"), - [ - (True), - (False), - ], -) -def test_clip(scalars_df_index, scalars_pandas_df_index, ordered): - col_bf = scalars_df_index["int64_col"] - lower_bf = scalars_df_index["int64_too"] - 1 - upper_bf = scalars_df_index["int64_too"] + 1 - bf_result = col_bf.clip(lower_bf, upper_bf).to_pandas(ordered=ordered) - - col_pd = scalars_pandas_df_index["int64_col"] - lower_pd = scalars_pandas_df_index["int64_too"] - 1 - upper_pd = scalars_pandas_df_index["int64_too"] + 1 - pd_result = col_pd.clip(lower_pd, upper_pd) - - assert_series_equal(bf_result, pd_result, ignore_order=not ordered) - - -@pytest.mark.skip( - reason="NotImplementedError: Polars compiler hasn't implemented ClipOp()" -) -def test_clip_int_with_float_bounds(scalars_df_index, scalars_pandas_df_index): - col_bf = scalars_df_index["int64_too"] - bf_result = col_bf.clip(-100, 3.14151593).to_pandas() - - col_pd = scalars_pandas_df_index["int64_too"] - # pandas doesn't work with Int64 and clip with floats - pd_result = col_pd.astype("int64").clip(-100, 3.14151593).astype("Float64") - - assert_series_equal(bf_result, pd_result) - - -@pytest.mark.skip( - reason="NotImplementedError: 
Polars compiler hasn't implemented ClipOp()" -) -def test_clip_filtered_two_sided(scalars_df_index, scalars_pandas_df_index): - col_bf = scalars_df_index["int64_col"].iloc[::2] - lower_bf = scalars_df_index["int64_too"].iloc[2:] - 1 - upper_bf = scalars_df_index["int64_too"].iloc[:5] + 1 - bf_result = col_bf.clip(lower_bf, upper_bf).to_pandas() - - col_pd = scalars_pandas_df_index["int64_col"].iloc[::2] - lower_pd = scalars_pandas_df_index["int64_too"].iloc[2:] - 1 - upper_pd = scalars_pandas_df_index["int64_too"].iloc[:5] + 1 - pd_result = col_pd.clip(lower_pd, upper_pd) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -@pytest.mark.skip( - reason="NotImplementedError: Polars compiler hasn't implemented maximum()" -) -def test_clip_filtered_one_sided(scalars_df_index, scalars_pandas_df_index): - col_bf = scalars_df_index["int64_col"].iloc[::2] - lower_bf = scalars_df_index["int64_too"].iloc[2:] - 1 - bf_result = col_bf.clip(lower_bf, None).to_pandas() - - col_pd = scalars_pandas_df_index["int64_col"].iloc[::2] - lower_pd = scalars_pandas_df_index["int64_too"].iloc[2:] - 1 - pd_result = col_pd.clip(lower_pd, None) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_dot(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df["int64_too"] @ scalars_df["int64_too"] - - pd_result = scalars_pandas_df["int64_too"] @ scalars_pandas_df["int64_too"] - - assert bf_result == pd_result - - -@pytest.mark.parametrize( - ("left", "right", "inclusive"), - [ - (-234892, 55555, "left"), - (-234892, 55555, "both"), - (-234892, 55555, "neither"), - (-234892, 55555, "right"), - ], -) -def test_between(scalars_df_index, scalars_pandas_df_index, left, right, inclusive): - bf_result = ( - scalars_df_index["int64_col"].between(left, right, inclusive).to_pandas() - ) - pd_result = scalars_pandas_df_index["int64_col"].between(left, right, inclusive) - - pd.testing.assert_series_equal( - bf_result, - 
pd_result.astype(pd.BooleanDtype()), - ) - - -@pytest.mark.skip(reason="fixture 'scalars_dfs_maybe_ordered' not found") -def test_series_case_when(scalars_dfs_maybe_ordered): - pytest.importorskip( - "pandas", - minversion="2.2.0", - reason="case_when added in pandas 2.2.0", - ) - scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered - - bf_series = scalars_df["int64_col"] - pd_series = scalars_pandas_df["int64_col"] - - # TODO(tswast): pandas case_when appears to assume True when a value is - # null. I suspect this should be considered a bug in pandas. - - # Generate 150 conditions to test case_when with a large number of conditions - bf_conditions = ( - [((bf_series > 645).fillna(True), bf_series - 1)] - + [((bf_series > (-100 + i * 5)).fillna(True), i) for i in range(148, 0, -1)] - + [((bf_series <= -100).fillna(True), pd.NA)] - ) - - pd_conditions = ( - [((pd_series > 645), pd_series - 1)] - + [((pd_series > (-100 + i * 5)), i) for i in range(148, 0, -1)] - + [(pd_series <= -100, pd.NA)] - ) - - assert len(bf_conditions) == 150 - - bf_result = bf_series.case_when(bf_conditions).to_pandas() - pd_result = pd_series.case_when(pd_conditions) - - pd.testing.assert_series_equal( - bf_result, - pd_result.astype(pd.Int64Dtype()), - ) - - -@pytest.mark.skip(reason="fixture 'scalars_dfs_maybe_ordered' not found") -def test_series_case_when_change_type(scalars_dfs_maybe_ordered): - pytest.importorskip( - "pandas", - minversion="2.2.0", - reason="case_when added in pandas 2.2.0", - ) - scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered - - bf_series = scalars_df["int64_col"] - pd_series = scalars_pandas_df["int64_col"] - - # TODO(tswast): pandas case_when appears to assume True when a value is - # null. I suspect this should be considered a bug in pandas. 
- - bf_conditions = [ - ((bf_series > 645).fillna(True), scalars_df["string_col"]), - ((bf_series <= -100).fillna(True), pd.NA), - (True, "not_found"), - ] - - pd_conditions = [ - ((pd_series > 645).fillna(True), scalars_pandas_df["string_col"]), - ((pd_series <= -100).fillna(True), pd.NA), - # pandas currently fails if both the condition and the value are literals. - ([True] * len(pd_series), ["not_found"] * len(pd_series)), - ] - - bf_result = bf_series.case_when(bf_conditions).to_pandas() - pd_result = pd_series.case_when(pd_conditions) - - pd.testing.assert_series_equal( - bf_result, - pd_result.astype("string[pyarrow]"), - ) - - -def test_to_frame(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df["int64_col"].to_frame().to_pandas() - pd_result = scalars_pandas_df["int64_col"].to_frame() - - assert_pandas_df_equal(bf_result, pd_result) - - -def test_to_frame_no_name(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df["int64_col"].rename(None).to_frame().to_pandas() - pd_result = scalars_pandas_df["int64_col"].rename(None).to_frame() - - assert_pandas_df_equal(bf_result, pd_result) - - -@pytest.mark.skip(reason="fixture 'gcs_folder' not found") -def test_to_json(gcs_folder, scalars_df_index, scalars_pandas_df_index): - path = gcs_folder + "test_series_to_json*.jsonl" - scalars_df_index["int64_col"].to_json(path, lines=True, orient="records") - gcs_df = pd.read_json(get_first_file_from_wildcard(path), lines=True) - - pd.testing.assert_series_equal( - gcs_df["int64_col"].astype(pd.Int64Dtype()), - scalars_pandas_df_index["int64_col"], - check_dtype=False, - check_index=False, - ) - - -@pytest.mark.skip(reason="fixture 'gcs_folder' not found") -def test_to_csv(gcs_folder, scalars_df_index, scalars_pandas_df_index): - path = gcs_folder + "test_series_to_csv*.csv" - scalars_df_index["int64_col"].to_csv(path) - gcs_df = pd.read_csv(get_first_file_from_wildcard(path)) - - 
pd.testing.assert_series_equal( - gcs_df["int64_col"].astype(pd.Int64Dtype()), - scalars_pandas_df_index["int64_col"], - check_dtype=False, - check_index=False, - ) - - -def test_to_latex(scalars_df_index, scalars_pandas_df_index): - pytest.importorskip("jinja2") - bf_result = scalars_df_index["int64_col"].to_latex() - pd_result = scalars_pandas_df_index["int64_col"].to_latex() - - assert bf_result == pd_result - - -def test_series_to_json_local_str(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.int64_col.to_json() - pd_result = scalars_pandas_df_index.int64_col.to_json() - - assert bf_result == pd_result - - -def test_series_to_json_local_file(scalars_df_index, scalars_pandas_df_index): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: - scalars_df_index.int64_col.to_json(bf_result_file) - scalars_pandas_df_index.int64_col.to_json(pd_result_file) - - bf_result = bf_result_file.read() - pd_result = pd_result_file.read() - - assert bf_result == pd_result - - -def test_series_to_csv_local_str(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.int64_col.to_csv() - # default_handler for arrow types that have no default conversion - pd_result = scalars_pandas_df_index.int64_col.to_csv() - - assert bf_result == pd_result - - -def test_series_to_csv_local_file(scalars_df_index, scalars_pandas_df_index): - with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: - scalars_df_index.int64_col.to_csv(bf_result_file) - scalars_pandas_df_index.int64_col.to_csv(pd_result_file) - - bf_result = bf_result_file.read() - pd_result = pd_result_file.read() - - assert bf_result == pd_result - - -def test_to_dict(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index["int64_too"].to_dict() - - pd_result = 
scalars_pandas_df_index["int64_too"].to_dict() - - assert bf_result == pd_result - - -def test_to_excel(scalars_df_index, scalars_pandas_df_index): - pytest.importorskip("openpyxl") - bf_result_file = tempfile.TemporaryFile() - pd_result_file = tempfile.TemporaryFile() - scalars_df_index["int64_too"].to_excel(bf_result_file) - scalars_pandas_df_index["int64_too"].to_excel(pd_result_file) - bf_result = bf_result_file.read() - pd_result = bf_result_file.read() - - assert bf_result == pd_result - - -def test_to_pickle(scalars_df_index, scalars_pandas_df_index): - bf_result_file = tempfile.TemporaryFile() - pd_result_file = tempfile.TemporaryFile() - scalars_df_index["int64_too"].to_pickle(bf_result_file) - scalars_pandas_df_index["int64_too"].to_pickle(pd_result_file) - bf_result = bf_result_file.read() - pd_result = bf_result_file.read() - - assert bf_result == pd_result - - -def test_to_string(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index["int64_too"].to_string() - - pd_result = scalars_pandas_df_index["int64_too"].to_string() - - assert bf_result == pd_result - - -def test_to_list(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index["int64_too"].to_list() - - pd_result = scalars_pandas_df_index["int64_too"].to_list() - - assert bf_result == pd_result - - -def test_to_numpy(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index["int64_too"].to_numpy() - - pd_result = scalars_pandas_df_index["int64_too"].to_numpy() - - assert (bf_result == pd_result).all() - - -def test_to_xarray(scalars_df_index, scalars_pandas_df_index): - pytest.importorskip("xarray") - bf_result = scalars_df_index["int64_too"].to_xarray() - - pd_result = scalars_pandas_df_index["int64_too"].to_xarray() - - assert bf_result.equals(pd_result) - - -def test_to_markdown(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index["int64_too"].to_markdown() - - pd_result = 
scalars_pandas_df_index["int64_too"].to_markdown() - - assert bf_result == pd_result - - -def test_series_values(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index["int64_too"].values - - pd_result = scalars_pandas_df_index["int64_too"].values - # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe - pd.testing.assert_series_equal( - pd.Series(bf_result), pd.Series(pd_result), check_dtype=False - ) - - -def test_series___array__(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index["float64_col"].__array__() - - pd_result = scalars_pandas_df_index["float64_col"].__array__() - # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe - numpy.array_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ("ascending", "na_position"), - [ - (True, "first"), - (True, "last"), - (False, "first"), - (False, "last"), - ], -) -def test_sort_values(scalars_df_index, scalars_pandas_df_index, ascending, na_position): - # Test needs values to be unique - bf_result = ( - scalars_df_index["int64_col"] - .sort_values(ascending=ascending, na_position=na_position) - .to_pandas() - ) - pd_result = scalars_pandas_df_index["int64_col"].sort_values( - ascending=ascending, na_position=na_position - ) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_series_sort_values_inplace(scalars_df_index, scalars_pandas_df_index): - # Test needs values to be unique - bf_series = scalars_df_index["int64_col"].copy() - bf_series.sort_values(ascending=False, inplace=True) - bf_result = bf_series.to_pandas() - pd_result = scalars_pandas_df_index["int64_col"].sort_values(ascending=False) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -@pytest.mark.parametrize( - ("ascending"), - [ - (True,), - (False,), - ], -) -def test_sort_index(scalars_df_index, scalars_pandas_df_index, ascending): - bf_result = ( - 
scalars_df_index["int64_too"].sort_index(ascending=ascending).to_pandas() - ) - pd_result = scalars_pandas_df_index["int64_too"].sort_index(ascending=ascending) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_series_sort_index_inplace(scalars_df_index, scalars_pandas_df_index): - bf_series = scalars_df_index["int64_too"].copy() - bf_series.sort_index(ascending=False, inplace=True) - bf_result = bf_series.to_pandas() - pd_result = scalars_pandas_df_index["int64_too"].sort_index(ascending=False) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_mask_default_value(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_col = scalars_df["int64_col"] - bf_col_masked = bf_col.mask(bf_col % 2 == 1) - bf_result = bf_col.to_frame().assign(int64_col_masked=bf_col_masked).to_pandas() - - pd_col = scalars_pandas_df["int64_col"] - pd_col_masked = pd_col.mask(pd_col % 2 == 1) - pd_result = pd_col.to_frame().assign(int64_col_masked=pd_col_masked) - - assert_pandas_df_equal(bf_result, pd_result) - - -def test_mask_custom_value(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_col = scalars_df["int64_col"] - bf_col_masked = bf_col.mask(bf_col % 2 == 1, -1) - bf_result = bf_col.to_frame().assign(int64_col_masked=bf_col_masked).to_pandas() - - pd_col = scalars_pandas_df["int64_col"] - pd_col_masked = pd_col.mask(pd_col % 2 == 1, -1) - pd_result = pd_col.to_frame().assign(int64_col_masked=pd_col_masked) - - # TODO(shobs): There is a pd.NA value in the original series, which is not - # odd so should be left as is, but it is being masked in pandas. - # Accidentally the bigframes bahavior matches, but it should be updated - # after the resolution of https://github.com/pandas-dev/pandas/issues/52955 - assert_pandas_df_equal(bf_result, pd_result) - - -def test_mask_with_callable(scalars_df_index, scalars_pandas_df_index): - def _ten_times(x): - return x * 10 - - # Both cond and other are callable. 
- bf_result = ( - scalars_df_index["int64_col"] - .mask(cond=lambda x: x > 0, other=_ten_times) - .to_pandas() - ) - pd_result = scalars_pandas_df_index["int64_col"].mask( - cond=lambda x: x > 0, other=_ten_times - ) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -@pytest.mark.parametrize( - ("lambda_",), - [ - pytest.param(lambda x: x > 0), - pytest.param( - lambda x: True if x > 0 else False, - marks=pytest.mark.xfail( - raises=ValueError, - ), - ), - ], - ids=[ - "lambda_arithmatic", - "lambda_arbitrary", - ], -) -def test_mask_lambda(scalars_dfs, lambda_): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_col = scalars_df["int64_col"] - bf_result = bf_col.mask(lambda_).to_pandas() - - pd_col = scalars_pandas_df["int64_col"] - pd_result = pd_col.mask(lambda_) - - # ignore dtype check, which are Int64 and object respectively - assert_series_equal(bf_result, pd_result, check_dtype=False) - - -def test_mask_simple_udf(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - def foo(x): - return x < 1000000 - - bf_col = scalars_df["int64_col"] - bf_result = bf_col.mask(foo).to_pandas() - - pd_col = scalars_pandas_df["int64_col"] - pd_result = pd_col.mask(foo) - - # ignore dtype check, which are Int64 and object respectively - assert_series_equal(bf_result, pd_result, check_dtype=False) - - -@pytest.mark.skip( - reason="polars.exceptions.InvalidOperationError: decimal precision should be <= 38 & >= 1" -) -@pytest.mark.parametrize("errors", ["raise", "null"]) -@pytest.mark.parametrize( - ("column", "to_type"), - [ - ("int64_col", "Float64"), - ("int64_col", "Int64"), # No-op - ("int64_col", pd.Float64Dtype()), - ("int64_col", "string[pyarrow]"), - ("int64_col", "boolean"), - ("int64_col", pd.ArrowDtype(pa.decimal128(38, 9))), - ("int64_col", pd.ArrowDtype(pa.decimal256(76, 38))), - ("int64_col", pd.ArrowDtype(pa.timestamp("us"))), - ("int64_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))), - ("int64_col", "time64[us][pyarrow]"), - 
("int64_col", pd.ArrowDtype(db_dtypes.JSONArrowType())), - ("bool_col", "Int64"), - ("bool_col", "string[pyarrow]"), - ("bool_col", "Float64"), - ("bool_col", pd.ArrowDtype(db_dtypes.JSONArrowType())), - ("string_col", "binary[pyarrow]"), - ("bytes_col", "string[pyarrow]"), - # pandas actually doesn't let folks convert to/from naive timestamp and - # raises a deprecation warning to use tz_localize/tz_convert instead, - # but BigQuery always stores values as UTC and doesn't have to deal - # with timezone conversions, so we'll allow it. - ("timestamp_col", "date32[day][pyarrow]"), - ("timestamp_col", "time64[us][pyarrow]"), - ("timestamp_col", pd.ArrowDtype(pa.timestamp("us"))), - ("datetime_col", "date32[day][pyarrow]"), - pytest.param( - "datetime_col", - "string[pyarrow]", - marks=pytest.mark.skipif( - pd.__version__.startswith("2.2"), - reason="pandas 2.2 uses T as date/time separator whereas earlier versions use space", - ), - ), - ("datetime_col", "time64[us][pyarrow]"), - ("datetime_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))), - ("date_col", "string[pyarrow]"), - ("date_col", pd.ArrowDtype(pa.timestamp("us"))), - ("date_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))), - ("time_col", "string[pyarrow]"), - # TODO(bmil): fix Ibis bug: BigQuery backend rounds to nearest int - # ("float64_col", "Int64"), - # TODO(bmil): decide whether to fix Ibis bug: BigQuery backend - # formats floats with no decimal places if they have no fractional - # part, and does not switch to scientific notation for > 10^15 - # ("float64_col", "string[pyarrow]") - # TODO(bmil): add any other compatible conversions per - # https://cloud.google.com/bigquery/docs/reference/standard-sql/conversion_functions - ], -) -def test_astype(scalars_df_index, scalars_pandas_df_index, column, to_type, errors): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - bf_result = scalars_df_index[column].astype(to_type, 
errors=errors).to_pandas() - pd_result = scalars_pandas_df_index[column].astype(to_type) - pd.testing.assert_series_equal(bf_result, pd_result) - - -@pytest.mark.skip( - reason="AttributeError: 'DataFrame' object has no attribute 'dtype'. Did you mean: 'dtypes'?" -) -def test_series_astype_python(session): - input = pd.Series(["hello", "world", "3.11", "4000"]) - exepcted = pd.Series( - [None, None, 3.11, 4000], - dtype="Float64", - index=pd.Index([0, 1, 2, 3], dtype="Int64"), - ) - result = session.read_pandas(input).astype(float, errors="null").to_pandas() - pd.testing.assert_series_equal(result, exepcted) - - -@pytest.mark.skip( - reason="AttributeError: 'DataFrame' object has no attribute 'dtype'. Did you mean: 'dtypes'?" -) -def test_astype_safe(session): - input = pd.Series(["hello", "world", "3.11", "4000"]) - exepcted = pd.Series( - [None, None, 3.11, 4000], - dtype="Float64", - index=pd.Index([0, 1, 2, 3], dtype="Int64"), - ) - result = session.read_pandas(input).astype("Float64", errors="null").to_pandas() - pd.testing.assert_series_equal(result, exepcted) - - -def test_series_astype_w_invalid_error(session): - input = pd.Series(["hello", "world", "3.11", "4000"]) - with pytest.raises(ValueError): - session.read_pandas(input).astype("Float64", errors="bad_value") - - -@pytest.mark.parametrize( - ("column", "to_type"), - [ - ("timestamp_col", "int64[pyarrow]"), - ("datetime_col", "int64[pyarrow]"), - ("time_col", "int64[pyarrow]"), - ], -) -def test_date_time_astype_int( - scalars_df_index, scalars_pandas_df_index, column, to_type -): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - bf_result = scalars_df_index[column].astype(to_type).to_pandas() - pd_result = scalars_pandas_df_index[column].astype(to_type) - pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) - assert bf_result.dtype == "Int64" - - -@pytest.mark.skip( - 
reason="polars.exceptions.InvalidOperationError: conversion from `str` to `i64` failed in column 'column_0' for 1 out of 4 values: [' -03']" -) -def test_string_astype_int(): - pd_series = pd.Series(["4", "-7", "0", " -03"]) - bf_series = series.Series(pd_series) - - pd_result = pd_series.astype("Int64") - bf_result = bf_series.astype("Int64").to_pandas() - - pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) - - -@pytest.mark.skip( - reason="polars.exceptions.InvalidOperationError: conversion from `str` to `f64` failed in column 'column_0' for 1 out of 10 values: [' -03.235']" -) -def test_string_astype_float(): - pd_series = pd.Series( - ["1", "-1", "-0", "000", " -03.235", "naN", "-inf", "INf", ".33", "7.235e-8"] - ) - - bf_series = series.Series(pd_series) - - pd_result = pd_series.astype("Float64") - bf_result = bf_series.astype("Float64").to_pandas() - - pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) - - -def test_string_astype_date(): - if int(pa.__version__.split(".")[0]) < 15: - pytest.skip( - "Avoid pyarrow.lib.ArrowNotImplementedError: " - "Unsupported cast from string to date32 using function cast_date32." 
- ) - - pd_series = pd.Series(["2014-08-15", "2215-08-15", "2016-02-29"]).astype( - pd.ArrowDtype(pa.string()) - ) - - bf_series = series.Series(pd_series) - - # TODO(b/340885567): fix type error - pd_result = pd_series.astype("date32[day][pyarrow]") # type: ignore - bf_result = bf_series.astype("date32[day][pyarrow]").to_pandas() - - pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) - - -def test_string_astype_datetime(): - pd_series = pd.Series( - ["2014-08-15 08:15:12", "2015-08-15 08:15:12.654754", "2016-02-29 00:00:00"] - ).astype(pd.ArrowDtype(pa.string())) - - bf_series = series.Series(pd_series) - - pd_result = pd_series.astype(pd.ArrowDtype(pa.timestamp("us"))) - bf_result = bf_series.astype(pd.ArrowDtype(pa.timestamp("us"))).to_pandas() - - pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) - - -def test_string_astype_timestamp(): - pd_series = pd.Series( - [ - "2014-08-15 08:15:12+00:00", - "2015-08-15 08:15:12.654754+05:00", - "2016-02-29 00:00:00+08:00", - ] - ).astype(pd.ArrowDtype(pa.string())) - - bf_series = series.Series(pd_series) - - pd_result = pd_series.astype(pd.ArrowDtype(pa.timestamp("us", tz="UTC"))) - bf_result = bf_series.astype( - pd.ArrowDtype(pa.timestamp("us", tz="UTC")) - ).to_pandas() - - pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) - - -@pytest.mark.skip(reason="AssertionError: Series are different") -def test_timestamp_astype_string(): - bf_series = series.Series( - [ - "2014-08-15 08:15:12+00:00", - "2015-08-15 08:15:12.654754+05:00", - "2016-02-29 00:00:00+08:00", - ] - ).astype(pd.ArrowDtype(pa.timestamp("us", tz="UTC"))) - - expected_result = pd.Series( - [ - "2014-08-15 08:15:12+00", - "2015-08-15 03:15:12.654754+00", - "2016-02-28 16:00:00+00", - ] - ) - bf_result = bf_series.astype(pa.string()).to_pandas() - - pd.testing.assert_series_equal( - bf_result, expected_result, check_index_type=False, check_dtype=False - ) - assert 
bf_result.dtype == "string[pyarrow]" - - -@pytest.mark.skip(reason="AssertionError: Series are different") -@pytest.mark.parametrize("errors", ["raise", "null"]) -def test_float_astype_json(errors): - data = ["1.25", "2500000000", None, "-12323.24"] - bf_series = series.Series(data, dtype=dtypes.FLOAT_DTYPE) - - bf_result = bf_series.astype(dtypes.JSON_DTYPE, errors=errors) - assert bf_result.dtype == dtypes.JSON_DTYPE - - expected_result = pd.Series(data, dtype=dtypes.JSON_DTYPE) - expected_result.index = expected_result.index.astype("Int64") - pd.testing.assert_series_equal(bf_result.to_pandas(), expected_result) - - -@pytest.mark.skip(reason="AssertionError: Series are different") -def test_float_astype_json_str(): - data = ["1.25", "2500000000", None, "-12323.24"] - bf_series = series.Series(data, dtype=dtypes.FLOAT_DTYPE) - - bf_result = bf_series.astype("json") - assert bf_result.dtype == dtypes.JSON_DTYPE - - expected_result = pd.Series(data, dtype=dtypes.JSON_DTYPE) - expected_result.index = expected_result.index.astype("Int64") - pd.testing.assert_series_equal(bf_result.to_pandas(), expected_result) - - -@pytest.mark.parametrize("errors", ["raise", "null"]) -def test_string_astype_json(errors): - data = [ - "1", - None, - '["1","3","5"]', - '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}', - ] - bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE) - - bf_result = bf_series.astype(dtypes.JSON_DTYPE, errors=errors) - assert bf_result.dtype == dtypes.JSON_DTYPE - - pd_result = bf_series.to_pandas().astype(dtypes.JSON_DTYPE) - pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) - - -@pytest.mark.skip(reason="AssertionError: Series NA mask are different") -def test_string_astype_json_in_safe_mode(): - data = ["this is not a valid json string"] - bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE) - bf_result = bf_series.astype(dtypes.JSON_DTYPE, errors="null") - assert bf_result.dtype == dtypes.JSON_DTYPE - - expected = pd.Series([None], 
dtype=dtypes.JSON_DTYPE) - expected.index = expected.index.astype("Int64") - pd.testing.assert_series_equal(bf_result.to_pandas(), expected) - - -@pytest.mark.skip( - reason="Failed: DID NOT RAISE " -) -def test_string_astype_json_raise_error(): - data = ["this is not a valid json string"] - bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE) - with pytest.raises( - google.api_core.exceptions.BadRequest, - match="syntax error while parsing value", - ): - bf_series.astype(dtypes.JSON_DTYPE, errors="raise").to_pandas() - - -@pytest.mark.parametrize("errors", ["raise", "null"]) -@pytest.mark.parametrize( - ("data", "to_type"), - [ - pytest.param(["1", "10.0", None], dtypes.INT_DTYPE, id="to_int"), - pytest.param(["0.0001", "2500000000", None], dtypes.FLOAT_DTYPE, id="to_float"), - pytest.param(["true", "false", None], dtypes.BOOL_DTYPE, id="to_bool"), - pytest.param(['"str"', None], dtypes.STRING_DTYPE, id="to_string"), - pytest.param( - ['"str"', None], - dtypes.TIME_DTYPE, - id="invalid", - marks=pytest.mark.xfail(raises=TypeError), - ), - ], -) -def test_json_astype_others(data, to_type, errors): - bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE) - - bf_result = bf_series.astype(to_type, errors=errors) - assert bf_result.dtype == to_type - - load_data = [json.loads(item) if item is not None else None for item in data] - expected = pd.Series(load_data, dtype=to_type) - expected.index = expected.index.astype("Int64") - pd.testing.assert_series_equal(bf_result.to_pandas(), expected) - - -@pytest.mark.skip( - reason="Failed: DID NOT RAISE " -) -@pytest.mark.parametrize( - ("data", "to_type"), - [ - pytest.param(["10.2", None], dtypes.INT_DTYPE, id="to_int"), - pytest.param(["false", None], dtypes.FLOAT_DTYPE, id="to_float"), - pytest.param(["10.2", None], dtypes.BOOL_DTYPE, id="to_bool"), - pytest.param(["true", None], dtypes.STRING_DTYPE, id="to_string"), - ], -) -def test_json_astype_others_raise_error(data, to_type): - bf_series = series.Series(data, 
dtype=dtypes.JSON_DTYPE) - with pytest.raises(google.api_core.exceptions.BadRequest): - bf_series.astype(to_type, errors="raise").to_pandas() - - -@pytest.mark.skip(reason="AssertionError: Series NA mask are different") -@pytest.mark.parametrize( - ("data", "to_type"), - [ - pytest.param(["10.2", None], dtypes.INT_DTYPE, id="to_int"), - pytest.param(["false", None], dtypes.FLOAT_DTYPE, id="to_float"), - pytest.param(["10.2", None], dtypes.BOOL_DTYPE, id="to_bool"), - pytest.param(["true", None], dtypes.STRING_DTYPE, id="to_string"), - ], -) -def test_json_astype_others_in_safe_mode(data, to_type): - bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE) - bf_result = bf_series.astype(to_type, errors="null") - assert bf_result.dtype == to_type - - expected = pd.Series([None, None], dtype=to_type) - expected.index = expected.index.astype("Int64") - pd.testing.assert_series_equal(bf_result.to_pandas(), expected) - - -@pytest.mark.parametrize( - "index", - [0, 5, -2], -) -def test_iloc_single_integer(scalars_df_index, scalars_pandas_df_index, index): - bf_result = scalars_df_index.string_col.iloc[index] - pd_result = scalars_pandas_df_index.string_col.iloc[index] - - assert bf_result == pd_result - - -def test_iloc_single_integer_out_of_bound_error(scalars_df_index): - with pytest.raises(IndexError, match="single positional indexer is out-of-bounds"): - scalars_df_index.string_col.iloc[99] - - -def test_loc_bool_series_explicit_index(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.string_col.loc[scalars_df_index.bool_col].to_pandas() - pd_result = scalars_pandas_df_index.string_col.loc[scalars_pandas_df_index.bool_col] - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -@pytest.mark.skip(reason="fixture 'scalars_pandas_df_default_index' not found") -def test_loc_bool_series_default_index( - scalars_df_default_index, scalars_pandas_df_default_index -): - bf_result = scalars_df_default_index.string_col.loc[ - 
scalars_df_default_index.bool_col - ].to_pandas() - pd_result = scalars_pandas_df_default_index.string_col.loc[ - scalars_pandas_df_default_index.bool_col - ] - - assert_pandas_df_equal( - bf_result.to_frame(), - pd_result.to_frame(), - ) - - -def test_argmin(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.string_col.argmin() - pd_result = scalars_pandas_df_index.string_col.argmin() - assert bf_result == pd_result - - -def test_argmax(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.int64_too.argmax() - pd_result = scalars_pandas_df_index.int64_too.argmax() - assert bf_result == pd_result - - -def test_series_idxmin(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.string_col.idxmin() - pd_result = scalars_pandas_df_index.string_col.idxmin() - assert bf_result == pd_result - - -def test_series_idxmax(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.int64_too.idxmax() - pd_result = scalars_pandas_df_index.int64_too.idxmax() - assert bf_result == pd_result - - -def test_getattr_attribute_error_when_pandas_has(scalars_df_index): - # asof is implemented in pandas but not in bigframes - with pytest.raises(AttributeError): - scalars_df_index.string_col.asof() - - -def test_getattr_attribute_error(scalars_df_index): - with pytest.raises(AttributeError): - scalars_df_index.string_col.not_a_method() - - -def test_rename(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.string_col.rename("newname") - pd_result = scalars_pandas_df_index.string_col.rename("newname") - - pd.testing.assert_series_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_rename_nonstring(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.string_col.rename((4, 2)) - pd_result = scalars_pandas_df_index.string_col.rename((4, 2)) - - pd.testing.assert_series_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def 
test_rename_dict_same_type(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.string_col.rename({1: 100, 2: 200}) - pd_result = scalars_pandas_df_index.string_col.rename({1: 100, 2: 200}) - - pd_result.index = pd_result.index.astype("Int64") - - pd.testing.assert_series_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_rename_axis(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.string_col.rename_axis("newindexname") - pd_result = scalars_pandas_df_index.string_col.rename_axis("newindexname") - - pd.testing.assert_series_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_loc_list_string_index(scalars_df_index, scalars_pandas_df_index): - index_list = scalars_pandas_df_index.string_col.iloc[[0, 1, 1, 5]].values - - scalars_df_index = scalars_df_index.set_index("string_col", drop=False) - scalars_pandas_df_index = scalars_pandas_df_index.set_index( - "string_col", drop=False - ) - - bf_result = scalars_df_index.string_col.loc[index_list] - pd_result = scalars_pandas_df_index.string_col.loc[index_list] - - pd.testing.assert_series_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_loc_list_integer_index(scalars_df_index, scalars_pandas_df_index): - index_list = [3, 2, 1, 3, 2, 1] - - bf_result = scalars_df_index.bool_col.loc[index_list] - pd_result = scalars_pandas_df_index.bool_col.loc[index_list] - - pd.testing.assert_series_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_loc_list_multiindex(scalars_df_index, scalars_pandas_df_index): - scalars_df_multiindex = scalars_df_index.set_index(["string_col", "int64_col"]) - scalars_pandas_df_multiindex = scalars_pandas_df_index.set_index( - ["string_col", "int64_col"] - ) - index_list = [("Hello, World!", -234892), ("Hello, World!", 123456789)] - - bf_result = scalars_df_multiindex.int64_too.loc[index_list] - pd_result = scalars_pandas_df_multiindex.int64_too.loc[index_list] - - pd.testing.assert_series_equal( - 
bf_result.to_pandas(), - pd_result, - ) - - -def test_iloc_list(scalars_df_index, scalars_pandas_df_index): - index_list = [0, 0, 0, 5, 4, 7] - - bf_result = scalars_df_index.string_col.iloc[index_list] - pd_result = scalars_pandas_df_index.string_col.iloc[index_list] - - pd.testing.assert_series_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_iloc_list_nameless(scalars_df_index, scalars_pandas_df_index): - index_list = [0, 0, 0, 5, 4, 7] - - bf_series = scalars_df_index.string_col.rename(None) - bf_result = bf_series.iloc[index_list] - pd_series = scalars_pandas_df_index.string_col.rename(None) - pd_result = pd_series.iloc[index_list] - - pd.testing.assert_series_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_loc_list_nameless(scalars_df_index, scalars_pandas_df_index): - index_list = [0, 0, 0, 5, 4, 7] - - bf_series = scalars_df_index.string_col.rename(None) - bf_result = bf_series.loc[index_list] - - pd_series = scalars_pandas_df_index.string_col.rename(None) - pd_result = pd_series.loc[index_list] - - pd.testing.assert_series_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_loc_bf_series_string_index(scalars_df_index, scalars_pandas_df_index): - pd_string_series = scalars_pandas_df_index.string_col.iloc[[0, 5, 1, 1, 5]] - bf_string_series = scalars_df_index.string_col.iloc[[0, 5, 1, 1, 5]] - - scalars_df_index = scalars_df_index.set_index("string_col") - scalars_pandas_df_index = scalars_pandas_df_index.set_index("string_col") - - bf_result = scalars_df_index.date_col.loc[bf_string_series] - pd_result = scalars_pandas_df_index.date_col.loc[pd_string_series] - - pd.testing.assert_series_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_loc_bf_series_multiindex(scalars_df_index, scalars_pandas_df_index): - pd_string_series = scalars_pandas_df_index.string_col.iloc[[0, 5, 1, 1, 5]] - bf_string_series = scalars_df_index.string_col.iloc[[0, 5, 1, 1, 5]] - - scalars_df_multiindex = 
scalars_df_index.set_index(["string_col", "int64_col"]) - scalars_pandas_df_multiindex = scalars_pandas_df_index.set_index( - ["string_col", "int64_col"] - ) - - bf_result = scalars_df_multiindex.int64_too.loc[bf_string_series] - pd_result = scalars_pandas_df_multiindex.int64_too.loc[pd_string_series] - - pd.testing.assert_series_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_loc_bf_index_integer_index(scalars_df_index, scalars_pandas_df_index): - pd_index = scalars_pandas_df_index.iloc[[0, 5, 1, 1, 5]].index - bf_index = scalars_df_index.iloc[[0, 5, 1, 1, 5]].index - - bf_result = scalars_df_index.date_col.loc[bf_index] - pd_result = scalars_pandas_df_index.date_col.loc[pd_index] - - pd.testing.assert_series_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_loc_single_index_with_duplicate(scalars_df_index, scalars_pandas_df_index): - scalars_df_index = scalars_df_index.set_index("string_col", drop=False) - scalars_pandas_df_index = scalars_pandas_df_index.set_index( - "string_col", drop=False - ) - index = "Hello, World!" 
- bf_result = scalars_df_index.date_col.loc[index] - pd_result = scalars_pandas_df_index.date_col.loc[index] - pd.testing.assert_series_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_loc_single_index_no_duplicate(scalars_df_index, scalars_pandas_df_index): - scalars_df_index = scalars_df_index.set_index("int64_too", drop=False) - scalars_pandas_df_index = scalars_pandas_df_index.set_index("int64_too", drop=False) - index = -2345 - bf_result = scalars_df_index.date_col.loc[index] - pd_result = scalars_pandas_df_index.date_col.loc[index] - assert bf_result == pd_result - - -def test_series_bool_interpretation_error(scalars_df_index): - with pytest.raises(ValueError): - True if scalars_df_index["string_col"] else False - - -@pytest.mark.skip( - reason="NotImplementedError: dry_run not implemented for this executor" -) -def test_query_job_setters(scalars_dfs): - # if allow_large_results=False, might not create query job - with bigframes.option_context("compute.allow_large_results", True): - job_ids = set() - df, _ = scalars_dfs - series = df["int64_col"] - assert series.query_job is not None - repr(series) - job_ids.add(series.query_job.job_id) - series.to_pandas() - job_ids.add(series.query_job.job_id) - assert len(job_ids) == 2 - - -@pytest.mark.parametrize( - ("series_input",), - [ - ([1, 2, 3, 4, 5],), - ([1, 1, 3, 5, 5],), - ([1, pd.NA, 4, 5, 5],), - ([1, 3, 2, 5, 4],), - ([pd.NA, pd.NA],), - ([1, 1, 1, 1, 1],), - ], -) -def test_is_monotonic_increasing(series_input): - scalars_df = series.Series(series_input, dtype=pd.Int64Dtype()) - scalars_pandas_df = pd.Series(series_input, dtype=pd.Int64Dtype()) - assert ( - scalars_df.is_monotonic_increasing == scalars_pandas_df.is_monotonic_increasing - ) - - -@pytest.mark.parametrize( - ("series_input",), - [ - ([1],), - ([5, 4, 3, 2, 1],), - ([5, 5, 3, 1, 1],), - ([1, pd.NA, 4, 5, 5],), - ([5, pd.NA, 4, 2, 1],), - ([1, 1, 1, 1, 1],), - ], -) -def test_is_monotonic_decreasing(series_input): - scalars_df = 
series.Series(series_input) - scalars_pandas_df = pd.Series(series_input) - assert ( - scalars_df.is_monotonic_decreasing == scalars_pandas_df.is_monotonic_decreasing - ) - - -def test_map_dict_input(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - local_map = dict() - # construct a local map, incomplete to cover behavior - for s in scalars_pandas_df.string_col[:-3]: - if isinstance(s, str): - local_map[s] = ord(s[0]) - - pd_result = scalars_pandas_df.string_col.map(local_map) - pd_result = pd_result.astype("Int64") # pandas type differences - bf_result = scalars_df.string_col.map(local_map) - - pd.testing.assert_series_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_map_series_input(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - new_index = scalars_pandas_df.int64_too.drop_duplicates() - pd_map_series = scalars_pandas_df.string_col.iloc[0 : len(new_index)] - pd_map_series.index = new_index - bf_map_series = series.Series( - pd_map_series, session=scalars_df._get_block().expr.session - ) - - pd_result = scalars_pandas_df.int64_too.map(pd_map_series) - bf_result = scalars_df.int64_too.map(bf_map_series) - - pd.testing.assert_series_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_map_series_input_duplicates_error(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - new_index = scalars_pandas_df.int64_too - pd_map_series = scalars_pandas_df.string_col.iloc[0 : len(new_index)] - pd_map_series.index = new_index - bf_map_series = series.Series( - pd_map_series, session=scalars_df._get_block().expr.session - ) - - with pytest.raises(pd.errors.InvalidIndexError): - scalars_pandas_df.int64_too.map(pd_map_series) - with pytest.raises(pd.errors.InvalidIndexError): - scalars_df.int64_too.map(bf_map_series, verify_integrity=True) - - -@pytest.mark.skip( - reason="NotImplementedError: Polars compiler hasn't implemented hash()" -) -@pytest.mark.parametrize( - ("frac", "n", "random_state"), - [ - (None, 4, 
None), - (0.5, None, None), - (None, 4, 10), - (0.5, None, 10), - (None, None, None), - ], - ids=[ - "n_wo_random_state", - "frac_wo_random_state", - "n_w_random_state", - "frac_w_random_state", - "n_default", - ], -) -def test_sample(scalars_dfs, frac, n, random_state): - scalars_df, _ = scalars_dfs - df = scalars_df.int64_col.sample(frac=frac, n=n, random_state=random_state) - bf_result = df.to_pandas() - - n = 1 if n is None else n - expected_sample_size = round(frac * scalars_df.shape[0]) if frac is not None else n - assert bf_result.shape[0] == expected_sample_size - - -def test_series_iter( - scalars_df_index, - scalars_pandas_df_index, -): - for bf_i, pd_i in zip( - scalars_df_index["int64_too"], scalars_pandas_df_index["int64_too"] - ): - assert bf_i == pd_i - - -@pytest.mark.parametrize( - ( - "col", - "lambda_", - ), - [ - pytest.param("int64_col", lambda x: x * x + x + 1), - pytest.param("int64_col", lambda x: x % 2 == 1), - pytest.param("string_col", lambda x: x + "_suffix"), - ], - ids=[ - "lambda_int_int", - "lambda_int_bool", - "lambda_str_str", - ], -) -def test_apply_lambda(scalars_dfs, col, lambda_): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_col = scalars_df[col] - - # Can't be applied to BigFrames Series without by_row=False - with pytest.raises(ValueError, match="by_row=False"): - bf_col.apply(lambda_) - - bf_result = bf_col.apply(lambda_, by_row=False).to_pandas() - - pd_col = scalars_pandas_df[col] - if pd.__version__[:3] in ("2.2", "2.3"): - pd_result = pd_col.apply(lambda_, by_row=False) - else: - pd_result = pd_col.apply(lambda_) - - # ignore dtype check, which are Int64 and object respectively - # Some columns implicitly convert to floating point. 
Use check_exact=False to ensure we're "close enough" - assert_series_equal( - bf_result, pd_result, check_dtype=False, check_exact=False, rtol=0.001 - ) - - -@pytest.mark.skip( - reason="NotImplementedError: Polars compiler hasn't implemented log()" -) -@pytest.mark.parametrize( - ("ufunc",), - [ - pytest.param(numpy.log), - pytest.param(numpy.sqrt), - pytest.param(numpy.sin), - ], - ids=[ - "log", - "sqrt", - "sin", - ], -) -def test_apply_numpy_ufunc(scalars_dfs, ufunc): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_col = scalars_df["int64_col"] - - # Can't be applied to BigFrames Series without by_row=False - with pytest.raises(ValueError, match="by_row=False"): - bf_col.apply(ufunc) - - bf_result = bf_col.apply(ufunc, by_row=False).to_pandas() - - pd_col = scalars_pandas_df["int64_col"] - pd_result = pd_col.apply(ufunc) - - assert_series_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ("ufunc",), - [ - pytest.param(numpy.add), - pytest.param(numpy.divide), - ], - ids=[ - "add", - "divide", - ], -) -def test_combine_series_ufunc(scalars_dfs, ufunc): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_col = scalars_df["int64_col"].dropna() - bf_result = bf_col.combine(bf_col, ufunc).to_pandas() - - pd_col = scalars_pandas_df["int64_col"].dropna() - pd_result = pd_col.combine(pd_col, ufunc) - - assert_series_equal(bf_result, pd_result, check_dtype=False) - - -def test_combine_scalar_ufunc(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_col = scalars_df["int64_col"].dropna() - bf_result = bf_col.combine(2.5, numpy.add).to_pandas() - - pd_col = scalars_pandas_df["int64_col"].dropna() - pd_result = pd_col.combine(2.5, numpy.add) - - assert_series_equal(bf_result, pd_result, check_dtype=False) - - -def test_apply_simple_udf(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - def foo(x): - return x * x + 2 * x + 3 - - bf_col = scalars_df["int64_col"] - - # Can't be applied to BigFrames Series without by_row=False - 
with pytest.raises(ValueError, match="by_row=False"): - bf_col.apply(foo) - - bf_result = bf_col.apply(foo, by_row=False).to_pandas() - - pd_col = scalars_pandas_df["int64_col"] - - if pd.__version__[:3] in ("2.2", "2.3"): - pd_result = pd_col.apply(foo, by_row=False) - else: - pd_result = pd_col.apply(foo) - - # ignore dtype check, which are Int64 and object respectively - # Some columns implicitly convert to floating point. Use check_exact=False to ensure we're "close enough" - assert_series_equal( - bf_result, pd_result, check_dtype=False, check_exact=False, rtol=0.001 - ) - - -@pytest.mark.parametrize( - ("col", "lambda_", "exception"), - [ - pytest.param("int64_col", {1: 2, 3: 4}, ValueError), - pytest.param("int64_col", numpy.square, TypeError), - pytest.param("string_col", lambda x: x.capitalize(), AttributeError), - ], - ids=[ - "not_callable", - "numpy_ufunc", - "custom_lambda", - ], -) -def test_apply_not_supported(scalars_dfs, col, lambda_, exception): - scalars_df, _ = scalars_dfs - - bf_col = scalars_df[col] - with pytest.raises(exception): - bf_col.apply(lambda_, by_row=False) - - -def test_series_pipe( - scalars_df_index, - scalars_pandas_df_index, -): - column = "int64_too" - - def foo(x: int, y: int, df): - return (df + x) % y - - bf_result = ( - scalars_df_index[column] - .pipe((foo, "df"), x=7, y=9) - .pipe(lambda x: x**2) - .to_pandas() - ) - - pd_result = ( - scalars_pandas_df_index[column] - .pipe((foo, "df"), x=7, y=9) - .pipe(lambda x: x**2) - ) - - assert_series_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ("data"), - [ - pytest.param([1, 2, 3], id="int"), - pytest.param([[1, 2, 3], [], numpy.nan, [3, 4]], id="int_array"), - pytest.param( - [["A", "AA", "AAA"], ["BB", "B"], numpy.nan, [], ["C"]], id="string_array" - ), - pytest.param( - [ - {"A": {"x": 1.0}, "B": "b"}, - {"A": {"y": 2.0}, "B": "bb"}, - {"A": {"z": 4.0}}, - {}, - numpy.nan, - ], - id="struct_array", - ), - ], -) -def test_series_explode(data): - s = 
bigframes.pandas.Series(data) - pd_s = s.to_pandas() - pd.testing.assert_series_equal( - s.explode().to_pandas(), - pd_s.explode(), - check_index_type=False, - check_dtype=False, - ) - - -@pytest.mark.parametrize( - ("index", "ignore_index"), - [ - pytest.param(None, True, id="default_index"), - pytest.param(None, False, id="ignore_default_index"), - pytest.param([5, 1, 3, 2], True, id="unordered_index"), - pytest.param([5, 1, 3, 2], False, id="ignore_unordered_index"), - pytest.param(["z", "x", "a", "b"], True, id="str_index"), - pytest.param(["z", "x", "a", "b"], False, id="ignore_str_index"), - pytest.param( - pd.Index(["z", "x", "a", "b"], name="idx"), True, id="str_named_index" - ), - pytest.param( - pd.Index(["z", "x", "a", "b"], name="idx"), - False, - id="ignore_str_named_index", - ), - pytest.param( - pd.MultiIndex.from_frame( - pd.DataFrame({"idx0": [5, 1, 3, 2], "idx1": ["z", "x", "a", "b"]}) - ), - True, - id="multi_index", - ), - pytest.param( - pd.MultiIndex.from_frame( - pd.DataFrame({"idx0": [5, 1, 3, 2], "idx1": ["z", "x", "a", "b"]}) - ), - False, - id="ignore_multi_index", - ), - ], -) -def test_series_explode_w_index(index, ignore_index): - data = [[], [200.0, 23.12], [4.5, -9.0], [1.0]] - s = bigframes.pandas.Series(data, index=index) - pd_s = pd.Series(data, index=index) - # TODO(b/340885567): fix type error - pd.testing.assert_series_equal( - s.explode(ignore_index=ignore_index).to_pandas(), # type: ignore - pd_s.explode(ignore_index=ignore_index).astype(pd.Float64Dtype()), # type: ignore - check_index_type=False, - ) - - -@pytest.mark.parametrize( - ("ignore_index", "ordered"), - [ - pytest.param(True, True, id="include_index_ordered"), - pytest.param(True, False, id="include_index_unordered"), - pytest.param(False, True, id="ignore_index_ordered"), - ], -) -def test_series_explode_reserve_order(ignore_index, ordered): - data = [numpy.random.randint(0, 10, 10) for _ in range(10)] - s = bigframes.pandas.Series(data) - pd_s = pd.Series(data) - 
- # TODO(b/340885567): fix type error - res = s.explode(ignore_index=ignore_index).to_pandas(ordered=ordered) # type: ignore - # TODO(b/340885567): fix type error - pd_res = pd_s.explode(ignore_index=ignore_index).astype(pd.Int64Dtype()) # type: ignore - pd_res.index = pd_res.index.astype(pd.Int64Dtype()) - pd.testing.assert_series_equal( - res if ordered else res.sort_index(), - pd_res, - ) - - -def test_series_explode_w_aggregate(): - data = [[1, 2, 3], [], numpy.nan, [3, 4]] - s = bigframes.pandas.Series(data) - pd_s = pd.Series(data) - assert s.explode().sum() == pd_s.explode().sum() - - -def test_series_construct_empty_array(): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - s = bigframes.pandas.Series([[]]) - expected = pd.Series( - [[]], - dtype=pd.ArrowDtype(pa.list_(pa.float64())), - index=pd.Index([0], dtype=pd.Int64Dtype()), - ) - pd.testing.assert_series_equal( - expected, - s.to_pandas(), - ) - - -@pytest.mark.parametrize( - ("data"), - [ - pytest.param(numpy.nan, id="null"), - pytest.param([numpy.nan], id="null_array"), - pytest.param([[]], id="empty_array"), - pytest.param([numpy.nan, []], id="null_and_empty_array"), - ], -) -def test_series_explode_null(data): - s = bigframes.pandas.Series(data) - pd.testing.assert_series_equal( - s.explode().to_pandas(), - s.to_pandas().explode(), - check_dtype=False, - ) - - -@pytest.mark.skip( - reason="NotImplementedError: Polars compiler hasn't implemented IntegerLabelToDatetimeOp(freq=<75 * Days>, label=None, origin='start_day')" -) -@pytest.mark.parametrize( - ("append", "level", "col", "rule"), - [ - pytest.param(False, None, "timestamp_col", "75D"), - pytest.param(True, 1, "timestamp_col", "25W"), - pytest.param(False, None, "datetime_col", "3ME"), - pytest.param(True, "timestamp_col", "timestamp_col", "1YE"), - ], -) -def test__resample(scalars_df_index, scalars_pandas_df_index, append, level, col, rule): - # TODO: supply a reason 
why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df_index = scalars_df_index.set_index(col, append=append)["int64_col"] - scalars_pandas_df_index = scalars_pandas_df_index.set_index(col, append=append)[ - "int64_col" - ] - bf_result = scalars_df_index._resample(rule=rule, level=level).min().to_pandas() - pd_result = scalars_pandas_df_index.resample(rule=rule, level=level).min() - pd.testing.assert_series_equal(bf_result, pd_result) - - -@pytest.mark.skip(reason="fixture 'nested_structs_df' not found") -def test_series_struct_get_field_by_attribute( - nested_structs_df, nested_structs_pandas_df -): - if Version(pd.__version__) < Version("2.2.0"): - pytest.skip("struct accessor is not supported before pandas 2.2") - - bf_series = nested_structs_df["person"] - df_series = nested_structs_pandas_df["person"] - - pd.testing.assert_series_equal( - bf_series.address.city.to_pandas(), - df_series.struct.field("address").struct.field("city"), - check_dtype=False, - check_index=False, - ) - pd.testing.assert_series_equal( - bf_series.address.country.to_pandas(), - df_series.struct.field("address").struct.field("country"), - check_dtype=False, - check_index=False, - ) - - -@pytest.mark.skip(reason="fixture 'nested_structs_df' not found") -def test_series_struct_fields_in_dir(nested_structs_df): - series = nested_structs_df["person"] - - assert "age" in dir(series) - assert "address" in dir(series) - assert "city" in dir(series.address) - assert "country" in dir(series.address) - - -@pytest.mark.skip(reason="fixture 'nested_structs_df' not found") -def test_series_struct_class_attributes_shadow_struct_fields(nested_structs_df): - series = nested_structs_df["person"] - - assert series.name == "person" - - -@pytest.mark.skip( - reason="NotImplementedError: dry_run not implemented for this executor" -) -def test_series_to_pandas_dry_run(scalars_df_index): - bf_series = scalars_df_index["int64_col"] - - result = 
bf_series.to_pandas(dry_run=True) - - assert isinstance(result, pd.Series) - assert len(result) > 0 - - -def test_series_item(session): - # Test with a single item - bf_s_single = bigframes.pandas.Series([42], session=session) - pd_s_single = pd.Series([42]) - assert bf_s_single.item() == pd_s_single.item() - - -def test_series_item_with_multiple(session): - # Test with multiple items - bf_s_multiple = bigframes.pandas.Series([1, 2, 3], session=session) - pd_s_multiple = pd.Series([1, 2, 3]) - - try: - pd_s_multiple.item() - except ValueError as e: - expected_message = str(e) - else: - raise AssertionError("Expected ValueError from pandas, but didn't get one") - - with pytest.raises(ValueError, match=re.escape(expected_message)): - bf_s_multiple.item() - - -def test_series_item_with_empty(session): - # Test with an empty Series - bf_s_empty = bigframes.pandas.Series([], dtype="Int64", session=session) - pd_s_empty = pd.Series([], dtype="Int64") - - try: - pd_s_empty.item() - except ValueError as e: - expected_message = str(e) - else: - raise AssertionError("Expected ValueError from pandas, but didn't get one") - - with pytest.raises(ValueError, match=re.escape(expected_message)): - bf_s_empty.item() From ddbb32dd2baab6f0fde38d940f685040525f2d63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 9 Oct 2025 15:52:05 +0000 Subject: [PATCH 30/63] revert new session methods --- bigframes/core/indexes/base.py | 11 +-- bigframes/core/indexes/multi.py | 48 +----------- bigframes/core/log_adapter.py | 4 +- bigframes/core/reshape/tile.py | 7 +- bigframes/core/tools/datetimes.py | 10 +-- bigframes/pandas/__init__.py | 17 ++-- bigframes/session/__init__.py | 124 ++---------------------------- tests/unit/test_pandas.py | 26 +++---- 8 files changed, 35 insertions(+), 212 deletions(-) diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index a258c01195..83dd11dacb 100644 --- a/bigframes/core/indexes/base.py +++ 
b/bigframes/core/indexes/base.py @@ -383,16 +383,9 @@ def to_series( name = self.name if name is None else name if index is None: - return bigframes.series.Series( - data=self, index=self, name=name, session=self._session - ) + return bigframes.series.Series(data=self, index=self, name=name) else: - return bigframes.series.Series( - data=self, - index=Index(index, session=self._session), - name=name, - session=self._session, - ) + return bigframes.series.Series(data=self, index=Index(index), name=name) def get_level_values(self, level) -> Index: level_n = level if isinstance(level, int) else self.names.index(level) diff --git a/bigframes/core/indexes/multi.py b/bigframes/core/indexes/multi.py index a611442b88..a8b4b7dffe 100644 --- a/bigframes/core/indexes/multi.py +++ b/bigframes/core/indexes/multi.py @@ -14,7 +14,7 @@ from __future__ import annotations -from typing import cast, Hashable, Iterable, Optional, Sequence, TYPE_CHECKING +from typing import cast, Hashable, Iterable, Sequence import bigframes_vendored.pandas.core.indexes.multi as vendored_pandas_multindex import pandas @@ -23,9 +23,6 @@ from bigframes.core import expression as ex from bigframes.core.indexes.base import Index -if TYPE_CHECKING: - import bigframes.session - class MultiIndex(Index, vendored_pandas_multindex.MultiIndex): __doc__ = vendored_pandas_multindex.MultiIndex.__doc__ @@ -36,12 +33,10 @@ def from_tuples( tuples: Iterable[tuple[Hashable, ...]], sortorder: int | None = None, names: Sequence[Hashable] | Hashable | None = None, - *, - session: Optional[bigframes.session.Session] = None, ) -> MultiIndex: pd_index = pandas.MultiIndex.from_tuples(tuples, sortorder, names) # Index.__new__ should detect multiple levels and properly create a multiindex - return cast(MultiIndex, Index(pd_index, session=session)) + return cast(MultiIndex, Index(pd_index)) @classmethod def from_arrays( @@ -49,12 +44,10 @@ def from_arrays( arrays, sortorder: int | None = None, names=None, - *, - session: 
Optional[bigframes.session.Session] = None, ) -> MultiIndex: pd_index = pandas.MultiIndex.from_arrays(arrays, sortorder, names) # Index.__new__ should detect multiple levels and properly create a multiindex - return cast(MultiIndex, Index(pd_index, session=session)) + return cast(MultiIndex, Index(pd_index)) def __eq__(self, other) -> Index: # type: ignore import bigframes.operations as ops @@ -78,38 +71,3 @@ def __eq__(self, other) -> Index: # type: ignore index_labels=[None], ) ) - - -class MultiIndexAccessor: - """Proxy to MultiIndex constructors to allow a session to be passed in.""" - - def __init__(self, session: bigframes.session.Session): - self._session = session - - def __call__(self, *args, **kwargs) -> MultiIndex: - """Construct a MultiIndex using the associated Session. - - See :class:`bigframes.pandas.MultiIndex`. - """ - return MultiIndex(*args, session=self._session, **kwargs) - - def from_arrays(self, *args, **kwargs) -> MultiIndex: - """Construct a MultiIndex using the associated Session. - - See :func:`bigframes.pandas.MultiIndex.from_arrays`. - """ - return MultiIndex.from_arrays(*args, session=self._session, **kwargs) - - def from_frame(self, *args, **kwargs) -> MultiIndex: - """Construct a MultiIndex using the associated Session. - - See :func:`bigframes.pandas.MultiIndex.from_frame`. - """ - return cast(MultiIndex, MultiIndex.from_frame(*args, **kwargs)) - - def from_tuples(self, *args, **kwargs) -> MultiIndex: - """Construct a MultiIndex using the associated Session. - - See :func:`bigframes.pandas.MultiIndex.from_tuples`. 
- """ - return MultiIndex.from_tuples(*args, session=self._session, **kwargs) diff --git a/bigframes/core/log_adapter.py b/bigframes/core/log_adapter.py index 8179ffbeed..3ec1e86dc7 100644 --- a/bigframes/core/log_adapter.py +++ b/bigframes/core/log_adapter.py @@ -155,9 +155,7 @@ def method_logger(method=None, /, *, custom_base_name: Optional[str] = None): def outer_wrapper(method): @functools.wraps(method) def wrapper(*args, **kwargs): - api_method_name = getattr( - method, LOG_OVERRIDE_NAME, method.__name__ - ).lower() + api_method_name = getattr(method, LOG_OVERRIDE_NAME, method.__name__) if custom_base_name is None: qualname_parts = getattr(method, "__qualname__", method.__name__).split( "." diff --git a/bigframes/core/reshape/tile.py b/bigframes/core/reshape/tile.py index a2efa8f927..74a941be54 100644 --- a/bigframes/core/reshape/tile.py +++ b/bigframes/core/reshape/tile.py @@ -15,7 +15,6 @@ from __future__ import annotations import typing -from typing import Optional, TYPE_CHECKING import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile @@ -32,9 +31,6 @@ import bigframes.operations.aggregations as agg_ops import bigframes.series -if TYPE_CHECKING: - import bigframes.session - def cut( x, @@ -46,7 +42,6 @@ def cut( *, right: typing.Optional[bool] = True, labels: typing.Union[typing.Iterable[str], bool, None] = None, - session: Optional[bigframes.session.Session] = None, ) -> bigframes.series.Series: if ( labels is not None @@ -70,7 +65,7 @@ def cut( raise ValueError("Cannot cut empty array.") if not isinstance(x, bigframes.series.Series): - x = bigframes.series.Series(x, session=session) + x = bigframes.series.Series(x) if isinstance(bins, int): if bins <= 0: diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py index 0e5594d498..7edf2fa2e4 100644 --- a/bigframes/core/tools/datetimes.py +++ b/bigframes/core/tools/datetimes.py @@ -12,11 +12,9 @@ # See the License 
for the specific language governing permissions and # limitations under the License. -from __future__ import annotations - from collections.abc import Mapping from datetime import date, datetime -from typing import Optional, TYPE_CHECKING, Union +from typing import Optional, Union import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.tools.datetimes as vendored_pandas_datetimes @@ -27,9 +25,6 @@ import bigframes.operations as ops import bigframes.series -if TYPE_CHECKING: - import bigframes.session - def to_datetime( arg: Union[ @@ -42,7 +37,6 @@ def to_datetime( utc: bool = False, format: Optional[str] = None, unit: Optional[str] = None, - session: Optional[bigframes.session.Session] = None, ) -> Union[pd.Timestamp, datetime, bigframes.series.Series]: if isinstance(arg, (int, float, str, datetime, date)): return pd.to_datetime( @@ -58,7 +52,7 @@ def to_datetime( f"to datetime is not implemented. {constants.FEEDBACK_LINK}" ) - arg = bigframes.series.Series(arg, session=session) + arg = bigframes.series.Series(arg) if format and unit and arg.dtype in (bigframes.dtypes.INT_DTYPE, bigframes.dtypes.FLOAT_DTYPE): # type: ignore raise ValueError("cannot specify both format and unit") diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 6fcb71f0d8..2455637b0a 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -16,8 +16,8 @@ from __future__ import annotations -import collections -import datetime +from collections import namedtuple +from datetime import date, datetime import inspect import sys import typing @@ -198,18 +198,18 @@ def to_datetime( @typing.overload def to_datetime( - arg: Union[int, float, str, datetime.datetime, datetime.date], + arg: Union[int, float, str, datetime, date], *, utc: bool = False, format: Optional[str] = None, unit: Optional[str] = None, -) -> Union[pandas.Timestamp, datetime.datetime]: +) -> Union[pandas.Timestamp, datetime]: ... 
def to_datetime( arg: Union[ - Union[int, float, str, datetime.datetime, datetime.date], + Union[int, float, str, datetime, date], vendored_pandas_datetimes.local_iterables, bigframes.series.Series, bigframes.dataframe.DataFrame, @@ -218,9 +218,8 @@ def to_datetime( utc: bool = False, format: Optional[str] = None, unit: Optional[str] = None, -) -> Union[pandas.Timestamp, datetime.datetime, bigframes.series.Series]: - return global_session.with_default_session( - bigframes.session.Session.to_datetime, +) -> Union[pandas.Timestamp, datetime, bigframes.series.Series]: + return bigframes.core.tools.to_datetime( arg, utc=utc, format=format, @@ -323,7 +322,7 @@ def clean_up_by_session_id( __version__ = bigframes.version.__version__ # Other public pandas attributes -NamedAgg = collections.namedtuple("NamedAgg", ["column", "aggfunc"]) +NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"]) options = config.options """Global :class:`~bigframes._config.Options` to configure BigQuery DataFrames.""" diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 886072b884..46fb56b88e 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -68,8 +68,6 @@ import bigframes.core from bigframes.core import blocks, log_adapter, utils import bigframes.core.events -import bigframes.core.indexes -import bigframes.core.indexes.multi import bigframes.core.pyformat import bigframes.formatting_helpers import bigframes.functions._function_session as bff_session @@ -81,6 +79,7 @@ # Avoid circular imports. if typing.TYPE_CHECKING: + import bigframes.core.indexes import bigframes.dataframe as dataframe import bigframes.series import bigframes.streaming.dataframe as streaming_dataframe @@ -321,15 +320,6 @@ def bqconnectionmanager(self): ) return self._bq_connection_manager - @property - def options(self) -> bigframes._config.Options: - """Options for configuring BigQuery DataFrames. - - Included for compatibility between bpd and Session. 
- """ - # TODO(tswast): Consider making a separate session-level options object. - return bigframes._config.options - @property def session_id(self): return self._session_id @@ -1836,7 +1826,7 @@ def udf( Turning an arbitrary python function into a BigQuery managed python udf: >>> bq_name = datetime.datetime.now().strftime("bigframes_%Y%m%d%H%M%S%f") - >>> @bpd.udf(dataset="bigfranes_testing", name=bq_name) # doctest: +SKIP + >>> @bpd.udf(dataset="bigfranes_testing", name=bq_name) ... def minutes_to_hours(x: int) -> float: ... return x/60 @@ -1849,8 +1839,8 @@ def udf( 4 120 dtype: Int64 - >>> hours = minutes.apply(minutes_to_hours) # doctest: +SKIP - >>> hours # doctest: +SKIP + >>> hours = minutes.apply(minutes_to_hours) + >>> hours 0 0.0 1 0.5 2 1.0 @@ -1863,7 +1853,7 @@ def udf( packages (optionally with the package version) via `packages` param. >>> bq_name = datetime.datetime.now().strftime("bigframes_%Y%m%d%H%M%S%f") - >>> @bpd.udf( # doctest: +SKIP + >>> @bpd.udf( ... dataset="bigfranes_testing", ... name=bq_name, ... packages=["cryptography"] @@ -1880,14 +1870,14 @@ def udf( ... 
return f.encrypt(input.encode()).decode() >>> names = bpd.Series(["Alice", "Bob"]) - >>> hashes = names.apply(get_hash) # doctest: +SKIP + >>> hashes = names.apply(get_hash) You can clean-up the BigQuery functions created above using the BigQuery client from the BigQuery DataFrames session: >>> session = bpd.get_global_session() - >>> session.bqclient.delete_routine(minutes_to_hours.bigframes_bigquery_function) # doctest: +SKIP - >>> session.bqclient.delete_routine(get_hash.bigframes_bigquery_function) # doctest: +SKIP + >>> session.bqclient.delete_routine(minutes_to_hours.bigframes_bigquery_function) + >>> session.bqclient.delete_routine(get_hash.bigframes_bigquery_function) Args: input_types (type or sequence(type), Optional): @@ -2307,104 +2297,6 @@ def read_gbq_object_table( s = self._loader.read_gbq_table(object_table)["uri"].str.to_blob(connection) return s.rename(name).to_frame() - # ========================================================================= - # bigframes.pandas attributes - # - # These are included so that Session and bigframes.pandas can be used - # interchangeably. - # ========================================================================= - def cut(self, *args, **kwargs) -> bigframes.series.Series: - """Cuts a BigQuery DataFrames object. - - Included for compatibility between bpd and Session. - - See :func:`bigframes.pandas.cut` for full documentation. - """ - import bigframes.core.reshape.tile - - return bigframes.core.reshape.tile.cut( - *args, - session=self, - **kwargs, - ) - - def DataFrame(self, *args, **kwargs): - """Constructs a DataFrame. - - Included for compatibility between bpd and Session. - - See :class:`bigframes.pandas.DataFrame` for full documentation. - """ - import bigframes.dataframe - - return bigframes.dataframe.DataFrame(*args, session=self, **kwargs) - - @property - def MultiIndex(self) -> bigframes.core.indexes.multi.MultiIndexAccessor: - """Constructs a MultiIndex. 
- - Included for compatibility between bpd and Session. - - See :class:`bigframes.pandas.MulitIndex` for full documentation. - """ - import bigframes.core.indexes.multi - - return bigframes.core.indexes.multi.MultiIndexAccessor(self) - - def Index(self, *args, **kwargs): - """Constructs a Index. - - Included for compatibility between bpd and Session. - - See :class:`bigframes.pandas.Index` for full documentation. - """ - import bigframes.core.indexes - - return bigframes.core.indexes.Index(*args, session=self, **kwargs) - - def Series(self, *args, **kwargs): - """Constructs a Series. - - Included for compatibility between bpd and Session. - - See :class:`bigframes.pandas.Series` for full documentation. - """ - import bigframes.series - - return bigframes.series.Series(*args, session=self, **kwargs) - - def to_datetime( - self, *args, **kwargs - ) -> Union[pandas.Timestamp, datetime.datetime, bigframes.series.Series]: - """Converts a BigQuery DataFrames object to datetime dtype. - - Included for compatibility between bpd and Session. - - See :func:`bigframes.pandas.to_datetime` for full documentation. - """ - import bigframes.core.tools - - return bigframes.core.tools.to_datetime( - *args, - session=self, - **kwargs, - ) - - def to_timedelta(self, *args, **kwargs): - """Converts a BigQuery DataFrames object to timedelta/duration dtype. - - Included for compatibility between bpd and Session. - - See :func:`bigframes.pandas.to_timedelta` for full documentation. 
- """ - import bigframes.pandas.core.tools.timedeltas - - return bigframes.pandas.core.tools.timedeltas.to_timedelta( - *args, - session=self, - **kwargs, - ) - def connect(context: Optional[bigquery_options.BigQueryOptions] = None) -> Session: return Session(context) diff --git a/tests/unit/test_pandas.py b/tests/unit/test_pandas.py index 5e75e6b20f..73e0b7f2d6 100644 --- a/tests/unit/test_pandas.py +++ b/tests/unit/test_pandas.py @@ -64,12 +64,8 @@ def test_method_matches_session(method_name: str): pandas_method = getattr(bigframes.pandas, method_name) pandas_doc = inspect.getdoc(pandas_method) assert pandas_doc is not None, "docstrings are required" - - pandas_doc_stripped = re.sub(leading_whitespace, "", pandas_doc) - session_doc_stripped = re.sub(leading_whitespace, "", session_doc) - assert ( - pandas_doc_stripped == session_doc_stripped - or ":`bigframes.pandas" in session_doc_stripped + assert re.sub(leading_whitespace, "", pandas_doc) == re.sub( + leading_whitespace, "", session_doc ) # Add `eval_str = True` so that deferred annotations are turned into their @@ -79,20 +75,18 @@ def test_method_matches_session(method_name: str): eval_str=True, globals={**vars(bigframes.session), **{"dataframe": bigframes.dataframe}}, ) - session_args = [ - # Kind includes position, which will be an offset. - parameter.replace(kind=inspect.Parameter.POSITIONAL_ONLY) - for parameter in session_signature.parameters.values() - # Don't include the first parameter, which is `self: Session` - ][1:] pandas_signature = inspect.signature(pandas_method, eval_str=True) - pandas_args = [ + assert [ # Kind includes position, which will be an offset. parameter.replace(kind=inspect.Parameter.POSITIONAL_ONLY) for parameter in pandas_signature.parameters.values() - ] - assert session_args == pandas_args or ["args", "kwargs"] == [ - parameter.name for parameter in session_args + ] == [ + # Kind includes position, which will be an offset. 
+ parameter.replace(kind=inspect.Parameter.POSITIONAL_ONLY) + for parameter in session_signature.parameters.values() + # Don't include the first parameter, which is `self: Session` + ][ + 1: ] assert pandas_signature.return_annotation == session_signature.return_annotation From d80bfcb717d4e8186b276738c7d01c24c17f6caa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 9 Oct 2025 16:08:59 +0000 Subject: [PATCH 31/63] fix TestSession read_pandas for Series --- bigframes/dataframe.py | 2 +- bigframes/operations/base.py | 27 ++++----------------------- bigframes/testing/polars_session.py | 12 +++++++++--- 3 files changed, 14 insertions(+), 27 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 3527b225e2..bc2bbb963b 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -688,7 +688,7 @@ def _getitem_label(self, key: blocks.Label): return DataFrame(block) if len(col_ids) == 1: - return bigframes.series.Series(block, name=key) + return bigframes.series.Series(block) return DataFrame(block) # Bool Series selects rows diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 7d6a1c3b68..f2bbcb3320 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -14,7 +14,6 @@ from __future__ import annotations -import enum import typing from typing import List, Sequence, Union @@ -36,18 +35,6 @@ import bigframes.session -class Default(enum.Enum): - """Sentinel that can disambiguate explicit None from missing. 
- - See https://stackoverflow.com/a/76606310/101923 - """ - - token = 0 - - -DEFAULT = Default.token - - class SeriesMethods: def __init__( self, @@ -56,7 +43,7 @@ def __init__( dtype: typing.Optional[ bigframes.dtypes.DtypeString | bigframes.dtypes.Dtype ] = None, - name: str | None | Default = DEFAULT, + name: str | None = None, copy: typing.Optional[bool] = None, *, session: typing.Optional[bigframes.session.Session] = None, @@ -120,7 +107,6 @@ def __init__( block = data_block if block: - # Data was a bigframes object. assert len(block.value_columns) == 1 assert len(block.column_labels) == 1 if index is not None: # reindexing operation @@ -129,27 +115,23 @@ def __init__( idx_cols = idx_block.index_columns block, _ = idx_block.join(block, how="left") block = block.with_index_labels(bf_index.names) - if name is not DEFAULT: + if name: block = block.with_column_labels([name]) if dtype: bf_dtype = bigframes.dtypes.bigframes_type(dtype) block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype)) else: - # Data was local. 
if isinstance(dtype, str) and dtype.lower() == "json": dtype = bigframes.dtypes.JSON_DTYPE pd_series = pd.Series( data=data, index=index, # type:ignore dtype=dtype, # type:ignore - name=name if name is not DEFAULT else None, + name=name, ) - name = pd_series.name # type: ignore block = read_pandas_func(pd_series)._get_block() # type:ignore - block = block.with_column_labels([name]) assert block is not None - self._block: blocks.Block = block @property @@ -178,8 +160,7 @@ def _apply_unary_op( block, result_id = self._block.apply_unary_op( self._value_column, op, result_label=self._name ) - result = series.Series(block.select_column(result_id), name=self._name) - return result + return series.Series(block.select_column(result_id)) def _apply_binary_op( self, diff --git a/bigframes/testing/polars_session.py b/bigframes/testing/polars_session.py index 4d3e6862b9..ba6d502fcc 100644 --- a/bigframes/testing/polars_session.py +++ b/bigframes/testing/polars_session.py @@ -94,17 +94,23 @@ def __init__(self): self._loader = None # type: ignore def read_pandas(self, pandas_dataframe, write_engine="default"): + original_input = pandas_dataframe + # override read_pandas to always keep data local-only if isinstance(pandas_dataframe, (pandas.Series, pandas.Index)): pandas_dataframe = pandas_dataframe.to_frame() + local_block = bigframes.core.blocks.Block.from_local(pandas_dataframe, self) bf_df = bigframes.dataframe.DataFrame(local_block) - if isinstance(pandas_dataframe, pandas.Series): + + if isinstance(original_input, pandas.Series): series = bf_df[bf_df.columns[0]] - series.name = pandas_dataframe.name + series.name = original_input.name return series - if isinstance(pandas_dataframe, pandas.Index): + + if isinstance(original_input, pandas.Index): return bf_df.index + return bf_df @property From 0a5a9353bd3e97cc74c9000af0cb4e5a379bc5b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 9 Oct 2025 16:10:56 +0000 Subject: [PATCH 32/63] revert more unnecessary 
changes --- scripts/publish_api_coverage.py | 3 --- tests/unit/test_dataframe_polars.py | 3 +-- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/scripts/publish_api_coverage.py b/scripts/publish_api_coverage.py index 181b8c3365..8f305bcc0f 100644 --- a/scripts/publish_api_coverage.py +++ b/scripts/publish_api_coverage.py @@ -204,9 +204,6 @@ def generate_pandas_api_coverage(): def generate_sklearn_api_coverage(): """Explore all SKLearn modules, and for each item contained generate a regex to detect it being imported, and record whether we implement it""" - - import sklearn # noqa - sklearn_modules = [ "sklearn", "sklearn.model_selection", diff --git a/tests/unit/test_dataframe_polars.py b/tests/unit/test_dataframe_polars.py index c95c647fa8..a6f5c3d1ef 100644 --- a/tests/unit/test_dataframe_polars.py +++ b/tests/unit/test_dataframe_polars.py @@ -2319,8 +2319,7 @@ def test_binop_with_self_aggregate(session, scalars_dfs): df_columns = ["int64_col", "float64_col", "int64_too"] bf_df = scalars_df[df_columns] - bf_deviation = bf_df - bf_df.mean() - bf_result = bf_deviation.to_pandas() + bf_result = (bf_df - bf_df.mean()).to_pandas() pd_df = scalars_pandas_df[df_columns] pd_result = pd_df - pd_df.mean() From 11262443537ca528d9d67c7ef93b296578837b99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 9 Oct 2025 16:12:24 +0000 Subject: [PATCH 33/63] even more --- bigframes/core/blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index cf3518ff29..f9896784bb 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -2471,7 +2471,7 @@ def _align_series_block_axis_1( def _align_pd_series_axis_1( self, other: pd.Series, how: str ) -> Tuple[Block, pd.Index, Sequence[Tuple[ex.RefOrConstant, ex.RefOrConstant]]]: - if self.column_labels.astype("object").equals(other.index.astype("object")): + if self.column_labels.equals(other.index): columns, lcol_indexer, 
rcol_indexer = self.column_labels, None, None else: if not (self.column_labels.is_unique and other.index.is_unique): From d63a95f72db7264847cf81504576175f064d0124 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 9 Oct 2025 18:23:32 +0000 Subject: [PATCH 34/63] add unit_noextras to improve code coverage --- noxfile.py | 1 + 1 file changed, 1 insertion(+) diff --git a/noxfile.py b/noxfile.py index 703937d453..095a10c1e2 100644 --- a/noxfile.py +++ b/noxfile.py @@ -124,6 +124,7 @@ # Sessions are executed in the order so putting the smaller sessions # ahead to fail fast at presubmit running. nox.options.sessions = [ + "unit_noextras", "system-3.9", # No extras. "system-3.11", "cover", From 6aadbaf229714efd3665b79a52a40111973d7b9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 9 Oct 2025 19:03:32 +0000 Subject: [PATCH 35/63] run system tests on latest fully supported --- noxfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index 095a10c1e2..f2d25103c8 100644 --- a/noxfile.py +++ b/noxfile.py @@ -126,7 +126,7 @@ nox.options.sessions = [ "unit_noextras", "system-3.9", # No extras. - "system-3.11", + f"system-{LATEST_FULLY_SUPPORTED_PYTHON}", # All extras. "cover", # TODO(b/401609005): remove "cleanup", From 95e4394fc39edf7c9b8447deda8c98c55ad1a2ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 9 Oct 2025 19:09:35 +0000 Subject: [PATCH 36/63] system-3.12 not found --- noxfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index f2d25103c8..e7f3299933 100644 --- a/noxfile.py +++ b/noxfile.py @@ -89,7 +89,7 @@ # 3.10 is needed for Windows tests as it is the only version installed in the # bigframes-windows container image. For more information, search # bigframes/windows-docker, internally. 
-SYSTEM_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.13"] +SYSTEM_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13"] SYSTEM_TEST_STANDARD_DEPENDENCIES = [ "jinja2", "mock", From d33147ae86e15156b96a1e6336879fb4b4f5fe7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 9 Oct 2025 22:25:28 +0000 Subject: [PATCH 37/63] cap polars version --- setup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index abc760b691..8072a3a3f5 100644 --- a/setup.py +++ b/setup.py @@ -77,7 +77,9 @@ "google-cloud-pubsub >=2.21.4", ], # used for local engine - "polars": ["polars >= 1.21.0"], + # TODO(tswast): relax upper pin when issue with test_engines_astype_int + # and test_divmods_series is resolved. + "polars": ["polars >= 1.21.0, <1.34.0"], "scikit-learn": ["scikit-learn>=1.2.2"], # Packages required for basic development flow. "dev": [ From a7f542c04e2fc85ea1c6ee165e0e6d509960bcb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 14 Oct 2025 19:02:53 +0000 Subject: [PATCH 38/63] hide progress bar --- .../bigframes_vendored/pandas/core/arrays/datetimelike.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py index 1736a7f9ef..22e946edcd 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py @@ -53,6 +53,7 @@ def normalize(self): >>> import pandas as pd >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(pd.date_range( ... start='2014-08-01 10:00', ... 
freq='h', @@ -87,6 +88,7 @@ def floor(self, freq: str): >>> import pandas as pd >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> rng = pd.date_range('1/1/2018 11:59:00', periods=3, freq='min') >>> bpd.Series(rng).dt.floor("h") 0 2018-01-01 11:00:00 From ace646a08b137eef2c76324bb54077316701282c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 14 Oct 2025 19:14:44 +0000 Subject: [PATCH 39/63] relax polars upper pin --- setup.py | 4 +--- tests/system/small/engines/test_generic_ops.py | 8 +++++++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 8072a3a3f5..abc760b691 100644 --- a/setup.py +++ b/setup.py @@ -77,9 +77,7 @@ "google-cloud-pubsub >=2.21.4", ], # used for local engine - # TODO(tswast): relax upper pin when issue with test_engines_astype_int - # and test_divmods_series is resolved. - "polars": ["polars >= 1.21.0, <1.34.0"], + "polars": ["polars >= 1.21.0"], "scikit-learn": ["scikit-learn>=1.2.2"], # Packages required for basic development flow. "dev": [ diff --git a/tests/system/small/engines/test_generic_ops.py b/tests/system/small/engines/test_generic_ops.py index fc491d358b..f252782dbd 100644 --- a/tests/system/small/engines/test_generic_ops.py +++ b/tests/system/small/engines/test_generic_ops.py @@ -22,7 +22,7 @@ from bigframes.session import polars_executor from bigframes.testing.engine_utils import assert_equivalence_execution -pytest.importorskip("polars") +polars = pytest.importorskip("polars") # Polars used as reference as its fast and local. Generally though, prefer gbq engine where they disagree. 
REFERENCE_ENGINE = polars_executor.PolarsExecutor() @@ -54,6 +54,12 @@ def apply_op( @pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) def test_engines_astype_int(scalars_array_value: array_value.ArrayValue, engine): + polars_version = tuple([int(part) for part in polars.__version__.split(".")]) + if polars_version >= (1, 34, 0): + # TODO(https://github.com/pola-rs/polars/issues/24841): Remove this when + # polars fixes Decimal to Int cast. + scalars_array_value = scalars_array_value.drop_columns(["numeric_col"]) + arr = apply_op( scalars_array_value, ops.AsTypeOp(to_type=bigframes.dtypes.INT_DTYPE), From b2c078bb217679a7914707321bcad4754212135c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 14 Oct 2025 19:36:05 +0000 Subject: [PATCH 40/63] try to restore docs changes --- conftest.py | 45 +++++++++++++++++++ noxfile.py | 6 +-- .../pandas/core/arrays/datetimelike.py | 5 --- 3 files changed, 47 insertions(+), 9 deletions(-) create mode 100644 conftest.py diff --git a/conftest.py b/conftest.py new file mode 100644 index 0000000000..e1f3f6d84c --- /dev/null +++ b/conftest.py @@ -0,0 +1,45 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import numpy as np +import pandas as pd +import pyarrow as pa +import pytest + +import bigframes._config + + +@pytest.fixture(scope="session") +def polars_session(): + pytest.importorskip("polars") + + from bigframes.testing import polars_session + + return polars_session.TestSession() + + +@pytest.fixture(autouse=True) +def default_doctest_imports(doctest_namespace, polars_session): + """ + Avoid some boilerplate in pandas-inspired tests. + + See: https://docs.pytest.org/en/stable/how-to/doctest.html#doctest-namespace-fixture + """ + doctest_namespace["np"] = np + doctest_namespace["pd"] = pd + doctest_namespace["pa"] = pa + doctest_namespace["bpd"] = polars_session + bigframes._config.options.display.progress_bar = None diff --git a/noxfile.py b/noxfile.py index 41a073f05d..f9c20c999c 100644 --- a/noxfile.py +++ b/noxfile.py @@ -46,9 +46,7 @@ "3.11", ] -# pytest-retry is not yet compatible with pytest 8.x. -# https://github.com/str0zzapreti/pytest-retry/issues/32 -PYTEST_VERSION = "pytest<8.0.0dev" +PYTEST_VERSION = "pytest==8.4.2" SPHINX_VERSION = "sphinx==4.5.0" LINT_PATHS = [ "docs", @@ -115,7 +113,7 @@ # Make sure we leave some versions without "extras" so we know those # dependencies are actually optional. 
"3.10": ["tests", "scikit-learn", "anywidget"], - "3.11": ["tests", "scikit-learn", "polars", "anywidget"], + LATEST_FULLY_SUPPORTED_PYTHON: ["tests", "scikit-learn", "polars", "anywidget"], "3.13": ["tests", "polars", "anywidget"], } diff --git a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py index 22e946edcd..95af4d5d2c 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py @@ -15,7 +15,6 @@ def strftime(self, date_format: str): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.to_datetime( ... ['2014-08-15 08:15:12', '2012-02-29 08:15:12+06:00', '2015-08-15 08:15:12+05:00'], @@ -52,8 +51,6 @@ def normalize(self): **Examples:** >>> import pandas as pd - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(pd.date_range( ... start='2014-08-01 10:00', ... 
freq='h', @@ -87,8 +84,6 @@ def floor(self, freq: str): **Examples:** >>> import pandas as pd - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> rng = pd.date_range('1/1/2018 11:59:00', periods=3, freq='min') >>> bpd.Series(rng).dt.floor("h") 0 2018-01-01 11:00:00 From e6f711f57170aad510db6f328346e65141506250 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 14 Oct 2025 19:45:39 +0000 Subject: [PATCH 41/63] remove progress bar boilerplate --- bigframes/bigquery/_operations/ai.py | 7 - bigframes/bigquery/_operations/approx_agg.py | 1 - bigframes/bigquery/_operations/array.py | 3 - bigframes/bigquery/_operations/datetime.py | 3 - bigframes/bigquery/_operations/geo.py | 9 -- bigframes/bigquery/_operations/json.py | 11 -- bigframes/bigquery/_operations/search.py | 1 - bigframes/bigquery/_operations/sql.py | 1 - bigframes/bigquery/_operations/struct.py | 1 - bigframes/dataframe.py | 3 - bigframes/ml/compose.py | 1 - bigframes/operations/ai.py | 7 - bigframes/operations/semantics.py | 8 -- bigframes/operations/strings.py | 1 - bigframes/series.py | 3 - bigframes/session/__init__.py | 7 - .../bigframes_vendored/geopandas/geoseries.py | 9 -- .../pandas/core/arrays/arrow/accessors.py | 6 - .../pandas/core/computation/eval.py | 1 - .../pandas/core/config_init.py | 1 - .../bigframes_vendored/pandas/core/frame.py | 127 ------------------ .../bigframes_vendored/pandas/core/generic.py | 12 -- .../pandas/core/groupby/__init__.py | 38 ------ .../pandas/core/indexes/accessor.py | 20 --- .../pandas/core/indexes/base.py | 35 ----- .../pandas/core/indexes/datetimes.py | 6 - .../pandas/core/indexes/multi.py | 2 - .../pandas/core/reshape/tile.py | 1 - .../bigframes_vendored/pandas/core/series.py | 120 ----------------- .../pandas/core/strings/accessor.py | 35 ----- .../pandas/core/tools/datetimes.py | 1 - .../pandas/core/tools/timedeltas.py | 1 - .../bigframes_vendored/pandas/io/gbq.py | 1 - 
.../bigframes_vendored/pandas/io/parquet.py | 1 - .../pandas/io/parsers/readers.py | 2 - .../bigframes_vendored/pandas/io/pickle.py | 1 - .../pandas/plotting/_core.py | 9 -- .../sklearn/cluster/_kmeans.py | 1 - .../sklearn/decomposition/_mf.py | 1 - .../sklearn/decomposition/_pca.py | 1 - .../sklearn/impute/_base.py | 1 - .../sklearn/linear_model/_base.py | 1 - .../sklearn/linear_model/_logistic.py | 1 - .../sklearn/metrics/_classification.py | 5 - .../sklearn/metrics/_ranking.py | 3 - .../sklearn/metrics/_regression.py | 3 - .../sklearn/model_selection/_split.py | 2 - .../sklearn/model_selection/_validation.py | 1 - .../sklearn/preprocessing/_encoder.py | 1 - 49 files changed, 517 deletions(-) diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index f4302f8ece..e0af130016 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -53,7 +53,6 @@ def generate( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> country = bpd.Series(["Japan", "Canada"]) >>> bbq.ai.generate(("What's the capital city of ", country, " one word only")) 0 {'result': 'Tokyo\\n', 'full_response': '{"cand... @@ -155,7 +154,6 @@ def generate_bool( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... "col_1": ["apple", "bear", "pear"], ... "col_2": ["fruit", "animal", "animal"] @@ -240,7 +238,6 @@ def generate_int( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> animal = bpd.Series(["Kangaroo", "Rabbit", "Spider"]) >>> bbq.ai.generate_int(("How many legs does a ", animal, " have?")) 0 {'result': 2, 'full_response': '{"candidates":... 
@@ -322,7 +319,6 @@ def generate_double( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> animal = bpd.Series(["Kangaroo", "Rabbit", "Spider"]) >>> bbq.ai.generate_double(("How many legs does a ", animal, " have?")) 0 {'result': 2.0, 'full_response': '{"candidates... @@ -402,7 +398,6 @@ def if_( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> us_state = bpd.Series(["Massachusetts", "Illinois", "Hawaii"]) >>> bbq.ai.if_((us_state, " has a city called Springfield")) 0 True @@ -459,7 +454,6 @@ def classify( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'creature': ['Cat', 'Salmon']}) >>> df['type'] = bbq.ai.classify(df['creature'], ['Mammal', 'Fish']) >>> df @@ -517,7 +511,6 @@ def score( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> animal = bpd.Series(["Tiger", "Rabbit", "Blue Whale"]) >>> bbq.ai.score(("Rank the relative weights of ", animal, " on the scale from 1 to 3")) # doctest: +SKIP 0 2.0 diff --git a/bigframes/bigquery/_operations/approx_agg.py b/bigframes/bigquery/_operations/approx_agg.py index 696f8f5a66..73b6fdbb73 100644 --- a/bigframes/bigquery/_operations/approx_agg.py +++ b/bigframes/bigquery/_operations/approx_agg.py @@ -40,7 +40,6 @@ def approx_top_count( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["apple", "apple", "pear", "pear", "pear", "banana"]) >>> bbq.approx_top_count(s, number=2) [{'value': 'pear', 'count': 3}, {'value': 'apple', 'count': 2}] diff --git a/bigframes/bigquery/_operations/array.py b/bigframes/bigquery/_operations/array.py index 4af1416127..b21453ec45 100644 --- a/bigframes/bigquery/_operations/array.py +++ 
b/bigframes/bigquery/_operations/array.py @@ -40,7 +40,6 @@ def array_length(series: series.Series) -> series.Series: >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([[1, 2, 8, 3], [], [3, 4]]) >>> bbq.array_length(s) @@ -79,7 +78,6 @@ def array_agg( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> import numpy as np - >>> bpd.options.display.progress_bar = None For a SeriesGroupBy object: @@ -129,7 +127,6 @@ def array_to_string(series: series.Series, delimiter: str) -> series.Series: >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([["H", "i", "!"], ["Hello", "World"], np.nan, [], ["Hi"]]) >>> bbq.array_to_string(s, delimiter=", ") diff --git a/bigframes/bigquery/_operations/datetime.py b/bigframes/bigquery/_operations/datetime.py index f8767336dd..78272a514b 100644 --- a/bigframes/bigquery/_operations/datetime.py +++ b/bigframes/bigquery/_operations/datetime.py @@ -24,7 +24,6 @@ def unix_seconds(input: series.Series) -> series.Series: >>> import pandas as pd >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([pd.Timestamp("1970-01-02", tz="UTC"), pd.Timestamp("1970-01-03", tz="UTC")]) >>> bbq.unix_seconds(s) @@ -51,7 +50,6 @@ def unix_millis(input: series.Series) -> series.Series: >>> import pandas as pd >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([pd.Timestamp("1970-01-02", tz="UTC"), pd.Timestamp("1970-01-03", tz="UTC")]) >>> bbq.unix_millis(s) @@ -78,7 +76,6 @@ def unix_micros(input: series.Series) -> series.Series: >>> import pandas as pd >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s 
= bpd.Series([pd.Timestamp("1970-01-02", tz="UTC"), pd.Timestamp("1970-01-03", tz="UTC")]) >>> bbq.unix_micros(s) diff --git a/bigframes/bigquery/_operations/geo.py b/bigframes/bigquery/_operations/geo.py index 9a92a8960d..254d2ae13f 100644 --- a/bigframes/bigquery/_operations/geo.py +++ b/bigframes/bigquery/_operations/geo.py @@ -53,7 +53,6 @@ def st_area( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Polygon, LineString, Point - >>> bpd.options.display.progress_bar = None >>> series = bigframes.geopandas.GeoSeries( ... [ @@ -125,7 +124,6 @@ def st_buffer( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Point - >>> bpd.options.display.progress_bar = None >>> series = bigframes.geopandas.GeoSeries( ... [ @@ -195,7 +193,6 @@ def st_centroid( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Polygon, LineString, Point - >>> bpd.options.display.progress_bar = None >>> series = bigframes.geopandas.GeoSeries( ... [ @@ -250,7 +247,6 @@ def st_convexhull( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Polygon, LineString, Point - >>> bpd.options.display.progress_bar = None >>> series = bigframes.geopandas.GeoSeries( ... [ @@ -312,7 +308,6 @@ def st_difference( >>> import bigframes.bigquery as bbq >>> import bigframes.geopandas >>> from shapely.geometry import Polygon, LineString, Point - >>> bpd.options.display.progress_bar = None We can check two GeoSeries against each other, row by row: @@ -407,7 +402,6 @@ def st_distance( >>> import bigframes.bigquery as bbq >>> import bigframes.geopandas >>> from shapely.geometry import Polygon, LineString, Point - >>> bpd.options.display.progress_bar = None We can check two GeoSeries against each other, row by row. 
@@ -489,7 +483,6 @@ def st_intersection( >>> import bigframes.bigquery as bbq >>> import bigframes.geopandas >>> from shapely.geometry import Polygon, LineString, Point - >>> bpd.options.display.progress_bar = None We can check two GeoSeries against each other, row by row. @@ -583,7 +576,6 @@ def st_isclosed( >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Point, LineString, Polygon - >>> bpd.options.display.progress_bar = None >>> series = bigframes.geopandas.GeoSeries( ... [ @@ -650,7 +642,6 @@ def st_length( >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Polygon, LineString, Point, GeometryCollection - >>> bpd.options.display.progress_bar = None >>> series = bigframes.geopandas.GeoSeries( ... [ diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py index 656e59af0d..fb8ebcdfcb 100644 --- a/bigframes/bigquery/_operations/json.py +++ b/bigframes/bigquery/_operations/json.py @@ -50,7 +50,6 @@ def json_set( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"] >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")]) @@ -101,7 +100,6 @@ def json_extract( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}']) >>> bbq.json_extract(s, json_path="$.class") @@ -141,7 +139,6 @@ def json_extract_array( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) >>> bbq.json_extract_array(s) @@ -204,7 +201,6 @@ def json_extract_string_array( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['[1, 2, 3]', 
'[4, 5]']) >>> bbq.json_extract_string_array(s) @@ -272,7 +268,6 @@ def json_query( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}']) >>> bbq.json_query(s, json_path="$.class") @@ -303,7 +298,6 @@ def json_query_array( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) >>> bbq.json_query_array(s) @@ -355,7 +349,6 @@ def json_value( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['{"name": "Jakob", "age": "6"}', '{"name": "Jakob", "age": []}']) >>> bbq.json_value(s, json_path="$.age") @@ -392,7 +385,6 @@ def json_value_array( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) >>> bbq.json_value_array(s) @@ -439,7 +431,6 @@ def to_json( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3]) >>> bbq.to_json(s) @@ -473,7 +464,6 @@ def to_json_string( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3]) >>> bbq.to_json_string(s) @@ -512,7 +502,6 @@ def parse_json( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}']) >>> s diff --git a/bigframes/bigquery/_operations/search.py b/bigframes/bigquery/_operations/search.py index c16c2af1a9..b65eed2475 100644 --- a/bigframes/bigquery/_operations/search.py +++ b/bigframes/bigquery/_operations/search.py @@ -111,7 +111,6 @@ def vector_search( >>> import 
bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None DataFrame embeddings for which to find nearest neighbors. The ``ARRAY`` column is used as the search query: diff --git a/bigframes/bigquery/_operations/sql.py b/bigframes/bigquery/_operations/sql.py index a2de61fc21..21b490d708 100644 --- a/bigframes/bigquery/_operations/sql.py +++ b/bigframes/bigquery/_operations/sql.py @@ -38,7 +38,6 @@ def sql_scalar( >>> import bigframes.bigquery as bbq >>> import pandas as pd >>> import pyarrow as pa - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1.5", "2.5", "3.5"]) >>> s = s.astype(pd.ArrowDtype(pa.decimal128(38, 9))) diff --git a/bigframes/bigquery/_operations/struct.py b/bigframes/bigquery/_operations/struct.py index 7cb826351c..a6304677ef 100644 --- a/bigframes/bigquery/_operations/struct.py +++ b/bigframes/bigquery/_operations/struct.py @@ -39,7 +39,6 @@ def struct(value: dataframe.DataFrame) -> series.Series: >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> import bigframes.series as series - >>> bpd.options.display.progress_bar = None >>> srs = series.Series([{"version": 1, "project": "pandas"}, {"version": 2, "project": "numpy"},]) >>> df = srs.struct.explode() diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index bc2bbb963b..69d9ce22bf 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1771,7 +1771,6 @@ def to_pandas( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col': [4, 2, 2]}) Download the data from BigQuery and convert it into an in-memory pandas DataFrame. 
@@ -1893,7 +1892,6 @@ def to_pandas_batches( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col': [4, 3, 2, 2, 3]}) Iterate through the results in batches, limiting the total rows yielded @@ -4253,7 +4251,6 @@ def _resample( >>> import bigframes.pandas as bpd >>> import pandas as pd - >>> bpd.options.display.progress_bar = None >>> data = { ... "timestamp_col": pd.date_range( diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 92c98695cd..54ce7066cb 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -69,7 +69,6 @@ class SQLScalarColumnTransformer: >>> from bigframes.ml.compose import ColumnTransformer, SQLScalarColumnTransformer >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'name': ["James", None, "Mary"], 'city': ["New York", "Boston", None]}) >>> col_trans = ColumnTransformer([ diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py index ac294b0fbd..ad58e8825c 100644 --- a/bigframes/operations/ai.py +++ b/bigframes/operations/ai.py @@ -45,7 +45,6 @@ def filter( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 @@ -115,7 +114,6 @@ def map( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 @@ -134,7 +132,6 @@ def map( >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 @@ -266,7 +263,6 @@ def classify( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.ai_operators = True >>> 
bpd.options.compute.ai_ops_confirmation_threshold = 25 @@ -356,7 +352,6 @@ def join( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 @@ -496,7 +491,6 @@ def search( ** Examples: ** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> import bigframes >>> bigframes.options.experiments.ai_operators = True @@ -608,7 +602,6 @@ def sim_join( ** Examples: ** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index 9fa5450748..2266702d47 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -52,7 +52,6 @@ def agg( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 @@ -247,7 +246,6 @@ def cluster_by( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 @@ -321,7 +319,6 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 @@ -435,7 +432,6 @@ def map( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 @@ -558,7 +554,6 @@ 
def join( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 @@ -697,7 +692,6 @@ def search( ** Examples: ** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> import bigframes >>> bigframes.options.experiments.semantic_operators = True @@ -800,7 +794,6 @@ def top_k( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 @@ -1001,7 +994,6 @@ def sim_join( ** Examples: ** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index 4743483954..c69993849a 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -68,7 +68,6 @@ def reverse(self) -> series.Series: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["apple", "banana", "", bpd.NA]) >>> s.str.reverse() diff --git a/bigframes/series.py b/bigframes/series.py index 490298d8dd..4adb6a1730 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -533,7 +533,6 @@ def to_pandas( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([4, 3, 2]) Download the data from BigQuery and convert it into an in-memory pandas Series. 
@@ -661,7 +660,6 @@ def to_pandas_batches( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([4, 3, 2, 2, 3]) Iterate through the results in batches, limiting the total rows yielded @@ -2422,7 +2420,6 @@ def _resample( >>> import bigframes.pandas as bpd >>> import pandas as pd - >>> bpd.options.display.progress_bar = None >>> data = { ... "timestamp_col": pd.date_range( diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 886072b884..960629ea87 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -618,7 +618,6 @@ def read_gbq_query( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Simple query input: @@ -774,7 +773,6 @@ def read_gbq_table( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Read a whole table, with arbitrary ordering or ordering corresponding to the primary key(s). @@ -853,7 +851,6 @@ def read_gbq_table_streaming( >>> import bigframes.streaming as bst >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> sdf = bst.read_gbq_table("bigquery-public-data.ml_datasets.penguins") @@ -882,7 +879,6 @@ def read_gbq_model(self, model_name: str): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Read an existing BigQuery ML model. 
@@ -953,7 +949,6 @@ def read_pandas( >>> import bigframes.pandas as bpd >>> import pandas as pd - >>> bpd.options.display.progress_bar = None >>> d = {'col1': [1, 2], 'col2': [3, 4]} >>> pandas_df = pd.DataFrame(data=d) @@ -1831,7 +1826,6 @@ def udf( >>> import bigframes.pandas as bpd >>> import datetime - >>> bpd.options.display.progress_bar = None Turning an arbitrary python function into a BigQuery managed python udf: @@ -1994,7 +1988,6 @@ def read_gbq_function( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Use the [cw_lower_case_ascii_only](https://github.com/GoogleCloudPlatform/bigquery-utils/blob/master/udfs/community/README.md#cw_lower_case_ascii_onlystr-string) function from Community UDFs. diff --git a/third_party/bigframes_vendored/geopandas/geoseries.py b/third_party/bigframes_vendored/geopandas/geoseries.py index 92a58b3dc6..20587b4d57 100644 --- a/third_party/bigframes_vendored/geopandas/geoseries.py +++ b/third_party/bigframes_vendored/geopandas/geoseries.py @@ -18,7 +18,6 @@ class GeoSeries: >>> import bigframes.geopandas >>> import bigframes.pandas as bpd >>> from shapely.geometry import Point - >>> bpd.options.display.progress_bar = None >>> s = bigframes.geopandas.GeoSeries([Point(1, 1), Point(2, 2), Point(3, 3)]) >>> s @@ -73,7 +72,6 @@ def x(self) -> bigframes.series.Series: >>> import bigframes.pandas as bpd >>> import geopandas.array >>> import shapely.geometry - >>> bpd.options.display.progress_bar = None >>> series = bpd.Series( ... [shapely.geometry.Point(1, 2), shapely.geometry.Point(2, 3), shapely.geometry.Point(3, 4)], @@ -100,7 +98,6 @@ def y(self) -> bigframes.series.Series: >>> import bigframes.pandas as bpd >>> import geopandas.array >>> import shapely.geometry - >>> bpd.options.display.progress_bar = None >>> series = bpd.Series( ... 
[shapely.geometry.Point(1, 2), shapely.geometry.Point(2, 3), shapely.geometry.Point(3, 4)], @@ -129,7 +126,6 @@ def boundary(self) -> bigframes.geopandas.GeoSeries: >>> import bigframes.pandas as bpd >>> import geopandas.array >>> import shapely.geometry - >>> bpd.options.display.progress_bar = None >>> from shapely.geometry import Polygon, LineString, Point >>> s = geopandas.GeoSeries( @@ -171,7 +167,6 @@ def from_xy(cls, x, y, index=None, **kwargs) -> bigframes.geopandas.GeoSeries: >>> import bigframes.pandas as bpd >>> import bigframes.geopandas - >>> bpd.options.display.progress_bar = None >>> x = [2.5, 5, -3.0] >>> y = [0.5, 1, 1.5] @@ -210,7 +205,6 @@ def from_wkt(cls, data, index=None) -> bigframes.geopandas.GeoSeries: >>> import bigframes as bpd >>> import bigframes.geopandas - >>> bpd.options.display.progress_bar = None >>> wkts = [ ... 'POINT (1 1)', @@ -246,7 +240,6 @@ def to_wkt(self) -> bigframes.series.Series: >>> import bigframes as bpd >>> import bigframes.geopandas >>> from shapely.geometry import Point - >>> bpd.options.display.progress_bar = None >>> s = bigframes.geopandas.GeoSeries([Point(1, 1), Point(2, 2), Point(3, 3)]) >>> s @@ -279,7 +272,6 @@ def difference(self: GeoSeries, other: GeoSeries) -> GeoSeries: # type: ignore >>> import bigframes as bpd >>> import bigframes.geopandas >>> from shapely.geometry import Polygon, LineString, Point - >>> bpd.options.display.progress_bar = None We can check two GeoSeries against each other, row by row: @@ -411,7 +403,6 @@ def intersection(self: GeoSeries, other: GeoSeries) -> GeoSeries: # type: ignor >>> import bigframes as bpd >>> import bigframes.geopandas >>> from shapely.geometry import Polygon, LineString, Point - >>> bpd.options.display.progress_bar = None We can check two GeoSeries against each other, row by row. 
diff --git a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py index fe15e7b40d..8515ed5769 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py @@ -20,7 +20,6 @@ def len(self): >>> import bigframes.pandas as bpd >>> import pyarrow as pa - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ ... [1, 2, 3], @@ -46,7 +45,6 @@ def __getitem__(self, key: int | slice): >>> import bigframes.pandas as bpd >>> import pyarrow as pa - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ ... [1, 2, 3], @@ -84,7 +82,6 @@ def field(self, name_or_index: str | int): >>> import bigframes.pandas as bpd >>> import pyarrow as pa - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ ... {"version": 1, "project": "pandas"}, @@ -130,7 +127,6 @@ def explode(self): >>> import bigframes.pandas as bpd >>> import pyarrow as pa - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ ... {"version": 1, "project": "pandas"}, @@ -166,7 +162,6 @@ def dtypes(self): >>> import bigframes.pandas as bpd >>> import pyarrow as pa - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ ... {"version": 1, "project": "pandas"}, @@ -201,7 +196,6 @@ def explode(self, column, *, separator: str = "."): >>> import bigframes.pandas as bpd >>> import pyarrow as pa - >>> bpd.options.display.progress_bar = None >>> countries = bpd.Series(["cn", "es", "us"]) >>> files = bpd.Series( ... 
[ diff --git a/third_party/bigframes_vendored/pandas/core/computation/eval.py b/third_party/bigframes_vendored/pandas/core/computation/eval.py index d3d11a9c2a..3cca7ec5cb 100644 --- a/third_party/bigframes_vendored/pandas/core/computation/eval.py +++ b/third_party/bigframes_vendored/pandas/core/computation/eval.py @@ -173,7 +173,6 @@ def eval( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"animal": ["dog", "pig"], "age": [10, 20]}) >>> df diff --git a/third_party/bigframes_vendored/pandas/core/config_init.py b/third_party/bigframes_vendored/pandas/core/config_init.py index 3425674e4f..dc2b11ab94 100644 --- a/third_party/bigframes_vendored/pandas/core/config_init.py +++ b/third_party/bigframes_vendored/pandas/core/config_init.py @@ -49,7 +49,6 @@ or just remove it. - >>> bpd.options.display.progress_bar = None Setting to default value "auto" will detect and show progress bar automatically. diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 557c332797..1876a1d480 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -40,7 +40,6 @@ def shape(self) -> tuple[int, int]: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col1': [1, 2, 3], ... 
'col2': [4, 5, 6]}) @@ -64,7 +63,6 @@ def axes(self) -> list: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.axes[1:] @@ -79,7 +77,6 @@ def values(self) -> np.ndarray: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.values @@ -111,7 +108,6 @@ def T(self) -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df col1 col2 @@ -147,7 +143,6 @@ def transpose(self) -> DataFrame: **Square DataFrame with homogeneous dtype** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> d1 = {'col1': [1, 2], 'col2': [3, 4]} >>> df1 = bpd.DataFrame(data=d1) @@ -257,7 +252,6 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': ["hello", "world"], 'col3': [True, False]}) >>> df.select_dtypes(include=['Int64']) @@ -381,7 +375,6 @@ def to_numpy( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.to_numpy() @@ -420,7 +413,6 @@ def to_gbq( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Write a DataFrame to a BigQuery table. 
@@ -530,7 +522,6 @@ def to_parquet( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> gcs_bucket = "gs://bigframes-dev-testing/sample_parquet*.parquet" @@ -587,7 +578,6 @@ def to_dict( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.to_dict() @@ -668,7 +658,6 @@ def to_excel( >>> import bigframes.pandas as bpd >>> import tempfile - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.to_excel(tempfile.TemporaryFile()) @@ -704,7 +693,6 @@ def to_latex( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> print(df.to_latex()) @@ -755,7 +743,6 @@ def to_records( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.to_records() @@ -815,7 +802,6 @@ def to_string( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> print(df.to_string()) @@ -915,7 +901,6 @@ def to_html( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> print(df.to_html()) @@ -1025,7 +1010,6 @@ def to_markdown( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> print(df.to_markdown()) @@ -1059,7 +1043,6 @@ def to_pickle(self, path, *, allow_large_results, **kwargs) -> None: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) 
>>> gcs_bucket = "gs://bigframes-dev-testing/sample_pickle_gcs.pkl" @@ -1081,7 +1064,6 @@ def to_orc(self, path=None, *, allow_large_results=None, **kwargs) -> bytes | No **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> import tempfile @@ -1191,7 +1173,6 @@ def insert(self, loc, column, value, allow_duplicates=False): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) @@ -1244,7 +1225,6 @@ def drop( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame(np.arange(12).reshape(3, 4), ... columns=['A', 'B', 'C', 'D']) @@ -1403,7 +1383,6 @@ def rename( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) >>> df @@ -1475,7 +1454,6 @@ def set_index( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'month': [1, 4, 7, 10], ... 'year': [2012, 2014, 2013, 2014], @@ -1617,7 +1595,6 @@ def reset_index( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> import numpy as np >>> df = bpd.DataFrame([('bird', 389.0), @@ -1796,7 +1773,6 @@ def dropna( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], ... "toy": [np.nan, 'Batmobile', 'Bullwhip'], @@ -1909,7 +1885,6 @@ def isin(self, values): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]}, ... 
index=['falcon', 'dog']) @@ -1965,7 +1940,6 @@ def keys(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], @@ -1986,7 +1960,6 @@ def iterrows(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -2012,7 +1985,6 @@ def itertuples(self, index: bool = True, name: str | None = "Pandas"): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -2045,7 +2017,6 @@ def items(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'species': ['bear', 'bear', 'marsupial'], ... 'population': [1864, 22000, 80000]}, @@ -2086,7 +2057,6 @@ def where(self, cond, other): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'a': [20, 10, 0], 'b': [0, 10, 20]}) >>> df @@ -2178,7 +2148,6 @@ def mask(self, cond, other): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'a': [20, 10, 0], 'b': [0, 10, 20]}) >>> df @@ -2281,7 +2250,6 @@ def sort_values( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'col1': ['A', 'A', 'B', bpd.NA, 'D', 'C'], @@ -2425,7 +2393,6 @@ def eq(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None You can use method name: @@ -2468,7 +2435,6 @@ def __eq__(self, other): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 
'a': [0, 3, 4], @@ -2499,7 +2465,6 @@ def __invert__(self) -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'a':[True, False, True], 'b':[-1, 0, 1]}) >>> ~df @@ -2528,7 +2493,6 @@ def ne(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None You can use method name: @@ -2570,7 +2534,6 @@ def __ne__(self, other): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'a': [0, 3, 4], @@ -2610,7 +2573,6 @@ def le(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None You can use method name: @@ -2653,7 +2615,6 @@ def __le__(self, other): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'a': [0, -1, 1], @@ -2693,7 +2654,6 @@ def lt(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None You can use method name: @@ -2736,7 +2696,6 @@ def __lt__(self, other): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'a': [0, -1, 1], @@ -2776,7 +2735,6 @@ def ge(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None You can use method name: @@ -2819,7 +2777,6 @@ def __ge__(self, other): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 
'a': [0, -1, 1], @@ -2859,7 +2816,6 @@ def gt(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'angles': [0, 3, 4], ... 'degrees': [360, 180, 360]}, @@ -2900,7 +2856,6 @@ def __gt__(self, other): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'a': [0, -1, 1], @@ -2937,7 +2892,6 @@ def add(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], @@ -2981,7 +2935,6 @@ def __add__(self, other) -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'height': [1.5, 2.6], @@ -3056,7 +3009,6 @@ def radd(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], @@ -3119,7 +3071,6 @@ def sub(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], @@ -3163,7 +3114,6 @@ def __sub__(self, other): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None You can subtract a scalar: @@ -3211,7 +3161,6 @@ def rsub(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], @@ -3272,7 +3221,6 @@ def mul(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 
'A': [1, 2, 3], @@ -3316,7 +3264,6 @@ def __mul__(self, other): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None You can multiply with a scalar: @@ -3364,7 +3311,6 @@ def rmul(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], @@ -3408,7 +3354,6 @@ def __rmul__(self, other): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None You can multiply with a scalar: @@ -3456,7 +3401,6 @@ def truediv(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], @@ -3500,7 +3444,6 @@ def __truediv__(self, other): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None You can multiply with a scalar: @@ -3548,7 +3491,6 @@ def rtruediv(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], @@ -3609,7 +3551,6 @@ def floordiv(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], @@ -3653,7 +3594,6 @@ def __floordiv__(self, other): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None You can divide by a scalar: @@ -3701,7 +3641,6 @@ def rfloordiv(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 
'A': [1, 2, 3], @@ -3762,7 +3701,6 @@ def mod(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], @@ -3806,7 +3744,6 @@ def __mod__(self, other): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None You can modulo with a scalar: @@ -3854,7 +3791,6 @@ def rmod(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], @@ -3916,7 +3852,6 @@ def pow(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], @@ -3961,7 +3896,6 @@ def __pow__(self, other): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None You can exponentiate with a scalar: @@ -4010,7 +3944,6 @@ def rpow(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], @@ -4106,7 +4039,6 @@ def combine( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df1 = bpd.DataFrame({'A': [0, 0], 'B': [4, 4]}) >>> df2 = bpd.DataFrame({'A': [1, 1], 'B': [3, 3]}) @@ -4156,7 +4088,6 @@ def combine_first(self, other) -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df1 = bpd.DataFrame({'A': [None, 0], 'B': [None, 4]}) >>> df2 = bpd.DataFrame({'A': [1, 1], 'B': [3, 3]}) @@ -4187,7 +4118,6 @@ def explode( >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'A': [[0, 1, 2], [], [], [3, 4]], ... 
'B': 1, @@ -4245,7 +4175,6 @@ def corr(self, method, min_periods, numeric_only) -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'A': [1, 2, 3], ... 'B': [400, 500, 600], @@ -4279,7 +4208,6 @@ def cov(self, *, numeric_only) -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'A': [1, 2, 3], ... 'B': [400, 500, 600], @@ -4318,7 +4246,6 @@ def corrwith( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> index = ["a", "b", "c", "d", "e"] >>> columns = ["one", "two", "three", "four"] @@ -4354,7 +4281,6 @@ def update( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'A': [1, 2, 3], ... 'B': [400, 500, 600]}) @@ -4419,7 +4345,6 @@ def groupby( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'Animal': ['Falcon', 'Falcon', ... 
'Parrot', 'Parrot'], @@ -4516,7 +4441,6 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Let's use ``reuse=False`` flag to make sure a new ``remote_function`` is created every time we run the following code, but you can skip it @@ -4613,7 +4537,6 @@ def join( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Join two DataFrames by specifying how to handle the operation: @@ -4765,7 +4688,6 @@ def merge( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Merge DataFrames df1 and df2 by specifying type of merge: @@ -4897,7 +4819,6 @@ def round(self, decimals): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)], ... columns=['dogs', 'cats']) >>> df @@ -4982,7 +4903,6 @@ def apply(self, func, *, axis=0, args=(), **kwargs): >>> import bigframes.pandas as bpd >>> import pandas as pd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df @@ -5132,7 +5052,6 @@ def any(self, *, axis=0, bool_only: bool = False): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"A": [True, True], "B": [False, False]}) >>> df @@ -5179,7 +5098,6 @@ def all(self, axis=0, *, bool_only: bool = False): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"A": [True, True], "B": [False, False]}) >>> df @@ -5222,7 +5140,6 @@ def prod(self, axis=0, *, numeric_only: bool = False): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"A": [1, 2, 3], "B": [4.5, 5.5, 6.5]}) >>> df @@ -5269,7 +5186,6 @@ def min(self, axis=0, *, 
numeric_only: bool = False): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df @@ -5314,7 +5230,6 @@ def max(self, axis=0, *, numeric_only: bool = False): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df @@ -5358,7 +5273,6 @@ def sum(self, axis=0, *, numeric_only: bool = False): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df @@ -5400,7 +5314,6 @@ def mean(self, axis=0, *, numeric_only: bool = False): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df @@ -5442,7 +5355,6 @@ def median(self, *, numeric_only: bool = False, exact: bool = True): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df @@ -5480,7 +5392,6 @@ def quantile( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), ... columns=['a', 'b']) >>> df.quantile(.1) @@ -5518,7 +5429,6 @@ def var(self, axis=0, *, numeric_only: bool = False): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df @@ -5563,7 +5473,6 @@ def skew(self, *, numeric_only: bool = False): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'A': [1, 2, 3, 4, 5], ... 
'B': [5, 4, 3, 2, 1], @@ -5604,7 +5513,6 @@ def kurt(self, *, numeric_only: bool = False): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"A": [1, 2, 3, 4, 5], ... "B": [3, 4, 3, 2, 1], @@ -5644,7 +5552,6 @@ def std(self, *, numeric_only: bool = False): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"A": [1, 2, 3, 4, 5], ... "B": [3, 4, 3, 2, 1], @@ -5686,7 +5593,6 @@ def count(self, *, numeric_only: bool = False): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"A": [1, None, 3, 4, 5], ... "B": [1, 2, 3, 4, 5], @@ -5739,7 +5645,6 @@ def nlargest(self, n: int, columns, keep: str = "first"): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"A": [1, 1, 3, 3, 5, 5], ... "B": [5, 6, 3, 4, 1, 2], @@ -5831,7 +5736,6 @@ def nsmallest(self, n: int, columns, keep: str = "first"): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"A": [1, 1, 3, 3, 5, 5], ... "B": [5, 6, 3, 4, 1, 2], @@ -5913,7 +5817,6 @@ def idxmin(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df @@ -5943,7 +5846,6 @@ def idxmax(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df @@ -5977,7 +5879,6 @@ def melt(self, id_vars, value_vars, var_name, value_name): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"A": [1, None, 3, 4, 5], ... 
"B": [1, 2, 3, 4, 5], @@ -6052,7 +5953,6 @@ def nunique(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 2]}) >>> df @@ -6081,7 +5981,6 @@ def cummin(self) -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df @@ -6113,7 +6012,6 @@ def cummax(self) -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df @@ -6145,7 +6043,6 @@ def cumsum(self) -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df @@ -6182,7 +6079,6 @@ def cumprod(self) -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df @@ -6223,7 +6119,6 @@ def diff( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df @@ -6271,7 +6166,6 @@ def agg(self, func): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df @@ -6335,7 +6229,6 @@ def describe(self, include: None | Literal["all"] = None): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [0, 2, 8], "C": ["cat", "cat", "dog"]}) >>> df @@ -6407,7 +6300,6 @@ def pivot(self, *, columns, index=None, values=None): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 
"foo": ["one", "one", "one", "two", "two"], @@ -6477,7 +6369,6 @@ def pivot_table(self, values=None, index=None, columns=None, aggfunc="mean"): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'Product': ['Product A', 'Product B', 'Product A', 'Product B', 'Product A', 'Product B'], @@ -6570,7 +6461,6 @@ def stack(self, level=-1): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'A': [1, 3], 'B': [2, 4]}, index=['foo', 'bar']) >>> df @@ -6609,7 +6499,6 @@ def unstack(self, level=-1): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'A': [1, 3], 'B': [2, 4]}, index=['foo', 'bar']) >>> df @@ -6650,7 +6539,6 @@ def index(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None You can access the index of a DataFrame via ``index`` property. @@ -6703,7 +6591,6 @@ def columns(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None You can access the column labels of a DataFrame via ``columns`` property. @@ -6751,7 +6638,6 @@ def value_counts( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'num_legs': [2, 4, 4, 6, 7], ... 'num_wings': [2, 0, 0, 0, bpd.NA]}, @@ -6832,7 +6718,6 @@ def eval(self, expr: str) -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) >>> df @@ -6908,7 +6793,6 @@ def query(self, expr: str) -> DataFrame | None: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'A': range(1, 6), ... 
'B': range(10, 0, -2), @@ -6983,7 +6867,6 @@ def interpolate(self, method: str = "linear"): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3, None, None, 6], @@ -7033,7 +6916,6 @@ def fillna(self, value): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame([[np.nan, 2, np.nan, 0], ... [3, 4, np.nan, 1], @@ -7110,7 +6992,6 @@ def replace( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'int_col': [1, 1, 2, 3], @@ -7207,7 +7088,6 @@ def iat(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], ... columns=['A', 'B', 'C']) @@ -7241,7 +7121,6 @@ def at(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], ... index=[4, 5, 6], columns=['A', 'B', 'C']) @@ -7290,7 +7169,6 @@ def dot(self, other): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> left = bpd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]]) >>> left @@ -7384,7 +7262,6 @@ def __matmul__(self, other): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> left = bpd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]]) >>> left @@ -7444,7 +7321,6 @@ def __len__(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 
'a': [0, 1, 2], @@ -7467,7 +7343,6 @@ def __array__(self, dtype=None, copy: Optional[bool] = None): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> import numpy as np >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [11, 22, 33]}) @@ -7502,7 +7377,6 @@ def __getitem__(self, key): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... "name" : ["alpha", "beta", "gamma"], @@ -7577,7 +7451,6 @@ def __setitem__(self, key, value): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... "name" : ["alpha", "beta", "gamma"], diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 273339efcf..c6f7429643 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -39,7 +39,6 @@ def size(self) -> int: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series({'a': 1, 'b': 2, 'c': 3}) >>> s.size @@ -66,7 +65,6 @@ def __iter__(self) -> Iterator: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], @@ -107,7 +105,6 @@ def astype(self, dtype): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Create a DataFrame: @@ -351,7 +348,6 @@ def get(self, key, default=None): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame( ... [ @@ -462,7 +458,6 @@ def head(self, n: int = 5): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion', ... 
'monkey', 'parrot', 'shark', 'whale', 'zebra']}) @@ -562,7 +557,6 @@ def sample( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'num_legs': [2, 4, 8, 0], ... 'num_wings': [2, 0, 0, 0], @@ -644,7 +638,6 @@ def dtypes(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'float': [1.0], 'int': [1], 'string': ['foo']}) >>> df.dtypes @@ -669,7 +662,6 @@ def copy(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Modification in the original Series will not affect the copy Series: @@ -743,7 +735,6 @@ def ffill(self, *, limit: Optional[int] = None): >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame([[np.nan, 2, np.nan, 0], ... [3, 4, np.nan, 1], @@ -826,7 +817,6 @@ def isna(self) -> NDFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> import numpy as np >>> df = bpd.DataFrame(dict( @@ -1068,7 +1058,6 @@ def rolling( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([0,1,2,3,4]) >>> s.rolling(window=3).min() @@ -1156,7 +1145,6 @@ def pipe( >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> data = [[8000, 1000], [9500, np.nan], [5000, 2000]] >>> df = bpd.DataFrame(data, columns=['Salary', 'Others']) diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index 1e39ec8f94..ba24a6b2d8 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -45,7 +45,6 @@ def describe(self, include: None | Literal["all"] = None): **Examples:** >>> import bigframes.pandas as bpd - 
>>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({"A": [1, 1, 1, 2, 2], "B": [0, 2, 8, 2, 7], "C": ["cat", "cat", "dog", "mouse", "cat"]}) >>> df @@ -86,7 +85,6 @@ def any(self): For SeriesGroupBy: >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([1, 2, 0], index=lst) @@ -125,7 +123,6 @@ def all(self): For SeriesGroupBy: >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([1, 2, 0], index=lst) @@ -165,7 +162,6 @@ def count(self): >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([1, 2, np.nan], index=lst) @@ -204,7 +200,6 @@ def mean( >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'A': [1, 1, 2, 1, 2], ... 'B': [np.nan, 2, 3, 4, 5], ... 'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C']) @@ -264,7 +259,6 @@ def median( >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 4, 3, 3], index=lst) @@ -304,7 +298,6 @@ def quantile(self, q=0.5, *, numeric_only: bool = False): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame([ ... ['a', 1], ['a', 2], ['a', 3], ... 
['b', 1], ['b', 3], ['b', 5] @@ -345,7 +338,6 @@ def std( >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 4, 3, 3], index=lst) @@ -392,7 +384,6 @@ def var( >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 4, 3, 3], index=lst) @@ -436,7 +427,6 @@ def rank( >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame( ... { @@ -512,7 +502,6 @@ def skew( >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> ser = bpd.Series([390., 350., 357., np.nan, 22., 20., 30.], ... index=['Falcon', 'Falcon', 'Falcon', 'Falcon', @@ -547,7 +536,6 @@ def kurt( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> lst = ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'] >>> ser = bpd.Series([0, 1, 1, 0, 0, 1, 2, 4, 5], index=lst) @@ -580,7 +568,6 @@ def kurtosis( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> lst = ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'] >>> ser = bpd.Series([0, 1, 1, 0, 0, 1, 2, 4, 5], index=lst) @@ -607,7 +594,6 @@ def first(self, numeric_only: bool = False, min_count: int = -1): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame(dict(A=[1, 1, 3], B=[None, 5, 6], C=[1, 2, 3])) >>> df.groupby("A").first() @@ -648,7 +634,6 @@ def last(self, numeric_only: bool = False, min_count: int = -1): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame(dict(A=[1, 1, 3], B=[5, None, 6], C=[1, 2, 3])) >>> df.groupby("A").last() @@ -686,7 +671,6 @@ def sum( For SeriesGroupBy: >>> import 
bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) @@ -732,7 +716,6 @@ def prod(self, numeric_only: bool = False, min_count: int = 0): >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) @@ -768,7 +751,6 @@ def min( >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) @@ -816,7 +798,6 @@ def max( For SeriesGroupBy: >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) @@ -860,7 +841,6 @@ def cumcount(self, ascending: bool = True): For SeriesGroupBy: >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> lst = ['a', 'a', 'b', 'b', 'c'] >>> ser = bpd.Series([5, 1, 2, 3, 4], index=lst) @@ -899,7 +879,6 @@ def cumprod(self, *args, **kwargs): >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([6, 2, 0], index=lst) @@ -938,7 +917,6 @@ def cumsum(self, *args, **kwargs): >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([6, 2, 0], index=lst) @@ -977,7 +955,6 @@ def cummin(self, *args, numeric_only: bool = False, **kwargs): >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([6, 2, 0], index=lst) @@ -1016,7 +993,6 @@ def cummax(self, *args, numeric_only: bool = False, **kwargs): >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> lst = ['a', 'a', 
'b'] >>> ser = bpd.Series([6, 2, 0], index=lst) @@ -1057,7 +1033,6 @@ def diff(self): >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 4, 3, 3], index=lst) @@ -1103,7 +1078,6 @@ def shift(self, periods: int = 1): >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) @@ -1146,7 +1120,6 @@ def rolling(self, *args, **kwargs): >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> lst = ['a', 'a', 'a', 'a', 'e'] >>> ser = bpd.Series([1, 0, -2, -1, 2], index=lst) @@ -1205,7 +1178,6 @@ def expanding(self, *args, **kwargs): >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> lst = ['a', 'a', 'c', 'c', 'e'] >>> ser = bpd.Series([1, 0, -2, -1, 2], index=lst) @@ -1231,7 +1203,6 @@ def head(self, n: int = 5): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame([[1, 2], [1, 4], [5, 6]], ... 
columns=['A', 'B']) @@ -1260,7 +1231,6 @@ def size(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None For SeriesGroupBy: @@ -1314,7 +1284,6 @@ def __iter__(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None For SeriesGroupBy: @@ -1379,7 +1348,6 @@ def agg(self, func): >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3, 4], index=[1, 1, 2, 2]) >>> s.groupby(level=0).agg(['min', 'max']) @@ -1412,7 +1380,6 @@ def aggregate(self, func): >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3, 4], index=[1, 1, 2, 2]) >>> s.groupby(level=0).aggregate(['min', 'max']) @@ -1445,7 +1412,6 @@ def nunique(self): >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 3], index=lst) @@ -1496,7 +1462,6 @@ def agg(self, func, **kwargs): >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> data = {"A": [1, 1, 2, 2], ... "B": [1, 2, 3, 4], @@ -1556,7 +1521,6 @@ def aggregate(self, func, **kwargs): >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> data = {"A": [1, 1, 2, 2], ... "B": [1, 2, 3, 4], @@ -1616,7 +1580,6 @@ def nunique(self): >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam', ... 'ham', 'ham'], @@ -1652,7 +1615,6 @@ def value_counts( >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 
'gender': ['male', 'male', 'female', 'male', 'female', 'male'], diff --git a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py index 0dd487d056..09cce17c21 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py @@ -14,7 +14,6 @@ def day(self): >>> import pandas as pd >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="D") ... ) @@ -44,7 +43,6 @@ def dayofweek(self): >>> import pandas as pd >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range('2016-12-31', '2017-01-08', freq='D').to_series() ... ) @@ -78,7 +76,6 @@ def day_of_week(self): >>> import pandas as pd >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range('2016-12-31', '2017-01-08', freq='D').to_series() ... ) @@ -108,7 +105,6 @@ def dayofyear(self): >>> import pandas as pd >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range('2016-12-28', '2017-01-03', freq='D').to_series() ... ) @@ -136,7 +132,6 @@ def day_of_year(self): >>> import pandas as pd >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range('2016-12-28', '2017-01-03', freq='D').to_series() ... 
) @@ -168,7 +163,6 @@ def date(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%d/%m/%Y %H:%M:%S%Ez") >>> s @@ -191,7 +185,6 @@ def hour(self): >>> import pandas as pd >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="h") ... ) @@ -217,7 +210,6 @@ def minute(self): >>> import pandas as pd >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="min") ... ) @@ -243,7 +235,6 @@ def month(self): >>> import pandas as pd >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="M") ... ) @@ -269,7 +260,6 @@ def isocalendar(self): >>> import pandas as pd >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range('2009-12-27', '2010-01-04', freq='d').to_series() ... ) @@ -302,7 +292,6 @@ def second(self): >>> import pandas as pd >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="s") ... 
) @@ -331,7 +320,6 @@ def time(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") >>> s @@ -353,7 +341,6 @@ def quarter(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "4/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") >>> s @@ -376,7 +363,6 @@ def year(self): >>> import pandas as pd >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="Y") ... ) @@ -402,7 +388,6 @@ def days(self): >>> import pandas as pd >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([pd.Timedelta("4d3m2s1us")]) >>> s 0 4 days 00:03:02.000001 @@ -420,7 +405,6 @@ def seconds(self): >>> import pandas as pd >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([pd.Timedelta("4d3m2s1us")]) >>> s 0 4 days 00:03:02.000001 @@ -438,7 +422,6 @@ def microseconds(self): >>> import pandas as pd >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([pd.Timedelta("4d3m2s1us")]) >>> s 0 4 days 00:03:02.000001 @@ -455,7 +438,6 @@ def total_seconds(self): >>> import pandas as pd >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([pd.Timedelta("1d1m1s1us")]) >>> s 0 1 days 00:01:01.000001 @@ -472,7 +454,6 @@ def tz(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") >>> s @@ -495,7 +476,6 @@ def unit(self) 
-> str: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") >>> s diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index eba47fc1f9..70d75c58c0 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -33,7 +33,6 @@ def name(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> idx = bpd.Index([1, 2, 3], name='x') >>> idx @@ -64,7 +63,6 @@ def values(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> idx = bpd.Index([1, 2, 3]) >>> idx @@ -87,7 +85,6 @@ def ndim(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) >>> s @@ -122,7 +119,6 @@ def size(self) -> int: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None For Series: @@ -157,7 +153,6 @@ def is_monotonic_increasing(self) -> bool: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> bool(bpd.Index([1, 2, 3]).is_monotonic_increasing) True @@ -182,7 +177,6 @@ def is_monotonic_decreasing(self) -> bool: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> bool(bpd.Index([3, 2, 1]).is_monotonic_decreasing) True @@ -207,7 +201,6 @@ def from_frame(cls, frame) -> Index: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame([['HI', 'Temp'], ['HI', 'Precip'], ... 
['NJ', 'Temp'], ['NJ', 'Precip']], @@ -247,7 +240,6 @@ def shape(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> idx = bpd.Index([1, 2, 3]) >>> idx @@ -269,7 +261,6 @@ def nlevels(self) -> int: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> mi = bpd.MultiIndex.from_arrays([['a'], ['b'], ['c']]) >>> mi @@ -291,7 +282,6 @@ def is_unique(self) -> bool: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> idx = bpd.Index([1, 5, 7, 7]) >>> idx.is_unique @@ -314,7 +304,6 @@ def has_duplicates(self) -> bool: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> idx = bpd.Index([1, 5, 7, 7]) >>> bool(idx.has_duplicates) @@ -337,7 +326,6 @@ def dtype(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> idx = bpd.Index([1, 2, 3]) >>> idx @@ -365,7 +353,6 @@ def T(self) -> Index: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) >>> s @@ -404,7 +391,6 @@ def copy( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> idx = bpd.Index(['a', 'b', 'c']) >>> new_idx = idx.copy() @@ -439,7 +425,6 @@ def astype(self, dtype): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> idx = bpd.Index([1, 2, 3]) >>> idx @@ -488,7 +473,6 @@ def get_level_values(self, level) -> Index: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> idx = bpd.Index(list('abc')) >>> idx @@ -518,7 +502,6 @@ def to_series(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> idx = bpd.Index(['Ant', 'Bear', 'Cow'], name='animal') @@ -572,7 +555,6 @@ def isin(self, values): **Examples:** 
>>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> idx = bpd.Index([1,2,3]) >>> idx @@ -612,7 +594,6 @@ def all(self) -> bool: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None True, because nonzero integers are considered True. @@ -640,7 +621,6 @@ def any(self) -> bool: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> index = bpd.Index([0, 1, 2]) >>> bool(index.any()) @@ -666,7 +646,6 @@ def min(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> idx = bpd.Index([3, 2, 1]) >>> int(idx.min()) @@ -688,7 +667,6 @@ def max(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> idx = bpd.Index([3, 2, 1]) >>> int(idx.max()) @@ -714,7 +692,6 @@ def argmin(self) -> int: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Consider dataset containing cereal calories @@ -751,7 +728,6 @@ def get_loc( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> unique_index = bpd.Index(list('abc')) >>> unique_index.get_loc('b') @@ -795,7 +771,6 @@ def argmax(self) -> int: Consider dataset containing cereal calories >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series({'Corn Flakes': 100.0, 'Almond Delight': 110.0, ... 
'Cinnamon Toast Crunch': 120.0, 'Cocoa Puff': 110.0}) @@ -829,7 +804,6 @@ def nunique(self) -> int: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 3, 5, 7, 7]) >>> s @@ -861,7 +835,6 @@ def sort_values( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> idx = bpd.Index([10, 100, 1, 1000]) >>> idx @@ -906,7 +879,6 @@ def value_counts( >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> index = bpd.Index([3, 1, 2, 3, 4, np.nan]) >>> index.value_counts() @@ -963,7 +935,6 @@ def fillna(self, value) -> Index: >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> idx = bpd.Index([np.nan, np.nan, 3]) >>> idx.fillna(0) @@ -993,7 +964,6 @@ def rename(self, name, *, inplace): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> idx = bpd.Index(['A', 'C', 'A', 'B'], name='score') >>> idx.rename('grade') @@ -1023,7 +993,6 @@ def drop(self, labels) -> Index: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> idx = bpd.Index(['a', 'b', 'c']) >>> idx.drop(['a']) @@ -1044,7 +1013,6 @@ def dropna(self, how: typing.Literal["all", "any"] = "any"): >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> idx = bpd.Index([1, np.nan, 3]) >>> idx.dropna() @@ -1071,7 +1039,6 @@ def drop_duplicates(self, *, keep: str = "first"): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Generate an pandas.Index with duplicate values. 
@@ -1114,7 +1081,6 @@ def unique(self, level: Hashable | int | None = None): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> idx = bpd.Index([1, 1, 2, 3, 3]) >>> idx.unique() Index([1, 2, 3], dtype='Int64') @@ -1135,7 +1101,6 @@ def item(self, *args, **kwargs): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1], index=['a']) >>> s.index.item() 'a' diff --git a/third_party/bigframes_vendored/pandas/core/indexes/datetimes.py b/third_party/bigframes_vendored/pandas/core/indexes/datetimes.py index 105a376728..12085d601e 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/datetimes.py @@ -17,7 +17,6 @@ def year(self) -> base.Index: >>> import bigframes.pandas as bpd >>> import pandas as pd - >>> bpd.options.display.progress_bar = None >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.year @@ -33,7 +32,6 @@ def month(self) -> base.Index: >>> import bigframes.pandas as bpd >>> import pandas as pd - >>> bpd.options.display.progress_bar = None >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.month @@ -49,7 +47,6 @@ def day(self) -> base.Index: >>> import bigframes.pandas as bpd >>> import pandas as pd - >>> bpd.options.display.progress_bar = None >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.day @@ -65,7 +62,6 @@ def day_of_week(self) -> base.Index: >>> import bigframes.pandas as bpd >>> import pandas as pd - >>> bpd.options.display.progress_bar = None >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.day_of_week @@ -81,7 +77,6 @@ def dayofweek(self) -> base.Index: >>> import bigframes.pandas as bpd >>> import pandas as pd - >>> bpd.options.display.progress_bar = None >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.dayofweek @@ -97,7 +92,6 @@ def weekday(self) -> base.Index: >>> import bigframes.pandas as bpd >>> import pandas as pd - >>> 
bpd.options.display.progress_bar = None >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.weekday diff --git a/third_party/bigframes_vendored/pandas/core/indexes/multi.py b/third_party/bigframes_vendored/pandas/core/indexes/multi.py index a882aa40e3..c2b63b442f 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/multi.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/multi.py @@ -26,7 +26,6 @@ def from_tuples( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> tuples = [(1, 'red'), (1, 'blue'), ... (2, 'red'), (2, 'blue')] >>> bpd.MultiIndex.from_tuples(tuples, names=('number', 'color')) @@ -63,7 +62,6 @@ def from_arrays( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] >>> bpd.MultiIndex.from_arrays(arrays, names=('number', 'color')) MultiIndex([(1, 'red'), diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index 697c17f23c..d116465b71 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -34,7 +34,6 @@ def cut( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([0, 1, 5, 10]) >>> s diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 932959a826..af3a9e1d34 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -39,7 +39,6 @@ def dt(self): >>> import bigframes.pandas as bpd >>> import pandas as pd - >>> bpd.options.display.progress_bar = None >>> seconds_series = bpd.Series(pd.date_range("2000-01-01", periods=3, freq="s")) >>> seconds_series @@ -111,7 +110,6 @@ def index(self): **Examples:** >>> import 
bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None You can access the index of a Series via ``index`` property. @@ -162,7 +160,6 @@ def shape(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 4, 9, 16]) >>> s.shape @@ -181,7 +178,6 @@ def dtype(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3]) >>> s.dtype @@ -201,7 +197,6 @@ def name(self) -> Hashable: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None For a Series: @@ -249,7 +244,6 @@ def hasnans(self) -> bool: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3, None]) >>> s @@ -273,7 +267,6 @@ def T(self) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) >>> s @@ -298,7 +291,6 @@ def transpose(self) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) >>> s @@ -339,7 +331,6 @@ def reset_index( >>> import bigframes.pandas as bpd >>> import pandas as pd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3, 4], name='foo', ... 
index=['a', 'b', 'c', 'd']) @@ -441,7 +432,6 @@ def keys(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3], index=[0, 1, 2]) >>> s.keys() @@ -523,7 +513,6 @@ def to_markdown( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["elk", "pig", "dog", "quetzal"], name="animal") >>> print(s.to_markdown()) @@ -579,7 +568,6 @@ def to_dict( >>> import bigframes.pandas as bpd >>> from collections import OrderedDict, defaultdict - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3, 4]) >>> s.to_dict() @@ -618,7 +606,6 @@ def to_frame(self, name=None) -> DataFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["a", "b", "c"], ... name="vals") @@ -715,7 +702,6 @@ def tolist(self, *, allow_large_results: Optional[bool] = None) -> list: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3]) >>> s @@ -750,7 +736,6 @@ def to_numpy( >>> import bigframes.pandas as bpd >>> import pandas as pd - >>> bpd.options.display.progress_bar = None >>> ser = bpd.Series(pd.Categorical(['a', 'b', 'a'])) >>> ser.to_numpy() @@ -804,7 +789,6 @@ def to_pickle(self, path, *, allow_large_results=None, **kwargs): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> original_df = bpd.DataFrame({"foo": range(5), "bar": range(5, 10)}) >>> original_df @@ -866,7 +850,6 @@ def agg(self, func): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3, 4]) >>> s @@ -903,7 +886,6 @@ def count(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([0.0, 1.0, bpd.NA]) >>> s @@ -929,7 +911,6 @@ def nunique(self) -> int: **Examples:** >>> 
import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 3, 5, 7, 7]) >>> s @@ -964,7 +945,6 @@ def unique(self, keep_order=True) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([2, 1, 3, 3], name='A') >>> s @@ -1007,7 +987,6 @@ def mode(self) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([2, 4, 8, 2, 4, None]) >>> s.mode() @@ -1032,7 +1011,6 @@ def drop_duplicates( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Generate a Series with duplicated entries. @@ -1101,7 +1079,6 @@ def duplicated(self, keep="first") -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None By default, for each set of duplicated values, the first occurrence is set on False and all others on True: @@ -1173,7 +1150,6 @@ def idxmin(self) -> Hashable: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(data=[1, None, 4, 1], ... index=['A', 'B', 'C', 'D']) @@ -1202,7 +1178,6 @@ def idxmax(self) -> Hashable: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(data=[1, None, 4, 3, 4], ... 
index=['A', 'B', 'C', 'D', 'E']) @@ -1229,7 +1204,6 @@ def round(self, decimals: int = 0) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([0.1, 1.3, 2.7]) >>> s.round() @@ -1263,7 +1237,6 @@ def explode(self, *, ignore_index: Optional[bool] = False) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([[1, 2, 3], [], [3, 4]]) >>> s @@ -1302,7 +1275,6 @@ def corr(self, other, method="pearson", min_periods=None) -> float: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s1 = bpd.Series([.2, .0, .6, .2]) >>> s2 = bpd.Series([.3, .6, .0, .1]) @@ -1340,7 +1312,6 @@ def autocorr(self, lag: int = 1) -> float: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([0.25, 0.5, 0.2, -0.05]) >>> s.autocorr() # doctest: +ELLIPSIS @@ -1378,7 +1349,6 @@ def cov( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s1 = bpd.Series([0.90010907, 0.13484424, 0.62036035]) >>> s2 = bpd.Series([0.12528585, 0.26962463, 0.51111198]) @@ -1407,7 +1377,6 @@ def diff(self) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Difference with previous row @@ -1473,7 +1442,6 @@ def dot(self, other) -> Series | np.ndarray: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([0, 1, 2, 3]) >>> other = bpd.Series([-1, 2, -3, 4]) @@ -1531,7 +1499,6 @@ def sort_values( >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([np.nan, 1, 3, 10, 5]) >>> s @@ -1630,7 +1597,6 @@ def sort_index( >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['a', 'b', 
'c', 'd'], index=[3, 2, 1, 4]) >>> s.sort_index() @@ -1690,7 +1656,6 @@ def nlargest( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> countries_population = {"Italy": 59000000, "France": 65000000, ... "Malta": 434000, "Maldives": 434000, @@ -1776,7 +1741,6 @@ def nsmallest(self, n: int = 5, keep: str = "first") -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> countries_population = {"Italy": 59000000, "France": 65000000, ... "Malta": 434000, "Maldives": 434000, @@ -1864,7 +1828,6 @@ def apply( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None For applying arbitrary python function a `remote_function` is recommended. Let's use ``reuse=False`` flag to make sure a new `remote_function` @@ -2007,7 +1970,6 @@ def combine( >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None Consider 2 Datasets ``s1`` and ``s2`` containing highest clocked speeds of different birds. @@ -2066,7 +2028,6 @@ def groupby( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None You can group by a named index level. @@ -2239,7 +2200,6 @@ def drop( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(data=np.arange(3), index=['A', 'B', 'C']) >>> s @@ -2371,7 +2331,6 @@ def interpolate(self, method: str = "linear"): >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None Filling in NaN in a Series via linear interpolation. 
@@ -2416,7 +2375,6 @@ def fillna( >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([np.nan, 2, np.nan, -1]) >>> s @@ -2470,7 +2428,6 @@ def replace( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3, 4, 5]) >>> s @@ -2598,7 +2555,6 @@ def dropna(self, *, axis=0, inplace: bool = False, how=None) -> Series: >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None Drop NA values from a Series: @@ -2662,7 +2618,6 @@ def between( >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None Boundary values are included by default: @@ -2721,7 +2676,6 @@ def case_when( >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> c = bpd.Series([6, 7, 8, 9], name="c") >>> a = bpd.Series([0, 0, 1, 2]) @@ -2790,7 +2744,6 @@ def cumprod(self): >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s @@ -2828,7 +2781,6 @@ def cumsum(self): >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s @@ -2871,7 +2823,6 @@ def cummax(self): >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s @@ -2910,7 +2861,6 @@ def cummin(self): >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s @@ -2947,7 +2897,6 @@ def eq(self, other) -> Series: >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -2992,7 
+2941,6 @@ def ne(self, other) -> Series: >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3039,7 +2987,6 @@ def le(self, other) -> Series: >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3085,7 +3032,6 @@ def lt(self, other) -> Series: >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3132,7 +3078,6 @@ def ge(self, other) -> Series: >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3179,7 +3124,6 @@ def gt(self, other) -> Series: >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3224,7 +3168,6 @@ def add(self, other) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> a = bpd.Series([1, 2, 3, bpd.NA]) >>> a @@ -3288,7 +3231,6 @@ def __add__(self, other): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1.5, 2.6], index=['elk', 'moose']) >>> s @@ -3341,7 +3283,6 @@ def radd(self, other) -> Series: >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3406,7 +3347,6 @@ def sub( >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3450,7 +3390,6 @@ def 
__sub__(self, other): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1.5, 2.6], index=['elk', 'moose']) >>> s @@ -3503,7 +3442,6 @@ def rsub(self, other) -> Series: >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3565,7 +3503,6 @@ def mul(self, other) -> Series: >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3610,7 +3547,6 @@ def __mul__(self, other): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None You can multiply with a scalar: @@ -3651,7 +3587,6 @@ def rmul(self, other) -> Series: >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3712,7 +3647,6 @@ def truediv(self, other) -> Series: >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3757,7 +3691,6 @@ def __truediv__(self, other): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None You can multiply with a scalar: @@ -3798,7 +3731,6 @@ def rtruediv(self, other) -> Series: >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3860,7 +3792,6 @@ def floordiv(self, other) -> Series: >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3905,7 +3836,6 @@ def __floordiv__(self, other): **Examples:** >>> 
import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None You can divide by a scalar: @@ -3946,7 +3876,6 @@ def rfloordiv(self, other) -> Series: >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -4008,7 +3937,6 @@ def mod(self, other) -> Series: >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -4053,7 +3981,6 @@ def __mod__(self, other): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None You can modulo with a scalar: @@ -4093,7 +4020,6 @@ def rmod(self, other) -> Series: >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -4156,7 +4082,6 @@ def pow(self, other) -> Series: >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -4202,7 +4127,6 @@ def __pow__(self, other): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None You can exponentiate with a scalar: @@ -4243,7 +4167,6 @@ def rpow(self, other) -> Series: >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -4306,7 +4229,6 @@ def divmod(self, other) -> Series: >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -4358,7 +4280,6 @@ def rdivmod(self, other) -> Series: >>> import bigframes.pandas as bpd >>> import numpy as np - >>> 
bpd.options.display.progress_bar = None >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -4413,7 +4334,6 @@ def combine_first(self, other) -> Series: >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> s1 = bpd.Series([1, np.nan]) >>> s2 = bpd.Series([3, 4, 5]) @@ -4456,7 +4376,6 @@ def update(self, other) -> None: >>> import bigframes.pandas as bpd >>> import pandas as pd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3]) >>> s.update(bpd.Series([4, 5, 6])) @@ -4549,7 +4468,6 @@ def any( >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None For Series input, the output is a scalar indicating whether any element is True. @@ -4584,7 +4502,6 @@ def max( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Calculating the max of a Series: @@ -4626,7 +4543,6 @@ def min( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Calculating the min of a Series: @@ -4667,7 +4583,6 @@ def std( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'person_id': [0, 1, 2, 3], ... 
'age': [21, 25, 62, 43], @@ -4715,7 +4630,6 @@ def sum(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Calculating the sum of a Series: @@ -4751,7 +4665,6 @@ def mean(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Calculating the mean of a Series: @@ -4787,7 +4700,6 @@ def median(self, *, exact: bool = True): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3]) >>> s.median() @@ -4828,7 +4740,6 @@ def quantile( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3, 4]) >>> s.quantile(.5) @@ -4881,7 +4792,6 @@ def describe(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['A', 'A', 'B']) >>> s @@ -4909,7 +4819,6 @@ def skew(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3]) >>> s.skew() @@ -4947,7 +4856,6 @@ def kurt(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 2, 3], index=['cat', 'dog', 'dog', 'mouse']) >>> s @@ -4991,7 +4899,6 @@ def item(self: Series, *args, **kwargs): >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1]) >>> s.item() np.int64(1) @@ -5014,7 +4921,6 @@ def items(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['A', 'B', 'C']) >>> for index, value in s.items(): @@ -5036,7 +4942,6 @@ def where(self, cond, other): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([10, 11, 12, 13, 14]) >>> s @@ -5104,7 +5009,6 @@ def mask(self, cond, other): **Examples:** >>> import 
bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([10, 11, 12, 13, 14]) >>> s @@ -5262,7 +5166,6 @@ def argmax(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Consider dataset containing cereal calories. @@ -5300,7 +5203,6 @@ def argmin(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Consider dataset containing cereal calories. @@ -5341,7 +5243,6 @@ def rename(self, index, *, inplace, **kwargs): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3]) >>> s @@ -5393,7 +5294,6 @@ def rename_axis(self, mapper, *, inplace, **kwargs): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Series @@ -5458,7 +5358,6 @@ def value_counts( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([3, 1, 2, 3, 4, bpd.NA], dtype="Int64") @@ -5536,7 +5435,6 @@ def str(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["A_Str_Series"]) >>> s @@ -5565,7 +5463,6 @@ def plot(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> ser = bpd.Series([1, 2, 3, 3]) >>> plot = ser.plot(kind='hist', title="My plot") @@ -5593,7 +5490,6 @@ def isin(self, values): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['llama', 'cow', 'llama', 'beetle', 'llama', ... 
'hippo'], name='animal') @@ -5659,7 +5555,6 @@ def is_monotonic_increasing(self) -> bool: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 2]) >>> s.is_monotonic_increasing @@ -5683,7 +5578,6 @@ def is_monotonic_decreasing(self) -> bool: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([3, 2, 2, 1]) >>> s.is_monotonic_decreasing @@ -5725,7 +5619,6 @@ def map( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['cat', 'dog', bpd.NA, 'rabbit']) >>> s @@ -5791,7 +5684,6 @@ def iloc(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4}, ... {'a': 100, 'b': 200, 'c': 300, 'd': 400}, @@ -5871,7 +5763,6 @@ def loc(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame([[1, 2], [4, 5], [7, 8]], ... index=['cobra', 'viper', 'sidewinder'], @@ -5958,7 +5849,6 @@ def iat(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], ... columns=['A', 'B', 'C']) @@ -5993,7 +5883,6 @@ def at(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], ... 
index=[4, 5, 6], columns=['A', 'B', 'C']) @@ -6029,7 +5918,6 @@ def values(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> bpd.Series([1, 2, 3]).values array([1, 2, 3]) @@ -6051,7 +5939,6 @@ def size(self) -> int: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None For Series: @@ -6088,7 +5975,6 @@ def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> import numpy as np >>> ser = bpd.Series([1, 2, 3]) @@ -6116,7 +6002,6 @@ def __len__(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3]) >>> len(s) @@ -6132,7 +6017,6 @@ def __invert__(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> ser = bpd.Series([True, False, True]) >>> ~ser @@ -6153,7 +6037,6 @@ def __and__(self, other): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([0, 1, 2, 3]) @@ -6192,7 +6075,6 @@ def __or__(self, other): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([0, 1, 2, 3]) @@ -6231,7 +6113,6 @@ def __xor__(self, other): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([0, 1, 2, 3]) @@ -6270,7 +6151,6 @@ def __getitem__(self, indexer): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([15, 30, 45]) >>> s[1] diff --git a/third_party/bigframes_vendored/pandas/core/strings/accessor.py b/third_party/bigframes_vendored/pandas/core/strings/accessor.py index fe94bf3049..d2296d4a82 100644 --- a/third_party/bigframes_vendored/pandas/core/strings/accessor.py +++ 
b/third_party/bigframes_vendored/pandas/core/strings/accessor.py @@ -20,7 +20,6 @@ def __getitem__(self, key: typing.Union[int, slice]): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['Alice', 'Bob', 'Charlie']) >>> s.str[0] @@ -54,7 +53,6 @@ def extract(self, pat: str, flags: int = 0): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None A pattern with two groups will return a DataFrame with two columns. Non-matches will be `NaN`. @@ -115,7 +113,6 @@ def find(self, sub, start: int = 0, end=None): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> ser = bpd.Series(["cow_", "duck_", "do_ve"]) >>> ser.str.find("_") @@ -146,7 +143,6 @@ def len(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Returns the length (number of characters) in a string. @@ -172,7 +168,6 @@ def lower(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['lower', ... 'CAPITALS', @@ -197,7 +192,6 @@ def slice(self, start=None, stop=None): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["koala", "dog", "chameleon"]) >>> s @@ -250,7 +244,6 @@ def strip(self, to_strip: typing.Optional[str] = None): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([ ... '1. Ant.', @@ -293,7 +286,6 @@ def upper(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['lower', ... 
'CAPITALS', @@ -322,7 +314,6 @@ def isnumeric(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s1 = bpd.Series(['one', 'one1', '1', '']) >>> s1.str.isnumeric() @@ -349,7 +340,6 @@ def isalpha(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s1 = bpd.Series(['one', 'one1', '1', '']) >>> s1.str.isalpha() @@ -375,7 +365,6 @@ def isdigit(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['23', '1a', '1/5', '']) >>> s.str.isdigit() @@ -401,7 +390,6 @@ def isalnum(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s1 = bpd.Series(['one', 'one1', '1', '']) >>> s1.str.isalnum() @@ -439,7 +427,6 @@ def isspace(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([' ', '\\t\\r\\n ', '']) >>> s.str.isspace() @@ -465,7 +452,6 @@ def islower(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) >>> s.str.islower() @@ -492,7 +478,6 @@ def isupper(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) >>> s.str.isupper() @@ -519,7 +504,6 @@ def isdecimal(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None The `isdecimal` method checks for characters used to form numbers in base 10. 
@@ -550,7 +534,6 @@ def rstrip(self, to_strip: typing.Optional[str] = None): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['Ant', ' Bee ', '\tCat\n', bpd.NA]) >>> s.str.rstrip() @@ -583,7 +566,6 @@ def lstrip(self, to_strip: typing.Optional[str] = None): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['Ant', ' Bee ', '\tCat\n', bpd.NA]) >>> s.str.lstrip() @@ -611,7 +593,6 @@ def repeat(self, repeats: int): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['a', 'b', 'c']) >>> s @@ -645,7 +626,6 @@ def capitalize(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['lower', ... 'CAPITALS', @@ -673,7 +653,6 @@ def cat(self, others, *, join): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None You can concatenate each string in a Series to another string. @@ -730,7 +709,6 @@ def contains(self, pat, case: bool = True, flags: int = 0, *, regex: bool = True **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Returning a Series of booleans using only a literal pattern. @@ -834,7 +812,6 @@ def replace( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None When *pat* is a string and *regex* is True, the given *pat* is compiled as a regex. 
When *repl* is a string, it replaces matching regex patterns @@ -896,7 +873,6 @@ def startswith( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['bat', 'Bear', 'caT', bpd.NA]) >>> s @@ -941,7 +917,6 @@ def endswith( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['bat', 'bear', 'caT', bpd.NA]) >>> s @@ -988,7 +963,6 @@ def split( >>> import bigframes.pandas as bpd >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ @@ -1031,7 +1005,6 @@ def match(self, pat: str, case: bool = True, flags: int = 0): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> ser = bpd.Series(["horse", "eagle", "donkey"]) >>> ser.str.match("e") @@ -1060,7 +1033,6 @@ def fullmatch(self, pat: str, case: bool = True, flags: int = 0): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> ser = bpd.Series(["cat", "duck", "dove"]) >>> ser.str.fullmatch(r'd.+') @@ -1092,7 +1064,6 @@ def get(self, i: int): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["apple", "banana", "fig"]) >>> s.str.get(3) @@ -1122,7 +1093,6 @@ def pad( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["caribou", "tiger"]) >>> s @@ -1170,7 +1140,6 @@ def ljust( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> ser = bpd.Series(['dog', 'bird', 'mouse']) >>> ser.str.ljust(8, fillchar='.') @@ -1202,7 +1171,6 @@ def rjust( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> ser = bpd.Series(['dog', 'bird', 'mouse']) >>> ser.str.rjust(8, fillchar='.') @@ -1238,7 +1206,6 @@ def zfill( **Examples:** >>> import bigframes.pandas as bpd - >>> 
bpd.options.display.progress_bar = None >>> s = bpd.Series(['-1', '1', '1000', bpd.NA]) >>> s @@ -1278,7 +1245,6 @@ def center( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> ser = bpd.Series(['dog', 'bird', 'mouse']) >>> ser.str.center(8, fillchar='.') @@ -1310,7 +1276,6 @@ def join(self, sep: str): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> import pandas as pd Example with a list that contains non-string elements. diff --git a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py index 9c17b9632e..655f801b3d 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py @@ -38,7 +38,6 @@ def to_datetime( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Converting a Scalar to datetime: diff --git a/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py b/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py index 9442e965fa..220b15f56e 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py +++ b/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py @@ -55,7 +55,6 @@ def to_timedelta( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Converting a Scalar to timedelta diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 0fdca4dde1..3190c92b92 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -61,7 +61,6 @@ def read_gbq( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None If the input is a table ID: diff --git a/third_party/bigframes_vendored/pandas/io/parquet.py 
b/third_party/bigframes_vendored/pandas/io/parquet.py index aec911d2fe..c02c5e52c5 100644 --- a/third_party/bigframes_vendored/pandas/io/parquet.py +++ b/third_party/bigframes_vendored/pandas/io/parquet.py @@ -27,7 +27,6 @@ def read_parquet( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> gcs_path = "gs://cloud-samples-data/bigquery/us-states/us-states.parquet" >>> df = bpd.read_parquet(path=gcs_path, engine="bigquery") diff --git a/third_party/bigframes_vendored/pandas/io/parsers/readers.py b/third_party/bigframes_vendored/pandas/io/parsers/readers.py index 4757f5ed9d..5a505c2859 100644 --- a/third_party/bigframes_vendored/pandas/io/parsers/readers.py +++ b/third_party/bigframes_vendored/pandas/io/parsers/readers.py @@ -71,7 +71,6 @@ def read_csv( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> gcs_path = "gs://cloud-samples-data/bigquery/us-states/us-states.csv" >>> df = bpd.read_csv(filepath_or_buffer=gcs_path) @@ -192,7 +191,6 @@ def read_json( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> gcs_path = "gs://bigframes-dev-testing/sample1.json" >>> df = bpd.read_json(path_or_buf=gcs_path, lines=True, orient="records") diff --git a/third_party/bigframes_vendored/pandas/io/pickle.py b/third_party/bigframes_vendored/pandas/io/pickle.py index 33088dc019..03f1afe35e 100644 --- a/third_party/bigframes_vendored/pandas/io/pickle.py +++ b/third_party/bigframes_vendored/pandas/io/pickle.py @@ -35,7 +35,6 @@ def read_pickle( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> gcs_path = "gs://bigframes-dev-testing/test_pickle.pkl" >>> df = bpd.read_pickle(filepath_or_buffer=gcs_path) diff --git a/third_party/bigframes_vendored/pandas/plotting/_core.py b/third_party/bigframes_vendored/pandas/plotting/_core.py index b0c28ddfe9..5dac642af2 100644 --- 
a/third_party/bigframes_vendored/pandas/plotting/_core.py +++ b/third_party/bigframes_vendored/pandas/plotting/_core.py @@ -11,7 +11,6 @@ class PlotAccessor: For Series: >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> ser = bpd.Series([1, 2, 3, 3]) >>> plot = ser.plot(kind='hist', title="My plot") @@ -57,9 +56,7 @@ def hist( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame(np.random.randint(1, 7, 6000), columns=['one']) >>> df['two'] = np.random.randint(1, 7, 6000) + np.random.randint(1, 7, 6000) >>> ax = df.plot.hist(bins=12, alpha=0.5) @@ -96,7 +93,6 @@ def line( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame( ... { ... 'one': [1, 2, 3, 4], @@ -164,7 +160,6 @@ def area( Draw an area plot based on basic business metrics: >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame( ... { ... 'sales': [3, 2, 3, 9, 10, 6], @@ -233,7 +228,6 @@ def bar( Basic plot. >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'lab':['A', 'B', 'C'], 'val':[10, 30, 20]}) >>> ax = df.plot.bar(x='lab', y='val', rot=0) @@ -293,7 +287,6 @@ def barh( Basic plot. >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'lab':['A', 'B', 'C'], 'val':[10, 30, 20]}) >>> ax = df.plot.barh(x='lab', y='val', rot=0) @@ -356,7 +349,6 @@ def pie( pie function to get a pie plot. >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'mass': [0.330, 4.87 , 5.97], ... 'radius': [2439.7, 6051.8, 6378.1]}, @@ -399,7 +391,6 @@ def scatter( in a DataFrame's columns. 
>>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame([[5.1, 3.5, 0], [4.9, 3.0, 0], [7.0, 3.2, 1], ... [6.4, 3.2, 1], [5.9, 3.0, 2]], ... columns=['length', 'width', 'species']) diff --git a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py index a7344d49d4..44eefeddd7 100644 --- a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py +++ b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py @@ -30,7 +30,6 @@ class KMeans(_BaseKMeans): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> from bigframes.ml.cluster import KMeans >>> X = bpd.DataFrame({"feat0": [1, 1, 1, 10, 10, 10], "feat1": [2, 4, 0, 2, 4, 0]}) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index c3c3a77b71..e487a2e7c1 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -24,7 +24,6 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): >>> import bigframes.pandas as bpd >>> from bigframes.ml.decomposition import MatrixFactorization - >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({ ... "row": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6], ... 
"column": [0,1] * 7, diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py index f13c52bfb6..3535edc8f9 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py @@ -24,7 +24,6 @@ class PCA(BaseEstimator, metaclass=ABCMeta): >>> import bigframes.pandas as bpd >>> from bigframes.ml.decomposition import PCA - >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({"feat0": [-1, -2, -3, 1, 2, 3], "feat1": [-1, -1, -2, 1, 1, 2]}) >>> pca = PCA(n_components=2).fit(X) >>> pca.predict(X) # doctest:+SKIP diff --git a/third_party/bigframes_vendored/sklearn/impute/_base.py b/third_party/bigframes_vendored/sklearn/impute/_base.py index 42eab24c82..175ad86b21 100644 --- a/third_party/bigframes_vendored/sklearn/impute/_base.py +++ b/third_party/bigframes_vendored/sklearn/impute/_base.py @@ -22,7 +22,6 @@ class SimpleImputer(_BaseImputer): >>> import bigframes.pandas as bpd >>> from bigframes.ml.impute import SimpleImputer - >>> bpd.options.display.progress_bar = None >>> X_train = bpd.DataFrame({"feat0": [7.0, 4.0, 10.0], "feat1": [2.0, None, 5.0], "feat2": [3.0, 6.0, 9.0]}) >>> imp_mean = SimpleImputer().fit(X_train) >>> X_test = bpd.DataFrame({"feat0": [None, 4.0, 10.0], "feat1": [2.0, None, None], "feat2": [3.0, 6.0, 9.0]}) diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_base.py b/third_party/bigframes_vendored/sklearn/linear_model/_base.py index 21ba5a3bf8..7543edd10b 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_base.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_base.py @@ -66,7 +66,6 @@ class LinearRegression(RegressorMixin, LinearModel): >>> from bigframes.ml.linear_model import LinearRegression >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({ \ "feature0": [20, 21, 19, 18], \ "feature1": 
[0, 1, 1, 0], \ diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py index a85c6fae8d..d449a1040c 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py @@ -25,7 +25,6 @@ class LogisticRegression(LinearClassifierMixin, BaseEstimator): >>> from bigframes.ml.linear_model import LogisticRegression >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({ \ "feature0": [20, 21, 19, 18], \ "feature1": [0, 1, 1, 0], \ diff --git a/third_party/bigframes_vendored/sklearn/metrics/_classification.py b/third_party/bigframes_vendored/sklearn/metrics/_classification.py index fd6e8678ea..e60cc8cec4 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_classification.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_classification.py @@ -30,7 +30,6 @@ def accuracy_score(y_true, y_pred, normalize=True) -> float: >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([0, 2, 1, 3]) >>> y_pred = bpd.DataFrame([0, 1, 2, 3]) @@ -80,7 +79,6 @@ def confusion_matrix( >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([2, 0, 2, 2, 0, 1]) >>> y_pred = bpd.DataFrame([0, 0, 2, 2, 0, 2]) @@ -132,7 +130,6 @@ def recall_score( >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([0, 1, 2, 0, 1, 2]) >>> y_pred = bpd.DataFrame([0, 2, 1, 0, 0, 1]) @@ -181,7 +178,6 @@ def precision_score( >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([0, 1, 2, 0, 1, 2]) >>> y_pred = bpd.DataFrame([0, 2, 1, 0, 0, 1]) @@ -232,7 
+228,6 @@ def f1_score( >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([0, 1, 2, 0, 1, 2]) >>> y_pred = bpd.DataFrame([0, 2, 1, 0, 0, 1]) diff --git a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py index 9262ffbd3d..cd5bd2cbcd 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py @@ -33,7 +33,6 @@ def auc(x, y) -> float: >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> x = bpd.DataFrame([1, 1, 2, 2]) >>> y = bpd.DataFrame([2, 3, 4, 5]) @@ -89,7 +88,6 @@ def roc_auc_score(y_true, y_score) -> float: >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([0, 0, 1, 1, 0, 1, 0, 1, 1, 1]) >>> y_score = bpd.DataFrame([0.1, 0.4, 0.35, 0.8, 0.65, 0.9, 0.5, 0.3, 0.6, 0.45]) @@ -139,7 +137,6 @@ def roc_curve( >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([1, 1, 2, 2]) >>> y_score = bpd.DataFrame([0.1, 0.4, 0.35, 0.8]) diff --git a/third_party/bigframes_vendored/sklearn/metrics/_regression.py b/third_party/bigframes_vendored/sklearn/metrics/_regression.py index 1c14e8068b..85f0c1ecf9 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_regression.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_regression.py @@ -46,7 +46,6 @@ def r2_score(y_true, y_pred, force_finite=True) -> float: >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([3, -0.5, 2, 7]) >>> y_pred = bpd.DataFrame([2.5, 0.0, 2, 8]) @@ -73,7 +72,6 @@ def mean_squared_error(y_true, y_pred) -> float: >>> import bigframes.pandas 
as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([3, -0.5, 2, 7]) >>> y_pred = bpd.DataFrame([2.5, 0.0, 2, 8]) @@ -100,7 +98,6 @@ def mean_absolute_error(y_true, y_pred) -> float: >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([3, -0.5, 2, 7]) >>> y_pred = bpd.DataFrame([2.5, 0.0, 2, 8]) diff --git a/third_party/bigframes_vendored/sklearn/model_selection/_split.py b/third_party/bigframes_vendored/sklearn/model_selection/_split.py index ec16fa8cf9..326589be7d 100644 --- a/third_party/bigframes_vendored/sklearn/model_selection/_split.py +++ b/third_party/bigframes_vendored/sklearn/model_selection/_split.py @@ -69,7 +69,6 @@ class KFold(_BaseKFold): >>> import bigframes.pandas as bpd >>> from bigframes.ml.model_selection import KFold - >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({"feat0": [1, 3, 5], "feat1": [2, 4, 6]}) >>> y = bpd.DataFrame({"label": [1, 2, 3]}) >>> kf = KFold(n_splits=3, random_state=42) @@ -162,7 +161,6 @@ def train_test_split( >>> import bigframes.pandas as bpd >>> from bigframes.ml.model_selection import train_test_split - >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({"feat0": [0, 2, 4, 6, 8], "feat1": [1, 3, 5, 7, 9]}) >>> y = bpd.DataFrame({"label": [0, 1, 2, 3, 4]}) >>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) diff --git a/third_party/bigframes_vendored/sklearn/model_selection/_validation.py b/third_party/bigframes_vendored/sklearn/model_selection/_validation.py index b93c47ea04..6f84018853 100644 --- a/third_party/bigframes_vendored/sklearn/model_selection/_validation.py +++ b/third_party/bigframes_vendored/sklearn/model_selection/_validation.py @@ -19,7 +19,6 @@ def cross_validate(estimator, X, y=None, *, cv=None): >>> import bigframes.pandas as bpd >>> from bigframes.ml.model_selection import 
cross_validate, KFold >>> from bigframes.ml.linear_model import LinearRegression - >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({"feat0": [1, 3, 5], "feat1": [2, 4, 6]}) >>> y = bpd.DataFrame({"label": [1, 2, 3]}) >>> model = LinearRegression() diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py index 5476a9fb3c..64a5786f17 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py @@ -25,7 +25,6 @@ class OneHotEncoder(BaseEstimator): >>> from bigframes.ml.preprocessing import OneHotEncoder >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> enc = OneHotEncoder() >>> X = bpd.DataFrame({"a": ["Male", "Female", "Female"], "b": ["1", "3", "2"]}) From 9906cc84bdbde8fe6442d3e4d652246284b58794 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 14 Oct 2025 19:48:07 +0000 Subject: [PATCH 42/63] remove standardd community imports boilerplate --- bigframes/bigquery/_operations/array.py | 2 - bigframes/bigquery/_operations/datetime.py | 3 -- bigframes/bigquery/_operations/json.py | 1 - bigframes/bigquery/_operations/sql.py | 2 - bigframes/dataframe.py | 1 - bigframes/series.py | 1 - bigframes/session/__init__.py | 2 - .../bigframes_vendored/ibis/expr/api.py | 2 - .../ibis/expr/datatypes/core.py | 1 - .../ibis/expr/types/arrays.py | 1 - .../ibis/expr/types/maps.py | 6 --- .../pandas/core/arrays/arrow/accessors.py | 6 --- .../pandas/core/arrays/datetimelike.py | 2 - .../bigframes_vendored/pandas/core/frame.py | 7 --- .../bigframes_vendored/pandas/core/generic.py | 3 -- .../pandas/core/groupby/__init__.py | 24 ---------- .../pandas/core/indexes/accessor.py | 15 ------- .../pandas/core/indexes/base.py | 3 -- .../pandas/core/indexes/datetimes.py | 6 --- .../pandas/core/reshape/tile.py | 1 - 
.../bigframes_vendored/pandas/core/series.py | 44 ------------------- .../pandas/core/strings/accessor.py | 2 - .../pandas/plotting/_core.py | 1 - 23 files changed, 136 deletions(-) diff --git a/bigframes/bigquery/_operations/array.py b/bigframes/bigquery/_operations/array.py index b21453ec45..6f9dd20b54 100644 --- a/bigframes/bigquery/_operations/array.py +++ b/bigframes/bigquery/_operations/array.py @@ -77,7 +77,6 @@ def array_agg( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> import numpy as np For a SeriesGroupBy object: @@ -126,7 +125,6 @@ def array_to_string(series: series.Series, delimiter: str) -> series.Series: >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> import numpy as np >>> s = bpd.Series([["H", "i", "!"], ["Hello", "World"], np.nan, [], ["Hi"]]) >>> bbq.array_to_string(s, delimiter=", ") diff --git a/bigframes/bigquery/_operations/datetime.py b/bigframes/bigquery/_operations/datetime.py index 78272a514b..99467beb06 100644 --- a/bigframes/bigquery/_operations/datetime.py +++ b/bigframes/bigquery/_operations/datetime.py @@ -21,7 +21,6 @@ def unix_seconds(input: series.Series) -> series.Series: **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq @@ -47,7 +46,6 @@ def unix_millis(input: series.Series) -> series.Series: **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq @@ -73,7 +71,6 @@ def unix_micros(input: series.Series) -> series.Series: **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py index fb8ebcdfcb..4e1f43aab0 100644 --- a/bigframes/bigquery/_operations/json.py +++ b/bigframes/bigquery/_operations/json.py @@ -49,7 +49,6 @@ def json_set( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - 
>>> import numpy as np >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"] >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")]) diff --git a/bigframes/bigquery/_operations/sql.py b/bigframes/bigquery/_operations/sql.py index 21b490d708..295412fd75 100644 --- a/bigframes/bigquery/_operations/sql.py +++ b/bigframes/bigquery/_operations/sql.py @@ -36,8 +36,6 @@ def sql_scalar( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> import pandas as pd - >>> import pyarrow as pa >>> s = bpd.Series(["1.5", "2.5", "3.5"]) >>> s = s.astype(pd.ArrowDtype(pa.decimal128(38, 9))) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 69d9ce22bf..807a98f2bb 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -4250,7 +4250,6 @@ def _resample( **Examples:** >>> import bigframes.pandas as bpd - >>> import pandas as pd >>> data = { ... "timestamp_col": pd.date_range( diff --git a/bigframes/series.py b/bigframes/series.py index 4adb6a1730..be5d099751 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -2419,7 +2419,6 @@ def _resample( **Examples:** >>> import bigframes.pandas as bpd - >>> import pandas as pd >>> data = { ... "timestamp_col": pd.date_range( diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 960629ea87..4663301730 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -948,7 +948,6 @@ def read_pandas( **Examples:** >>> import bigframes.pandas as bpd - >>> import pandas as pd >>> d = {'col1': [1, 2], 'col2': [3, 4]} >>> pandas_df = pd.DataFrame(data=d) @@ -2078,7 +2077,6 @@ def read_gbq_function( note, row processor implies that the function has only one input parameter. - >>> import pandas as pd >>> @bpd.remote_function(cloud_function_service_account="default") ... def row_sum(s: pd.Series) -> float: ... 
return s['a'] + s['b'] + s['c'] diff --git a/third_party/bigframes_vendored/ibis/expr/api.py b/third_party/bigframes_vendored/ibis/expr/api.py index 4ef10e449b..fa09e23b75 100644 --- a/third_party/bigframes_vendored/ibis/expr/api.py +++ b/third_party/bigframes_vendored/ibis/expr/api.py @@ -1532,7 +1532,6 @@ def read_parquet( Examples -------- >>> import ibis - >>> import pandas as pd >>> ibis.options.interactive = True >>> df = pd.DataFrame({"a": [1, 2, 3], "b": list("ghi")}) >>> df @@ -1582,7 +1581,6 @@ def read_delta( Examples -------- >>> import ibis - >>> import pandas as pd >>> ibis.options.interactive = True >>> df = pd.DataFrame({"a": [1, 2, 3], "b": list("ghi")}) >>> df diff --git a/third_party/bigframes_vendored/ibis/expr/datatypes/core.py b/third_party/bigframes_vendored/ibis/expr/datatypes/core.py index eb597cfc6a..4bacebd6d7 100644 --- a/third_party/bigframes_vendored/ibis/expr/datatypes/core.py +++ b/third_party/bigframes_vendored/ibis/expr/datatypes/core.py @@ -62,7 +62,6 @@ def dtype(value: Any, nullable: bool = True) -> DataType: Or other type systems, like numpy/pandas/pyarrow types: - >>> import pyarrow as pa >>> ibis.dtype(pa.int32()) Int32(nullable=True) diff --git a/third_party/bigframes_vendored/ibis/expr/types/arrays.py b/third_party/bigframes_vendored/ibis/expr/types/arrays.py index 72f01334c1..47ae997738 100644 --- a/third_party/bigframes_vendored/ibis/expr/types/arrays.py +++ b/third_party/bigframes_vendored/ibis/expr/types/arrays.py @@ -1008,7 +1008,6 @@ def flatten(self) -> ir.ArrayValue: ... "nulls_only": [None, None, None], ... "mixed_nulls": [[], None, [None]], ... } - >>> import pyarrow as pa >>> t = ibis.memtable( ... pa.Table.from_pydict( ... 
data, diff --git a/third_party/bigframes_vendored/ibis/expr/types/maps.py b/third_party/bigframes_vendored/ibis/expr/types/maps.py index 881f8327d0..65237decc7 100644 --- a/third_party/bigframes_vendored/ibis/expr/types/maps.py +++ b/third_party/bigframes_vendored/ibis/expr/types/maps.py @@ -35,7 +35,6 @@ class MapValue(Value): -------- >>> import ibis >>> ibis.options.interactive = True - >>> import pyarrow as pa >>> tab = pa.table( ... { ... "m": pa.array( @@ -101,7 +100,6 @@ def get(self, key: ir.Value, default: ir.Value | None = None) -> ir.Value: Examples -------- >>> import ibis - >>> import pyarrow as pa >>> ibis.options.interactive = True >>> tab = pa.table( ... { @@ -167,7 +165,6 @@ def length(self) -> ir.IntegerValue: Examples -------- >>> import ibis - >>> import pyarrow as pa >>> ibis.options.interactive = True >>> tab = pa.table( ... { @@ -224,7 +221,6 @@ def __getitem__(self, key: ir.Value) -> ir.Value: Examples -------- >>> import ibis - >>> import pyarrow as pa >>> ibis.options.interactive = True >>> tab = pa.table( ... { @@ -276,7 +272,6 @@ def contains( Examples -------- >>> import ibis - >>> import pyarrow as pa >>> ibis.options.interactive = True >>> tab = pa.table( ... { @@ -321,7 +316,6 @@ def keys(self) -> ir.ArrayValue: Examples -------- >>> import ibis - >>> import pyarrow as pa >>> ibis.options.interactive = True >>> tab = pa.table( ... { diff --git a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py index 8515ed5769..dd098d41c0 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py @@ -19,7 +19,6 @@ def len(self): **Examples:** >>> import bigframes.pandas as bpd - >>> import pyarrow as pa >>> s = bpd.Series( ... [ ... 
[1, 2, 3], @@ -44,7 +43,6 @@ def __getitem__(self, key: int | slice): **Examples:** >>> import bigframes.pandas as bpd - >>> import pyarrow as pa >>> s = bpd.Series( ... [ ... [1, 2, 3], @@ -81,7 +79,6 @@ def field(self, name_or_index: str | int): **Examples:** >>> import bigframes.pandas as bpd - >>> import pyarrow as pa >>> s = bpd.Series( ... [ ... {"version": 1, "project": "pandas"}, @@ -126,7 +123,6 @@ def explode(self): **Examples:** >>> import bigframes.pandas as bpd - >>> import pyarrow as pa >>> s = bpd.Series( ... [ ... {"version": 1, "project": "pandas"}, @@ -161,7 +157,6 @@ def dtypes(self): **Examples:** >>> import bigframes.pandas as bpd - >>> import pyarrow as pa >>> s = bpd.Series( ... [ ... {"version": 1, "project": "pandas"}, @@ -195,7 +190,6 @@ def explode(self, column, *, separator: str = "."): **Examples:** >>> import bigframes.pandas as bpd - >>> import pyarrow as pa >>> countries = bpd.Series(["cn", "es", "us"]) >>> files = bpd.Series( ... [ diff --git a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py index 95af4d5d2c..219df2ea6b 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py @@ -50,7 +50,6 @@ def normalize(self): **Examples:** - >>> import pandas as pd >>> s = bpd.Series(pd.date_range( ... start='2014-08-01 10:00', ... 
freq='h', @@ -83,7 +82,6 @@ def floor(self, freq: str): **Examples:** - >>> import pandas as pd >>> rng = pd.date_range('1/1/2018 11:59:00', periods=3, freq='min') >>> bpd.Series(rng).dt.floor("h") 0 2018-01-01 11:00:00 diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 1876a1d480..b91a9c7451 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -1264,7 +1264,6 @@ def drop( Drop columns and/or rows of MultiIndex DataFrame: - >>> import pandas as pd >>> midx = pd.MultiIndex(levels=[['llama', 'cow', 'falcon'], ... ['speed', 'weight', 'length']], ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], @@ -1596,7 +1595,6 @@ def reset_index( >>> import bigframes.pandas as bpd - >>> import numpy as np >>> df = bpd.DataFrame([('bird', 389.0), ... ('bird', 24.0), ... ('mammal', 80.5), @@ -1636,7 +1634,6 @@ class max_speed You can also use ``reset_index`` with ``MultiIndex``. - >>> import pandas as pd >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'), ... ('bird', 'parrot'), ... ('mammal', 'lion'), @@ -4117,7 +4114,6 @@ def explode( **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> df = bpd.DataFrame({'A': [[0, 1, 2], [], [], [3, 4]], ... 'B': 1, @@ -4902,7 +4898,6 @@ def apply(self, func, *, axis=0, args=(), **kwargs): **Examples:** >>> import bigframes.pandas as bpd - >>> import pandas as pd >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df @@ -7343,7 +7338,6 @@ def __array__(self, dtype=None, copy: Optional[bool] = None): **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [11, 22, 33]}) @@ -7421,7 +7415,6 @@ def __getitem__(self, key): You can specify a pandas Index with desired column labels. 
- >>> import pandas as pd >>> df[pd.Index(["age", "location"])] age location 0 20 WA diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index c6f7429643..1e13a977ce 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -734,7 +734,6 @@ def ffill(self, *, limit: Optional[int] = None): **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> df = bpd.DataFrame([[np.nan, 2, np.nan, 0], ... [3, 4, np.nan, 1], @@ -817,7 +816,6 @@ def isna(self) -> NDFrame: **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> df = bpd.DataFrame(dict( ... age=[5, 6, np.nan], @@ -1144,7 +1142,6 @@ def pipe( Constructing a income DataFrame from a dictionary. >>> import bigframes.pandas as bpd - >>> import numpy as np >>> data = [[8000, 1000], [9500, np.nan], [5000, 2000]] >>> df = bpd.DataFrame(data, columns=['Salary', 'Others']) diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index ba24a6b2d8..db81e21efb 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -161,7 +161,6 @@ def count(self): For SeriesGroupBy: >>> import bigframes.pandas as bpd - >>> import numpy as np >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([1, 2, np.nan], index=lst) @@ -199,7 +198,6 @@ def mean( **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> df = bpd.DataFrame({'A': [1, 1, 2, 1, 2], ... 'B': [np.nan, 2, 3, 4, 5], ... 
'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C']) @@ -258,7 +256,6 @@ def median( For SeriesGroupBy: >>> import bigframes.pandas as bpd - >>> import numpy as np >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 4, 3, 3], index=lst) @@ -337,7 +334,6 @@ def std( For SeriesGroupBy: >>> import bigframes.pandas as bpd - >>> import numpy as np >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 4, 3, 3], index=lst) @@ -383,7 +379,6 @@ def var( For SeriesGroupBy: >>> import bigframes.pandas as bpd - >>> import numpy as np >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 4, 3, 3], index=lst) @@ -426,7 +421,6 @@ def rank( **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> df = bpd.DataFrame( ... { @@ -501,7 +495,6 @@ def skew( For SeriesGroupBy: >>> import bigframes.pandas as bpd - >>> import numpy as np >>> ser = bpd.Series([390., 350., 357., np.nan, 22., 20., 30.], ... index=['Falcon', 'Falcon', 'Falcon', 'Falcon', @@ -715,7 +708,6 @@ def prod(self, numeric_only: bool = False, min_count: int = 0): For SeriesGroupBy: >>> import bigframes.pandas as bpd - >>> import numpy as np >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) @@ -750,7 +742,6 @@ def min( For SeriesGroupBy: >>> import bigframes.pandas as bpd - >>> import numpy as np >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) @@ -878,7 +869,6 @@ def cumprod(self, *args, **kwargs): For SeriesGroupBy: >>> import bigframes.pandas as bpd - >>> import numpy as np >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([6, 2, 0], index=lst) @@ -916,7 +906,6 @@ def cumsum(self, *args, **kwargs): For SeriesGroupBy: >>> import bigframes.pandas as bpd - >>> import numpy as np >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([6, 2, 0], index=lst) @@ -954,7 +943,6 @@ def cummin(self, *args, numeric_only: bool = False, **kwargs): For SeriesGroupBy: >>> import bigframes.pandas as bpd - >>> import numpy 
as np >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([6, 2, 0], index=lst) @@ -992,7 +980,6 @@ def cummax(self, *args, numeric_only: bool = False, **kwargs): For SeriesGroupBy: >>> import bigframes.pandas as bpd - >>> import numpy as np >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([6, 2, 0], index=lst) @@ -1032,7 +1019,6 @@ def diff(self): For SeriesGroupBy: >>> import bigframes.pandas as bpd - >>> import numpy as np >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 4, 3, 3], index=lst) @@ -1077,7 +1063,6 @@ def shift(self, periods: int = 1): For SeriesGroupBy: >>> import bigframes.pandas as bpd - >>> import numpy as np >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) @@ -1119,7 +1104,6 @@ def rolling(self, *args, **kwargs): **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> lst = ['a', 'a', 'a', 'a', 'e'] >>> ser = bpd.Series([1, 0, -2, -1, 2], index=lst) @@ -1177,7 +1161,6 @@ def expanding(self, *args, **kwargs): **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> lst = ['a', 'a', 'c', 'c', 'e'] >>> ser = bpd.Series([1, 0, -2, -1, 2], index=lst) @@ -1347,7 +1330,6 @@ def agg(self, func): **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> s = bpd.Series([1, 2, 3, 4], index=[1, 1, 2, 2]) >>> s.groupby(level=0).agg(['min', 'max']) @@ -1379,7 +1361,6 @@ def aggregate(self, func): **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> s = bpd.Series([1, 2, 3, 4], index=[1, 1, 2, 2]) >>> s.groupby(level=0).aggregate(['min', 'max']) @@ -1411,7 +1392,6 @@ def nunique(self): **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 3], index=lst) @@ -1461,7 +1441,6 @@ def agg(self, func, **kwargs): **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> data = {"A": [1, 1, 2, 2], ... 
"B": [1, 2, 3, 4], @@ -1520,7 +1499,6 @@ def aggregate(self, func, **kwargs): **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> data = {"A": [1, 1, 2, 2], ... "B": [1, 2, 3, 4], @@ -1579,7 +1557,6 @@ def nunique(self): **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> df = bpd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam', ... 'ham', 'ham'], @@ -1614,7 +1591,6 @@ def value_counts( **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> df = bpd.DataFrame({ ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], diff --git a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py index 09cce17c21..aa8f490013 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py @@ -12,7 +12,6 @@ def day(self): **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="D") @@ -41,7 +40,6 @@ def dayofweek(self): **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd >>> s = bpd.Series( ... pd.date_range('2016-12-31', '2017-01-08', freq='D').to_series() @@ -74,7 +72,6 @@ def day_of_week(self): **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd >>> s = bpd.Series( ... pd.date_range('2016-12-31', '2017-01-08', freq='D').to_series() @@ -103,7 +100,6 @@ def dayofyear(self): **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd >>> s = bpd.Series( ... pd.date_range('2016-12-28', '2017-01-03', freq='D').to_series() @@ -130,7 +126,6 @@ def day_of_year(self): **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd >>> s = bpd.Series( ... 
pd.date_range('2016-12-28', '2017-01-03', freq='D').to_series() @@ -183,7 +178,6 @@ def hour(self): **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="h") @@ -208,7 +202,6 @@ def minute(self): **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="min") @@ -233,7 +226,6 @@ def month(self): **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="M") @@ -258,7 +250,6 @@ def isocalendar(self): **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd >>> s = bpd.Series( ... pd.date_range('2009-12-27', '2010-01-04', freq='d').to_series() @@ -290,7 +281,6 @@ def second(self): **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="s") @@ -361,7 +351,6 @@ def year(self): **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd >>> s = bpd.Series( ... 
pd.date_range("2000-01-01", periods=3, freq="Y") @@ -386,7 +375,6 @@ def days(self): **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd >>> s = bpd.Series([pd.Timedelta("4d3m2s1us")]) >>> s @@ -403,7 +391,6 @@ def seconds(self): **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd >>> s = bpd.Series([pd.Timedelta("4d3m2s1us")]) >>> s @@ -420,7 +407,6 @@ def microseconds(self): **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd >>> s = bpd.Series([pd.Timedelta("4d3m2s1us")]) >>> s @@ -436,7 +422,6 @@ def total_seconds(self): **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd >>> s = bpd.Series([pd.Timedelta("1d1m1s1us")]) >>> s diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index 70d75c58c0..8025d04d33 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -878,7 +878,6 @@ def value_counts( **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> index = bpd.Index([3, 1, 2, 3, 4, np.nan]) >>> index.value_counts() @@ -934,7 +933,6 @@ def fillna(self, value) -> Index: **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> idx = bpd.Index([np.nan, np.nan, 3]) >>> idx.fillna(0) @@ -1012,7 +1010,6 @@ def dropna(self, how: typing.Literal["all", "any"] = "any"): **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> idx = bpd.Index([1, np.nan, 3]) >>> idx.dropna() diff --git a/third_party/bigframes_vendored/pandas/core/indexes/datetimes.py b/third_party/bigframes_vendored/pandas/core/indexes/datetimes.py index 12085d601e..8c90951793 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/datetimes.py @@ -16,7 +16,6 @@ def year(self) -> base.Index: **Examples:** >>> 
import bigframes.pandas as bpd - >>> import pandas as pd >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.year @@ -31,7 +30,6 @@ def month(self) -> base.Index: **Examples:** >>> import bigframes.pandas as bpd - >>> import pandas as pd >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.month @@ -46,7 +44,6 @@ def day(self) -> base.Index: **Examples:** >>> import bigframes.pandas as bpd - >>> import pandas as pd >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.day @@ -61,7 +58,6 @@ def day_of_week(self) -> base.Index: **Examples:** >>> import bigframes.pandas as bpd - >>> import pandas as pd >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.day_of_week @@ -76,7 +72,6 @@ def dayofweek(self) -> base.Index: **Examples:** >>> import bigframes.pandas as bpd - >>> import pandas as pd >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.dayofweek @@ -91,7 +86,6 @@ def weekday(self) -> base.Index: **Examples:** >>> import bigframes.pandas as bpd - >>> import pandas as pd >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.weekday diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index d116465b71..acfbc1d699 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -72,7 +72,6 @@ def cut( Cut with pd.IntervalIndex, requires importing pandas for IntervalIndex: - >>> import pandas as pd >>> interval_index = pd.IntervalIndex.from_tuples([(0, 1), (1, 5), (5, 20)]) >>> bpd.cut(s, bins=interval_index) 0 diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index af3a9e1d34..3f6126aa55 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -38,7 +38,6 @@ def dt(self): **Examples:** >>> import bigframes.pandas as bpd - >>> import pandas as pd >>> 
seconds_series = bpd.Series(pd.date_range("2000-01-01", periods=3, freq="s")) >>> seconds_series @@ -330,7 +329,6 @@ def reset_index( **Examples:** >>> import bigframes.pandas as bpd - >>> import pandas as pd >>> s = bpd.Series([1, 2, 3, 4], name='foo', ... index=['a', 'b', 'c', 'd']) @@ -735,7 +733,6 @@ def to_numpy( **Examples:** >>> import bigframes.pandas as bpd - >>> import pandas as pd >>> ser = bpd.Series(pd.Categorical(['a', 'b', 'a'])) >>> ser.to_numpy() @@ -1498,7 +1495,6 @@ def sort_values( **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> s = bpd.Series([np.nan, 1, 3, 10, 5]) >>> s @@ -1596,7 +1592,6 @@ def sort_index( **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> s = bpd.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, 4]) >>> s.sort_index() @@ -1969,7 +1964,6 @@ def combine( **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np Consider 2 Datasets ``s1`` and ``s2`` containing highest clocked speeds of different birds. @@ -2050,7 +2044,6 @@ def groupby( You can also group by more than one index levels. - >>> import pandas as pd >>> s = bpd.Series([380, 370., 24., 26.], ... index=pd.MultiIndex.from_tuples( ... [("Falcon", "Clear"), @@ -2216,7 +2209,6 @@ def drop( Drop 2nd level label in MultiIndex Series: - >>> import pandas as pd >>> midx = pd.MultiIndex(levels=[['llama', 'cow', 'falcon'], ... ['speed', 'weight', 'length']], ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], @@ -2330,7 +2322,6 @@ def interpolate(self, method: str = "linear"): **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np Filling in NaN in a Series via linear interpolation. 
@@ -2374,7 +2365,6 @@ def fillna( **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> s = bpd.Series([np.nan, 2, np.nan, -1]) >>> s @@ -2554,7 +2544,6 @@ def dropna(self, *, axis=0, inplace: bool = False, how=None) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np Drop NA values from a Series: @@ -2617,7 +2606,6 @@ def between( **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np Boundary values are included by default: @@ -2675,7 +2663,6 @@ def case_when( **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> c = bpd.Series([6, 7, 8, 9], name="c") >>> a = bpd.Series([0, 0, 1, 2]) @@ -2743,7 +2730,6 @@ def cumprod(self): **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s @@ -2780,7 +2766,6 @@ def cumsum(self): **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s @@ -2822,7 +2807,6 @@ def cummax(self): **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s @@ -2860,7 +2844,6 @@ def cummin(self): **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s @@ -2896,7 +2879,6 @@ def eq(self, other) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -2940,7 +2922,6 @@ def ne(self, other) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -2986,7 +2967,6 @@ def le(self, other) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3031,7 +3011,6 @@ def lt(self, other) -> Series: 
**Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3077,7 +3056,6 @@ def ge(self, other) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3123,7 +3101,6 @@ def gt(self, other) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3282,7 +3259,6 @@ def radd(self, other) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3346,7 +3322,6 @@ def sub( **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3441,7 +3416,6 @@ def rsub(self, other) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3502,7 +3476,6 @@ def mul(self, other) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3586,7 +3559,6 @@ def rmul(self, other) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3646,7 +3618,6 @@ def truediv(self, other) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3730,7 +3701,6 @@ def rtruediv(self, other) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3791,7 +3761,6 @@ def floordiv(self, other) -> Series: 
**Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3875,7 +3844,6 @@ def rfloordiv(self, other) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3936,7 +3904,6 @@ def mod(self, other) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -4019,7 +3986,6 @@ def rmod(self, other) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -4081,7 +4047,6 @@ def pow(self, other) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -4166,7 +4131,6 @@ def rpow(self, other) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -4228,7 +4192,6 @@ def divmod(self, other) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -4279,7 +4242,6 @@ def rdivmod(self, other) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -4333,7 +4295,6 @@ def combine_first(self, other) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> s1 = bpd.Series([1, np.nan]) >>> s2 = bpd.Series([3, 4, 5]) @@ -4374,8 +4335,6 @@ def update(self, other) -> None: **Examples:** >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> import numpy as np >>> s = bpd.Series([1, 2, 3]) >>> s.update(bpd.Series([4, 5, 6])) @@ -4467,7 
+4426,6 @@ def any( **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np For Series input, the output is a scalar indicating whether any element is True. @@ -4898,7 +4856,6 @@ def item(self: Series, *args, **kwargs): **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> s = bpd.Series([1]) >>> s.item() np.int64(1) @@ -5975,7 +5932,6 @@ def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> ser = bpd.Series([1, 2, 3]) diff --git a/third_party/bigframes_vendored/pandas/core/strings/accessor.py b/third_party/bigframes_vendored/pandas/core/strings/accessor.py index d2296d4a82..c9045d5e73 100644 --- a/third_party/bigframes_vendored/pandas/core/strings/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/strings/accessor.py @@ -962,7 +962,6 @@ def split( **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> s = bpd.Series( ... [ @@ -1276,7 +1275,6 @@ def join(self, sep: str): **Examples:** >>> import bigframes.pandas as bpd - >>> import pandas as pd Example with a list that contains non-string elements. 
diff --git a/third_party/bigframes_vendored/pandas/plotting/_core.py b/third_party/bigframes_vendored/pandas/plotting/_core.py index 5dac642af2..6c2aed970d 100644 --- a/third_party/bigframes_vendored/pandas/plotting/_core.py +++ b/third_party/bigframes_vendored/pandas/plotting/_core.py @@ -56,7 +56,6 @@ def hist( **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np >>> df = bpd.DataFrame(np.random.randint(1, 7, 6000), columns=['one']) >>> df['two'] = np.random.randint(1, 7, 6000) + np.random.randint(1, 7, 6000) >>> ax = df.plot.hist(bins=12, alpha=0.5) From 00b89d63071a9c0f1be3417cc0663ccc11972d38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 14 Oct 2025 19:49:14 +0000 Subject: [PATCH 43/63] restore bpd to datetimelike --- .../bigframes_vendored/pandas/core/arrays/datetimelike.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py index 219df2ea6b..e7afebd3cd 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py @@ -50,6 +50,8 @@ def normalize(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> s = bpd.Series(pd.date_range( ... start='2014-08-01 10:00', ... 
freq='h', From 44a4cab7be0bc1db88ad29cde864bf88fd4b0843 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 14 Oct 2025 20:02:40 +0000 Subject: [PATCH 44/63] remove bpd boilerplate --- bigframes/dataframe.py | 3 - bigframes/operations/strings.py | 1 - bigframes/series.py | 3 - bigframes/session/__init__.py | 7 - .../pandas/core/computation/eval.py | 1 - .../pandas/core/config_init.py | 1 - .../bigframes_vendored/pandas/core/frame.py | 127 ------------------ .../bigframes_vendored/pandas/core/generic.py | 12 -- .../pandas/core/groupby/__init__.py | 38 ------ .../pandas/core/indexes/accessor.py | 20 --- .../pandas/core/indexes/base.py | 35 ----- .../pandas/core/indexes/datetimes.py | 6 - .../pandas/core/indexes/multi.py | 2 - .../pandas/core/reshape/tile.py | 1 - .../bigframes_vendored/pandas/core/series.py | 120 ----------------- .../pandas/core/tools/timedeltas.py | 1 - 16 files changed, 378 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 807a98f2bb..65e2d50582 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1770,7 +1770,6 @@ def to_pandas( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'col': [4, 2, 2]}) Download the data from BigQuery and convert it into an in-memory pandas DataFrame. @@ -1891,7 +1890,6 @@ def to_pandas_batches( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'col': [4, 3, 2, 2, 3]}) Iterate through the results in batches, limiting the total rows yielded @@ -4249,7 +4247,6 @@ def _resample( **Examples:** - >>> import bigframes.pandas as bpd >>> data = { ... 
"timestamp_col": pd.date_range( diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index c69993849a..5761ae8ea7 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -67,7 +67,6 @@ def reverse(self) -> series.Series: **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series(["apple", "banana", "", bpd.NA]) >>> s.str.reverse() diff --git a/bigframes/series.py b/bigframes/series.py index be5d099751..6ebd129c7c 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -532,7 +532,6 @@ def to_pandas( **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([4, 3, 2]) Download the data from BigQuery and convert it into an in-memory pandas Series. @@ -659,7 +658,6 @@ def to_pandas_batches( **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([4, 3, 2, 2, 3]) Iterate through the results in batches, limiting the total rows yielded @@ -2418,7 +2416,6 @@ def _resample( **Examples:** - >>> import bigframes.pandas as bpd >>> data = { ... "timestamp_col": pd.date_range( diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 4663301730..f82f3c5003 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -617,7 +617,6 @@ def read_gbq_query( **Examples:** - >>> import bigframes.pandas as bpd Simple query input: @@ -772,7 +771,6 @@ def read_gbq_table( **Examples:** - >>> import bigframes.pandas as bpd Read a whole table, with arbitrary ordering or ordering corresponding to the primary key(s). @@ -850,7 +848,6 @@ def read_gbq_table_streaming( **Examples:** >>> import bigframes.streaming as bst - >>> import bigframes.pandas as bpd >>> sdf = bst.read_gbq_table("bigquery-public-data.ml_datasets.penguins") @@ -878,7 +875,6 @@ def read_gbq_model(self, model_name: str): **Examples:** - >>> import bigframes.pandas as bpd Read an existing BigQuery ML model. 
@@ -947,7 +943,6 @@ def read_pandas( **Examples:** - >>> import bigframes.pandas as bpd >>> d = {'col1': [1, 2], 'col2': [3, 4]} >>> pandas_df = pd.DataFrame(data=d) @@ -1823,7 +1818,6 @@ def udf( **Examples:** - >>> import bigframes.pandas as bpd >>> import datetime Turning an arbitrary python function into a BigQuery managed python udf: @@ -1986,7 +1980,6 @@ def read_gbq_function( **Examples:** - >>> import bigframes.pandas as bpd Use the [cw_lower_case_ascii_only](https://github.com/GoogleCloudPlatform/bigquery-utils/blob/master/udfs/community/README.md#cw_lower_case_ascii_onlystr-string) function from Community UDFs. diff --git a/third_party/bigframes_vendored/pandas/core/computation/eval.py b/third_party/bigframes_vendored/pandas/core/computation/eval.py index 3cca7ec5cb..a1809f6cb3 100644 --- a/third_party/bigframes_vendored/pandas/core/computation/eval.py +++ b/third_party/bigframes_vendored/pandas/core/computation/eval.py @@ -172,7 +172,6 @@ def eval( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"animal": ["dog", "pig"], "age": [10, 20]}) >>> df diff --git a/third_party/bigframes_vendored/pandas/core/config_init.py b/third_party/bigframes_vendored/pandas/core/config_init.py index dc2b11ab94..20da78e094 100644 --- a/third_party/bigframes_vendored/pandas/core/config_init.py +++ b/third_party/bigframes_vendored/pandas/core/config_init.py @@ -19,7 +19,6 @@ Define Repr mode to "deferred" will prevent job execution in repr. 
- >>> import bigframes.pandas as bpd >>> df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins") >>> bpd.options.display.repr_mode = "deferred" diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index b91a9c7451..cdc639dd2a 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -39,7 +39,6 @@ def shape(self) -> tuple[int, int]: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'col1': [1, 2, 3], ... 'col2': [4, 5, 6]}) @@ -62,7 +61,6 @@ def axes(self) -> list: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.axes[1:] @@ -76,7 +74,6 @@ def values(self) -> np.ndarray: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.values @@ -107,7 +104,6 @@ def T(self) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df col1 col2 @@ -142,7 +138,6 @@ def transpose(self) -> DataFrame: **Square DataFrame with homogeneous dtype** - >>> import bigframes.pandas as bpd >>> d1 = {'col1': [1, 2], 'col2': [3, 4]} >>> df1 = bpd.DataFrame(data=d1) @@ -251,7 +246,6 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': ["hello", "world"], 'col3': [True, False]}) >>> df.select_dtypes(include=['Int64']) @@ -374,7 +368,6 @@ def to_numpy( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.to_numpy() @@ -412,7 +405,6 @@ def to_gbq( **Examples:** - >>> import bigframes.pandas as bpd Write a DataFrame to a BigQuery table. 
@@ -521,7 +513,6 @@ def to_parquet( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> gcs_bucket = "gs://bigframes-dev-testing/sample_parquet*.parquet" @@ -577,7 +568,6 @@ def to_dict( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.to_dict() @@ -656,7 +646,6 @@ def to_excel( **Examples:** - >>> import bigframes.pandas as bpd >>> import tempfile >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) @@ -692,7 +681,6 @@ def to_latex( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> print(df.to_latex()) @@ -742,7 +730,6 @@ def to_records( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.to_records() @@ -801,7 +788,6 @@ def to_string( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> print(df.to_string()) @@ -900,7 +886,6 @@ def to_html( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> print(df.to_html()) @@ -1009,7 +994,6 @@ def to_markdown( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> print(df.to_markdown()) @@ -1042,7 +1026,6 @@ def to_pickle(self, path, *, allow_large_results, **kwargs) -> None: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> gcs_bucket = "gs://bigframes-dev-testing/sample_pickle_gcs.pkl" @@ -1063,7 +1046,6 @@ def to_orc(self, path=None, *, allow_large_results=None, **kwargs) -> bytes | No **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> import tempfile @@ -1172,7 +1154,6 @@ def insert(self, loc, column, value, allow_duplicates=False): **Examples:** - >>> import bigframes.pandas 
as bpd >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) @@ -1224,7 +1205,6 @@ def drop( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame(np.arange(12).reshape(3, 4), ... columns=['A', 'B', 'C', 'D']) @@ -1381,7 +1361,6 @@ def rename( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) >>> df @@ -1452,7 +1431,6 @@ def set_index( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'month': [1, 4, 7, 10], ... 'year': [2012, 2014, 2013, 2014], @@ -1593,7 +1571,6 @@ def reset_index( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame([('bird', 389.0), ... ('bird', 24.0), @@ -1769,7 +1746,6 @@ def dropna( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], ... "toy": [np.nan, 'Batmobile', 'Bullwhip'], @@ -1881,7 +1857,6 @@ def isin(self, values): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]}, ... index=['falcon', 'dog']) @@ -1936,7 +1911,6 @@ def keys(self): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], @@ -1956,7 +1930,6 @@ def iterrows(self): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -1981,7 +1954,6 @@ def itertuples(self, index: bool = True, name: str | None = "Pandas"): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -2013,7 +1985,6 @@ def items(self): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'species': ['bear', 'bear', 'marsupial'], ... 
'population': [1864, 22000, 80000]}, @@ -2053,7 +2024,6 @@ def where(self, cond, other): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'a': [20, 10, 0], 'b': [0, 10, 20]}) >>> df @@ -2144,7 +2114,6 @@ def mask(self, cond, other): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'a': [20, 10, 0], 'b': [0, 10, 20]}) >>> df @@ -2246,7 +2215,6 @@ def sort_values( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'col1': ['A', 'A', 'B', bpd.NA, 'D', 'C'], @@ -2389,7 +2357,6 @@ def eq(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd You can use method name: @@ -2431,7 +2398,6 @@ def __eq__(self, other): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'a': [0, 3, 4], @@ -2461,7 +2427,6 @@ def __invert__(self) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'a':[True, False, True], 'b':[-1, 0, 1]}) >>> ~df @@ -2489,7 +2454,6 @@ def ne(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd You can use method name: @@ -2530,7 +2494,6 @@ def __ne__(self, other): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'a': [0, 3, 4], @@ -2569,7 +2532,6 @@ def le(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd You can use method name: @@ -2611,7 +2573,6 @@ def __le__(self, other): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'a': [0, -1, 1], @@ -2650,7 +2611,6 @@ def lt(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd You can use method name: @@ -2692,7 +2652,6 @@ def __lt__(self, other): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 
'a': [0, -1, 1], @@ -2731,7 +2690,6 @@ def ge(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd You can use method name: @@ -2773,7 +2731,6 @@ def __ge__(self, other): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'a': [0, -1, 1], @@ -2812,7 +2769,6 @@ def gt(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'angles': [0, 3, 4], ... 'degrees': [360, 180, 360]}, @@ -2852,7 +2808,6 @@ def __gt__(self, other): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'a': [0, -1, 1], @@ -2888,7 +2843,6 @@ def add(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], @@ -2931,7 +2885,6 @@ def __add__(self, other) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'height': [1.5, 2.6], @@ -3005,7 +2958,6 @@ def radd(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], @@ -3067,7 +3019,6 @@ def sub(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], @@ -3110,7 +3061,6 @@ def __sub__(self, other): **Examples:** - >>> import bigframes.pandas as bpd You can subtract a scalar: @@ -3157,7 +3107,6 @@ def rsub(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], @@ -3217,7 +3166,6 @@ def mul(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 
'A': [1, 2, 3], @@ -3260,7 +3208,6 @@ def __mul__(self, other): **Examples:** - >>> import bigframes.pandas as bpd You can multiply with a scalar: @@ -3307,7 +3254,6 @@ def rmul(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], @@ -3350,7 +3296,6 @@ def __rmul__(self, other): **Examples:** - >>> import bigframes.pandas as bpd You can multiply with a scalar: @@ -3397,7 +3342,6 @@ def truediv(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], @@ -3440,7 +3384,6 @@ def __truediv__(self, other): **Examples:** - >>> import bigframes.pandas as bpd You can multiply with a scalar: @@ -3487,7 +3430,6 @@ def rtruediv(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], @@ -3547,7 +3489,6 @@ def floordiv(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], @@ -3590,7 +3531,6 @@ def __floordiv__(self, other): **Examples:** - >>> import bigframes.pandas as bpd You can divide by a scalar: @@ -3637,7 +3577,6 @@ def rfloordiv(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], @@ -3697,7 +3636,6 @@ def mod(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], @@ -3740,7 +3678,6 @@ def __mod__(self, other): **Examples:** - >>> import bigframes.pandas as bpd You can modulo with a scalar: @@ -3787,7 +3724,6 @@ def rmod(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 
'A': [1, 2, 3], @@ -3848,7 +3784,6 @@ def pow(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], @@ -3892,7 +3827,6 @@ def __pow__(self, other): **Examples:** - >>> import bigframes.pandas as bpd You can exponentiate with a scalar: @@ -3940,7 +3874,6 @@ def rpow(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], @@ -4035,7 +3968,6 @@ def combine( **Examples:** - >>> import bigframes.pandas as bpd >>> df1 = bpd.DataFrame({'A': [0, 0], 'B': [4, 4]}) >>> df2 = bpd.DataFrame({'A': [1, 1], 'B': [3, 3]}) @@ -4084,7 +4016,6 @@ def combine_first(self, other) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> df1 = bpd.DataFrame({'A': [None, 0], 'B': [None, 4]}) >>> df2 = bpd.DataFrame({'A': [1, 1], 'B': [3, 3]}) @@ -4113,7 +4044,6 @@ def explode( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'A': [[0, 1, 2], [], [], [3, 4]], ... 'B': 1, @@ -4170,7 +4100,6 @@ def corr(self, method, min_periods, numeric_only) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'A': [1, 2, 3], ... 'B': [400, 500, 600], @@ -4203,7 +4132,6 @@ def cov(self, *, numeric_only) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'A': [1, 2, 3], ... 'B': [400, 500, 600], @@ -4241,7 +4169,6 @@ def corrwith( **Examples:** - >>> import bigframes.pandas as bpd >>> index = ["a", "b", "c", "d", "e"] >>> columns = ["one", "two", "three", "four"] @@ -4276,7 +4203,6 @@ def update( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'A': [1, 2, 3], ... 'B': [400, 500, 600]}) @@ -4340,7 +4266,6 @@ def groupby( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'Animal': ['Falcon', 'Falcon', ... 
'Parrot', 'Parrot'], @@ -4436,7 +4361,6 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd Let's use ``reuse=False`` flag to make sure a new ``remote_function`` is created every time we run the following code, but you can skip it @@ -4532,7 +4456,6 @@ def join( **Examples:** - >>> import bigframes.pandas as bpd Join two DataFrames by specifying how to handle the operation: @@ -4683,7 +4606,6 @@ def merge( **Examples:** - >>> import bigframes.pandas as bpd Merge DataFrames df1 and df2 by specifying type of merge: @@ -4814,7 +4736,6 @@ def round(self, decimals): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)], ... columns=['dogs', 'cats']) >>> df @@ -4897,7 +4818,6 @@ def apply(self, func, *, axis=0, args=(), **kwargs): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df @@ -5046,7 +4966,6 @@ def any(self, *, axis=0, bool_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [True, True], "B": [False, False]}) >>> df @@ -5092,7 +5011,6 @@ def all(self, axis=0, *, bool_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [True, True], "B": [False, False]}) >>> df @@ -5134,7 +5052,6 @@ def prod(self, axis=0, *, numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [1, 2, 3], "B": [4.5, 5.5, 6.5]}) >>> df @@ -5180,7 +5097,6 @@ def min(self, axis=0, *, numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df @@ -5224,7 +5140,6 @@ def max(self, axis=0, *, numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df @@ -5267,7 +5182,6 @@ def sum(self, axis=0, *, 
numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df @@ -5308,7 +5222,6 @@ def mean(self, axis=0, *, numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df @@ -5349,7 +5262,6 @@ def median(self, *, numeric_only: bool = False, exact: bool = True): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df @@ -5386,7 +5298,6 @@ def quantile( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), ... columns=['a', 'b']) >>> df.quantile(.1) @@ -5423,7 +5334,6 @@ def var(self, axis=0, *, numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df @@ -5467,7 +5377,6 @@ def skew(self, *, numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'A': [1, 2, 3, 4, 5], ... 'B': [5, 4, 3, 2, 1], @@ -5507,7 +5416,6 @@ def kurt(self, *, numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [1, 2, 3, 4, 5], ... "B": [3, 4, 3, 2, 1], @@ -5546,7 +5454,6 @@ def std(self, *, numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [1, 2, 3, 4, 5], ... "B": [3, 4, 3, 2, 1], @@ -5587,7 +5494,6 @@ def count(self, *, numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [1, None, 3, 4, 5], ... "B": [1, 2, 3, 4, 5], @@ -5639,7 +5545,6 @@ def nlargest(self, n: int, columns, keep: str = "first"): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [1, 1, 3, 3, 5, 5], ... 
"B": [5, 6, 3, 4, 1, 2], @@ -5730,7 +5635,6 @@ def nsmallest(self, n: int, columns, keep: str = "first"): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [1, 1, 3, 3, 5, 5], ... "B": [5, 6, 3, 4, 1, 2], @@ -5811,7 +5715,6 @@ def idxmin(self): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df @@ -5840,7 +5743,6 @@ def idxmax(self): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df @@ -5873,7 +5775,6 @@ def melt(self, id_vars, value_vars, var_name, value_name): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [1, None, 3, 4, 5], ... "B": [1, 2, 3, 4, 5], @@ -5947,7 +5848,6 @@ def nunique(self): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 2]}) >>> df @@ -5975,7 +5875,6 @@ def cummin(self) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df @@ -6006,7 +5905,6 @@ def cummax(self) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df @@ -6037,7 +5935,6 @@ def cumsum(self) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df @@ -6073,7 +5970,6 @@ def cumprod(self) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df @@ -6113,7 +6009,6 @@ def diff( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df @@ -6160,7 +6055,6 @@ def agg(self, func): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df @@ -6223,7 +6117,6 @@ def describe(self, include: None | Literal["all"] = None): **Examples:** - >>> import 
bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [0, 2, 8], "C": ["cat", "cat", "dog"]}) >>> df @@ -6294,7 +6187,6 @@ def pivot(self, *, columns, index=None, values=None): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... "foo": ["one", "one", "one", "two", "two"], @@ -6363,7 +6255,6 @@ def pivot_table(self, values=None, index=None, columns=None, aggfunc="mean"): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'Product': ['Product A', 'Product B', 'Product A', 'Product B', 'Product A', 'Product B'], @@ -6455,7 +6346,6 @@ def stack(self, level=-1): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'A': [1, 3], 'B': [2, 4]}, index=['foo', 'bar']) >>> df @@ -6493,7 +6383,6 @@ def unstack(self, level=-1): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'A': [1, 3], 'B': [2, 4]}, index=['foo', 'bar']) >>> df @@ -6533,7 +6422,6 @@ def index(self): **Examples:** - >>> import bigframes.pandas as bpd You can access the index of a DataFrame via ``index`` property. @@ -6585,7 +6473,6 @@ def columns(self): **Examples:** - >>> import bigframes.pandas as bpd You can access the column labels of a DataFrame via ``columns`` property. @@ -6632,7 +6519,6 @@ def value_counts( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'num_legs': [2, 4, 4, 6, 7], ... 'num_wings': [2, 0, 0, 0, bpd.NA]}, @@ -6712,7 +6598,6 @@ def eval(self, expr: str) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) >>> df @@ -6787,7 +6672,6 @@ def query(self, expr: str) -> DataFrame | None: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'A': range(1, 6), ... 'B': range(10, 0, -2), @@ -6861,7 +6745,6 @@ def interpolate(self, method: str = "linear"): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 
'A': [1, 2, 3, None, None, 6], @@ -6910,7 +6793,6 @@ def fillna(self, value): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame([[np.nan, 2, np.nan, 0], ... [3, 4, np.nan, 1], @@ -6986,7 +6868,6 @@ def replace( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'int_col': [1, 1, 2, 3], @@ -7082,7 +6963,6 @@ def iat(self): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], ... columns=['A', 'B', 'C']) @@ -7115,7 +6995,6 @@ def at(self): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], ... index=[4, 5, 6], columns=['A', 'B', 'C']) @@ -7163,7 +7042,6 @@ def dot(self, other): **Examples:** - >>> import bigframes.pandas as bpd >>> left = bpd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]]) >>> left @@ -7256,7 +7134,6 @@ def __matmul__(self, other): **Examples:** - >>> import bigframes.pandas as bpd >>> left = bpd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]]) >>> left @@ -7315,7 +7192,6 @@ def __len__(self): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'a': [0, 1, 2], @@ -7337,7 +7213,6 @@ def __array__(self, dtype=None, copy: Optional[bool] = None): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [11, 22, 33]}) @@ -7370,7 +7245,6 @@ def __getitem__(self, key): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... "name" : ["alpha", "beta", "gamma"], @@ -7443,7 +7317,6 @@ def __setitem__(self, key, value): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 
"name" : ["alpha", "beta", "gamma"], diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 1e13a977ce..e359c8b03d 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -38,7 +38,6 @@ def size(self) -> int: **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series({'a': 1, 'b': 2, 'c': 3}) >>> s.size @@ -64,7 +63,6 @@ def __iter__(self) -> Iterator: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], @@ -104,7 +102,6 @@ def astype(self, dtype): **Examples:** - >>> import bigframes.pandas as bpd Create a DataFrame: @@ -347,7 +344,6 @@ def get(self, key, default=None): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame( ... [ @@ -457,7 +453,6 @@ def head(self, n: int = 5): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion', ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']}) @@ -556,7 +551,6 @@ def sample( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'num_legs': [2, 4, 8, 0], ... 'num_wings': [2, 0, 0, 0], @@ -637,7 +631,6 @@ def dtypes(self): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'float': [1.0], 'int': [1], 'string': ['foo']}) >>> df.dtypes @@ -661,7 +654,6 @@ def copy(self): **Examples:** - >>> import bigframes.pandas as bpd Modification in the original Series will not affect the copy Series: @@ -733,7 +725,6 @@ def ffill(self, *, limit: Optional[int] = None): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame([[np.nan, 2, np.nan, 0], ... [3, 4, np.nan, 1], @@ -815,7 +806,6 @@ def isna(self) -> NDFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame(dict( ... 
age=[5, 6, np.nan], @@ -1055,7 +1045,6 @@ def rolling( **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([0,1,2,3,4]) >>> s.rolling(window=3).min() @@ -1141,7 +1130,6 @@ def pipe( Constructing a income DataFrame from a dictionary. - >>> import bigframes.pandas as bpd >>> data = [[8000, 1000], [9500, np.nan], [5000, 2000]] >>> df = bpd.DataFrame(data, columns=['Salary', 'Others']) diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index db81e21efb..ace2f4e8a7 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -44,7 +44,6 @@ def describe(self, include: None | Literal["all"] = None): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [1, 1, 1, 2, 2], "B": [0, 2, 8, 2, 7], "C": ["cat", "cat", "dog", "mouse", "cat"]}) >>> df @@ -84,7 +83,6 @@ def any(self): For SeriesGroupBy: - >>> import bigframes.pandas as bpd >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([1, 2, 0], index=lst) @@ -122,7 +120,6 @@ def all(self): For SeriesGroupBy: - >>> import bigframes.pandas as bpd >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([1, 2, 0], index=lst) @@ -160,7 +157,6 @@ def count(self): For SeriesGroupBy: - >>> import bigframes.pandas as bpd >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([1, 2, np.nan], index=lst) @@ -197,7 +193,6 @@ def mean( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'A': [1, 1, 2, 1, 2], ... 'B': [np.nan, 2, 3, 4, 5], ... 'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C']) @@ -255,7 +250,6 @@ def median( For SeriesGroupBy: - >>> import bigframes.pandas as bpd >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 4, 3, 3], index=lst) @@ -294,7 +288,6 @@ def quantile(self, q=0.5, *, numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame([ ... 
['a', 1], ['a', 2], ['a', 3], ... ['b', 1], ['b', 3], ['b', 5] @@ -333,7 +326,6 @@ def std( For SeriesGroupBy: - >>> import bigframes.pandas as bpd >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 4, 3, 3], index=lst) @@ -378,7 +370,6 @@ def var( For SeriesGroupBy: - >>> import bigframes.pandas as bpd >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 4, 3, 3], index=lst) @@ -420,7 +411,6 @@ def rank( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame( ... { @@ -494,7 +484,6 @@ def skew( For SeriesGroupBy: - >>> import bigframes.pandas as bpd >>> ser = bpd.Series([390., 350., 357., np.nan, 22., 20., 30.], ... index=['Falcon', 'Falcon', 'Falcon', 'Falcon', @@ -528,7 +517,6 @@ def kurt( **Examples:** - >>> import bigframes.pandas as bpd >>> lst = ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'] >>> ser = bpd.Series([0, 1, 1, 0, 0, 1, 2, 4, 5], index=lst) @@ -560,7 +548,6 @@ def kurtosis( **Examples:** - >>> import bigframes.pandas as bpd >>> lst = ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'] >>> ser = bpd.Series([0, 1, 1, 0, 0, 1, 2, 4, 5], index=lst) @@ -586,7 +573,6 @@ def first(self, numeric_only: bool = False, min_count: int = -1): Defaults to skipping NA elements. **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame(dict(A=[1, 1, 3], B=[None, 5, 6], C=[1, 2, 3])) >>> df.groupby("A").first() @@ -626,7 +612,6 @@ def last(self, numeric_only: bool = False, min_count: int = -1): Defaults to skipping NA elements. 
**Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame(dict(A=[1, 1, 3], B=[5, None, 6], C=[1, 2, 3])) >>> df.groupby("A").last() @@ -663,7 +648,6 @@ def sum( For SeriesGroupBy: - >>> import bigframes.pandas as bpd >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) @@ -707,7 +691,6 @@ def prod(self, numeric_only: bool = False, min_count: int = 0): For SeriesGroupBy: - >>> import bigframes.pandas as bpd >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) @@ -741,7 +724,6 @@ def min( For SeriesGroupBy: - >>> import bigframes.pandas as bpd >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) @@ -788,7 +770,6 @@ def max( For SeriesGroupBy: - >>> import bigframes.pandas as bpd >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) @@ -831,7 +812,6 @@ def cumcount(self, ascending: bool = True): For SeriesGroupBy: - >>> import bigframes.pandas as bpd >>> lst = ['a', 'a', 'b', 'b', 'c'] >>> ser = bpd.Series([5, 1, 2, 3, 4], index=lst) @@ -868,7 +848,6 @@ def cumprod(self, *args, **kwargs): For SeriesGroupBy: - >>> import bigframes.pandas as bpd >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([6, 2, 0], index=lst) @@ -905,7 +884,6 @@ def cumsum(self, *args, **kwargs): For SeriesGroupBy: - >>> import bigframes.pandas as bpd >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([6, 2, 0], index=lst) @@ -942,7 +920,6 @@ def cummin(self, *args, numeric_only: bool = False, **kwargs): For SeriesGroupBy: - >>> import bigframes.pandas as bpd >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([6, 2, 0], index=lst) @@ -979,7 +956,6 @@ def cummax(self, *args, numeric_only: bool = False, **kwargs): For SeriesGroupBy: - >>> import bigframes.pandas as bpd >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([6, 2, 0], index=lst) @@ -1018,7 +994,6 @@ def diff(self): For SeriesGroupBy: - >>> import bigframes.pandas as bpd >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 
4, 3, 3], index=lst) @@ -1062,7 +1037,6 @@ def shift(self, periods: int = 1): For SeriesGroupBy: - >>> import bigframes.pandas as bpd >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) @@ -1103,7 +1077,6 @@ def rolling(self, *args, **kwargs): **Examples:** - >>> import bigframes.pandas as bpd >>> lst = ['a', 'a', 'a', 'a', 'e'] >>> ser = bpd.Series([1, 0, -2, -1, 2], index=lst) @@ -1160,7 +1133,6 @@ def expanding(self, *args, **kwargs): **Examples:** - >>> import bigframes.pandas as bpd >>> lst = ['a', 'a', 'c', 'c', 'e'] >>> ser = bpd.Series([1, 0, -2, -1, 2], index=lst) @@ -1185,7 +1157,6 @@ def head(self, n: int = 5): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame([[1, 2], [1, 4], [5, 6]], ... columns=['A', 'B']) @@ -1213,7 +1184,6 @@ def size(self): **Examples:** - >>> import bigframes.pandas as bpd For SeriesGroupBy: @@ -1266,7 +1236,6 @@ def __iter__(self): **Examples:** - >>> import bigframes.pandas as bpd For SeriesGroupBy: @@ -1329,7 +1298,6 @@ def agg(self, func): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([1, 2, 3, 4], index=[1, 1, 2, 2]) >>> s.groupby(level=0).agg(['min', 'max']) @@ -1360,7 +1328,6 @@ def aggregate(self, func): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([1, 2, 3, 4], index=[1, 1, 2, 2]) >>> s.groupby(level=0).aggregate(['min', 'max']) @@ -1391,7 +1358,6 @@ def nunique(self): **Examples:** - >>> import bigframes.pandas as bpd >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 3], index=lst) @@ -1440,7 +1406,6 @@ def agg(self, func, **kwargs): **Examples:** - >>> import bigframes.pandas as bpd >>> data = {"A": [1, 1, 2, 2], ... "B": [1, 2, 3, 4], @@ -1498,7 +1463,6 @@ def aggregate(self, func, **kwargs): **Examples:** - >>> import bigframes.pandas as bpd >>> data = {"A": [1, 1, 2, 2], ... 
"B": [1, 2, 3, 4], @@ -1556,7 +1520,6 @@ def nunique(self): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam', ... 'ham', 'ham'], @@ -1590,7 +1553,6 @@ def value_counts( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], diff --git a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py index aa8f490013..a74a4e71e6 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py @@ -12,7 +12,6 @@ def day(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="D") ... ) @@ -40,7 +39,6 @@ def dayofweek(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series( ... pd.date_range('2016-12-31', '2017-01-08', freq='D').to_series() ... ) @@ -72,7 +70,6 @@ def day_of_week(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series( ... pd.date_range('2016-12-31', '2017-01-08', freq='D').to_series() ... ) @@ -100,7 +97,6 @@ def dayofyear(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series( ... pd.date_range('2016-12-28', '2017-01-03', freq='D').to_series() ... ) @@ -126,7 +122,6 @@ def day_of_year(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series( ... pd.date_range('2016-12-28', '2017-01-03', freq='D').to_series() ... ) @@ -157,7 +152,6 @@ def date(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%d/%m/%Y %H:%M:%S%Ez") >>> s @@ -178,7 +172,6 @@ def hour(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="h") ... 
) @@ -202,7 +195,6 @@ def minute(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="min") ... ) @@ -226,7 +218,6 @@ def month(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="M") ... ) @@ -250,7 +241,6 @@ def isocalendar(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series( ... pd.date_range('2009-12-27', '2010-01-04', freq='d').to_series() ... ) @@ -281,7 +271,6 @@ def second(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="s") ... ) @@ -309,7 +298,6 @@ def time(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") >>> s @@ -330,7 +318,6 @@ def quarter(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "4/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") >>> s @@ -351,7 +338,6 @@ def year(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="Y") ... 
) @@ -375,7 +361,6 @@ def days(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([pd.Timedelta("4d3m2s1us")]) >>> s 0 4 days 00:03:02.000001 @@ -391,7 +376,6 @@ def seconds(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([pd.Timedelta("4d3m2s1us")]) >>> s 0 4 days 00:03:02.000001 @@ -407,7 +391,6 @@ def microseconds(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([pd.Timedelta("4d3m2s1us")]) >>> s 0 4 days 00:03:02.000001 @@ -422,7 +405,6 @@ def total_seconds(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([pd.Timedelta("1d1m1s1us")]) >>> s 0 1 days 00:01:01.000001 @@ -438,7 +420,6 @@ def tz(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") >>> s @@ -460,7 +441,6 @@ def unit(self) -> str: **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") >>> s diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index 8025d04d33..782c52c1d6 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -32,7 +32,6 @@ def name(self): **Examples:** - >>> import bigframes.pandas as bpd >>> idx = bpd.Index([1, 2, 3], name='x') >>> idx @@ -62,7 +61,6 @@ def values(self): **Examples:** - >>> import bigframes.pandas as bpd >>> idx = bpd.Index([1, 2, 3]) >>> idx @@ -84,7 +82,6 @@ def ndim(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) >>> s @@ -118,7 +115,6 @@ def size(self) -> int: **Examples:** - >>> import bigframes.pandas as bpd For Series: @@ -152,7 +148,6 @@ def 
is_monotonic_increasing(self) -> bool: **Examples:** - >>> import bigframes.pandas as bpd >>> bool(bpd.Index([1, 2, 3]).is_monotonic_increasing) True @@ -176,7 +171,6 @@ def is_monotonic_decreasing(self) -> bool: **Examples:** - >>> import bigframes.pandas as bpd >>> bool(bpd.Index([3, 2, 1]).is_monotonic_decreasing) True @@ -200,7 +194,6 @@ def from_frame(cls, frame) -> Index: **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame([['HI', 'Temp'], ['HI', 'Precip'], ... ['NJ', 'Temp'], ['NJ', 'Precip']], @@ -239,7 +232,6 @@ def shape(self): **Examples:** - >>> import bigframes.pandas as bpd >>> idx = bpd.Index([1, 2, 3]) >>> idx @@ -260,7 +252,6 @@ def nlevels(self) -> int: **Examples:** - >>> import bigframes.pandas as bpd >>> mi = bpd.MultiIndex.from_arrays([['a'], ['b'], ['c']]) >>> mi @@ -281,7 +272,6 @@ def is_unique(self) -> bool: **Examples:** - >>> import bigframes.pandas as bpd >>> idx = bpd.Index([1, 5, 7, 7]) >>> idx.is_unique @@ -303,7 +293,6 @@ def has_duplicates(self) -> bool: **Examples:** - >>> import bigframes.pandas as bpd >>> idx = bpd.Index([1, 5, 7, 7]) >>> bool(idx.has_duplicates) @@ -325,7 +314,6 @@ def dtype(self): **Examples:** - >>> import bigframes.pandas as bpd >>> idx = bpd.Index([1, 2, 3]) >>> idx @@ -352,7 +340,6 @@ def T(self) -> Index: **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) >>> s @@ -390,7 +377,6 @@ def copy( **Examples:** - >>> import bigframes.pandas as bpd >>> idx = bpd.Index(['a', 'b', 'c']) >>> new_idx = idx.copy() @@ -424,7 +410,6 @@ def astype(self, dtype): **Examples:** - >>> import bigframes.pandas as bpd >>> idx = bpd.Index([1, 2, 3]) >>> idx @@ -472,7 +457,6 @@ def get_level_values(self, level) -> Index: **Examples:** - >>> import bigframes.pandas as bpd >>> idx = bpd.Index(list('abc')) >>> idx @@ -501,7 +485,6 @@ def to_series(self): **Examples:** - >>> import bigframes.pandas as bpd >>> idx = bpd.Index(['Ant', 'Bear', 'Cow'], name='animal') @@ 
-554,7 +537,6 @@ def isin(self, values): **Examples:** - >>> import bigframes.pandas as bpd >>> idx = bpd.Index([1,2,3]) >>> idx @@ -593,7 +575,6 @@ def all(self) -> bool: **Examples:** - >>> import bigframes.pandas as bpd True, because nonzero integers are considered True. @@ -620,7 +601,6 @@ def any(self) -> bool: **Examples:** - >>> import bigframes.pandas as bpd >>> index = bpd.Index([0, 1, 2]) >>> bool(index.any()) @@ -645,7 +625,6 @@ def min(self): **Examples:** - >>> import bigframes.pandas as bpd >>> idx = bpd.Index([3, 2, 1]) >>> int(idx.min()) @@ -666,7 +645,6 @@ def max(self): **Examples:** - >>> import bigframes.pandas as bpd >>> idx = bpd.Index([3, 2, 1]) >>> int(idx.max()) @@ -691,7 +669,6 @@ def argmin(self) -> int: **Examples:** - >>> import bigframes.pandas as bpd Consider dataset containing cereal calories @@ -727,7 +704,6 @@ def get_loc( **Examples:** - >>> import bigframes.pandas as bpd >>> unique_index = bpd.Index(list('abc')) >>> unique_index.get_loc('b') @@ -770,7 +746,6 @@ def argmax(self) -> int: Consider dataset containing cereal calories - >>> import bigframes.pandas as bpd >>> s = bpd.Series({'Corn Flakes': 100.0, 'Almond Delight': 110.0, ... 
'Cinnamon Toast Crunch': 120.0, 'Cocoa Puff': 110.0}) @@ -803,7 +778,6 @@ def nunique(self) -> int: **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([1, 3, 5, 7, 7]) >>> s @@ -834,7 +808,6 @@ def sort_values( **Examples:** - >>> import bigframes.pandas as bpd >>> idx = bpd.Index([10, 100, 1, 1000]) >>> idx @@ -877,7 +850,6 @@ def value_counts( **Examples:** - >>> import bigframes.pandas as bpd >>> index = bpd.Index([3, 1, 2, 3, 4, np.nan]) >>> index.value_counts() @@ -932,7 +904,6 @@ def fillna(self, value) -> Index: **Examples:** - >>> import bigframes.pandas as bpd >>> idx = bpd.Index([np.nan, np.nan, 3]) >>> idx.fillna(0) @@ -961,7 +932,6 @@ def rename(self, name, *, inplace): **Examples:** - >>> import bigframes.pandas as bpd >>> idx = bpd.Index(['A', 'C', 'A', 'B'], name='score') >>> idx.rename('grade') @@ -990,7 +960,6 @@ def drop(self, labels) -> Index: **Examples:** - >>> import bigframes.pandas as bpd >>> idx = bpd.Index(['a', 'b', 'c']) >>> idx.drop(['a']) @@ -1009,7 +978,6 @@ def dropna(self, how: typing.Literal["all", "any"] = "any"): **Examples:** - >>> import bigframes.pandas as bpd >>> idx = bpd.Index([1, np.nan, 3]) >>> idx.dropna() @@ -1035,7 +1003,6 @@ def drop_duplicates(self, *, keep: str = "first"): **Examples:** - >>> import bigframes.pandas as bpd Generate an pandas.Index with duplicate values. 
@@ -1077,7 +1044,6 @@ def unique(self, level: Hashable | int | None = None): **Examples:** - >>> import bigframes.pandas as bpd >>> idx = bpd.Index([1, 1, 2, 3, 3]) >>> idx.unique() Index([1, 2, 3], dtype='Int64') @@ -1097,7 +1063,6 @@ def item(self, *args, **kwargs): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([1], index=['a']) >>> s.index.item() 'a' diff --git a/third_party/bigframes_vendored/pandas/core/indexes/datetimes.py b/third_party/bigframes_vendored/pandas/core/indexes/datetimes.py index 8c90951793..f22554e174 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/datetimes.py @@ -15,7 +15,6 @@ def year(self) -> base.Index: **Examples:** - >>> import bigframes.pandas as bpd >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.year @@ -29,7 +28,6 @@ def month(self) -> base.Index: **Examples:** - >>> import bigframes.pandas as bpd >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.month @@ -43,7 +41,6 @@ def day(self) -> base.Index: **Examples:** - >>> import bigframes.pandas as bpd >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.day @@ -57,7 +54,6 @@ def day_of_week(self) -> base.Index: **Examples:** - >>> import bigframes.pandas as bpd >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.day_of_week @@ -71,7 +67,6 @@ def dayofweek(self) -> base.Index: **Examples:** - >>> import bigframes.pandas as bpd >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.dayofweek @@ -85,7 +80,6 @@ def weekday(self) -> base.Index: **Examples:** - >>> import bigframes.pandas as bpd >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.weekday diff --git a/third_party/bigframes_vendored/pandas/core/indexes/multi.py b/third_party/bigframes_vendored/pandas/core/indexes/multi.py index c2b63b442f..018e638de3 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/multi.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/multi.py @@ 
-25,7 +25,6 @@ def from_tuples( **Examples:** - >>> import bigframes.pandas as bpd >>> tuples = [(1, 'red'), (1, 'blue'), ... (2, 'red'), (2, 'blue')] >>> bpd.MultiIndex.from_tuples(tuples, names=('number', 'color')) @@ -61,7 +60,6 @@ def from_arrays( **Examples:** - >>> import bigframes.pandas as bpd >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] >>> bpd.MultiIndex.from_arrays(arrays, names=('number', 'color')) MultiIndex([(1, 'red'), diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index acfbc1d699..0734bcee08 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -33,7 +33,6 @@ def cut( **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([0, 1, 5, 10]) >>> s diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 3f6126aa55..219e50869a 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -37,7 +37,6 @@ def dt(self): **Examples:** - >>> import bigframes.pandas as bpd >>> seconds_series = bpd.Series(pd.date_range("2000-01-01", periods=3, freq="s")) >>> seconds_series @@ -108,7 +107,6 @@ def index(self): **Examples:** - >>> import bigframes.pandas as bpd You can access the index of a Series via ``index`` property. 
@@ -158,7 +156,6 @@ def shape(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([1, 4, 9, 16]) >>> s.shape @@ -176,7 +173,6 @@ def dtype(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([1, 2, 3]) >>> s.dtype @@ -195,7 +191,6 @@ def name(self) -> Hashable: **Examples:** - >>> import bigframes.pandas as bpd For a Series: @@ -242,7 +237,6 @@ def hasnans(self) -> bool: **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([1, 2, 3, None]) >>> s @@ -265,7 +259,6 @@ def T(self) -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) >>> s @@ -289,7 +282,6 @@ def transpose(self) -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) >>> s @@ -328,7 +320,6 @@ def reset_index( **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([1, 2, 3, 4], name='foo', ... index=['a', 'b', 'c', 'd']) @@ -429,7 +420,6 @@ def keys(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([1, 2, 3], index=[0, 1, 2]) >>> s.keys() @@ -510,7 +500,6 @@ def to_markdown( **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series(["elk", "pig", "dog", "quetzal"], name="animal") >>> print(s.to_markdown()) @@ -564,7 +553,6 @@ def to_dict( **Examples:** - >>> import bigframes.pandas as bpd >>> from collections import OrderedDict, defaultdict >>> s = bpd.Series([1, 2, 3, 4]) @@ -603,7 +591,6 @@ def to_frame(self, name=None) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series(["a", "b", "c"], ... 
name="vals") @@ -699,7 +686,6 @@ def tolist(self, *, allow_large_results: Optional[bool] = None) -> list: **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([1, 2, 3]) >>> s @@ -732,7 +718,6 @@ def to_numpy( **Examples:** - >>> import bigframes.pandas as bpd >>> ser = bpd.Series(pd.Categorical(['a', 'b', 'a'])) >>> ser.to_numpy() @@ -785,7 +770,6 @@ def to_pickle(self, path, *, allow_large_results=None, **kwargs): **Examples:** - >>> import bigframes.pandas as bpd >>> original_df = bpd.DataFrame({"foo": range(5), "bar": range(5, 10)}) >>> original_df @@ -846,7 +830,6 @@ def agg(self, func): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([1, 2, 3, 4]) >>> s @@ -882,7 +865,6 @@ def count(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([0.0, 1.0, bpd.NA]) >>> s @@ -907,7 +889,6 @@ def nunique(self) -> int: **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([1, 3, 5, 7, 7]) >>> s @@ -941,7 +922,6 @@ def unique(self, keep_order=True) -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([2, 1, 3, 3], name='A') >>> s @@ -983,7 +963,6 @@ def mode(self) -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([2, 4, 8, 2, 4, None]) >>> s.mode() @@ -1007,7 +986,6 @@ def drop_duplicates( **Examples:** - >>> import bigframes.pandas as bpd Generate a Series with duplicated entries. @@ -1075,7 +1053,6 @@ def duplicated(self, keep="first") -> Series: **Examples:** - >>> import bigframes.pandas as bpd By default, for each set of duplicated values, the first occurrence is set on False and all others on True: @@ -1146,7 +1123,6 @@ def idxmin(self) -> Hashable: **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series(data=[1, None, 4, 1], ... index=['A', 'B', 'C', 'D']) @@ -1174,7 +1150,6 @@ def idxmax(self) -> Hashable: **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series(data=[1, None, 4, 3, 4], ... 
index=['A', 'B', 'C', 'D', 'E']) @@ -1200,7 +1175,6 @@ def round(self, decimals: int = 0) -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([0.1, 1.3, 2.7]) >>> s.round() @@ -1233,7 +1207,6 @@ def explode(self, *, ignore_index: Optional[bool] = False) -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([[1, 2, 3], [], [3, 4]]) >>> s @@ -1271,7 +1244,6 @@ def corr(self, other, method="pearson", min_periods=None) -> float: **Examples:** - >>> import bigframes.pandas as bpd >>> s1 = bpd.Series([.2, .0, .6, .2]) >>> s2 = bpd.Series([.3, .6, .0, .1]) @@ -1308,7 +1280,6 @@ def autocorr(self, lag: int = 1) -> float: **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([0.25, 0.5, 0.2, -0.05]) >>> s.autocorr() # doctest: +ELLIPSIS @@ -1345,7 +1316,6 @@ def cov( **Examples:** - >>> import bigframes.pandas as bpd >>> s1 = bpd.Series([0.90010907, 0.13484424, 0.62036035]) >>> s2 = bpd.Series([0.12528585, 0.26962463, 0.51111198]) @@ -1373,7 +1343,6 @@ def diff(self) -> Series: **Examples:** - >>> import bigframes.pandas as bpd Difference with previous row @@ -1438,7 +1407,6 @@ def dot(self, other) -> Series | np.ndarray: **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([0, 1, 2, 3]) >>> other = bpd.Series([-1, 2, -3, 4]) @@ -1494,7 +1462,6 @@ def sort_values( **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([np.nan, 1, 3, 10, 5]) >>> s @@ -1591,7 +1558,6 @@ def sort_index( **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, 4]) >>> s.sort_index() @@ -1650,7 +1616,6 @@ def nlargest( **Examples:** - >>> import bigframes.pandas as bpd >>> countries_population = {"Italy": 59000000, "France": 65000000, ... 
"Malta": 434000, "Maldives": 434000, @@ -1735,7 +1700,6 @@ def nsmallest(self, n: int = 5, keep: str = "first") -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> countries_population = {"Italy": 59000000, "France": 65000000, ... "Malta": 434000, "Maldives": 434000, @@ -1822,7 +1786,6 @@ def apply( **Examples:** - >>> import bigframes.pandas as bpd For applying arbitrary python function a `remote_function` is recommended. Let's use ``reuse=False`` flag to make sure a new `remote_function` @@ -1963,7 +1926,6 @@ def combine( **Examples:** - >>> import bigframes.pandas as bpd Consider 2 Datasets ``s1`` and ``s2`` containing highest clocked speeds of different birds. @@ -2021,7 +1983,6 @@ def groupby( **Examples:** - >>> import bigframes.pandas as bpd You can group by a named index level. @@ -2192,7 +2153,6 @@ def drop( **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series(data=np.arange(3), index=['A', 'B', 'C']) >>> s @@ -2321,7 +2281,6 @@ def interpolate(self, method: str = "linear"): **Examples:** - >>> import bigframes.pandas as bpd Filling in NaN in a Series via linear interpolation. 
@@ -2364,7 +2323,6 @@ def fillna( **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([np.nan, 2, np.nan, -1]) >>> s @@ -2417,7 +2375,6 @@ def replace( **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([1, 2, 3, 4, 5]) >>> s @@ -2543,7 +2500,6 @@ def dropna(self, *, axis=0, inplace: bool = False, how=None) -> Series: **Examples:** - >>> import bigframes.pandas as bpd Drop NA values from a Series: @@ -2605,7 +2561,6 @@ def between( **Examples:** - >>> import bigframes.pandas as bpd Boundary values are included by default: @@ -2662,7 +2617,6 @@ def case_when( **Examples:** - >>> import bigframes.pandas as bpd >>> c = bpd.Series([6, 7, 8, 9], name="c") >>> a = bpd.Series([0, 0, 1, 2]) @@ -2729,7 +2683,6 @@ def cumprod(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s @@ -2765,7 +2718,6 @@ def cumsum(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s @@ -2806,7 +2758,6 @@ def cummax(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s @@ -2843,7 +2794,6 @@ def cummin(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s @@ -2878,7 +2828,6 @@ def eq(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -2921,7 +2870,6 @@ def ne(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -2966,7 +2914,6 @@ def le(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3010,7 +2957,6 @@ def lt(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ 
-3055,7 +3001,6 @@ def ge(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3100,7 +3045,6 @@ def gt(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3144,7 +3088,6 @@ def add(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> a = bpd.Series([1, 2, 3, bpd.NA]) >>> a @@ -3207,7 +3150,6 @@ def __add__(self, other): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([1.5, 2.6], index=['elk', 'moose']) >>> s @@ -3258,7 +3200,6 @@ def radd(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3321,7 +3262,6 @@ def sub( **Examples:** - >>> import bigframes.pandas as bpd >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3364,7 +3304,6 @@ def __sub__(self, other): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([1.5, 2.6], index=['elk', 'moose']) >>> s @@ -3415,7 +3354,6 @@ def rsub(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3475,7 +3413,6 @@ def mul(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3519,7 +3456,6 @@ def __mul__(self, other): **Examples:** - >>> import bigframes.pandas as bpd You can multiply with a scalar: @@ -3558,7 +3494,6 @@ def rmul(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3617,7 +3552,6 @@ def truediv(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ 
-3661,7 +3595,6 @@ def __truediv__(self, other): **Examples:** - >>> import bigframes.pandas as bpd You can multiply with a scalar: @@ -3700,7 +3633,6 @@ def rtruediv(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3760,7 +3692,6 @@ def floordiv(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3804,7 +3735,6 @@ def __floordiv__(self, other): **Examples:** - >>> import bigframes.pandas as bpd You can divide by a scalar: @@ -3843,7 +3773,6 @@ def rfloordiv(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3903,7 +3832,6 @@ def mod(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -3947,7 +3875,6 @@ def __mod__(self, other): **Examples:** - >>> import bigframes.pandas as bpd You can modulo with a scalar: @@ -3985,7 +3912,6 @@ def rmod(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -4046,7 +3972,6 @@ def pow(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -4091,7 +4016,6 @@ def __pow__(self, other): **Examples:** - >>> import bigframes.pandas as bpd You can exponentiate with a scalar: @@ -4130,7 +4054,6 @@ def rpow(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -4191,7 +4114,6 @@ def divmod(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -4241,7 +4163,6 @@ def 
rdivmod(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a @@ -4294,7 +4215,6 @@ def combine_first(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd >>> s1 = bpd.Series([1, np.nan]) >>> s2 = bpd.Series([3, 4, 5]) @@ -4334,7 +4254,6 @@ def update(self, other) -> None: **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([1, 2, 3]) >>> s.update(bpd.Series([4, 5, 6])) @@ -4425,7 +4344,6 @@ def any( **Examples:** - >>> import bigframes.pandas as bpd For Series input, the output is a scalar indicating whether any element is True. @@ -4459,7 +4377,6 @@ def max( **Examples:** - >>> import bigframes.pandas as bpd Calculating the max of a Series: @@ -4500,7 +4417,6 @@ def min( **Examples:** - >>> import bigframes.pandas as bpd Calculating the min of a Series: @@ -4540,7 +4456,6 @@ def std( **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'person_id': [0, 1, 2, 3], ... 
'age': [21, 25, 62, 43], @@ -4587,7 +4502,6 @@ def sum(self): **Examples:** - >>> import bigframes.pandas as bpd Calculating the sum of a Series: @@ -4622,7 +4536,6 @@ def mean(self): **Examples:** - >>> import bigframes.pandas as bpd Calculating the mean of a Series: @@ -4657,7 +4570,6 @@ def median(self, *, exact: bool = True): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([1, 2, 3]) >>> s.median() @@ -4697,7 +4609,6 @@ def quantile( **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([1, 2, 3, 4]) >>> s.quantile(.5) @@ -4749,7 +4660,6 @@ def describe(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series(['A', 'A', 'B']) >>> s @@ -4776,7 +4686,6 @@ def skew(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([1, 2, 3]) >>> s.skew() @@ -4813,7 +4722,6 @@ def kurt(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([1, 2, 2, 3], index=['cat', 'dog', 'dog', 'mouse']) >>> s @@ -4855,7 +4763,6 @@ def item(self: Series, *args, **kwargs): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([1]) >>> s.item() np.int64(1) @@ -4877,7 +4784,6 @@ def items(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series(['A', 'B', 'C']) >>> for index, value in s.items(): @@ -4898,7 +4804,6 @@ def where(self, cond, other): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([10, 11, 12, 13, 14]) >>> s @@ -4965,7 +4870,6 @@ def mask(self, cond, other): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([10, 11, 12, 13, 14]) >>> s @@ -5122,7 +5026,6 @@ def argmax(self): **Examples:** - >>> import bigframes.pandas as bpd Consider dataset containing cereal calories. @@ -5159,7 +5062,6 @@ def argmin(self): **Examples:** - >>> import bigframes.pandas as bpd Consider dataset containing cereal calories. 
@@ -5199,7 +5101,6 @@ def rename(self, index, *, inplace, **kwargs): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([1, 2, 3]) >>> s @@ -5250,7 +5151,6 @@ def rename_axis(self, mapper, *, inplace, **kwargs): **Examples:** - >>> import bigframes.pandas as bpd Series @@ -5314,7 +5214,6 @@ def value_counts( **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([3, 1, 2, 3, 4, bpd.NA], dtype="Int64") @@ -5391,7 +5290,6 @@ def str(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series(["A_Str_Series"]) >>> s @@ -5419,7 +5317,6 @@ def plot(self): **Examples:** - >>> import bigframes.pandas as bpd >>> ser = bpd.Series([1, 2, 3, 3]) >>> plot = ser.plot(kind='hist', title="My plot") @@ -5446,7 +5343,6 @@ def isin(self, values): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series(['llama', 'cow', 'llama', 'beetle', 'llama', ... 'hippo'], name='animal') @@ -5511,7 +5407,6 @@ def is_monotonic_increasing(self) -> bool: **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([1, 2, 2]) >>> s.is_monotonic_increasing @@ -5534,7 +5429,6 @@ def is_monotonic_decreasing(self) -> bool: **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([3, 2, 2, 1]) >>> s.is_monotonic_decreasing @@ -5575,7 +5469,6 @@ def map( **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series(['cat', 'dog', bpd.NA, 'rabbit']) >>> s @@ -5640,7 +5533,6 @@ def iloc(self): **Examples:** - >>> import bigframes.pandas as bpd >>> mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4}, ... {'a': 100, 'b': 200, 'c': 300, 'd': 400}, @@ -5719,7 +5611,6 @@ def loc(self): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame([[1, 2], [4, 5], [7, 8]], ... index=['cobra', 'viper', 'sidewinder'], @@ -5805,7 +5696,6 @@ def iat(self): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], ... 
columns=['A', 'B', 'C']) @@ -5839,7 +5729,6 @@ def at(self): **Examples:** - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], ... index=[4, 5, 6], columns=['A', 'B', 'C']) @@ -5874,7 +5763,6 @@ def values(self): **Examples:** - >>> import bigframes.pandas as bpd >>> bpd.Series([1, 2, 3]).values array([1, 2, 3]) @@ -5895,7 +5783,6 @@ def size(self) -> int: **Examples:** - >>> import bigframes.pandas as bpd For Series: @@ -5931,7 +5818,6 @@ def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: **Examples:** - >>> import bigframes.pandas as bpd >>> ser = bpd.Series([1, 2, 3]) @@ -5957,7 +5843,6 @@ def __len__(self): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([1, 2, 3]) >>> len(s) @@ -5972,7 +5857,6 @@ def __invert__(self): **Examples:** - >>> import bigframes.pandas as bpd >>> ser = bpd.Series([True, False, True]) >>> ~ser @@ -5992,7 +5876,6 @@ def __and__(self, other): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([0, 1, 2, 3]) @@ -6030,7 +5913,6 @@ def __or__(self, other): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([0, 1, 2, 3]) @@ -6068,7 +5950,6 @@ def __xor__(self, other): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([0, 1, 2, 3]) @@ -6106,7 +5987,6 @@ def __getitem__(self, indexer): **Examples:** - >>> import bigframes.pandas as bpd >>> s = bpd.Series([15, 30, 45]) >>> s[1] diff --git a/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py b/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py index 220b15f56e..f9ebe59f8d 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py +++ b/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py @@ -54,7 +54,6 @@ def to_timedelta( **Examples:** - >>> import bigframes.pandas as bpd Converting a Scalar to timedelta From b81a6e635d23d2207d98b163bc6a85070a8096dc Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 14 Oct 2025 20:54:40 +0000 Subject: [PATCH 45/63] avoid bpd.NA --- CHANGELOG.md | 2 +- bigframes/operations/strings.py | 2 +- tests/system/small/operations/test_strings.py | 6 +++--- tests/unit/test_pandas.py | 2 +- .../bigframes_vendored/pandas/core/frame.py | 21 +++++++++---------- .../bigframes_vendored/pandas/core/generic.py | 4 ++-- .../bigframes_vendored/pandas/core/series.py | 20 +++++++++--------- .../pandas/core/strings/accessor.py | 18 ++++++++-------- 8 files changed, 37 insertions(+), 38 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 86d7315896..25205f48d0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -463,7 +463,7 @@ * Address `read_csv` with both `index_col` and `use_cols` behavior inconsistency with pandas ([#1785](https://github.com/googleapis/python-bigquery-dataframes/issues/1785)) ([ba7c313](https://github.com/googleapis/python-bigquery-dataframes/commit/ba7c313c8d308e3ff3f736b60978cb7a51715209)) * Allow KMeans model init parameter as k-means++ alias ([#1790](https://github.com/googleapis/python-bigquery-dataframes/issues/1790)) ([0b59cf1](https://github.com/googleapis/python-bigquery-dataframes/commit/0b59cf1008613770fa1433c6da395e755c86fe22)) -* Replace function now can handle bpd.NA value. ([#1786](https://github.com/googleapis/python-bigquery-dataframes/issues/1786)) ([7269512](https://github.com/googleapis/python-bigquery-dataframes/commit/7269512a28eb42029447d5380c764353278a74e1)) +* Replace function now can handle pd.NA value. 
([#1786](https://github.com/googleapis/python-bigquery-dataframes/issues/1786)) ([7269512](https://github.com/googleapis/python-bigquery-dataframes/commit/7269512a28eb42029447d5380c764353278a74e1)) ### Documentation diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index 5761ae8ea7..bc07ffaee0 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -68,7 +68,7 @@ def reverse(self) -> series.Series: **Examples:** - >>> s = bpd.Series(["apple", "banana", "", bpd.NA]) + >>> s = bpd.Series(["apple", "banana", "", pd.NA]) >>> s.str.reverse() 0 elppa 1 ananab diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index d3e868db59..6cd6309cbb 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -288,7 +288,7 @@ def test_strip(scalars_dfs): ], ) def test_strip_w_to_strip(to_strip): - s = bpd.Series(["1. Ant. ", "2. Bee!\n", "3. Cat?\t", bpd.NA]) + s = bpd.Series(["1. Ant. ", "2. Bee!\n", "3. Cat?\t", pd.NA]) pd_s = s.to_pandas() bf_result = s.str.strip(to_strip=to_strip).to_pandas() @@ -434,7 +434,7 @@ def test_rstrip(scalars_dfs): ], ) def test_rstrip_w_to_strip(to_strip): - s = bpd.Series(["1. Ant. ", "2. Bee!\n", "3. Cat?\t", bpd.NA]) + s = bpd.Series(["1. Ant. ", "2. Bee!\n", "3. Cat?\t", pd.NA]) pd_s = s.to_pandas() bf_result = s.str.rstrip(to_strip=to_strip).to_pandas() @@ -469,7 +469,7 @@ def test_lstrip(scalars_dfs): ], ) def test_lstrip_w_to_strip(to_strip): - s = bpd.Series(["1. Ant. ", "2. Bee!\n", "3. Cat?\t", bpd.NA]) + s = bpd.Series(["1. Ant. ", "2. Bee!\n", "3. 
Cat?\t", pd.NA]) pd_s = s.to_pandas() bf_result = s.str.lstrip(to_strip=to_strip).to_pandas() diff --git a/tests/unit/test_pandas.py b/tests/unit/test_pandas.py index 5e75e6b20f..e1e713697d 100644 --- a/tests/unit/test_pandas.py +++ b/tests/unit/test_pandas.py @@ -174,7 +174,7 @@ def test_cut_raises_with_invalid_bins(bins: int, error_message: str): def test_pandas_attribute(): - assert bpd.NA is pd.NA + assert bpd.NA is pd.NA assert bpd.BooleanDtype is pd.BooleanDtype assert bpd.Float64Dtype is pd.Float64Dtype assert bpd.Int64Dtype is pd.Int64Dtype diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index cdc639dd2a..b7ba169806 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -1749,7 +1749,7 @@ def dropna( >>> df = bpd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], ... "toy": [np.nan, 'Batmobile', 'Bullwhip'], - ... "born": [bpd.NA, "1940-04-25", bpd.NA]}) + ... "born": [pd.NA, "1940-04-25", pd.NA]}) >>> df name toy born 0 Alfred @@ -2217,7 +2217,7 @@ def sort_values( >>> df = bpd.DataFrame({ - ... 'col1': ['A', 'A', 'B', bpd.NA, 'D', 'C'], + ... 'col1': ['A', 'A', 'B', pd.NA, 'D', 'C'], ... 'col2': [2, 1, 9, 8, 7, 4], ... 'col3': [0, 1, 9, 4, 2, 3], ... 'col4': ['a', 'B', 'c', 'D', 'e', 'F'] @@ -4361,13 +4361,12 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: **Examples:** - Let's use ``reuse=False`` flag to make sure a new ``remote_function`` is created every time we run the following code, but you can skip it to potentially reuse a previously deployed ``remote_function`` from the same user defined function. - >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") # doctest: +SKIP ... def minutes_to_hours(x: int) -> float: ...
return x/60 @@ -4384,8 +4383,8 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: [5 rows x 2 columns] - >>> df_hours = df_minutes.map(minutes_to_hours) - >>> df_hours + >>> df_hours = df_minutes.map(minutes_to_hours) # doctest: +SKIP + >>> df_hours # doctest: +SKIP system_minutes user_minutes 0 0.0 0.0 1 0.5 0.25 @@ -4401,11 +4400,11 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: >>> df_minutes = bpd.DataFrame( ... { - ... "system_minutes" : [0, 30, 60, None, 90, 120, bpd.NA], - ... "user_minutes" : [0, 15, 75, 90, 6, None, bpd.NA] + ... "system_minutes" : [0, 30, 60, None, 90, 120, pd.NA], + ... "user_minutes" : [0, 15, 75, 90, 6, None, pd.NA] ... }, dtype="Int64") - >>> df_hours = df_minutes.map(minutes_to_hours, na_action='ignore') - >>> df_hours + >>> df_hours = df_minutes.map(minutes_to_hours, na_action='ignore') # doctest: +SKIP + >>> df_hours # doctest: +SKIP system_minutes user_minutes 0 0.0 0.0 1 0.5 0.25 @@ -6521,7 +6520,7 @@ def value_counts( >>> df = bpd.DataFrame({'num_legs': [2, 4, 4, 6, 7], - ... 'num_wings': [2, 0, 0, 0, bpd.NA]}, + ... 'num_wings': [2, 0, 0, 0, pd.NA]}, ... index=['falcon', 'dog', 'cat', 'ant', 'octopus'], ... dtype='Int64') >>> df diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index e359c8b03d..62e54e9e60 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -809,7 +809,7 @@ def isna(self) -> NDFrame: >>> df = bpd.DataFrame(dict( ... age=[5, 6, np.nan], - ... born=[bpd.NA, "1940-04-25", "1940-04-25"], + ... born=[pd.NA, "1940-04-25", "1940-04-25"], ... name=['Alfred', 'Batman', ''], ... toy=[None, 'Batmobile', 'Joker'], ... 
)) @@ -841,7 +841,7 @@ def isna(self) -> NDFrame: Show which entries in a Series are NA: - >>> ser = bpd.Series([5, None, 6, np.nan, bpd.NA]) + >>> ser = bpd.Series([5, None, 6, np.nan, pd.NA]) >>> ser 0 5 1 diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 219e50869a..59a183d459 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -160,7 +160,7 @@ def shape(self): >>> s = bpd.Series([1, 4, 9, 16]) >>> s.shape (4,) - >>> s = bpd.Series(['Alice', 'Bob', bpd.NA]) + >>> s = bpd.Series(['Alice', 'Bob', pd.NA]) >>> s.shape (3,) """ @@ -866,7 +866,7 @@ def count(self): **Examples:** - >>> s = bpd.Series([0.0, 1.0, bpd.NA]) + >>> s = bpd.Series([0.0, 1.0, pd.NA]) >>> s 0 0.0 1 1.0 @@ -2517,7 +2517,7 @@ def dropna(self, *, axis=0, inplace: bool = False, how=None) -> Series: Empty strings are not considered NA values. ``None`` is considered an NA value. 
- >>> ser = bpd.Series(['2', bpd.NA, '', None, 'I stay'], dtype='object') + >>> ser = bpd.Series(['2', pd.NA, '', None, 'I stay'], dtype='object') >>> ser 0 2 1 @@ -3089,7 +3089,7 @@ def add(self, other) -> Series: **Examples:** - >>> a = bpd.Series([1, 2, 3, bpd.NA]) + >>> a = bpd.Series([1, 2, 3, pd.NA]) >>> a 0 1 1 2 @@ -4391,7 +4391,7 @@ def max( Calculating the max of a Series containing ``NA`` values: - >>> s = bpd.Series([1, 3, bpd.NA]) + >>> s = bpd.Series([1, 3, pd.NA]) >>> s 0 1 1 3 @@ -4431,7 +4431,7 @@ def min( Calculating the min of a Series containing ``NA`` values: - >>> s = bpd.Series([1, 3, bpd.NA]) + >>> s = bpd.Series([1, 3, pd.NA]) >>> s 0 1 1 3 @@ -4516,7 +4516,7 @@ def sum(self): Calculating the sum of a Series containing ``NA`` values: - >>> s = bpd.Series([1, 3, bpd.NA]) + >>> s = bpd.Series([1, 3, pd.NA]) >>> s 0 1 1 3 @@ -4550,7 +4550,7 @@ def mean(self): Calculating the mean of a Series containing ``NA`` values: - >>> s = bpd.Series([1, 3, bpd.NA]) + >>> s = bpd.Series([1, 3, pd.NA]) >>> s 0 1 1 3 @@ -5215,7 +5215,7 @@ def value_counts( **Examples:** - >>> s = bpd.Series([3, 1, 2, 3, 4, bpd.NA], dtype="Int64") + >>> s = bpd.Series([3, 1, 2, 3, 4, pd.NA], dtype="Int64") >>> s 0 3 @@ -5470,7 +5470,7 @@ def map( **Examples:** - >>> s = bpd.Series(['cat', 'dog', bpd.NA, 'rabbit']) + >>> s = bpd.Series(['cat', 'dog', pd.NA, 'rabbit']) >>> s 0 cat 1 dog diff --git a/third_party/bigframes_vendored/pandas/core/strings/accessor.py b/third_party/bigframes_vendored/pandas/core/strings/accessor.py index c9045d5e73..9a72b98aee 100644 --- a/third_party/bigframes_vendored/pandas/core/strings/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/strings/accessor.py @@ -146,7 +146,7 @@ def len(self): Returns the length (number of characters) in a string. - >>> s = bpd.Series(['dog', '', bpd.NA]) + >>> s = bpd.Series(['dog', '', pd.NA]) >>> s.str.len() 0 3 1 0 @@ -249,7 +249,7 @@ def strip(self, to_strip: typing.Optional[str] = None): ... '1. 
Ant.', ... ' 2. Bee? ', ... '\\t3. Cat!\\n', - ... bpd.NA, + ... pd.NA, ... ]) >>> s.str.strip() 0 1. Ant. @@ -535,7 +535,7 @@ def rstrip(self, to_strip: typing.Optional[str] = None): >>> import bigframes.pandas as bpd - >>> s = bpd.Series(['Ant', ' Bee ', '\tCat\n', bpd.NA]) + >>> s = bpd.Series(['Ant', ' Bee ', '\tCat\n', pd.NA]) >>> s.str.rstrip() 0 Ant 1 Bee @@ -567,7 +567,7 @@ def lstrip(self, to_strip: typing.Optional[str] = None): >>> import bigframes.pandas as bpd - >>> s = bpd.Series(['Ant', ' Bee ', '\tCat\n', bpd.NA]) + >>> s = bpd.Series(['Ant', ' Bee ', '\tCat\n', pd.NA]) >>> s.str.lstrip() 0 Ant 1 Bee @@ -817,7 +817,7 @@ def replace( as a regex. When *repl* is a string, it replaces matching regex patterns as with `re.sub()`. NaN value(s) in the Series are left as is: - >>> s = bpd.Series(['foo', 'fuz', bpd.NA]) + >>> s = bpd.Series(['foo', 'fuz', pd.NA]) >>> s.str.replace('f.', 'ba', regex=True) 0 bao 1 baz @@ -827,7 +827,7 @@ def replace( When *pat* is a string and *regex* is False, every *pat* is replaced with *repl* as with `str.replace()`: - >>> s = bpd.Series(['f.o', 'fuz', bpd.NA]) + >>> s = bpd.Series(['f.o', 'fuz', pd.NA]) >>> s.str.replace('f.', 'ba', regex=False) 0 bao 1 fuz @@ -874,7 +874,7 @@ def startswith( >>> import bigframes.pandas as bpd - >>> s = bpd.Series(['bat', 'Bear', 'caT', bpd.NA]) + >>> s = bpd.Series(['bat', 'Bear', 'caT', pd.NA]) >>> s 0 bat 1 Bear @@ -918,7 +918,7 @@ def endswith( >>> import bigframes.pandas as bpd - >>> s = bpd.Series(['bat', 'bear', 'caT', bpd.NA]) + >>> s = bpd.Series(['bat', 'bear', 'caT', pd.NA]) >>> s 0 bat 1 bear @@ -1206,7 +1206,7 @@ def zfill( >>> import bigframes.pandas as bpd - >>> s = bpd.Series(['-1', '1', '1000', bpd.NA]) + >>> s = bpd.Series(['-1', '1', '1000', pd.NA]) >>> s 0 -1 1 1 From fe54febf993205384fe5624e24681c0a10b3209f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 14 Oct 2025 21:10:01 +0000 Subject: [PATCH 46/63] fix more docs --- 
.../bigframes_vendored/pandas/core/frame.py | 16 +++++++++------- .../bigframes_vendored/pandas/core/generic.py | 7 +++---- .../pandas/core/groupby/__init__.py | 12 +++++++----- .../pandas/core/indexes/accessor.py | 9 +++++++++ .../pandas/core/indexes/base.py | 1 + 5 files changed, 29 insertions(+), 16 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index b7ba169806..49554d5861 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4735,6 +4735,7 @@ def round(self, decimals): **Examples:** + >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)], ... columns=['dogs', 'cats']) >>> df @@ -5051,7 +5052,7 @@ def prod(self, axis=0, *, numeric_only: bool = False): **Examples:** - + >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [1, 2, 3], "B": [4.5, 5.5, 6.5]}) >>> df A B @@ -5261,7 +5262,7 @@ def median(self, *, numeric_only: bool = False, exact: bool = True): **Examples:** - + >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df A B @@ -5297,6 +5298,7 @@ def quantile( **Examples:** + >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), ... columns=['a', 'b']) >>> df.quantile(.1) @@ -5544,7 +5546,7 @@ def nlargest(self, n: int, columns, keep: str = "first"): **Examples:** - + >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [1, 1, 3, 3, 5, 5], ... "B": [5, 6, 3, 4, 1, 2], ... "C": ['a', 'b', 'a', 'b', 'a', 'b']}) @@ -5634,7 +5636,7 @@ def nsmallest(self, n: int, columns, keep: str = "first"): **Examples:** - + >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [1, 1, 3, 3, 5, 5], ... "B": [5, 6, 3, 4, 1, 2], ... 
"C": ['a', 'b', 'a', 'b', 'a', 'b']}) @@ -6116,7 +6118,7 @@ def describe(self, include: None | Literal["all"] = None): **Examples:** - + >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [0, 2, 8], "C": ["cat", "cat", "dog"]}) >>> df A B C @@ -6254,7 +6256,7 @@ def pivot_table(self, values=None, index=None, columns=None, aggfunc="mean"): **Examples:** - + >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'Product': ['Product A', 'Product B', 'Product A', 'Product B', 'Product A', 'Product B'], ... 'Region': ['East', 'West', 'East', 'West', 'West', 'East'], @@ -6867,7 +6869,7 @@ def replace( **Examples:** - + >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({ ... 'int_col': [1, 1, 2, 3], ... 'string_col': ["a", "b", "c", "b"], diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 62e54e9e60..805071f810 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -102,7 +102,6 @@ def astype(self, dtype): **Examples:** - Create a DataFrame: >>> d = {'col1': [1, 2], 'col2': [3, 4]} @@ -146,7 +145,7 @@ def astype(self, dtype): Note that this is equivalent of using ``to_datetime`` with ``unit='us'``: - >>> bpd.to_datetime(ser, unit='us', utc=True) + >>> bpd.to_datetime(ser, unit='us', utc=True) # doctest: +SKIP 0 2034-02-08 11:13:20.246789+00:00 1 2021-06-19 17:20:44.123101+00:00 2 2003-06-05 17:30:34.120101+00:00 @@ -551,7 +550,7 @@ def sample( **Examples:** - + >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'num_legs': [2, 4, 8, 0], ... 'num_wings': [2, 0, 0, 0], ... 
'num_specimen_seen': [10, 2, 1, 8]}, @@ -1045,7 +1044,7 @@ def rolling( **Examples:** - + >>> import bigframes.pandas as bpd >>> s = bpd.Series([0,1,2,3,4]) >>> s.rolling(window=3).min() 0 diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index ace2f4e8a7..ba6310507d 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -44,7 +44,7 @@ def describe(self, include: None | Literal["all"] = None): **Examples:** - + >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({"A": [1, 1, 1, 2, 2], "B": [0, 2, 8, 2, 7], "C": ["cat", "cat", "dog", "mouse", "cat"]}) >>> df A B C @@ -250,7 +250,7 @@ def median( For SeriesGroupBy: - + >>> import bigframes.pandas as bpd >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 4, 3, 3], index=lst) >>> ser.groupby(level=0).median() @@ -288,6 +288,7 @@ def quantile(self, q=0.5, *, numeric_only: bool = False): **Examples:** + >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame([ ... ['a', 1], ['a', 2], ['a', 3], ... ['b', 1], ['b', 3], ['b', 5] @@ -411,7 +412,7 @@ def rank( **Examples:** - + >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame( ... { ... 
"group": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], @@ -574,6 +575,7 @@ def first(self, numeric_only: bool = False, min_count: int = -1): **Examples:** + >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame(dict(A=[1, 1, 3], B=[None, 5, 6], C=[1, 2, 3])) >>> df.groupby("A").first() B C @@ -1077,7 +1079,7 @@ def rolling(self, *args, **kwargs): **Examples:** - + >>> import bigframes.pandas as bpd >>> lst = ['a', 'a', 'a', 'a', 'e'] >>> ser = bpd.Series([1, 0, -2, -1, 2], index=lst) >>> ser.groupby(level=0).rolling(2).min() @@ -1133,7 +1135,7 @@ def expanding(self, *args, **kwargs): **Examples:** - + >>> import bigframes.pandas as bpd >>> lst = ['a', 'a', 'c', 'c', 'e'] >>> ser = bpd.Series([1, 0, -2, -1, 2], index=lst) >>> ser.groupby(level=0).expanding().min() diff --git a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py index a74a4e71e6..0cef18a2eb 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py @@ -122,6 +122,7 @@ def day_of_year(self): **Examples:** + >>> import bigframes.pandas as bpd >>> s = bpd.Series( ... pd.date_range('2016-12-28', '2017-01-03', freq='D').to_series() ... ) @@ -152,6 +153,7 @@ def date(self): **Examples:** + >>> import bigframes.pandas as bpd >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%d/%m/%Y %H:%M:%S%Ez") >>> s @@ -172,6 +174,7 @@ def hour(self): **Examples:** + >>> import bigframes.pandas as bpd >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="h") ... ) @@ -195,6 +198,7 @@ def minute(self): **Examples:** + >>> import bigframes.pandas as bpd >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="min") ... ) @@ -271,6 +275,7 @@ def second(self): **Examples:** + >>> import bigframes.pandas as bpd >>> s = bpd.Series( ... 
pd.date_range("2000-01-01", periods=3, freq="s") ... ) @@ -298,6 +303,7 @@ def time(self): **Examples:** + >>> import bigframes.pandas as bpd >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") >>> s @@ -318,6 +324,7 @@ def quarter(self): **Examples:** + >>> import bigframes.pandas as bpd >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "4/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") >>> s @@ -420,6 +427,7 @@ def tz(self): **Examples:** + >>> import bigframes.pandas as bpd >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") >>> s @@ -441,6 +449,7 @@ def unit(self) -> str: **Examples:** + >>> import bigframes.pandas as bpd >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") >>> s diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index 782c52c1d6..f7133b8c93 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -1003,6 +1003,7 @@ def drop_duplicates(self, *, keep: str = "first"): **Examples:** + >>> import bigframes.pandas as bpd Generate an pandas.Index with duplicate values. 
From 150d8beef04edb77a9097356647ef0c00be50757 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 15 Oct 2025 14:32:30 +0000 Subject: [PATCH 47/63] dont skip tests if polars isnt installed --- conftest.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/conftest.py b/conftest.py index e1f3f6d84c..41583b1a6b 100644 --- a/conftest.py +++ b/conftest.py @@ -23,16 +23,21 @@ @pytest.fixture(scope="session") -def polars_session(): - pytest.importorskip("polars") +def polars_session_or_bpd(): + # Since the doctest imports fixture is autouse=True, don't skip if polars + # isn't available. + try: + from bigframes.testing import polars_session - from bigframes.testing import polars_session + return polars_session.TestSession() + except ImportError: + import bigframes.pandas as bpd - return polars_session.TestSession() + return bpd @pytest.fixture(autouse=True) -def default_doctest_imports(doctest_namespace, polars_session): +def default_doctest_imports(doctest_namespace, polars_session_or_bpd): """ Avoid some boilerplate in pandas-inspired tests. 
@@ -41,5 +46,5 @@ def default_doctest_imports(doctest_namespace, polars_session): doctest_namespace["np"] = np doctest_namespace["pd"] = pd doctest_namespace["pa"] = pa - doctest_namespace["bpd"] = polars_session + doctest_namespace["bpd"] = polars_session_or_bpd bigframes._config.options.display.progress_bar = None From 4334a44a3b4b75c78e0b9494e356b2e2c8d71c85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 15 Oct 2025 15:11:44 +0000 Subject: [PATCH 48/63] fix more doctests --- bigframes/core/compile/polars/compiler.py | 12 +++++++++-- .../bigframes_vendored/pandas/core/frame.py | 17 ++++++++------- .../pandas/core/indexes/accessor.py | 1 + .../pandas/core/reshape/tile.py | 2 +- .../bigframes_vendored/pandas/core/series.py | 21 +++++++++---------- 5 files changed, 31 insertions(+), 22 deletions(-) diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index 059ec72076..4bc9edcb98 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -717,14 +717,22 @@ def _ordered_join( ] ) if how != "cross": + # Note: join_nulls renamed to nulls_equal for polars 1.24 + polars_version = tuple( + int(part) for part in pl.__version__.split(".") if part.isnumeric() + ) + if polars_version >= (1, 24, 0): + join_kwargs = {"nulls_equal": join_nulls} + else: + join_kwargs = {"join_nulls": join_nulls} + joined = left.join( right, how=how, left_on=left_on, right_on=right_on, - # Note: join_nulls renamed to nulls_equal for polars 1.24 - join_nulls=join_nulls, # type: ignore coalesce=False, + **join_kwargs, # type: ignore ) else: joined = left.join(right, how=how, coalesce=False) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 49554d5861..522cfbc752 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -405,6 +405,7 @@ def 
to_gbq( **Examples:** + >>> import bigframes.pandas as bpd Write a DataFrame to a BigQuery table. @@ -513,7 +514,7 @@ def to_parquet( **Examples:** - + >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> gcs_bucket = "gs://bigframes-dev-testing/sample_parquet*.parquet" >>> df.to_parquet(path=gcs_bucket) @@ -4843,14 +4844,14 @@ def apply(self, func, *, axis=0, args=(), **kwargs): to select only the necessary columns before calling `apply()`. Note: This feature is currently in **preview**. - >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") # doctest: +SKIP ... def foo(row: pd.Series) -> int: ... result = 1 ... result += row["col1"] ... result += row["col2"]*row["col2"] ... return result - >>> df[["col1", "col2"]].apply(foo, axis=1) + >>> df[["col1", "col2"]].apply(foo, axis=1) # doctest: +SKIP 0 11 1 19 dtype: Int64 @@ -4858,7 +4859,7 @@ def apply(self, func, *, axis=0, args=(), **kwargs): You could return an array output for every input row from the remote function. - >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") # doctest: +SKIP ... def marks_analyzer(marks: pd.Series) -> list[float]: ... import statistics ... average = marks.mean() @@ -4875,8 +4876,8 @@ def apply(self, func, *, axis=0, args=(), **kwargs): ... "chemistry": [88, 56, 72], ... "algebra": [78, 91, 79] ... }, index=["Alice", "Bob", "Charlie"]) - >>> stats = df.apply(marks_analyzer, axis=1) - >>> stats + >>> stats = df.apply(marks_analyzer, axis=1) # doctest: +SKIP + >>> stats # doctest: +SKIP Alice [77.67 78. 77.19 76.71] Bob [75.67 80. 74.15 72.56] Charlie [75.33 75. 
75.28 75.22] @@ -4899,14 +4900,14 @@ def apply(self, func, *, axis=0, args=(), **kwargs): [2 rows x 3 columns] - >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") # doctest: +SKIP ... def foo(x: int, y: int, z: int) -> float: ... result = 1 ... result += x ... result += y/z ... return result - >>> df.apply(foo, axis=1) + >>> df.apply(foo, axis=1) # doctest: +SKIP 0 2.6 1 3.8 dtype: Float64 diff --git a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py index 0cef18a2eb..b9eb363b29 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py @@ -97,6 +97,7 @@ def dayofyear(self): **Examples:** + >>> import bigframes.pandas as bpd >>> s = bpd.Series( ... pd.date_range('2016-12-28', '2017-01-03', freq='D').to_series() ... 
) diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index 0734bcee08..0f42433384 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -33,7 +33,7 @@ def cut( **Examples:** - + >>> import bigframes.pandas as bpd >>> s = bpd.Series([0, 1, 5, 10]) >>> s 0 0 diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 59a183d459..bec8cc1b55 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -37,7 +37,7 @@ def dt(self): **Examples:** - + >>> import bigframes.pandas as bpd >>> seconds_series = bpd.Series(pd.date_range("2000-01-01", periods=3, freq="s")) >>> seconds_series 0 2000-01-01 00:00:00 @@ -1053,6 +1053,7 @@ def duplicated(self, keep="first") -> Series: **Examples:** + >>> import bigframes.pandas as bpd By default, for each set of duplicated values, the first occurrence is set on False and all others on True: @@ -1616,7 +1617,7 @@ def nlargest( **Examples:** - + >>> import bigframes.pandas as bpd >>> countries_population = {"Italy": 59000000, "France": 65000000, ... "Malta": 434000, "Maldives": 434000, ... "Brunei": 434000, "Iceland": 337000, @@ -1700,7 +1701,7 @@ def nsmallest(self, n: int = 5, keep: str = "first") -> Series: **Examples:** - + >>> import bigframes.pandas as bpd >>> countries_population = {"Italy": 59000000, "France": 65000000, ... "Malta": 434000, "Maldives": 434000, ... 
"Brunei": 434000, "Iceland": 337000, @@ -4570,7 +4571,7 @@ def median(self, *, exact: bool = True): **Examples:** - + >>> import bigframes.pandas as bpd >>> s = bpd.Series([1, 2, 3]) >>> s.median() np.float64(2.0) @@ -4870,7 +4871,6 @@ def mask(self, cond, other): **Examples:** - >>> s = bpd.Series([10, 11, 12, 13, 14]) >>> s 0 10 @@ -4914,7 +4914,7 @@ def mask(self, cond, other): condition is evaluated based on a complicated business logic which cannot be expressed in form of a Series. - >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") # doctest: +SKIP ... def should_mask(name: str) -> bool: ... hash = 0 ... for char_ in name: @@ -4927,12 +4927,12 @@ def mask(self, cond, other): 1 Bob 2 Caroline dtype: string - >>> s.mask(should_mask) + >>> s.mask(should_mask) # doctest: +SKIP 0 1 Bob 2 Caroline dtype: string - >>> s.mask(should_mask, "REDACTED") + >>> s.mask(should_mask, "REDACTED") # doctest: +SKIP 0 REDACTED 1 Bob 2 Caroline @@ -5469,7 +5469,6 @@ def map( **Examples:** - >>> s = bpd.Series(['cat', 'dog', pd.NA, 'rabbit']) >>> s 0 cat @@ -5490,7 +5489,7 @@ def map( It also accepts a remote function: - >>> @bpd.remote_function(cloud_function_service_account="default") + >>> @bpd.remote_function(cloud_function_service_account="default") # doctest: +SKIP ... def my_mapper(val: str) -> str: ... vowels = ["a", "e", "i", "o", "u"] ... if val: @@ -5499,7 +5498,7 @@ def map( ... ]) ... 
return "N/A" - >>> s.map(my_mapper) + >>> s.map(my_mapper) # doctest: +SKIP 0 cAt 1 dOg 2 N/A From 937fff77153138dd84e3f2b7ed94526eda903a7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 15 Oct 2025 16:42:29 +0000 Subject: [PATCH 49/63] skip remote functions in Series.apply --- bigframes/core/compile/polars/compiler.py | 689 +++++++++--------- tests/system/small/test_series.py | 5 +- tests/unit/test_series_polars.py | 5 +- .../bigframes_vendored/pandas/core/series.py | 92 +-- 4 files changed, 403 insertions(+), 388 deletions(-) diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index 4bc9edcb98..6c2e324206 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -538,370 +538,379 @@ def compile_agg_op( f"Aggregate op {op} not yet supported in polars engine." ) + @dataclasses.dataclass(frozen=True) + class PolarsCompiler: + """ + Compiles ArrayValue to polars LazyFrame and executes. + + This feature is in development and is incomplete. + While most node types are supported, this has the following limitations: + 1. GBQ data sources not supported. + 2. Joins do not order rows correctly + 3. Incomplete scalar op support + 4. Incomplete aggregate op support + 5. Incomplete analytic op support + 6. Some complex windowing types not supported (eg. groupby + rolling) + 7. UDFs are not supported. + 8. Returned types may not be entirely consistent with BigQuery backend + 9. Some operations are not entirely lazy - sampling and somse windowing. + """ -@dataclasses.dataclass(frozen=True) -class PolarsCompiler: - """ - Compiles ArrayValue to polars LazyFrame and executes. - - This feature is in development and is incomplete. - While most node types are supported, this has the following limitations: - 1. GBQ data sources not supported. - 2. Joins do not order rows correctly - 3. Incomplete scalar op support - 4. Incomplete aggregate op support - 5. 
Incomplete analytic op support - 6. Some complex windowing types not supported (eg. groupby + rolling) - 7. UDFs are not supported. - 8. Returned types may not be entirely consistent with BigQuery backend - 9. Some operations are not entirely lazy - sampling and somse windowing. - """ + expr_compiler = PolarsExpressionCompiler() + agg_compiler = PolarsAggregateCompiler() + + def compile(self, plan: nodes.BigFrameNode) -> pl.LazyFrame: + if not polars_installed: + raise ValueError( + "Polars is not installed, cannot compile to polars engine." + ) - expr_compiler = PolarsExpressionCompiler() - agg_compiler = PolarsAggregateCompiler() + # TODO: Create standard way to configure BFET -> BFET rewrites + # Polars has incomplete slice support in lazy mode + node = plan + node = bigframes.core.rewrite.column_pruning(node) + node = nodes.bottom_up(node, bigframes.core.rewrite.rewrite_slice) + node = bigframes.core.rewrite.pull_out_window_order(node) + node = bigframes.core.rewrite.schema_binding.bind_schema_to_tree(node) + node = lowering.lower_ops_to_polars(node) + return self.compile_node(node) - def compile(self, plan: nodes.BigFrameNode) -> pl.LazyFrame: - if not polars_installed: - raise ValueError( - "Polars is not installed, cannot compile to polars engine." 
+ @functools.singledispatchmethod + def compile_node(self, node: nodes.BigFrameNode) -> pl.LazyFrame: + """Defines transformation but isn't cached, always use compile_node instead""" + raise ValueError(f"Can't compile unrecognized node: {node}") + + @compile_node.register + def compile_readlocal(self, node: nodes.ReadLocalNode): + cols_to_read = { + scan_item.source_id: scan_item.id.sql + for scan_item in node.scan_list.items + } + lazy_frame = cast( + pl.DataFrame, pl.from_arrow(node.local_data_source.data) + ).lazy() + lazy_frame = lazy_frame.select(cols_to_read.keys()).rename(cols_to_read) + if node.offsets_col: + lazy_frame = lazy_frame.with_columns( + [pl.int_range(pl.len(), dtype=pl.Int64).alias(node.offsets_col.sql)] + ) + return lazy_frame + + @compile_node.register + def compile_filter(self, node: nodes.FilterNode): + return self.compile_node(node.child).filter( + self.expr_compiler.compile_expression(node.predicate) ) - # TODO: Create standard way to configure BFET -> BFET rewrites - # Polars has incomplete slice support in lazy mode - node = plan - node = bigframes.core.rewrite.column_pruning(node) - node = nodes.bottom_up(node, bigframes.core.rewrite.rewrite_slice) - node = bigframes.core.rewrite.pull_out_window_order(node) - node = bigframes.core.rewrite.schema_binding.bind_schema_to_tree(node) - node = lowering.lower_ops_to_polars(node) - return self.compile_node(node) - - @functools.singledispatchmethod - def compile_node(self, node: nodes.BigFrameNode) -> pl.LazyFrame: - """Defines transformation but isn't cached, always use compile_node instead""" - raise ValueError(f"Can't compile unrecognized node: {node}") - - @compile_node.register - def compile_readlocal(self, node: nodes.ReadLocalNode): - cols_to_read = { - scan_item.source_id: scan_item.id.sql for scan_item in node.scan_list.items - } - lazy_frame = cast( - pl.DataFrame, pl.from_arrow(node.local_data_source.data) - ).lazy() - lazy_frame = 
lazy_frame.select(cols_to_read.keys()).rename(cols_to_read) - if node.offsets_col: - lazy_frame = lazy_frame.with_columns( - [pl.int_range(pl.len(), dtype=pl.Int64).alias(node.offsets_col.sql)] + @compile_node.register + def compile_orderby(self, node: nodes.OrderByNode): + frame = self.compile_node(node.child) + if len(node.by) == 0: + # pragma: no cover + return frame + return self._sort(frame, node.by) + + def _sort( + self, frame: pl.LazyFrame, by: Sequence[ordering.OrderingExpression] + ) -> pl.LazyFrame: + sorted = frame.sort( + [ + self.expr_compiler.compile_expression(by.scalar_expression) + for by in by + ], + descending=[not by.direction.is_ascending for by in by], + nulls_last=[by.na_last for by in by], + maintain_order=True, ) - return lazy_frame - - @compile_node.register - def compile_filter(self, node: nodes.FilterNode): - return self.compile_node(node.child).filter( - self.expr_compiler.compile_expression(node.predicate) - ) - - @compile_node.register - def compile_orderby(self, node: nodes.OrderByNode): - frame = self.compile_node(node.child) - if len(node.by) == 0: - # pragma: no cover - return frame - return self._sort(frame, node.by) - - def _sort( - self, frame: pl.LazyFrame, by: Sequence[ordering.OrderingExpression] - ) -> pl.LazyFrame: - sorted = frame.sort( - [self.expr_compiler.compile_expression(by.scalar_expression) for by in by], - descending=[not by.direction.is_ascending for by in by], - nulls_last=[by.na_last for by in by], - maintain_order=True, - ) - return sorted - - @compile_node.register - def compile_reversed(self, node: nodes.ReversedNode): - return self.compile_node(node.child).reverse() - - @compile_node.register - def compile_selection(self, node: nodes.SelectionNode): - return self.compile_node(node.child).select( - **{new.sql: orig.id.sql for orig, new in node.input_output_pairs} - ) - - @compile_node.register - def compile_projection(self, node: nodes.ProjectionNode): - new_cols = [] - for proj_expr, name in 
node.assignments: - bound_expr = ex.bind_schema_fields(proj_expr, node.child.field_by_id) - new_col = self.expr_compiler.compile_expression(bound_expr).alias(name.sql) - if bound_expr.output_type is None: - new_col = new_col.cast( - _bigframes_dtype_to_polars_dtype(bigframes.dtypes.DEFAULT_DTYPE) + return sorted + + @compile_node.register + def compile_reversed(self, node: nodes.ReversedNode): + return self.compile_node(node.child).reverse() + + @compile_node.register + def compile_selection(self, node: nodes.SelectionNode): + return self.compile_node(node.child).select( + **{new.sql: orig.id.sql for orig, new in node.input_output_pairs} + ) + + @compile_node.register + def compile_projection(self, node: nodes.ProjectionNode): + new_cols = [] + for proj_expr, name in node.assignments: + bound_expr = ex.bind_schema_fields(proj_expr, node.child.field_by_id) + new_col = self.expr_compiler.compile_expression(bound_expr).alias( + name.sql ) - new_cols.append(new_col) - return self.compile_node(node.child).with_columns(new_cols) - - @compile_node.register - def compile_offsets(self, node: nodes.PromoteOffsetsNode): - return self.compile_node(node.child).with_columns( - [pl.int_range(pl.len(), dtype=pl.Int64).alias(node.col_id.sql)] - ) - - @compile_node.register - def compile_join(self, node: nodes.JoinNode): - left = self.compile_node(node.left_child) - right = self.compile_node(node.right_child) - - left_on = [] - right_on = [] - for left_ex, right_ex in node.conditions: - left_ex, right_ex = lowering._coerce_comparables(left_ex, right_ex) - left_on.append(self.expr_compiler.compile_expression(left_ex)) - right_on.append(self.expr_compiler.compile_expression(right_ex)) - - if node.type == "right": + if bound_expr.output_type is None: + new_col = new_col.cast( + _bigframes_dtype_to_polars_dtype(bigframes.dtypes.DEFAULT_DTYPE) + ) + new_cols.append(new_col) + return self.compile_node(node.child).with_columns(new_cols) + + @compile_node.register + def 
compile_offsets(self, node: nodes.PromoteOffsetsNode): + return self.compile_node(node.child).with_columns( + [pl.int_range(pl.len(), dtype=pl.Int64).alias(node.col_id.sql)] + ) + + @compile_node.register + def compile_join(self, node: nodes.JoinNode): + left = self.compile_node(node.left_child) + right = self.compile_node(node.right_child) + + left_on = [] + right_on = [] + for left_ex, right_ex in node.conditions: + left_ex, right_ex = lowering._coerce_comparables(left_ex, right_ex) + left_on.append(self.expr_compiler.compile_expression(left_ex)) + right_on.append(self.expr_compiler.compile_expression(right_ex)) + + if node.type == "right": + return self._ordered_join( + right, left, "left", right_on, left_on, node.joins_nulls + ).select([id.sql for id in node.ids]) return self._ordered_join( - right, left, "left", right_on, left_on, node.joins_nulls - ).select([id.sql for id in node.ids]) - return self._ordered_join( - left, right, node.type, left_on, right_on, node.joins_nulls - ) - - @compile_node.register - def compile_isin(self, node: nodes.InNode): - left = self.compile_node(node.left_child) - right = self.compile_node(node.right_child).unique(node.right_col.id.sql) - right = right.with_columns(pl.lit(True).alias(node.indicator_col.sql)) - - left_ex, right_ex = lowering._coerce_comparables(node.left_col, node.right_col) - - left_pl_ex = self.expr_compiler.compile_expression(left_ex) - right_pl_ex = self.expr_compiler.compile_expression(right_ex) - - joined = left.join( - right, - how="left", - left_on=left_pl_ex, - right_on=right_pl_ex, - # Note: join_nulls renamed to nulls_equal for polars 1.24 - join_nulls=node.joins_nulls, # type: ignore - coalesce=False, - ) - passthrough = [pl.col(id) for id in left.columns] - indicator = pl.col(node.indicator_col.sql).fill_null(False) - return joined.select((*passthrough, indicator)) - - def _ordered_join( - self, - left_frame: pl.LazyFrame, - right_frame: pl.LazyFrame, - how: Literal["inner", "outer", "left", 
"cross"], - left_on: Sequence[pl.Expr], - right_on: Sequence[pl.Expr], - join_nulls: bool, - ): - if how == "right": - # seems to cause seg faults as of v1.30 for no apparent reason - raise ValueError("right join not supported") - left = left_frame.with_columns( - [ - pl.int_range(pl.len()).alias("_bf_join_l"), - ] - ) - right = right_frame.with_columns( - [ - pl.int_range(pl.len()).alias("_bf_join_r"), - ] - ) - if how != "cross": - # Note: join_nulls renamed to nulls_equal for polars 1.24 - polars_version = tuple( - int(part) for part in pl.__version__.split(".") if part.isnumeric() + left, right, node.type, left_on, right_on, node.joins_nulls + ) + + @compile_node.register + def compile_isin(self, node: nodes.InNode): + left = self.compile_node(node.left_child) + right = self.compile_node(node.right_child).unique(node.right_col.id.sql) + right = right.with_columns(pl.lit(True).alias(node.indicator_col.sql)) + + left_ex, right_ex = lowering._coerce_comparables( + node.left_col, node.right_col ) - if polars_version >= (1, 24, 0): - join_kwargs = {"nulls_equal": join_nulls} - else: - join_kwargs = {"join_nulls": join_nulls} + + left_pl_ex = self.expr_compiler.compile_expression(left_ex) + right_pl_ex = self.expr_compiler.compile_expression(right_ex) joined = left.join( right, - how=how, - left_on=left_on, - right_on=right_on, + how="left", + left_on=left_pl_ex, + right_on=right_pl_ex, + # Note: join_nulls renamed to nulls_equal for polars 1.24 + join_nulls=node.joins_nulls, # type: ignore coalesce=False, - **join_kwargs, # type: ignore ) - else: - joined = left.join(right, how=how, coalesce=False) - - join_order = ( - ["_bf_join_l", "_bf_join_r"] - if how != "right" - else ["_bf_join_r", "_bf_join_l"] - ) - return joined.sort(join_order, nulls_last=True).drop( - ["_bf_join_l", "_bf_join_r"] - ) - - @compile_node.register - def compile_concat(self, node: nodes.ConcatNode): - child_frames = [self.compile_node(child) for child in node.child_nodes] - child_frames = [ - 
frame.rename( - {col: id.sql for col, id in zip(frame.columns, node.output_ids)} - ).cast( - { - field.id.sql: _bigframes_dtype_to_polars_dtype(field.dtype) - for field in node.fields - } + passthrough = [pl.col(id) for id in left.columns] + indicator = pl.col(node.indicator_col.sql).fill_null(False) + return joined.select((*passthrough, indicator)) + + def _ordered_join( + self, + left_frame: pl.LazyFrame, + right_frame: pl.LazyFrame, + how: Literal["inner", "outer", "left", "cross"], + left_on: Sequence[pl.Expr], + right_on: Sequence[pl.Expr], + join_nulls: bool, + ): + if how == "right": + # seems to cause seg faults as of v1.30 for no apparent reason + raise ValueError("right join not supported") + left = left_frame.with_columns( + [ + pl.int_range(pl.len()).alias("_bf_join_l"), + ] ) - for frame in child_frames - ] - df = pl.concat(child_frames) - return df - - @compile_node.register - def compile_agg(self, node: nodes.AggregateNode): - df = self.compile_node(node.child) - if node.dropna and len(node.by_column_ids) > 0: - df = df.filter( - [pl.col(ref.id.sql).is_not_null() for ref in node.by_column_ids] + right = right_frame.with_columns( + [ + pl.int_range(pl.len()).alias("_bf_join_r"), + ] ) - if node.order_by: - df = self._sort(df, node.order_by) - return self._aggregate(df, node.aggregations, node.by_column_ids) - - def _aggregate( - self, - df: pl.LazyFrame, - aggregations: Sequence[ - Tuple[agg_expressions.Aggregation, identifiers.ColumnId] - ], - grouping_keys: Tuple[ex.DerefOp, ...], - ) -> pl.LazyFrame: - # Need to materialize columns to broadcast constants - agg_inputs = [ - list( - map( - lambda x: x.alias(guid.generate_guid()), - self.agg_compiler.get_args(agg), + if how != "cross": + # Note: join_nulls renamed to nulls_equal for polars 1.24 + polars_version = tuple( + int(part) for part in pl.__version__.split(".") if part.isnumeric() ) - ) - for agg, _ in aggregations - ] - - df_agg_inputs = df.with_columns(itertools.chain(*agg_inputs)) - - 
agg_exprs = [ - self.agg_compiler.compile_agg_op( - agg.op, list(map(lambda x: x.meta.output_name(), inputs)) - ).alias(id.sql) - for (agg, id), inputs in zip(aggregations, agg_inputs) - ] - - if len(grouping_keys) > 0: - group_exprs = [pl.col(ref.id.sql) for ref in grouping_keys] - grouped_df = df_agg_inputs.group_by(group_exprs) - return grouped_df.agg(agg_exprs).sort(group_exprs, nulls_last=True) - else: - return df_agg_inputs.select(agg_exprs) - - @compile_node.register - def compile_explode(self, node: nodes.ExplodeNode): - assert node.offsets_col is None - df = self.compile_node(node.child) - cols = [col.id.sql for col in node.column_ids] - return df.explode(cols) - - @compile_node.register - def compile_sample(self, node: nodes.RandomSampleNode): - df = self.compile_node(node.child) - # Sample is not available on lazyframe - return df.collect().sample(fraction=node.fraction).lazy() - - @compile_node.register - def compile_window(self, node: nodes.WindowOpNode): - df = self.compile_node(node.child) - - window = node.window_spec - # Should have been handled by reweriter - assert len(window.ordering) == 0 - if window.min_periods > 0: - raise NotImplementedError("min_period not yet supported for polars engine") - - if (window.bounds is None) or (window.is_unbounded): - # polars will automatically broadcast the aggregate to the matching input rows - agg_pl = self.agg_compiler.compile_agg_expr(node.expression) - if window.grouping_keys: - agg_pl = agg_pl.over( - self.expr_compiler.compile_expression(key) - for key in window.grouping_keys + if polars_version >= (1, 24, 0): + join_kwargs = {"nulls_equal": join_nulls} + else: + join_kwargs = {"join_nulls": join_nulls} + + joined = left.join( + right, + how=how, + left_on=left_on, + right_on=right_on, + coalesce=False, + **join_kwargs, # type: ignore ) - result = df.with_columns(agg_pl.alias(node.output_name.sql)) - else: # row-bounded window - window_result = self._calc_row_analytic_func( - df, node.expression, 
node.window_spec, node.output_name.sql - ) - result = pl.concat([df, window_result], how="horizontal") + else: + joined = left.join(right, how=how, coalesce=False) - # Probably easier just to pull this out as a rewriter - if ( - node.expression.op.skips_nulls - and not node.never_skip_nulls - and node.expression.column_references - ): - nullity_expr = functools.reduce( - operator.or_, - ( - pl.col(column.sql).is_null() - for column in node.expression.column_references - ), + join_order = ( + ["_bf_join_l", "_bf_join_r"] + if how != "right" + else ["_bf_join_r", "_bf_join_l"] ) - result = result.with_columns( - pl.when(nullity_expr) - .then(None) - .otherwise(pl.col(node.output_name.sql)) - .alias(node.output_name.sql) + return joined.sort(join_order, nulls_last=True).drop( + ["_bf_join_l", "_bf_join_r"] ) - return result - - def _calc_row_analytic_func( - self, - frame: pl.LazyFrame, - agg_expr: agg_expressions.Aggregation, - window: window_spec.WindowSpec, - name: str, - ) -> pl.LazyFrame: - if not isinstance(window.bounds, window_spec.RowsWindowBounds): - raise NotImplementedError("Only row bounds supported by polars engine") - groupby = None - if len(window.grouping_keys) > 0: - groupby = [ - self.expr_compiler.compile_expression(ref) - for ref in window.grouping_keys + + @compile_node.register + def compile_concat(self, node: nodes.ConcatNode): + child_frames = [self.compile_node(child) for child in node.child_nodes] + child_frames = [ + frame.rename( + {col: id.sql for col, id in zip(frame.columns, node.output_ids)} + ).cast( + { + field.id.sql: _bigframes_dtype_to_polars_dtype(field.dtype) + for field in node.fields + } + ) + for frame in child_frames ] + df = pl.concat(child_frames) + return df + + @compile_node.register + def compile_agg(self, node: nodes.AggregateNode): + df = self.compile_node(node.child) + if node.dropna and len(node.by_column_ids) > 0: + df = df.filter( + [pl.col(ref.id.sql).is_not_null() for ref in node.by_column_ids] + ) + if 
node.order_by: + df = self._sort(df, node.order_by) + return self._aggregate(df, node.aggregations, node.by_column_ids) + + def _aggregate( + self, + df: pl.LazyFrame, + aggregations: Sequence[ + Tuple[agg_expressions.Aggregation, identifiers.ColumnId] + ], + grouping_keys: Tuple[ex.DerefOp, ...], + ) -> pl.LazyFrame: + # Need to materialize columns to broadcast constants + agg_inputs = [ + list( + map( + lambda x: x.alias(guid.generate_guid()), + self.agg_compiler.get_args(agg), + ) + ) + for agg, _ in aggregations + ] + + df_agg_inputs = df.with_columns(itertools.chain(*agg_inputs)) - # Polars API semi-bounded, and any grouped rolling window challenging - # https://github.com/pola-rs/polars/issues/4799 - # https://github.com/pola-rs/polars/issues/8976 - pl_agg_expr = self.agg_compiler.compile_agg_expr(agg_expr).alias(name) - index_col_name = "_bf_pl_engine_offsets" - indexed_df = frame.with_row_index(index_col_name) - # https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.rolling.html - period_n, offset_n = _get_period_and_offset(window.bounds) - return ( - indexed_df.rolling( - index_column=index_col_name, - period=f"{period_n}i", - offset=f"{offset_n}i" if (offset_n is not None) else None, - group_by=groupby, + agg_exprs = [ + self.agg_compiler.compile_agg_op( + agg.op, list(map(lambda x: x.meta.output_name(), inputs)) + ).alias(id.sql) + for (agg, id), inputs in zip(aggregations, agg_inputs) + ] + + if len(grouping_keys) > 0: + group_exprs = [pl.col(ref.id.sql) for ref in grouping_keys] + grouped_df = df_agg_inputs.group_by(group_exprs) + return grouped_df.agg(agg_exprs).sort(group_exprs, nulls_last=True) + else: + return df_agg_inputs.select(agg_exprs) + + @compile_node.register + def compile_explode(self, node: nodes.ExplodeNode): + assert node.offsets_col is None + df = self.compile_node(node.child) + cols = [col.id.sql for col in node.column_ids] + return df.explode(cols) + + @compile_node.register + def compile_sample(self, 
node: nodes.RandomSampleNode): + df = self.compile_node(node.child) + # Sample is not available on lazyframe + return df.collect().sample(fraction=node.fraction).lazy() + + @compile_node.register + def compile_window(self, node: nodes.WindowOpNode): + df = self.compile_node(node.child) + + window = node.window_spec + # Should have been handled by reweriter + assert len(window.ordering) == 0 + if window.min_periods > 0: + raise NotImplementedError( + "min_period not yet supported for polars engine" + ) + + if (window.bounds is None) or (window.is_unbounded): + # polars will automatically broadcast the aggregate to the matching input rows + agg_pl = self.agg_compiler.compile_agg_expr(node.expression) + if window.grouping_keys: + agg_pl = agg_pl.over( + self.expr_compiler.compile_expression(key) + for key in window.grouping_keys + ) + result = df.with_columns(agg_pl.alias(node.output_name.sql)) + else: # row-bounded window + window_result = self._calc_row_analytic_func( + df, node.expression, node.window_spec, node.output_name.sql + ) + result = pl.concat([df, window_result], how="horizontal") + + # Probably easier just to pull this out as a rewriter + if ( + node.expression.op.skips_nulls + and not node.never_skip_nulls + and node.expression.column_references + ): + nullity_expr = functools.reduce( + operator.or_, + ( + pl.col(column.sql).is_null() + for column in node.expression.column_references + ), + ) + result = result.with_columns( + pl.when(nullity_expr) + .then(None) + .otherwise(pl.col(node.output_name.sql)) + .alias(node.output_name.sql) + ) + return result + + def _calc_row_analytic_func( + self, + frame: pl.LazyFrame, + agg_expr: agg_expressions.Aggregation, + window: window_spec.WindowSpec, + name: str, + ) -> pl.LazyFrame: + if not isinstance(window.bounds, window_spec.RowsWindowBounds): + raise NotImplementedError("Only row bounds supported by polars engine") + groupby = None + if len(window.grouping_keys) > 0: + groupby = [ + 
self.expr_compiler.compile_expression(ref) + for ref in window.grouping_keys + ] + + # Polars API semi-bounded, and any grouped rolling window challenging + # https://github.com/pola-rs/polars/issues/4799 + # https://github.com/pola-rs/polars/issues/8976 + pl_agg_expr = self.agg_compiler.compile_agg_expr(agg_expr).alias(name) + index_col_name = "_bf_pl_engine_offsets" + indexed_df = frame.with_row_index(index_col_name) + # https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.rolling.html + period_n, offset_n = _get_period_and_offset(window.bounds) + return ( + indexed_df.rolling( + index_column=index_col_name, + period=f"{period_n}i", + offset=f"{offset_n}i" if (offset_n is not None) else None, + group_by=groupby, + ) + .agg(pl_agg_expr) + .select(name) ) - .agg(pl_agg_expr) - .select(name) - ) def _get_period_and_offset( diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 65b170df32..df538329ce 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1979,7 +1979,10 @@ def test_series_small_repr(scalars_dfs): col_name = "int64_col" bf_series = scalars_df[col_name] pd_series = scalars_pandas_df[col_name] - assert repr(bf_series) == pd_series.to_string(length=False, dtype=True, name=True) + with bigframes.pandas.option_context("display.repr_mode", "head"): + assert repr(bf_series) == pd_series.to_string( + length=False, dtype=True, name=True + ) def test_sum(scalars_dfs): diff --git a/tests/unit/test_series_polars.py b/tests/unit/test_series_polars.py index 64814126ea..e862b6b41e 100644 --- a/tests/unit/test_series_polars.py +++ b/tests/unit/test_series_polars.py @@ -2009,7 +2009,10 @@ def test_series_small_repr(scalars_dfs): col_name = "int64_col" bf_series = scalars_df[col_name] pd_series = scalars_pandas_df[col_name] - assert repr(bf_series) == pd_series.to_string(length=False, dtype=True, name=True) + with bigframes.pandas.option_context("display.repr_mode", 
"head"): + assert repr(bf_series) == pd_series.to_string( + length=False, dtype=True, name=True + ) def test_sum(scalars_dfs): diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index bec8cc1b55..223ad61735 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -1787,14 +1787,47 @@ def apply( **Examples:** + Simple vectorized functions, lambdas or ufuncs can be applied directly + with `by_row=False`. + + >>> nums = bpd.Series([1, 2, 3, 4]) + >>> nums + 0 1 + 1 2 + 2 3 + 3 4 + dtype: Int64 + >>> nums.apply(lambda x: x*x + 2*x + 1, by_row=False) + 0 4 + 1 9 + 2 16 + 3 25 + dtype: Int64 + + >>> def is_odd(num): + ... return num % 2 == 1 + >>> nums.apply(is_odd, by_row=False) + 0 True + 1 False + 2 True + 3 False + dtype: boolean - For applying arbitrary python function a `remote_function` is recommended. - Let's use ``reuse=False`` flag to make sure a new `remote_function` - is created every time we run the following code, but you can skip it - to potentially reuse a previously deployed `remote_function` from - the same user defined function. + >>> nums.apply(np.log, by_row=False) + 0 0.0 + 1 0.693147 + 2 1.098612 + 3 1.386294 + dtype: Float64 + + Use `remote_function` to apply an arbitrary Python function. + Set ``reuse=False`` flag to make sure a new `remote_function` + is created every time you run the following code. Omit it + to reuse a previously deployed `remote_function` from + the same user defined function if the hash of the function definition + hasn't changed. - >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") # doctest: +SKIP ... def minutes_to_hours(x: int) -> float: ... 
return x/60 @@ -1807,8 +1840,8 @@ def apply( 4 120 dtype: Int64 - >>> hours = minutes.apply(minutes_to_hours) - >>> hours + >>> hours = minutes.apply(minutes_to_hours) # doctest: +SKIP + >>> hours # doctest: +SKIP 0 0.0 1 0.5 2 1.0 @@ -1820,7 +1853,7 @@ def apply( a `remote_function`, you would provide the names of the packages via `packages` param. - >>> @bpd.remote_function( + >>> @bpd.remote_function( # doctest: +SKIP ... reuse=False, ... packages=["cryptography"], ... cloud_function_service_account="default" @@ -1837,11 +1870,11 @@ def apply( ... return f.encrypt(input.encode()).decode() >>> names = bpd.Series(["Alice", "Bob"]) - >>> hashes = names.apply(get_hash) + >>> hashes = names.apply(get_hash) # doctest: +SKIP You could return an array output from the remote function. - >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") # doctest: +SKIP ... def text_analyzer(text: str) -> list[int]: ... words = text.count(" ") + 1 ... periods = text.count(".") @@ -1854,46 +1887,13 @@ def apply( ... "I love this product! It's amazing.", ... "Hungry? Wanna eat? Lets go!" ... ]) - >>> features = texts.apply(text_analyzer) - >>> features + >>> features = texts.apply(text_analyzer) # doctest: +SKIP + >>> features # doctest: +SKIP 0 [9 1 0 0] 1 [6 1 1 0] 2 [5 0 1 2] dtype: list[pyarrow] - Simple vectorized functions, lambdas or ufuncs can be applied directly - with `by_row=False`. - - >>> nums = bpd.Series([1, 2, 3, 4]) - >>> nums - 0 1 - 1 2 - 2 3 - 3 4 - dtype: Int64 - >>> nums.apply(lambda x: x*x + 2*x + 1, by_row=False) - 0 4 - 1 9 - 2 16 - 3 25 - dtype: Int64 - - >>> def is_odd(num): - ... 
return num % 2 == 1 - >>> nums.apply(is_odd, by_row=False) - 0 True - 1 False - 2 True - 3 False - dtype: boolean - - >>> nums.apply(np.log, by_row=False) - 0 0.0 - 1 0.693147 - 2 1.098612 - 3 1.386294 - dtype: Float64 - Args: func (function): BigFrames DataFrames ``remote_function`` to apply. The function From 358fc0ea40f8fa9a3e8f6fb74f69eb9317292e08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 15 Oct 2025 17:04:18 +0000 Subject: [PATCH 50/63] feat: implement cos, sin, and log operations for polars compiler --- bigframes/core/compile/polars/__init__.py | 1 + .../compile/polars/operations/numeric_ops.py | 91 +++++++++++++++++++ tests/unit/test_series_polars.py | 17 ++-- 3 files changed, 98 insertions(+), 11 deletions(-) create mode 100644 bigframes/core/compile/polars/operations/numeric_ops.py diff --git a/bigframes/core/compile/polars/__init__.py b/bigframes/core/compile/polars/__init__.py index 7ae6fcc755..ba9c146592 100644 --- a/bigframes/core/compile/polars/__init__.py +++ b/bigframes/core/compile/polars/__init__.py @@ -24,6 +24,7 @@ # polars shouldn't be needed at import time, as register is a no-op if polars # isn't installed. import bigframes.core.compile.polars.operations.generic_ops # noqa: F401 +import bigframes.core.compile.polars.operations.numeric_ops # noqa: F401 try: import bigframes._importing diff --git a/bigframes/core/compile/polars/operations/numeric_ops.py b/bigframes/core/compile/polars/operations/numeric_ops.py new file mode 100644 index 0000000000..83c5dc3545 --- /dev/null +++ b/bigframes/core/compile/polars/operations/numeric_ops.py @@ -0,0 +1,91 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +BigFrames -> Polars compilation for the operations in bigframes.operations.numeric_ops. + +Please keep implementations in sequential order by op name. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import bigframes.core.compile.polars.compiler as polars_compiler +from bigframes.operations import numeric_ops + +if TYPE_CHECKING: + import polars as pl + + +@polars_compiler.register_op(numeric_ops.CosOp) +def cos_op_impl( + compiler: polars_compiler.PolarsExpressionCompiler, + op: numeric_ops.CosOp, # type: ignore + input: pl.Expr, +) -> pl.Expr: + return input.cos() + + +@polars_compiler.register_op(numeric_ops.LnOp) +def ln_op_impl( + compiler: polars_compiler.PolarsExpressionCompiler, + op: numeric_ops.LnOp, # type: ignore + input: pl.Expr, +) -> pl.Expr: + import polars as pl + + return pl.when(input < 0).then(float("nan")).otherwise(input.log()) + + +@polars_compiler.register_op(numeric_ops.Log10Op) +def log10_op_impl( + compiler: polars_compiler.PolarsExpressionCompiler, + op: numeric_ops.Log10Op, # type: ignore + input: pl.Expr, +) -> pl.Expr: + import polars as pl + + return pl.when(input < 0).then(float("nan")).otherwise(input.log(base=10)) + + +@polars_compiler.register_op(numeric_ops.Log1pOp) +def log1p_op_impl( + compiler: polars_compiler.PolarsExpressionCompiler, + op: numeric_ops.Log1pOp, # type: ignore + input: pl.Expr, +) -> pl.Expr: + import polars as pl + + return pl.when(input < -1).then(float("nan")).otherwise((input + 1).log()) + + +@polars_compiler.register_op(numeric_ops.SinOp) +def sin_op_impl( 
+ compiler: polars_compiler.PolarsExpressionCompiler, + op: numeric_ops.SinOp, # type: ignore + input: pl.Expr, +) -> pl.Expr: + return input.sin() + + +@polars_compiler.register_op(numeric_ops.SqrtOp) +def sqrt_op_impl( + compiler: polars_compiler.PolarsExpressionCompiler, + op: numeric_ops.SqrtOp, # type: ignore + input: pl.Expr, +) -> pl.Expr: + import polars as pl + + return pl.when(input < 0).then(float("nan")).otherwise(input.sqrt()) diff --git a/tests/unit/test_series_polars.py b/tests/unit/test_series_polars.py index 64814126ea..ee4ac245d3 100644 --- a/tests/unit/test_series_polars.py +++ b/tests/unit/test_series_polars.py @@ -4622,20 +4622,15 @@ def test_apply_lambda(scalars_dfs, col, lambda_): ) -@pytest.mark.skip( - reason="NotImplementedError: Polars compiler hasn't implemented log()" -) @pytest.mark.parametrize( ("ufunc",), [ - pytest.param(numpy.log), - pytest.param(numpy.sqrt), - pytest.param(numpy.sin), - ], - ids=[ - "log", - "sqrt", - "sin", + pytest.param(numpy.cos, id="cos"), + pytest.param(numpy.log, id="log"), + pytest.param(numpy.log10, id="log10"), + pytest.param(numpy.log1p, id="log1p"), + pytest.param(numpy.sqrt, id="sqrt"), + pytest.param(numpy.sin, id="sin"), ], ) def test_apply_numpy_ufunc(scalars_dfs, ufunc): From 42c858659cf21f8d29955786c5e9ada3225aa447 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 15 Oct 2025 17:42:32 +0000 Subject: [PATCH 51/63] fix domain for log --- bigframes/core/compile/polars/operations/numeric_ops.py | 8 ++++---- bigframes/core/compile/sqlglot/expressions/numeric_ops.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/bigframes/core/compile/polars/operations/numeric_ops.py b/bigframes/core/compile/polars/operations/numeric_ops.py index 83c5dc3545..83eee355fe 100644 --- a/bigframes/core/compile/polars/operations/numeric_ops.py +++ b/bigframes/core/compile/polars/operations/numeric_ops.py @@ -46,7 +46,7 @@ def ln_op_impl( ) -> pl.Expr: import polars as pl - 
return pl.when(input < 0).then(float("nan")).otherwise(input.log()) + return pl.when(input <= 0).then(float("nan")).otherwise(input.log()) @polars_compiler.register_op(numeric_ops.Log10Op) @@ -57,7 +57,7 @@ def log10_op_impl( ) -> pl.Expr: import polars as pl - return pl.when(input < 0).then(float("nan")).otherwise(input.log(base=10)) + return pl.when(input <= 0).then(float("nan")).otherwise(input.log(base=10)) @polars_compiler.register_op(numeric_ops.Log1pOp) @@ -68,7 +68,7 @@ def log1p_op_impl( ) -> pl.Expr: import polars as pl - return pl.when(input < -1).then(float("nan")).otherwise((input + 1).log()) + return pl.when(input <= -1).then(float("nan")).otherwise((input + 1).log()) @polars_compiler.register_op(numeric_ops.SinOp) @@ -88,4 +88,4 @@ def sqrt_op_impl( ) -> pl.Expr: import polars as pl - return pl.when(input < 0).then(float("nan")).otherwise(input.sqrt()) + return pl.when(input <= 0).then(float("nan")).otherwise(input.sqrt()) diff --git a/bigframes/core/compile/sqlglot/expressions/numeric_ops.py b/bigframes/core/compile/sqlglot/expressions/numeric_ops.py index d86df93921..7b8cd4a058 100644 --- a/bigframes/core/compile/sqlglot/expressions/numeric_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/numeric_ops.py @@ -158,7 +158,7 @@ def _(expr: TypedExpr) -> sge.Expression: return sge.Case( ifs=[ sge.If( - this=expr.expr < sge.convert(0), + this=expr.expr <= sge.convert(0), true=constants._NAN, ) ], @@ -171,7 +171,7 @@ def _(expr: TypedExpr) -> sge.Expression: return sge.Case( ifs=[ sge.If( - this=expr.expr < sge.convert(0), + this=expr.expr <= sge.convert(0), true=constants._NAN, ) ], @@ -184,7 +184,7 @@ def _(expr: TypedExpr) -> sge.Expression: return sge.Case( ifs=[ sge.If( - this=expr.expr < sge.convert(-1), + this=expr.expr <= sge.convert(-1), true=constants._NAN, ) ], @@ -207,7 +207,7 @@ def _(expr: TypedExpr) -> sge.Expression: return sge.Case( ifs=[ sge.If( - this=expr.expr < sge.convert(0), + this=expr.expr <= sge.convert(0), 
true=constants._NAN, ) ], From 22e6fb241b588a74210728612ce552ee60422f46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 15 Oct 2025 17:45:30 +0000 Subject: [PATCH 52/63] update snapshot --- .../expressions/snapshots/test_numeric_ops/test_ln/out.sql | 2 +- .../expressions/snapshots/test_numeric_ops/test_log10/out.sql | 2 +- .../expressions/snapshots/test_numeric_ops/test_log1p/out.sql | 2 +- .../expressions/snapshots/test_numeric_ops/test_sqrt/out.sql | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_ln/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_ln/out.sql index 1372c088d9..5d3d1ae09b 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_ln/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_ln/out.sql @@ -5,7 +5,7 @@ WITH `bfcte_0` AS ( ), `bfcte_1` AS ( SELECT *, - CASE WHEN `bfcol_0` < 0 THEN CAST('NaN' AS FLOAT64) ELSE LN(`bfcol_0`) END AS `bfcol_1` + CASE WHEN `bfcol_0` <= 0 THEN CAST('NaN' AS FLOAT64) ELSE LN(`bfcol_0`) END AS `bfcol_1` FROM `bfcte_0` ) SELECT diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_log10/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_log10/out.sql index b4cced439b..532776278d 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_log10/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_log10/out.sql @@ -5,7 +5,7 @@ WITH `bfcte_0` AS ( ), `bfcte_1` AS ( SELECT *, - CASE WHEN `bfcol_0` < 0 THEN CAST('NaN' AS FLOAT64) ELSE LOG(10, `bfcol_0`) END AS `bfcol_1` + CASE WHEN `bfcol_0` <= 0 THEN CAST('NaN' AS FLOAT64) ELSE LOG(10, `bfcol_0`) END AS `bfcol_1` FROM `bfcte_0` ) SELECT diff --git 
a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_log1p/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_log1p/out.sql index c3902ec174..3904025cf8 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_log1p/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_log1p/out.sql @@ -5,7 +5,7 @@ WITH `bfcte_0` AS ( ), `bfcte_1` AS ( SELECT *, - CASE WHEN `bfcol_0` < -1 THEN CAST('NaN' AS FLOAT64) ELSE LN(1 + `bfcol_0`) END AS `bfcol_1` + CASE WHEN `bfcol_0` <= -1 THEN CAST('NaN' AS FLOAT64) ELSE LN(1 + `bfcol_0`) END AS `bfcol_1` FROM `bfcte_0` ) SELECT diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sqrt/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sqrt/out.sql index e6a93e5e6c..cd2b19a7a8 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sqrt/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sqrt/out.sql @@ -5,7 +5,7 @@ WITH `bfcte_0` AS ( ), `bfcte_1` AS ( SELECT *, - CASE WHEN `bfcol_0` < 0 THEN CAST('NaN' AS FLOAT64) ELSE SQRT(`bfcol_0`) END AS `bfcol_1` + CASE WHEN `bfcol_0` <= 0 THEN CAST('NaN' AS FLOAT64) ELSE SQRT(`bfcol_0`) END AS `bfcol_1` FROM `bfcte_0` ) SELECT From 1157f41a598531b11073141355b4f2649cb86428 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 15 Oct 2025 17:42:32 +0000 Subject: [PATCH 53/63] fix domain for log --- bigframes/core/compile/polars/operations/numeric_ops.py | 8 ++++---- bigframes/core/compile/sqlglot/expressions/numeric_ops.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/bigframes/core/compile/polars/operations/numeric_ops.py b/bigframes/core/compile/polars/operations/numeric_ops.py index 83c5dc3545..83eee355fe 100644 --- a/bigframes/core/compile/polars/operations/numeric_ops.py +++ 
b/bigframes/core/compile/polars/operations/numeric_ops.py @@ -46,7 +46,7 @@ def ln_op_impl( ) -> pl.Expr: import polars as pl - return pl.when(input < 0).then(float("nan")).otherwise(input.log()) + return pl.when(input <= 0).then(float("nan")).otherwise(input.log()) @polars_compiler.register_op(numeric_ops.Log10Op) @@ -57,7 +57,7 @@ def log10_op_impl( ) -> pl.Expr: import polars as pl - return pl.when(input < 0).then(float("nan")).otherwise(input.log(base=10)) + return pl.when(input <= 0).then(float("nan")).otherwise(input.log(base=10)) @polars_compiler.register_op(numeric_ops.Log1pOp) @@ -68,7 +68,7 @@ def log1p_op_impl( ) -> pl.Expr: import polars as pl - return pl.when(input < -1).then(float("nan")).otherwise((input + 1).log()) + return pl.when(input <= -1).then(float("nan")).otherwise((input + 1).log()) @polars_compiler.register_op(numeric_ops.SinOp) @@ -88,4 +88,4 @@ def sqrt_op_impl( ) -> pl.Expr: import polars as pl - return pl.when(input < 0).then(float("nan")).otherwise(input.sqrt()) + return pl.when(input <= 0).then(float("nan")).otherwise(input.sqrt()) diff --git a/bigframes/core/compile/sqlglot/expressions/numeric_ops.py b/bigframes/core/compile/sqlglot/expressions/numeric_ops.py index d86df93921..7b8cd4a058 100644 --- a/bigframes/core/compile/sqlglot/expressions/numeric_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/numeric_ops.py @@ -158,7 +158,7 @@ def _(expr: TypedExpr) -> sge.Expression: return sge.Case( ifs=[ sge.If( - this=expr.expr < sge.convert(0), + this=expr.expr <= sge.convert(0), true=constants._NAN, ) ], @@ -171,7 +171,7 @@ def _(expr: TypedExpr) -> sge.Expression: return sge.Case( ifs=[ sge.If( - this=expr.expr < sge.convert(0), + this=expr.expr <= sge.convert(0), true=constants._NAN, ) ], @@ -184,7 +184,7 @@ def _(expr: TypedExpr) -> sge.Expression: return sge.Case( ifs=[ sge.If( - this=expr.expr < sge.convert(-1), + this=expr.expr <= sge.convert(-1), true=constants._NAN, ) ], @@ -207,7 +207,7 @@ def _(expr: TypedExpr) -> 
sge.Expression: return sge.Case( ifs=[ sge.If( - this=expr.expr < sge.convert(0), + this=expr.expr <= sge.convert(0), true=constants._NAN, ) ], From f36cce2028bfa46237d8f2c66ed7af55a7575baf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 15 Oct 2025 17:45:30 +0000 Subject: [PATCH 54/63] update snapshot --- .../expressions/snapshots/test_numeric_ops/test_ln/out.sql | 2 +- .../expressions/snapshots/test_numeric_ops/test_log10/out.sql | 2 +- .../expressions/snapshots/test_numeric_ops/test_log1p/out.sql | 2 +- .../expressions/snapshots/test_numeric_ops/test_sqrt/out.sql | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_ln/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_ln/out.sql index 1372c088d9..5d3d1ae09b 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_ln/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_ln/out.sql @@ -5,7 +5,7 @@ WITH `bfcte_0` AS ( ), `bfcte_1` AS ( SELECT *, - CASE WHEN `bfcol_0` < 0 THEN CAST('NaN' AS FLOAT64) ELSE LN(`bfcol_0`) END AS `bfcol_1` + CASE WHEN `bfcol_0` <= 0 THEN CAST('NaN' AS FLOAT64) ELSE LN(`bfcol_0`) END AS `bfcol_1` FROM `bfcte_0` ) SELECT diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_log10/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_log10/out.sql index b4cced439b..532776278d 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_log10/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_log10/out.sql @@ -5,7 +5,7 @@ WITH `bfcte_0` AS ( ), `bfcte_1` AS ( SELECT *, - CASE WHEN `bfcol_0` < 0 THEN CAST('NaN' AS FLOAT64) ELSE LOG(10, `bfcol_0`) END AS `bfcol_1` + CASE WHEN `bfcol_0` <= 0 THEN CAST('NaN' AS FLOAT64) ELSE LOG(10, `bfcol_0`) END AS 
`bfcol_1` FROM `bfcte_0` ) SELECT diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_log1p/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_log1p/out.sql index c3902ec174..3904025cf8 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_log1p/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_log1p/out.sql @@ -5,7 +5,7 @@ WITH `bfcte_0` AS ( ), `bfcte_1` AS ( SELECT *, - CASE WHEN `bfcol_0` < -1 THEN CAST('NaN' AS FLOAT64) ELSE LN(1 + `bfcol_0`) END AS `bfcol_1` + CASE WHEN `bfcol_0` <= -1 THEN CAST('NaN' AS FLOAT64) ELSE LN(1 + `bfcol_0`) END AS `bfcol_1` FROM `bfcte_0` ) SELECT diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sqrt/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sqrt/out.sql index e6a93e5e6c..cd2b19a7a8 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sqrt/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sqrt/out.sql @@ -5,7 +5,7 @@ WITH `bfcte_0` AS ( ), `bfcte_1` AS ( SELECT *, - CASE WHEN `bfcol_0` < 0 THEN CAST('NaN' AS FLOAT64) ELSE SQRT(`bfcol_0`) END AS `bfcol_1` + CASE WHEN `bfcol_0` <= 0 THEN CAST('NaN' AS FLOAT64) ELSE SQRT(`bfcol_0`) END AS `bfcol_1` FROM `bfcte_0` ) SELECT From 7ed902974c8656b040852ffbef85dd595cab6eae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 15 Oct 2025 17:49:00 +0000 Subject: [PATCH 55/63] revert sqrt change --- bigframes/core/compile/polars/operations/numeric_ops.py | 2 +- bigframes/core/compile/sqlglot/expressions/numeric_ops.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/core/compile/polars/operations/numeric_ops.py b/bigframes/core/compile/polars/operations/numeric_ops.py index 83eee355fe..2572d862e3 100644 --- 
a/bigframes/core/compile/polars/operations/numeric_ops.py +++ b/bigframes/core/compile/polars/operations/numeric_ops.py @@ -88,4 +88,4 @@ def sqrt_op_impl( ) -> pl.Expr: import polars as pl - return pl.when(input <= 0).then(float("nan")).otherwise(input.sqrt()) + return pl.when(input < 0).then(float("nan")).otherwise(input.sqrt()) diff --git a/bigframes/core/compile/sqlglot/expressions/numeric_ops.py b/bigframes/core/compile/sqlglot/expressions/numeric_ops.py index 7b8cd4a058..ac40e4a667 100644 --- a/bigframes/core/compile/sqlglot/expressions/numeric_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/numeric_ops.py @@ -207,7 +207,7 @@ def _(expr: TypedExpr) -> sge.Expression: return sge.Case( ifs=[ sge.If( - this=expr.expr <= sge.convert(0), + this=expr.expr < sge.convert(0), true=constants._NAN, ) ], From 2b97c2b4c145c677cdaca50fa37256e835e94df1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 15 Oct 2025 17:49:45 +0000 Subject: [PATCH 56/63] revert sqrt change --- .../expressions/snapshots/test_numeric_ops/test_sqrt/out.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sqrt/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sqrt/out.sql index cd2b19a7a8..e6a93e5e6c 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sqrt/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sqrt/out.sql @@ -5,7 +5,7 @@ WITH `bfcte_0` AS ( ), `bfcte_1` AS ( SELECT *, - CASE WHEN `bfcol_0` <= 0 THEN CAST('NaN' AS FLOAT64) ELSE SQRT(`bfcol_0`) END AS `bfcol_1` + CASE WHEN `bfcol_0` < 0 THEN CAST('NaN' AS FLOAT64) ELSE SQRT(`bfcol_0`) END AS `bfcol_1` FROM `bfcte_0` ) SELECT From 7dc6db718d3f1673e03ba9e745e93e56a86f1575 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 15 Oct 2025 19:42:49 +0000 Subject: [PATCH 57/63] fix more samples 
--- bigframes/dataframe.py | 2 +- bigframes/operations/strings.py | 2 +- bigframes/series.py | 2 +- bigframes/session/__init__.py | 22 +++++++------- .../pandas/core/config_init.py | 1 + .../bigframes_vendored/pandas/core/series.py | 29 ++++++++++--------- .../pandas/core/tools/timedeltas.py | 2 +- 7 files changed, 31 insertions(+), 29 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 65e2d50582..ec458cc462 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -4247,7 +4247,7 @@ def _resample( **Examples:** - + >>> import bigframes.pandas as bpd >>> data = { ... "timestamp_col": pd.date_range( ... start="2021-01-01 13:00:00", periods=30, freq="1s" diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index bc07ffaee0..efbdd865b0 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -67,7 +67,7 @@ def reverse(self) -> series.Series: **Examples:** - + >>> import bigframes.pandas as bpd >>> s = bpd.Series(["apple", "banana", "", pd.NA]) >>> s.str.reverse() 0 elppa diff --git a/bigframes/series.py b/bigframes/series.py index 6ebd129c7c..642e574627 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -2416,7 +2416,7 @@ def _resample( **Examples:** - + >>> import bigframes.pandas as bpd >>> data = { ... "timestamp_col": pd.date_range( ... start="2021-01-01 13:00:00", periods=30, freq="1s" diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index f82f3c5003..6418f2b78f 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -617,9 +617,9 @@ def read_gbq_query( **Examples:** - Simple query input: + >>> import bigframes.pandas as bpd >>> df = bpd.read_gbq_query(''' ... SELECT ... pitcherFirstName, @@ -771,9 +771,9 @@ def read_gbq_table( **Examples:** - Read a whole table, with arbitrary ordering or ordering corresponding to the primary key(s). 
+ >>> import bigframes.pandas as bpd >>> df = bpd.read_gbq_table("bigquery-public-data.ml_datasets.penguins") See also: :meth:`Session.read_gbq`. @@ -875,9 +875,9 @@ def read_gbq_model(self, model_name: str): **Examples:** - Read an existing BigQuery ML model. + >>> import bigframes.pandas as bpd >>> model_name = "bigframes-dev.bqml_tutorial.penguins_model" >>> model = bpd.read_gbq_model(model_name) @@ -1872,7 +1872,7 @@ def udf( You can clean-up the BigQuery functions created above using the BigQuery client from the BigQuery DataFrames session: - >>> session = bpd.get_global_session() + >>> session = bpd.get_global_session() # doctest: +SKIP >>> session.bqclient.delete_routine(minutes_to_hours.bigframes_bigquery_function) # doctest: +SKIP >>> session.bqclient.delete_routine(get_hash.bigframes_bigquery_function) # doctest: +SKIP @@ -1980,10 +1980,10 @@ def read_gbq_function( **Examples:** - Use the [cw_lower_case_ascii_only](https://github.com/GoogleCloudPlatform/bigquery-utils/blob/master/udfs/community/README.md#cw_lower_case_ascii_onlystr-string) function from Community UDFs. + >>> import bigframes.pandas as bpd >>> func = bpd.read_gbq_function("bqutil.fn.cw_lower_case_ascii_only") You can run it on scalar input. Usually you would do so to verify that @@ -2043,13 +2043,13 @@ def read_gbq_function( Another use case is to define your own remote function and use it later. For example, define the remote function: - >>> @bpd.remote_function(cloud_function_service_account="default") + >>> @bpd.remote_function(cloud_function_service_account="default") # doctest: +SKIP ... def tenfold(num: int) -> float: ... return num * 10 Then, read back the deployed BQ remote function: - >>> tenfold_ref = bpd.read_gbq_function( + >>> tenfold_ref = bpd.read_gbq_function( # doctest: +SKIP ... tenfold.bigframes_remote_function, ... 
) @@ -2061,7 +2061,7 @@ def read_gbq_function( [2 rows x 3 columns] - >>> df['a'].apply(tenfold_ref) + >>> df['a'].apply(tenfold_ref) # doctest: +SKIP 0 10.0 1 20.0 Name: a, dtype: Float64 @@ -2070,11 +2070,11 @@ def read_gbq_function( note, row processor implies that the function has only one input parameter. - >>> @bpd.remote_function(cloud_function_service_account="default") + >>> @bpd.remote_function(cloud_function_service_account="default") # doctest: +SKIP ... def row_sum(s: pd.Series) -> float: ... return s['a'] + s['b'] + s['c'] - >>> row_sum_ref = bpd.read_gbq_function( + >>> row_sum_ref = bpd.read_gbq_function( # doctest: +SKIP ... row_sum.bigframes_remote_function, ... is_row_processor=True, ... ) @@ -2087,7 +2087,7 @@ def read_gbq_function( [2 rows x 3 columns] - >>> df.apply(row_sum_ref, axis=1) + >>> df.apply(row_sum_ref, axis=1) # doctest: +SKIP 0 9.0 1 12.0 dtype: Float64 diff --git a/third_party/bigframes_vendored/pandas/core/config_init.py b/third_party/bigframes_vendored/pandas/core/config_init.py index 20da78e094..dc2b11ab94 100644 --- a/third_party/bigframes_vendored/pandas/core/config_init.py +++ b/third_party/bigframes_vendored/pandas/core/config_init.py @@ -19,6 +19,7 @@ Define Repr mode to "deferred" will prevent job execution in repr. + >>> import bigframes.pandas as bpd >>> df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins") >>> bpd.options.display.repr_mode = "deferred" diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 223ad61735..b494252bdf 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -986,9 +986,9 @@ def drop_duplicates( **Examples:** - Generate a Series with duplicated entries. + >>> import bigframes.pandas as bpd >>> s = bpd.Series(['llama', 'cow', 'llama', 'beetle', 'llama', 'hippo'], ... 
name='animal') >>> s @@ -1176,7 +1176,7 @@ def round(self, decimals: int = 0) -> Series: **Examples:** - + >>> import bigframes.pandas as bpd >>> s = bpd.Series([0.1, 1.3, 2.7]) >>> s.round() 0 0.0 @@ -1283,17 +1283,17 @@ def autocorr(self, lag: int = 1) -> float: >>> s = bpd.Series([0.25, 0.5, 0.2, -0.05]) - >>> s.autocorr() # doctest: +ELLIPSIS - np.float64(0.10355263309024067) + >>> s.autocorr() + 0.10355263309024065 >>> s.autocorr(lag=2) - np.float64(-1.0) + -1.0 If the Pearson correlation is not well defined, then 'NaN' is returned. >>> s = bpd.Series([1, 0, 0, 0]) >>> s.autocorr() - np.float64(nan) + nan Args: lag (int, default 1): @@ -1927,10 +1927,10 @@ def combine( **Examples:** - Consider 2 Datasets ``s1`` and ``s2`` containing highest clocked speeds of different birds. + >>> import bigframes.pandas as bpd >>> s1 = bpd.Series({'falcon': 330.0, 'eagle': 160.0}) >>> s1 falcon 330.0 @@ -2376,7 +2376,7 @@ def replace( **Examples:** - + >>> import bigframes.pandas as bpd >>> s = bpd.Series([1, 2, 3, 4, 5]) >>> s 0 1 @@ -2684,7 +2684,7 @@ def cumprod(self): **Examples:** - + >>> import bigframes.pandas as bpd >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s 0 2.0 @@ -3973,7 +3973,7 @@ def pow(self, other) -> Series: **Examples:** - + >>> import bigframes.pandas as bpd >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -4006,6 +4006,7 @@ def pow(self, other) -> Series: The result of the operation. """ + # TODO(b/452366836): adjust sample if needed to match pyarrow semantics. 
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def __pow__(self, other): @@ -4055,7 +4056,7 @@ def rpow(self, other) -> Series: **Examples:** - + >>> import bigframes.pandas as bpd >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -4610,7 +4611,7 @@ def quantile( **Examples:** - + >>> import bigframes.pandas as bpd >>> s = bpd.Series([1, 2, 3, 4]) >>> s.quantile(.5) np.float64(2.5) @@ -5290,7 +5291,7 @@ def str(self): **Examples:** - + >>> import bigframes.pandas as bpd >>> s = bpd.Series(["A_Str_Series"]) >>> s 0 A_Str_Series @@ -5317,7 +5318,7 @@ def plot(self): **Examples:** - + >>> import bigframes.pandas as bpd >>> ser = bpd.Series([1, 2, 3, 3]) >>> plot = ser.plot(kind='hist', title="My plot") >>> plot diff --git a/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py b/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py index f9ebe59f8d..4e418af406 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py +++ b/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py @@ -54,9 +54,9 @@ def to_timedelta( **Examples:** - Converting a Scalar to timedelta + >>> import bigframes.pandas as bpd >>> scalar = 2 >>> bpd.to_timedelta(scalar, unit='s') Timedelta('0 days 00:00:02') From 3bb0464eb77cb69f0f673c0e1898d33aa84356bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 15 Oct 2025 19:53:22 +0000 Subject: [PATCH 58/63] sync polars compiler with main --- bigframes/core/compile/polars/compiler.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index 6c2e324206..acaf1b8f22 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -724,22 +724,14 @@ def _ordered_join( ] ) if how != "cross": - # Note: join_nulls renamed to nulls_equal for polars 1.24 - polars_version = tuple( - int(part) for 
part in pl.__version__.split(".") if part.isnumeric() - ) - if polars_version >= (1, 24, 0): - join_kwargs = {"nulls_equal": join_nulls} - else: - join_kwargs = {"join_nulls": join_nulls} - joined = left.join( right, how=how, left_on=left_on, right_on=right_on, + # Note: join_nulls renamed to nulls_equal for polars 1.24 + join_nulls=join_nulls, # type: ignore coalesce=False, - **join_kwargs, # type: ignore ) else: joined = left.join(right, how=how, coalesce=False) From e7425fce77e8c48ff829d63f8791a9e5650339a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 15 Oct 2025 21:25:21 +0000 Subject: [PATCH 59/63] avoid np in output --- third_party/bigframes_vendored/pandas/core/series.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index b494252bdf..179013a1ce 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -1283,16 +1283,16 @@ def autocorr(self, lag: int = 1) -> float: >>> s = bpd.Series([0.25, 0.5, 0.2, -0.05]) - >>> s.autocorr() - 0.10355263309024065 + >>> float(s.autocorr()) # doctest: +ELLIPSIS + 0.1035526330902... - >>> s.autocorr(lag=2) + >>> float(s.autocorr(lag=2)) -1.0 If the Pearson correlation is not well defined, then 'NaN' is returned. 
>>> s = bpd.Series([1, 0, 0, 0]) - >>> s.autocorr() + >>> float(s.autocorr()) nan Args: From 2aa7f250e1a7f01ba90fbb0e4a57c33ddd69584f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 15 Oct 2025 16:30:37 -0500 Subject: [PATCH 60/63] Update tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sqrt/out.sql --- .../expressions/snapshots/test_numeric_ops/test_sqrt/out.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sqrt/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sqrt/out.sql index cd2b19a7a8..e6a93e5e6c 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sqrt/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sqrt/out.sql @@ -5,7 +5,7 @@ WITH `bfcte_0` AS ( ), `bfcte_1` AS ( SELECT *, - CASE WHEN `bfcol_0` <= 0 THEN CAST('NaN' AS FLOAT64) ELSE SQRT(`bfcol_0`) END AS `bfcol_1` + CASE WHEN `bfcol_0` < 0 THEN CAST('NaN' AS FLOAT64) ELSE SQRT(`bfcol_0`) END AS `bfcol_1` FROM `bfcte_0` ) SELECT From 941497181a088980372fd7055bb452f6100f730f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 15 Oct 2025 16:31:01 -0500 Subject: [PATCH 61/63] Update bigframes/core/compile/sqlglot/expressions/numeric_ops.py --- bigframes/core/compile/sqlglot/expressions/numeric_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/core/compile/sqlglot/expressions/numeric_ops.py b/bigframes/core/compile/sqlglot/expressions/numeric_ops.py index 7b8cd4a058..ac40e4a667 100644 --- a/bigframes/core/compile/sqlglot/expressions/numeric_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/numeric_ops.py @@ -207,7 +207,7 @@ def _(expr: TypedExpr) -> sge.Expression: return sge.Case( ifs=[ sge.If( - this=expr.expr <= sge.convert(0), + this=expr.expr < sge.convert(0), 
true=constants._NAN, ) ], From 8ced818e56d9cf74f9a5c47fdd053f8154edcdf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 16 Oct 2025 01:15:10 +0000 Subject: [PATCH 62/63] upgrade requirements in polars sample --- samples/polars/requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/samples/polars/requirements.txt b/samples/polars/requirements.txt index a1d8fbcdac..1626982536 100644 --- a/samples/polars/requirements.txt +++ b/samples/polars/requirements.txt @@ -1,3 +1,3 @@ -bigframes==1.11.1 -polars==1.3.0 -pyarrow==15.0.0 +bigframes==2.25.0 +polars==1.24.0 +pyarrow==21.0.0 From c3a9d61bcec7b4f66e2aec8d15808ae550de1f85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 16 Oct 2025 20:31:15 +0000 Subject: [PATCH 63/63] add todo for making doctest more robust --- conftest.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/conftest.py b/conftest.py index 41583b1a6b..bd2053b092 100644 --- a/conftest.py +++ b/conftest.py @@ -48,3 +48,8 @@ def default_doctest_imports(doctest_namespace, polars_session_or_bpd): doctest_namespace["pa"] = pa doctest_namespace["bpd"] = polars_session_or_bpd bigframes._config.options.display.progress_bar = None + + # TODO(tswast): Consider setting the numpy printoptions here for better + # compatibility across numpy versions. + # https://numpy.org/doc/stable/release/2.0.0-notes.html#representation-of-numpy-scalars-changed + # https://numpy.org/doc/stable/reference/generated/numpy.set_printoptions.html#numpy-set-printoptions