From 8cfaca6e6b222f1140c25222ffffee9a40a2a229 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 7 Oct 2025 16:24:28 +0000 Subject: [PATCH 01/36] docs: remove import bigframes.pandas as bpd boilerplate from many samples Also, fixes several constructors that didn't take a session for compatibility with multi-session applications. --- bigframes/bigquery/_operations/ai.py | 21 +- bigframes/bigquery/_operations/approx_agg.py | 1 - bigframes/bigquery/_operations/array.py | 6 - bigframes/bigquery/_operations/datetime.py | 10 +- bigframes/bigquery/_operations/geo.py | 13 - bigframes/bigquery/_operations/json.py | 12 - bigframes/bigquery/_operations/search.py | 1 - bigframes/bigquery/_operations/sql.py | 3 - bigframes/bigquery/_operations/struct.py | 1 - {tests/unit => bigframes}/conftest.py | 21 + bigframes/core/compile/polars/compiler.py | 4 +- bigframes/core/indexes/base.py | 11 +- bigframes/core/log_adapter.py | 4 +- bigframes/core/reshape/tile.py | 7 +- bigframes/core/tools/datetimes.py | 10 +- bigframes/dataframe.py | 6 +- bigframes/ml/compose.py | 1 - bigframes/operations/ai.py | 17 +- bigframes/operations/base.py | 13 +- bigframes/operations/semantics.py | 20 +- bigframes/operations/strings.py | 1 - bigframes/pandas/__init__.py | 18 +- bigframes/series.py | 5 - bigframes/session/__init__.py | 102 +++- dummy.pkl | Bin 0 -> 1150 bytes .../bigframes_vendored/geopandas/geoseries.py | 9 - .../bigframes_vendored/pandas/AUTHORS.md | 1 - .../bigframes_vendored/pandas/README.md | 2 - .../bigframes_vendored/pandas/conftest.py | 45 ++ .../pandas/core/arrays/arrow/accessors.py | 25 +- .../pandas/core/arrays/datetimelike.py | 7 +- .../pandas/core/computation/eval.py | 3 - .../pandas/core/computation/expr.py | 3 - .../pandas/core/computation/ops.py | 1 - .../bigframes_vendored/pandas/core/frame.py | 436 ++--------------- .../bigframes_vendored/pandas/core/generic.py | 45 +- .../pandas/core/groupby/__init__.py | 127 +---- .../pandas/core/indexes/accessor.py | 
47 -- .../pandas/core/indexes/base.py | 108 +---- .../pandas/core/indexes/datetimes.py | 24 - .../pandas/core/indexes/multi.py | 4 - .../pandas/core/reshape/tile.py | 3 - .../bigframes_vendored/pandas/core/series.py | 439 ++---------------- .../pandas/core/strings/accessor.py | 104 +---- .../pandas/core/tools/datetimes.py | 4 +- .../pandas/core/tools/timedeltas.py | 1 - .../bigframes_vendored/pandas/io/gbq.py | 1 - .../bigframes_vendored/pandas/io/parquet.py | 2 - .../pandas/io/parsers/readers.py | 4 - .../bigframes_vendored/pandas/io/pickle.py | 2 - .../pandas/pandas/_typing.py | 2 - .../pandas/plotting/_core.py | 12 +- .../sklearn/cluster/_kmeans.py | 1 - .../sklearn/decomposition/_mf.py | 1 - .../sklearn/decomposition/_pca.py | 1 - .../sklearn/impute/_base.py | 1 - .../sklearn/linear_model/_base.py | 1 - .../sklearn/linear_model/_logistic.py | 1 - .../sklearn/metrics/_classification.py | 5 - .../sklearn/metrics/_ranking.py | 3 - .../sklearn/metrics/_regression.py | 3 - .../sklearn/model_selection/_split.py | 2 - .../sklearn/model_selection/_validation.py | 1 - .../sklearn/preprocessing/_encoder.py | 1 - 64 files changed, 321 insertions(+), 1469 deletions(-) rename {tests/unit => bigframes}/conftest.py (57%) create mode 100644 dummy.pkl create mode 100644 third_party/bigframes_vendored/pandas/conftest.py diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index a789310683..7698c2c95c 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -52,14 +52,13 @@ def generate( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> country = bpd.Series(["Japan", "Canada"]) - >>> bbq.ai.generate(("What's the capital city of ", country, " one word only")) + >>> bbq.ai.generate(("What's the capital city of ", country, " one word only")) # doctest: +SKIP 0 {'result': 'Tokyo\\n', 'full_response': '{"cand... 
1 {'result': 'Ottawa\\n', 'full_response': '{"can... dtype: struct>, status: string>[pyarrow] - >>> bbq.ai.generate(("What's the capital city of ", country, " one word only")).struct.field("result") + >>> bbq.ai.generate(("What's the capital city of ", country, " one word only")).struct.field("result") # doctest: +SKIP 0 Tokyo\\n 1 Ottawa\\n Name: result, dtype: string @@ -125,7 +124,6 @@ def generate_bool( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... "col_1": ["apple", "bear", "pear"], ... "col_2": ["fruit", "animal", "animal"] @@ -203,8 +201,7 @@ def generate_int( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - >>> animal = bpd.Series(["Kangaroo", "Rabbit", "Spider"]) + >>> animal = bpd.Series(["Kangaroo", "Rabbit", "Spider"]) >>> bbq.ai.generate_int(("How many legs does a ", animal, " have?")) 0 {'result': 2, 'full_response': '{"candidates":... 1 {'result': 4, 'full_response': '{"candidates":... @@ -278,8 +275,7 @@ def generate_double( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - >>> animal = bpd.Series(["Kangaroo", "Rabbit", "Spider"]) + >>> animal = bpd.Series(["Kangaroo", "Rabbit", "Spider"]) >>> bbq.ai.generate_double(("How many legs does a ", animal, " have?")) 0 {'result': 2.0, 'full_response': '{"candidates... 1 {'result': 4.0, 'full_response': '{"candidates... 
@@ -350,8 +346,7 @@ def if_( **Examples:** >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - >>> us_state = bpd.Series(["Massachusetts", "Illinois", "Hawaii"]) + >>> us_state = bpd.Series(["Massachusetts", "Illinois", "Hawaii"]) >>> bbq.ai.if_((us_state, " has a city called Springfield")) 0 True 1 True @@ -400,8 +395,7 @@ def classify( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'creature': ['Cat', 'Salmon']}) + >>> df = bpd.DataFrame({'creature': ['Cat', 'Salmon']}) >>> df['type'] = bbq.ai.classify(df['creature'], ['Mammal', 'Fish']) >>> df creature type @@ -451,8 +445,7 @@ def score( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - >>> animal = bpd.Series(["Tiger", "Rabbit", "Blue Whale"]) + >>> animal = bpd.Series(["Tiger", "Rabbit", "Blue Whale"]) >>> bbq.ai.score(("Rank the relative weights of ", animal, " on the scale from 1 to 3")) # doctest: +SKIP 0 2.0 1 1.0 diff --git a/bigframes/bigquery/_operations/approx_agg.py b/bigframes/bigquery/_operations/approx_agg.py index 696f8f5a66..73b6fdbb73 100644 --- a/bigframes/bigquery/_operations/approx_agg.py +++ b/bigframes/bigquery/_operations/approx_agg.py @@ -40,7 +40,6 @@ def approx_top_count( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["apple", "apple", "pear", "pear", "pear", "banana"]) >>> bbq.approx_top_count(s, number=2) [{'value': 'pear', 'count': 3}, {'value': 'apple', 'count': 2}] diff --git a/bigframes/bigquery/_operations/array.py b/bigframes/bigquery/_operations/array.py index 4af1416127..239bc9566a 100644 --- a/bigframes/bigquery/_operations/array.py +++ b/bigframes/bigquery/_operations/array.py @@ -40,8 +40,6 @@ def array_length(series: series.Series) -> series.Series: >>> 
import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([[1, 2, 8, 3], [], [3, 4]]) >>> bbq.array_length(s) 0 4 @@ -78,8 +76,6 @@ def array_agg( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> import numpy as np - >>> bpd.options.display.progress_bar = None For a SeriesGroupBy object: @@ -128,8 +124,6 @@ def array_to_string(series: series.Series, delimiter: str) -> series.Series: >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([["H", "i", "!"], ["Hello", "World"], np.nan, [], ["Hi"]]) >>> bbq.array_to_string(s, delimiter=", ") diff --git a/bigframes/bigquery/_operations/datetime.py b/bigframes/bigquery/_operations/datetime.py index f8767336dd..c4aba91a29 100644 --- a/bigframes/bigquery/_operations/datetime.py +++ b/bigframes/bigquery/_operations/datetime.py @@ -21,11 +21,8 @@ def unix_seconds(input: series.Series) -> series.Series: **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([pd.Timestamp("1970-01-02", tz="UTC"), pd.Timestamp("1970-01-03", tz="UTC")]) >>> bbq.unix_seconds(s) 0 86400 @@ -48,11 +45,8 @@ def unix_millis(input: series.Series) -> series.Series: **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([pd.Timestamp("1970-01-02", tz="UTC"), pd.Timestamp("1970-01-03", tz="UTC")]) >>> bbq.unix_millis(s) 0 86400000 @@ -75,10 +69,8 @@ def unix_micros(input: series.Series) -> series.Series: **Examples:** - >>> import pandas as pd - >>> import bigframes.pandas as bpd + >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s 
= bpd.Series([pd.Timestamp("1970-01-02", tz="UTC"), pd.Timestamp("1970-01-03", tz="UTC")]) >>> bbq.unix_micros(s) diff --git a/bigframes/bigquery/_operations/geo.py b/bigframes/bigquery/_operations/geo.py index 9a92a8960d..e5aa383779 100644 --- a/bigframes/bigquery/_operations/geo.py +++ b/bigframes/bigquery/_operations/geo.py @@ -53,8 +53,6 @@ def st_area( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Polygon, LineString, Point - >>> bpd.options.display.progress_bar = None - >>> series = bigframes.geopandas.GeoSeries( ... [ ... Polygon([(0.0, 0.0), (0.1, 0.1), (0.0, 0.1)]), @@ -125,8 +123,6 @@ def st_buffer( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Point - >>> bpd.options.display.progress_bar = None - >>> series = bigframes.geopandas.GeoSeries( ... [ ... Point(0, 0), @@ -195,8 +191,6 @@ def st_centroid( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Polygon, LineString, Point - >>> bpd.options.display.progress_bar = None - >>> series = bigframes.geopandas.GeoSeries( ... [ ... Polygon([(0.0, 0.0), (0.1, 0.1), (0.0, 0.1)]), @@ -250,8 +244,6 @@ def st_convexhull( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Polygon, LineString, Point - >>> bpd.options.display.progress_bar = None - >>> series = bigframes.geopandas.GeoSeries( ... [ ... 
Polygon([(0.0, 0.0), (0.1, 0.1), (0.0, 0.1)]), @@ -312,7 +304,6 @@ def st_difference( >>> import bigframes.bigquery as bbq >>> import bigframes.geopandas >>> from shapely.geometry import Polygon, LineString, Point - >>> bpd.options.display.progress_bar = None We can check two GeoSeries against each other, row by row: @@ -407,7 +398,6 @@ def st_distance( >>> import bigframes.bigquery as bbq >>> import bigframes.geopandas >>> from shapely.geometry import Polygon, LineString, Point - >>> bpd.options.display.progress_bar = None We can check two GeoSeries against each other, row by row. @@ -489,7 +479,6 @@ def st_intersection( >>> import bigframes.bigquery as bbq >>> import bigframes.geopandas >>> from shapely.geometry import Polygon, LineString, Point - >>> bpd.options.display.progress_bar = None We can check two GeoSeries against each other, row by row. @@ -583,7 +572,6 @@ def st_isclosed( >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Point, LineString, Polygon - >>> bpd.options.display.progress_bar = None >>> series = bigframes.geopandas.GeoSeries( ... [ @@ -650,7 +638,6 @@ def st_length( >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Polygon, LineString, Point, GeometryCollection - >>> bpd.options.display.progress_bar = None >>> series = bigframes.geopandas.GeoSeries( ... 
[ diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py index 656e59af0d..4e1f43aab0 100644 --- a/bigframes/bigquery/_operations/json.py +++ b/bigframes/bigquery/_operations/json.py @@ -49,8 +49,6 @@ def json_set( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"] >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")]) @@ -101,7 +99,6 @@ def json_extract( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}']) >>> bbq.json_extract(s, json_path="$.class") @@ -141,7 +138,6 @@ def json_extract_array( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) >>> bbq.json_extract_array(s) @@ -204,7 +200,6 @@ def json_extract_string_array( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) >>> bbq.json_extract_string_array(s) @@ -272,7 +267,6 @@ def json_query( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}']) >>> bbq.json_query(s, json_path="$.class") @@ -303,7 +297,6 @@ def json_query_array( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) >>> bbq.json_query_array(s) @@ -355,7 +348,6 @@ def json_value( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['{"name": "Jakob", "age": 
"6"}', '{"name": "Jakob", "age": []}']) >>> bbq.json_value(s, json_path="$.age") @@ -392,7 +384,6 @@ def json_value_array( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) >>> bbq.json_value_array(s) @@ -439,7 +430,6 @@ def to_json( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3]) >>> bbq.to_json(s) @@ -473,7 +463,6 @@ def to_json_string( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3]) >>> bbq.to_json_string(s) @@ -512,7 +501,6 @@ def parse_json( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}']) >>> s diff --git a/bigframes/bigquery/_operations/search.py b/bigframes/bigquery/_operations/search.py index c16c2af1a9..b65eed2475 100644 --- a/bigframes/bigquery/_operations/search.py +++ b/bigframes/bigquery/_operations/search.py @@ -111,7 +111,6 @@ def vector_search( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None DataFrame embeddings for which to find nearest neighbors. 
The ``ARRAY`` column is used as the search query: diff --git a/bigframes/bigquery/_operations/sql.py b/bigframes/bigquery/_operations/sql.py index a2de61fc21..295412fd75 100644 --- a/bigframes/bigquery/_operations/sql.py +++ b/bigframes/bigquery/_operations/sql.py @@ -36,9 +36,6 @@ def sql_scalar( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> import pandas as pd - >>> import pyarrow as pa - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1.5", "2.5", "3.5"]) >>> s = s.astype(pd.ArrowDtype(pa.decimal128(38, 9))) diff --git a/bigframes/bigquery/_operations/struct.py b/bigframes/bigquery/_operations/struct.py index 7cb826351c..a6304677ef 100644 --- a/bigframes/bigquery/_operations/struct.py +++ b/bigframes/bigquery/_operations/struct.py @@ -39,7 +39,6 @@ def struct(value: dataframe.DataFrame) -> series.Series: >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> import bigframes.series as series - >>> bpd.options.display.progress_bar = None >>> srs = series.Series([{"version": 1, "project": "pandas"}, {"version": 2, "project": "numpy"},]) >>> df = srs.struct.explode() diff --git a/tests/unit/conftest.py b/bigframes/conftest.py similarity index 57% rename from tests/unit/conftest.py rename to bigframes/conftest.py index a9b26afeef..e1f3f6d84c 100644 --- a/tests/unit/conftest.py +++ b/bigframes/conftest.py @@ -12,8 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations + +import numpy as np +import pandas as pd +import pyarrow as pa import pytest +import bigframes._config + @pytest.fixture(scope="session") def polars_session(): @@ -22,3 +29,17 @@ def polars_session(): from bigframes.testing import polars_session return polars_session.TestSession() + + +@pytest.fixture(autouse=True) +def default_doctest_imports(doctest_namespace, polars_session): + """ + Avoid some boilerplate in pandas-inspired tests. 
+ + See: https://docs.pytest.org/en/stable/how-to/doctest.html#doctest-namespace-fixture + """ + doctest_namespace["np"] = np + doctest_namespace["pd"] = pd + doctest_namespace["pa"] = pa + doctest_namespace["bpd"] = polars_session + bigframes._config.options.display.progress_bar = None diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index f7c742e852..059ec72076 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -493,9 +493,9 @@ def compile_agg_op( if isinstance(op, agg_ops.MedianOp): return pl.median(*inputs) if isinstance(op, agg_ops.AllOp): - return pl.all(*inputs) + return pl.col(inputs).cast(pl.Boolean).all() if isinstance(op, agg_ops.AnyOp): - return pl.any(*inputs) # type: ignore + return pl.col(inputs).cast(pl.Boolean).any() if isinstance(op, agg_ops.NuniqueOp): return pl.col(*inputs).drop_nulls().n_unique() if isinstance(op, agg_ops.MinOp): diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index a6b18fcb43..b79363aa0a 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -383,9 +383,16 @@ def to_series( name = self.name if name is None else name if index is None: - return bigframes.series.Series(data=self, index=self, name=name) + return bigframes.series.Series( + data=self, index=self, name=name, session=self._session + ) else: - return bigframes.series.Series(data=self, index=Index(index), name=name) + return bigframes.series.Series( + data=self, + index=Index(index, session=self._session), + name=name, + session=self._session, + ) def get_level_values(self, level) -> Index: level_n = level if isinstance(level, int) else self.names.index(level) diff --git a/bigframes/core/log_adapter.py b/bigframes/core/log_adapter.py index 3ec1e86dc7..8179ffbeed 100644 --- a/bigframes/core/log_adapter.py +++ b/bigframes/core/log_adapter.py @@ -155,7 +155,9 @@ def method_logger(method=None, /, *, 
custom_base_name: Optional[str] = None): def outer_wrapper(method): @functools.wraps(method) def wrapper(*args, **kwargs): - api_method_name = getattr(method, LOG_OVERRIDE_NAME, method.__name__) + api_method_name = getattr( + method, LOG_OVERRIDE_NAME, method.__name__ + ).lower() if custom_base_name is None: qualname_parts = getattr(method, "__qualname__", method.__name__).split( "." diff --git a/bigframes/core/reshape/tile.py b/bigframes/core/reshape/tile.py index 74a941be54..a2efa8f927 100644 --- a/bigframes/core/reshape/tile.py +++ b/bigframes/core/reshape/tile.py @@ -15,6 +15,7 @@ from __future__ import annotations import typing +from typing import Optional, TYPE_CHECKING import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile @@ -31,6 +32,9 @@ import bigframes.operations.aggregations as agg_ops import bigframes.series +if TYPE_CHECKING: + import bigframes.session + def cut( x, @@ -42,6 +46,7 @@ def cut( *, right: typing.Optional[bool] = True, labels: typing.Union[typing.Iterable[str], bool, None] = None, + session: Optional[bigframes.session.Session] = None, ) -> bigframes.series.Series: if ( labels is not None @@ -65,7 +70,7 @@ def cut( raise ValueError("Cannot cut empty array.") if not isinstance(x, bigframes.series.Series): - x = bigframes.series.Series(x) + x = bigframes.series.Series(x, session=session) if isinstance(bins, int): if bins <= 0: diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py index 7edf2fa2e4..fd7561f4b4 100644 --- a/bigframes/core/tools/datetimes.py +++ b/bigframes/core/tools/datetimes.py @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import annotations + from collections.abc import Mapping from datetime import date, datetime -from typing import Optional, Union +from typing import Optional, TYPE_CHECKING, Union import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.tools.datetimes as vendored_pandas_datetimes @@ -25,6 +27,9 @@ import bigframes.operations as ops import bigframes.series +if TYPE_CHECKING: + import bigframes.session + def to_datetime( arg: Union[ @@ -37,6 +42,7 @@ def to_datetime( utc: bool = False, format: Optional[str] = None, unit: Optional[str] = None, + session: Optional[bigframes.session.Session], ) -> Union[pd.Timestamp, datetime, bigframes.series.Series]: if isinstance(arg, (int, float, str, datetime, date)): return pd.to_datetime( @@ -52,7 +58,7 @@ def to_datetime( f"to datetime is not implemented. {constants.FEEDBACK_LINK}" ) - arg = bigframes.series.Series(arg) + arg = bigframes.series.Series(arg, session=session) if format and unit and arg.dtype in (bigframes.dtypes.INT_DTYPE, bigframes.dtypes.FLOAT_DTYPE): # type: ignore raise ValueError("cannot specify both format and unit") diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 1bde29506d..49ec2fced3 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -688,7 +688,7 @@ def _getitem_label(self, key: blocks.Label): return DataFrame(block) if len(col_ids) == 1: - return bigframes.series.Series(block) + return bigframes.series.Series(block, name=key) return DataFrame(block) # Bool Series selects rows @@ -1771,7 +1771,6 @@ def to_pandas( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col': [4, 2, 2]}) Download the data from BigQuery and convert it into an in-memory pandas DataFrame. 
@@ -1893,7 +1892,6 @@ def to_pandas_batches( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col': [4, 3, 2, 2, 3]}) Iterate through the results in batches, limiting the total rows yielded @@ -4252,8 +4250,6 @@ def _resample( **Examples:** >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> bpd.options.display.progress_bar = None >>> data = { ... "timestamp_col": pd.date_range( diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 92c98695cd..54ce7066cb 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -69,7 +69,6 @@ class SQLScalarColumnTransformer: >>> from bigframes.ml.compose import ColumnTransformer, SQLScalarColumnTransformer >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'name': ["James", None, "Mary"], 'city': ["New York", "Boston", None]}) >>> col_trans = ColumnTransformer([ diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py index ac294b0fbd..dbbf16afc3 100644 --- a/bigframes/operations/ai.py +++ b/bigframes/operations/ai.py @@ -45,7 +45,6 @@ def filter( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 @@ -115,8 +114,7 @@ def map( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -134,8 +132,7 @@ def map( >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm 
@@ -266,8 +263,7 @@ def classify( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -356,8 +352,7 @@ def join( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -496,7 +491,6 @@ def search( ** Examples: ** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> import bigframes >>> bigframes.options.experiments.ai_operators = True @@ -608,8 +602,7 @@ def sim_join( ** Examples: ** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index f2bbcb3320..ebb5767264 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -115,8 +115,6 @@ def __init__( idx_cols = idx_block.index_columns block, _ = idx_block.join(block, how="left") block = block.with_index_labels(bf_index.names) - if name: - block = block.with_column_labels([name]) if dtype: bf_dtype = bigframes.dtypes.bigframes_type(dtype) block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype)) @@ -132,6 +130,13 @@ def __init__( block = read_pandas_func(pd_series)._get_block() # type:ignore assert block is not None + + # If we didn't get a block make sure the name is what the user + # explicitly chose even if it is None. 
This is important for the + # polars backend where the implicit column labels are integers. + if not isinstance(data, blocks.Block): + block = block.with_column_labels([name]) + self._block: blocks.Block = block @property @@ -160,7 +165,9 @@ def _apply_unary_op( block, result_id = self._block.apply_unary_op( self._value_column, op, result_label=self._name ) - return series.Series(block.select_column(result_id)) + result = series.Series(block.select_column(result_id)) + result.name = getattr(self, "name", None) + return result def _apply_binary_op( self, diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index 9fa5450748..b4f7af1aca 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -52,7 +52,6 @@ def agg( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 @@ -247,8 +246,7 @@ def cluster_by( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -321,8 +319,7 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -435,8 +432,7 @@ def map( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.experiments.semantic_operators = True >>> 
bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -558,8 +554,7 @@ def join( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -697,7 +692,6 @@ def search( ** Examples: ** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> import bigframes >>> bigframes.options.experiments.semantic_operators = True @@ -800,8 +794,7 @@ def top_k( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -1001,8 +994,7 @@ def sim_join( ** Examples: ** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index 4743483954..c69993849a 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -68,7 +68,6 @@ def reverse(self) -> series.Series: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["apple", "banana", "", bpd.NA]) >>> s.str.reverse() diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 2ea10132bc..19ea282762 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -16,8 +16,8 @@ from __future__ import annotations -from collections import 
namedtuple -from datetime import date, datetime +import collections +import datetime import inspect import sys import typing @@ -198,18 +198,18 @@ def to_datetime( @typing.overload def to_datetime( - arg: Union[int, float, str, datetime, date], + arg: Union[int, float, str, datetime.datetime, datetime.date], *, utc: bool = False, format: Optional[str] = None, unit: Optional[str] = None, -) -> Union[pandas.Timestamp, datetime]: +) -> Union[pandas.Timestamp, datetime.datetime]: ... def to_datetime( arg: Union[ - Union[int, float, str, datetime, date], + Union[int, float, str, datetime.datetime, datetime.date], vendored_pandas_datetimes.local_iterables, bigframes.series.Series, bigframes.dataframe.DataFrame, @@ -218,13 +218,15 @@ def to_datetime( utc: bool = False, format: Optional[str] = None, unit: Optional[str] = None, -) -> Union[pandas.Timestamp, datetime, bigframes.series.Series]: - return bigframes.core.tools.to_datetime( +) -> Union[pandas.Timestamp, datetime.datetime, bigframes.series.Series]: + return global_session.with_default_session( + bigframes.session.Session.to_datetime, arg, utc=utc, format=format, unit=unit, ) + return bigframes.core.tools.to_datetime() to_datetime.__doc__ = vendored_pandas_datetimes.to_datetime.__doc__ @@ -321,7 +323,7 @@ def clean_up_by_session_id( __version__ = bigframes.version.__version__ # Other public pandas attributes -NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"]) +NamedAgg = collections.namedtuple("NamedAgg", ["column", "aggfunc"]) options = config.options """Global :class:`~bigframes._config.Options` to configure BigQuery DataFrames.""" diff --git a/bigframes/series.py b/bigframes/series.py index 490298d8dd..337a796739 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -533,7 +533,6 @@ def to_pandas( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([4, 3, 2]) Download the data from BigQuery and convert it into an in-memory pandas Series. 
@@ -661,7 +660,6 @@ def to_pandas_batches( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([4, 3, 2, 2, 3]) Iterate through the results in batches, limiting the total rows yielded @@ -2421,9 +2419,6 @@ def _resample( **Examples:** >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> bpd.options.display.progress_bar = None - >>> data = { ... "timestamp_col": pd.date_range( ... start="2021-01-01 13:00:00", periods=30, freq="1s" diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index f0cec864b4..11621e8ea7 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -67,6 +67,7 @@ import bigframes.constants import bigframes.core from bigframes.core import blocks, log_adapter, utils +import bigframes.core.indexes import bigframes.core.pyformat # Even though the ibis.backends.bigquery import is unused, it's needed @@ -83,7 +84,6 @@ # Avoid circular imports. if typing.TYPE_CHECKING: - import bigframes.core.indexes import bigframes.dataframe as dataframe import bigframes.series import bigframes.streaming.dataframe as streaming_dataframe @@ -315,6 +315,15 @@ def bqconnectionmanager(self): ) return self._bq_connection_manager + @property + def options(self) -> bigframes._config.Options: + """Options for configuring BigQuery DataFrames. + + Included for compatibility between bpd and Session. + """ + # TODO(tswast): Consider making a separate session-level options object. + return bigframes._config.options + @property def session_id(self): return self._session_id @@ -597,7 +606,6 @@ def read_gbq_query( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Simple query input: @@ -753,7 +761,6 @@ def read_gbq_table( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Read a whole table, with arbitrary ordering or ordering corresponding to the primary key(s). 
@@ -832,7 +839,6 @@ def read_gbq_table_streaming( >>> import bigframes.streaming as bst >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> sdf = bst.read_gbq_table("bigquery-public-data.ml_datasets.penguins") @@ -861,7 +867,6 @@ def read_gbq_model(self, model_name: str): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Read an existing BigQuery ML model. @@ -931,8 +936,6 @@ def read_pandas( **Examples:** >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> bpd.options.display.progress_bar = None >>> d = {'col1': [1, 2], 'col2': [3, 4]} >>> pandas_df = pd.DataFrame(data=d) @@ -1810,7 +1813,6 @@ def udf( >>> import bigframes.pandas as bpd >>> import datetime - >>> bpd.options.display.progress_bar = None Turning an arbitrary python function into a BigQuery managed python udf: @@ -1973,7 +1975,6 @@ def read_gbq_function( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Use the [cw_lower_case_ascii_only](https://github.com/GoogleCloudPlatform/bigquery-utils/blob/master/udfs/community/README.md#cw_lower_case_ascii_onlystr-string) function from Community UDFs. @@ -2283,6 +2284,89 @@ def read_gbq_object_table( s = self._loader.read_gbq_table(object_table)["uri"].str.to_blob(connection) return s.rename(name).to_frame() + # ========================================================================= + # bigframes.pandas attributes + # + # These are included so that Session and bigframes.pandas can be used + # interchangeably. + # ========================================================================= + def cut(self, *args, **kwargs) -> bigframes.series.Series: + import bigframes.core.reshape.tile + + return bigframes.core.reshape.tile.cut( + *args, + session=self, + **kwargs, + ) + + def DataFrame(self, *args, **kwargs) -> bigframes.dataframe.DataFrame: + """Constructs a DataFrame. + + Included for compatibility between bpd and Session. 
+ + See :class:`bigframes.pandas.DataFrame` for full documentation. + """ + import bigframes.dataframe + + return bigframes.dataframe.DataFrame(*args, session=self, **kwargs) + + def MultiIndex(self, *args, **kwargs) -> bigframes.core.indexes.MultiIndex: + """Constructs a MultiIndex. + + Included for compatibility between bpd and Session. + + See :class:`bigframes.pandas.MultiIndex` for full documentation. + """ + import bigframes.core.indexes + + return bigframes.core.indexes.MultiIndex(*args, session=self, **kwargs) + + MultiIndex.from_tuples = bigframes.core.indexes.MultiIndex.from_tuples # type: ignore + MultiIndex.from_frame = bigframes.core.indexes.MultiIndex.from_frame # type: ignore + MultiIndex.from_arrays = bigframes.core.indexes.MultiIndex.from_arrays # type: ignore + + def Index(self, *args, **kwargs) -> bigframes.core.indexes.Index: + """Constructs an Index. + + Included for compatibility between bpd and Session. + + See :class:`bigframes.pandas.Index` for full documentation. + """ + import bigframes.core.indexes + + return bigframes.core.indexes.Index(*args, session=self, **kwargs) + + def Series(self, *args, **kwargs) -> bigframes.series.Series: + """Constructs a Series. + + Included for compatibility between bpd and Session. + + See :class:`bigframes.pandas.Series` for full documentation. 
+ """ + import bigframes.series + + return bigframes.series.Series(*args, session=self, **kwargs) + + def to_datetime( + self, *args, **kwargs + ) -> Union[pandas.Timestamp, datetime.datetime, bigframes.series.Series]: + import bigframes.core.tools + + return bigframes.core.tools.to_datetime( + *args, + session=self, + **kwargs, + ) + + def to_timedelta(self, *args, **kwargs): + import bigframes.pandas.core.tools.timedeltas + + return bigframes.pandas.core.tools.timedeltas.to_timedelta( + *args, + session=self, + **kwargs, + ) + def connect(context: Optional[bigquery_options.BigQueryOptions] = None) -> Session: return Session(context) diff --git a/dummy.pkl b/dummy.pkl new file mode 100644 index 0000000000000000000000000000000000000000..76a409b1ded309cfc7b30cccd49d85a710e737bd GIT binary patch literal 1150 zcmbVMU2D`p6is$F+m9_uQPyHX!3T{XfkORwP()jV8eBodw=zt!lbx{nSTYmpf`UG@ zwlHsd?cdcqli4LgMKBM!H_4ql=bU>c-@Koq=a@@v&uB5GB8bb11xZD725RGwO8Um+ z3wZb)zJjlMB%f5E?zGF(Lb9r$nFw-P&UF%7emK7)(%AMgSEu&dnXFdB{C z{=&=LLPtUrdZ(b=1CUsxJd#r}1%A`N^i~^V0(?hxqP=#nFMsL9@0w1164RJ79LW+6 z-%^_>%#-~?i_XZxAL*eD(qha$lQ^RSN3+uw*L-0jh^WAcdq=uZQyUnvq@l`pRd0%w z$RveYM52z=dQ_*GObcx2i7bt^AfXewp{w->JNnKC{8}>|zO6|wHD8kNTM^c5T(@z< zM&P?zzlJlvcZF{ETi=l?5BMH}<1Y|Lr;X$cj=|@T)g~#}30bn_PmQG;-wfNn$!p|h zjE8T1B0d9+UahmU2&fpLJ^u>> import bigframes.geopandas >>> import bigframes.pandas as bpd >>> from shapely.geometry import Point - >>> bpd.options.display.progress_bar = None >>> s = bigframes.geopandas.GeoSeries([Point(1, 1), Point(2, 2), Point(3, 3)]) >>> s @@ -73,7 +72,6 @@ def x(self) -> bigframes.series.Series: >>> import bigframes.pandas as bpd >>> import geopandas.array >>> import shapely.geometry - >>> bpd.options.display.progress_bar = None >>> series = bpd.Series( ... 
[shapely.geometry.Point(1, 2), shapely.geometry.Point(2, 3), shapely.geometry.Point(3, 4)], @@ -100,7 +98,6 @@ def y(self) -> bigframes.series.Series: >>> import bigframes.pandas as bpd >>> import geopandas.array >>> import shapely.geometry - >>> bpd.options.display.progress_bar = None >>> series = bpd.Series( ... [shapely.geometry.Point(1, 2), shapely.geometry.Point(2, 3), shapely.geometry.Point(3, 4)], @@ -129,7 +126,6 @@ def boundary(self) -> bigframes.geopandas.GeoSeries: >>> import bigframes.pandas as bpd >>> import geopandas.array >>> import shapely.geometry - >>> bpd.options.display.progress_bar = None >>> from shapely.geometry import Polygon, LineString, Point >>> s = geopandas.GeoSeries( @@ -171,7 +167,6 @@ def from_xy(cls, x, y, index=None, **kwargs) -> bigframes.geopandas.GeoSeries: >>> import bigframes.pandas as bpd >>> import bigframes.geopandas - >>> bpd.options.display.progress_bar = None >>> x = [2.5, 5, -3.0] >>> y = [0.5, 1, 1.5] @@ -210,7 +205,6 @@ def from_wkt(cls, data, index=None) -> bigframes.geopandas.GeoSeries: >>> import bigframes as bpd >>> import bigframes.geopandas - >>> bpd.options.display.progress_bar = None >>> wkts = [ ... 
'POINT (1 1)', @@ -246,7 +240,6 @@ def to_wkt(self) -> bigframes.series.Series: >>> import bigframes as bpd >>> import bigframes.geopandas >>> from shapely.geometry import Point - >>> bpd.options.display.progress_bar = None >>> s = bigframes.geopandas.GeoSeries([Point(1, 1), Point(2, 2), Point(3, 3)]) >>> s @@ -279,7 +272,6 @@ def difference(self: GeoSeries, other: GeoSeries) -> GeoSeries: # type: ignore >>> import bigframes as bpd >>> import bigframes.geopandas >>> from shapely.geometry import Polygon, LineString, Point - >>> bpd.options.display.progress_bar = None We can check two GeoSeries against each other, row by row: @@ -411,7 +403,6 @@ def intersection(self: GeoSeries, other: GeoSeries) -> GeoSeries: # type: ignor >>> import bigframes as bpd >>> import bigframes.geopandas >>> from shapely.geometry import Polygon, LineString, Point - >>> bpd.options.display.progress_bar = None We can check two GeoSeries against each other, row by row. diff --git a/third_party/bigframes_vendored/pandas/AUTHORS.md b/third_party/bigframes_vendored/pandas/AUTHORS.md index 84fcfe05e3..396bcbf9dd 100644 --- a/third_party/bigframes_vendored/pandas/AUTHORS.md +++ b/third_party/bigframes_vendored/pandas/AUTHORS.md @@ -47,7 +47,6 @@ file to indicate the copyright and license terms: Other licenses can be found in the LICENSES directory. 
-License ======= pandas is distributed under a 3-clause ("Simplified" or "New") BSD diff --git a/third_party/bigframes_vendored/pandas/README.md b/third_party/bigframes_vendored/pandas/README.md index 1aa5068d5e..f92a629a4c 100644 --- a/third_party/bigframes_vendored/pandas/README.md +++ b/third_party/bigframes_vendored/pandas/README.md @@ -60,7 +60,6 @@ Here are just a few of the things that pandas does well: generation and frequency conversion, moving window statistics, date shifting and lagging - [missing-data]: https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html [insertion-deletion]: https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html#column-selection-addition-deletion [alignment]: https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html?highlight=alignment#intro-to-data-structures @@ -120,7 +119,6 @@ python setup.py install or for installing in [development mode](https://pip.pypa.io/en/latest/cli/pip_install/#install-editable): - ```sh python -m pip install -e . --no-build-isolation --no-use-pep517 ``` diff --git a/third_party/bigframes_vendored/pandas/conftest.py b/third_party/bigframes_vendored/pandas/conftest.py new file mode 100644 index 0000000000..e1f3f6d84c --- /dev/null +++ b/third_party/bigframes_vendored/pandas/conftest.py @@ -0,0 +1,45 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import numpy as np +import pandas as pd +import pyarrow as pa +import pytest + +import bigframes._config + + +@pytest.fixture(scope="session") +def polars_session(): + pytest.importorskip("polars") + + from bigframes.testing import polars_session + + return polars_session.TestSession() + + +@pytest.fixture(autouse=True) +def default_doctest_imports(doctest_namespace, polars_session): + """ + Avoid some boilerplate in pandas-inspired tests. + + See: https://docs.pytest.org/en/stable/how-to/doctest.html#doctest-namespace-fixture + """ + doctest_namespace["np"] = np + doctest_namespace["pd"] = pd + doctest_namespace["pa"] = pa + doctest_namespace["bpd"] = polars_session + bigframes._config.options.display.progress_bar = None diff --git a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py index fe15e7b40d..9f6dfc1c74 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py @@ -19,14 +19,12 @@ def len(self): **Examples:** >>> import bigframes.pandas as bpd - >>> import pyarrow as pa - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ ... [1, 2, 3], ... [3], ... ], - ... dtype=bpd.ArrowDtype(pa.list_(pa.int64())), + ... dtype=pd.ArrowDtype(pa.list_(pa.int64())), ... ) >>> s.list.len() 0 3 @@ -45,14 +43,12 @@ def __getitem__(self, key: int | slice): **Examples:** >>> import bigframes.pandas as bpd - >>> import pyarrow as pa - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ ... [1, 2, 3], ... [3], ... ], - ... dtype=bpd.ArrowDtype(pa.list_(pa.int64())), + ... dtype=pd.ArrowDtype(pa.list_(pa.int64())), ... 
) >>> s.list[0] 0 1 @@ -83,15 +79,13 @@ def field(self, name_or_index: str | int): **Examples:** >>> import bigframes.pandas as bpd - >>> import pyarrow as pa - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ ... {"version": 1, "project": "pandas"}, ... {"version": 2, "project": "pandas"}, ... {"version": 1, "project": "numpy"}, ... ], - ... dtype=bpd.ArrowDtype(pa.struct( + ... dtype=pd.ArrowDtype(pa.struct( ... [("version", pa.int64()), ("project", pa.string())] ... )) ... ) @@ -129,15 +123,13 @@ def explode(self): **Examples:** >>> import bigframes.pandas as bpd - >>> import pyarrow as pa - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ ... {"version": 1, "project": "pandas"}, ... {"version": 2, "project": "pandas"}, ... {"version": 1, "project": "numpy"}, ... ], - ... dtype=bpd.ArrowDtype(pa.struct( + ... dtype=pd.ArrowDtype(pa.struct( ... [("version", pa.int64()), ("project", pa.string())] ... )) ... ) @@ -165,15 +157,13 @@ def dtypes(self): **Examples:** >>> import bigframes.pandas as bpd - >>> import pyarrow as pa - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ ... {"version": 1, "project": "pandas"}, ... {"version": 2, "project": "pandas"}, ... {"version": 1, "project": "numpy"}, ... ], - ... dtype=bpd.ArrowDtype(pa.struct( + ... dtype=pd.ArrowDtype(pa.struct( ... [("version", pa.int64()), ("project", pa.string())] ... )) ... ) @@ -200,8 +190,6 @@ def explode(self, column, *, separator: str = "."): **Examples:** >>> import bigframes.pandas as bpd - >>> import pyarrow as pa - >>> bpd.options.display.progress_bar = None >>> countries = bpd.Series(["cn", "es", "us"]) >>> files = bpd.Series( ... [ @@ -209,7 +197,7 @@ def explode(self, column, *, separator: str = "."): ... {"version": 2, "project": "pandas"}, ... {"version": 1, "project": "numpy"}, ... ], - ... dtype=bpd.ArrowDtype(pa.struct( + ... dtype=pd.ArrowDtype(pa.struct( ... [("version", pa.int64()), ("project", pa.string())] ... )) ... 
) @@ -233,7 +221,6 @@ def explode(self, column, *, separator: str = "."): Separator/delimiter to use to separate the original column name from the sub-field column name. - Returns: DataFrame: Original DataFrame with exploded struct column(s). diff --git a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py index 1736a7f9ef..eeffbbdb7f 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py @@ -15,8 +15,6 @@ def strftime(self, date_format: str): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.to_datetime( ... ['2014-08-15 08:15:12', '2012-02-29 08:15:12+06:00', '2015-08-15 08:15:12+05:00'], ... utc=True @@ -36,6 +34,7 @@ def strftime(self, date_format: str): bigframes.pandas.Series: Series of formatted strings. """ + # TODO(tswast): remove bpd boilerplate when normalize is implemented in polars session. raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def normalize(self): @@ -51,7 +50,6 @@ def normalize(self): **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd >>> s = bpd.Series(pd.date_range( ... start='2014-08-01 10:00', @@ -68,6 +66,7 @@ def normalize(self): bigframes.pandas.Series: Series of the same dtype as the data. """ + # TODO(tswast): remove bpd boilerplate when normalize is implemented in polars session. 
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def floor(self, freq: str): @@ -85,8 +84,6 @@ def floor(self, freq: str): **Examples:** - >>> import pandas as pd - >>> import bigframes.pandas as bpd >>> rng = pd.date_range('1/1/2018 11:59:00', periods=3, freq='min') >>> bpd.Series(rng).dt.floor("h") 0 2018-01-01 11:00:00 diff --git a/third_party/bigframes_vendored/pandas/core/computation/eval.py b/third_party/bigframes_vendored/pandas/core/computation/eval.py index d3d11a9c2a..2f01b7edfc 100644 --- a/third_party/bigframes_vendored/pandas/core/computation/eval.py +++ b/third_party/bigframes_vendored/pandas/core/computation/eval.py @@ -172,9 +172,6 @@ def eval( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"animal": ["dog", "pig"], "age": [10, 20]}) >>> df animal age diff --git a/third_party/bigframes_vendored/pandas/core/computation/expr.py b/third_party/bigframes_vendored/pandas/core/computation/expr.py index 44f649e59d..ca9e6a60ce 100644 --- a/third_party/bigframes_vendored/pandas/core/computation/expr.py +++ b/third_party/bigframes_vendored/pandas/core/computation/expr.py @@ -165,7 +165,6 @@ def _is_type(t): _is_list = _is_type(list) _is_str = _is_type(str) - # partition all AST nodes _all_nodes = frozenset( node @@ -197,11 +196,9 @@ def _filter_nodes(superclass, all_nodes=_all_nodes): _keyword_nodes = _filter_nodes(ast.keyword) _alias_nodes = _filter_nodes(ast.alias) - # nodes that we don't support directly but are needed for parsing _hacked_nodes = frozenset(["Assign", "Module", "Expr"]) - _unsupported_expr_nodes = frozenset( [ "Yield", diff --git a/third_party/bigframes_vendored/pandas/core/computation/ops.py b/third_party/bigframes_vendored/pandas/core/computation/ops.py index 75b914c876..a15972fc4c 100644 --- a/third_party/bigframes_vendored/pandas/core/computation/ops.py +++ b/third_party/bigframes_vendored/pandas/core/computation/ops.py @@ -52,7 +52,6 @@ MATHOPS 
= _unary_math_ops + _binary_math_ops - LOCAL_TAG = "__pd_eval_local_" diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 557c332797..b433c739cc 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -39,9 +39,6 @@ def shape(self) -> tuple[int, int]: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2, 3], ... 'col2': [4, 5, 6]}) >>> df.shape @@ -63,9 +60,6 @@ def axes(self) -> list: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.axes[1:] [Index(['col1', 'col2'], dtype='object')] @@ -78,9 +72,6 @@ def values(self) -> np.ndarray: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.values array([[1, 3], @@ -110,8 +101,6 @@ def T(self) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df col1 col2 @@ -146,9 +135,6 @@ def transpose(self) -> DataFrame: **Square DataFrame with homogeneous dtype** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> d1 = {'col1': [1, 2], 'col2': [3, 4]} >>> df1 = bpd.DataFrame(data=d1) >>> df1 @@ -256,9 +242,6 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': ["hello", "world"], 'col3': [True, False]}) >>> df.select_dtypes(include=['Int64']) col1 @@ -274,7 +257,6 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: [2 rows x 2 columns] - Args: include 
(scalar or list-like): A selection of dtypes or strings to be included. @@ -380,9 +362,6 @@ def to_numpy( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.to_numpy() array([[1, 3], @@ -419,11 +398,9 @@ def to_gbq( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Write a DataFrame to a BigQuery table. + >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> # destination_table = PROJECT_ID + "." + DATASET_ID + "." + TABLE_NAME >>> df.to_gbq("bigframes-dev.birds.test-numbers", if_exists="replace") @@ -510,7 +487,6 @@ def to_gbq( If an invalid value is provided for ``if_exists`` that is not one of ``fail``, ``replace``, or ``append``. - """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -530,8 +506,6 @@ def to_parquet( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> gcs_bucket = "gs://bigframes-dev-testing/sample_parquet*.parquet" >>> df.to_parquet(path=gcs_bucket) @@ -586,9 +560,6 @@ def to_dict( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.to_dict() {'col1': {np.int64(0): 1, np.int64(1): 2}, 'col2': {np.int64(0): 3, np.int64(1): 4}} @@ -666,12 +637,17 @@ def to_excel( **Examples:** - >>> import bigframes.pandas as bpd >>> import tempfile - >>> bpd.options.display.progress_bar = None + >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) - >>> df.to_excel(tempfile.TemporaryFile()) + + >>> try: + ... import openpyxl + ... df.to_excel(tempfile.TemporaryFile()) + ... + ... except ImportError: + ... pass # openpyxl is required. 
Args: excel_writer (path-like, file-like, or ExcelWriter object): @@ -703,9 +679,6 @@ def to_latex( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> print(df.to_latex()) \begin{tabular}{lrr} @@ -754,9 +727,6 @@ def to_records( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.to_records() rec.array([(0, 1, 3), (1, 2, 4)], @@ -814,9 +784,6 @@ def to_string( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> print(df.to_string()) col1 col2 @@ -914,9 +881,6 @@ def to_html( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> print(df.to_html()) @@ -1024,9 +988,6 @@ def to_markdown( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> print(df.to_markdown()) | | col1 | col2 | @@ -1058,9 +1019,6 @@ def to_pickle(self, path, *, allow_large_results, **kwargs) -> None: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> gcs_bucket = "gs://bigframes-dev-testing/sample_pickle_gcs.pkl" >>> df.to_pickle(path=gcs_bucket) @@ -1080,9 +1038,6 @@ def to_orc(self, path=None, *, allow_large_results=None, **kwargs) -> bytes | No **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> import tempfile >>> df.to_orc(tempfile.TemporaryFile()) @@ -1190,9 +1145,6 @@ def insert(self, loc, column, value, allow_duplicates=False): **Examples:** - >>> 
import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) Insert a new column named 'col3' between 'col1' and 'col2' with all entries set to 5. @@ -1243,9 +1195,6 @@ def drop( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame(np.arange(12).reshape(3, 4), ... columns=['A', 'B', 'C', 'D']) >>> df @@ -1284,7 +1233,6 @@ def drop( Drop columns and/or rows of MultiIndex DataFrame: - >>> import pandas as pd >>> midx = pd.MultiIndex(levels=[['llama', 'cow', 'falcon'], ... ['speed', 'weight', 'length']], ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], @@ -1369,7 +1317,6 @@ def align( Join method is specified for each axis Index. - Args: other (DataFrame or Series): join ({'outer', 'inner', 'left', 'right'}, default 'outer'): @@ -1402,9 +1349,6 @@ def rename( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) >>> df A B @@ -1474,9 +1418,6 @@ def set_index( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'month': [1, 4, 7, 10], ... 'year': [2012, 2014, 2013, 2014], ... 'sale': [55, 40, 84, 31]}) @@ -1616,10 +1557,6 @@ def reset_index( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - - >>> import numpy as np >>> df = bpd.DataFrame([('bird', 389.0), ... ('bird', 24.0), ... ('mammal', 80.5), @@ -1659,7 +1596,6 @@ class max_speed You can also use ``reset_index`` with ``MultiIndex``. - >>> import pandas as pd >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'), ... ('bird', 'parrot'), ... ('mammal', 'lion'), @@ -1700,7 +1636,6 @@ class name speed max [4 rows x 2 columns] - Args: level (int, str, tuple, or list, default None): Only remove the given levels from the index. 
Removes all levels by @@ -1795,12 +1730,9 @@ def dropna( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], ... "toy": [np.nan, 'Batmobile', 'Bullwhip'], - ... "born": [bpd.NA, "1940-04-25", bpd.NA]}) + ... "born": [pd.NA, "1940-04-25", pd.NA]}) >>> df name toy born 0 Alfred @@ -1889,7 +1821,6 @@ def dropna( ignore_index (bool, default ``False``): If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. - Returns: bigframes.pandas.DataFrame: DataFrame with NA entries dropped from it. @@ -1908,9 +1839,6 @@ def isin(self, values): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]}, ... index=['falcon', 'dog']) >>> df @@ -1964,9 +1892,6 @@ def keys(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -1985,8 +1910,6 @@ def iterrows(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -2011,8 +1934,6 @@ def itertuples(self, index: bool = True, name: str | None = "Pandas"): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -2044,9 +1965,6 @@ def items(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'species': ['bear', 'bear', 'marsupial'], ... 'population': [1864, 22000, 80000]}, ... 
index=['panda', 'polar', 'koala']) @@ -2085,9 +2003,6 @@ def where(self, cond, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'a': [20, 10, 0], 'b': [0, 10, 20]}) >>> df a b @@ -2177,9 +2092,6 @@ def mask(self, cond, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'a': [20, 10, 0], 'b': [0, 10, 20]}) >>> df a b @@ -2280,11 +2192,8 @@ def sort_values( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ - ... 'col1': ['A', 'A', 'B', bpd.NA, 'D', 'C'], + ... 'col1': ['A', 'A', 'B', pd.NA, 'D', 'C'], ... 'col2': [2, 1, 9, 8, 7, 4], ... 'col3': [0, 1, 9, 4, 2, 3], ... 'col4': ['a', 'B', 'c', 'D', 'e', 'F'] @@ -2424,9 +2333,6 @@ def eq(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can use method name: >>> df = bpd.DataFrame({'angles': [0, 3, 4], @@ -2467,9 +2373,6 @@ def __eq__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'a': [0, 3, 4], ... 'b': [360, 0, 180] @@ -2498,9 +2401,6 @@ def __invert__(self) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'a':[True, False, True], 'b':[-1, 0, 1]}) >>> ~df a b @@ -2527,9 +2427,6 @@ def ne(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can use method name: >>> df = bpd.DataFrame({'angles': [0, 3, 4], @@ -2569,9 +2466,6 @@ def __ne__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'a': [0, 3, 4], ... 
'b': [360, 0, 180] @@ -2609,9 +2503,6 @@ def le(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can use method name: >>> df = bpd.DataFrame({'angles': [0, 3, 4], @@ -2652,9 +2543,6 @@ def __le__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'a': [0, -1, 1], ... 'b': [1, 0, -1] @@ -2692,9 +2580,6 @@ def lt(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can use method name: >>> df = bpd.DataFrame({'angles': [0, 3, 4], @@ -2735,9 +2620,6 @@ def __lt__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'a': [0, -1, 1], ... 'b': [1, 0, -1] @@ -2775,9 +2657,6 @@ def ge(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can use method name: >>> df = bpd.DataFrame({'angles': [0, 3, 4], @@ -2818,9 +2697,6 @@ def __ge__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'a': [0, -1, 1], ... 'b': [1, 0, -1] @@ -2858,9 +2734,6 @@ def gt(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'angles': [0, 3, 4], ... 'degrees': [360, 180, 360]}, ... index=['circle', 'triangle', 'rectangle']) @@ -2899,9 +2772,6 @@ def __gt__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'a': [0, -1, 1], ... 
'b': [1, 0, -1] @@ -2936,9 +2806,6 @@ def add(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -2980,9 +2847,6 @@ def __add__(self, other) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'height': [1.5, 2.6], ... 'weight': [500, 800] @@ -3055,9 +2919,6 @@ def radd(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3118,9 +2979,6 @@ def sub(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3162,9 +3020,6 @@ def __sub__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can subtract a scalar: >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -3210,9 +3065,6 @@ def rsub(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3271,9 +3123,6 @@ def mul(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 
'B': [4, 5, 6], @@ -3315,9 +3164,6 @@ def __mul__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can multiply with a scalar: >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -3363,9 +3209,6 @@ def rmul(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3407,9 +3250,6 @@ def __rmul__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can multiply with a scalar: >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -3455,9 +3295,6 @@ def truediv(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3499,9 +3336,6 @@ def __truediv__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can multiply with a scalar: >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -3547,9 +3381,6 @@ def rtruediv(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3608,9 +3439,6 @@ def floordiv(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 
'B': [4, 5, 6], @@ -3652,9 +3480,6 @@ def __floordiv__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can divide by a scalar: >>> df = bpd.DataFrame({"a": [15, 15, 15], "b": [30, 30, 30]}) @@ -3700,9 +3525,6 @@ def rfloordiv(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3761,9 +3583,6 @@ def mod(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3805,9 +3624,6 @@ def __mod__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can modulo with a scalar: >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -3853,9 +3669,6 @@ def rmod(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3915,9 +3728,6 @@ def pow(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3960,9 +3770,6 @@ def __pow__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can exponentiate with a scalar: >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -4009,9 +3816,6 @@ def rpow(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 
'B': [4, 5, 6], @@ -4105,9 +3909,6 @@ def combine( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df1 = bpd.DataFrame({'A': [0, 0], 'B': [4, 4]}) >>> df2 = bpd.DataFrame({'A': [1, 1], 'B': [3, 3]}) >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2 @@ -4155,9 +3956,6 @@ def combine_first(self, other) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df1 = bpd.DataFrame({'A': [None, 0], 'B': [None, 4]}) >>> df2 = bpd.DataFrame({'A': [1, 1], 'B': [3, 3]}) >>> df1.combine_first(df2) @@ -4185,10 +3983,6 @@ def explode( **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'A': [[0, 1, 2], [], [], [3, 4]], ... 'B': 1, ... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]}) @@ -4244,9 +4038,6 @@ def corr(self, method, min_periods, numeric_only) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'A': [1, 2, 3], ... 'B': [400, 500, 600], ... 'C': [0.8, 0.4, 0.9]}) @@ -4278,9 +4069,6 @@ def cov(self, *, numeric_only) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'A': [1, 2, 3], ... 'B': [400, 500, 600], ... 'C': [0.8, 0.4, 0.9]}) @@ -4317,9 +4105,6 @@ def corrwith( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> index = ["a", "b", "c", "d", "e"] >>> columns = ["one", "two", "three", "four"] >>> df1 = bpd.DataFrame(np.arange(20).reshape(5, 4), index=index, columns=columns) @@ -4353,9 +4138,6 @@ def update( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'A': [1, 2, 3], ... 
'B': [400, 500, 600]}) >>> new_df = bpd.DataFrame({'B': [4, 5, 6], @@ -4418,9 +4200,6 @@ def groupby( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'Animal': ['Falcon', 'Falcon', ... 'Parrot', 'Parrot'], ... 'Max Speed': [380., 370., 24., 26.]}) @@ -4515,17 +4294,18 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Let's use ``reuse=False`` flag to make sure a new ``remote_function`` is created every time we run the following code, but you can skip it to potentially reuse a previously deployed ``remote_function`` from the same user defined function. - >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") - ... def minutes_to_hours(x: int) -> float: - ... return x/60 + >>> def minutes_to_hours(x: int) -> float: + ... return x / 60 + >>> minutes_to_hours = bpd.deploy_remote_function( + ... minutes_to_hours, + ... reuse=False, + ... cloud_function_service_account="default", + ... ) # doctest: +SKIP >>> df_minutes = bpd.DataFrame( ... {"system_minutes" : [0, 30, 60, 90, 120], @@ -4540,8 +4320,8 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: [5 rows x 2 columns] - >>> df_hours = df_minutes.map(minutes_to_hours) - >>> df_hours + >>> df_hours = df_minutes.map(minutes_to_hours) # doctest: +SKIP + >>> df_hours # doctest: +SKIP system_minutes user_minutes 0 0.0 0.0 1 0.5 0.25 @@ -4557,11 +4337,11 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: >>> df_minutes = bpd.DataFrame( ... { - ... "system_minutes" : [0, 30, 60, None, 90, 120, bpd.NA], - ... "user_minutes" : [0, 15, 75, 90, 6, None, bpd.NA] + ... "system_minutes" : [0, 30, 60, None, 90, 120, pd.NA], + ... "user_minutes" : [0, 15, 75, 90, 6, None, pd.NA] ... 
}, dtype="Int64") - >>> df_hours = df_minutes.map(minutes_to_hours, na_action='ignore') - >>> df_hours + >>> df_hours = df_minutes.map(minutes_to_hours, na_action='ignore') # doctest: +SKIP + >>> df_hours # doctest: +SKIP system_minutes user_minutes 0 0.0 0.0 1 0.5 0.25 @@ -4612,9 +4392,6 @@ def join( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Join two DataFrames by specifying how to handle the operation: >>> df1 = bpd.DataFrame({'col1': ['foo', 'bar'], 'col2': [1, 2]}, index=[10, 11]) @@ -4668,7 +4445,6 @@ def join( [1 rows x 4 columns] - Another option to join using the key columns is to use the on parameter: >>> df1.join(df2, on="col2", how="right") @@ -4764,9 +4540,6 @@ def merge( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Merge DataFrames df1 and df2 by specifying type of merge: >>> df1 = bpd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) @@ -4897,7 +4670,6 @@ def round(self, decimals): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)], ... columns=['dogs', 'cats']) >>> df @@ -4980,10 +4752,6 @@ def apply(self, func, *, axis=0, args=(), **kwargs): **Examples:** - >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df col1 col2 @@ -5008,14 +4776,14 @@ def apply(self, func, *, axis=0, args=(), **kwargs): to select only the necessary columns before calling `apply()`. Note: This feature is currently in **preview**. - >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") # doctest: +SKIP ... def foo(row: pd.Series) -> int: ... result = 1 ... result += row["col1"] ... result += row["col2"]*row["col2"] ... 
return result - >>> df[["col1", "col2"]].apply(foo, axis=1) + >>> df[["col1", "col2"]].apply(foo, axis=1) # doctest: +SKIP 0 11 1 19 dtype: Int64 @@ -5023,7 +4791,7 @@ def apply(self, func, *, axis=0, args=(), **kwargs): You could return an array output for every input row from the remote function. - >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") # doctest: +SKIP ... def marks_analyzer(marks: pd.Series) -> list[float]: ... import statistics ... average = marks.mean() @@ -5040,8 +4808,8 @@ def apply(self, func, *, axis=0, args=(), **kwargs): ... "chemistry": [88, 56, 72], ... "algebra": [78, 91, 79] ... }, index=["Alice", "Bob", "Charlie"]) - >>> stats = df.apply(marks_analyzer, axis=1) - >>> stats + >>> stats = df.apply(marks_analyzer, axis=1) # doctest: +SKIP + >>> stats # doctest: +SKIP Alice [77.67 78. 77.19 76.71] Bob [75.67 80. 74.15 72.56] Charlie [75.33 75. 75.28 75.22] @@ -5064,14 +4832,14 @@ def apply(self, func, *, axis=0, args=(), **kwargs): [2 rows x 3 columns] - >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") # doctest: +SKIP ... def foo(x: int, y: int, z: int) -> float: ... result = 1 ... result += x ... result += y/z ... 
return result - >>> df.apply(foo, axis=1) + >>> df.apply(foo, axis=1) # doctest: +SKIP 0 2.6 1 3.8 dtype: Float64 @@ -5131,9 +4899,6 @@ def any(self, *, axis=0, bool_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [True, True], "B": [False, False]}) >>> df A B @@ -5178,9 +4943,6 @@ def all(self, axis=0, *, bool_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [True, True], "B": [False, False]}) >>> df A B @@ -5222,8 +4984,6 @@ def prod(self, axis=0, *, numeric_only: bool = False): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [1, 2, 3], "B": [4.5, 5.5, 6.5]}) >>> df A B @@ -5268,9 +5028,6 @@ def min(self, axis=0, *, numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df A B @@ -5313,9 +5070,6 @@ def max(self, axis=0, *, numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df A B @@ -5357,9 +5111,6 @@ def sum(self, axis=0, *, numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df A B @@ -5399,9 +5150,6 @@ def mean(self, axis=0, *, numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df A B @@ -5442,8 +5190,6 @@ def median(self, *, numeric_only: bool = False, exact: bool = True): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> 
df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df A B @@ -5480,7 +5226,6 @@ def quantile( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), ... columns=['a', 'b']) >>> df.quantile(.1) @@ -5517,9 +5262,6 @@ def var(self, axis=0, *, numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df A B @@ -5542,7 +5284,6 @@ def var(self, axis=0, *, numeric_only: bool = False): 1 0.5 dtype: Float64 - Args: axis ({index (0), columns (1)}): Axis for the function to be applied on. @@ -5562,9 +5303,6 @@ def skew(self, *, numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'A': [1, 2, 3, 4, 5], ... 'B': [5, 4, 3, 2, 1], ... 'C': [2, 2, 3, 2, 2]}) @@ -5603,9 +5341,6 @@ def kurt(self, *, numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [1, 2, 3, 4, 5], ... "B": [3, 4, 3, 2, 1], ... "C": [2, 2, 3, 2, 2]}) @@ -5643,9 +5378,6 @@ def std(self, *, numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [1, 2, 3, 4, 5], ... "B": [3, 4, 3, 2, 1], ... "C": [2, 2, 3, 2, 2]}) @@ -5685,9 +5417,6 @@ def count(self, *, numeric_only: bool = False): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [1, None, 3, 4, 5], ... "B": [1, 2, 3, 4, 5], ... 
"C": [None, 3.5, None, 4.5, 5.0]}) @@ -5739,8 +5468,6 @@ def nlargest(self, n: int, columns, keep: str = "first"): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [1, 1, 3, 3, 5, 5], ... "B": [5, 6, 3, 4, 1, 2], ... "C": ['a', 'b', 'a', 'b', 'a', 'b']}) @@ -5831,8 +5558,6 @@ def nsmallest(self, n: int, columns, keep: str = "first"): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [1, 1, 3, 3, 5, 5], ... "B": [5, 6, 3, 4, 1, 2], ... "C": ['a', 'b', 'a', 'b', 'a', 'b']}) @@ -5880,7 +5605,6 @@ def nsmallest(self, n: int, columns, keep: str = "first"): [1 rows x 3 columns] - Args: n (int): Number of rows to return. @@ -5912,9 +5636,6 @@ def idxmin(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -5942,9 +5663,6 @@ def idxmax(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -5976,9 +5694,6 @@ def melt(self, id_vars, value_vars, var_name, value_name): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [1, None, 3, 4, 5], ... "B": [1, 2, 3, 4, 5], ... "C": [None, 3.5, None, 4.5, 5.0]}) @@ -6027,7 +5742,6 @@ def melt(self, id_vars, value_vars, var_name, value_name): [10 rows x 3 columns] - Args: id_vars (tuple, list, or ndarray, optional): Column(s) to use as identifier variables. 
@@ -6051,9 +5765,6 @@ def nunique(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 2]}) >>> df A B @@ -6080,9 +5791,6 @@ def cummin(self) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -6112,9 +5820,6 @@ def cummax(self) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -6144,9 +5849,6 @@ def cumsum(self) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -6181,9 +5883,6 @@ def cumprod(self) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -6222,9 +5921,6 @@ def diff( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -6270,9 +5966,6 @@ def agg(self, func): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -6335,8 +6028,6 @@ def describe(self, include: None | Literal["all"] = None): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [0, 2, 8], "C": ["cat", "cat", "dog"]}) >>> df A B C @@ -6359,7 +6050,6 @@ def describe(self, include: None | Literal["all"] = None): [8 rows x 2 columns] - Using describe with include = "all": >>> df.describe(include="all") A B C @@ -6406,9 +6096,6 @@ def pivot(self, *, 
columns, index=None, values=None): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... "foo": ["one", "one", "one", "two", "two"], ... "bar": ["A", "B", "C", "A", "B"], @@ -6477,8 +6164,6 @@ def pivot_table(self, values=None, index=None, columns=None, aggfunc="mean"): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'Product': ['Product A', 'Product B', 'Product A', 'Product B', 'Product A', 'Product B'], ... 'Region': ['East', 'West', 'East', 'West', 'West', 'East'], @@ -6569,9 +6254,6 @@ def stack(self, level=-1): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'A': [1, 3], 'B': [2, 4]}, index=['foo', 'bar']) >>> df A B @@ -6608,9 +6290,6 @@ def unstack(self, level=-1): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'A': [1, 3], 'B': [2, 4]}, index=['foo', 'bar']) >>> df A B @@ -6649,9 +6328,6 @@ def index(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can access the index of a DataFrame via ``index`` property. >>> df = bpd.DataFrame({'Name': ['Alice', 'Bob', 'Aritra'], @@ -6702,9 +6378,6 @@ def columns(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can access the column labels of a DataFrame via ``columns`` property. >>> df = bpd.DataFrame({'Name': ['Alice', 'Bob', 'Aritra'], @@ -6750,11 +6423,8 @@ def value_counts( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'num_legs': [2, 4, 4, 6, 7], - ... 'num_wings': [2, 0, 0, 0, bpd.NA]}, + ... 'num_wings': [2, 0, 0, 0, pd.NA]}, ... index=['falcon', 'dog', 'cat', 'ant', 'octopus'], ... 
dtype='Int64') >>> df @@ -6831,9 +6501,6 @@ def eval(self, expr: str) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) >>> df A B @@ -6891,7 +6558,6 @@ def eval(self, expr: str) -> DataFrame: [5 rows x 4 columns] - Args: expr (str): The expression string to evaluate. @@ -6907,9 +6573,6 @@ def query(self, expr: str) -> DataFrame | None: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'A': range(1, 6), ... 'B': range(10, 0, -2), ... 'C C': range(10, 5, -1)}) @@ -6982,9 +6645,6 @@ def interpolate(self, method: str = "linear"): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3, None, None, 6], ... 'B': [None, 6, None, 2, None, 3], @@ -7032,9 +6692,6 @@ def fillna(self, value): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame([[np.nan, 2, np.nan, 0], ... [3, 4, np.nan, 1], ... [np.nan, np.nan, np.nan, np.nan], @@ -7110,8 +6767,6 @@ def replace( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'int_col': [1, 1, 2, 3], ... 'string_col': ["a", "b", "c", "b"], @@ -7150,7 +6805,6 @@ def replace( [4 rows x 2 columns] - Args: to_replace (str, regex, list, int, float or None): How to find the values that will be replaced. @@ -7206,9 +6860,6 @@ def iat(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], ... 
columns=['A', 'B', 'C']) >>> df @@ -7240,9 +6891,6 @@ def at(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], ... index=[4, 5, 6], columns=['A', 'B', 'C']) >>> df @@ -7289,9 +6937,6 @@ def dot(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> left = bpd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]]) >>> left 0 1 2 3 @@ -7383,9 +7028,6 @@ def __matmul__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> left = bpd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]]) >>> left 0 1 2 3 @@ -7443,9 +7085,6 @@ def __len__(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'a': [0, 1, 2], ... 'b': [3, 4, 5] @@ -7466,10 +7105,6 @@ def __array__(self, dtype=None, copy: Optional[bool] = None): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> import numpy as np - >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [11, 22, 33]}) >>> np.array(df) @@ -7501,9 +7136,6 @@ def __getitem__(self, key): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... "name" : ["alpha", "beta", "gamma"], ... "age": [20, 30, 40], @@ -7547,7 +7179,6 @@ def __getitem__(self, key): You can specify a pandas Index with desired column labels. - >>> import pandas as pd >>> df[pd.Index(["age", "location"])] age location 0 20 WA @@ -7576,9 +7207,6 @@ def __setitem__(self, key, value): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... "name" : ["alpha", "beta", "gamma"], ... 
"age": [20, 30, 40], diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 273339efcf..e8079e573b 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -38,9 +38,6 @@ def size(self) -> int: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series({'a': 1, 'b': 2, 'c': 3}) >>> s.size 3 @@ -65,9 +62,6 @@ def __iter__(self) -> Iterator: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -106,9 +100,6 @@ def astype(self, dtype): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Create a DataFrame: >>> d = {'col1': [1, 2], 'col2': [3, 4]} @@ -152,7 +143,7 @@ def astype(self, dtype): Note that this is equivalent of using ``to_datetime`` with ``unit='us'``: - >>> bpd.to_datetime(ser, unit='us', utc=True) + >>> bpd.to_datetime(ser, unit='us', utc=True) # doctest: +SKIP 0 2034-02-08 11:13:20.246789+00:00 1 2021-06-19 17:20:44.123101+00:00 2 2003-06-05 17:30:34.120101+00:00 @@ -350,9 +341,6 @@ def get(self, key, default=None): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame( ... [ ... [24.3, 75.7, "high"], @@ -461,9 +449,6 @@ def head(self, n: int = 5): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion', ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']}) >>> df @@ -562,8 +547,6 @@ def sample( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'num_legs': [2, 4, 8, 0], ... 'num_wings': [2, 0, 0, 0], ... 
'num_specimen_seen': [10, 2, 1, 8]}, @@ -643,9 +626,6 @@ def dtypes(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'float': [1.0], 'int': [1], 'string': ['foo']}) >>> df.dtypes float Float64 @@ -668,9 +648,6 @@ def copy(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Modification in the original Series will not affect the copy Series: >>> s = bpd.Series([1, 2], index=["a", "b"]) @@ -741,10 +718,6 @@ def ffill(self, *, limit: Optional[int] = None): **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame([[np.nan, 2, np.nan, 0], ... [3, 4, np.nan, 1], ... [np.nan, np.nan, np.nan, np.nan], @@ -770,7 +743,6 @@ def ffill(self, *, limit: Optional[int] = None): [4 rows x 4 columns] - Fill NA/NaN values in Series: >>> series = bpd.Series([1, np.nan, 2, 3]) @@ -790,7 +762,6 @@ def ffill(self, *, limit: Optional[int] = None): maximum number of entries along the entire axis where NaNs will be filled. Must be greater than 0 if not None. - Returns: bigframes.pandas.DataFrame or bigframes.pandas.Series or None: Object with missing values filled. @@ -825,13 +796,9 @@ def isna(self) -> NDFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> import numpy as np - >>> df = bpd.DataFrame(dict( ... age=[5, 6, np.nan], - ... born=[bpd.NA, "1940-04-25", "1940-04-25"], + ... born=[pd.NA, "1940-04-25", "1940-04-25"], ... name=['Alfred', 'Batman', ''], ... toy=[None, 'Batmobile', 'Joker'], ... 
)) @@ -863,7 +830,7 @@ def isna(self) -> NDFrame: Show which entries in a Series are NA: - >>> ser = bpd.Series([5, None, 6, np.nan, bpd.NA]) + >>> ser = bpd.Series([5, None, 6, np.nan, pd.NA]) >>> ser 0 5 1 @@ -1068,8 +1035,6 @@ def rolling( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([0,1,2,3,4]) >>> s.rolling(window=3).min() 0 @@ -1154,10 +1119,6 @@ def pipe( Constructing a income DataFrame from a dictionary. - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> data = [[8000, 1000], [9500, np.nan], [5000, 2000]] >>> df = bpd.DataFrame(data, columns=['Salary', 'Others']) >>> df diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index 1e39ec8f94..8dba97ff07 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -45,8 +45,6 @@ def describe(self, include: None | Literal["all"] = None): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [1, 1, 1, 2, 2], "B": [0, 2, 8, 2, 7], "C": ["cat", "cat", "dog", "mouse", "cat"]}) >>> df A B C @@ -86,8 +84,6 @@ def any(self): For SeriesGroupBy: >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([1, 2, 0], index=lst) >>> ser.groupby(level=0).any() @@ -125,8 +121,6 @@ def all(self): For SeriesGroupBy: >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([1, 2, 0], index=lst) >>> ser.groupby(level=0).all() @@ -163,10 +157,6 @@ def count(self): For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'b'] 
>>> ser = bpd.Series([1, 2, np.nan], index=lst) >>> ser.groupby(level=0).count() @@ -202,9 +192,6 @@ def mean( **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'A': [1, 1, 2, 1, 2], ... 'B': [np.nan, 2, 3, 4, 5], ... 'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C']) @@ -263,9 +250,6 @@ def median( For SeriesGroupBy: >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 4, 3, 3], index=lst) >>> ser.groupby(level=0).median() @@ -304,7 +288,6 @@ def quantile(self, q=0.5, *, numeric_only: bool = False): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame([ ... ['a', 1], ['a', 2], ['a', 3], ... ['b', 1], ['b', 3], ['b', 5] @@ -343,10 +326,6 @@ def std( For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 4, 3, 3], index=lst) >>> ser.groupby(level=0).std() @@ -390,10 +369,6 @@ def var( For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 4, 3, 3], index=lst) >>> ser.groupby(level=0).var() @@ -435,9 +410,6 @@ def rank( **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame( ... { ... "group": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], @@ -510,10 +482,6 @@ def skew( For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> ser = bpd.Series([390., 350., 357., np.nan, 22., 20., 30.], ... 
index=['Falcon', 'Falcon', 'Falcon', 'Falcon', ... 'Parrot', 'Parrot', 'Parrot'], @@ -546,9 +514,6 @@ def kurt( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'] >>> ser = bpd.Series([0, 1, 1, 0, 0, 1, 2, 4, 5], index=lst) >>> ser.groupby(level=0).kurt() @@ -579,9 +544,6 @@ def kurtosis( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'] >>> ser = bpd.Series([0, 1, 1, 0, 0, 1, 2, 4, 5], index=lst) >>> ser.groupby(level=0).kurtosis() @@ -606,9 +568,8 @@ def first(self, numeric_only: bool = False, min_count: int = -1): Defaults to skipping NA elements. **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None + >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame(dict(A=[1, 1, 3], B=[None, 5, 6], C=[1, 2, 3])) >>> df.groupby("A").first() B C @@ -647,8 +608,6 @@ def last(self, numeric_only: bool = False, min_count: int = -1): Defaults to skipping NA elements. 
**Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame(dict(A=[1, 1, 3], B=[5, None, 6], C=[1, 2, 3])) >>> df.groupby("A").last() @@ -685,9 +644,6 @@ def sum( For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) >>> ser.groupby(level=0).sum() @@ -730,10 +686,6 @@ def prod(self, numeric_only: bool = False, min_count: int = 0): For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) >>> ser.groupby(level=0).prod() @@ -766,10 +718,6 @@ def min( For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) >>> ser.groupby(level=0).min() @@ -815,9 +763,6 @@ def max( For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) >>> ser.groupby(level=0).max() @@ -859,9 +804,6 @@ def cumcount(self, ascending: bool = True): For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'b', 'b', 'c'] >>> ser = bpd.Series([5, 1, 2, 3, 4], index=lst) >>> ser.groupby(level=0).cumcount() @@ -897,10 +839,6 @@ def cumprod(self, *args, **kwargs): For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([6, 2, 0], index=lst) >>> ser.groupby(level=0).cumprod() @@ -936,10 +874,6 @@ def cumsum(self, *args, **kwargs): For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> 
bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([6, 2, 0], index=lst) >>> ser.groupby(level=0).cumsum() @@ -975,10 +909,6 @@ def cummin(self, *args, numeric_only: bool = False, **kwargs): For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([6, 2, 0], index=lst) >>> ser.groupby(level=0).cummin() @@ -1014,10 +944,6 @@ def cummax(self, *args, numeric_only: bool = False, **kwargs): For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([6, 2, 0], index=lst) >>> ser.groupby(level=0).cummax() @@ -1055,10 +981,6 @@ def diff(self): For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 4, 3, 3], index=lst) >>> ser.groupby(level=0).diff() @@ -1101,10 +1023,6 @@ def shift(self, periods: int = 1): For SeriesGroupBy: - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) >>> ser.groupby(level=0).shift(1) @@ -1145,9 +1063,6 @@ def rolling(self, *args, **kwargs): **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'a', 'a', 'e'] >>> ser = bpd.Series([1, 0, -2, -1, 2], index=lst) >>> ser.groupby(level=0).rolling(2).min() @@ -1204,9 +1119,6 @@ def expanding(self, *args, **kwargs): **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'c', 'c', 'e'] >>> ser = bpd.Series([1, 0, -2, -1, 2], index=lst) >>> ser.groupby(level=0).expanding().min() @@ -1230,9 
+1142,6 @@ def head(self, n: int = 5): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame([[1, 2], [1, 4], [5, 6]], ... columns=['A', 'B']) >>> df.groupby('A').head(1) @@ -1259,9 +1168,6 @@ def size(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - For SeriesGroupBy: >>> lst = ['a', 'a', 'b'] @@ -1313,9 +1219,6 @@ def __iter__(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - For SeriesGroupBy: >>> lst = ["a", "a", "b"] @@ -1377,10 +1280,6 @@ def agg(self, func): **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3, 4], index=[1, 1, 2, 2]) >>> s.groupby(level=0).agg(['min', 'max']) min max @@ -1410,10 +1309,6 @@ def aggregate(self, func): **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3, 4], index=[1, 1, 2, 2]) >>> s.groupby(level=0).aggregate(['min', 'max']) min max @@ -1443,10 +1338,6 @@ def nunique(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 3], index=lst) >>> ser.groupby(level=0).nunique() @@ -1494,10 +1385,6 @@ def agg(self, func, **kwargs): **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> data = {"A": [1, 1, 2, 2], ... "B": [1, 2, 3, 4], ... "C": [0.362838, 0.227877, 1.267767, -0.562860]} @@ -1554,10 +1441,6 @@ def aggregate(self, func, **kwargs): **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> data = {"A": [1, 1, 2, 2], ... "B": [1, 2, 3, 4], ... 
"C": [0.362838, 0.227877, 1.267767, -0.562860]} @@ -1614,10 +1497,6 @@ def nunique(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam', ... 'ham', 'ham'], ... 'value1': [1, 5, 5, 2, 5, 5], @@ -1650,10 +1529,6 @@ def value_counts( **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({ ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], diff --git a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py index 0dd487d056..0e74b3e178 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py @@ -12,9 +12,6 @@ def day(self): **Examples:** - >>> import pandas as pd - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="D") ... ) @@ -42,9 +39,6 @@ def dayofweek(self): **Examples:** - >>> import pandas as pd - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range('2016-12-31', '2017-01-08', freq='D').to_series() ... ) @@ -76,9 +70,6 @@ def day_of_week(self): **Examples:** - >>> import pandas as pd - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range('2016-12-31', '2017-01-08', freq='D').to_series() ... ) @@ -106,9 +97,7 @@ def dayofyear(self): **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range('2016-12-28', '2017-01-03', freq='D').to_series() ... 
) @@ -134,9 +123,7 @@ def day_of_year(self): **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range('2016-12-28', '2017-01-03', freq='D').to_series() ... ) @@ -168,7 +155,6 @@ def date(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%d/%m/%Y %H:%M:%S%Ez") >>> s @@ -189,9 +175,7 @@ def hour(self): **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="h") ... ) @@ -215,9 +199,7 @@ def minute(self): **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="min") ... ) @@ -241,9 +223,6 @@ def month(self): **Examples:** - >>> import pandas as pd - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="M") ... ) @@ -267,9 +246,6 @@ def isocalendar(self): **Examples:** - >>> import pandas as pd - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range('2009-12-27', '2010-01-04', freq='d').to_series() ... ) @@ -287,11 +263,9 @@ def isocalendar(self): [9 rows x 3 columns] - Returns: DataFrame With columns year, week and day. - """ @property @@ -300,9 +274,7 @@ def second(self): **Examples:** - >>> import pandas as pd >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="s") ... 
) @@ -331,7 +303,6 @@ def time(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") >>> s @@ -353,7 +324,6 @@ def quarter(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "4/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") >>> s @@ -374,9 +344,6 @@ def year(self): **Examples:** - >>> import pandas as pd - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="Y") ... ) @@ -400,9 +367,6 @@ def days(self): **Examples:** - >>> import pandas as pd - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([pd.Timedelta("4d3m2s1us")]) >>> s 0 4 days 00:03:02.000001 @@ -418,9 +382,6 @@ def seconds(self): **Examples:** - >>> import pandas as pd - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([pd.Timedelta("4d3m2s1us")]) >>> s 0 4 days 00:03:02.000001 @@ -436,9 +397,6 @@ def microseconds(self): **Examples:** - >>> import pandas as pd - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([pd.Timedelta("4d3m2s1us")]) >>> s 0 4 days 00:03:02.000001 @@ -453,9 +411,6 @@ def total_seconds(self): **Examples:** - >>> import pandas as pd - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([pd.Timedelta("1d1m1s1us")]) >>> s 0 1 days 00:01:01.000001 @@ -472,7 +427,6 @@ def tz(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = 
bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") >>> s @@ -495,7 +449,6 @@ def unit(self) -> str: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") >>> s diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index eba47fc1f9..04f7f5938d 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -32,9 +32,6 @@ def name(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([1, 2, 3], name='x') >>> idx Index([1, 2, 3], dtype='Int64', name='x') @@ -63,9 +60,6 @@ def values(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([1, 2, 3]) >>> idx Index([1, 2, 3], dtype='Int64') @@ -86,9 +80,6 @@ def ndim(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) >>> s 0 Ant @@ -121,9 +112,6 @@ def size(self) -> int: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - For Series: >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) @@ -156,9 +144,6 @@ def is_monotonic_increasing(self) -> bool: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bool(bpd.Index([1, 2, 3]).is_monotonic_increasing) True @@ -181,9 +166,6 @@ def is_monotonic_decreasing(self) -> bool: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bool(bpd.Index([3, 2, 1]).is_monotonic_decreasing) True @@ -206,9 +188,6 @@ def from_frame(cls, frame) -> Index: **Examples:** - >>> import 
bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame([['HI', 'Temp'], ['HI', 'Precip'], ... ['NJ', 'Temp'], ['NJ', 'Precip']], ... columns=['a', 'b']) @@ -246,9 +225,6 @@ def shape(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([1, 2, 3]) >>> idx Index([1, 2, 3], dtype='Int64') @@ -268,9 +244,6 @@ def nlevels(self) -> int: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> mi = bpd.MultiIndex.from_arrays([['a'], ['b'], ['c']]) >>> mi MultiIndex([('a', 'b', 'c')], @@ -290,9 +263,6 @@ def is_unique(self) -> bool: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([1, 5, 7, 7]) >>> idx.is_unique False @@ -313,9 +283,6 @@ def has_duplicates(self) -> bool: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([1, 5, 7, 7]) >>> bool(idx.has_duplicates) True @@ -336,9 +303,6 @@ def dtype(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([1, 2, 3]) >>> idx Index([1, 2, 3], dtype='Int64') @@ -364,9 +328,6 @@ def T(self) -> Index: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) >>> s 0 Ant @@ -403,9 +364,6 @@ def copy( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index(['a', 'b', 'c']) >>> new_idx = idx.copy() >>> idx is new_idx @@ -438,14 +396,10 @@ def astype(self, dtype): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([1, 2, 3]) >>> idx Index([1, 2, 3], dtype='Int64') - Args: dtype (str, data type, or pandas.ExtensionDtype): A dtype supported by BigQuery DataFrame 
include ``'boolean'``, @@ -487,9 +441,6 @@ def get_level_values(self, level) -> Index: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index(list('abc')) >>> idx Index(['a', 'b', 'c'], dtype='string') @@ -517,9 +468,6 @@ def to_series(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index(['Ant', 'Bear', 'Cow'], name='animal') By default, the original index and original name is reused. @@ -571,9 +519,6 @@ def isin(self, values): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([1,2,3]) >>> idx Index([1, 2, 3], dtype='Int64') @@ -611,9 +556,6 @@ def all(self) -> bool: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - True, because nonzero integers are considered True. >>> bool(bpd.Index([1, 2, 3]).all()) @@ -639,9 +581,6 @@ def any(self) -> bool: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> index = bpd.Index([0, 1, 2]) >>> bool(index.any()) True @@ -665,9 +604,6 @@ def min(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([3, 2, 1]) >>> int(idx.min()) 1 @@ -687,9 +623,6 @@ def max(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([3, 2, 1]) >>> int(idx.max()) 3 @@ -713,9 +646,6 @@ def argmin(self) -> int: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Consider dataset containing cereal calories >>> s = bpd.Series({'Corn Flakes': 100.0, 'Almond Delight': 110.0, @@ -750,9 +680,6 @@ def get_loc( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> unique_index = bpd.Index(list('abc')) >>> 
unique_index.get_loc('b') 1 @@ -794,9 +721,6 @@ def argmax(self) -> int: Consider dataset containing cereal calories - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series({'Corn Flakes': 100.0, 'Almond Delight': 110.0, ... 'Cinnamon Toast Crunch': 120.0, 'Cocoa Puff': 110.0}) >>> s @@ -828,9 +752,6 @@ def nunique(self) -> int: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 3, 5, 7, 7]) >>> s 0 1 @@ -860,9 +781,6 @@ def sort_values( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([10, 100, 1, 1000]) >>> idx Index([10, 100, 1, 1000], dtype='Int64') @@ -904,10 +822,6 @@ def value_counts( **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> index = bpd.Index([3, 1, 2, 3, 4, np.nan]) >>> index.value_counts() 3.0 2 @@ -961,10 +875,6 @@ def fillna(self, value) -> Index: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([np.nan, np.nan, 3]) >>> idx.fillna(0) Index([0.0, 0.0, 3.0], dtype='Float64') @@ -992,9 +902,6 @@ def rename(self, name, *, inplace): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index(['A', 'C', 'A', 'B'], name='score') >>> idx.rename('grade') Index(['A', 'C', 'A', 'B'], dtype='string', name='grade') @@ -1022,9 +929,6 @@ def drop(self, labels) -> Index: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index(['a', 'b', 'c']) >>> idx.drop(['a']) Index(['b', 'c'], dtype='string') @@ -1042,10 +946,6 @@ def dropna(self, how: typing.Literal["all", "any"] = "any"): **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> 
bpd.options.display.progress_bar = None - >>> idx = bpd.Index([1, np.nan, 3]) >>> idx.dropna() Index([1.0, 3.0], dtype='Float64') @@ -1070,11 +970,9 @@ def drop_duplicates(self, *, keep: str = "first"): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Generate an pandas.Index with duplicate values. + >>> import bigframes.pandas as bpd >>> idx = bpd.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo']) The keep parameter controls which duplicate values are removed. @@ -1113,8 +1011,6 @@ def unique(self, level: Hashable | int | None = None): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> idx = bpd.Index([1, 1, 2, 3, 3]) >>> idx.unique() Index([1, 2, 3], dtype='Int64') @@ -1134,8 +1030,6 @@ def item(self, *args, **kwargs): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1], index=['a']) >>> s.index.item() 'a' diff --git a/third_party/bigframes_vendored/pandas/core/indexes/datetimes.py b/third_party/bigframes_vendored/pandas/core/indexes/datetimes.py index 105a376728..973d5c763a 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/datetimes.py @@ -15,10 +15,6 @@ def year(self) -> base.Index: **Examples:** - >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.year Index([2025], dtype='Int64') @@ -31,10 +27,6 @@ def month(self) -> base.Index: **Examples:** - >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.month Index([2], dtype='Int64') @@ -47,10 +39,6 @@ def day(self) -> base.Index: **Examples:** - >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> 
bpd.options.display.progress_bar = None - >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.day Index([15], dtype='Int64') @@ -63,10 +51,6 @@ def day_of_week(self) -> base.Index: **Examples:** - >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.day_of_week Index([5], dtype='Int64') @@ -79,10 +63,6 @@ def dayofweek(self) -> base.Index: **Examples:** - >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.dayofweek Index([5], dtype='Int64') @@ -95,10 +75,6 @@ def weekday(self) -> base.Index: **Examples:** - >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> bpd.options.display.progress_bar = None - >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.weekday Index([5], dtype='Int64') diff --git a/third_party/bigframes_vendored/pandas/core/indexes/multi.py b/third_party/bigframes_vendored/pandas/core/indexes/multi.py index a882aa40e3..018e638de3 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/multi.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/multi.py @@ -25,8 +25,6 @@ def from_tuples( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> tuples = [(1, 'red'), (1, 'blue'), ... 
(2, 'red'), (2, 'blue')] >>> bpd.MultiIndex.from_tuples(tuples, names=('number', 'color')) @@ -62,8 +60,6 @@ def from_arrays( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] >>> bpd.MultiIndex.from_arrays(arrays, names=('number', 'color')) MultiIndex([(1, 'red'), diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index 697c17f23c..0f42433384 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -34,8 +34,6 @@ def cut( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([0, 1, 5, 10]) >>> s 0 0 @@ -73,7 +71,6 @@ def cut( Cut with pd.IntervalIndex, requires importing pandas for IntervalIndex: - >>> import pandas as pd >>> interval_index = pd.IntervalIndex.from_tuples([(0, 1), (1, 5), (5, 20)]) >>> bpd.cut(s, bins=interval_index) 0 diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 932959a826..c6ec5dfaf1 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -38,9 +38,6 @@ def dt(self): **Examples:** >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> bpd.options.display.progress_bar = None - >>> seconds_series = bpd.Series(pd.date_range("2000-01-01", periods=3, freq="s")) >>> seconds_series 0 2000-01-01 00:00:00 @@ -110,9 +107,6 @@ def index(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can access the index of a Series via ``index`` property. 
>>> df = bpd.DataFrame({'Name': ['Alice', 'Bob', 'Aritra'], @@ -161,13 +155,10 @@ def shape(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 4, 9, 16]) >>> s.shape (4,) - >>> s = bpd.Series(['Alice', 'Bob', bpd.NA]) + >>> s = bpd.Series(['Alice', 'Bob', pd.NA]) >>> s.shape (3,) """ @@ -180,9 +171,6 @@ def dtype(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3]) >>> s.dtype Int64Dtype() @@ -200,9 +188,6 @@ def name(self) -> Hashable: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - For a Series: >>> s = bpd.Series([1, 2, 3], dtype="Int64", name='Numbers') @@ -248,9 +233,6 @@ def hasnans(self) -> bool: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3, None]) >>> s 0 1.0 @@ -272,9 +254,6 @@ def T(self) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) >>> s 0 Ant @@ -297,9 +276,6 @@ def transpose(self) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) >>> s 0 Ant @@ -337,10 +313,6 @@ def reset_index( **Examples:** - >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3, 4], name='foo', ... 
index=['a', 'b', 'c', 'd']) >>> s.index.name = "idx" @@ -440,9 +412,6 @@ def keys(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3], index=[0, 1, 2]) >>> s.keys() Index([0, 1, 2], dtype='Int64') @@ -522,9 +491,6 @@ def to_markdown( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(["elk", "pig", "dog", "quetzal"], name="animal") >>> print(s.to_markdown()) | | animal | @@ -577,16 +543,14 @@ def to_dict( **Examples:** - >>> import bigframes.pandas as bpd >>> from collections import OrderedDict, defaultdict - >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3, 4]) >>> s.to_dict() {np.int64(0): 1, np.int64(1): 2, np.int64(2): 3, np.int64(3): 4} >>> s.to_dict(into=OrderedDict) - OrderedDict({np.int64(0): 1, np.int64(1): 2, np.int64(2): 3, np.int64(3): 4}) + OrderedDict([(np.int64(0), 1), (np.int64(1), 2), (np.int64(2), 3), (np.int64(3), 4)]) >>> dd = defaultdict(list) >>> s.to_dict(into=dd) @@ -617,9 +581,6 @@ def to_frame(self, name=None) -> DataFrame: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(["a", "b", "c"], ... 
name="vals") >>> s.to_frame() @@ -714,9 +675,6 @@ def tolist(self, *, allow_large_results: Optional[bool] = None) -> list: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3]) >>> s 0 1 @@ -748,10 +706,6 @@ def to_numpy( **Examples:** - >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> bpd.options.display.progress_bar = None - >>> ser = bpd.Series(pd.Categorical(['a', 'b', 'a'])) >>> ser.to_numpy() array(['a', 'b', 'a'], dtype=object) @@ -803,9 +757,6 @@ def to_pickle(self, path, *, allow_large_results=None, **kwargs): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> original_df = bpd.DataFrame({"foo": range(5), "bar": range(5, 10)}) >>> original_df foo bar @@ -865,9 +816,6 @@ def agg(self, func): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3, 4]) >>> s 0 1 @@ -902,10 +850,7 @@ def count(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.Series([0.0, 1.0, bpd.NA]) + >>> s = bpd.Series([0.0, 1.0, pd.NA]) >>> s 0 0.0 1 1.0 @@ -928,9 +873,6 @@ def nunique(self) -> int: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 3, 5, 7, 7]) >>> s 0 1 @@ -963,9 +905,6 @@ def unique(self, keep_order=True) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([2, 1, 3, 3], name='A') >>> s 0 2 @@ -1006,9 +945,6 @@ def mode(self) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([2, 4, 8, 2, 4, None]) >>> s.mode() 0 2.0 @@ -1031,11 +967,9 @@ def drop_duplicates( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Generate a 
Series with duplicated entries. + >>> import bigframes.pandas as bpd >>> s = bpd.Series(['llama', 'cow', 'llama', 'beetle', 'llama', 'hippo'], ... name='animal') >>> s @@ -1101,7 +1035,6 @@ def duplicated(self, keep="first") -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None By default, for each set of duplicated values, the first occurrence is set on False and all others on True: @@ -1172,9 +1105,6 @@ def idxmin(self) -> Hashable: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(data=[1, None, 4, 1], ... index=['A', 'B', 'C', 'D']) >>> s @@ -1201,9 +1131,6 @@ def idxmax(self) -> Hashable: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(data=[1, None, 4, 3, 4], ... index=['A', 'B', 'C', 'D', 'E']) >>> s @@ -1229,8 +1156,6 @@ def round(self, decimals: int = 0) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([0.1, 1.3, 2.7]) >>> s.round() 0 0.0 @@ -1262,9 +1187,6 @@ def explode(self, *, ignore_index: Optional[bool] = False) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([[1, 2, 3], [], [3, 4]]) >>> s 0 [1 2 3] @@ -1301,9 +1223,6 @@ def corr(self, other, method="pearson", min_periods=None) -> float: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s1 = bpd.Series([.2, .0, .6, .2]) >>> s2 = bpd.Series([.3, .6, .0, .1]) >>> s1.corr(s2) @@ -1340,8 +1259,6 @@ def autocorr(self, lag: int = 1) -> float: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([0.25, 0.5, 0.2, -0.05]) >>> s.autocorr() # doctest: +ELLIPSIS np.float64(0.10355263309024067) @@ -1377,9 +1294,6 @@ def cov( **Examples:** - >>> import 
bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s1 = bpd.Series([0.90010907, 0.13484424, 0.62036035]) >>> s2 = bpd.Series([0.12528585, 0.26962463, 0.51111198]) >>> s1.cov(s2) @@ -1403,12 +1317,8 @@ def diff(self) -> Series: Calculates the difference of a Series element compared with another element in the Series (default is element in previous row). - **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Difference with previous row >>> s = bpd.Series([1, 1, 2, 3, 5, 8]) @@ -1472,9 +1382,6 @@ def dot(self, other) -> Series | np.ndarray: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([0, 1, 2, 3]) >>> other = bpd.Series([-1, 2, -3, 4]) >>> s.dot(other) @@ -1496,7 +1403,6 @@ def dot(self, other) -> Series | np.ndarray: Series and each rows of other if other is a DataFrame or a numpy.ndarray between the Series and each columns of the numpy array. - """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1529,10 +1435,6 @@ def sort_values( **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([np.nan, 1, 3, 10, 5]) >>> s 0 @@ -1628,10 +1530,6 @@ def sort_index( **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, 4]) >>> s.sort_index() 1 c @@ -1690,8 +1588,6 @@ def nlargest( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> countries_population = {"Italy": 59000000, "France": 65000000, ... "Malta": 434000, "Maldives": 434000, ... 
"Brunei": 434000, "Iceland": 337000, @@ -1776,8 +1672,6 @@ def nsmallest(self, n: int = 5, keep: str = "first") -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> countries_population = {"Italy": 59000000, "France": 65000000, ... "Malta": 434000, "Maldives": 434000, ... "Brunei": 434000, "Iceland": 337000, @@ -1864,7 +1758,6 @@ def apply( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None For applying arbitrary python function a `remote_function` is recommended. Let's use ``reuse=False`` flag to make sure a new `remote_function` @@ -1872,9 +1765,13 @@ def apply( to potentially reuse a previously deployed `remote_function` from the same user defined function. - >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") - ... def minutes_to_hours(x: int) -> float: + >>> def minutes_to_hours(x: int) -> float: ... return x/60 + >>> bpd.deploy_remote_function( # doctest: +SKIP + ... minutes_to_hours, + ... reuse=False, + ... cloud_function_service_account="default", + ... ) >>> minutes = bpd.Series([0, 30, 60, 90, 120]) >>> minutes @@ -1885,7 +1782,7 @@ def apply( 4 120 dtype: Int64 - >>> hours = minutes.apply(minutes_to_hours) + >>> hours = minutes.apply(minutes_to_hours) # doctest: +SKIP >>> hours 0 0.0 1 0.5 @@ -1898,7 +1795,7 @@ def apply( a `remote_function`, you would provide the names of the packages via `packages` param. - >>> @bpd.remote_function( + >>> @bpd.remote_function( # doctest: +SKIP ... reuse=False, ... packages=["cryptography"], ... cloud_function_service_account="default" @@ -1915,11 +1812,11 @@ def apply( ... return f.encrypt(input.encode()).decode() >>> names = bpd.Series(["Alice", "Bob"]) - >>> hashes = names.apply(get_hash) + >>> hashes = names.apply(get_hash) # doctest: +SKIP You could return an array output from the remote function. 
- >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") # doctest: +SKIP ... def text_analyzer(text: str) -> list[int]: ... words = text.count(" ") + 1 ... periods = text.count(".") @@ -1932,8 +1829,8 @@ def apply( ... "I love this product! It's amazing.", ... "Hungry? Wanna eat? Lets go!" ... ]) - >>> features = texts.apply(text_analyzer) - >>> features + >>> features = texts.apply(text_analyzer) # doctest: +SKIP + >>> features # doctest: +SKIP 0 [9 1 0 0] 1 [6 1 1 0] 2 [5 0 1 2] @@ -2006,8 +1903,6 @@ def combine( **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None Consider 2 Datasets ``s1`` and ``s2`` containing highest clocked speeds of different birds. @@ -2065,9 +1960,6 @@ def groupby( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can group by a named index level. >>> s = bpd.Series([380, 370., 24., 26.], @@ -2089,7 +1981,6 @@ def groupby( You can also group by more than one index levels. - >>> import pandas as pd >>> s = bpd.Series([380, 370., 24., 26.], ... index=pd.MultiIndex.from_tuples( ... [("Falcon", "Clear"), @@ -2238,9 +2129,6 @@ def drop( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(data=np.arange(3), index=['A', 'B', 'C']) >>> s A 0 @@ -2256,7 +2144,6 @@ def drop( Drop 2nd level label in MultiIndex Series: - >>> import pandas as pd >>> midx = pd.MultiIndex(levels=[['llama', 'cow', 'falcon'], ... ['speed', 'weight', 'length']], ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], @@ -2322,7 +2209,6 @@ def reorder_levels(self, order: Sequence, axis) -> Series: axis ({0 or 'index', 1 or 'columns'}, default 0): For `Series` this parameter is unused and defaults to 0. 
- Returns: type of caller (new object) """ @@ -2369,10 +2255,6 @@ def interpolate(self, method: str = "linear"): **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - Filling in NaN in a Series via linear interpolation. >>> s = bpd.Series([0, 1, np.nan, 3]) @@ -2414,10 +2296,6 @@ def fillna( **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([np.nan, 2, np.nan, -1]) >>> s 0 @@ -2470,8 +2348,6 @@ def replace( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3, 4, 5]) >>> s 0 1 @@ -2596,10 +2472,6 @@ def dropna(self, *, axis=0, inplace: bool = False, how=None) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - Drop NA values from a Series: >>> ser = bpd.Series([1., 2., np.nan]) @@ -2616,7 +2488,7 @@ def dropna(self, *, axis=0, inplace: bool = False, how=None) -> Series: Empty strings are not considered NA values. ``None`` is considered an NA value. 
- >>> ser = bpd.Series(['2', bpd.NA, '', None, 'I stay'], dtype='object') + >>> ser = bpd.Series(['2', pd.NA, '', None, 'I stay'], dtype='object') >>> ser 0 2 1 @@ -2660,10 +2532,6 @@ def between( **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - Boundary values are included by default: >>> s = bpd.Series([2, 0, 4, 8, np.nan]) @@ -2719,10 +2587,6 @@ def case_when( **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> c = bpd.Series([6, 7, 8, 9], name="c") >>> a = bpd.Series([0, 0, 1, 2]) >>> b = bpd.Series([0, 3, 4, 5]) @@ -2789,9 +2653,6 @@ def cumprod(self): **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s 0 2.0 @@ -2826,10 +2687,6 @@ def cumsum(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s 0 2.0 @@ -2869,10 +2726,6 @@ def cummax(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s 0 2.0 @@ -2892,7 +2745,6 @@ def cummax(self): 4 5.0 dtype: Float64 - Returns: bigframes.pandas.Series: Return cumulative maximum of scalar or Series. 
@@ -2908,10 +2760,6 @@ def cummin(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s 0 2.0 @@ -2945,10 +2793,6 @@ def eq(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -2990,10 +2834,6 @@ def ne(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3037,10 +2877,6 @@ def le(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3083,10 +2919,6 @@ def lt(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3130,10 +2962,6 @@ def ge(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3177,10 +3005,6 @@ def gt(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3223,10 +3047,7 @@ def add(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - - >>> a = bpd.Series([1, 2, 3, bpd.NA]) + >>> a = bpd.Series([1, 2, 3, pd.NA]) >>> a 0 1 1 
2 @@ -3287,9 +3108,6 @@ def __add__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1.5, 2.6], index=['elk', 'moose']) >>> s elk 1.5 @@ -3339,10 +3157,6 @@ def radd(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3404,10 +3218,6 @@ def sub( **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3449,9 +3259,6 @@ def __sub__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1.5, 2.6], index=['elk', 'moose']) >>> s elk 1.5 @@ -3501,10 +3308,6 @@ def rsub(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3563,10 +3366,6 @@ def mul(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3609,9 +3408,6 @@ def __mul__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can multiply with a scalar: >>> s = bpd.Series([1, 2, 3]) @@ -3649,10 +3445,6 @@ def rmul(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3710,10 +3502,6 @@ def truediv(self, other) -> Series: **Examples:** - >>> import 
bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3756,9 +3544,6 @@ def __truediv__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can multiply with a scalar: >>> s = bpd.Series([1, 2, 3]) @@ -3796,10 +3581,6 @@ def rtruediv(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3858,10 +3639,6 @@ def floordiv(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3904,9 +3681,6 @@ def __floordiv__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can divide by a scalar: >>> s = bpd.Series([15, 30, 45]) @@ -3944,10 +3718,6 @@ def rfloordiv(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -4006,10 +3776,6 @@ def mod(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -4052,9 +3818,6 @@ def __mod__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can modulo with a scalar: >>> s = bpd.Series([1, 2, 3]) @@ -4091,10 +3854,6 @@ def rmod(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> 
bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -4155,9 +3914,6 @@ def pow(self, other) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -4201,9 +3957,6 @@ def __pow__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can exponentiate with a scalar: >>> s = bpd.Series([1, 2, 3]) @@ -4242,9 +3995,6 @@ def rpow(self, other) -> Series: **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -4304,10 +4054,6 @@ def divmod(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -4356,10 +4102,6 @@ def rdivmod(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -4411,10 +4153,6 @@ def combine_first(self, other) -> Series: **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> s1 = bpd.Series([1, np.nan]) >>> s2 = bpd.Series([3, 4, 5]) >>> s1.combine_first(s2) @@ -4453,11 +4191,6 @@ def update(self, other) -> None: **Examples:** - >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3]) >>> s.update(bpd.Series([4, 5, 6])) >>> s @@ -4547,10 +4280,6 @@ def any( **Examples:** - >>> import bigframes.pandas as 
bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - For Series input, the output is a scalar indicating whether any element is True. >>> bpd.Series([False, False]).any() @@ -4583,9 +4312,6 @@ def max( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Calculating the max of a Series: >>> s = bpd.Series([1, 3]) @@ -4599,7 +4325,7 @@ def max( Calculating the max of a Series containing ``NA`` values: - >>> s = bpd.Series([1, 3, bpd.NA]) + >>> s = bpd.Series([1, 3, pd.NA]) >>> s 0 1 1 3 @@ -4625,9 +4351,6 @@ def min( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Calculating the min of a Series: >>> s = bpd.Series([1, 3]) @@ -4641,7 +4364,7 @@ def min( Calculating the min of a Series containing ``NA`` values: - >>> s = bpd.Series([1, 3, bpd.NA]) + >>> s = bpd.Series([1, 3, pd.NA]) >>> s 0 1 1 3 @@ -4666,9 +4389,6 @@ def std( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({'person_id': [0, 1, 2, 3], ... 'age': [21, 25, 62, 43], ... 
'height': [1.61, 1.87, 1.49, 2.01]} @@ -4714,9 +4434,6 @@ def sum(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Calculating the sum of a Series: >>> s = bpd.Series([1, 3]) @@ -4730,7 +4447,7 @@ def sum(self): Calculating the sum of a Series containing ``NA`` values: - >>> s = bpd.Series([1, 3, bpd.NA]) + >>> s = bpd.Series([1, 3, pd.NA]) >>> s 0 1 1 3 @@ -4750,9 +4467,6 @@ def mean(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Calculating the mean of a Series: >>> s = bpd.Series([1, 3]) @@ -4766,7 +4480,7 @@ def mean(self): Calculating the mean of a Series containing ``NA`` values: - >>> s = bpd.Series([1, 3, bpd.NA]) + >>> s = bpd.Series([1, 3, pd.NA]) >>> s 0 1 1 3 @@ -4787,8 +4501,6 @@ def median(self, *, exact: bool = True): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3]) >>> s.median() np.float64(2.0) @@ -4828,8 +4540,6 @@ def quantile( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3, 4]) >>> s.quantile(.5) np.float64(2.5) @@ -4880,9 +4590,6 @@ def describe(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['A', 'A', 'B']) >>> s 0 A @@ -4908,9 +4615,6 @@ def skew(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3]) >>> s.skew() np.float64(0.0) @@ -4946,9 +4650,6 @@ def kurt(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 2, 3], index=['cat', 'dog', 'dog', 'mouse']) >>> s cat 1 @@ -4989,9 +4690,6 @@ def item(self: Series, *args, **kwargs): **Examples:** - >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar 
= None >>> s = bpd.Series([1]) >>> s.item() np.int64(1) @@ -5013,9 +4711,6 @@ def items(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['A', 'B', 'C']) >>> for index, value in s.items(): ... print(f"Index : {index}, Value : {value}") @@ -5035,9 +4730,6 @@ def where(self, cond, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([10, 11, 12, 13, 14]) >>> s 0 10 @@ -5103,9 +4795,6 @@ def mask(self, cond, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([10, 11, 12, 13, 14]) >>> s 0 10 @@ -5149,7 +4838,7 @@ def mask(self, cond, other): condition is evaluated based on a complicated business logic which cannot be expressed in form of a Series. - >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") # doctest: +SKIP ... def should_mask(name: str) -> bool: ... hash = 0 ... for char_ in name: @@ -5162,12 +4851,12 @@ def mask(self, cond, other): 1 Bob 2 Caroline dtype: string - >>> s.mask(should_mask) + >>> s.mask(should_mask) # doctest: +SKIP 0 1 Bob 2 Caroline dtype: string - >>> s.mask(should_mask, "REDACTED") + >>> s.mask(should_mask, "REDACTED") # doctest: +SKIP 0 REDACTED 1 Bob 2 Caroline @@ -5261,9 +4950,6 @@ def argmax(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Consider dataset containing cereal calories. >>> s = bpd.Series({'Corn Flakes': 100.0, 'Almond Delight': 110.0, @@ -5299,9 +4985,6 @@ def argmin(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Consider dataset containing cereal calories. 
>>> s = bpd.Series({'Corn Flakes': 100.0, 'Almond Delight': 110.0, @@ -5340,9 +5023,6 @@ def rename(self, index, *, inplace, **kwargs): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3]) >>> s 0 1 @@ -5392,9 +5072,6 @@ def rename_axis(self, mapper, *, inplace, **kwargs): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Series >>> s = bpd.Series(["dog", "cat", "monkey"]) @@ -5457,10 +5134,7 @@ def value_counts( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.Series([3, 1, 2, 3, 4, bpd.NA], dtype="Int64") + >>> s = bpd.Series([3, 1, 2, 3, 4, pd.NA], dtype="Int64") >>> s 0 3 @@ -5536,8 +5210,6 @@ def str(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(["A_Str_Series"]) >>> s 0 A_Str_Series @@ -5565,8 +5237,6 @@ def plot(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> ser = bpd.Series([1, 2, 3, 3]) >>> plot = ser.plot(kind='hist', title="My plot") >>> plot @@ -5592,9 +5262,6 @@ def isin(self, values): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['llama', 'cow', 'llama', 'beetle', 'llama', ... 
'hippo'], name='animal') >>> s @@ -5658,9 +5325,6 @@ def is_monotonic_increasing(self) -> bool: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 2]) >>> s.is_monotonic_increasing np.True_ @@ -5682,9 +5346,6 @@ def is_monotonic_decreasing(self) -> bool: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([3, 2, 2, 1]) >>> s.is_monotonic_decreasing np.True_ @@ -5725,9 +5386,7 @@ def map( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.Series(['cat', 'dog', bpd.NA, 'rabbit']) + >>> s = bpd.Series(['cat', 'dog', pd.NA, 'rabbit']) >>> s 0 cat 1 dog @@ -5747,7 +5406,7 @@ def map( It also accepts a remote function: - >>> @bpd.remote_function(cloud_function_service_account="default") + >>> @bpd.remote_function(cloud_function_service_account="default") # doctest: +SKIP ... def my_mapper(val: str) -> str: ... vowels = ["a", "e", "i", "o", "u"] ... if val: @@ -5756,7 +5415,7 @@ def map( ... ]) ... return "N/A" - >>> s.map(my_mapper) + >>> s.map(my_mapper) # doctest: +SKIP 0 cAt 1 dOg 2 N/A @@ -5790,9 +5449,6 @@ def iloc(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4}, ... {'a': 100, 'b': 200, 'c': 300, 'd': 400}, ... {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000}] @@ -5870,9 +5526,6 @@ def loc(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame([[1, 2], [4, 5], [7, 8]], ... index=['cobra', 'viper', 'sidewinder'], ... columns=['max_speed', 'shield']) @@ -5957,9 +5610,6 @@ def iat(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], ... 
columns=['A', 'B', 'C']) >>> df @@ -5992,9 +5642,6 @@ def at(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], ... index=[4, 5, 6], columns=['A', 'B', 'C']) >>> df @@ -6028,9 +5675,6 @@ def values(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bpd.Series([1, 2, 3]).values array([1, 2, 3]) @@ -6050,9 +5694,6 @@ def size(self) -> int: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - For Series: >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) @@ -6087,10 +5728,6 @@ def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> import numpy as np - >>> ser = bpd.Series([1, 2, 3]) >>> np.asarray(ser) @@ -6115,9 +5752,6 @@ def __len__(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([1, 2, 3]) >>> len(s) 3 @@ -6131,9 +5765,6 @@ def __invert__(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> ser = bpd.Series([True, False, True]) >>> ~ser 0 False @@ -6152,9 +5783,6 @@ def __and__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([0, 1, 2, 3]) You can operate with a scalar. @@ -6191,9 +5819,6 @@ def __or__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([0, 1, 2, 3]) You can operate with a scalar. @@ -6230,9 +5855,6 @@ def __xor__(self, other): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([0, 1, 2, 3]) You can operate with a scalar. 
@@ -6269,9 +5891,6 @@ def __getitem__(self, indexer): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([15, 30, 45]) >>> s[1] np.int64(30) diff --git a/third_party/bigframes_vendored/pandas/core/strings/accessor.py b/third_party/bigframes_vendored/pandas/core/strings/accessor.py index fe94bf3049..7a37eba341 100644 --- a/third_party/bigframes_vendored/pandas/core/strings/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/strings/accessor.py @@ -20,8 +20,6 @@ def __getitem__(self, key: typing.Union[int, slice]): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['Alice', 'Bob', 'Charlie']) >>> s.str[0] 0 A @@ -53,12 +51,10 @@ def extract(self, pat: str, flags: int = 0): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - A pattern with two groups will return a DataFrame with two columns. Non-matches will be `NaN`. + >>> import bigframes.pandas as bpd >>> s = bpd.Series(['a1', 'b2', 'c3']) >>> s.str.extract(r'([ab])(\\d)') 0 1 @@ -115,8 +111,6 @@ def find(self, sub, start: int = 0, end=None): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> ser = bpd.Series(["cow_", "duck_", "do_ve"]) >>> ser.str.find("_") 0 3 @@ -145,12 +139,10 @@ def len(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Returns the length (number of characters) in a string. - >>> s = bpd.Series(['dog', '', bpd.NA]) + >>> import bigframes.pandas as bpd + >>> s = bpd.Series(['dog', '', pd.NA]) >>> s.str.len() 0 3 1 0 @@ -172,8 +164,6 @@ def lower(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['lower', ... 'CAPITALS', ... 
'this is a sentence', @@ -197,8 +187,6 @@ def slice(self, start=None, stop=None): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(["koala", "dog", "chameleon"]) >>> s 0 koala @@ -250,13 +238,11 @@ def strip(self, to_strip: typing.Optional[str] = None): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([ ... '1. Ant.', ... ' 2. Bee? ', ... '\\t3. Cat!\\n', - ... bpd.NA, + ... pd.NA, ... ]) >>> s.str.strip() 0 1. Ant. @@ -293,8 +279,6 @@ def upper(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['lower', ... 'CAPITALS', ... 'this is a sentence', @@ -322,8 +306,6 @@ def isnumeric(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s1 = bpd.Series(['one', 'one1', '1', '']) >>> s1.str.isnumeric() 0 False @@ -349,8 +331,6 @@ def isalpha(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s1 = bpd.Series(['one', 'one1', '1', '']) >>> s1.str.isalpha() 0 True @@ -375,8 +355,6 @@ def isdigit(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['23', '1a', '1/5', '']) >>> s.str.isdigit() 0 True @@ -401,8 +379,6 @@ def isalnum(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s1 = bpd.Series(['one', 'one1', '1', '']) >>> s1.str.isalnum() 0 True @@ -439,8 +415,6 @@ def isspace(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([' ', '\\t\\r\\n ', '']) >>> s.str.isspace() 0 True @@ -465,8 +439,6 @@ def islower(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['leopard', 'Golden Eagle', 'SNAKE', 
'']) >>> s.str.islower() 0 True @@ -492,8 +464,6 @@ def isupper(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) >>> s.str.isupper() 0 False @@ -518,12 +488,10 @@ def isdecimal(self): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - The `isdecimal` method checks for characters used to form numbers in base 10. + >>> import bigframes.pandas as bpd >>> s = bpd.Series(['23', '³', '⅕', '']) >>> s.str.isdecimal() 0 True @@ -550,9 +518,7 @@ def rstrip(self, to_strip: typing.Optional[str] = None): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.Series(['Ant', ' Bee ', '\tCat\n', bpd.NA]) + >>> s = bpd.Series(['Ant', ' Bee ', '\tCat\n', pd.NA]) >>> s.str.rstrip() 0 Ant 1 Bee @@ -583,9 +549,7 @@ def lstrip(self, to_strip: typing.Optional[str] = None): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.Series(['Ant', ' Bee ', '\tCat\n', bpd.NA]) + >>> s = bpd.Series(['Ant', ' Bee ', '\tCat\n', pd.NA]) >>> s.str.lstrip() 0 Ant 1 Bee @@ -611,8 +575,6 @@ def repeat(self, repeats: int): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['a', 'b', 'c']) >>> s 0 a @@ -645,8 +607,6 @@ def capitalize(self): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['lower', ... 'CAPITALS', ... 'this is a sentence', @@ -672,11 +632,9 @@ def cat(self, others, *, join): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - You can concatenate each string in a Series to another string. 
+ >>> import bigframes.pandas as bpd >>> s = bpd.Series(['Jane', 'John']) >>> s.str.cat(" Doe") 0 Jane Doe @@ -729,11 +687,9 @@ def contains(self, pat, case: bool = True, flags: int = 0, *, regex: bool = True **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Returning a Series of booleans using only a literal pattern. + >>> import bigframes.pandas as bpd >>> s1 = bpd.Series(['Mouse', 'dog', 'house and parrot', '23', None]) >>> s1.str.contains('og') 0 False @@ -833,14 +789,12 @@ def replace( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - When *pat* is a string and *regex* is True, the given *pat* is compiled as a regex. When *repl* is a string, it replaces matching regex patterns as with `re.sub()`. NaN value(s) in the Series are left as is: - >>> s = bpd.Series(['foo', 'fuz', bpd.NA]) + >>> import bigframes.pandas as bpd + >>> s = bpd.Series(['foo', 'fuz', pd.NA]) >>> s.str.replace('f.', 'ba', regex=True) 0 bao 1 baz @@ -850,7 +804,7 @@ def replace( When *pat* is a string and *regex* is False, every *pat* is replaced with *repl* as with `str.replace()`: - >>> s = bpd.Series(['f.o', 'fuz', bpd.NA]) + >>> s = bpd.Series(['f.o', 'fuz', pd.NA]) >>> s.str.replace('f.', 'ba', regex=False) 0 bao 1 fuz @@ -896,9 +850,7 @@ def startswith( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.Series(['bat', 'Bear', 'caT', bpd.NA]) + >>> s = bpd.Series(['bat', 'Bear', 'caT', pd.NA]) >>> s 0 bat 1 Bear @@ -941,9 +893,7 @@ def endswith( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.Series(['bat', 'bear', 'caT', bpd.NA]) + >>> s = bpd.Series(['bat', 'bear', 'caT', pd.NA]) >>> s 0 bat 1 bear @@ -987,9 +937,6 @@ def split( **Examples:** >>> import bigframes.pandas as bpd - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series( 
... [ ... "a regular sentence", @@ -1031,8 +978,6 @@ def match(self, pat: str, case: bool = True, flags: int = 0): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> ser = bpd.Series(["horse", "eagle", "donkey"]) >>> ser.str.match("e") 0 False @@ -1060,8 +1005,6 @@ def fullmatch(self, pat: str, case: bool = True, flags: int = 0): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> ser = bpd.Series(["cat", "duck", "dove"]) >>> ser.str.fullmatch(r'd.+') 0 False @@ -1092,8 +1035,6 @@ def get(self, i: int): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(["apple", "banana", "fig"]) >>> s.str.get(3) 0 l @@ -1122,8 +1063,6 @@ def pad( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(["caribou", "tiger"]) >>> s 0 caribou @@ -1170,8 +1109,6 @@ def ljust( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> ser = bpd.Series(['dog', 'bird', 'mouse']) >>> ser.str.ljust(8, fillchar='.') 0 dog..... @@ -1202,8 +1139,6 @@ def rjust( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> ser = bpd.Series(['dog', 'bird', 'mouse']) >>> ser.str.rjust(8, fillchar='.') 0 .....dog @@ -1238,9 +1173,7 @@ def zfill( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.Series(['-1', '1', '1000', bpd.NA]) + >>> s = bpd.Series(['-1', '1', '1000', pd.NA]) >>> s 0 -1 1 1 @@ -1278,8 +1211,6 @@ def center( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> ser = bpd.Series(['dog', 'bird', 'mouse']) >>> ser.str.center(8, fillchar='.') 0 ..dog... 
@@ -1309,12 +1240,9 @@ def join(self, sep: str): **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> import pandas as pd - Example with a list that contains non-string elements. + >>> import bigframes.pandas as bpd >>> s = bpd.Series([['lion', 'elephant', 'zebra'], ... ['dragon'], ... ['duck', 'swan', 'fish', 'guppy']]) diff --git a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py index 9c17b9632e..189dabcf24 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py @@ -37,11 +37,9 @@ def to_datetime( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - Converting a Scalar to datetime: + >>> import bigframes.pandas as bpd >>> scalar = 123456.789 >>> bpd.to_datetime(scalar, unit = 's') Timestamp('1970-01-02 10:17:36.789000') diff --git a/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py b/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py index 9442e965fa..220b15f56e 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py +++ b/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py @@ -55,7 +55,6 @@ def to_timedelta( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None Converting a Scalar to timedelta diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 0fdca4dde1..3190c92b92 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -61,7 +61,6 @@ def read_gbq( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None If the input is a table ID: diff --git a/third_party/bigframes_vendored/pandas/io/parquet.py 
b/third_party/bigframes_vendored/pandas/io/parquet.py index aec911d2fe..7d5c108f93 100644 --- a/third_party/bigframes_vendored/pandas/io/parquet.py +++ b/third_party/bigframes_vendored/pandas/io/parquet.py @@ -27,8 +27,6 @@ def read_parquet( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> gcs_path = "gs://cloud-samples-data/bigquery/us-states/us-states.parquet" >>> df = bpd.read_parquet(path=gcs_path, engine="bigquery") diff --git a/third_party/bigframes_vendored/pandas/io/parsers/readers.py b/third_party/bigframes_vendored/pandas/io/parsers/readers.py index 4757f5ed9d..9dc7b39873 100644 --- a/third_party/bigframes_vendored/pandas/io/parsers/readers.py +++ b/third_party/bigframes_vendored/pandas/io/parsers/readers.py @@ -71,8 +71,6 @@ def read_csv( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> gcs_path = "gs://cloud-samples-data/bigquery/us-states/us-states.csv" >>> df = bpd.read_csv(filepath_or_buffer=gcs_path) >>> df.head(2) @@ -192,8 +190,6 @@ def read_json( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> gcs_path = "gs://bigframes-dev-testing/sample1.json" >>> df = bpd.read_json(path_or_buf=gcs_path, lines=True, orient="records") >>> df.head(2) diff --git a/third_party/bigframes_vendored/pandas/io/pickle.py b/third_party/bigframes_vendored/pandas/io/pickle.py index 33088dc019..2950cf422a 100644 --- a/third_party/bigframes_vendored/pandas/io/pickle.py +++ b/third_party/bigframes_vendored/pandas/io/pickle.py @@ -35,8 +35,6 @@ def read_pickle( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> gcs_path = "gs://bigframes-dev-testing/test_pickle.pkl" >>> df = bpd.read_pickle(filepath_or_buffer=gcs_path) diff --git a/third_party/bigframes_vendored/pandas/pandas/_typing.py b/third_party/bigframes_vendored/pandas/pandas/_typing.py index e665339fc8..76e984a173 
100644 --- a/third_party/bigframes_vendored/pandas/pandas/_typing.py +++ b/third_party/bigframes_vendored/pandas/pandas/_typing.py @@ -100,7 +100,6 @@ Scalar = Union[PythonScalar, PandasScalar, np.datetime64, np.timedelta64, datetime] IntStrT = TypeVar("IntStrT", int, str) - # timestamp and timedelta convertible types TimestampConvertibleTypes = Union[ @@ -267,7 +266,6 @@ def closed(self) -> bool: # for arbitrary kwargs passed during reading/writing files StorageOptions = Optional[Dict[str, Any]] - # compression keywords and compression CompressionDict = Dict[str, Any] CompressionOptions = Optional[ diff --git a/third_party/bigframes_vendored/pandas/plotting/_core.py b/third_party/bigframes_vendored/pandas/plotting/_core.py index 4ed5c8eb0b..a7cd2c0cc9 100644 --- a/third_party/bigframes_vendored/pandas/plotting/_core.py +++ b/third_party/bigframes_vendored/pandas/plotting/_core.py @@ -8,10 +8,11 @@ class PlotAccessor: Make plots of Series or DataFrame with the `matplotlib` backend. **Examples:** - For Series: >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None + + For Series: + >>> ser = bpd.Series([1, 2, 3, 3]) >>> plot = ser.plot(kind='hist', title="My plot") @@ -57,9 +58,6 @@ def hist( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> import numpy as np - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame(np.random.randint(1, 7, 6000), columns=['one']) >>> df['two'] = np.random.randint(1, 7, 6000) + np.random.randint(1, 7, 6000) >>> ax = df.plot.hist(bins=12, alpha=0.5) @@ -96,7 +94,6 @@ def line( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame( ... { ... 'one': [1, 2, 3, 4], @@ -164,7 +161,6 @@ def area( Draw an area plot based on basic business metrics: >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame( ... { ... 
'sales': [3, 2, 3, 9, 10, 6], @@ -233,7 +229,6 @@ def bar( Basic plot. >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'lab':['A', 'B', 'C'], 'val':[10, 30, 20]}) >>> ax = df.plot.bar(x='lab', y='val', rot=0) @@ -296,7 +291,6 @@ def scatter( in a DataFrame's columns. >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame([[5.1, 3.5, 0], [4.9, 3.0, 0], [7.0, 3.2, 1], ... [6.4, 3.2, 1], [5.9, 3.0, 2]], ... columns=['length', 'width', 'species']) diff --git a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py index a7344d49d4..44eefeddd7 100644 --- a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py +++ b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py @@ -30,7 +30,6 @@ class KMeans(_BaseKMeans): **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> from bigframes.ml.cluster import KMeans >>> X = bpd.DataFrame({"feat0": [1, 1, 1, 10, 10, 10], "feat1": [2, 4, 0, 2, 4, 0]}) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index c3c3a77b71..e487a2e7c1 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -24,7 +24,6 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): >>> import bigframes.pandas as bpd >>> from bigframes.ml.decomposition import MatrixFactorization - >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({ ... "row": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6], ... 
"column": [0,1] * 7, diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py index f13c52bfb6..3535edc8f9 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py @@ -24,7 +24,6 @@ class PCA(BaseEstimator, metaclass=ABCMeta): >>> import bigframes.pandas as bpd >>> from bigframes.ml.decomposition import PCA - >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({"feat0": [-1, -2, -3, 1, 2, 3], "feat1": [-1, -1, -2, 1, 1, 2]}) >>> pca = PCA(n_components=2).fit(X) >>> pca.predict(X) # doctest:+SKIP diff --git a/third_party/bigframes_vendored/sklearn/impute/_base.py b/third_party/bigframes_vendored/sklearn/impute/_base.py index 42eab24c82..175ad86b21 100644 --- a/third_party/bigframes_vendored/sklearn/impute/_base.py +++ b/third_party/bigframes_vendored/sklearn/impute/_base.py @@ -22,7 +22,6 @@ class SimpleImputer(_BaseImputer): >>> import bigframes.pandas as bpd >>> from bigframes.ml.impute import SimpleImputer - >>> bpd.options.display.progress_bar = None >>> X_train = bpd.DataFrame({"feat0": [7.0, 4.0, 10.0], "feat1": [2.0, None, 5.0], "feat2": [3.0, 6.0, 9.0]}) >>> imp_mean = SimpleImputer().fit(X_train) >>> X_test = bpd.DataFrame({"feat0": [None, 4.0, 10.0], "feat1": [2.0, None, None], "feat2": [3.0, 6.0, 9.0]}) diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_base.py b/third_party/bigframes_vendored/sklearn/linear_model/_base.py index 21ba5a3bf8..7543edd10b 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_base.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_base.py @@ -66,7 +66,6 @@ class LinearRegression(RegressorMixin, LinearModel): >>> from bigframes.ml.linear_model import LinearRegression >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({ \ "feature0": [20, 21, 19, 18], \ "feature1": 
[0, 1, 1, 0], \ diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py index a85c6fae8d..d449a1040c 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py @@ -25,7 +25,6 @@ class LogisticRegression(LinearClassifierMixin, BaseEstimator): >>> from bigframes.ml.linear_model import LogisticRegression >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({ \ "feature0": [20, 21, 19, 18], \ "feature1": [0, 1, 1, 0], \ diff --git a/third_party/bigframes_vendored/sklearn/metrics/_classification.py b/third_party/bigframes_vendored/sklearn/metrics/_classification.py index fd6e8678ea..e60cc8cec4 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_classification.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_classification.py @@ -30,7 +30,6 @@ def accuracy_score(y_true, y_pred, normalize=True) -> float: >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([0, 2, 1, 3]) >>> y_pred = bpd.DataFrame([0, 1, 2, 3]) @@ -80,7 +79,6 @@ def confusion_matrix( >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([2, 0, 2, 2, 0, 1]) >>> y_pred = bpd.DataFrame([0, 0, 2, 2, 0, 2]) @@ -132,7 +130,6 @@ def recall_score( >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([0, 1, 2, 0, 1, 2]) >>> y_pred = bpd.DataFrame([0, 2, 1, 0, 0, 1]) @@ -181,7 +178,6 @@ def precision_score( >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([0, 1, 2, 0, 1, 2]) >>> y_pred = bpd.DataFrame([0, 2, 1, 0, 0, 1]) @@ -232,7 
+228,6 @@ def f1_score( >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([0, 1, 2, 0, 1, 2]) >>> y_pred = bpd.DataFrame([0, 2, 1, 0, 0, 1]) diff --git a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py index 9262ffbd3d..cd5bd2cbcd 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py @@ -33,7 +33,6 @@ def auc(x, y) -> float: >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> x = bpd.DataFrame([1, 1, 2, 2]) >>> y = bpd.DataFrame([2, 3, 4, 5]) @@ -89,7 +88,6 @@ def roc_auc_score(y_true, y_score) -> float: >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([0, 0, 1, 1, 0, 1, 0, 1, 1, 1]) >>> y_score = bpd.DataFrame([0.1, 0.4, 0.35, 0.8, 0.65, 0.9, 0.5, 0.3, 0.6, 0.45]) @@ -139,7 +137,6 @@ def roc_curve( >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([1, 1, 2, 2]) >>> y_score = bpd.DataFrame([0.1, 0.4, 0.35, 0.8]) diff --git a/third_party/bigframes_vendored/sklearn/metrics/_regression.py b/third_party/bigframes_vendored/sklearn/metrics/_regression.py index 1c14e8068b..85f0c1ecf9 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_regression.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_regression.py @@ -46,7 +46,6 @@ def r2_score(y_true, y_pred, force_finite=True) -> float: >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([3, -0.5, 2, 7]) >>> y_pred = bpd.DataFrame([2.5, 0.0, 2, 8]) @@ -73,7 +72,6 @@ def mean_squared_error(y_true, y_pred) -> float: >>> import bigframes.pandas 
as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([3, -0.5, 2, 7]) >>> y_pred = bpd.DataFrame([2.5, 0.0, 2, 8]) @@ -100,7 +98,6 @@ def mean_absolute_error(y_true, y_pred) -> float: >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics - >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([3, -0.5, 2, 7]) >>> y_pred = bpd.DataFrame([2.5, 0.0, 2, 8]) diff --git a/third_party/bigframes_vendored/sklearn/model_selection/_split.py b/third_party/bigframes_vendored/sklearn/model_selection/_split.py index ec16fa8cf9..326589be7d 100644 --- a/third_party/bigframes_vendored/sklearn/model_selection/_split.py +++ b/third_party/bigframes_vendored/sklearn/model_selection/_split.py @@ -69,7 +69,6 @@ class KFold(_BaseKFold): >>> import bigframes.pandas as bpd >>> from bigframes.ml.model_selection import KFold - >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({"feat0": [1, 3, 5], "feat1": [2, 4, 6]}) >>> y = bpd.DataFrame({"label": [1, 2, 3]}) >>> kf = KFold(n_splits=3, random_state=42) @@ -162,7 +161,6 @@ def train_test_split( >>> import bigframes.pandas as bpd >>> from bigframes.ml.model_selection import train_test_split - >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({"feat0": [0, 2, 4, 6, 8], "feat1": [1, 3, 5, 7, 9]}) >>> y = bpd.DataFrame({"label": [0, 1, 2, 3, 4]}) >>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) diff --git a/third_party/bigframes_vendored/sklearn/model_selection/_validation.py b/third_party/bigframes_vendored/sklearn/model_selection/_validation.py index b93c47ea04..6f84018853 100644 --- a/third_party/bigframes_vendored/sklearn/model_selection/_validation.py +++ b/third_party/bigframes_vendored/sklearn/model_selection/_validation.py @@ -19,7 +19,6 @@ def cross_validate(estimator, X, y=None, *, cv=None): >>> import bigframes.pandas as bpd >>> from bigframes.ml.model_selection import 
cross_validate, KFold >>> from bigframes.ml.linear_model import LinearRegression - >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({"feat0": [1, 3, 5], "feat1": [2, 4, 6]}) >>> y = bpd.DataFrame({"label": [1, 2, 3]}) >>> model = LinearRegression() diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py index 5476a9fb3c..64a5786f17 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py @@ -25,7 +25,6 @@ class OneHotEncoder(BaseEstimator): >>> from bigframes.ml.preprocessing import OneHotEncoder >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> enc = OneHotEncoder() >>> X = bpd.DataFrame({"a": ["Male", "Female", "Female"], "b": ["1", "3", "2"]}) From c85d47fad87bfbaacaf6bdc33c285d29aae4369c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 7 Oct 2025 21:59:14 +0000 Subject: [PATCH 02/36] fix docs --- bigframes/bigquery/_operations/datetime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/bigquery/_operations/datetime.py b/bigframes/bigquery/_operations/datetime.py index c4aba91a29..e27a3de0c8 100644 --- a/bigframes/bigquery/_operations/datetime.py +++ b/bigframes/bigquery/_operations/datetime.py @@ -69,7 +69,7 @@ def unix_micros(input: series.Series) -> series.Series: **Examples:** - >>> import bigframes.pandas as bpd + >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> s = bpd.Series([pd.Timestamp("1970-01-02", tz="UTC"), pd.Timestamp("1970-01-03", tz="UTC")]) From 78bfccffa58d2a260f235170e86c2c94ae321c0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 7 Oct 2025 23:45:28 +0000 Subject: [PATCH 03/36] fix unit tests --- bigframes/bigquery/_operations/ai.py | 10 +++---- bigframes/core/tools/datetimes.py | 2 +- bigframes/operations/ai.py | 
10 +++---- bigframes/operations/base.py | 2 +- bigframes/operations/semantics.py | 12 ++++----- bigframes/pandas/__init__.py | 1 - bigframes/session/__init__.py | 26 ++++++++++++++++--- scripts/publish_api_coverage.py | 1 + tests/unit/conftest.py | 24 +++++++++++++++++ tests/unit/test_pandas.py | 26 ++++++++++++------- .../pandas/core/tools/datetimes.py | 1 + 11 files changed, 82 insertions(+), 33 deletions(-) create mode 100644 tests/unit/conftest.py diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index 7698c2c95c..3a9c7b130e 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -201,7 +201,7 @@ def generate_int( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> animal = bpd.Series(["Kangaroo", "Rabbit", "Spider"]) + >>> animal = bpd.Series(["Kangaroo", "Rabbit", "Spider"]) >>> bbq.ai.generate_int(("How many legs does a ", animal, " have?")) 0 {'result': 2, 'full_response': '{"candidates":... 1 {'result': 4, 'full_response': '{"candidates":... @@ -275,7 +275,7 @@ def generate_double( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> animal = bpd.Series(["Kangaroo", "Rabbit", "Spider"]) + >>> animal = bpd.Series(["Kangaroo", "Rabbit", "Spider"]) >>> bbq.ai.generate_double(("How many legs does a ", animal, " have?")) 0 {'result': 2.0, 'full_response': '{"candidates... 1 {'result': 4.0, 'full_response': '{"candidates... 
@@ -346,7 +346,7 @@ def if_( **Examples:** >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> us_state = bpd.Series(["Massachusetts", "Illinois", "Hawaii"]) + >>> us_state = bpd.Series(["Massachusetts", "Illinois", "Hawaii"]) >>> bbq.ai.if_((us_state, " has a city called Springfield")) 0 True 1 True @@ -395,7 +395,7 @@ def classify( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> df = bpd.DataFrame({'creature': ['Cat', 'Salmon']}) + >>> df = bpd.DataFrame({'creature': ['Cat', 'Salmon']}) >>> df['type'] = bbq.ai.classify(df['creature'], ['Mammal', 'Fish']) >>> df creature type @@ -445,7 +445,7 @@ def score( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq - >>> animal = bpd.Series(["Tiger", "Rabbit", "Blue Whale"]) + >>> animal = bpd.Series(["Tiger", "Rabbit", "Blue Whale"]) >>> bbq.ai.score(("Rank the relative weights of ", animal, " on the scale from 1 to 3")) # doctest: +SKIP 0 2.0 1 1.0 diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py index fd7561f4b4..0e5594d498 100644 --- a/bigframes/core/tools/datetimes.py +++ b/bigframes/core/tools/datetimes.py @@ -42,7 +42,7 @@ def to_datetime( utc: bool = False, format: Optional[str] = None, unit: Optional[str] = None, - session: Optional[bigframes.session.Session], + session: Optional[bigframes.session.Session] = None, ) -> Union[pd.Timestamp, datetime, bigframes.series.Series]: if isinstance(arg, (int, float, str, datetime, date)): return pd.to_datetime( diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py index dbbf16afc3..253b838e90 100644 --- a/bigframes/operations/ai.py +++ b/bigframes/operations/ai.py @@ -114,7 +114,7 @@ def map( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -132,7 +132,7 @@ 
def map( >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -263,7 +263,7 @@ def classify( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -352,7 +352,7 @@ def join( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -602,7 +602,7 @@ def sim_join( ** Examples: ** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index ebb5767264..7d4c996ea5 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -135,7 +135,7 @@ def __init__( # explicitly chose even if it is None. This is important for the # polars backend where the implicit column labels are integers. 
if not isinstance(data, blocks.Block): - block = block.with_column_labels([name]) + block = block.with_column_labels([name or getattr(data, "name", None)]) self._block: blocks.Block = block diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index b4f7af1aca..176e0ad83a 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -246,7 +246,7 @@ def cluster_by( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -319,7 +319,7 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -432,7 +432,7 @@ def map( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -554,7 +554,7 @@ def join( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -794,7 +794,7 @@ def top_k( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -994,7 +994,7 @@ def sim_join( ** Examples: ** >>> import bigframes.pandas 
as bpd - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 19ea282762..0193dc629d 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -226,7 +226,6 @@ def to_datetime( format=format, unit=unit, ) - return bigframes.core.tools.to_datetime() to_datetime.__doc__ = vendored_pandas_datetimes.to_datetime.__doc__ diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 11621e8ea7..54755482f3 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -2291,6 +2291,12 @@ def read_gbq_object_table( # interchangeably. # ========================================================================= def cut(self, *args, **kwargs) -> bigframes.series.Series: + """Cuts a BigQuery DataFrames object. + + Included for compatibility between bpd and Session. + + See :func:`bigframes.pandas.cut` for full documentation. + """ import bigframes.core.reshape.tile return bigframes.core.reshape.tile.cut( @@ -2299,7 +2305,7 @@ def cut(self, *args, **kwargs) -> bigframes.series.Series: **kwargs, ) - def DataFrame(self, *args, **kwargs) -> bigframes.dataframe.DataFrame: + def DataFrame(self, *args, **kwargs): """Constructs a DataFrame. Included for compatibility between bpd and Session. @@ -2310,7 +2316,7 @@ def DataFrame(self, *args, **kwargs) -> bigframes.dataframe.DataFrame: return bigframes.dataframe.DataFrame(*args, session=self, **kwargs) - def MultiIndex(self, *args, **kwargs) -> bigframes.core.indexes.MultiIndex: + def MultiIndex(self, *args, **kwargs): """Constructs a MultiIndex. Included for compatibility between bpd and Session. 
@@ -2325,7 +2331,7 @@ def MultiIndex(self, *args, **kwargs) -> bigframes.core.indexes.MultiIndex: MultiIndex.from_frame = bigframes.core.indexes.MultiIndex.from_frame # type: ignore MultiIndex.from_arrays = bigframes.core.indexes.MultiIndex.from_arrays # type: ignore - def Index(self, *args, **kwargs) -> bigframes.core.indexes.Index: + def Index(self, *args, **kwargs): """Constructs a Index. Included for compatibility between bpd and Session. @@ -2336,7 +2342,7 @@ def Index(self, *args, **kwargs) -> bigframes.core.indexes.Index: return bigframes.core.indexes.Index(*args, session=self, **kwargs) - def Series(self, *args, **kwargs) -> bigframes.series.Series: + def Series(self, *args, **kwargs): """Constructs a Series. Included for compatibility between bpd and Session. @@ -2350,6 +2356,12 @@ def Series(self, *args, **kwargs) -> bigframes.series.Series: def to_datetime( self, *args, **kwargs ) -> Union[pandas.Timestamp, datetime.datetime, bigframes.series.Series]: + """Converts a BigQuery DataFrames object to datetime dtype. + + Included for compatibility between bpd and Session. + + See :func:`bigframes.pandas.to_datetime` for full documentation. + """ import bigframes.core.tools return bigframes.core.tools.to_datetime( @@ -2359,6 +2371,12 @@ def to_datetime( ) def to_timedelta(self, *args, **kwargs): + """Converts a BigQuery DataFrames object to timedelta/duration dtype. + + Included for compatibility between bpd and Session. + + See :func:`bigframes.pandas.to_timedelta` for full documentation. 
+ """ import bigframes.pandas.core.tools.timedeltas return bigframes.pandas.core.tools.timedeltas.to_timedelta( diff --git a/scripts/publish_api_coverage.py b/scripts/publish_api_coverage.py index 8f305bcc0f..6c94c06456 100644 --- a/scripts/publish_api_coverage.py +++ b/scripts/publish_api_coverage.py @@ -25,6 +25,7 @@ import pandas.core.indexes.accessors import pandas.core.strings.accessor import pandas.core.window.rolling +import sklearn # noqa import bigframes import bigframes.core.groupby diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py new file mode 100644 index 0000000000..a9b26afeef --- /dev/null +++ b/tests/unit/conftest.py @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + + +@pytest.fixture(scope="session") +def polars_session(): + pytest.importorskip("polars") + + from bigframes.testing import polars_session + + return polars_session.TestSession() diff --git a/tests/unit/test_pandas.py b/tests/unit/test_pandas.py index 73e0b7f2d6..5e75e6b20f 100644 --- a/tests/unit/test_pandas.py +++ b/tests/unit/test_pandas.py @@ -64,8 +64,12 @@ def test_method_matches_session(method_name: str): pandas_method = getattr(bigframes.pandas, method_name) pandas_doc = inspect.getdoc(pandas_method) assert pandas_doc is not None, "docstrings are required" - assert re.sub(leading_whitespace, "", pandas_doc) == re.sub( - leading_whitespace, "", session_doc + + pandas_doc_stripped = re.sub(leading_whitespace, "", pandas_doc) + session_doc_stripped = re.sub(leading_whitespace, "", session_doc) + assert ( + pandas_doc_stripped == session_doc_stripped + or ":`bigframes.pandas" in session_doc_stripped ) # Add `eval_str = True` so that deferred annotations are turned into their @@ -75,18 +79,20 @@ def test_method_matches_session(method_name: str): eval_str=True, globals={**vars(bigframes.session), **{"dataframe": bigframes.dataframe}}, ) - pandas_signature = inspect.signature(pandas_method, eval_str=True) - assert [ - # Kind includes position, which will be an offset. - parameter.replace(kind=inspect.Parameter.POSITIONAL_ONLY) - for parameter in pandas_signature.parameters.values() - ] == [ + session_args = [ # Kind includes position, which will be an offset. parameter.replace(kind=inspect.Parameter.POSITIONAL_ONLY) for parameter in session_signature.parameters.values() # Don't include the first parameter, which is `self: Session` - ][ - 1: + ][1:] + pandas_signature = inspect.signature(pandas_method, eval_str=True) + pandas_args = [ + # Kind includes position, which will be an offset. 
+ parameter.replace(kind=inspect.Parameter.POSITIONAL_ONLY) + for parameter in pandas_signature.parameters.values() + ] + assert session_args == pandas_args or ["args", "kwargs"] == [ + parameter.name for parameter in session_args ] assert pandas_signature.return_annotation == session_signature.return_annotation diff --git a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py index 189dabcf24..105277dbf0 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py @@ -21,6 +21,7 @@ def to_datetime( utc=False, format=None, unit=None, + session=None, ) -> Union[pd.Timestamp, datetime, series.Series]: """ This function converts a scalar, array-like or Series to a datetime object. From 210dc9abdebca934183659d6580cafdcbc1a99f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 7 Oct 2025 23:51:29 +0000 Subject: [PATCH 04/36] skip sklearn test --- scripts/publish_api_coverage.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/publish_api_coverage.py b/scripts/publish_api_coverage.py index 6c94c06456..181b8c3365 100644 --- a/scripts/publish_api_coverage.py +++ b/scripts/publish_api_coverage.py @@ -25,7 +25,6 @@ import pandas.core.indexes.accessors import pandas.core.strings.accessor import pandas.core.window.rolling -import sklearn # noqa import bigframes import bigframes.core.groupby @@ -205,6 +204,9 @@ def generate_pandas_api_coverage(): def generate_sklearn_api_coverage(): """Explore all SKLearn modules, and for each item contained generate a regex to detect it being imported, and record whether we implement it""" + + import sklearn # noqa + sklearn_modules = [ "sklearn", "sklearn.model_selection", From bed4069f29f1caf01094aad3b174de452a7a9736 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 7 Oct 2025 23:55:37 +0000 Subject: [PATCH 05/36] fix 
snapshot --- .../snapshots/test_blob_ops/test_obj_get_access_url/out.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_get_access_url/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_get_access_url/out.sql index 4a963b4972..25004c424d 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_get_access_url/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_get_access_url/out.sql @@ -21,5 +21,5 @@ WITH `bfcte_0` AS ( ) SELECT `bfcol_0` AS `rowindex`, - `bfcol_10` AS `string_col` + `bfcol_10` AS `0` FROM `bfcte_3` \ No newline at end of file From 20cae2d5370c193469cb60f476dbecd68dcc02aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 15:00:39 +0000 Subject: [PATCH 06/36] plumb through session for from_tuples and from_arrays --- bigframes/core/indexes/multi.py | 41 ++++++++++++++++++++++++++++++--- bigframes/session/__init__.py | 12 ++++------ 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/bigframes/core/indexes/multi.py b/bigframes/core/indexes/multi.py index 182d1f101c..dc81f812b5 100644 --- a/bigframes/core/indexes/multi.py +++ b/bigframes/core/indexes/multi.py @@ -14,13 +14,16 @@ from __future__ import annotations -from typing import cast, Hashable, Iterable, Sequence +from typing import cast, Hashable, Iterable, Optional, Sequence, TYPE_CHECKING import bigframes_vendored.pandas.core.indexes.multi as vendored_pandas_multindex import pandas from bigframes.core.indexes.base import Index +if TYPE_CHECKING: + import bigframes.session + class MultiIndex(Index, vendored_pandas_multindex.MultiIndex): __doc__ = vendored_pandas_multindex.MultiIndex.__doc__ @@ -31,10 +34,12 @@ def from_tuples( tuples: Iterable[tuple[Hashable, ...]], sortorder: int | None = None, names: Sequence[Hashable] | Hashable | None = None, + *, + 
session: Optional[bigframes.session.Session] = None, ) -> MultiIndex: pd_index = pandas.MultiIndex.from_tuples(tuples, sortorder, names) # Index.__new__ should detect multiple levels and properly create a multiindex - return cast(MultiIndex, Index(pd_index)) + return cast(MultiIndex, Index(pd_index, session=session)) @classmethod def from_arrays( @@ -42,7 +47,37 @@ def from_arrays( arrays, sortorder: int | None = None, names=None, + *, + session: Optional[bigframes.session.Session] = None, ) -> MultiIndex: pd_index = pandas.MultiIndex.from_arrays(arrays, sortorder, names) # Index.__new__ should detect multiple levels and properly create a multiindex - return cast(MultiIndex, Index(pd_index)) + return cast(MultiIndex, Index(pd_index, session=session)) + + +class MultiIndexAccessor: + """Proxy to MultiIndex constructors to allow a session to be passed in.""" + + def __init__(self, session: bigframes.session.Session): + self._session = session + + def __call__(self, *args, **kwargs) -> MultiIndex: + """Construct a MultiIndex using the associated Session. + + See :class:`bigframes.pandas.MultiIndex`. + """ + return MultiIndex(*args, session=self._session, **kwargs) + + def from_arrays(self, *args, **kwargs) -> MultiIndex: + """Construct a MultiIndex using the associated Session. + + See :func:`bigframes.pandas.MultiIndex.from_arrays`. + """ + return MultiIndex.from_arrays(*args, session=self._session, **kwargs) + + def from_tuples(self, *args, **kwargs) -> MultiIndex: + """Construct a MultiIndex using the associated Session. + + See :func:`bigframes.pandas.MultiIndex.from_tuples`. 
+ """ + return MultiIndex.from_tuples(*args, session=self._session, **kwargs) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 54755482f3..1250cfa9e8 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -68,6 +68,7 @@ import bigframes.core from bigframes.core import blocks, log_adapter, utils import bigframes.core.indexes +import bigframes.core.indexes.multi import bigframes.core.pyformat # Even though the ibis.backends.bigquery import is unused, it's needed @@ -2316,20 +2317,17 @@ def DataFrame(self, *args, **kwargs): return bigframes.dataframe.DataFrame(*args, session=self, **kwargs) - def MultiIndex(self, *args, **kwargs): + @property + def MultiIndex(self) -> bigframes.core.indexes.multi.MultiIndexAccessor: """Constructs a MultiIndex. Included for compatibility between bpd and Session. See :class:`bigframes.pandas.MulitIndex` for full documentation. """ - import bigframes.core.indexes - - return bigframes.core.indexes.MultiIndex(*args, session=self, **kwargs) + import bigframes.core.indexes.multi - MultiIndex.from_tuples = bigframes.core.indexes.MultiIndex.from_tuples # type: ignore - MultiIndex.from_frame = bigframes.core.indexes.MultiIndex.from_frame # type: ignore - MultiIndex.from_arrays = bigframes.core.indexes.MultiIndex.from_arrays # type: ignore + return bigframes.core.indexes.multi.MultiIndexAccessor(self) def Index(self, *args, **kwargs): """Constructs a Index. 
From 1dc648b8a33d90648f5ffe1bec7a9ab6ad8f3b06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 15:10:53 +0000 Subject: [PATCH 07/36] add from_frame --- bigframes/core/indexes/multi.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bigframes/core/indexes/multi.py b/bigframes/core/indexes/multi.py index a4133927bc..a611442b88 100644 --- a/bigframes/core/indexes/multi.py +++ b/bigframes/core/indexes/multi.py @@ -100,6 +100,13 @@ def from_arrays(self, *args, **kwargs) -> MultiIndex: """ return MultiIndex.from_arrays(*args, session=self._session, **kwargs) + def from_frame(self, *args, **kwargs) -> MultiIndex: + """Construct a MultiIndex using the associated Session. + + See :func:`bigframes.pandas.MultiIndex.from_frame`. + """ + return cast(MultiIndex, MultiIndex.from_frame(*args, **kwargs)) + def from_tuples(self, *args, **kwargs) -> MultiIndex: """Construct a MultiIndex using the associated Session. From 9de6f9fbf2244fdd886e4a2d762b1cc70cedebdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 16:08:00 +0000 Subject: [PATCH 08/36] make sure polars session isnt skipped on Kokoro --- noxfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index a46dc36b3e..099d17f631 100644 --- a/noxfile.py +++ b/noxfile.py @@ -115,7 +115,7 @@ # Make sure we leave some versions without "extras" so we know those # dependencies are actually optional. 
"3.10": ["tests", "scikit-learn", "anywidget"], - "3.11": ["tests", "scikit-learn", "polars", "anywidget"], + LATEST_FULLY_SUPPORTED_PYTHON: ["tests", "scikit-learn", "polars", "anywidget"], "3.13": ["tests", "polars", "anywidget"], } From 5d23dee5fab501399bd3998436b1ba7e21e7080d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 16:09:26 +0000 Subject: [PATCH 09/36] fix apply doctest --- third_party/bigframes_vendored/pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index c6ec5dfaf1..5e9c9e0113 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -1783,7 +1783,7 @@ def apply( dtype: Int64 >>> hours = minutes.apply(minutes_to_hours) # doctest: +SKIP - >>> hours + >>> hours # doctest: +SKIP 0 0.0 1 0.5 2 1.0 From 20d7c27543e51dc2a1e1bb5d796026b07bfc9343 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 16:18:55 +0000 Subject: [PATCH 10/36] make doctest conftest available everywhere --- bigframes/conftest.py | 21 +++++++++---------- .../{pandas => }/conftest.py | 21 +++++++++---------- 2 files changed, 20 insertions(+), 22 deletions(-) rename third_party/bigframes_vendored/{pandas => }/conftest.py (77%) diff --git a/bigframes/conftest.py b/bigframes/conftest.py index e1f3f6d84c..f418c9feba 100644 --- a/bigframes/conftest.py +++ b/bigframes/conftest.py @@ -22,24 +22,23 @@ import bigframes._config -@pytest.fixture(scope="session") -def polars_session(): - pytest.importorskip("polars") - - from bigframes.testing import polars_session - - return polars_session.TestSession() - - @pytest.fixture(autouse=True) -def default_doctest_imports(doctest_namespace, polars_session): +def default_doctest_imports(doctest_namespace): """ Avoid some boilerplate in pandas-inspired tests. 
See: https://docs.pytest.org/en/stable/how-to/doctest.html#doctest-namespace-fixture """ + try: + from bigframes.testing import polars_session + + bpd = polars_session.TestSession() + except ImportError: + # Don't skip doctest if polars isn't available. + import bigframes.pandas as bpd # type: ignore + doctest_namespace["np"] = np doctest_namespace["pd"] = pd doctest_namespace["pa"] = pa - doctest_namespace["bpd"] = polars_session + doctest_namespace["bpd"] = bpd bigframes._config.options.display.progress_bar = None diff --git a/third_party/bigframes_vendored/pandas/conftest.py b/third_party/bigframes_vendored/conftest.py similarity index 77% rename from third_party/bigframes_vendored/pandas/conftest.py rename to third_party/bigframes_vendored/conftest.py index e1f3f6d84c..cafd6a1b7c 100644 --- a/third_party/bigframes_vendored/pandas/conftest.py +++ b/third_party/bigframes_vendored/conftest.py @@ -22,24 +22,23 @@ import bigframes._config -@pytest.fixture(scope="session") -def polars_session(): - pytest.importorskip("polars") - - from bigframes.testing import polars_session - - return polars_session.TestSession() - - @pytest.fixture(autouse=True) -def default_doctest_imports(doctest_namespace, polars_session): +def default_doctest_imports(doctest_namespace): """ Avoid some boilerplate in pandas-inspired tests. See: https://docs.pytest.org/en/stable/how-to/doctest.html#doctest-namespace-fixture """ + try: + from bigframes.testing import polars_session + + bpd = polars_session.TestSession() + except ImportError: + # Don't skip doctest if polars isn't available. 
+ import bigframes.pandas as bpd + doctest_namespace["np"] = np doctest_namespace["pd"] = pd doctest_namespace["pa"] = pa - doctest_namespace["bpd"] = polars_session + doctest_namespace["bpd"] = bpd bigframes._config.options.display.progress_bar = None From fbe606e0c61d94344cbb3ab45541eb620874df18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 16:44:56 +0000 Subject: [PATCH 11/36] add python version flexibility for to_dict --- third_party/bigframes_vendored/pandas/core/series.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 5e9c9e0113..b089c65d3b 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -549,8 +549,8 @@ def to_dict( >>> s.to_dict() {np.int64(0): 1, np.int64(1): 2, np.int64(2): 3, np.int64(3): 4} - >>> s.to_dict(into=OrderedDict) - OrderedDict([(np.int64(0), 1), (np.int64(1), 2), (np.int64(2), 3), (np.int64(3), 4)]) + >>> s.to_dict(into=OrderedDict) # doctest:+ELLIPSIS + OrderedDict(...) 
>>> dd = defaultdict(list) >>> s.to_dict(into=dd) From 171f3ece378e77993e630cedbc343e987ea1ccd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 18:54:17 +0000 Subject: [PATCH 12/36] disambiguate explicit names --- bigframes/operations/base.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 7d4c996ea5..38aa1f4b9b 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -15,7 +15,7 @@ from __future__ import annotations import typing -from typing import List, Sequence, Union +from typing import Any, List, Sequence, Union import bigframes_vendored.constants as constants import bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing @@ -34,6 +34,8 @@ import bigframes.series as series import bigframes.session +_NO_NAME_SENTINEL = object() + class SeriesMethods: def __init__( @@ -134,8 +136,17 @@ def __init__( # If we didn't get a block make sure the name is what the user # explicitly chose even if it is None. This is important for the # polars backend where the implicit column labels are integers. 
- if not isinstance(data, blocks.Block): - block = block.with_column_labels([name or getattr(data, "name", None)]) + if name: + default_name: Any = name + elif hasattr(data, "name"): + default_name = getattr(data, "name", None) + elif hasattr(data, "_name"): + default_name = getattr(data, "_name", None) + else: + default_name = _NO_NAME_SENTINEL + + if default_name is not _NO_NAME_SENTINEL: + block = block.with_column_labels([default_name]) self._block: blocks.Block = block @@ -165,8 +176,7 @@ def _apply_unary_op( block, result_id = self._block.apply_unary_op( self._value_column, op, result_label=self._name ) - result = series.Series(block.select_column(result_id)) - result.name = getattr(self, "name", None) + result = series.Series(block.select_column(result_id), name=self._name) return result def _apply_binary_op( From ded5c1e548629b106c811af20ca79ed509a7989e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 19:17:27 +0000 Subject: [PATCH 13/36] disambiguate explicit name none versus no name --- bigframes/operations/base.py | 46 ++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 38aa1f4b9b..91226ac7b6 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -14,8 +14,9 @@ from __future__ import annotations +import enum import typing -from typing import Any, List, Sequence, Union +from typing import List, Sequence, Union import bigframes_vendored.constants as constants import bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing @@ -34,7 +35,17 @@ import bigframes.series as series import bigframes.session -_NO_NAME_SENTINEL = object() + +class Default(enum.Enum): + """Sentinel that can disambiguate explicit None from missing. 
+ + See https://stackoverflow.com/a/76606310/101923 + """ + + token = 0 + + +DEFAULT = Default.token class SeriesMethods: @@ -45,7 +56,7 @@ def __init__( dtype: typing.Optional[ bigframes.dtypes.DtypeString | bigframes.dtypes.Dtype ] = None, - name: str | None = None, + name: str | None | Default = DEFAULT, copy: typing.Optional[bool] = None, *, session: typing.Optional[bigframes.session.Session] = None, @@ -73,6 +84,16 @@ def __init__( f"Series constructor only supports copy=True. {constants.FEEDBACK_LINK}" ) + if name is DEFAULT: + if isinstance(data, blocks.Block): + name = data.column_labels[0] + elif hasattr(data, "name"): + name = getattr(data, "name") + elif hasattr(data, "_name"): + name = getattr(data, "_name") + else: + name = None + if isinstance(data, blocks.Block): block = data elif isinstance(data, SeriesMethods): @@ -109,6 +130,7 @@ def __init__( block = data_block if block: + # Data was a bigframes object. assert len(block.value_columns) == 1 assert len(block.column_labels) == 1 if index is not None: # reindexing operation @@ -121,6 +143,7 @@ def __init__( bf_dtype = bigframes.dtypes.bigframes_type(dtype) block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype)) else: + # Data was local. if isinstance(dtype, str) and dtype.lower() == "json": dtype = bigframes.dtypes.JSON_DTYPE pd_series = pd.Series( @@ -129,25 +152,12 @@ def __init__( dtype=dtype, # type:ignore name=name, ) + name = pd_series.name # type: ignore block = read_pandas_func(pd_series)._get_block() # type:ignore assert block is not None - # If we didn't get a block make sure the name is what the user - # explicitly chose even if it is None. This is important for the - # polars backend where the implicit column labels are integers. 
- if name: - default_name: Any = name - elif hasattr(data, "name"): - default_name = getattr(data, "name", None) - elif hasattr(data, "_name"): - default_name = getattr(data, "_name", None) - else: - default_name = _NO_NAME_SENTINEL - - if default_name is not _NO_NAME_SENTINEL: - block = block.with_column_labels([default_name]) - + block = block.with_column_labels([name]) self._block: blocks.Block = block @property From 841bc64dad7304fbabf8fbab5ae0a8799a836a6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 19:39:32 +0000 Subject: [PATCH 14/36] fix for column name comparison in pandas bin op --- bigframes/core/blocks.py | 2 +- noxfile.py | 4 +--- .../snapshots/test_blob_ops/test_obj_get_access_url/out.sql | 2 +- tests/unit/test_dataframe_polars.py | 3 ++- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index f9896784bb..cf3518ff29 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -2471,7 +2471,7 @@ def _align_series_block_axis_1( def _align_pd_series_axis_1( self, other: pd.Series, how: str ) -> Tuple[Block, pd.Index, Sequence[Tuple[ex.RefOrConstant, ex.RefOrConstant]]]: - if self.column_labels.equals(other.index): + if self.column_labels.astype("object").equals(other.index.astype("object")): columns, lcol_indexer, rcol_indexer = self.column_labels, None, None else: if not (self.column_labels.is_unique and other.index.is_unique): diff --git a/noxfile.py b/noxfile.py index 099d17f631..703937d453 100644 --- a/noxfile.py +++ b/noxfile.py @@ -46,9 +46,7 @@ "3.11", ] -# pytest-retry is not yet compatible with pytest 8.x. 
-# https://github.com/str0zzapreti/pytest-retry/issues/32 -PYTEST_VERSION = "pytest<8.0.0dev" +PYTEST_VERSION = "pytest==8.4.2" SPHINX_VERSION = "sphinx==4.5.0" LINT_PATHS = [ "docs", diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_get_access_url/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_get_access_url/out.sql index 25004c424d..4a963b4972 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_get_access_url/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_blob_ops/test_obj_get_access_url/out.sql @@ -21,5 +21,5 @@ WITH `bfcte_0` AS ( ) SELECT `bfcol_0` AS `rowindex`, - `bfcol_10` AS `0` + `bfcol_10` AS `string_col` FROM `bfcte_3` \ No newline at end of file diff --git a/tests/unit/test_dataframe_polars.py b/tests/unit/test_dataframe_polars.py index a6f5c3d1ef..c95c647fa8 100644 --- a/tests/unit/test_dataframe_polars.py +++ b/tests/unit/test_dataframe_polars.py @@ -2319,7 +2319,8 @@ def test_binop_with_self_aggregate(session, scalars_dfs): df_columns = ["int64_col", "float64_col", "int64_too"] bf_df = scalars_df[df_columns] - bf_result = (bf_df - bf_df.mean()).to_pandas() + bf_deviation = bf_df - bf_df.mean() + bf_result = bf_deviation.to_pandas() pd_df = scalars_pandas_df[df_columns] pd_result = pd_df - pd_df.mean() From 81f49a6cce0622ffbcf739dc374392bc6fae74da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 20:07:39 +0000 Subject: [PATCH 15/36] avoid setting column labels in special case of Series(block) --- bigframes/operations/base.py | 16 ++++------------ bigframes/session/__init__.py | 14 +++++++------- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 91226ac7b6..7d6a1c3b68 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -84,16 +84,6 @@ def __init__( f"Series constructor 
only supports copy=True. {constants.FEEDBACK_LINK}" ) - if name is DEFAULT: - if isinstance(data, blocks.Block): - name = data.column_labels[0] - elif hasattr(data, "name"): - name = getattr(data, "name") - elif hasattr(data, "_name"): - name = getattr(data, "_name") - else: - name = None - if isinstance(data, blocks.Block): block = data elif isinstance(data, SeriesMethods): @@ -139,6 +129,8 @@ def __init__( idx_cols = idx_block.index_columns block, _ = idx_block.join(block, how="left") block = block.with_index_labels(bf_index.names) + if name is not DEFAULT: + block = block.with_column_labels([name]) if dtype: bf_dtype = bigframes.dtypes.bigframes_type(dtype) block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype)) @@ -150,14 +142,14 @@ def __init__( data=data, index=index, # type:ignore dtype=dtype, # type:ignore - name=name, + name=name if name is not DEFAULT else None, ) name = pd_series.name # type: ignore block = read_pandas_func(pd_series)._get_block() # type:ignore + block = block.with_column_labels([name]) assert block is not None - block = block.with_column_labels([name]) self._block: blocks.Block = block @property diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index d12117dd73..0490152003 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1829,7 +1829,7 @@ def udf( Turning an arbitrary python function into a BigQuery managed python udf: >>> bq_name = datetime.datetime.now().strftime("bigframes_%Y%m%d%H%M%S%f") - >>> @bpd.udf(dataset="bigfranes_testing", name=bq_name) + >>> @bpd.udf(dataset="bigfranes_testing", name=bq_name) # doctest: +SKIP ... def minutes_to_hours(x: int) -> float: ... 
return x/60 @@ -1842,8 +1842,8 @@ def udf( 4 120 dtype: Int64 - >>> hours = minutes.apply(minutes_to_hours) - >>> hours + >>> hours = minutes.apply(minutes_to_hours) # doctest: +SKIP + >>> hours # doctest: +SKIP 0 0.0 1 0.5 2 1.0 @@ -1856,7 +1856,7 @@ def udf( packages (optionally with the package version) via `packages` param. >>> bq_name = datetime.datetime.now().strftime("bigframes_%Y%m%d%H%M%S%f") - >>> @bpd.udf( + >>> @bpd.udf( # doctest: +SKIP ... dataset="bigfranes_testing", ... name=bq_name, ... packages=["cryptography"] @@ -1873,14 +1873,14 @@ def udf( ... return f.encrypt(input.encode()).decode() >>> names = bpd.Series(["Alice", "Bob"]) - >>> hashes = names.apply(get_hash) + >>> hashes = names.apply(get_hash) # doctest: +SKIP You can clean-up the BigQuery functions created above using the BigQuery client from the BigQuery DataFrames session: >>> session = bpd.get_global_session() - >>> session.bqclient.delete_routine(minutes_to_hours.bigframes_bigquery_function) - >>> session.bqclient.delete_routine(get_hash.bigframes_bigquery_function) + >>> session.bqclient.delete_routine(minutes_to_hours.bigframes_bigquery_function) # doctest: +SKIP + >>> session.bqclient.delete_routine(get_hash.bigframes_bigquery_function) # doctest: +SKIP Args: input_types (type or sequence(type), Optional): From 5b605054b439e5a910340df444979adbf5cb7c5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 20:30:18 +0000 Subject: [PATCH 16/36] revert doctest changes --- bigframes/bigquery/_operations/ai.py | 11 +- bigframes/bigquery/_operations/approx_agg.py | 1 + bigframes/conftest.py | 44 -- bigframes/operations/semantics.py | 20 +- bigframes/operations/strings.py | 1 + .../bigframes_vendored/geopandas/geoseries.py | 9 + .../bigframes_vendored/pandas/AUTHORS.md | 1 + .../bigframes_vendored/pandas/README.md | 2 + .../pandas/core/arrays/arrow/accessors.py | 25 +- .../pandas/core/arrays/datetimelike.py | 7 +- .../pandas/core/computation/eval.py | 3 + 
.../pandas/core/computation/expr.py | 3 + .../pandas/core/computation/ops.py | 1 + .../bigframes_vendored/pandas/core/frame.py | 436 +++++++++++++++-- .../bigframes_vendored/pandas/core/generic.py | 45 +- .../pandas/core/groupby/__init__.py | 127 ++++- .../pandas/core/indexes/accessor.py | 47 ++ .../pandas/core/indexes/base.py | 108 ++++- .../pandas/core/indexes/datetimes.py | 24 + .../pandas/core/indexes/multi.py | 4 + .../pandas/core/reshape/tile.py | 3 + .../bigframes_vendored/pandas/core/series.py | 443 ++++++++++++++++-- .../pandas/core/strings/accessor.py | 104 +++- .../pandas/core/tools/datetimes.py | 5 +- .../pandas/core/tools/timedeltas.py | 1 + .../bigframes_vendored/pandas/io/gbq.py | 1 + .../bigframes_vendored/pandas/io/parquet.py | 2 + .../pandas/io/parsers/readers.py | 4 + .../bigframes_vendored/pandas/io/pickle.py | 2 + .../pandas/pandas/_typing.py | 2 + .../pandas/plotting/_core.py | 12 +- .../sklearn/cluster/_kmeans.py | 1 + .../sklearn/decomposition/_mf.py | 1 + .../sklearn/decomposition/_pca.py | 1 + .../sklearn/impute/_base.py | 1 + .../sklearn/linear_model/_base.py | 1 + .../sklearn/linear_model/_logistic.py | 1 + .../sklearn/metrics/_classification.py | 5 + .../sklearn/metrics/_ranking.py | 3 + .../sklearn/metrics/_regression.py | 3 + .../sklearn/model_selection/_split.py | 2 + .../sklearn/model_selection/_validation.py | 1 + .../sklearn/preprocessing/_encoder.py | 1 + 43 files changed, 1370 insertions(+), 149 deletions(-) delete mode 100644 bigframes/conftest.py diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index 0213e81658..0c5eba9496 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -52,13 +52,14 @@ def generate( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> country = bpd.Series(["Japan", "Canada"]) - >>> bbq.ai.generate(("What's the capital city of ", country, " one word 
only")) # doctest: +SKIP + >>> bbq.ai.generate(("What's the capital city of ", country, " one word only")) 0 {'result': 'Tokyo\\n', 'full_response': '{"cand... 1 {'result': 'Ottawa\\n', 'full_response': '{"can... dtype: struct>, status: string>[pyarrow] - >>> bbq.ai.generate(("What's the capital city of ", country, " one word only")).struct.field("result") # doctest: +SKIP + >>> bbq.ai.generate(("What's the capital city of ", country, " one word only")).struct.field("result") 0 Tokyo\\n 1 Ottawa\\n Name: result, dtype: string @@ -146,6 +147,7 @@ def generate_bool( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... "col_1": ["apple", "bear", "pear"], ... "col_2": ["fruit", "animal", "animal"] @@ -223,6 +225,7 @@ def generate_int( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> animal = bpd.Series(["Kangaroo", "Rabbit", "Spider"]) >>> bbq.ai.generate_int(("How many legs does a ", animal, " have?")) 0 {'result': 2, 'full_response': '{"candidates":... @@ -297,6 +300,7 @@ def generate_double( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> animal = bpd.Series(["Kangaroo", "Rabbit", "Spider"]) >>> bbq.ai.generate_double(("How many legs does a ", animal, " have?")) 0 {'result': 2.0, 'full_response': '{"candidates... 
@@ -368,6 +372,7 @@ def if_( **Examples:** >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> us_state = bpd.Series(["Massachusetts", "Illinois", "Hawaii"]) >>> bbq.ai.if_((us_state, " has a city called Springfield")) 0 True @@ -417,6 +422,7 @@ def classify( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'creature': ['Cat', 'Salmon']}) >>> df['type'] = bbq.ai.classify(df['creature'], ['Mammal', 'Fish']) >>> df @@ -467,6 +473,7 @@ def score( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> animal = bpd.Series(["Tiger", "Rabbit", "Blue Whale"]) >>> bbq.ai.score(("Rank the relative weights of ", animal, " on the scale from 1 to 3")) # doctest: +SKIP 0 2.0 diff --git a/bigframes/bigquery/_operations/approx_agg.py b/bigframes/bigquery/_operations/approx_agg.py index 73b6fdbb73..696f8f5a66 100644 --- a/bigframes/bigquery/_operations/approx_agg.py +++ b/bigframes/bigquery/_operations/approx_agg.py @@ -40,6 +40,7 @@ def approx_top_count( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["apple", "apple", "pear", "pear", "pear", "banana"]) >>> bbq.approx_top_count(s, number=2) [{'value': 'pear', 'count': 3}, {'value': 'apple', 'count': 2}] diff --git a/bigframes/conftest.py b/bigframes/conftest.py deleted file mode 100644 index f418c9feba..0000000000 --- a/bigframes/conftest.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest - -import bigframes._config - - -@pytest.fixture(autouse=True) -def default_doctest_imports(doctest_namespace): - """ - Avoid some boilerplate in pandas-inspired tests. - - See: https://docs.pytest.org/en/stable/how-to/doctest.html#doctest-namespace-fixture - """ - try: - from bigframes.testing import polars_session - - bpd = polars_session.TestSession() - except ImportError: - # Don't skip doctest if polars isn't available. - import bigframes.pandas as bpd # type: ignore - - doctest_namespace["np"] = np - doctest_namespace["pd"] = pd - doctest_namespace["pa"] = pa - doctest_namespace["bpd"] = bpd - bigframes._config.options.display.progress_bar = None diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index 176e0ad83a..9fa5450748 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -52,6 +52,7 @@ def agg( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 @@ -246,7 +247,8 @@ def cluster_by( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -319,7 +321,8 @@ 
def filter(self, instruction: str, model, ground_with_google_search: bool = Fals **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -432,7 +435,8 @@ def map( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -554,7 +558,8 @@ def join( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -692,6 +697,7 @@ def search( ** Examples: ** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> import bigframes >>> bigframes.options.experiments.semantic_operators = True @@ -794,7 +800,8 @@ def top_k( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -994,7 +1001,8 @@ def sim_join( ** Examples: ** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.semantic_operators = True >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm diff --git a/bigframes/operations/strings.py 
b/bigframes/operations/strings.py index c69993849a..4743483954 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -68,6 +68,7 @@ def reverse(self) -> series.Series: **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["apple", "banana", "", bpd.NA]) >>> s.str.reverse() diff --git a/third_party/bigframes_vendored/geopandas/geoseries.py b/third_party/bigframes_vendored/geopandas/geoseries.py index 20587b4d57..92a58b3dc6 100644 --- a/third_party/bigframes_vendored/geopandas/geoseries.py +++ b/third_party/bigframes_vendored/geopandas/geoseries.py @@ -18,6 +18,7 @@ class GeoSeries: >>> import bigframes.geopandas >>> import bigframes.pandas as bpd >>> from shapely.geometry import Point + >>> bpd.options.display.progress_bar = None >>> s = bigframes.geopandas.GeoSeries([Point(1, 1), Point(2, 2), Point(3, 3)]) >>> s @@ -72,6 +73,7 @@ def x(self) -> bigframes.series.Series: >>> import bigframes.pandas as bpd >>> import geopandas.array >>> import shapely.geometry + >>> bpd.options.display.progress_bar = None >>> series = bpd.Series( ... [shapely.geometry.Point(1, 2), shapely.geometry.Point(2, 3), shapely.geometry.Point(3, 4)], @@ -98,6 +100,7 @@ def y(self) -> bigframes.series.Series: >>> import bigframes.pandas as bpd >>> import geopandas.array >>> import shapely.geometry + >>> bpd.options.display.progress_bar = None >>> series = bpd.Series( ... 
[shapely.geometry.Point(1, 2), shapely.geometry.Point(2, 3), shapely.geometry.Point(3, 4)], @@ -126,6 +129,7 @@ def boundary(self) -> bigframes.geopandas.GeoSeries: >>> import bigframes.pandas as bpd >>> import geopandas.array >>> import shapely.geometry + >>> bpd.options.display.progress_bar = None >>> from shapely.geometry import Polygon, LineString, Point >>> s = geopandas.GeoSeries( @@ -167,6 +171,7 @@ def from_xy(cls, x, y, index=None, **kwargs) -> bigframes.geopandas.GeoSeries: >>> import bigframes.pandas as bpd >>> import bigframes.geopandas + >>> bpd.options.display.progress_bar = None >>> x = [2.5, 5, -3.0] >>> y = [0.5, 1, 1.5] @@ -205,6 +210,7 @@ def from_wkt(cls, data, index=None) -> bigframes.geopandas.GeoSeries: >>> import bigframes as bpd >>> import bigframes.geopandas + >>> bpd.options.display.progress_bar = None >>> wkts = [ ... 'POINT (1 1)', @@ -240,6 +246,7 @@ def to_wkt(self) -> bigframes.series.Series: >>> import bigframes as bpd >>> import bigframes.geopandas >>> from shapely.geometry import Point + >>> bpd.options.display.progress_bar = None >>> s = bigframes.geopandas.GeoSeries([Point(1, 1), Point(2, 2), Point(3, 3)]) >>> s @@ -272,6 +279,7 @@ def difference(self: GeoSeries, other: GeoSeries) -> GeoSeries: # type: ignore >>> import bigframes as bpd >>> import bigframes.geopandas >>> from shapely.geometry import Polygon, LineString, Point + >>> bpd.options.display.progress_bar = None We can check two GeoSeries against each other, row by row: @@ -403,6 +411,7 @@ def intersection(self: GeoSeries, other: GeoSeries) -> GeoSeries: # type: ignor >>> import bigframes as bpd >>> import bigframes.geopandas >>> from shapely.geometry import Polygon, LineString, Point + >>> bpd.options.display.progress_bar = None We can check two GeoSeries against each other, row by row. 
diff --git a/third_party/bigframes_vendored/pandas/AUTHORS.md b/third_party/bigframes_vendored/pandas/AUTHORS.md index 396bcbf9dd..84fcfe05e3 100644 --- a/third_party/bigframes_vendored/pandas/AUTHORS.md +++ b/third_party/bigframes_vendored/pandas/AUTHORS.md @@ -47,6 +47,7 @@ file to indicate the copyright and license terms: Other licenses can be found in the LICENSES directory. +License ======= pandas is distributed under a 3-clause ("Simplified" or "New") BSD diff --git a/third_party/bigframes_vendored/pandas/README.md b/third_party/bigframes_vendored/pandas/README.md index f92a629a4c..1aa5068d5e 100644 --- a/third_party/bigframes_vendored/pandas/README.md +++ b/third_party/bigframes_vendored/pandas/README.md @@ -60,6 +60,7 @@ Here are just a few of the things that pandas does well: generation and frequency conversion, moving window statistics, date shifting and lagging + [missing-data]: https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html [insertion-deletion]: https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html#column-selection-addition-deletion [alignment]: https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html?highlight=alignment#intro-to-data-structures @@ -119,6 +120,7 @@ python setup.py install or for installing in [development mode](https://pip.pypa.io/en/latest/cli/pip_install/#install-editable): + ```sh python -m pip install -e . --no-build-isolation --no-use-pep517 ``` diff --git a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py index 9f6dfc1c74..fe15e7b40d 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py @@ -19,12 +19,14 @@ def len(self): **Examples:** >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ ... [1, 2, 3], ... 
[3], ... ], - ... dtype=pd.ArrowDtype(pa.list_(pa.int64())), + ... dtype=bpd.ArrowDtype(pa.list_(pa.int64())), ... ) >>> s.list.len() 0 3 @@ -43,12 +45,14 @@ def __getitem__(self, key: int | slice): **Examples:** >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ ... [1, 2, 3], ... [3], ... ], - ... dtype=pd.ArrowDtype(pa.list_(pa.int64())), + ... dtype=bpd.ArrowDtype(pa.list_(pa.int64())), ... ) >>> s.list[0] 0 1 @@ -79,13 +83,15 @@ def field(self, name_or_index: str | int): **Examples:** >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ ... {"version": 1, "project": "pandas"}, ... {"version": 2, "project": "pandas"}, ... {"version": 1, "project": "numpy"}, ... ], - ... dtype=pd.ArrowDtype(pa.struct( + ... dtype=bpd.ArrowDtype(pa.struct( ... [("version", pa.int64()), ("project", pa.string())] ... )) ... ) @@ -123,13 +129,15 @@ def explode(self): **Examples:** >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ ... {"version": 1, "project": "pandas"}, ... {"version": 2, "project": "pandas"}, ... {"version": 1, "project": "numpy"}, ... ], - ... dtype=pd.ArrowDtype(pa.struct( + ... dtype=bpd.ArrowDtype(pa.struct( ... [("version", pa.int64()), ("project", pa.string())] ... )) ... ) @@ -157,13 +165,15 @@ def dtypes(self): **Examples:** >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ ... {"version": 1, "project": "pandas"}, ... {"version": 2, "project": "pandas"}, ... {"version": 1, "project": "numpy"}, ... ], - ... dtype=pd.ArrowDtype(pa.struct( + ... dtype=bpd.ArrowDtype(pa.struct( ... [("version", pa.int64()), ("project", pa.string())] ... )) ... 
) @@ -190,6 +200,8 @@ def explode(self, column, *, separator: str = "."): **Examples:** >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> bpd.options.display.progress_bar = None >>> countries = bpd.Series(["cn", "es", "us"]) >>> files = bpd.Series( ... [ @@ -197,7 +209,7 @@ def explode(self, column, *, separator: str = "."): ... {"version": 2, "project": "pandas"}, ... {"version": 1, "project": "numpy"}, ... ], - ... dtype=pd.ArrowDtype(pa.struct( + ... dtype=bpd.ArrowDtype(pa.struct( ... [("version", pa.int64()), ("project", pa.string())] ... )) ... ) @@ -221,6 +233,7 @@ def explode(self, column, *, separator: str = "."): Separator/delimiter to use to separate the original column name from the sub-field column name. + Returns: DataFrame: Original DataFrame with exploded struct column(s). diff --git a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py index eeffbbdb7f..1736a7f9ef 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py @@ -15,6 +15,8 @@ def strftime(self, date_format: str): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.to_datetime( ... ['2014-08-15 08:15:12', '2012-02-29 08:15:12+06:00', '2015-08-15 08:15:12+05:00'], ... utc=True @@ -34,7 +36,6 @@ def strftime(self, date_format: str): bigframes.pandas.Series: Series of formatted strings. """ - # TODO(tswast): remove bpd boilerplate when normalize is implemented in polars session. raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def normalize(self): @@ -50,6 +51,7 @@ def normalize(self): **Examples:** + >>> import pandas as pd >>> import bigframes.pandas as bpd >>> s = bpd.Series(pd.date_range( ... start='2014-08-01 10:00', @@ -66,7 +68,6 @@ def normalize(self): bigframes.pandas.Series: Series of the same dtype as the data. 
""" - # TODO(tswast): remove bpd boilerplate when normalize is implemented in polars session. raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def floor(self, freq: str): @@ -84,6 +85,8 @@ def floor(self, freq: str): **Examples:** + >>> import pandas as pd + >>> import bigframes.pandas as bpd >>> rng = pd.date_range('1/1/2018 11:59:00', periods=3, freq='min') >>> bpd.Series(rng).dt.floor("h") 0 2018-01-01 11:00:00 diff --git a/third_party/bigframes_vendored/pandas/core/computation/eval.py b/third_party/bigframes_vendored/pandas/core/computation/eval.py index 2f01b7edfc..d3d11a9c2a 100644 --- a/third_party/bigframes_vendored/pandas/core/computation/eval.py +++ b/third_party/bigframes_vendored/pandas/core/computation/eval.py @@ -172,6 +172,9 @@ def eval( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"animal": ["dog", "pig"], "age": [10, 20]}) >>> df animal age diff --git a/third_party/bigframes_vendored/pandas/core/computation/expr.py b/third_party/bigframes_vendored/pandas/core/computation/expr.py index ca9e6a60ce..44f649e59d 100644 --- a/third_party/bigframes_vendored/pandas/core/computation/expr.py +++ b/third_party/bigframes_vendored/pandas/core/computation/expr.py @@ -165,6 +165,7 @@ def _is_type(t): _is_list = _is_type(list) _is_str = _is_type(str) + # partition all AST nodes _all_nodes = frozenset( node @@ -196,9 +197,11 @@ def _filter_nodes(superclass, all_nodes=_all_nodes): _keyword_nodes = _filter_nodes(ast.keyword) _alias_nodes = _filter_nodes(ast.alias) + # nodes that we don't support directly but are needed for parsing _hacked_nodes = frozenset(["Assign", "Module", "Expr"]) + _unsupported_expr_nodes = frozenset( [ "Yield", diff --git a/third_party/bigframes_vendored/pandas/core/computation/ops.py b/third_party/bigframes_vendored/pandas/core/computation/ops.py index a15972fc4c..75b914c876 100644 --- a/third_party/bigframes_vendored/pandas/core/computation/ops.py 
+++ b/third_party/bigframes_vendored/pandas/core/computation/ops.py @@ -52,6 +52,7 @@ MATHOPS = _unary_math_ops + _binary_math_ops + LOCAL_TAG = "__pd_eval_local_" diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index b433c739cc..557c332797 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -39,6 +39,9 @@ def shape(self) -> tuple[int, int]: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2, 3], ... 'col2': [4, 5, 6]}) >>> df.shape @@ -60,6 +63,9 @@ def axes(self) -> list: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.axes[1:] [Index(['col1', 'col2'], dtype='object')] @@ -72,6 +78,9 @@ def values(self) -> np.ndarray: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.values array([[1, 3], @@ -101,6 +110,8 @@ def T(self) -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df col1 col2 @@ -135,6 +146,9 @@ def transpose(self) -> DataFrame: **Square DataFrame with homogeneous dtype** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> d1 = {'col1': [1, 2], 'col2': [3, 4]} >>> df1 = bpd.DataFrame(data=d1) >>> df1 @@ -242,6 +256,9 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': ["hello", "world"], 'col3': [True, False]}) >>> df.select_dtypes(include=['Int64']) col1 @@ -257,6 +274,7 @@ def 
select_dtypes(self, include=None, exclude=None) -> DataFrame: [2 rows x 2 columns] + Args: include (scalar or list-like): A selection of dtypes or strings to be included. @@ -362,6 +380,9 @@ def to_numpy( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.to_numpy() array([[1, 3], @@ -398,9 +419,11 @@ def to_gbq( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Write a DataFrame to a BigQuery table. - >>> import bigframes.pandas as bpd >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> # destination_table = PROJECT_ID + "." + DATASET_ID + "." + TABLE_NAME >>> df.to_gbq("bigframes-dev.birds.test-numbers", if_exists="replace") @@ -487,6 +510,7 @@ def to_gbq( If an invalid value is provided for ``if_exists`` that is not one of ``fail``, ``replace``, or ``append``. + """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -506,6 +530,8 @@ def to_parquet( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> gcs_bucket = "gs://bigframes-dev-testing/sample_parquet*.parquet" >>> df.to_parquet(path=gcs_bucket) @@ -560,6 +586,9 @@ def to_dict( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.to_dict() {'col1': {np.int64(0): 1, np.int64(1): 2}, 'col2': {np.int64(0): 3, np.int64(1): 4}} @@ -637,17 +666,12 @@ def to_excel( **Examples:** - >>> import tempfile >>> import bigframes.pandas as bpd + >>> import tempfile + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) - - >>> try: - ... import openpyxl - ... df.to_excel(tempfile.TemporaryFile()) - ... - ... except ImportError: - ... pass # openpyxl is required. 
+ >>> df.to_excel(tempfile.TemporaryFile()) Args: excel_writer (path-like, file-like, or ExcelWriter object): @@ -679,6 +703,9 @@ def to_latex( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> print(df.to_latex()) \begin{tabular}{lrr} @@ -727,6 +754,9 @@ def to_records( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.to_records() rec.array([(0, 1, 3), (1, 2, 4)], @@ -784,6 +814,9 @@ def to_string( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> print(df.to_string()) col1 col2 @@ -881,6 +914,9 @@ def to_html( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> print(df.to_html())
@@ -988,6 +1024,9 @@ def to_markdown( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> print(df.to_markdown()) | | col1 | col2 | @@ -1019,6 +1058,9 @@ def to_pickle(self, path, *, allow_large_results, **kwargs) -> None: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> gcs_bucket = "gs://bigframes-dev-testing/sample_pickle_gcs.pkl" >>> df.to_pickle(path=gcs_bucket) @@ -1038,6 +1080,9 @@ def to_orc(self, path=None, *, allow_large_results=None, **kwargs) -> bytes | No **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> import tempfile >>> df.to_orc(tempfile.TemporaryFile()) @@ -1145,6 +1190,9 @@ def insert(self, loc, column, value, allow_duplicates=False): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) Insert a new column named 'col3' between 'col1' and 'col2' with all entries set to 5. @@ -1195,6 +1243,9 @@ def drop( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame(np.arange(12).reshape(3, 4), ... columns=['A', 'B', 'C', 'D']) >>> df @@ -1233,6 +1284,7 @@ def drop( Drop columns and/or rows of MultiIndex DataFrame: + >>> import pandas as pd >>> midx = pd.MultiIndex(levels=[['llama', 'cow', 'falcon'], ... ['speed', 'weight', 'length']], ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], @@ -1317,6 +1369,7 @@ def align( Join method is specified for each axis Index. 
+ Args: other (DataFrame or Series): join ({'outer', 'inner', 'left', 'right'}, default 'outer'): @@ -1349,6 +1402,9 @@ def rename( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) >>> df A B @@ -1418,6 +1474,9 @@ def set_index( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'month': [1, 4, 7, 10], ... 'year': [2012, 2014, 2013, 2014], ... 'sale': [55, 40, 84, 31]}) @@ -1557,6 +1616,10 @@ def reset_index( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> import numpy as np >>> df = bpd.DataFrame([('bird', 389.0), ... ('bird', 24.0), ... ('mammal', 80.5), @@ -1596,6 +1659,7 @@ class max_speed You can also use ``reset_index`` with ``MultiIndex``. + >>> import pandas as pd >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'), ... ('bird', 'parrot'), ... ('mammal', 'lion'), @@ -1636,6 +1700,7 @@ class name speed max [4 rows x 2 columns] + Args: level (int, str, tuple, or list, default None): Only remove the given levels from the index. Removes all levels by @@ -1730,9 +1795,12 @@ def dropna( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], ... "toy": [np.nan, 'Batmobile', 'Bullwhip'], - ... "born": [pd.NA, "1940-04-25", pd.NA]}) + ... "born": [bpd.NA, "1940-04-25", bpd.NA]}) >>> df name toy born 0 Alfred @@ -1821,6 +1889,7 @@ def dropna( ignore_index (bool, default ``False``): If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. + Returns: bigframes.pandas.DataFrame: DataFrame with NA entries dropped from it. 
@@ -1839,6 +1908,9 @@ def isin(self, values): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]}, ... index=['falcon', 'dog']) >>> df @@ -1892,6 +1964,9 @@ def keys(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -1910,6 +1985,8 @@ def iterrows(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -1934,6 +2011,8 @@ def itertuples(self, index: bool = True, name: str | None = "Pandas"): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -1965,6 +2044,9 @@ def items(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'species': ['bear', 'bear', 'marsupial'], ... 'population': [1864, 22000, 80000]}, ... index=['panda', 'polar', 'koala']) @@ -2003,6 +2085,9 @@ def where(self, cond, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'a': [20, 10, 0], 'b': [0, 10, 20]}) >>> df a b @@ -2092,6 +2177,9 @@ def mask(self, cond, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'a': [20, 10, 0], 'b': [0, 10, 20]}) >>> df a b @@ -2192,8 +2280,11 @@ def sort_values( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ - ... 'col1': ['A', 'A', 'B', pd.NA, 'D', 'C'], + ... 'col1': ['A', 'A', 'B', bpd.NA, 'D', 'C'], ... 'col2': [2, 1, 9, 8, 7, 4], ... 'col3': [0, 1, 9, 4, 2, 3], ... 
'col4': ['a', 'B', 'c', 'D', 'e', 'F'] @@ -2333,6 +2424,9 @@ def eq(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can use method name: >>> df = bpd.DataFrame({'angles': [0, 3, 4], @@ -2373,6 +2467,9 @@ def __eq__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'a': [0, 3, 4], ... 'b': [360, 0, 180] @@ -2401,6 +2498,9 @@ def __invert__(self) -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'a':[True, False, True], 'b':[-1, 0, 1]}) >>> ~df a b @@ -2427,6 +2527,9 @@ def ne(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can use method name: >>> df = bpd.DataFrame({'angles': [0, 3, 4], @@ -2466,6 +2569,9 @@ def __ne__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'a': [0, 3, 4], ... 'b': [360, 0, 180] @@ -2503,6 +2609,9 @@ def le(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can use method name: >>> df = bpd.DataFrame({'angles': [0, 3, 4], @@ -2543,6 +2652,9 @@ def __le__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'a': [0, -1, 1], ... 
'b': [1, 0, -1] @@ -2580,6 +2692,9 @@ def lt(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can use method name: >>> df = bpd.DataFrame({'angles': [0, 3, 4], @@ -2620,6 +2735,9 @@ def __lt__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'a': [0, -1, 1], ... 'b': [1, 0, -1] @@ -2657,6 +2775,9 @@ def ge(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can use method name: >>> df = bpd.DataFrame({'angles': [0, 3, 4], @@ -2697,6 +2818,9 @@ def __ge__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'a': [0, -1, 1], ... 'b': [1, 0, -1] @@ -2734,6 +2858,9 @@ def gt(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'angles': [0, 3, 4], ... 'degrees': [360, 180, 360]}, ... index=['circle', 'triangle', 'rectangle']) @@ -2772,6 +2899,9 @@ def __gt__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'a': [0, -1, 1], ... 'b': [1, 0, -1] @@ -2806,6 +2936,9 @@ def add(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -2847,6 +2980,9 @@ def __add__(self, other) -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'height': [1.5, 2.6], ... 
'weight': [500, 800] @@ -2919,6 +3055,9 @@ def radd(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -2979,6 +3118,9 @@ def sub(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3020,6 +3162,9 @@ def __sub__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can subtract a scalar: >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -3065,6 +3210,9 @@ def rsub(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3123,6 +3271,9 @@ def mul(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3164,6 +3315,9 @@ def __mul__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can multiply with a scalar: >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -3209,6 +3363,9 @@ def rmul(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 
'B': [4, 5, 6], @@ -3250,6 +3407,9 @@ def __rmul__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can multiply with a scalar: >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -3295,6 +3455,9 @@ def truediv(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3336,6 +3499,9 @@ def __truediv__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can multiply with a scalar: >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -3381,6 +3547,9 @@ def rtruediv(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3439,6 +3608,9 @@ def floordiv(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3480,6 +3652,9 @@ def __floordiv__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can divide by a scalar: >>> df = bpd.DataFrame({"a": [15, 15, 15], "b": [30, 30, 30]}) @@ -3525,6 +3700,9 @@ def rfloordiv(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3583,6 +3761,9 @@ def mod(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 
'B': [4, 5, 6], @@ -3624,6 +3805,9 @@ def __mod__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can modulo with a scalar: >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -3669,6 +3853,9 @@ def rmod(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3728,6 +3915,9 @@ def pow(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -3770,6 +3960,9 @@ def __pow__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can exponentiate with a scalar: >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -3816,6 +4009,9 @@ def rpow(self, other, axis: str | int = "columns") -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 
'B': [4, 5, 6], @@ -3909,6 +4105,9 @@ def combine( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df1 = bpd.DataFrame({'A': [0, 0], 'B': [4, 4]}) >>> df2 = bpd.DataFrame({'A': [1, 1], 'B': [3, 3]}) >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2 @@ -3956,6 +4155,9 @@ def combine_first(self, other) -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df1 = bpd.DataFrame({'A': [None, 0], 'B': [None, 4]}) >>> df2 = bpd.DataFrame({'A': [1, 1], 'B': [3, 3]}) >>> df1.combine_first(df2) @@ -3983,6 +4185,10 @@ def explode( **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'A': [[0, 1, 2], [], [], [3, 4]], ... 'B': 1, ... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]}) @@ -4038,6 +4244,9 @@ def corr(self, method, min_periods, numeric_only) -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'A': [1, 2, 3], ... 'B': [400, 500, 600], ... 'C': [0.8, 0.4, 0.9]}) @@ -4069,6 +4278,9 @@ def cov(self, *, numeric_only) -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'A': [1, 2, 3], ... 'B': [400, 500, 600], ... 'C': [0.8, 0.4, 0.9]}) @@ -4105,6 +4317,9 @@ def corrwith( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> index = ["a", "b", "c", "d", "e"] >>> columns = ["one", "two", "three", "four"] >>> df1 = bpd.DataFrame(np.arange(20).reshape(5, 4), index=index, columns=columns) @@ -4138,6 +4353,9 @@ def update( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'A': [1, 2, 3], ... 
'B': [400, 500, 600]}) >>> new_df = bpd.DataFrame({'B': [4, 5, 6], @@ -4200,6 +4418,9 @@ def groupby( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'Animal': ['Falcon', 'Falcon', ... 'Parrot', 'Parrot'], ... 'Max Speed': [380., 370., 24., 26.]}) @@ -4294,18 +4515,17 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Let's use ``reuse=False`` flag to make sure a new ``remote_function`` is created every time we run the following code, but you can skip it to potentially reuse a previously deployed ``remote_function`` from the same user defined function. - >>> def minutes_to_hours(x: int) -> float: - ... return x / 60 - >>> minutes_to_hours = bpd.deploy_remote_function( - ... minutes_to_hours, - ... reuse=False, - ... cloud_function_service_account="default", - ... ) # doctest: +SKIP + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") + ... def minutes_to_hours(x: int) -> float: + ... return x/60 >>> df_minutes = bpd.DataFrame( ... {"system_minutes" : [0, 30, 60, 90, 120], @@ -4320,8 +4540,8 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: [5 rows x 2 columns] - >>> df_hours = df_minutes.map(minutes_to_hours) # doctest: +SKIP - >>> df_hours # doctest: +SKIP + >>> df_hours = df_minutes.map(minutes_to_hours) + >>> df_hours system_minutes user_minutes 0 0.0 0.0 1 0.5 0.25 @@ -4337,11 +4557,11 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: >>> df_minutes = bpd.DataFrame( ... { - ... "system_minutes" : [0, 30, 60, None, 90, 120, pd.NA], - ... "user_minutes" : [0, 15, 75, 90, 6, None, pd.NA] + ... "system_minutes" : [0, 30, 60, None, 90, 120, bpd.NA], + ... "user_minutes" : [0, 15, 75, 90, 6, None, bpd.NA] ... 
}, dtype="Int64") - >>> df_hours = df_minutes.map(minutes_to_hours, na_action='ignore') # doctest: +SKIP - >>> df_hours # doctest: +SKIP + >>> df_hours = df_minutes.map(minutes_to_hours, na_action='ignore') + >>> df_hours system_minutes user_minutes 0 0.0 0.0 1 0.5 0.25 @@ -4392,6 +4612,9 @@ def join( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Join two DataFrames by specifying how to handle the operation: >>> df1 = bpd.DataFrame({'col1': ['foo', 'bar'], 'col2': [1, 2]}, index=[10, 11]) @@ -4445,6 +4668,7 @@ def join( [1 rows x 4 columns] + Another option to join using the key columns is to use the on parameter: >>> df1.join(df2, on="col2", how="right") @@ -4540,6 +4764,9 @@ def merge( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Merge DataFrames df1 and df2 by specifying type of merge: >>> df1 = bpd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) @@ -4670,6 +4897,7 @@ def round(self, decimals): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)], ... columns=['dogs', 'cats']) >>> df @@ -4752,6 +4980,10 @@ def apply(self, func, *, axis=0, args=(), **kwargs): **Examples:** + >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df col1 col2 @@ -4776,14 +5008,14 @@ def apply(self, func, *, axis=0, args=(), **kwargs): to select only the necessary columns before calling `apply()`. Note: This feature is currently in **preview**. - >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") # doctest: +SKIP + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") ... def foo(row: pd.Series) -> int: ... result = 1 ... result += row["col1"] ... result += row["col2"]*row["col2"] ... 
return result - >>> df[["col1", "col2"]].apply(foo, axis=1) # doctest: +SKIP + >>> df[["col1", "col2"]].apply(foo, axis=1) 0 11 1 19 dtype: Int64 @@ -4791,7 +5023,7 @@ def apply(self, func, *, axis=0, args=(), **kwargs): You could return an array output for every input row from the remote function. - >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") # doctest: +SKIP + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") ... def marks_analyzer(marks: pd.Series) -> list[float]: ... import statistics ... average = marks.mean() @@ -4808,8 +5040,8 @@ def apply(self, func, *, axis=0, args=(), **kwargs): ... "chemistry": [88, 56, 72], ... "algebra": [78, 91, 79] ... }, index=["Alice", "Bob", "Charlie"]) - >>> stats = df.apply(marks_analyzer, axis=1) # doctest: +SKIP - >>> stats # doctest: +SKIP + >>> stats = df.apply(marks_analyzer, axis=1) + >>> stats Alice [77.67 78. 77.19 76.71] Bob [75.67 80. 74.15 72.56] Charlie [75.33 75. 75.28 75.22] @@ -4832,14 +5064,14 @@ def apply(self, func, *, axis=0, args=(), **kwargs): [2 rows x 3 columns] - >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") # doctest: +SKIP + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") ... def foo(x: int, y: int, z: int) -> float: ... result = 1 ... result += x ... result += y/z ... 
return result - >>> df.apply(foo, axis=1) # doctest: +SKIP + >>> df.apply(foo, axis=1) 0 2.6 1 3.8 dtype: Float64 @@ -4899,6 +5131,9 @@ def any(self, *, axis=0, bool_only: bool = False): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [True, True], "B": [False, False]}) >>> df A B @@ -4943,6 +5178,9 @@ def all(self, axis=0, *, bool_only: bool = False): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [True, True], "B": [False, False]}) >>> df A B @@ -4984,6 +5222,8 @@ def prod(self, axis=0, *, numeric_only: bool = False): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [1, 2, 3], "B": [4.5, 5.5, 6.5]}) >>> df A B @@ -5028,6 +5268,9 @@ def min(self, axis=0, *, numeric_only: bool = False): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df A B @@ -5070,6 +5313,9 @@ def max(self, axis=0, *, numeric_only: bool = False): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df A B @@ -5111,6 +5357,9 @@ def sum(self, axis=0, *, numeric_only: bool = False): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df A B @@ -5150,6 +5399,9 @@ def mean(self, axis=0, *, numeric_only: bool = False): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df A B @@ -5190,6 +5442,8 @@ def median(self, *, numeric_only: bool = False, exact: bool = True): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> 
df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df A B @@ -5226,6 +5480,7 @@ def quantile( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), ... columns=['a', 'b']) >>> df.quantile(.1) @@ -5262,6 +5517,9 @@ def var(self, axis=0, *, numeric_only: bool = False): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) >>> df A B @@ -5284,6 +5542,7 @@ def var(self, axis=0, *, numeric_only: bool = False): 1 0.5 dtype: Float64 + Args: axis ({index (0), columns (1)}): Axis for the function to be applied on. @@ -5303,6 +5562,9 @@ def skew(self, *, numeric_only: bool = False): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'A': [1, 2, 3, 4, 5], ... 'B': [5, 4, 3, 2, 1], ... 'C': [2, 2, 3, 2, 2]}) @@ -5341,6 +5603,9 @@ def kurt(self, *, numeric_only: bool = False): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [1, 2, 3, 4, 5], ... "B": [3, 4, 3, 2, 1], ... "C": [2, 2, 3, 2, 2]}) @@ -5378,6 +5643,9 @@ def std(self, *, numeric_only: bool = False): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [1, 2, 3, 4, 5], ... "B": [3, 4, 3, 2, 1], ... "C": [2, 2, 3, 2, 2]}) @@ -5417,6 +5685,9 @@ def count(self, *, numeric_only: bool = False): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [1, None, 3, 4, 5], ... "B": [1, 2, 3, 4, 5], ... 
"C": [None, 3.5, None, 4.5, 5.0]}) @@ -5468,6 +5739,8 @@ def nlargest(self, n: int, columns, keep: str = "first"): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [1, 1, 3, 3, 5, 5], ... "B": [5, 6, 3, 4, 1, 2], ... "C": ['a', 'b', 'a', 'b', 'a', 'b']}) @@ -5558,6 +5831,8 @@ def nsmallest(self, n: int, columns, keep: str = "first"): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [1, 1, 3, 3, 5, 5], ... "B": [5, 6, 3, 4, 1, 2], ... "C": ['a', 'b', 'a', 'b', 'a', 'b']}) @@ -5605,6 +5880,7 @@ def nsmallest(self, n: int, columns, keep: str = "first"): [1 rows x 3 columns] + Args: n (int): Number of rows to return. @@ -5636,6 +5912,9 @@ def idxmin(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -5663,6 +5942,9 @@ def idxmax(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -5694,6 +5976,9 @@ def melt(self, id_vars, value_vars, var_name, value_name): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [1, None, 3, 4, 5], ... "B": [1, 2, 3, 4, 5], ... "C": [None, 3.5, None, 4.5, 5.0]}) @@ -5742,6 +6027,7 @@ def melt(self, id_vars, value_vars, var_name, value_name): [10 rows x 3 columns] + Args: id_vars (tuple, list, or ndarray, optional): Column(s) to use as identifier variables. 
@@ -5765,6 +6051,9 @@ def nunique(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 2]}) >>> df A B @@ -5791,6 +6080,9 @@ def cummin(self) -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -5820,6 +6112,9 @@ def cummax(self) -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -5849,6 +6144,9 @@ def cumsum(self) -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -5883,6 +6181,9 @@ def cumprod(self) -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -5921,6 +6222,9 @@ def diff( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -5966,6 +6270,9 @@ def agg(self, func): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) >>> df A B @@ -6028,6 +6335,8 @@ def describe(self, include: None | Literal["all"] = None): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [0, 2, 8], "C": ["cat", "cat", "dog"]}) >>> df A B C @@ -6050,6 +6359,7 @@ def describe(self, include: None | Literal["all"] = None): [8 rows x 2 columns] + Using describe with include = "all": >>> df.describe(include="all") A B C @@ -6096,6 +6406,9 @@ def pivot(self, *, 
columns, index=None, values=None): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... "foo": ["one", "one", "one", "two", "two"], ... "bar": ["A", "B", "C", "A", "B"], @@ -6164,6 +6477,8 @@ def pivot_table(self, values=None, index=None, columns=None, aggfunc="mean"): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'Product': ['Product A', 'Product B', 'Product A', 'Product B', 'Product A', 'Product B'], ... 'Region': ['East', 'West', 'East', 'West', 'West', 'East'], @@ -6254,6 +6569,9 @@ def stack(self, level=-1): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'A': [1, 3], 'B': [2, 4]}, index=['foo', 'bar']) >>> df A B @@ -6290,6 +6608,9 @@ def unstack(self, level=-1): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'A': [1, 3], 'B': [2, 4]}, index=['foo', 'bar']) >>> df A B @@ -6328,6 +6649,9 @@ def index(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can access the index of a DataFrame via ``index`` property. >>> df = bpd.DataFrame({'Name': ['Alice', 'Bob', 'Aritra'], @@ -6378,6 +6702,9 @@ def columns(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can access the column labels of a DataFrame via ``columns`` property. >>> df = bpd.DataFrame({'Name': ['Alice', 'Bob', 'Aritra'], @@ -6423,8 +6750,11 @@ def value_counts( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'num_legs': [2, 4, 4, 6, 7], - ... 'num_wings': [2, 0, 0, 0, pd.NA]}, + ... 'num_wings': [2, 0, 0, 0, bpd.NA]}, ... index=['falcon', 'dog', 'cat', 'ant', 'octopus'], ... 
dtype='Int64') >>> df @@ -6501,6 +6831,9 @@ def eval(self, expr: str) -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) >>> df A B @@ -6558,6 +6891,7 @@ def eval(self, expr: str) -> DataFrame: [5 rows x 4 columns] + Args: expr (str): The expression string to evaluate. @@ -6573,6 +6907,9 @@ def query(self, expr: str) -> DataFrame | None: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'A': range(1, 6), ... 'B': range(10, 0, -2), ... 'C C': range(10, 5, -1)}) @@ -6645,6 +6982,9 @@ def interpolate(self, method: str = "linear"): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3, None, None, 6], ... 'B': [None, 6, None, 2, None, 3], @@ -6692,6 +7032,9 @@ def fillna(self, value): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame([[np.nan, 2, np.nan, 0], ... [3, 4, np.nan, 1], ... [np.nan, np.nan, np.nan, np.nan], @@ -6767,6 +7110,8 @@ def replace( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'int_col': [1, 1, 2, 3], ... 'string_col': ["a", "b", "c", "b"], @@ -6805,6 +7150,7 @@ def replace( [4 rows x 2 columns] + Args: to_replace (str, regex, list, int, float or None): How to find the values that will be replaced. @@ -6860,6 +7206,9 @@ def iat(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], ... 
columns=['A', 'B', 'C']) >>> df @@ -6891,6 +7240,9 @@ def at(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], ... index=[4, 5, 6], columns=['A', 'B', 'C']) >>> df @@ -6937,6 +7289,9 @@ def dot(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> left = bpd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]]) >>> left 0 1 2 3 @@ -7028,6 +7383,9 @@ def __matmul__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> left = bpd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]]) >>> left 0 1 2 3 @@ -7085,6 +7443,9 @@ def __len__(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'a': [0, 1, 2], ... 'b': [3, 4, 5] @@ -7105,6 +7466,10 @@ def __array__(self, dtype=None, copy: Optional[bool] = None): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> import numpy as np + >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [11, 22, 33]}) >>> np.array(df) @@ -7136,6 +7501,9 @@ def __getitem__(self, key): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... "name" : ["alpha", "beta", "gamma"], ... "age": [20, 30, 40], @@ -7179,6 +7547,7 @@ def __getitem__(self, key): You can specify a pandas Index with desired column labels. + >>> import pandas as pd >>> df[pd.Index(["age", "location"])] age location 0 20 WA @@ -7207,6 +7576,9 @@ def __setitem__(self, key, value): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... "name" : ["alpha", "beta", "gamma"], ... 
"age": [20, 30, 40], diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index e8079e573b..273339efcf 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -38,6 +38,9 @@ def size(self) -> int: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series({'a': 1, 'b': 2, 'c': 3}) >>> s.size 3 @@ -62,6 +65,9 @@ def __iter__(self) -> Iterator: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': [4, 5, 6], @@ -100,6 +106,9 @@ def astype(self, dtype): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Create a DataFrame: >>> d = {'col1': [1, 2], 'col2': [3, 4]} @@ -143,7 +152,7 @@ def astype(self, dtype): Note that this is equivalent of using ``to_datetime`` with ``unit='us'``: - >>> bpd.to_datetime(ser, unit='us', utc=True) # doctest: +SKIP + >>> bpd.to_datetime(ser, unit='us', utc=True) 0 2034-02-08 11:13:20.246789+00:00 1 2021-06-19 17:20:44.123101+00:00 2 2003-06-05 17:30:34.120101+00:00 @@ -341,6 +350,9 @@ def get(self, key, default=None): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame( ... [ ... [24.3, 75.7, "high"], @@ -449,6 +461,9 @@ def head(self, n: int = 5): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion', ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']}) >>> df @@ -547,6 +562,8 @@ def sample( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'num_legs': [2, 4, 8, 0], ... 'num_wings': [2, 0, 0, 0], ... 
'num_specimen_seen': [10, 2, 1, 8]}, @@ -626,6 +643,9 @@ def dtypes(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'float': [1.0], 'int': [1], 'string': ['foo']}) >>> df.dtypes float Float64 @@ -648,6 +668,9 @@ def copy(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Modification in the original Series will not affect the copy Series: >>> s = bpd.Series([1, 2], index=["a", "b"]) @@ -718,6 +741,10 @@ def ffill(self, *, limit: Optional[int] = None): **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame([[np.nan, 2, np.nan, 0], ... [3, 4, np.nan, 1], ... [np.nan, np.nan, np.nan, np.nan], @@ -743,6 +770,7 @@ def ffill(self, *, limit: Optional[int] = None): [4 rows x 4 columns] + Fill NA/NaN values in Series: >>> series = bpd.Series([1, np.nan, 2, 3]) @@ -762,6 +790,7 @@ def ffill(self, *, limit: Optional[int] = None): maximum number of entries along the entire axis where NaNs will be filled. Must be greater than 0 if not None. + Returns: bigframes.pandas.DataFrame or bigframes.pandas.Series or None: Object with missing values filled. @@ -796,9 +825,13 @@ def isna(self) -> NDFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> import numpy as np + >>> df = bpd.DataFrame(dict( ... age=[5, 6, np.nan], - ... born=[pd.NA, "1940-04-25", "1940-04-25"], + ... born=[bpd.NA, "1940-04-25", "1940-04-25"], ... name=['Alfred', 'Batman', ''], ... toy=[None, 'Batmobile', 'Joker'], ... 
)) @@ -830,7 +863,7 @@ def isna(self) -> NDFrame: Show which entries in a Series are NA: - >>> ser = bpd.Series([5, None, 6, np.nan, pd.NA]) + >>> ser = bpd.Series([5, None, 6, np.nan, bpd.NA]) >>> ser 0 5 1 @@ -1035,6 +1068,8 @@ def rolling( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([0,1,2,3,4]) >>> s.rolling(window=3).min() 0 @@ -1119,6 +1154,10 @@ def pipe( Constructing a income DataFrame from a dictionary. + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> data = [[8000, 1000], [9500, np.nan], [5000, 2000]] >>> df = bpd.DataFrame(data, columns=['Salary', 'Others']) >>> df diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index 8dba97ff07..1e39ec8f94 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -45,6 +45,8 @@ def describe(self, include: None | Literal["all"] = None): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({"A": [1, 1, 1, 2, 2], "B": [0, 2, 8, 2, 7], "C": ["cat", "cat", "dog", "mouse", "cat"]}) >>> df A B C @@ -84,6 +86,8 @@ def any(self): For SeriesGroupBy: >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([1, 2, 0], index=lst) >>> ser.groupby(level=0).any() @@ -121,6 +125,8 @@ def all(self): For SeriesGroupBy: >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([1, 2, 0], index=lst) >>> ser.groupby(level=0).all() @@ -157,6 +163,10 @@ def count(self): For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'b'] 
>>> ser = bpd.Series([1, 2, np.nan], index=lst) >>> ser.groupby(level=0).count() @@ -192,6 +202,9 @@ def mean( **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'A': [1, 1, 2, 1, 2], ... 'B': [np.nan, 2, 3, 4, 5], ... 'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C']) @@ -250,6 +263,9 @@ def median( For SeriesGroupBy: >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 4, 3, 3], index=lst) >>> ser.groupby(level=0).median() @@ -288,6 +304,7 @@ def quantile(self, q=0.5, *, numeric_only: bool = False): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame([ ... ['a', 1], ['a', 2], ['a', 3], ... ['b', 1], ['b', 3], ['b', 5] @@ -326,6 +343,10 @@ def std( For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 4, 3, 3], index=lst) >>> ser.groupby(level=0).std() @@ -369,6 +390,10 @@ def var( For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 4, 3, 3], index=lst) >>> ser.groupby(level=0).var() @@ -410,6 +435,9 @@ def rank( **Examples:** >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame( ... { ... "group": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], @@ -482,6 +510,10 @@ def skew( For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> ser = bpd.Series([390., 350., 357., np.nan, 22., 20., 30.], ... 
index=['Falcon', 'Falcon', 'Falcon', 'Falcon', ... 'Parrot', 'Parrot', 'Parrot'], @@ -514,6 +546,9 @@ def kurt( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'] >>> ser = bpd.Series([0, 1, 1, 0, 0, 1, 2, 4, 5], index=lst) >>> ser.groupby(level=0).kurt() @@ -544,6 +579,9 @@ def kurtosis( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'] >>> ser = bpd.Series([0, 1, 1, 0, 0, 1, 2, 4, 5], index=lst) >>> ser.groupby(level=0).kurtosis() @@ -568,8 +606,9 @@ def first(self, numeric_only: bool = False, min_count: int = -1): Defaults to skipping NA elements. **Examples:** - >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame(dict(A=[1, 1, 3], B=[None, 5, 6], C=[1, 2, 3])) >>> df.groupby("A").first() B C @@ -608,6 +647,8 @@ def last(self, numeric_only: bool = False, min_count: int = -1): Defaults to skipping NA elements. 
**Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame(dict(A=[1, 1, 3], B=[5, None, 6], C=[1, 2, 3])) >>> df.groupby("A").last() @@ -644,6 +685,9 @@ def sum( For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) >>> ser.groupby(level=0).sum() @@ -686,6 +730,10 @@ def prod(self, numeric_only: bool = False, min_count: int = 0): For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) >>> ser.groupby(level=0).prod() @@ -718,6 +766,10 @@ def min( For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) >>> ser.groupby(level=0).min() @@ -763,6 +815,9 @@ def max( For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) >>> ser.groupby(level=0).max() @@ -804,6 +859,9 @@ def cumcount(self, ascending: bool = True): For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'b', 'b', 'c'] >>> ser = bpd.Series([5, 1, 2, 3, 4], index=lst) >>> ser.groupby(level=0).cumcount() @@ -839,6 +897,10 @@ def cumprod(self, *args, **kwargs): For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([6, 2, 0], index=lst) >>> ser.groupby(level=0).cumprod() @@ -874,6 +936,10 @@ def cumsum(self, *args, **kwargs): For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> 
bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([6, 2, 0], index=lst) >>> ser.groupby(level=0).cumsum() @@ -909,6 +975,10 @@ def cummin(self, *args, numeric_only: bool = False, **kwargs): For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([6, 2, 0], index=lst) >>> ser.groupby(level=0).cummin() @@ -944,6 +1014,10 @@ def cummax(self, *args, numeric_only: bool = False, **kwargs): For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'b'] >>> ser = bpd.Series([6, 2, 0], index=lst) >>> ser.groupby(level=0).cummax() @@ -981,6 +1055,10 @@ def diff(self): For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] >>> ser = bpd.Series([7, 2, 8, 4, 3, 3], index=lst) >>> ser.groupby(level=0).diff() @@ -1023,6 +1101,10 @@ def shift(self, periods: int = 1): For SeriesGroupBy: + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 4], index=lst) >>> ser.groupby(level=0).shift(1) @@ -1063,6 +1145,9 @@ def rolling(self, *args, **kwargs): **Examples:** >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'a', 'a', 'e'] >>> ser = bpd.Series([1, 0, -2, -1, 2], index=lst) >>> ser.groupby(level=0).rolling(2).min() @@ -1119,6 +1204,9 @@ def expanding(self, *args, **kwargs): **Examples:** >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'c', 'c', 'e'] >>> ser = bpd.Series([1, 0, -2, -1, 2], index=lst) >>> ser.groupby(level=0).expanding().min() @@ -1142,6 
+1230,9 @@ def head(self, n: int = 5): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame([[1, 2], [1, 4], [5, 6]], ... columns=['A', 'B']) >>> df.groupby('A').head(1) @@ -1168,6 +1259,9 @@ def size(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + For SeriesGroupBy: >>> lst = ['a', 'a', 'b'] @@ -1219,6 +1313,9 @@ def __iter__(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + For SeriesGroupBy: >>> lst = ["a", "a", "b"] @@ -1280,6 +1377,10 @@ def agg(self, func): **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3, 4], index=[1, 1, 2, 2]) >>> s.groupby(level=0).agg(['min', 'max']) min max @@ -1309,6 +1410,10 @@ def aggregate(self, func): **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3, 4], index=[1, 1, 2, 2]) >>> s.groupby(level=0).aggregate(['min', 'max']) min max @@ -1338,6 +1443,10 @@ def nunique(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> lst = ['a', 'a', 'b', 'b'] >>> ser = bpd.Series([1, 2, 3, 3], index=lst) >>> ser.groupby(level=0).nunique() @@ -1385,6 +1494,10 @@ def agg(self, func, **kwargs): **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> data = {"A": [1, 1, 2, 2], ... "B": [1, 2, 3, 4], ... "C": [0.362838, 0.227877, 1.267767, -0.562860]} @@ -1441,6 +1554,10 @@ def aggregate(self, func, **kwargs): **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> data = {"A": [1, 1, 2, 2], ... "B": [1, 2, 3, 4], ... 
"C": [0.362838, 0.227877, 1.267767, -0.562860]} @@ -1497,6 +1614,10 @@ def nunique(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam', ... 'ham', 'ham'], ... 'value1': [1, 5, 5, 2, 5, 5], @@ -1529,6 +1650,10 @@ def value_counts( **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], diff --git a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py index 0e74b3e178..0dd487d056 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py @@ -12,6 +12,9 @@ def day(self): **Examples:** + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="D") ... ) @@ -39,6 +42,9 @@ def dayofweek(self): **Examples:** + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range('2016-12-31', '2017-01-08', freq='D').to_series() ... ) @@ -70,6 +76,9 @@ def day_of_week(self): **Examples:** + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range('2016-12-31', '2017-01-08', freq='D').to_series() ... ) @@ -97,7 +106,9 @@ def dayofyear(self): **Examples:** + >>> import pandas as pd >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range('2016-12-28', '2017-01-03', freq='D').to_series() ... 
) @@ -123,7 +134,9 @@ def day_of_year(self): **Examples:** + >>> import pandas as pd >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range('2016-12-28', '2017-01-03', freq='D').to_series() ... ) @@ -155,6 +168,7 @@ def date(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%d/%m/%Y %H:%M:%S%Ez") >>> s @@ -175,7 +189,9 @@ def hour(self): **Examples:** + >>> import pandas as pd >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="h") ... ) @@ -199,7 +215,9 @@ def minute(self): **Examples:** + >>> import pandas as pd >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="min") ... ) @@ -223,6 +241,9 @@ def month(self): **Examples:** + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="M") ... ) @@ -246,6 +267,9 @@ def isocalendar(self): **Examples:** + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range('2009-12-27', '2010-01-04', freq='d').to_series() ... ) @@ -263,9 +287,11 @@ def isocalendar(self): [9 rows x 3 columns] + Returns: DataFrame With columns year, week and day. + """ @property @@ -274,7 +300,9 @@ def second(self): **Examples:** + >>> import pandas as pd >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="s") ... 
) @@ -303,6 +331,7 @@ def time(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") >>> s @@ -324,6 +353,7 @@ def quarter(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "4/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") >>> s @@ -344,6 +374,9 @@ def year(self): **Examples:** + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... pd.date_range("2000-01-01", periods=3, freq="Y") ... ) @@ -367,6 +400,9 @@ def days(self): **Examples:** + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([pd.Timedelta("4d3m2s1us")]) >>> s 0 4 days 00:03:02.000001 @@ -382,6 +418,9 @@ def seconds(self): **Examples:** + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([pd.Timedelta("4d3m2s1us")]) >>> s 0 4 days 00:03:02.000001 @@ -397,6 +436,9 @@ def microseconds(self): **Examples:** + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([pd.Timedelta("4d3m2s1us")]) >>> s 0 4 days 00:03:02.000001 @@ -411,6 +453,9 @@ def total_seconds(self): **Examples:** + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([pd.Timedelta("1d1m1s1us")]) >>> s 0 1 days 00:01:01.000001 @@ -427,6 +472,7 @@ def tz(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = 
bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") >>> s @@ -449,6 +495,7 @@ def unit(self) -> str: **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") >>> s diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index 04f7f5938d..eba47fc1f9 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -32,6 +32,9 @@ def name(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([1, 2, 3], name='x') >>> idx Index([1, 2, 3], dtype='Int64', name='x') @@ -60,6 +63,9 @@ def values(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([1, 2, 3]) >>> idx Index([1, 2, 3], dtype='Int64') @@ -80,6 +86,9 @@ def ndim(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) >>> s 0 Ant @@ -112,6 +121,9 @@ def size(self) -> int: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + For Series: >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) @@ -144,6 +156,9 @@ def is_monotonic_increasing(self) -> bool: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> bool(bpd.Index([1, 2, 3]).is_monotonic_increasing) True @@ -166,6 +181,9 @@ def is_monotonic_decreasing(self) -> bool: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> bool(bpd.Index([3, 2, 1]).is_monotonic_decreasing) True @@ -188,6 +206,9 @@ def from_frame(cls, frame) -> Index: **Examples:** + >>> import 
bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame([['HI', 'Temp'], ['HI', 'Precip'], ... ['NJ', 'Temp'], ['NJ', 'Precip']], ... columns=['a', 'b']) @@ -225,6 +246,9 @@ def shape(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([1, 2, 3]) >>> idx Index([1, 2, 3], dtype='Int64') @@ -244,6 +268,9 @@ def nlevels(self) -> int: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> mi = bpd.MultiIndex.from_arrays([['a'], ['b'], ['c']]) >>> mi MultiIndex([('a', 'b', 'c')], @@ -263,6 +290,9 @@ def is_unique(self) -> bool: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([1, 5, 7, 7]) >>> idx.is_unique False @@ -283,6 +313,9 @@ def has_duplicates(self) -> bool: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([1, 5, 7, 7]) >>> bool(idx.has_duplicates) True @@ -303,6 +336,9 @@ def dtype(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([1, 2, 3]) >>> idx Index([1, 2, 3], dtype='Int64') @@ -328,6 +364,9 @@ def T(self) -> Index: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) >>> s 0 Ant @@ -364,6 +403,9 @@ def copy( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index(['a', 'b', 'c']) >>> new_idx = idx.copy() >>> idx is new_idx @@ -396,10 +438,14 @@ def astype(self, dtype): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([1, 2, 3]) >>> idx Index([1, 2, 3], dtype='Int64') + Args: dtype (str, data type, or pandas.ExtensionDtype): A dtype supported by BigQuery DataFrame 
include ``'boolean'``, @@ -441,6 +487,9 @@ def get_level_values(self, level) -> Index: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index(list('abc')) >>> idx Index(['a', 'b', 'c'], dtype='string') @@ -468,6 +517,9 @@ def to_series(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index(['Ant', 'Bear', 'Cow'], name='animal') By default, the original index and original name is reused. @@ -519,6 +571,9 @@ def isin(self, values): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([1,2,3]) >>> idx Index([1, 2, 3], dtype='Int64') @@ -556,6 +611,9 @@ def all(self) -> bool: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + True, because nonzero integers are considered True. >>> bool(bpd.Index([1, 2, 3]).all()) @@ -581,6 +639,9 @@ def any(self) -> bool: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> index = bpd.Index([0, 1, 2]) >>> bool(index.any()) True @@ -604,6 +665,9 @@ def min(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([3, 2, 1]) >>> int(idx.min()) 1 @@ -623,6 +687,9 @@ def max(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([3, 2, 1]) >>> int(idx.max()) 3 @@ -646,6 +713,9 @@ def argmin(self) -> int: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Consider dataset containing cereal calories >>> s = bpd.Series({'Corn Flakes': 100.0, 'Almond Delight': 110.0, @@ -680,6 +750,9 @@ def get_loc( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> unique_index = bpd.Index(list('abc')) >>> 
unique_index.get_loc('b') 1 @@ -721,6 +794,9 @@ def argmax(self) -> int: Consider dataset containing cereal calories + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series({'Corn Flakes': 100.0, 'Almond Delight': 110.0, ... 'Cinnamon Toast Crunch': 120.0, 'Cocoa Puff': 110.0}) >>> s @@ -752,6 +828,9 @@ def nunique(self) -> int: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 3, 5, 7, 7]) >>> s 0 1 @@ -781,6 +860,9 @@ def sort_values( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([10, 100, 1, 1000]) >>> idx Index([10, 100, 1, 1000], dtype='Int64') @@ -822,6 +904,10 @@ def value_counts( **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> index = bpd.Index([3, 1, 2, 3, 4, np.nan]) >>> index.value_counts() 3.0 2 @@ -875,6 +961,10 @@ def fillna(self, value) -> Index: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([np.nan, np.nan, 3]) >>> idx.fillna(0) Index([0.0, 0.0, 3.0], dtype='Float64') @@ -902,6 +992,9 @@ def rename(self, name, *, inplace): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index(['A', 'C', 'A', 'B'], name='score') >>> idx.rename('grade') Index(['A', 'C', 'A', 'B'], dtype='string', name='grade') @@ -929,6 +1022,9 @@ def drop(self, labels) -> Index: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index(['a', 'b', 'c']) >>> idx.drop(['a']) Index(['b', 'c'], dtype='string') @@ -946,6 +1042,10 @@ def dropna(self, how: typing.Literal["all", "any"] = "any"): **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> 
bpd.options.display.progress_bar = None + >>> idx = bpd.Index([1, np.nan, 3]) >>> idx.dropna() Index([1.0, 3.0], dtype='Float64') @@ -970,9 +1070,11 @@ def drop_duplicates(self, *, keep: str = "first"): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Generate an pandas.Index with duplicate values. - >>> import bigframes.pandas as bpd >>> idx = bpd.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo']) The keep parameter controls which duplicate values are removed. @@ -1011,6 +1113,8 @@ def unique(self, level: Hashable | int | None = None): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> idx = bpd.Index([1, 1, 2, 3, 3]) >>> idx.unique() Index([1, 2, 3], dtype='Int64') @@ -1030,6 +1134,8 @@ def item(self, *args, **kwargs): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1], index=['a']) >>> s.index.item() 'a' diff --git a/third_party/bigframes_vendored/pandas/core/indexes/datetimes.py b/third_party/bigframes_vendored/pandas/core/indexes/datetimes.py index 973d5c763a..105a376728 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/datetimes.py @@ -15,6 +15,10 @@ def year(self) -> base.Index: **Examples:** + >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.year Index([2025], dtype='Int64') @@ -27,6 +31,10 @@ def month(self) -> base.Index: **Examples:** + >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.month Index([2], dtype='Int64') @@ -39,6 +47,10 @@ def day(self) -> base.Index: **Examples:** + >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> 
bpd.options.display.progress_bar = None + >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.day Index([15], dtype='Int64') @@ -51,6 +63,10 @@ def day_of_week(self) -> base.Index: **Examples:** + >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.day_of_week Index([5], dtype='Int64') @@ -63,6 +79,10 @@ def dayofweek(self) -> base.Index: **Examples:** + >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.dayofweek Index([5], dtype='Int64') @@ -75,6 +95,10 @@ def weekday(self) -> base.Index: **Examples:** + >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None + >>> idx = bpd.Index([pd.Timestamp("20250215")]) >>> idx.weekday Index([5], dtype='Int64') diff --git a/third_party/bigframes_vendored/pandas/core/indexes/multi.py b/third_party/bigframes_vendored/pandas/core/indexes/multi.py index 018e638de3..a882aa40e3 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/multi.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/multi.py @@ -25,6 +25,8 @@ def from_tuples( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> tuples = [(1, 'red'), (1, 'blue'), ... 
(2, 'red'), (2, 'blue')] >>> bpd.MultiIndex.from_tuples(tuples, names=('number', 'color')) @@ -60,6 +62,8 @@ def from_arrays( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] >>> bpd.MultiIndex.from_arrays(arrays, names=('number', 'color')) MultiIndex([(1, 'red'), diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index 0f42433384..697c17f23c 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -34,6 +34,8 @@ def cut( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([0, 1, 5, 10]) >>> s 0 0 @@ -71,6 +73,7 @@ def cut( Cut with pd.IntervalIndex, requires importing pandas for IntervalIndex: + >>> import pandas as pd >>> interval_index = pd.IntervalIndex.from_tuples([(0, 1), (1, 5), (5, 20)]) >>> bpd.cut(s, bins=interval_index) 0 diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index b089c65d3b..932959a826 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -38,6 +38,9 @@ def dt(self): **Examples:** >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None + >>> seconds_series = bpd.Series(pd.date_range("2000-01-01", periods=3, freq="s")) >>> seconds_series 0 2000-01-01 00:00:00 @@ -107,6 +110,9 @@ def index(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can access the index of a Series via ``index`` property. 
>>> df = bpd.DataFrame({'Name': ['Alice', 'Bob', 'Aritra'], @@ -155,10 +161,13 @@ def shape(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 4, 9, 16]) >>> s.shape (4,) - >>> s = bpd.Series(['Alice', 'Bob', pd.NA]) + >>> s = bpd.Series(['Alice', 'Bob', bpd.NA]) >>> s.shape (3,) """ @@ -171,6 +180,9 @@ def dtype(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3]) >>> s.dtype Int64Dtype() @@ -188,6 +200,9 @@ def name(self) -> Hashable: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + For a Series: >>> s = bpd.Series([1, 2, 3], dtype="Int64", name='Numbers') @@ -233,6 +248,9 @@ def hasnans(self) -> bool: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3, None]) >>> s 0 1.0 @@ -254,6 +272,9 @@ def T(self) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) >>> s 0 Ant @@ -276,6 +297,9 @@ def transpose(self) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) >>> s 0 Ant @@ -313,6 +337,10 @@ def reset_index( **Examples:** + >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3, 4], name='foo', ... 
index=['a', 'b', 'c', 'd']) >>> s.index.name = "idx" @@ -412,6 +440,9 @@ def keys(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3], index=[0, 1, 2]) >>> s.keys() Index([0, 1, 2], dtype='Int64') @@ -491,6 +522,9 @@ def to_markdown( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(["elk", "pig", "dog", "quetzal"], name="animal") >>> print(s.to_markdown()) | | animal | @@ -543,14 +577,16 @@ def to_dict( **Examples:** + >>> import bigframes.pandas as bpd >>> from collections import OrderedDict, defaultdict + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3, 4]) >>> s.to_dict() {np.int64(0): 1, np.int64(1): 2, np.int64(2): 3, np.int64(3): 4} - >>> s.to_dict(into=OrderedDict) # doctest:+ELLIPSIS - OrderedDict(...) + >>> s.to_dict(into=OrderedDict) + OrderedDict({np.int64(0): 1, np.int64(1): 2, np.int64(2): 3, np.int64(3): 4}) >>> dd = defaultdict(list) >>> s.to_dict(into=dd) @@ -581,6 +617,9 @@ def to_frame(self, name=None) -> DataFrame: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(["a", "b", "c"], ... 
name="vals") >>> s.to_frame() @@ -675,6 +714,9 @@ def tolist(self, *, allow_large_results: Optional[bool] = None) -> list: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3]) >>> s 0 1 @@ -706,6 +748,10 @@ def to_numpy( **Examples:** + >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None + >>> ser = bpd.Series(pd.Categorical(['a', 'b', 'a'])) >>> ser.to_numpy() array(['a', 'b', 'a'], dtype=object) @@ -757,6 +803,9 @@ def to_pickle(self, path, *, allow_large_results=None, **kwargs): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> original_df = bpd.DataFrame({"foo": range(5), "bar": range(5, 10)}) >>> original_df foo bar @@ -816,6 +865,9 @@ def agg(self, func): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3, 4]) >>> s 0 1 @@ -850,7 +902,10 @@ def count(self): **Examples:** - >>> s = bpd.Series([0.0, 1.0, pd.NA]) + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([0.0, 1.0, bpd.NA]) >>> s 0 0.0 1 1.0 @@ -873,6 +928,9 @@ def nunique(self) -> int: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 3, 5, 7, 7]) >>> s 0 1 @@ -905,6 +963,9 @@ def unique(self, keep_order=True) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([2, 1, 3, 3], name='A') >>> s 0 2 @@ -945,6 +1006,9 @@ def mode(self) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([2, 4, 8, 2, 4, None]) >>> s.mode() 0 2.0 @@ -967,9 +1031,11 @@ def drop_duplicates( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Generate a 
Series with duplicated entries. - >>> import bigframes.pandas as bpd >>> s = bpd.Series(['llama', 'cow', 'llama', 'beetle', 'llama', 'hippo'], ... name='animal') >>> s @@ -1035,6 +1101,7 @@ def duplicated(self, keep="first") -> Series: **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None By default, for each set of duplicated values, the first occurrence is set on False and all others on True: @@ -1105,6 +1172,9 @@ def idxmin(self) -> Hashable: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(data=[1, None, 4, 1], ... index=['A', 'B', 'C', 'D']) >>> s @@ -1131,6 +1201,9 @@ def idxmax(self) -> Hashable: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(data=[1, None, 4, 3, 4], ... index=['A', 'B', 'C', 'D', 'E']) >>> s @@ -1156,6 +1229,8 @@ def round(self, decimals: int = 0) -> Series: **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([0.1, 1.3, 2.7]) >>> s.round() 0 0.0 @@ -1187,6 +1262,9 @@ def explode(self, *, ignore_index: Optional[bool] = False) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([[1, 2, 3], [], [3, 4]]) >>> s 0 [1 2 3] @@ -1223,6 +1301,9 @@ def corr(self, other, method="pearson", min_periods=None) -> float: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s1 = bpd.Series([.2, .0, .6, .2]) >>> s2 = bpd.Series([.3, .6, .0, .1]) >>> s1.corr(s2) @@ -1259,6 +1340,8 @@ def autocorr(self, lag: int = 1) -> float: **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([0.25, 0.5, 0.2, -0.05]) >>> s.autocorr() # doctest: +ELLIPSIS np.float64(0.10355263309024067) @@ -1294,6 +1377,9 @@ def cov( **Examples:** + >>> import 
bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s1 = bpd.Series([0.90010907, 0.13484424, 0.62036035]) >>> s2 = bpd.Series([0.12528585, 0.26962463, 0.51111198]) >>> s1.cov(s2) @@ -1317,8 +1403,12 @@ def diff(self) -> Series: Calculates the difference of a Series element compared with another element in the Series (default is element in previous row). + **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Difference with previous row >>> s = bpd.Series([1, 1, 2, 3, 5, 8]) @@ -1382,6 +1472,9 @@ def dot(self, other) -> Series | np.ndarray: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([0, 1, 2, 3]) >>> other = bpd.Series([-1, 2, -3, 4]) >>> s.dot(other) @@ -1403,6 +1496,7 @@ def dot(self, other) -> Series | np.ndarray: Series and each rows of other if other is a DataFrame or a numpy.ndarray between the Series and each columns of the numpy array. + """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1435,6 +1529,10 @@ def sort_values( **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([np.nan, 1, 3, 10, 5]) >>> s 0 @@ -1530,6 +1628,10 @@ def sort_index( **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, 4]) >>> s.sort_index() 1 c @@ -1588,6 +1690,8 @@ def nlargest( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> countries_population = {"Italy": 59000000, "France": 65000000, ... "Malta": 434000, "Maldives": 434000, ... 
"Brunei": 434000, "Iceland": 337000, @@ -1672,6 +1776,8 @@ def nsmallest(self, n: int = 5, keep: str = "first") -> Series: **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> countries_population = {"Italy": 59000000, "France": 65000000, ... "Malta": 434000, "Maldives": 434000, ... "Brunei": 434000, "Iceland": 337000, @@ -1758,6 +1864,7 @@ def apply( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None For applying arbitrary python function a `remote_function` is recommended. Let's use ``reuse=False`` flag to make sure a new `remote_function` @@ -1765,13 +1872,9 @@ def apply( to potentially reuse a previously deployed `remote_function` from the same user defined function. - >>> def minutes_to_hours(x: int) -> float: + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") + ... def minutes_to_hours(x: int) -> float: ... return x/60 - >>> bpd.deploy_remote_function( # doctest: +SKIP - ... minutes_to_hours, - ... reuse=False, - ... cloud_function_service_account="default", - ... ) >>> minutes = bpd.Series([0, 30, 60, 90, 120]) >>> minutes @@ -1782,8 +1885,8 @@ def apply( 4 120 dtype: Int64 - >>> hours = minutes.apply(minutes_to_hours) # doctest: +SKIP - >>> hours # doctest: +SKIP + >>> hours = minutes.apply(minutes_to_hours) + >>> hours 0 0.0 1 0.5 2 1.0 @@ -1795,7 +1898,7 @@ def apply( a `remote_function`, you would provide the names of the packages via `packages` param. - >>> @bpd.remote_function( # doctest: +SKIP + >>> @bpd.remote_function( ... reuse=False, ... packages=["cryptography"], ... cloud_function_service_account="default" @@ -1812,11 +1915,11 @@ def apply( ... return f.encrypt(input.encode()).decode() >>> names = bpd.Series(["Alice", "Bob"]) - >>> hashes = names.apply(get_hash) # doctest: +SKIP + >>> hashes = names.apply(get_hash) You could return an array output from the remote function. 
- >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") # doctest: +SKIP + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") ... def text_analyzer(text: str) -> list[int]: ... words = text.count(" ") + 1 ... periods = text.count(".") @@ -1829,8 +1932,8 @@ def apply( ... "I love this product! It's amazing.", ... "Hungry? Wanna eat? Lets go!" ... ]) - >>> features = texts.apply(text_analyzer) # doctest: +SKIP - >>> features # doctest: +SKIP + >>> features = texts.apply(text_analyzer) + >>> features 0 [9 1 0 0] 1 [6 1 1 0] 2 [5 0 1 2] @@ -1903,6 +2006,8 @@ def combine( **Examples:** >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None Consider 2 Datasets ``s1`` and ``s2`` containing highest clocked speeds of different birds. @@ -1960,6 +2065,9 @@ def groupby( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can group by a named index level. >>> s = bpd.Series([380, 370., 24., 26.], @@ -1981,6 +2089,7 @@ def groupby( You can also group by more than one index levels. + >>> import pandas as pd >>> s = bpd.Series([380, 370., 24., 26.], ... index=pd.MultiIndex.from_tuples( ... [("Falcon", "Clear"), @@ -2129,6 +2238,9 @@ def drop( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(data=np.arange(3), index=['A', 'B', 'C']) >>> s A 0 @@ -2144,6 +2256,7 @@ def drop( Drop 2nd level label in MultiIndex Series: + >>> import pandas as pd >>> midx = pd.MultiIndex(levels=[['llama', 'cow', 'falcon'], ... ['speed', 'weight', 'length']], ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], @@ -2209,6 +2322,7 @@ def reorder_levels(self, order: Sequence, axis) -> Series: axis ({0 or 'index', 1 or 'columns'}, default 0): For `Series` this parameter is unused and defaults to 0. 
+ Returns: type of caller (new object) """ @@ -2255,6 +2369,10 @@ def interpolate(self, method: str = "linear"): **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + Filling in NaN in a Series via linear interpolation. >>> s = bpd.Series([0, 1, np.nan, 3]) @@ -2296,6 +2414,10 @@ def fillna( **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([np.nan, 2, np.nan, -1]) >>> s 0 @@ -2348,6 +2470,8 @@ def replace( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3, 4, 5]) >>> s 0 1 @@ -2472,6 +2596,10 @@ def dropna(self, *, axis=0, inplace: bool = False, how=None) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + Drop NA values from a Series: >>> ser = bpd.Series([1., 2., np.nan]) @@ -2488,7 +2616,7 @@ def dropna(self, *, axis=0, inplace: bool = False, how=None) -> Series: Empty strings are not considered NA values. ``None`` is considered an NA value. 
- >>> ser = bpd.Series(['2', pd.NA, '', None, 'I stay'], dtype='object') + >>> ser = bpd.Series(['2', bpd.NA, '', None, 'I stay'], dtype='object') >>> ser 0 2 1 @@ -2532,6 +2660,10 @@ def between( **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + Boundary values are included by default: >>> s = bpd.Series([2, 0, 4, 8, np.nan]) @@ -2587,6 +2719,10 @@ def case_when( **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> c = bpd.Series([6, 7, 8, 9], name="c") >>> a = bpd.Series([0, 0, 1, 2]) >>> b = bpd.Series([0, 3, 4, 5]) @@ -2653,6 +2789,9 @@ def cumprod(self): **Examples:** >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s 0 2.0 @@ -2687,6 +2826,10 @@ def cumsum(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s 0 2.0 @@ -2726,6 +2869,10 @@ def cummax(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s 0 2.0 @@ -2745,6 +2892,7 @@ def cummax(self): 4 5.0 dtype: Float64 + Returns: bigframes.pandas.Series: Return cumulative maximum of scalar or Series. 
@@ -2760,6 +2908,10 @@ def cummin(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([2, np.nan, 5, -1, 0]) >>> s 0 2.0 @@ -2793,6 +2945,10 @@ def eq(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -2834,6 +2990,10 @@ def ne(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -2877,6 +3037,10 @@ def le(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -2919,6 +3083,10 @@ def lt(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -2962,6 +3130,10 @@ def ge(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3005,6 +3177,10 @@ def gt(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3047,7 +3223,10 @@ def add(self, other) -> Series: **Examples:** - >>> a = bpd.Series([1, 2, 3, pd.NA]) + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> a = bpd.Series([1, 2, 3, bpd.NA]) >>> a 0 1 1 
2 @@ -3108,6 +3287,9 @@ def __add__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1.5, 2.6], index=['elk', 'moose']) >>> s elk 1.5 @@ -3157,6 +3339,10 @@ def radd(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3218,6 +3404,10 @@ def sub( **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3259,6 +3449,9 @@ def __sub__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1.5, 2.6], index=['elk', 'moose']) >>> s elk 1.5 @@ -3308,6 +3501,10 @@ def rsub(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3366,6 +3563,10 @@ def mul(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3408,6 +3609,9 @@ def __mul__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can multiply with a scalar: >>> s = bpd.Series([1, 2, 3]) @@ -3445,6 +3649,10 @@ def rmul(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3502,6 +3710,10 @@ def truediv(self, other) -> Series: **Examples:** + >>> import 
bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3544,6 +3756,9 @@ def __truediv__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can multiply with a scalar: >>> s = bpd.Series([1, 2, 3]) @@ -3581,6 +3796,10 @@ def rtruediv(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3639,6 +3858,10 @@ def floordiv(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3681,6 +3904,9 @@ def __floordiv__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can divide by a scalar: >>> s = bpd.Series([15, 30, 45]) @@ -3718,6 +3944,10 @@ def rfloordiv(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3776,6 +4006,10 @@ def mod(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3818,6 +4052,9 @@ def __mod__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can modulo with a scalar: >>> s = bpd.Series([1, 2, 3]) @@ -3854,6 +4091,10 @@ def rmod(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> 
bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3914,6 +4155,9 @@ def pow(self, other) -> Series: **Examples:** >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -3957,6 +4201,9 @@ def __pow__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can exponentiate with a scalar: >>> s = bpd.Series([1, 2, 3]) @@ -3995,6 +4242,9 @@ def rpow(self, other) -> Series: **Examples:** >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -4054,6 +4304,10 @@ def divmod(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -4102,6 +4356,10 @@ def rdivmod(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> a = bpd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) >>> a a 1.0 @@ -4153,6 +4411,10 @@ def combine_first(self, other) -> Series: **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> s1 = bpd.Series([1, np.nan]) >>> s2 = bpd.Series([3, 4, 5]) >>> s1.combine_first(s2) @@ -4191,6 +4453,11 @@ def update(self, other) -> None: **Examples:** + >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3]) >>> s.update(bpd.Series([4, 5, 6])) >>> s @@ -4280,6 +4547,10 @@ def any( **Examples:** + >>> import bigframes.pandas as 
bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + For Series input, the output is a scalar indicating whether any element is True. >>> bpd.Series([False, False]).any() @@ -4312,6 +4583,9 @@ def max( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Calculating the max of a Series: >>> s = bpd.Series([1, 3]) @@ -4325,7 +4599,7 @@ def max( Calculating the max of a Series containing ``NA`` values: - >>> s = bpd.Series([1, 3, pd.NA]) + >>> s = bpd.Series([1, 3, bpd.NA]) >>> s 0 1 1 3 @@ -4351,6 +4625,9 @@ def min( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Calculating the min of a Series: >>> s = bpd.Series([1, 3]) @@ -4364,7 +4641,7 @@ def min( Calculating the min of a Series containing ``NA`` values: - >>> s = bpd.Series([1, 3, pd.NA]) + >>> s = bpd.Series([1, 3, bpd.NA]) >>> s 0 1 1 3 @@ -4389,6 +4666,9 @@ def std( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'person_id': [0, 1, 2, 3], ... 'age': [21, 25, 62, 43], ... 
'height': [1.61, 1.87, 1.49, 2.01]} @@ -4434,6 +4714,9 @@ def sum(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Calculating the sum of a Series: >>> s = bpd.Series([1, 3]) @@ -4447,7 +4730,7 @@ def sum(self): Calculating the sum of a Series containing ``NA`` values: - >>> s = bpd.Series([1, 3, pd.NA]) + >>> s = bpd.Series([1, 3, bpd.NA]) >>> s 0 1 1 3 @@ -4467,6 +4750,9 @@ def mean(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Calculating the mean of a Series: >>> s = bpd.Series([1, 3]) @@ -4480,7 +4766,7 @@ def mean(self): Calculating the mean of a Series containing ``NA`` values: - >>> s = bpd.Series([1, 3, pd.NA]) + >>> s = bpd.Series([1, 3, bpd.NA]) >>> s 0 1 1 3 @@ -4501,6 +4787,8 @@ def median(self, *, exact: bool = True): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3]) >>> s.median() np.float64(2.0) @@ -4540,6 +4828,8 @@ def quantile( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3, 4]) >>> s.quantile(.5) np.float64(2.5) @@ -4590,6 +4880,9 @@ def describe(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['A', 'A', 'B']) >>> s 0 A @@ -4615,6 +4908,9 @@ def skew(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3]) >>> s.skew() np.float64(0.0) @@ -4650,6 +4946,9 @@ def kurt(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 2, 3], index=['cat', 'dog', 'dog', 'mouse']) >>> s cat 1 @@ -4690,6 +4989,9 @@ def item(self: Series, *args, **kwargs): **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar 
= None >>> s = bpd.Series([1]) >>> s.item() np.int64(1) @@ -4711,6 +5013,9 @@ def items(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['A', 'B', 'C']) >>> for index, value in s.items(): ... print(f"Index : {index}, Value : {value}") @@ -4730,6 +5035,9 @@ def where(self, cond, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([10, 11, 12, 13, 14]) >>> s 0 10 @@ -4795,6 +5103,9 @@ def mask(self, cond, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([10, 11, 12, 13, 14]) >>> s 0 10 @@ -4838,7 +5149,7 @@ def mask(self, cond, other): condition is evaluated based on a complicated business logic which cannot be expressed in form of a Series. - >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") # doctest: +SKIP + >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default") ... def should_mask(name: str) -> bool: ... hash = 0 ... for char_ in name: @@ -4851,12 +5162,12 @@ def mask(self, cond, other): 1 Bob 2 Caroline dtype: string - >>> s.mask(should_mask) # doctest: +SKIP + >>> s.mask(should_mask) 0 1 Bob 2 Caroline dtype: string - >>> s.mask(should_mask, "REDACTED") # doctest: +SKIP + >>> s.mask(should_mask, "REDACTED") 0 REDACTED 1 Bob 2 Caroline @@ -4950,6 +5261,9 @@ def argmax(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Consider dataset containing cereal calories. >>> s = bpd.Series({'Corn Flakes': 100.0, 'Almond Delight': 110.0, @@ -4985,6 +5299,9 @@ def argmin(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Consider dataset containing cereal calories. 
>>> s = bpd.Series({'Corn Flakes': 100.0, 'Almond Delight': 110.0, @@ -5023,6 +5340,9 @@ def rename(self, index, *, inplace, **kwargs): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3]) >>> s 0 1 @@ -5072,6 +5392,9 @@ def rename_axis(self, mapper, *, inplace, **kwargs): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Series >>> s = bpd.Series(["dog", "cat", "monkey"]) @@ -5134,7 +5457,10 @@ def value_counts( **Examples:** - >>> s = bpd.Series([3, 1, 2, 3, 4, pd.NA], dtype="Int64") + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([3, 1, 2, 3, 4, bpd.NA], dtype="Int64") >>> s 0 3 @@ -5210,6 +5536,8 @@ def str(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(["A_Str_Series"]) >>> s 0 A_Str_Series @@ -5237,6 +5565,8 @@ def plot(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> ser = bpd.Series([1, 2, 3, 3]) >>> plot = ser.plot(kind='hist', title="My plot") >>> plot @@ -5262,6 +5592,9 @@ def isin(self, values): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['llama', 'cow', 'llama', 'beetle', 'llama', ... 
'hippo'], name='animal') >>> s @@ -5325,6 +5658,9 @@ def is_monotonic_increasing(self) -> bool: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 2]) >>> s.is_monotonic_increasing np.True_ @@ -5346,6 +5682,9 @@ def is_monotonic_decreasing(self) -> bool: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([3, 2, 2, 1]) >>> s.is_monotonic_decreasing np.True_ @@ -5386,7 +5725,9 @@ def map( **Examples:** >>> import bigframes.pandas as bpd - >>> s = bpd.Series(['cat', 'dog', pd.NA, 'rabbit']) + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['cat', 'dog', bpd.NA, 'rabbit']) >>> s 0 cat 1 dog @@ -5406,7 +5747,7 @@ def map( It also accepts a remote function: - >>> @bpd.remote_function(cloud_function_service_account="default") # doctest: +SKIP + >>> @bpd.remote_function(cloud_function_service_account="default") ... def my_mapper(val: str) -> str: ... vowels = ["a", "e", "i", "o", "u"] ... if val: @@ -5415,7 +5756,7 @@ def map( ... ]) ... return "N/A" - >>> s.map(my_mapper) # doctest: +SKIP + >>> s.map(my_mapper) 0 cAt 1 dOg 2 N/A @@ -5449,6 +5790,9 @@ def iloc(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4}, ... {'a': 100, 'b': 200, 'c': 300, 'd': 400}, ... {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000}] @@ -5526,6 +5870,9 @@ def loc(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame([[1, 2], [4, 5], [7, 8]], ... index=['cobra', 'viper', 'sidewinder'], ... columns=['max_speed', 'shield']) @@ -5610,6 +5957,9 @@ def iat(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], ... 
columns=['A', 'B', 'C']) >>> df @@ -5642,6 +5992,9 @@ def at(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], ... index=[4, 5, 6], columns=['A', 'B', 'C']) >>> df @@ -5675,6 +6028,9 @@ def values(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> bpd.Series([1, 2, 3]).values array([1, 2, 3]) @@ -5694,6 +6050,9 @@ def size(self) -> int: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + For Series: >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) @@ -5728,6 +6087,10 @@ def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> import numpy as np + >>> ser = bpd.Series([1, 2, 3]) >>> np.asarray(ser) @@ -5752,6 +6115,9 @@ def __len__(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3]) >>> len(s) 3 @@ -5765,6 +6131,9 @@ def __invert__(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> ser = bpd.Series([True, False, True]) >>> ~ser 0 False @@ -5783,6 +6152,9 @@ def __and__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([0, 1, 2, 3]) You can operate with a scalar. @@ -5819,6 +6191,9 @@ def __or__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([0, 1, 2, 3]) You can operate with a scalar. @@ -5855,6 +6230,9 @@ def __xor__(self, other): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([0, 1, 2, 3]) You can operate with a scalar. 
@@ -5891,6 +6269,9 @@ def __getitem__(self, indexer): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([15, 30, 45]) >>> s[1] np.int64(30) diff --git a/third_party/bigframes_vendored/pandas/core/strings/accessor.py b/third_party/bigframes_vendored/pandas/core/strings/accessor.py index 7a37eba341..fe94bf3049 100644 --- a/third_party/bigframes_vendored/pandas/core/strings/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/strings/accessor.py @@ -20,6 +20,8 @@ def __getitem__(self, key: typing.Union[int, slice]): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['Alice', 'Bob', 'Charlie']) >>> s.str[0] 0 A @@ -51,10 +53,12 @@ def extract(self, pat: str, flags: int = 0): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + A pattern with two groups will return a DataFrame with two columns. Non-matches will be `NaN`. - >>> import bigframes.pandas as bpd >>> s = bpd.Series(['a1', 'b2', 'c3']) >>> s.str.extract(r'([ab])(\\d)') 0 1 @@ -111,6 +115,8 @@ def find(self, sub, start: int = 0, end=None): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> ser = bpd.Series(["cow_", "duck_", "do_ve"]) >>> ser.str.find("_") 0 3 @@ -139,10 +145,12 @@ def len(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Returns the length (number of characters) in a string. - >>> import bigframes.pandas as bpd - >>> s = bpd.Series(['dog', '', pd.NA]) + >>> s = bpd.Series(['dog', '', bpd.NA]) >>> s.str.len() 0 3 1 0 @@ -164,6 +172,8 @@ def lower(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['lower', ... 'CAPITALS', ... 
'this is a sentence', @@ -187,6 +197,8 @@ def slice(self, start=None, stop=None): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(["koala", "dog", "chameleon"]) >>> s 0 koala @@ -238,11 +250,13 @@ def strip(self, to_strip: typing.Optional[str] = None): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([ ... '1. Ant.', ... ' 2. Bee? ', ... '\\t3. Cat!\\n', - ... pd.NA, + ... bpd.NA, ... ]) >>> s.str.strip() 0 1. Ant. @@ -279,6 +293,8 @@ def upper(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['lower', ... 'CAPITALS', ... 'this is a sentence', @@ -306,6 +322,8 @@ def isnumeric(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s1 = bpd.Series(['one', 'one1', '1', '']) >>> s1.str.isnumeric() 0 False @@ -331,6 +349,8 @@ def isalpha(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s1 = bpd.Series(['one', 'one1', '1', '']) >>> s1.str.isalpha() 0 True @@ -355,6 +375,8 @@ def isdigit(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['23', '1a', '1/5', '']) >>> s.str.isdigit() 0 True @@ -379,6 +401,8 @@ def isalnum(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s1 = bpd.Series(['one', 'one1', '1', '']) >>> s1.str.isalnum() 0 True @@ -415,6 +439,8 @@ def isspace(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([' ', '\\t\\r\\n ', '']) >>> s.str.isspace() 0 True @@ -439,6 +465,8 @@ def islower(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['leopard', 'Golden Eagle', 'SNAKE', 
'']) >>> s.str.islower() 0 True @@ -464,6 +492,8 @@ def isupper(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) >>> s.str.isupper() 0 False @@ -488,10 +518,12 @@ def isdecimal(self): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + The `isdecimal` method checks for characters used to form numbers in base 10. - >>> import bigframes.pandas as bpd >>> s = bpd.Series(['23', '³', '⅕', '']) >>> s.str.isdecimal() 0 True @@ -518,7 +550,9 @@ def rstrip(self, to_strip: typing.Optional[str] = None): **Examples:** >>> import bigframes.pandas as bpd - >>> s = bpd.Series(['Ant', ' Bee ', '\tCat\n', pd.NA]) + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['Ant', ' Bee ', '\tCat\n', bpd.NA]) >>> s.str.rstrip() 0 Ant 1 Bee @@ -549,7 +583,9 @@ def lstrip(self, to_strip: typing.Optional[str] = None): **Examples:** >>> import bigframes.pandas as bpd - >>> s = bpd.Series(['Ant', ' Bee ', '\tCat\n', pd.NA]) + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['Ant', ' Bee ', '\tCat\n', bpd.NA]) >>> s.str.lstrip() 0 Ant 1 Bee @@ -575,6 +611,8 @@ def repeat(self, repeats: int): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['a', 'b', 'c']) >>> s 0 a @@ -607,6 +645,8 @@ def capitalize(self): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(['lower', ... 'CAPITALS', ... 'this is a sentence', @@ -632,9 +672,11 @@ def cat(self, others, *, join): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + You can concatenate each string in a Series to another string. 
- >>> import bigframes.pandas as bpd >>> s = bpd.Series(['Jane', 'John']) >>> s.str.cat(" Doe") 0 Jane Doe @@ -687,9 +729,11 @@ def contains(self, pat, case: bool = True, flags: int = 0, *, regex: bool = True **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Returning a Series of booleans using only a literal pattern. - >>> import bigframes.pandas as bpd >>> s1 = bpd.Series(['Mouse', 'dog', 'house and parrot', '23', None]) >>> s1.str.contains('og') 0 False @@ -789,12 +833,14 @@ def replace( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + When *pat* is a string and *regex* is True, the given *pat* is compiled as a regex. When *repl* is a string, it replaces matching regex patterns as with `re.sub()`. NaN value(s) in the Series are left as is: - >>> import bigframes.pandas as bpd - >>> s = bpd.Series(['foo', 'fuz', pd.NA]) + >>> s = bpd.Series(['foo', 'fuz', bpd.NA]) >>> s.str.replace('f.', 'ba', regex=True) 0 bao 1 baz @@ -804,7 +850,7 @@ def replace( When *pat* is a string and *regex* is False, every *pat* is replaced with *repl* as with `str.replace()`: - >>> s = bpd.Series(['f.o', 'fuz', pd.NA]) + >>> s = bpd.Series(['f.o', 'fuz', bpd.NA]) >>> s.str.replace('f.', 'ba', regex=False) 0 bao 1 fuz @@ -850,7 +896,9 @@ def startswith( **Examples:** >>> import bigframes.pandas as bpd - >>> s = bpd.Series(['bat', 'Bear', 'caT', pd.NA]) + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['bat', 'Bear', 'caT', bpd.NA]) >>> s 0 bat 1 Bear @@ -893,7 +941,9 @@ def endswith( **Examples:** >>> import bigframes.pandas as bpd - >>> s = bpd.Series(['bat', 'bear', 'caT', pd.NA]) + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['bat', 'bear', 'caT', bpd.NA]) >>> s 0 bat 1 bear @@ -937,6 +987,9 @@ def split( **Examples:** >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( 
... [ ... "a regular sentence", @@ -978,6 +1031,8 @@ def match(self, pat: str, case: bool = True, flags: int = 0): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> ser = bpd.Series(["horse", "eagle", "donkey"]) >>> ser.str.match("e") 0 False @@ -1005,6 +1060,8 @@ def fullmatch(self, pat: str, case: bool = True, flags: int = 0): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> ser = bpd.Series(["cat", "duck", "dove"]) >>> ser.str.fullmatch(r'd.+') 0 False @@ -1035,6 +1092,8 @@ def get(self, i: int): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(["apple", "banana", "fig"]) >>> s.str.get(3) 0 l @@ -1063,6 +1122,8 @@ def pad( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(["caribou", "tiger"]) >>> s 0 caribou @@ -1109,6 +1170,8 @@ def ljust( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> ser = bpd.Series(['dog', 'bird', 'mouse']) >>> ser.str.ljust(8, fillchar='.') 0 dog..... @@ -1139,6 +1202,8 @@ def rjust( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> ser = bpd.Series(['dog', 'bird', 'mouse']) >>> ser.str.rjust(8, fillchar='.') 0 .....dog @@ -1173,7 +1238,9 @@ def zfill( **Examples:** >>> import bigframes.pandas as bpd - >>> s = bpd.Series(['-1', '1', '1000', pd.NA]) + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['-1', '1', '1000', bpd.NA]) >>> s 0 -1 1 1 @@ -1211,6 +1278,8 @@ def center( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> ser = bpd.Series(['dog', 'bird', 'mouse']) >>> ser.str.center(8, fillchar='.') 0 ..dog... 
@@ -1240,9 +1309,12 @@ def join(self, sep: str): **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> import pandas as pd + Example with a list that contains non-string elements. - >>> import bigframes.pandas as bpd >>> s = bpd.Series([['lion', 'elephant', 'zebra'], ... ['dragon'], ... ['duck', 'swan', 'fish', 'guppy']]) diff --git a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py index 105277dbf0..9c17b9632e 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py @@ -21,7 +21,6 @@ def to_datetime( utc=False, format=None, unit=None, - session=None, ) -> Union[pd.Timestamp, datetime, series.Series]: """ This function converts a scalar, array-like or Series to a datetime object. @@ -38,9 +37,11 @@ def to_datetime( **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + Converting a Scalar to datetime: - >>> import bigframes.pandas as bpd >>> scalar = 123456.789 >>> bpd.to_datetime(scalar, unit = 's') Timestamp('1970-01-02 10:17:36.789000') diff --git a/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py b/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py index 220b15f56e..9442e965fa 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py +++ b/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py @@ -55,6 +55,7 @@ def to_timedelta( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None Converting a Scalar to timedelta diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 3190c92b92..0fdca4dde1 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -61,6 +61,7 @@ def read_gbq( **Examples:** >>> 
import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None If the input is a table ID: diff --git a/third_party/bigframes_vendored/pandas/io/parquet.py b/third_party/bigframes_vendored/pandas/io/parquet.py index 7d5c108f93..aec911d2fe 100644 --- a/third_party/bigframes_vendored/pandas/io/parquet.py +++ b/third_party/bigframes_vendored/pandas/io/parquet.py @@ -27,6 +27,8 @@ def read_parquet( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> gcs_path = "gs://cloud-samples-data/bigquery/us-states/us-states.parquet" >>> df = bpd.read_parquet(path=gcs_path, engine="bigquery") diff --git a/third_party/bigframes_vendored/pandas/io/parsers/readers.py b/third_party/bigframes_vendored/pandas/io/parsers/readers.py index 9dc7b39873..4757f5ed9d 100644 --- a/third_party/bigframes_vendored/pandas/io/parsers/readers.py +++ b/third_party/bigframes_vendored/pandas/io/parsers/readers.py @@ -71,6 +71,8 @@ def read_csv( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> gcs_path = "gs://cloud-samples-data/bigquery/us-states/us-states.csv" >>> df = bpd.read_csv(filepath_or_buffer=gcs_path) >>> df.head(2) @@ -190,6 +192,8 @@ def read_json( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> gcs_path = "gs://bigframes-dev-testing/sample1.json" >>> df = bpd.read_json(path_or_buf=gcs_path, lines=True, orient="records") >>> df.head(2) diff --git a/third_party/bigframes_vendored/pandas/io/pickle.py b/third_party/bigframes_vendored/pandas/io/pickle.py index 2950cf422a..33088dc019 100644 --- a/third_party/bigframes_vendored/pandas/io/pickle.py +++ b/third_party/bigframes_vendored/pandas/io/pickle.py @@ -35,6 +35,8 @@ def read_pickle( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> gcs_path = "gs://bigframes-dev-testing/test_pickle.pkl" >>> df = 
bpd.read_pickle(filepath_or_buffer=gcs_path) diff --git a/third_party/bigframes_vendored/pandas/pandas/_typing.py b/third_party/bigframes_vendored/pandas/pandas/_typing.py index 76e984a173..e665339fc8 100644 --- a/third_party/bigframes_vendored/pandas/pandas/_typing.py +++ b/third_party/bigframes_vendored/pandas/pandas/_typing.py @@ -100,6 +100,7 @@ Scalar = Union[PythonScalar, PandasScalar, np.datetime64, np.timedelta64, datetime] IntStrT = TypeVar("IntStrT", int, str) + # timestamp and timedelta convertible types TimestampConvertibleTypes = Union[ @@ -266,6 +267,7 @@ def closed(self) -> bool: # for arbitrary kwargs passed during reading/writing files StorageOptions = Optional[Dict[str, Any]] + # compression keywords and compression CompressionDict = Dict[str, Any] CompressionOptions = Optional[ diff --git a/third_party/bigframes_vendored/pandas/plotting/_core.py b/third_party/bigframes_vendored/pandas/plotting/_core.py index a7cd2c0cc9..4ed5c8eb0b 100644 --- a/third_party/bigframes_vendored/pandas/plotting/_core.py +++ b/third_party/bigframes_vendored/pandas/plotting/_core.py @@ -8,11 +8,10 @@ class PlotAccessor: Make plots of Series or DataFrame with the `matplotlib` backend. **Examples:** - - >>> import bigframes.pandas as bpd - For Series: + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> ser = bpd.Series([1, 2, 3, 3]) >>> plot = ser.plot(kind='hist', title="My plot") @@ -58,6 +57,9 @@ def hist( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> import numpy as np + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame(np.random.randint(1, 7, 6000), columns=['one']) >>> df['two'] = np.random.randint(1, 7, 6000) + np.random.randint(1, 7, 6000) >>> ax = df.plot.hist(bins=12, alpha=0.5) @@ -94,6 +96,7 @@ def line( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame( ... { ... 
'one': [1, 2, 3, 4], @@ -161,6 +164,7 @@ def area( Draw an area plot based on basic business metrics: >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame( ... { ... 'sales': [3, 2, 3, 9, 10, 6], @@ -229,6 +233,7 @@ def bar( Basic plot. >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'lab':['A', 'B', 'C'], 'val':[10, 30, 20]}) >>> ax = df.plot.bar(x='lab', y='val', rot=0) @@ -291,6 +296,7 @@ def scatter( in a DataFrame's columns. >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame([[5.1, 3.5, 0], [4.9, 3.0, 0], [7.0, 3.2, 1], ... [6.4, 3.2, 1], [5.9, 3.0, 2]], ... columns=['length', 'width', 'species']) diff --git a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py index 44eefeddd7..a7344d49d4 100644 --- a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py +++ b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py @@ -30,6 +30,7 @@ class KMeans(_BaseKMeans): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> from bigframes.ml.cluster import KMeans >>> X = bpd.DataFrame({"feat0": [1, 1, 1, 10, 10, 10], "feat1": [2, 4, 0, 2, 4, 0]}) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index e487a2e7c1..c3c3a77b71 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -24,6 +24,7 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): >>> import bigframes.pandas as bpd >>> from bigframes.ml.decomposition import MatrixFactorization + >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({ ... "row": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6], ... 
"column": [0,1] * 7, diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py index 3535edc8f9..f13c52bfb6 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py @@ -24,6 +24,7 @@ class PCA(BaseEstimator, metaclass=ABCMeta): >>> import bigframes.pandas as bpd >>> from bigframes.ml.decomposition import PCA + >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({"feat0": [-1, -2, -3, 1, 2, 3], "feat1": [-1, -1, -2, 1, 1, 2]}) >>> pca = PCA(n_components=2).fit(X) >>> pca.predict(X) # doctest:+SKIP diff --git a/third_party/bigframes_vendored/sklearn/impute/_base.py b/third_party/bigframes_vendored/sklearn/impute/_base.py index 175ad86b21..42eab24c82 100644 --- a/third_party/bigframes_vendored/sklearn/impute/_base.py +++ b/third_party/bigframes_vendored/sklearn/impute/_base.py @@ -22,6 +22,7 @@ class SimpleImputer(_BaseImputer): >>> import bigframes.pandas as bpd >>> from bigframes.ml.impute import SimpleImputer + >>> bpd.options.display.progress_bar = None >>> X_train = bpd.DataFrame({"feat0": [7.0, 4.0, 10.0], "feat1": [2.0, None, 5.0], "feat2": [3.0, 6.0, 9.0]}) >>> imp_mean = SimpleImputer().fit(X_train) >>> X_test = bpd.DataFrame({"feat0": [None, 4.0, 10.0], "feat1": [2.0, None, None], "feat2": [3.0, 6.0, 9.0]}) diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_base.py b/third_party/bigframes_vendored/sklearn/linear_model/_base.py index 7543edd10b..21ba5a3bf8 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_base.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_base.py @@ -66,6 +66,7 @@ class LinearRegression(RegressorMixin, LinearModel): >>> from bigframes.ml.linear_model import LinearRegression >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({ \ "feature0": [20, 21, 19, 18], \ "feature1": 
[0, 1, 1, 0], \ diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py index d449a1040c..a85c6fae8d 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py @@ -25,6 +25,7 @@ class LogisticRegression(LinearClassifierMixin, BaseEstimator): >>> from bigframes.ml.linear_model import LogisticRegression >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({ \ "feature0": [20, 21, 19, 18], \ "feature1": [0, 1, 1, 0], \ diff --git a/third_party/bigframes_vendored/sklearn/metrics/_classification.py b/third_party/bigframes_vendored/sklearn/metrics/_classification.py index e60cc8cec4..fd6e8678ea 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_classification.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_classification.py @@ -30,6 +30,7 @@ def accuracy_score(y_true, y_pred, normalize=True) -> float: >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([0, 2, 1, 3]) >>> y_pred = bpd.DataFrame([0, 1, 2, 3]) @@ -79,6 +80,7 @@ def confusion_matrix( >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([2, 0, 2, 2, 0, 1]) >>> y_pred = bpd.DataFrame([0, 0, 2, 2, 0, 2]) @@ -130,6 +132,7 @@ def recall_score( >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([0, 1, 2, 0, 1, 2]) >>> y_pred = bpd.DataFrame([0, 2, 1, 0, 0, 1]) @@ -178,6 +181,7 @@ def precision_score( >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([0, 1, 2, 0, 1, 2]) >>> y_pred = bpd.DataFrame([0, 2, 1, 0, 0, 1]) @@ -228,6 
+232,7 @@ def f1_score( >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([0, 1, 2, 0, 1, 2]) >>> y_pred = bpd.DataFrame([0, 2, 1, 0, 0, 1]) diff --git a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py index cd5bd2cbcd..9262ffbd3d 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py @@ -33,6 +33,7 @@ def auc(x, y) -> float: >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None >>> x = bpd.DataFrame([1, 1, 2, 2]) >>> y = bpd.DataFrame([2, 3, 4, 5]) @@ -88,6 +89,7 @@ def roc_auc_score(y_true, y_score) -> float: >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([0, 0, 1, 1, 0, 1, 0, 1, 1, 1]) >>> y_score = bpd.DataFrame([0.1, 0.4, 0.35, 0.8, 0.65, 0.9, 0.5, 0.3, 0.6, 0.45]) @@ -137,6 +139,7 @@ def roc_curve( >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([1, 1, 2, 2]) >>> y_score = bpd.DataFrame([0.1, 0.4, 0.35, 0.8]) diff --git a/third_party/bigframes_vendored/sklearn/metrics/_regression.py b/third_party/bigframes_vendored/sklearn/metrics/_regression.py index 85f0c1ecf9..1c14e8068b 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_regression.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_regression.py @@ -46,6 +46,7 @@ def r2_score(y_true, y_pred, force_finite=True) -> float: >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([3, -0.5, 2, 7]) >>> y_pred = bpd.DataFrame([2.5, 0.0, 2, 8]) @@ -72,6 +73,7 @@ def mean_squared_error(y_true, y_pred) -> float: >>> import bigframes.pandas 
as bpd >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([3, -0.5, 2, 7]) >>> y_pred = bpd.DataFrame([2.5, 0.0, 2, 8]) @@ -98,6 +100,7 @@ def mean_absolute_error(y_true, y_pred) -> float: >>> import bigframes.pandas as bpd >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None >>> y_true = bpd.DataFrame([3, -0.5, 2, 7]) >>> y_pred = bpd.DataFrame([2.5, 0.0, 2, 8]) diff --git a/third_party/bigframes_vendored/sklearn/model_selection/_split.py b/third_party/bigframes_vendored/sklearn/model_selection/_split.py index 326589be7d..ec16fa8cf9 100644 --- a/third_party/bigframes_vendored/sklearn/model_selection/_split.py +++ b/third_party/bigframes_vendored/sklearn/model_selection/_split.py @@ -69,6 +69,7 @@ class KFold(_BaseKFold): >>> import bigframes.pandas as bpd >>> from bigframes.ml.model_selection import KFold + >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({"feat0": [1, 3, 5], "feat1": [2, 4, 6]}) >>> y = bpd.DataFrame({"label": [1, 2, 3]}) >>> kf = KFold(n_splits=3, random_state=42) @@ -161,6 +162,7 @@ def train_test_split( >>> import bigframes.pandas as bpd >>> from bigframes.ml.model_selection import train_test_split + >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({"feat0": [0, 2, 4, 6, 8], "feat1": [1, 3, 5, 7, 9]}) >>> y = bpd.DataFrame({"label": [0, 1, 2, 3, 4]}) >>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) diff --git a/third_party/bigframes_vendored/sklearn/model_selection/_validation.py b/third_party/bigframes_vendored/sklearn/model_selection/_validation.py index 6f84018853..b93c47ea04 100644 --- a/third_party/bigframes_vendored/sklearn/model_selection/_validation.py +++ b/third_party/bigframes_vendored/sklearn/model_selection/_validation.py @@ -19,6 +19,7 @@ def cross_validate(estimator, X, y=None, *, cv=None): >>> import bigframes.pandas as bpd >>> from bigframes.ml.model_selection import 
cross_validate, KFold >>> from bigframes.ml.linear_model import LinearRegression + >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({"feat0": [1, 3, 5], "feat1": [2, 4, 6]}) >>> y = bpd.DataFrame({"label": [1, 2, 3]}) >>> model = LinearRegression() diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py index 64a5786f17..5476a9fb3c 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py @@ -25,6 +25,7 @@ class OneHotEncoder(BaseEstimator): >>> from bigframes.ml.preprocessing import OneHotEncoder >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> enc = OneHotEncoder() >>> X = bpd.DataFrame({"a": ["Male", "Female", "Female"], "b": ["1", "3", "2"]}) From a97cc937ce8802778d4db4cc305ce516b60017e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 20:31:17 +0000 Subject: [PATCH 17/36] revert doctest changes --- dummy.pkl | Bin 1150 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 dummy.pkl diff --git a/dummy.pkl b/dummy.pkl deleted file mode 100644 index 76a409b1ded309cfc7b30cccd49d85a710e737bd..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1150 zcmbVMU2D`p6is$F+m9_uQPyHX!3T{XfkORwP()jV8eBodw=zt!lbx{nSTYmpf`UG@ zwlHsd?cdcqli4LgMKBM!H_4ql=bU>c-@Koq=a@@v&uB5GB8bb11xZD725RGwO8Um+ z3wZb)zJjlMB%f5E?zGF(Lb9r$nFw-P&UF%7emK7)(%AMgSEu&dnXFdB{C z{=&=LLPtUrdZ(b=1CUsxJd#r}1%A`N^i~^V0(?hxqP=#nFMsL9@0w1164RJ79LW+6 z-%^_>%#-~?i_XZxAL*eD(qha$lQ^RSN3+uw*L-0jh^WAcdq=uZQyUnvq@l`pRd0%w z$RveYM52z=dQ_*GObcx2i7bt^AfXewp{w->JNnKC{8}>|zO6|wHD8kNTM^c5T(@z< zM&P?zzlJlvcZF{ETi=l?5BMH}<1Y|Lr;X$cj=|@T)g~#}30bn_PmQG;-wfNn$!p|h zjE8T1B0d9+UahmU2&fpLJ^u Date: Wed, 8 Oct 2025 20:33:23 +0000 Subject: [PATCH 18/36] revert df docstrings --- bigframes/dataframe.py | 4 ++++ 1 
file changed, 4 insertions(+) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 0146287e15..3527b225e2 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1771,6 +1771,7 @@ def to_pandas( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col': [4, 2, 2]}) Download the data from BigQuery and convert it into an in-memory pandas DataFrame. @@ -1892,6 +1893,7 @@ def to_pandas_batches( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col': [4, 3, 2, 2, 3]}) Iterate through the results in batches, limiting the total rows yielded @@ -4250,6 +4252,8 @@ def _resample( **Examples:** >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None >>> data = { ... "timestamp_col": pd.date_range( From 922bbf45ac2f9b26409c425f0b8446579422eb11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 20:57:53 +0000 Subject: [PATCH 19/36] add polars series unit tests --- bigframes/testing/polars_session.py | 11 +- tests/unit/test_series_polars.py | 4897 +++++++++++++++++++++++++++ 2 files changed, 4906 insertions(+), 2 deletions(-) create mode 100644 tests/unit/test_series_polars.py diff --git a/bigframes/testing/polars_session.py b/bigframes/testing/polars_session.py index 29eae20b7a..4d3e6862b9 100644 --- a/bigframes/testing/polars_session.py +++ b/bigframes/testing/polars_session.py @@ -95,10 +95,17 @@ def __init__(self): def read_pandas(self, pandas_dataframe, write_engine="default"): # override read_pandas to always keep data local-only - if isinstance(pandas_dataframe, pandas.Series): + if isinstance(pandas_dataframe, (pandas.Series, pandas.Index)): pandas_dataframe = pandas_dataframe.to_frame() local_block = bigframes.core.blocks.Block.from_local(pandas_dataframe, self) - return bigframes.dataframe.DataFrame(local_block) + bf_df = 
bigframes.dataframe.DataFrame(local_block) + if isinstance(pandas_dataframe, pandas.Series): + series = bf_df[bf_df.columns[0]] + series.name = pandas_dataframe.name + return series + if isinstance(pandas_dataframe, pandas.Index): + return bf_df.index + return bf_df @property def bqclient(self): diff --git a/tests/unit/test_series_polars.py b/tests/unit/test_series_polars.py new file mode 100644 index 0000000000..8c24a28f43 --- /dev/null +++ b/tests/unit/test_series_polars.py @@ -0,0 +1,4897 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import datetime as dt +import json +import math +import pathlib +import re +import tempfile +from typing import Generator + +import db_dtypes # type: ignore +import geopandas as gpd # type: ignore +import google.api_core.exceptions +import numpy +from packaging.version import Version +import pandas as pd +import pyarrow as pa # type: ignore +import pytest +import shapely.geometry # type: ignore + +import bigframes +import bigframes.dtypes as dtypes +import bigframes.features +import bigframes.pandas +import bigframes.pandas as bpd +import bigframes.series as series +from bigframes.testing.utils import ( + assert_pandas_df_equal, + assert_series_equal, + convert_pandas_dtypes, + get_first_file_from_wildcard, +) + +pytest.importorskip("polars") +pytest.importorskip("pandas", minversion="2.0.0") + +CURRENT_DIR = pathlib.Path(__file__).parent +DATA_DIR = CURRENT_DIR.parent / "data" + + +@pytest.fixture(scope="module", autouse=True) +def session() -> Generator[bigframes.Session, None, None]: + import bigframes.core.global_session + from bigframes.testing import polars_session + + session = polars_session.TestSession() + with bigframes.core.global_session._GlobalSessionContext(session): + yield session + + +@pytest.fixture(scope="module") +def scalars_pandas_df_index() -> pd.DataFrame: + """pd.DataFrame pointing at test data.""" + + df = pd.read_json( + DATA_DIR / "scalars.jsonl", + lines=True, + ) + convert_pandas_dtypes(df, bytes_col=True) + + df = df.set_index("rowindex", drop=False) + df.index.name = None + return df.set_index("rowindex").sort_index() + + +@pytest.fixture(scope="module") +def scalars_df_index( + session: bigframes.Session, scalars_pandas_df_index +) -> bpd.DataFrame: + return session.read_pandas(scalars_pandas_df_index) + + +@pytest.fixture(scope="module") +def scalars_df_2_index( + session: bigframes.Session, scalars_pandas_df_index +) -> bpd.DataFrame: + return session.read_pandas(scalars_pandas_df_index) + + +@pytest.fixture(scope="module") 
+def scalars_dfs( + scalars_df_index, + scalars_pandas_df_index, +): + return scalars_df_index, scalars_pandas_df_index + + +def test_series_construct_copy(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = series.Series( + scalars_df["int64_col"], name="test_series", dtype="Float64" + ).to_pandas() + pd_result = pd.Series( + scalars_pandas_df["int64_col"], name="test_series", dtype="Float64" + ) + pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_series_construct_nullable_ints(): + bf_result = series.Series( + [1, 3, bigframes.pandas.NA], index=[0, 4, bigframes.pandas.NA] + ).to_pandas() + + # TODO(b/340885567): fix type error + expected_index = pd.Index( # type: ignore + [0, 4, None], + dtype=pd.Int64Dtype(), + ) + expected = pd.Series([1, 3, pd.NA], dtype=pd.Int64Dtype(), index=expected_index) + + pd.testing.assert_series_equal(bf_result, expected) + + +def test_series_construct_timestamps(): + datetimes = [ + dt.datetime(2020, 1, 20, 20, 20, 20, 20), + dt.datetime(2019, 1, 20, 20, 20, 20, 20), + None, + ] + bf_result = series.Series(datetimes).to_pandas() + pd_result = pd.Series(datetimes, dtype=pd.ArrowDtype(pa.timestamp("us"))) + + pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + +def test_series_construct_copy_with_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = series.Series( + scalars_df["int64_col"], + name="test_series", + dtype="Float64", + index=scalars_df["int64_too"], + ).to_pandas() + pd_result = pd.Series( + scalars_pandas_df["int64_col"], + name="test_series", + dtype="Float64", + index=scalars_pandas_df["int64_too"], + ) + pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_series_construct_copy_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = series.Series( + scalars_df.index, + name="test_series", + dtype="Float64", + index=scalars_df["int64_too"], + ).to_pandas() + pd_result = pd.Series( + 
scalars_pandas_df.index, + name="test_series", + dtype="Float64", + index=scalars_pandas_df["int64_too"], + ) + pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_series_construct_pandas(scalars_dfs): + _, scalars_pandas_df = scalars_dfs + bf_result = series.Series( + scalars_pandas_df["int64_col"], name="test_series", dtype="Float64" + ) + pd_result = pd.Series( + scalars_pandas_df["int64_col"], name="test_series", dtype="Float64" + ) + assert bf_result.shape == pd_result.shape + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + + +def test_series_construct_from_list(): + bf_result = series.Series([1, 1, 2, 3, 5, 8, 13], dtype="Int64").to_pandas() + pd_result = pd.Series([1, 1, 2, 3, 5, 8, 13], dtype="Int64") + + # BigQuery DataFrame default indices use nullable Int64 always + pd_result.index = pd_result.index.astype("Int64") + + pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_series_construct_reindex(): + bf_result = series.Series( + series.Series({1: 10, 2: 30, 3: 30}), index=[3, 2], dtype="Int64" + ).to_pandas() + pd_result = pd.Series(pd.Series({1: 10, 2: 30, 3: 30}), index=[3, 2], dtype="Int64") + + # BigQuery DataFrame default indices use nullable Int64 always + pd_result.index = pd_result.index.astype("Int64") + pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_series_construct_from_list_w_index(): + bf_result = series.Series( + [1, 1, 2, 3, 5, 8, 13], index=[10, 20, 30, 40, 50, 60, 70], dtype="Int64" + ).to_pandas() + pd_result = pd.Series( + [1, 1, 2, 3, 5, 8, 13], index=[10, 20, 30, 40, 50, 60, 70], dtype="Int64" + ) + + # BigQuery DataFrame default indices use nullable Int64 always + pd_result.index = pd_result.index.astype("Int64") + + pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_series_construct_empty(session: bigframes.Session): + bf_series: series.Series = series.Series(session=session) + pd_series: pd.Series = pd.Series() + + bf_result = bf_series.empty + 
pd_result = pd_series.empty + + assert pd_result + assert bf_result == pd_result + + +def test_series_construct_scalar_no_index(): + bf_result = series.Series("hello world", dtype="string[pyarrow]").to_pandas() + pd_result = pd.Series("hello world", dtype="string[pyarrow]") + + # BigQuery DataFrame default indices use nullable Int64 always + pd_result.index = pd_result.index.astype("Int64") + + pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_series_construct_scalar_w_index(): + bf_result = series.Series( + "hello world", dtype="string[pyarrow]", index=[0, 2, 1] + ).to_pandas() + pd_result = pd.Series("hello world", dtype="string[pyarrow]", index=[0, 2, 1]) + + # BigQuery DataFrame default indices use nullable Int64 always + pd_result.index = pd_result.index.astype("Int64") + + pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_series_construct_nan(): + bf_result = series.Series(numpy.nan).to_pandas() + pd_result = pd.Series(numpy.nan) + + pd_result.index = pd_result.index.astype("Int64") + pd_result = pd_result.astype("Float64") + + pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_series_construct_scalar_w_bf_index(): + bf_result = series.Series( + "hello", index=bigframes.pandas.Index([1, 2, 3]) + ).to_pandas() + pd_result = pd.Series("hello", index=pd.Index([1, 2, 3], dtype="Int64")) + + pd_result = pd_result.astype("string[pyarrow]") + + pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_series_construct_from_list_escaped_strings(): + """Check that special characters are supported.""" + strings = [ + "string\nwith\nnewline", + "string\twith\ttabs", + "string\\with\\backslashes", + ] + bf_result = series.Series(strings, name="test_series", dtype="string[pyarrow]") + pd_result = pd.Series(strings, name="test_series", dtype="string[pyarrow]") + + # BigQuery DataFrame default indices use nullable Int64 always + pd_result.index = pd_result.index.astype("Int64") + + 
pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + + +def test_series_construct_geodata(): + pd_series = pd.Series( + [ + shapely.geometry.Point(1, 1), + shapely.geometry.Point(2, 2), + shapely.geometry.Point(3, 3), + ], + dtype=gpd.array.GeometryDtype(), + ) + + series = bigframes.pandas.Series(pd_series) + + pd.testing.assert_series_equal( + pd_series, series.to_pandas(), check_index_type=False + ) + + +@pytest.mark.parametrize( + ("dtype"), + [ + pytest.param(pd.Int64Dtype(), id="int"), + pytest.param(pd.Float64Dtype(), id="float"), + pytest.param(pd.StringDtype(storage="pyarrow"), id="string"), + ], +) +def test_series_construct_w_dtype(dtype): + data = [1, 2, 3] + expected = pd.Series(data, dtype=dtype) + expected.index = expected.index.astype("Int64") + series = bigframes.pandas.Series(data, dtype=dtype) + pd.testing.assert_series_equal(series.to_pandas(), expected) + + +def test_series_construct_w_dtype_for_struct(): + # The data shows the struct fields are disordered and correctly handled during + # construction. + data = [ + {"a": 1, "c": "pandas", "b": dt.datetime(2020, 1, 20, 20, 20, 20, 20)}, + {"a": 2, "c": "pandas", "b": dt.datetime(2019, 1, 20, 20, 20, 20, 20)}, + {"a": 1, "c": "numpy", "b": None}, + ] + dtype = pd.ArrowDtype( + pa.struct([("a", pa.int64()), ("c", pa.string()), ("b", pa.timestamp("us"))]) + ) + series = bigframes.pandas.Series(data, dtype=dtype) + expected = pd.Series(data, dtype=dtype) + expected.index = expected.index.astype("Int64") + pd.testing.assert_series_equal(series.to_pandas(), expected) + + +def test_series_construct_w_dtype_for_array_string(): + data = [["1", "2", "3"], [], ["4", "5"]] + dtype = pd.ArrowDtype(pa.list_(pa.string())) + series = bigframes.pandas.Series(data, dtype=dtype) + expected = pd.Series(data, dtype=dtype) + expected.index = expected.index.astype("Int64") + + # Skip dtype check due to internal issue b/321013333. 
This issue causes array types + # to be converted to the `object` dtype when calling `to_pandas()`, resulting in + # a mismatch with the expected Pandas type. + if bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable: + check_dtype = True + else: + check_dtype = False + + pd.testing.assert_series_equal( + series.to_pandas(), expected, check_dtype=check_dtype + ) + + +def test_series_construct_w_dtype_for_array_struct(): + data = [[{"a": 1, "c": "aa"}, {"a": 2, "c": "bb"}], [], [{"a": 3, "c": "cc"}]] + dtype = pd.ArrowDtype(pa.list_(pa.struct([("a", pa.int64()), ("c", pa.string())]))) + series = bigframes.pandas.Series(data, dtype=dtype) + expected = pd.Series(data, dtype=dtype) + expected.index = expected.index.astype("Int64") + + # Skip dtype check due to internal issue b/321013333. This issue causes array types + # to be converted to the `object` dtype when calling `to_pandas()`, resulting in + # a mismatch with the expected Pandas type. + if bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable: + check_dtype = True + else: + check_dtype = False + + pd.testing.assert_series_equal( + series.to_pandas(), expected, check_dtype=check_dtype + ) + + +def test_series_construct_local_unordered_has_sequential_index(unordered_session): + series = bigframes.pandas.Series( + ["Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"], session=unordered_session + ) + expected: pd.Index = pd.Index([0, 1, 2, 3, 4, 5, 6], dtype=pd.Int64Dtype()) + pd.testing.assert_index_equal(series.index.to_pandas(), expected) + + +@pytest.mark.parametrize( + ("json_type"), + [ + pytest.param(dtypes.JSON_DTYPE), + pytest.param("json"), + ], +) +def test_series_construct_w_json_dtype(json_type): + data = [ + "1", + '"str"', + "false", + '["a", {"b": 1}, null]', + None, + '{"a": {"b": [1, 2, 3], "c": true}}', + ] + s = bigframes.pandas.Series(data, dtype=json_type) + + assert s.dtype == dtypes.JSON_DTYPE + assert s[0] == "1" + assert s[1] == '"str"' + assert s[2] == "false" + assert 
s[3] == '["a",{"b":1},null]' + assert pd.isna(s[4]) + assert s[5] == '{"a":{"b":[1,2,3],"c":true}}' + + +def test_series_keys(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["int64_col"].keys().to_pandas() + pd_result = scalars_pandas_df["int64_col"].keys() + pd.testing.assert_index_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ["data", "index"], + [ + (["a", "b", "c"], None), + ([1, 2, 3], ["a", "b", "c"]), + ([1, 2, None], ["a", "b", "c"]), + ([1, 2, 3], [pd.NA, "b", "c"]), + ([numpy.nan, 2, 3], ["a", "b", "c"]), + ], +) +def test_series_items(data, index): + bf_series = series.Series(data, index=index) + pd_series = pd.Series(data, index=index) + + for (bf_index, bf_value), (pd_index, pd_value) in zip( + bf_series.items(), pd_series.items() + ): + # TODO(jialuo): Remove the if conditions after b/373699458 is addressed. + if not pd.isna(bf_index) or not pd.isna(pd_index): + assert bf_index == pd_index + if not pd.isna(bf_value) or not pd.isna(pd_value): + assert bf_value == pd_value + + +@pytest.mark.parametrize( + ["col_name", "expected_dtype"], + [ + ("bool_col", pd.BooleanDtype()), + # TODO(swast): Use a more efficient type. + ("bytes_col", pd.ArrowDtype(pa.binary())), + ("date_col", pd.ArrowDtype(pa.date32())), + ("datetime_col", pd.ArrowDtype(pa.timestamp("us"))), + ("float64_col", pd.Float64Dtype()), + ("geography_col", gpd.array.GeometryDtype()), + ("int64_col", pd.Int64Dtype()), + # TODO(swast): Use a more efficient type. 
+ ("numeric_col", pd.ArrowDtype(pa.decimal128(38, 9))), + ("int64_too", pd.Int64Dtype()), + ("string_col", pd.StringDtype(storage="pyarrow")), + ("time_col", pd.ArrowDtype(pa.time64("us"))), + ("timestamp_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))), + ], +) +def test_get_column(scalars_dfs, col_name, expected_dtype): + scalars_df, scalars_pandas_df = scalars_dfs + series = scalars_df[col_name] + series_pandas = series.to_pandas() + assert series_pandas.dtype == expected_dtype + assert series_pandas.shape[0] == scalars_pandas_df.shape[0] + + +def test_get_column_w_json(json_df, json_pandas_df): + series = json_df["json_col"] + series_pandas = series.to_pandas() + assert series.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType()) + assert series_pandas.shape[0] == json_pandas_df.shape[0] + + +def test_series_get_column_default(scalars_dfs): + scalars_df, _ = scalars_dfs + result = scalars_df.get(123123123123123, "default_val") + assert result == "default_val" + + +@pytest.mark.parametrize( + ("key",), + [ + ("hello",), + (2,), + ("int64_col",), + (None,), + ], +) +def test_series_contains(scalars_df_index, scalars_pandas_df_index, key): + bf_result = key in scalars_df_index["int64_col"] + pd_result = key in scalars_pandas_df_index["int64_col"] + + assert bf_result == pd_result + + +def test_series_equals_identical(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.int64_col.equals(scalars_df_index.int64_col) + pd_result = scalars_pandas_df_index.int64_col.equals( + scalars_pandas_df_index.int64_col + ) + + assert pd_result == bf_result + + +def test_series_equals_df(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index["int64_col"].equals(scalars_df_index[["int64_col"]]) + pd_result = scalars_pandas_df_index["int64_col"].equals( + scalars_pandas_df_index[["int64_col"]] + ) + + assert pd_result == bf_result + + +def test_series_equals_different_dtype(scalars_df_index, scalars_pandas_df_index): + bf_series = 
scalars_df_index["int64_col"] + pd_series = scalars_pandas_df_index["int64_col"] + + bf_result = bf_series.equals(bf_series.astype("Float64")) + pd_result = pd_series.equals(pd_series.astype("Float64")) + + assert pd_result == bf_result + + +def test_series_equals_different_values(scalars_df_index, scalars_pandas_df_index): + bf_series = scalars_df_index["int64_col"] + pd_series = scalars_pandas_df_index["int64_col"] + + bf_result = bf_series.equals(bf_series + 1) + pd_result = pd_series.equals(pd_series + 1) + + assert pd_result == bf_result + + +def test_series_get_with_default_index(scalars_dfs): + col_name = "float64_col" + key = 2 + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[col_name].get(key) + pd_result = scalars_pandas_df[col_name].get(key) + assert bf_result == pd_result + + +@pytest.mark.parametrize( + ("index_col", "key"), + ( + ("int64_too", 2), + ("string_col", "Hello, World!"), + ("int64_too", slice(2, 6)), + ), +) +def test_series___getitem__(scalars_dfs, index_col, key): + col_name = "float64_col" + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.set_index(index_col, drop=False) + scalars_pandas_df = scalars_pandas_df.set_index(index_col, drop=False) + bf_result = scalars_df[col_name][key] + pd_result = scalars_pandas_df[col_name][key] + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + + +@pytest.mark.parametrize( + ("key",), + ( + (-2,), + (-1,), + (0,), + (1,), + ), +) +def test_series___getitem___with_int_key(scalars_dfs, key): + col_name = "int64_too" + index_col = "string_col" + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.set_index(index_col, drop=False) + scalars_pandas_df = scalars_pandas_df.set_index(index_col, drop=False) + bf_result = scalars_df[col_name][key] + pd_result = scalars_pandas_df[col_name][key] + assert bf_result == pd_result + + +def test_series___getitem___with_default_index(scalars_dfs): + col_name = "float64_col" + key = 2 + 
scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[col_name][key] + pd_result = scalars_pandas_df[col_name][key] + assert bf_result == pd_result + + +@pytest.mark.parametrize( + ("index_col", "key", "value"), + ( + ("int64_too", 2, "new_string_value"), + ("string_col", "Hello, World!", "updated_value"), + ("int64_too", 0, None), + ), +) +def test_series___setitem__(scalars_dfs, index_col, key, value): + col_name = "string_col" + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.set_index(index_col, drop=False) + scalars_pandas_df = scalars_pandas_df.set_index(index_col, drop=False) + + bf_series = scalars_df[col_name] + pd_series = scalars_pandas_df[col_name].copy() + + bf_series[key] = value + pd_series[key] = value + + pd.testing.assert_series_equal(bf_series.to_pandas(), pd_series) + + +@pytest.mark.parametrize( + ("key", "value"), + ( + (0, 999), + (1, 888), + (0, None), + (-2345, 777), + ), +) +def test_series___setitem___with_int_key_numeric(scalars_dfs, key, value): + col_name = "int64_col" + index_col = "int64_too" + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.set_index(index_col, drop=False) + scalars_pandas_df = scalars_pandas_df.set_index(index_col, drop=False) + + bf_series = scalars_df[col_name] + pd_series = scalars_pandas_df[col_name].copy() + + bf_series[key] = value + pd_series[key] = value + + pd.testing.assert_series_equal(bf_series.to_pandas(), pd_series) + + +def test_series___setitem___with_default_index(scalars_dfs): + col_name = "float64_col" + key = 2 + value = 123.456 + scalars_df, scalars_pandas_df = scalars_dfs + + bf_series = scalars_df[col_name] + pd_series = scalars_pandas_df[col_name].copy() + + bf_series[key] = value + pd_series[key] = value + + assert bf_series.to_pandas().iloc[key] == pd_series.iloc[key] + + +@pytest.mark.parametrize( + ("col_name",), + ( + ("float64_col",), + ("int64_too",), + ), +) +def test_abs(scalars_dfs, col_name): + scalars_df, 
scalars_pandas_df = scalars_dfs + bf_result = scalars_df[col_name].abs().to_pandas() + pd_result = scalars_pandas_df[col_name].abs() + + assert_series_equal(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("col_name",), + ( + ("float64_col",), + ("int64_too",), + ), +) +def test_series_pos(scalars_dfs, col_name): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = (+scalars_df[col_name]).to_pandas() + pd_result = +scalars_pandas_df[col_name] + + assert_series_equal(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("col_name",), + ( + ("float64_col",), + ("int64_too",), + ), +) +def test_series_neg(scalars_dfs, col_name): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = (-scalars_df[col_name]).to_pandas() + pd_result = -scalars_pandas_df[col_name] + + assert_series_equal(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("col_name",), + ( + ("bool_col",), + ("int64_col",), + ), +) +def test_series_invert(scalars_dfs, col_name): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = (~scalars_df[col_name]).to_pandas() + pd_result = ~scalars_pandas_df[col_name] + + assert_series_equal(pd_result, bf_result) + + +def test_fillna(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_result = scalars_df[col_name].fillna("Missing").to_pandas() + pd_result = scalars_pandas_df[col_name].fillna("Missing") + assert_series_equal( + pd_result, + bf_result, + ) + + +def test_series_replace_scalar_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_result = ( + scalars_df[col_name].replace("Hello, World!", "Howdy, Planet!").to_pandas() + ) + pd_result = scalars_pandas_df[col_name].replace("Hello, World!", "Howdy, Planet!") + + pd.testing.assert_series_equal( + pd_result, + bf_result, + ) + + +def test_series_replace_list_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_result = ( + scalars_df[col_name] + 
.replace(["Hello, World!", "T"], "Howdy, Planet!") + .to_pandas() + ) + pd_result = scalars_pandas_df[col_name].replace( + ["Hello, World!", "T"], "Howdy, Planet!" + ) + + pd.testing.assert_series_equal( + pd_result, + bf_result, + ) + + +@pytest.mark.parametrize( + ("replacement_dict",), + (({},),), + ids=[ + "empty", + ], +) +def test_series_replace_dict(scalars_dfs, replacement_dict): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_result = scalars_df[col_name].replace(replacement_dict).to_pandas() + pd_result = scalars_pandas_df[col_name].replace(replacement_dict) + + pd.testing.assert_series_equal( + pd_result, + bf_result, + ) + + +@pytest.mark.parametrize( + ("method",), + ( + ("linear",), + ("values",), + ("slinear",), + ("nearest",), + ("zero",), + ("pad",), + ), +) +def test_series_interpolate(method): + pytest.importorskip("scipy") + + values = [None, 1, 2, None, None, 16, None] + index = [-3.2, 11.4, 3.56, 4, 4.32, 5.55, 76.8] + pd_series = pd.Series(values, index) + bf_series = series.Series(pd_series) + + # Pandas can only interpolate on "float64" columns + # https://github.com/pandas-dev/pandas/issues/40252 + pd_result = pd_series.astype("float64").interpolate(method=method) + bf_result = bf_series.interpolate(method=method).to_pandas() + + # pd uses non-null types, while bf uses nullable types + pd.testing.assert_series_equal( + pd_result, + bf_result, + check_index_type=False, + check_dtype=False, + ) + + +@pytest.mark.parametrize( + ("ignore_index",), + ( + (True,), + (False,), + ), +) +def test_series_dropna(scalars_dfs, ignore_index): + if pd.__version__.startswith("1."): + pytest.skip("ignore_index parameter not supported in pandas 1.x.") + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_result = scalars_df[col_name].dropna(ignore_index=ignore_index).to_pandas() + pd_result = scalars_pandas_df[col_name].dropna(ignore_index=ignore_index) + pd.testing.assert_series_equal(pd_result, 
bf_result, check_index_type=False) + + +@pytest.mark.parametrize( + ("agg",), + ( + ("sum",), + ("size",), + ), +) +def test_series_agg_single_string(scalars_dfs, agg): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["int64_col"].agg(agg) + pd_result = scalars_pandas_df["int64_col"].agg(agg) + assert math.isclose(pd_result, bf_result) + + +def test_series_agg_multi_string(scalars_dfs): + aggregations = [ + "sum", + "mean", + "std", + "var", + "min", + "max", + "nunique", + "count", + "size", + ] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["int64_col"].agg(aggregations).to_pandas() + pd_result = scalars_pandas_df["int64_col"].agg(aggregations) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + + +@pytest.mark.parametrize( + ("col_name",), + ( + ("string_col",), + ("int64_col",), + ), +) +def test_max(scalars_dfs, col_name): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[col_name].max() + pd_result = scalars_pandas_df[col_name].max() + assert pd_result == bf_result + + +@pytest.mark.parametrize( + ("col_name",), + ( + ("string_col",), + ("int64_col",), + ), +) +def test_min(scalars_dfs, col_name): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[col_name].min() + pd_result = scalars_pandas_df[col_name].min() + assert pd_result == bf_result + + +@pytest.mark.parametrize( + ("col_name",), + ( + ("float64_col",), + ("int64_col",), + ), +) +def test_std(scalars_dfs, col_name): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[col_name].std() + pd_result = scalars_pandas_df[col_name].std() + assert math.isclose(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("col_name",), + ( + ("float64_col",), + ("int64_col",), + ), +) +def test_kurt(scalars_dfs, col_name): + scalars_df, scalars_pandas_df 
= scalars_dfs + bf_result = scalars_df[col_name].kurt() + pd_result = scalars_pandas_df[col_name].kurt() + assert math.isclose(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("col_name",), + ( + ("float64_col",), + ("int64_col",), + ), +) +def test_skew(scalars_dfs, col_name): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[col_name].skew() + pd_result = scalars_pandas_df[col_name].skew() + assert math.isclose(pd_result, bf_result) + + +def test_skew_undefined(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["int64_col"].iloc[:2].skew() + pd_result = scalars_pandas_df["int64_col"].iloc[:2].skew() + # both should be pd.NA + assert pd_result is bf_result + + +def test_kurt_undefined(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["int64_col"].iloc[:3].kurt() + pd_result = scalars_pandas_df["int64_col"].iloc[:3].kurt() + # both should be pd.NA + assert pd_result is bf_result + + +@pytest.mark.parametrize( + ("col_name",), + ( + ("float64_col",), + ("int64_col",), + ), +) +def test_var(scalars_dfs, col_name): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[col_name].var() + pd_result = scalars_pandas_df[col_name].var() + assert math.isclose(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("col_name",), + ( + ("bool_col",), + ("int64_col",), + ), +) +def test_mode_stat(scalars_df_index, scalars_pandas_df_index, col_name): + bf_result = scalars_df_index[col_name].mode().to_pandas() + pd_result = scalars_pandas_df_index[col_name].mode() + + ## Mode implicitly resets index, and bigframes default indices use nullable Int64 + pd_result.index = pd_result.index.astype("Int64") + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("operator"), + [ + (lambda x, y: x + y), + (lambda x, y: x - y), + (lambda x, y: x * y), + (lambda x, y: x / y), + (lambda x, y: x // y), + (lambda x, y: x < y), + (lambda 
x, y: x > y), + (lambda x, y: x <= y), + (lambda x, y: x >= y), + ], + ids=[ + "add", + "subtract", + "multiply", + "divide", + "floordivide", + "less_than", + "greater_than", + "less_than_equal", + "greater_than_equal", + ], +) +@pytest.mark.parametrize( + ("other_scalar"), + [ + -1, + 0, + 14, + # TODO(tswast): Support pd.NA, + ], +) +@pytest.mark.parametrize(("reverse_operands"), [True, False]) +def test_series_int_int_operators_scalar( + scalars_dfs, operator, other_scalar, reverse_operands +): + scalars_df, scalars_pandas_df = scalars_dfs + + maybe_reversed_op = (lambda x, y: operator(y, x)) if reverse_operands else operator + + bf_result = maybe_reversed_op(scalars_df["int64_col"], other_scalar).to_pandas() + pd_result = maybe_reversed_op(scalars_pandas_df["int64_col"], other_scalar) + + assert_series_equal(pd_result, bf_result) + + +def test_series_pow_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = (scalars_df["int64_col"] ** 2).to_pandas() + pd_result = scalars_pandas_df["int64_col"] ** 2 + + assert_series_equal(pd_result, bf_result) + + +def test_series_pow_scalar_reverse(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = (0.8 ** scalars_df["int64_col"]).to_pandas() + pd_result = 0.8 ** scalars_pandas_df["int64_col"] + + assert_series_equal(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("operator"), + [ + (lambda x, y: x & y), + (lambda x, y: x | y), + (lambda x, y: x ^ y), + ], + ids=[ + "and", + "or", + "xor", + ], +) +@pytest.mark.parametrize(("other_scalar"), [True, False, pd.NA]) +@pytest.mark.parametrize(("reverse_operands"), [True, False]) +def test_series_bool_bool_operators_scalar( + scalars_dfs, operator, other_scalar, reverse_operands +): + scalars_df, scalars_pandas_df = scalars_dfs + + maybe_reversed_op = (lambda x, y: operator(y, x)) if reverse_operands else operator + + bf_result = maybe_reversed_op(scalars_df["bool_col"], other_scalar).to_pandas() + pd_result = 
maybe_reversed_op(scalars_pandas_df["bool_col"], other_scalar) + + assert_series_equal(pd_result.astype(pd.BooleanDtype()), bf_result) + + +@pytest.mark.parametrize( + ("operator"), + [ + (lambda x, y: x + y), + (lambda x, y: x - y), + (lambda x, y: x * y), + (lambda x, y: x / y), + (lambda x, y: x < y), + (lambda x, y: x > y), + (lambda x, y: x <= y), + (lambda x, y: x >= y), + (lambda x, y: x % y), + (lambda x, y: x // y), + (lambda x, y: x & y), + (lambda x, y: x | y), + (lambda x, y: x ^ y), + ], + ids=[ + "add", + "subtract", + "multiply", + "divide", + "less_than", + "greater_than", + "less_than_equal", + "greater_than_equal", + "modulo", + "floordivide", + "bitwise_and", + "bitwise_or", + "bitwise_xor", + ], +) +def test_series_int_int_operators_series(scalars_dfs, operator): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = operator(scalars_df["int64_col"], scalars_df["int64_too"]).to_pandas() + pd_result = operator(scalars_pandas_df["int64_col"], scalars_pandas_df["int64_too"]) + assert_series_equal(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("col_x",), + [ + ("int64_col",), + ("int64_too",), + ("float64_col",), + ], +) +@pytest.mark.parametrize( + ("col_y",), + [ + ("int64_col",), + ("int64_too",), + ("float64_col",), + ], +) +@pytest.mark.parametrize( + ("method",), + [ + ("mod",), + ("rmod",), + ], +) +def test_mods(scalars_dfs, col_x, col_y, method): + scalars_df, scalars_pandas_df = scalars_dfs + x_bf = scalars_df[col_x] + y_bf = scalars_df[col_y] + bf_series = getattr(x_bf, method)(y_bf) + # BigQuery's mod functions return [BIG]NUMERIC values unless both arguments are integers. 
+ # https://cloud.google.com/bigquery/docs/reference/standard-sql/mathematical_functions#mod + if x_bf.dtype == pd.Int64Dtype() and y_bf.dtype == pd.Int64Dtype(): + bf_result = bf_series.to_pandas() + else: + bf_result = bf_series.astype("Float64").to_pandas() + pd_result = getattr(scalars_pandas_df[col_x], method)(scalars_pandas_df[col_y]) + pd.testing.assert_series_equal(pd_result, bf_result) + + +# We work around a pandas bug that doesn't handle correlating nullable dtypes by doing this +# manually with dumb self-correlation instead of parameterized as test_mods is above. +def test_series_corr(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["int64_too"].corr(scalars_df["int64_too"]) + pd_result = ( + scalars_pandas_df["int64_too"] + .astype("int64") + .corr(scalars_pandas_df["int64_too"].astype("int64")) + ) + assert math.isclose(pd_result, bf_result) + + +def test_series_autocorr(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["float64_col"].autocorr(2) + pd_result = scalars_pandas_df["float64_col"].autocorr(2) + assert math.isclose(pd_result, bf_result) + + +def test_series_cov(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["int64_too"].cov(scalars_df["int64_too"]) + pd_result = ( + scalars_pandas_df["int64_too"] + .astype("int64") + .cov(scalars_pandas_df["int64_too"].astype("int64")) + ) + assert math.isclose(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("col_x",), + [ + ("int64_col",), + ("float64_col",), + ], +) +@pytest.mark.parametrize( + ("col_y",), + [ + ("int64_col",), + ("float64_col",), + ], +) +@pytest.mark.parametrize( + ("method",), + [ + ("divmod",), + ("rdivmod",), + ], +) +def test_divmods_series(scalars_dfs, col_x, col_y, method): + scalars_df, scalars_pandas_df = scalars_dfs + bf_div_result, 
bf_mod_result = getattr(scalars_df[col_x], method)(scalars_df[col_y]) + pd_div_result, pd_mod_result = getattr(scalars_pandas_df[col_x], method)( + scalars_pandas_df[col_y] + ) + # BigQuery's mod functions return NUMERIC values for non-INT64 inputs. + if bf_div_result.dtype == pd.Int64Dtype(): + pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) + else: + pd.testing.assert_series_equal( + pd_div_result, bf_div_result.astype("Float64").to_pandas() + ) + + if bf_mod_result.dtype == pd.Int64Dtype(): + pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) + else: + pd.testing.assert_series_equal( + pd_mod_result, bf_mod_result.astype("Float64").to_pandas() + ) + + +@pytest.mark.parametrize( + ("col_x",), + [ + ("int64_col",), + ("float64_col",), + ], +) +@pytest.mark.parametrize( + ("other",), + [ + (-1000,), + (678,), + ], +) +@pytest.mark.parametrize( + ("method",), + [ + ("divmod",), + ("rdivmod",), + ], +) +def test_divmods_scalars(scalars_dfs, col_x, other, method): + scalars_df, scalars_pandas_df = scalars_dfs + bf_div_result, bf_mod_result = getattr(scalars_df[col_x], method)(other) + pd_div_result, pd_mod_result = getattr(scalars_pandas_df[col_x], method)(other) + # BigQuery's mod functions return NUMERIC values for non-INT64 inputs. 
+ if bf_div_result.dtype == pd.Int64Dtype(): + pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) + else: + pd.testing.assert_series_equal( + pd_div_result, bf_div_result.astype("Float64").to_pandas() + ) + + if bf_mod_result.dtype == pd.Int64Dtype(): + pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) + else: + pd.testing.assert_series_equal( + pd_mod_result, bf_mod_result.astype("Float64").to_pandas() + ) + + +@pytest.mark.parametrize( + ("other",), + [ + (3,), + (-6.2,), + ], +) +def test_series_add_scalar(scalars_dfs, other): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = (scalars_df["float64_col"] + other).to_pandas() + pd_result = scalars_pandas_df["float64_col"] + other + + assert_series_equal(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("left_col", "right_col"), + [ + ("float64_col", "float64_col"), + ("int64_col", "float64_col"), + ("int64_col", "int64_too"), + ], +) +def test_series_add_bigframes_series(scalars_dfs, left_col, right_col): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = (scalars_df[left_col] + scalars_df[right_col]).to_pandas() + pd_result = scalars_pandas_df[left_col] + scalars_pandas_df[right_col] + + assert_series_equal(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("left_col", "right_col", "righter_col"), + [ + ("float64_col", "float64_col", "float64_col"), + ("int64_col", "int64_col", "int64_col"), + ], +) +def test_series_add_bigframes_series_nested( + scalars_dfs, left_col, right_col, righter_col +): + """Test that we can correctly add multiple times.""" + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = ( + (scalars_df[left_col] + scalars_df[right_col]) + scalars_df[righter_col] + ).to_pandas() + pd_result = ( + scalars_pandas_df[left_col] + scalars_pandas_df[right_col] + ) + scalars_pandas_df[righter_col] + + assert_series_equal(pd_result, bf_result) + + +def test_series_add_different_table_default_index( + scalars_df_default_index, + 
scalars_df_2_default_index, +): + bf_result = ( + scalars_df_default_index["float64_col"] + + scalars_df_2_default_index["float64_col"] + ).to_pandas() + pd_result = ( + # Default index may not have a well defined order, but it should at + # least be consistent across to_pandas() calls. + scalars_df_default_index["float64_col"].to_pandas() + + scalars_df_2_default_index["float64_col"].to_pandas() + ) + # TODO(swast): Can remove sort_index() when there's default ordering. + pd.testing.assert_series_equal(bf_result.sort_index(), pd_result.sort_index()) + + +def test_series_add_different_table_with_index( + scalars_df_index, scalars_df_2_index, scalars_pandas_df_index +): + scalars_pandas_df = scalars_pandas_df_index + bf_result = scalars_df_index["float64_col"] + scalars_df_2_index["int64_col"] + # When index values are unique, we can emulate with values from the same + # DataFrame. + pd_result = scalars_pandas_df["float64_col"] + scalars_pandas_df["int64_col"] + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + + +def test_reset_index_drop(scalars_df_index, scalars_pandas_df_index): + scalars_pandas_df = scalars_pandas_df_index + bf_result = ( + scalars_df_index["float64_col"] + .sort_index(ascending=False) + .reset_index(drop=True) + ).iloc[::2] + pd_result = ( + scalars_pandas_df["float64_col"] + .sort_index(ascending=False) + .reset_index(drop=True) + ).iloc[::2] + + # BigQuery DataFrames default indices use nullable Int64 always + pd_result.index = pd_result.index.astype("Int64") + + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + + +def test_series_reset_index_allow_duplicates(scalars_df_index, scalars_pandas_df_index): + bf_series = scalars_df_index["int64_col"].copy() + bf_series.index.name = "int64_col" + df = bf_series.reset_index(allow_duplicates=True, drop=False) + assert df.index.name is None + + bf_result = df.to_pandas() + + pd_series = scalars_pandas_df_index["int64_col"].copy() + pd_series.index.name = "int64_col" 
+ pd_result = pd_series.reset_index(allow_duplicates=True, drop=False) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + # reset_index should maintain the original ordering. + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_series_reset_index_duplicates_error(scalars_df_index): + scalars_df_index = scalars_df_index["int64_col"].copy() + scalars_df_index.index.name = "int64_col" + with pytest.raises(ValueError): + scalars_df_index.reset_index(allow_duplicates=False, drop=False) + + +def test_series_reset_index_inplace(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.sort_index(ascending=False)["float64_col"] + bf_result.reset_index(drop=True, inplace=True) + pd_result = scalars_pandas_df_index.sort_index(ascending=False)["float64_col"] + pd_result.reset_index(drop=True, inplace=True) + + # BigQuery DataFrames default indices use nullable Int64 always + pd_result.index = pd_result.index.astype("Int64") + + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + + +@pytest.mark.parametrize( + ("name",), + [ + ("some_name",), + (None,), + ], +) +def test_reset_index_no_drop(scalars_df_index, scalars_pandas_df_index, name): + scalars_pandas_df = scalars_pandas_df_index + kw_args = {"name": name} if name else {} + bf_result = ( + scalars_df_index["float64_col"] + .sort_index(ascending=False) + .reset_index(drop=False, **kw_args) + ) + pd_result = ( + scalars_pandas_df["float64_col"] + .sort_index(ascending=False) + .reset_index(drop=False, **kw_args) + ) + + # BigQuery DataFrames default indices use nullable Int64 always + pd_result.index = pd_result.index.astype("Int64") + + pd.testing.assert_frame_equal(bf_result.to_pandas(), pd_result) + + +def test_copy(scalars_df_index, scalars_pandas_df_index): + col_name = "float64_col" + # Expect mutation on original not to effect_copy + bf_series = scalars_df_index[col_name].copy() + bf_copy = 
bf_series.copy() + bf_copy.loc[0] = 5.6 + bf_series.loc[0] = 3.4 + + pd_series = scalars_pandas_df_index[col_name].copy() + pd_copy = pd_series.copy() + pd_copy.loc[0] = 5.6 + pd_series.loc[0] = 3.4 + + assert bf_copy.to_pandas().loc[0] != bf_series.to_pandas().loc[0] + pd.testing.assert_series_equal(bf_copy.to_pandas(), pd_copy) + + +def test_isin_raise_error(scalars_df_index, scalars_pandas_df_index): + col_name = "int64_too" + with pytest.raises(TypeError): + scalars_df_index[col_name].isin("whatever").to_pandas() + + +@pytest.mark.parametrize( + ( + "col_name", + "test_set", + ), + [ + ( + "int64_col", + [314159, 2.0, 3, pd.NA], + ), + ( + "int64_col", + [2, 55555, 4], + ), + ( + "float64_col", + [-123.456, 1.25, pd.NA], + ), + ( + "int64_too", + [1, 2, pd.NA], + ), + ( + "string_col", + ["Hello, World!", "Hi", "こんにちは"], + ), + ], +) +def test_isin(scalars_dfs, col_name, test_set): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[col_name].isin(test_set).to_pandas() + pd_result = scalars_pandas_df[col_name].isin(test_set).astype("boolean") + pd.testing.assert_series_equal( + pd_result, + bf_result, + ) + + +@pytest.mark.parametrize( + ( + "col_name", + "test_set", + ), + [ + ( + "int64_col", + [314159, 2.0, 3, pd.NA], + ), + ( + "int64_col", + [2, 55555, 4], + ), + ( + "float64_col", + [-123.456, 1.25, pd.NA], + ), + ( + "int64_too", + [1, 2, pd.NA], + ), + ( + "string_col", + ["Hello, World!", "Hi", "こんにちは"], + ), + ], +) +def test_isin_bigframes_values(scalars_dfs, col_name, test_set, session): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = ( + scalars_df[col_name].isin(series.Series(test_set, session=session)).to_pandas() + ) + pd_result = scalars_pandas_df[col_name].isin(test_set).astype("boolean") + pd.testing.assert_series_equal( + pd_result, + bf_result, + ) + + +def test_isin_bigframes_index(scalars_dfs, session): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = ( + scalars_df["string_col"] + 
.isin(bigframes.pandas.Index(["Hello, World!", "Hi", "こんにちは"], session=session)) + .to_pandas() + ) + pd_result = ( + scalars_pandas_df["string_col"] + .isin(pd.Index(["Hello, World!", "Hi", "こんにちは"])) + .astype("boolean") + ) + pd.testing.assert_series_equal( + pd_result, + bf_result, + ) + + +@pytest.mark.parametrize( + ( + "col_name", + "test_set", + ), + [ + ( + "int64_col", + [314159, 2.0, 3, pd.NA], + ), + ( + "int64_col", + [2, 55555, 4], + ), + ( + "float64_col", + [-123.456, 1.25, pd.NA], + ), + ( + "int64_too", + [1, 2, pd.NA], + ), + ( + "string_col", + ["Hello, World!", "Hi", "こんにちは"], + ), + ], +) +def test_isin_bigframes_values_as_predicate( + scalars_dfs_maybe_ordered, col_name, test_set +): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + bf_predicate = scalars_df[col_name].isin( + series.Series(test_set, session=scalars_df._session) + ) + bf_result = scalars_df[bf_predicate].to_pandas() + pd_predicate = scalars_pandas_df[col_name].isin(test_set) + pd_result = scalars_pandas_df[pd_predicate] + + pd.testing.assert_frame_equal( + pd_result.reset_index(), + bf_result.reset_index(), + ) + + +def test_isnull(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "float64_col" + bf_series = scalars_df[col_name].isnull().to_pandas() + pd_series = scalars_pandas_df[col_name].isnull() + + # One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but + # the `pd_series.dtype` is `bool`. + assert_series_equal(pd_series.astype(pd.BooleanDtype()), bf_series) + + +def test_notnull(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_series = scalars_df[col_name].notnull().to_pandas() + pd_series = scalars_pandas_df[col_name].notnull() + + # One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but + # the `pd_series.dtype` is `bool`. 
+ assert_series_equal(pd_series.astype(pd.BooleanDtype()), bf_series) + + +def test_eq_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + bf_result = scalars_df[col_name].eq(0).to_pandas() + pd_result = scalars_pandas_df[col_name].eq(0) + + assert_series_equal(pd_result, bf_result) + + +def test_eq_wider_type_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + bf_result = scalars_df[col_name].eq(1.0).to_pandas() + pd_result = scalars_pandas_df[col_name].eq(1.0) + + assert_series_equal(pd_result, bf_result) + + +def test_ne_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + bf_result = (scalars_df[col_name] != 0).to_pandas() + pd_result = scalars_pandas_df[col_name] != 0 + + assert_series_equal(pd_result, bf_result) + + +def test_eq_int_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + bf_result = (scalars_df[col_name] == 0).to_pandas() + pd_result = scalars_pandas_df[col_name] == 0 + + assert_series_equal(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("col_name",), + ( + ("string_col",), + ("float64_col",), + ("int64_too",), + ), +) +def test_eq_same_type_series(scalars_dfs, col_name): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_result = (scalars_df[col_name] == scalars_df[col_name]).to_pandas() + pd_result = scalars_pandas_df[col_name] == scalars_pandas_df[col_name] + + # One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but + # the `pd_series.dtype` is `bool`. 
+ assert_series_equal(pd_result.astype(pd.BooleanDtype()), bf_result) + + +def test_loc_setitem_cell(scalars_df_index, scalars_pandas_df_index): + bf_original = scalars_df_index["string_col"] + bf_series = scalars_df_index["string_col"] + pd_original = scalars_pandas_df_index["string_col"] + pd_series = scalars_pandas_df_index["string_col"].copy() + bf_series.loc[2] = "This value isn't in the test data." + pd_series.loc[2] = "This value isn't in the test data." + bf_result = bf_series.to_pandas() + pd_result = pd_series + pd.testing.assert_series_equal(bf_result, pd_result) + # Per Copy-on-Write semantics, other references to the original DataFrame + # should remain unchanged. + pd.testing.assert_series_equal(bf_original.to_pandas(), pd_original) + + +def test_at_setitem_row_label_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_series = scalars_df["int64_col"] + pd_series = scalars_pandas_df["int64_col"].copy() + bf_series.at[1] = 1000 + pd_series.at[1] = 1000 + bf_result = bf_series.to_pandas() + pd_result = pd_series.astype("Int64") + pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_ne_obj_series(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_result = (scalars_df[col_name] != scalars_df[col_name]).to_pandas() + pd_result = scalars_pandas_df[col_name] != scalars_pandas_df[col_name] + + # One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but + # the `pd_series.dtype` is `bool`. 
+ assert_series_equal(pd_result.astype(pd.BooleanDtype()), bf_result) + + +def test_indexing_using_unselected_series(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_result = scalars_df[col_name][scalars_df["int64_too"].eq(0)].to_pandas() + pd_result = scalars_pandas_df[col_name][scalars_pandas_df["int64_too"].eq(0)] + + assert_series_equal( + pd_result, + bf_result, + ) + + +def test_indexing_using_selected_series(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_result = scalars_df[col_name][ + scalars_df["string_col"].eq("Hello, World!") + ].to_pandas() + pd_result = scalars_pandas_df[col_name][ + scalars_pandas_df["string_col"].eq("Hello, World!") + ] + + assert_series_equal( + pd_result, + bf_result, + ) + + +@pytest.mark.parametrize( + ("indices"), + [ + ([1, 3, 5]), + ([5, -3, -5, -6]), + ([-2, -4, -6]), + ], +) +def test_take(scalars_dfs, indices): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.take(indices).to_pandas() + pd_result = scalars_pandas_df.take(indices) + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_nested_filter(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + string_col = scalars_df["string_col"] + int64_too = scalars_df["int64_too"] + bool_col = scalars_df["bool_col"] == bool( + True + ) # Convert from nullable bool to nonnullable bool usable as indexer + bf_result = string_col[int64_too == 0][~bool_col].to_pandas() + + pd_string_col = scalars_pandas_df["string_col"] + pd_int64_too = scalars_pandas_df["int64_too"] + pd_bool_col = scalars_pandas_df["bool_col"] == bool( + True + ) # Convert from nullable bool to nonnullable bool usable as indexer + pd_result = pd_string_col[pd_int64_too == 0][~pd_bool_col] + + assert_series_equal( + pd_result, + bf_result, + ) + + +def test_binop_opposite_filters(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + int64_col1 = scalars_df["int64_col"] + int64_col2 = 
scalars_df["int64_col"] + bool_col = scalars_df["bool_col"] + bf_result = (int64_col1[bool_col] + int64_col2[bool_col.__invert__()]).to_pandas() + + pd_int64_col1 = scalars_pandas_df["int64_col"] + pd_int64_col2 = scalars_pandas_df["int64_col"] + pd_bool_col = scalars_pandas_df["bool_col"] + pd_result = pd_int64_col1[pd_bool_col] + pd_int64_col2[pd_bool_col.__invert__()] + + # Passes with ignore_order=False only with some dependency sets + # TODO: Determine desired behavior and make test more strict + assert_series_equal(bf_result, pd_result, ignore_order=True) + + +def test_binop_left_filtered(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + int64_col = scalars_df["int64_col"] + float64_col = scalars_df["float64_col"] + bool_col = scalars_df["bool_col"] + bf_result = (int64_col[bool_col] + float64_col).to_pandas() + + pd_int64_col = scalars_pandas_df["int64_col"] + pd_float64_col = scalars_pandas_df["float64_col"] + pd_bool_col = scalars_pandas_df["bool_col"] + pd_result = pd_int64_col[pd_bool_col] + pd_float64_col + + # Passes with ignore_order=False only with some dependency sets + # TODO: Determine desired behavior and make test more strict + assert_series_equal(bf_result, pd_result, ignore_order=True) + + +def test_binop_right_filtered(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + int64_col = scalars_df["int64_col"] + float64_col = scalars_df["float64_col"] + bool_col = scalars_df["bool_col"] + bf_result = (float64_col + int64_col[bool_col]).to_pandas() + + pd_int64_col = scalars_pandas_df["int64_col"] + pd_float64_col = scalars_pandas_df["float64_col"] + pd_bool_col = scalars_pandas_df["bool_col"] + pd_result = pd_float64_col + pd_int64_col[pd_bool_col] + + assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("other",), + [ + ([-1.4, 2.3, None],), + (pd.Index([-1.4, 2.3, None]),), + (pd.Series([-1.4, 2.3, None], index=[44, 2, 1]),), + ], +) +def test_series_binop_w_other_types(scalars_dfs, other): + 
# TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = (scalars_df["int64_col"].head(3) + other).to_pandas() + pd_result = scalars_pandas_df["int64_col"].head(3) + other + + assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("other",), + [ + ([-1.4, 2.3, None],), + (pd.Index([-1.4, 2.3, None]),), + (pd.Series([-1.4, 2.3, None], index=[44, 2, 1]),), + ], +) +def test_series_reverse_binop_w_other_types(scalars_dfs, other): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = (other + scalars_df["int64_col"].head(3)).to_pandas() + pd_result = other + scalars_pandas_df["int64_col"].head(3) + + assert_series_equal( + bf_result, + pd_result, + ) + + +def test_series_combine_first(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + int64_col = scalars_df["int64_col"].head(7) + float64_col = scalars_df["float64_col"].tail(7) + bf_result = int64_col.combine_first(float64_col).to_pandas() + + pd_int64_col = scalars_pandas_df["int64_col"].head(7) + pd_float64_col = scalars_pandas_df["float64_col"].tail(7) + pd_result = pd_int64_col.combine_first(pd_float64_col) + + assert_series_equal( + bf_result, + pd_result, + ) + + +def test_series_update(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + int64_col = scalars_df["int64_col"].head(7) + float64_col = scalars_df["float64_col"].tail(7).copy() + float64_col.update(int64_col) + + pd_int64_col = scalars_pandas_df["int64_col"].head(7) + pd_float64_col = scalars_pandas_df["float64_col"].tail(7).copy() + pd_float64_col.update(pd_int64_col) + + assert_series_equal( + float64_col.to_pandas(), + pd_float64_col, + ) 
+ + +def test_mean(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + bf_result = scalars_df[col_name].mean() + pd_result = scalars_pandas_df[col_name].mean() + assert math.isclose(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("col_name"), + [ + "int64_col", + # Non-numeric column + "bytes_col", + "date_col", + "datetime_col", + "time_col", + "timestamp_col", + "string_col", + ], +) +def test_median(scalars_dfs, col_name): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[col_name].median(exact=False) + pd_max = scalars_pandas_df[col_name].max() + pd_min = scalars_pandas_df[col_name].min() + # Median is approximate, so just check for plausibility. + assert pd_min < bf_result < pd_max + + +def test_numeric_literal(scalars_dfs): + scalars_df, _ = scalars_dfs + col_name = "numeric_col" + assert scalars_df[col_name].dtype == pd.ArrowDtype(pa.decimal128(38, 9)) + bf_result = scalars_df[col_name] + 42 + assert bf_result.size == scalars_df[col_name].size + assert bf_result.dtype == pd.ArrowDtype(pa.decimal128(38, 9)) + + +def test_series_small_repr(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + col_name = "int64_col" + bf_series = scalars_df[col_name] + pd_series = scalars_pandas_df[col_name] + assert repr(bf_series) == pd_series.to_string(length=False, dtype=True, name=True) + + +def test_sum(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + bf_result = scalars_df[col_name].sum() + pd_result = scalars_pandas_df[col_name].sum() + assert pd_result == bf_result + + +def test_product(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "float64_col" + bf_result = scalars_df[col_name].product() + pd_result = scalars_pandas_df[col_name].product() + assert math.isclose(pd_result, bf_result) + + +def test_cumprod(scalars_dfs): + if pd.__version__.startswith("1."): + pytest.skip("Series.cumprod NA mask are different in pandas 1.x.") + 
scalars_df, scalars_pandas_df = scalars_dfs + col_name = "float64_col" + bf_result = scalars_df[col_name].cumprod() + pd_result = scalars_pandas_df[col_name].cumprod() + pd.testing.assert_series_equal( + pd_result, + bf_result.to_pandas(), + ) + + +def test_count(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + bf_result = scalars_df[col_name].count() + pd_result = scalars_pandas_df[col_name].count() + assert pd_result == bf_result + + +def test_nunique(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + bf_result = (scalars_df[col_name] % 3).nunique() + pd_result = (scalars_pandas_df[col_name] % 3).nunique() + assert pd_result == bf_result + + +def test_all(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + bf_result = scalars_df[col_name].all() + pd_result = scalars_pandas_df[col_name].all() + assert pd_result == bf_result + + +def test_any(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + bf_result = scalars_df[col_name].any() + pd_result = scalars_pandas_df[col_name].any() + assert pd_result == bf_result + + +def test_groupby_sum(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + bf_series = ( + scalars_df[col_name] + .groupby([scalars_df["bool_col"], ~scalars_df["bool_col"]]) + .sum() + ) + pd_series = ( + scalars_pandas_df[col_name] + .groupby([scalars_pandas_df["bool_col"], ~scalars_pandas_df["bool_col"]]) + .sum() + ) + # TODO(swast): Update groupby to use index based on group by key(s). 
+ bf_result = bf_series.to_pandas() + assert_series_equal( + pd_series, + bf_result, + check_exact=False, + ) + + +def test_groupby_std(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + bf_series = scalars_df[col_name].groupby(scalars_df["string_col"]).std() + pd_series = ( + scalars_pandas_df[col_name] + .groupby(scalars_pandas_df["string_col"]) + .std() + .astype(pd.Float64Dtype()) + ) + bf_result = bf_series.to_pandas() + assert_series_equal( + pd_series, + bf_result, + check_exact=False, + ) + + +def test_groupby_var(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + bf_series = scalars_df[col_name].groupby(scalars_df["string_col"]).var() + pd_series = ( + scalars_pandas_df[col_name].groupby(scalars_pandas_df["string_col"]).var() + ) + bf_result = bf_series.to_pandas() + assert_series_equal( + pd_series, + bf_result, + check_exact=False, + ) + + +def test_groupby_level_sum(scalars_dfs): + # TODO(tbergeron): Use a non-unique index once that becomes possible in tests + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + + bf_series = scalars_df[col_name].groupby(level=0).sum() + pd_series = scalars_pandas_df[col_name].groupby(level=0).sum() + # TODO(swast): Update groupby to use index based on group by key(s). + pd.testing.assert_series_equal( + pd_series.sort_index(), + bf_series.to_pandas().sort_index(), + ) + + +def test_groupby_level_list_sum(scalars_dfs): + # TODO(tbergeron): Use a non-unique index once that becomes possible in tests + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + + bf_series = scalars_df[col_name].groupby(level=["rowindex"]).sum() + pd_series = scalars_pandas_df[col_name].groupby(level=["rowindex"]).sum() + # TODO(swast): Update groupby to use index based on group by key(s). 
+ pd.testing.assert_series_equal( + pd_series.sort_index(), + bf_series.to_pandas().sort_index(), + ) + + +def test_groupby_mean(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + bf_series = ( + scalars_df[col_name].groupby(scalars_df["string_col"], dropna=False).mean() + ) + pd_series = ( + scalars_pandas_df[col_name] + .groupby(scalars_pandas_df["string_col"], dropna=False) + .mean() + ) + # TODO(swast): Update groupby to use index based on group by key(s). + bf_result = bf_series.to_pandas() + assert_series_equal( + pd_series, + bf_result, + ) + + +def test_groupby_median_exact(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + bf_result = ( + scalars_df[col_name].groupby(scalars_df["string_col"], dropna=False).median() + ) + pd_result = ( + scalars_pandas_df[col_name] + .groupby(scalars_pandas_df["string_col"], dropna=False) + .median() + ) + + assert_series_equal( + pd_result, + bf_result.to_pandas(), + ) + + +def test_groupby_median_inexact(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + bf_series = ( + scalars_df[col_name] + .groupby(scalars_df["string_col"], dropna=False) + .median(exact=False) + ) + pd_max = ( + scalars_pandas_df[col_name] + .groupby(scalars_pandas_df["string_col"], dropna=False) + .max() + ) + pd_min = ( + scalars_pandas_df[col_name] + .groupby(scalars_pandas_df["string_col"], dropna=False) + .min() + ) + # TODO(swast): Update groupby to use index based on group by key(s). + bf_result = bf_series.to_pandas() + + # Median is approximate, so just check that it's plausible. 
+ assert ((pd_min <= bf_result) & (bf_result <= pd_max)).all() + + +def test_groupby_prod(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + bf_series = scalars_df[col_name].groupby(scalars_df["int64_col"]).prod() + pd_series = ( + scalars_pandas_df[col_name].groupby(scalars_pandas_df["int64_col"]).prod() + ).astype(pd.Float64Dtype()) + # TODO(swast): Update groupby to use index based on group by key(s). + bf_result = bf_series.to_pandas() + assert_series_equal( + pd_series, + bf_result, + ) + + +@pytest.mark.parametrize( + ("operator"), + [ + (lambda x: x.cumsum()), + (lambda x: x.cumcount()), + (lambda x: x.cummin()), + (lambda x: x.cummax()), + # Pandas 2.2 casts to cumprod to float. + (lambda x: x.cumprod().astype("Float64")), + (lambda x: x.diff()), + (lambda x: x.shift(2)), + (lambda x: x.shift(-2)), + ], + ids=[ + "cumsum", + "cumcount", + "cummin", + "cummax", + "cumprod", + "diff", + "shiftpostive", + "shiftnegative", + ], +) +def test_groupby_window_ops(scalars_df_index, scalars_pandas_df_index, operator): + col_name = "int64_col" + group_key = "int64_too" # has some duplicates values, good for grouping + bf_series = ( + operator(scalars_df_index[col_name].groupby(scalars_df_index[group_key])) + ).to_pandas() + pd_series = operator( + scalars_pandas_df_index[col_name].groupby(scalars_pandas_df_index[group_key]) + ).astype(bf_series.dtype) + + pd.testing.assert_series_equal( + pd_series, + bf_series, + ) + + +@pytest.mark.parametrize( + ("label", "col_name"), + [ + (0, "bool_col"), + (1, "int64_col"), + ], +) +def test_drop_label(scalars_df_index, scalars_pandas_df_index, label, col_name): + bf_series = scalars_df_index[col_name].drop(label).to_pandas() + pd_series = scalars_pandas_df_index[col_name].drop(label) + pd.testing.assert_series_equal( + pd_series, + bf_series, + ) + + +def test_drop_label_list(scalars_df_index, scalars_pandas_df_index): + col_name = "int64_col" + bf_series = scalars_df_index[col_name].drop([1, 
3]).to_pandas() + pd_series = scalars_pandas_df_index[col_name].drop([1, 3]) + pd.testing.assert_series_equal( + pd_series, + bf_series, + ) + + +@pytest.mark.parametrize( + ("col_name",), + [ + ("bool_col",), + ("int64_too",), + ], +) +@pytest.mark.parametrize( + ("keep",), + [ + ("first",), + ("last",), + (False,), + ], +) +def test_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, col_name): + bf_series = scalars_df_index[col_name].drop_duplicates(keep=keep).to_pandas() + pd_series = scalars_pandas_df_index[col_name].drop_duplicates(keep=keep) + pd.testing.assert_series_equal( + pd_series, + bf_series, + ) + + +@pytest.mark.parametrize( + ("col_name",), + [ + ("bool_col",), + ("int64_too",), + ], +) +def test_unique(scalars_df_index, scalars_pandas_df_index, col_name): + bf_uniq = scalars_df_index[col_name].unique().to_numpy(na_value=None) + pd_uniq = scalars_pandas_df_index[col_name].unique() + numpy.array_equal(pd_uniq, bf_uniq) + + +@pytest.mark.parametrize( + ("col_name",), + [ + ("bool_col",), + ("int64_too",), + ], +) +@pytest.mark.parametrize( + ("keep",), + [ + ("first",), + ("last",), + (False,), + ], +) +def test_duplicated(scalars_df_index, scalars_pandas_df_index, keep, col_name): + bf_series = scalars_df_index[col_name].duplicated(keep=keep).to_pandas() + pd_series = scalars_pandas_df_index[col_name].duplicated(keep=keep) + pd.testing.assert_series_equal(pd_series, bf_series, check_dtype=False) + + +def test_shape(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["string_col"].shape + pd_result = scalars_pandas_df["string_col"].shape + + assert pd_result == bf_result + + +def test_len(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = len(scalars_df["string_col"]) + pd_result = len(scalars_pandas_df["string_col"]) + + assert pd_result == bf_result + + +def test_size(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["string_col"].size + 
pd_result = scalars_pandas_df["string_col"].size + + assert pd_result == bf_result + + +def test_series_hasnans_true(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["string_col"].hasnans + pd_result = scalars_pandas_df["string_col"].hasnans + + assert pd_result == bf_result + + +def test_series_hasnans_false(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["string_col"].dropna().hasnans + pd_result = scalars_pandas_df["string_col"].dropna().hasnans + + assert pd_result == bf_result + + +def test_empty_false(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["string_col"].empty + pd_result = scalars_pandas_df["string_col"].empty + + assert pd_result == bf_result + + +def test_empty_true_row_filter(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["string_col"][ + scalars_df["string_col"] == "won't find this" + ].empty + pd_result = scalars_pandas_df["string_col"][ + scalars_pandas_df["string_col"] == "won't find this" + ].empty + + assert pd_result + assert pd_result == bf_result + + +def test_series_names(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["string_col"].copy() + bf_result.index.name = "new index name" + bf_result.name = "new series name" + + pd_result = scalars_pandas_df["string_col"].copy() + pd_result.index.name = "new index name" + pd_result.name = "new series name" + + assert pd_result.name == bf_result.name + assert pd_result.index.name == bf_result.index.name + + +def test_dtype(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["string_col"].dtype + pd_result = scalars_pandas_df["string_col"].dtype + + assert pd_result == bf_result + + +def test_dtypes(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["int64_col"].dtypes + pd_result = scalars_pandas_df["int64_col"].dtypes + + assert 
pd_result == bf_result + + +def test_head(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["string_col"].head(2).to_pandas() + pd_result = scalars_pandas_df["string_col"].head(2) + + assert_series_equal( + pd_result, + bf_result, + ) + + +def test_tail(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["string_col"].tail(2).to_pandas() + pd_result = scalars_pandas_df["string_col"].tail(2) + + assert_series_equal( + pd_result, + bf_result, + ) + + +def test_head_then_scalar_operation(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = (scalars_df["float64_col"].head(1) + 4).to_pandas() + pd_result = scalars_pandas_df["float64_col"].head(1) + 4 + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_head_then_series_operation(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = ( + scalars_df["float64_col"].head(4) + scalars_df["float64_col"].head(2) + ).to_pandas() + pd_result = scalars_pandas_df["float64_col"].head(4) + scalars_pandas_df[ + "float64_col" + ].head(2) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_series_peek(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + peek_result = scalars_df["float64_col"].peek(n=3, force=False) + + pd.testing.assert_series_equal( + peek_result, + scalars_pandas_df["float64_col"].reindex_like(peek_result), + ) + assert len(peek_result) == 3 + + +def test_series_peek_with_large_results_not_allowed(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + session = scalars_df._block.session + slot_millis_sum = session.slot_millis_sum + peek_result = scalars_df["float64_col"].peek( + n=3, force=False, allow_large_results=False + ) + + # The metrics won't be fully updated when we call query_and_wait. 
+ print(session.slot_millis_sum - slot_millis_sum) + assert session.slot_millis_sum - slot_millis_sum < 500 + pd.testing.assert_series_equal( + peek_result, + scalars_pandas_df["float64_col"].reindex_like(peek_result), + ) + assert len(peek_result) == 3 + + +def test_series_peek_multi_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_series = scalars_df.set_index(["string_col", "bool_col"])["float64_col"] + bf_series.name = ("2-part", "name") + pd_series = scalars_pandas_df.set_index(["string_col", "bool_col"])["float64_col"] + pd_series.name = ("2-part", "name") + peek_result = bf_series.peek(n=3, force=False) + pd.testing.assert_series_equal( + peek_result, + pd_series.reindex_like(peek_result), + ) + + +def test_series_peek_filtered(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + peek_result = scalars_df[scalars_df.int64_col > 0]["float64_col"].peek( + n=3, force=False + ) + pd_result = scalars_pandas_df[scalars_pandas_df.int64_col > 0]["float64_col"] + pd.testing.assert_series_equal( + peek_result, + pd_result.reindex_like(peek_result), + ) + + +def test_series_peek_force(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + + cumsum_df = scalars_df[["int64_col", "int64_too"]].cumsum() + df_filtered = cumsum_df[cumsum_df.int64_col > 0]["int64_too"] + peek_result = df_filtered.peek(n=3, force=True) + pd_cumsum_df = scalars_pandas_df[["int64_col", "int64_too"]].cumsum() + pd_result = pd_cumsum_df[pd_cumsum_df.int64_col > 0]["int64_too"] + pd.testing.assert_series_equal( + peek_result, + pd_result.reindex_like(peek_result), + ) + + +def test_series_peek_force_float(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + + cumsum_df = scalars_df[["int64_col", 
"float64_col"]].cumsum() + df_filtered = cumsum_df[cumsum_df.float64_col > 0]["float64_col"] + peek_result = df_filtered.peek(n=3, force=True) + pd_cumsum_df = scalars_pandas_df[["int64_col", "float64_col"]].cumsum() + pd_result = pd_cumsum_df[pd_cumsum_df.float64_col > 0]["float64_col"] + pd.testing.assert_series_equal( + peek_result, + pd_result.reindex_like(peek_result), + ) + + +def test_shift(scalars_df_index, scalars_pandas_df_index): + col_name = "int64_col" + bf_result = scalars_df_index[col_name].shift().to_pandas() + # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA + pd_result = scalars_pandas_df_index[col_name].shift().astype(pd.Int64Dtype()) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_series_ffill(scalars_df_index, scalars_pandas_df_index): + col_name = "numeric_col" + bf_result = scalars_df_index[col_name].ffill(limit=1).to_pandas() + pd_result = scalars_pandas_df_index[col_name].ffill(limit=1) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_series_bfill(scalars_df_index, scalars_pandas_df_index): + col_name = "numeric_col" + bf_result = scalars_df_index[col_name].bfill(limit=2).to_pandas() + pd_result = scalars_pandas_df_index[col_name].bfill(limit=2) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_cumsum_int(scalars_df_index, scalars_pandas_df_index): + if pd.__version__.startswith("1."): + pytest.skip("Series.cumsum NA mask are different in pandas 1.x.") + + col_name = "int64_col" + bf_result = scalars_df_index[col_name].cumsum().to_pandas() + # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA + pd_result = scalars_pandas_df_index[col_name].cumsum().astype(pd.Int64Dtype()) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_cumsum_int_ordered(scalars_df_index, scalars_pandas_df_index): + if pd.__version__.startswith("1."): + 
pytest.skip("Series.cumsum NA mask are different in pandas 1.x.") + + col_name = "int64_col" + bf_result = ( + scalars_df_index.sort_values(by="rowindex_2")[col_name].cumsum().to_pandas() + ) + # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA + pd_result = ( + scalars_pandas_df_index.sort_values(by="rowindex_2")[col_name] + .cumsum() + .astype(pd.Int64Dtype()) + ) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("keep",), + [ + ("first",), + ("last",), + ("all",), + ], +) +def test_series_nlargest(scalars_df_index, scalars_pandas_df_index, keep): + col_name = "bool_col" + bf_result = scalars_df_index[col_name].nlargest(4, keep=keep).to_pandas() + pd_result = scalars_pandas_df_index[col_name].nlargest(4, keep=keep) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("periods",), + [ + (1,), + (2,), + (-1,), + ], +) +def test_diff(scalars_df_index, scalars_pandas_df_index, periods): + bf_result = scalars_df_index["int64_col"].diff(periods=periods).to_pandas() + # NOTE(review): the pandas diff result is cast to nullable Int64, presumably so its dtype matches the bigframes result - confirm + pd_result = ( + scalars_pandas_df_index["int64_col"] + .diff(periods=periods) + .astype(pd.Int64Dtype()) + ) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("periods",), + [ + (1,), + (2,), + (-1,), + ], +) +def test_series_pct_change(scalars_df_index, scalars_pandas_df_index, periods): + bf_result = scalars_df_index["int64_col"].pct_change(periods=periods).to_pandas() + # The pandas pct_change result is compared as-is; no dtype cast is applied here. + pd_result = scalars_pandas_df_index["int64_col"].pct_change(periods=periods) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("keep",), + [ + ("first",), + ("last",), + ("all",), + ], +) +def
test_series_nsmallest(scalars_df_index, scalars_pandas_df_index, keep): + col_name = "bool_col" + bf_result = scalars_df_index[col_name].nsmallest(2, keep=keep).to_pandas() + pd_result = scalars_pandas_df_index[col_name].nsmallest(2, keep=keep) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("na_option", "method", "ascending", "numeric_only", "pct"), + [ + ("keep", "average", True, True, False), + ("top", "min", False, False, True), + ("bottom", "max", False, False, False), + ("top", "first", False, False, True), + ("bottom", "dense", False, False, False), + ], +) +def test_series_rank( + scalars_df_index, + scalars_pandas_df_index, + na_option, + method, + ascending, + numeric_only, + pct, +): + col_name = "int64_too" + bf_result = ( + scalars_df_index[col_name] + .rank( + na_option=na_option, + method=method, + ascending=ascending, + numeric_only=numeric_only, + pct=pct, + ) + .to_pandas() + ) + pd_result = ( + scalars_pandas_df_index[col_name] + .rank( + na_option=na_option, + method=method, + ascending=ascending, + numeric_only=numeric_only, + pct=pct, + ) + .astype(pd.Float64Dtype()) + ) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_cast_float_to_int(scalars_df_index, scalars_pandas_df_index): + col_name = "float64_col" + bf_result = scalars_df_index[col_name].astype(pd.Int64Dtype()).to_pandas() + # Apply the same float -> nullable Int64 cast on the pandas side to build the expected result. + pd_result = scalars_pandas_df_index[col_name].astype(pd.Int64Dtype()) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_cast_float_to_bool(scalars_df_index, scalars_pandas_df_index): + col_name = "float64_col" + bf_result = scalars_df_index[col_name].astype(pd.BooleanDtype()).to_pandas() + # Apply the same float -> nullable boolean cast on the pandas side to build the expected result. + pd_result =
scalars_pandas_df_index[col_name].astype(pd.BooleanDtype()) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_cumsum_nested(scalars_df_index, scalars_pandas_df_index): + col_name = "float64_col" + bf_result = scalars_df_index[col_name].cumsum().cumsum().cumsum().to_pandas() + # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA + pd_result = ( + scalars_pandas_df_index[col_name] + .cumsum() + .cumsum() + .cumsum() + .astype(pd.Float64Dtype()) + ) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_nested_analytic_ops_align(scalars_df_index, scalars_pandas_df_index): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + col_name = "float64_col" + # set non-unique index to check implicit alignment + bf_series = scalars_df_index.set_index("bool_col")[col_name].fillna(0.0) + pd_series = scalars_pandas_df_index.set_index("bool_col")[col_name].fillna(0.0) + + bf_result = ( + (bf_series + 5) + + (bf_series.cumsum().cumsum().cumsum() + bf_series.rolling(window=3).mean()) + + bf_series.expanding().max() + ).to_pandas() + # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA + pd_result = ( + (pd_series + 5) + + ( + pd_series.cumsum().cumsum().cumsum().astype(pd.Float64Dtype()) + + pd_series.rolling(window=3).mean() + ) + + pd_series.expanding().max() + ) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_cumsum_int_filtered(scalars_df_index, scalars_pandas_df_index): + col_name = "int64_col" + + bf_col = scalars_df_index[col_name] + bf_result = bf_col[bf_col > -2].cumsum().to_pandas() + + pd_col = scalars_pandas_df_index[col_name] + # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA + pd_result = pd_col[pd_col > -2].cumsum().astype(pd.Int64Dtype()) + + 
pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_cumsum_float(scalars_df_index, scalars_pandas_df_index): + col_name = "float64_col" + bf_result = scalars_df_index[col_name].cumsum().to_pandas() + # cumsum does not behave well on nullable floats in pandas, produces object type and never ignores NA + pd_result = scalars_pandas_df_index[col_name].cumsum().astype(pd.Float64Dtype()) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_cummin_int(scalars_df_index, scalars_pandas_df_index): + col_name = "int64_col" + bf_result = scalars_df_index[col_name].cummin().to_pandas() + pd_result = scalars_pandas_df_index[col_name].cummin() + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_cummax_int(scalars_df_index, scalars_pandas_df_index): + col_name = "int64_col" + bf_result = scalars_df_index[col_name].cummax().to_pandas() + pd_result = scalars_pandas_df_index[col_name].cummax() + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("kwargs"), + [ + {}, + {"normalize": True}, + {"ascending": True}, + ], + ids=[ + "default", + "normalize", + "ascending", + ], +) +def test_value_counts(scalars_dfs, kwargs): + if pd.__version__.startswith("1."): + pytest.skip("pandas 1.x produces different column labels.") + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + + # Pandas `value_counts` can produce non-deterministic results with tied counts. + # Remove duplicates to enforce a consistent output. 
+ s = scalars_df[col_name].drop(0) + pd_s = scalars_pandas_df[col_name].drop(0) + + bf_result = s.value_counts(**kwargs).to_pandas() + pd_result = pd_s.value_counts(**kwargs) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_value_counts_with_na(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + + bf_result = scalars_df[col_name].value_counts(dropna=False).to_pandas() + pd_result = scalars_pandas_df[col_name].value_counts(dropna=False) + + # Older pandas versions may not have these values, bigframes tries to emulate 2.0+ + pd_result.name = "count" + pd_result.index.name = col_name + + assert_series_equal( + bf_result, + pd_result, + # bigframes value_counts does not honor ordering in the original data + ignore_order=True, + ) + + +def test_value_counts_w_cut(scalars_dfs): + if pd.__version__.startswith("1."): + pytest.skip("value_counts results different in pandas 1.x.") + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + + bf_cut = bigframes.pandas.cut(scalars_df[col_name], 3, labels=False) + pd_cut = pd.cut(scalars_pandas_df[col_name], 3, labels=False) + + bf_result = bf_cut.value_counts().to_pandas() + pd_result = pd_cut.value_counts() + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + pd.testing.assert_series_equal( + bf_result, + pd_result.astype(pd.Int64Dtype()), + ) + + +def test_iloc_nested(scalars_df_index, scalars_pandas_df_index): + + bf_result = scalars_df_index["string_col"].iloc[1:].iloc[1:].to_pandas() + pd_result = scalars_pandas_df_index["string_col"].iloc[1:].iloc[1:] + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("start", "stop", "step"), + [ + (1, None, None), + (None, 4, None), + (None, None, 2), + (None, 50000000000, 1), + (5, 4, None), + (3, None, 2), + (1, 7, 2), + (1, 7, 50000000000), + (-1, -7, -2), + (None, -7, -2), + (-1, None, -2), + (-7, -1, 2), + (-7, -1, None), + (-7, 7, None), + (7,
-7, -2), + ], +) +def test_series_iloc(scalars_df_index, scalars_pandas_df_index, start, stop, step): + bf_result = scalars_df_index["string_col"].iloc[start:stop:step].to_pandas() + pd_result = scalars_pandas_df_index["string_col"].iloc[start:stop:step] + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_at(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.set_index("int64_too", drop=False) + scalars_pandas_df_index = scalars_pandas_df_index.set_index("int64_too", drop=False) + index = -2345 + bf_result = scalars_df_index["string_col"].at[index] + pd_result = scalars_pandas_df_index["string_col"].at[index] + + assert bf_result == pd_result + + +def test_iat(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index["int64_too"].iat[3] + pd_result = scalars_pandas_df_index["int64_too"].iat[3] + + assert bf_result == pd_result + + +def test_iat_error(scalars_df_index, scalars_pandas_df_index): + with pytest.raises(ValueError): + scalars_pandas_df_index["int64_too"].iat["asd"] + with pytest.raises(ValueError): + scalars_df_index["int64_too"].iat["asd"] + + +def test_series_add_prefix(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index["int64_too"].add_prefix("prefix_").to_pandas() + + pd_result = scalars_pandas_df_index["int64_too"].add_prefix("prefix_") + + # Index will be object type in pandas, string type in bigframes, but same values + pd.testing.assert_series_equal( + bf_result, + pd_result, + check_index_type=False, + ) + + +def test_series_add_suffix(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index["int64_too"].add_suffix("_suffix").to_pandas() + + pd_result = scalars_pandas_df_index["int64_too"].add_suffix("_suffix") + + # Index will be object type in pandas, string type in bigframes, but same values + pd.testing.assert_series_equal( + bf_result, + pd_result, + check_index_type=False, + ) + + +def 
test_series_filter_items(scalars_df_index, scalars_pandas_df_index): + if pd.__version__.startswith("2.0") or pd.__version__.startswith("1."): + pytest.skip("pandas filter items behavior different pre-2.1") + bf_result = scalars_df_index["float64_col"].filter(items=[5, 1, 3]).to_pandas() + + pd_result = scalars_pandas_df_index["float64_col"].filter(items=[5, 1, 3]) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + # Ignore ordering as pandas order differently depending on version + assert_series_equal(bf_result, pd_result, check_names=False, ignore_order=True) + + +def test_series_filter_like(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.copy().set_index("string_col") + scalars_pandas_df_index = scalars_pandas_df_index.copy().set_index("string_col") + + bf_result = scalars_df_index["float64_col"].filter(like="ello").to_pandas() + + pd_result = scalars_pandas_df_index["float64_col"].filter(like="ello") + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_series_filter_regex(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.copy().set_index("string_col") + scalars_pandas_df_index = scalars_pandas_df_index.copy().set_index("string_col") + + bf_result = scalars_df_index["float64_col"].filter(regex="^[GH].*").to_pandas() + + pd_result = scalars_pandas_df_index["float64_col"].filter(regex="^[GH].*") + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_series_reindex(scalars_df_index, scalars_pandas_df_index): + bf_result = ( + scalars_df_index["float64_col"].reindex(index=[5, 1, 3, 99, 1]).to_pandas() + ) + + pd_result = scalars_pandas_df_index["float64_col"].reindex(index=[5, 1, 3, 99, 1]) + + # Pandas uses int64 instead of Int64 (nullable) dtype. 
+ pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_series_reindex_nonunique(scalars_df_index): + with pytest.raises(ValueError): + # int64_too is non-unique + scalars_df_index.set_index("int64_too")["float64_col"].reindex( + index=[5, 1, 3, 99, 1], validate=True + ) + + +def test_series_reindex_like(scalars_df_index, scalars_pandas_df_index): + bf_reindex_target = scalars_df_index["float64_col"].reindex(index=[5, 1, 3, 99, 1]) + bf_result = ( + scalars_df_index["int64_too"].reindex_like(bf_reindex_target).to_pandas() + ) + + pd_reindex_target = scalars_pandas_df_index["float64_col"].reindex( + index=[5, 1, 3, 99, 1] + ) + pd_result = scalars_pandas_df_index["int64_too"].reindex_like(pd_reindex_target) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_where_with_series(scalars_df_index, scalars_pandas_df_index): + bf_result = ( + scalars_df_index["int64_col"] + .where(scalars_df_index["bool_col"], scalars_df_index["int64_too"]) + .to_pandas() + ) + pd_result = scalars_pandas_df_index["int64_col"].where( + scalars_pandas_df_index["bool_col"], scalars_pandas_df_index["int64_too"] + ) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_where_with_different_indices(scalars_df_index, scalars_pandas_df_index): + bf_result = ( + scalars_df_index["int64_col"] + .iloc[::2] + .where( + scalars_df_index["bool_col"].iloc[2:], + scalars_df_index["int64_too"].iloc[:5], + ) + .to_pandas() + ) + pd_result = ( + scalars_pandas_df_index["int64_col"] + .iloc[::2] + .where( + scalars_pandas_df_index["bool_col"].iloc[2:], + scalars_pandas_df_index["int64_too"].iloc[:5], + ) + ) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_where_with_default(scalars_df_index, scalars_pandas_df_index): + bf_result 
= ( + scalars_df_index["int64_col"].where(scalars_df_index["bool_col"]).to_pandas() + ) + pd_result = scalars_pandas_df_index["int64_col"].where( + scalars_pandas_df_index["bool_col"] + ) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_where_with_callable(scalars_df_index, scalars_pandas_df_index): + def _is_positive(x): + return x > 0 + + # Both cond and other are callable. + bf_result = ( + scalars_df_index["int64_col"] + .where(cond=_is_positive, other=lambda x: x * 10) + .to_pandas() + ) + pd_result = scalars_pandas_df_index["int64_col"].where( + cond=_is_positive, other=lambda x: x * 10 + ) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_clip(scalars_df_index, scalars_pandas_df_index, ordered): + col_bf = scalars_df_index["int64_col"] + lower_bf = scalars_df_index["int64_too"] - 1 + upper_bf = scalars_df_index["int64_too"] + 1 + bf_result = col_bf.clip(lower_bf, upper_bf).to_pandas(ordered=ordered) + + col_pd = scalars_pandas_df_index["int64_col"] + lower_pd = scalars_pandas_df_index["int64_too"] - 1 + upper_pd = scalars_pandas_df_index["int64_too"] + 1 + pd_result = col_pd.clip(lower_pd, upper_pd) + + assert_series_equal(bf_result, pd_result, ignore_order=not ordered) + + +def test_clip_int_with_float_bounds(scalars_df_index, scalars_pandas_df_index): + col_bf = scalars_df_index["int64_too"] + bf_result = col_bf.clip(-100, 3.14151593).to_pandas() + + col_pd = scalars_pandas_df_index["int64_too"] + # pandas doesn't work with Int64 and clip with floats + pd_result = col_pd.astype("int64").clip(-100, 3.14151593).astype("Float64") + + assert_series_equal(bf_result, pd_result) + + +def test_clip_filtered_two_sided(scalars_df_index, scalars_pandas_df_index): + col_bf = scalars_df_index["int64_col"].iloc[::2] + lower_bf = scalars_df_index["int64_too"].iloc[2:] - 1 + upper_bf = scalars_df_index["int64_too"].iloc[:5] + 1 + 
bf_result = col_bf.clip(lower_bf, upper_bf).to_pandas() + + col_pd = scalars_pandas_df_index["int64_col"].iloc[::2] + lower_pd = scalars_pandas_df_index["int64_too"].iloc[2:] - 1 + upper_pd = scalars_pandas_df_index["int64_too"].iloc[:5] + 1 + pd_result = col_pd.clip(lower_pd, upper_pd) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_clip_filtered_one_sided(scalars_df_index, scalars_pandas_df_index): + col_bf = scalars_df_index["int64_col"].iloc[::2] + lower_bf = scalars_df_index["int64_too"].iloc[2:] - 1 + bf_result = col_bf.clip(lower_bf, None).to_pandas() + + col_pd = scalars_pandas_df_index["int64_col"].iloc[::2] + lower_pd = scalars_pandas_df_index["int64_too"].iloc[2:] - 1 + pd_result = col_pd.clip(lower_pd, None) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_dot(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["int64_too"] @ scalars_df["int64_too"] + + pd_result = scalars_pandas_df["int64_too"] @ scalars_pandas_df["int64_too"] + + assert bf_result == pd_result + + +@pytest.mark.parametrize( + ("left", "right", "inclusive"), + [ + (-234892, 55555, "left"), + (-234892, 55555, "both"), + (-234892, 55555, "neither"), + (-234892, 55555, "right"), + ], +) +def test_between(scalars_df_index, scalars_pandas_df_index, left, right, inclusive): + bf_result = ( + scalars_df_index["int64_col"].between(left, right, inclusive).to_pandas() + ) + pd_result = scalars_pandas_df_index["int64_col"].between(left, right, inclusive) + + pd.testing.assert_series_equal( + bf_result, + pd_result.astype(pd.BooleanDtype()), + ) + + +def test_series_case_when(scalars_dfs_maybe_ordered): + pytest.importorskip( + "pandas", + minversion="2.2.0", + reason="case_when added in pandas 2.2.0", + ) + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + + bf_series = scalars_df["int64_col"] + pd_series = scalars_pandas_df["int64_col"] + + # TODO(tswast): pandas case_when appears to assume 
True when a value is + # null. I suspect this should be considered a bug in pandas. + + # Generate 150 conditions to test case_when with a large number of conditions + bf_conditions = ( + [((bf_series > 645).fillna(True), bf_series - 1)] + + [((bf_series > (-100 + i * 5)).fillna(True), i) for i in range(148, 0, -1)] + + [((bf_series <= -100).fillna(True), pd.NA)] + ) + + pd_conditions = ( + [((pd_series > 645), pd_series - 1)] + + [((pd_series > (-100 + i * 5)), i) for i in range(148, 0, -1)] + + [(pd_series <= -100, pd.NA)] + ) + + assert len(bf_conditions) == 150 + + bf_result = bf_series.case_when(bf_conditions).to_pandas() + pd_result = pd_series.case_when(pd_conditions) + + pd.testing.assert_series_equal( + bf_result, + pd_result.astype(pd.Int64Dtype()), + ) + + +def test_series_case_when_change_type(scalars_dfs_maybe_ordered): + pytest.importorskip( + "pandas", + minversion="2.2.0", + reason="case_when added in pandas 2.2.0", + ) + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + + bf_series = scalars_df["int64_col"] + pd_series = scalars_pandas_df["int64_col"] + + # TODO(tswast): pandas case_when appears to assume True when a value is + # null. I suspect this should be considered a bug in pandas. + + bf_conditions = [ + ((bf_series > 645).fillna(True), scalars_df["string_col"]), + ((bf_series <= -100).fillna(True), pd.NA), + (True, "not_found"), + ] + + pd_conditions = [ + ((pd_series > 645).fillna(True), scalars_pandas_df["string_col"]), + ((pd_series <= -100).fillna(True), pd.NA), + # pandas currently fails if both the condition and the value are literals. 
+ ([True] * len(pd_series), ["not_found"] * len(pd_series)), + ] + + bf_result = bf_series.case_when(bf_conditions).to_pandas() + pd_result = pd_series.case_when(pd_conditions) + + pd.testing.assert_series_equal( + bf_result, + pd_result.astype("string[pyarrow]"), + ) + + +def test_to_frame(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["int64_col"].to_frame().to_pandas() + pd_result = scalars_pandas_df["int64_col"].to_frame() + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_to_frame_no_name(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["int64_col"].rename(None).to_frame().to_pandas() + pd_result = scalars_pandas_df["int64_col"].rename(None).to_frame() + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_to_json(gcs_folder, scalars_df_index, scalars_pandas_df_index): + path = gcs_folder + "test_series_to_json*.jsonl" + scalars_df_index["int64_col"].to_json(path, lines=True, orient="records") + gcs_df = pd.read_json(get_first_file_from_wildcard(path), lines=True) + + pd.testing.assert_series_equal( + gcs_df["int64_col"].astype(pd.Int64Dtype()), + scalars_pandas_df_index["int64_col"], + check_dtype=False, + check_index=False, + ) + + +def test_to_csv(gcs_folder, scalars_df_index, scalars_pandas_df_index): + path = gcs_folder + "test_series_to_csv*.csv" + scalars_df_index["int64_col"].to_csv(path) + gcs_df = pd.read_csv(get_first_file_from_wildcard(path)) + + pd.testing.assert_series_equal( + gcs_df["int64_col"].astype(pd.Int64Dtype()), + scalars_pandas_df_index["int64_col"], + check_dtype=False, + check_index=False, + ) + + +def test_to_latex(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index["int64_col"].to_latex() + pd_result = scalars_pandas_df_index["int64_col"].to_latex() + + assert bf_result == pd_result + + +def test_series_to_json_local_str(scalars_df_index, scalars_pandas_df_index): + bf_result = 
scalars_df_index.int64_col.to_json() + pd_result = scalars_pandas_df_index.int64_col.to_json() + + assert bf_result == pd_result + + +def test_series_to_json_local_file(scalars_df_index, scalars_pandas_df_index): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: + scalars_df_index.int64_col.to_json(bf_result_file) + scalars_pandas_df_index.int64_col.to_json(pd_result_file) + + bf_result = bf_result_file.read() + pd_result = pd_result_file.read() + + assert bf_result == pd_result + + +def test_series_to_csv_local_str(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.int64_col.to_csv() + # default_handler for arrow types that have no default conversion + pd_result = scalars_pandas_df_index.int64_col.to_csv() + + assert bf_result == pd_result + + +def test_series_to_csv_local_file(scalars_df_index, scalars_pandas_df_index): + with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: + scalars_df_index.int64_col.to_csv(bf_result_file) + scalars_pandas_df_index.int64_col.to_csv(pd_result_file) + + bf_result = bf_result_file.read() + pd_result = pd_result_file.read() + + assert bf_result == pd_result + + +def test_to_dict(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index["int64_too"].to_dict() + + pd_result = scalars_pandas_df_index["int64_too"].to_dict() + + assert bf_result == pd_result + + +def test_to_excel(scalars_df_index, scalars_pandas_df_index): + pytest.importorskip("openpyxl") + bf_result_file = tempfile.TemporaryFile() + pd_result_file = tempfile.TemporaryFile() + scalars_df_index["int64_too"].to_excel(bf_result_file) + scalars_pandas_df_index["int64_too"].to_excel(pd_result_file) + bf_result = bf_result_file.read() + pd_result = pd_result_file.read() + + assert bf_result == pd_result + + +def
test_to_pickle(scalars_df_index, scalars_pandas_df_index): + bf_result_file = tempfile.TemporaryFile() + pd_result_file = tempfile.TemporaryFile() + scalars_df_index["int64_too"].to_pickle(bf_result_file) + scalars_pandas_df_index["int64_too"].to_pickle(pd_result_file) + bf_result = bf_result_file.read() + pd_result = pd_result_file.read() + + assert bf_result == pd_result + + +def test_to_string(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index["int64_too"].to_string() + + pd_result = scalars_pandas_df_index["int64_too"].to_string() + + assert bf_result == pd_result + + +def test_to_list(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index["int64_too"].to_list() + + pd_result = scalars_pandas_df_index["int64_too"].to_list() + + assert bf_result == pd_result + + +def test_to_numpy(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index["int64_too"].to_numpy() + + pd_result = scalars_pandas_df_index["int64_too"].to_numpy() + + assert (bf_result == pd_result).all() + + +def test_to_xarray(scalars_df_index, scalars_pandas_df_index): + pytest.importorskip("xarray") + bf_result = scalars_df_index["int64_too"].to_xarray() + + pd_result = scalars_pandas_df_index["int64_too"].to_xarray() + + assert bf_result.equals(pd_result) + + +def test_to_markdown(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index["int64_too"].to_markdown() + + pd_result = scalars_pandas_df_index["int64_too"].to_markdown() + + assert bf_result == pd_result + + +def test_series_values(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index["int64_too"].values + + pd_result = scalars_pandas_df_index["int64_too"].values + # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe + pd.testing.assert_series_equal( + pd.Series(bf_result), pd.Series(pd_result), check_dtype=False + ) + + +def test_series___array__(scalars_df_index, scalars_pandas_df_index): + bf_result =
scalars_df_index["float64_col"].__array__() + + pd_result = scalars_pandas_df_index["float64_col"].__array__() + # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe + numpy.array_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("ascending", "na_position"), + [ + (True, "first"), + (True, "last"), + (False, "first"), + (False, "last"), + ], +) +def test_sort_values(scalars_df_index, scalars_pandas_df_index, ascending, na_position): + # Test needs values to be unique + bf_result = ( + scalars_df_index["int64_col"] + .sort_values(ascending=ascending, na_position=na_position) + .to_pandas() + ) + pd_result = scalars_pandas_df_index["int64_col"].sort_values( + ascending=ascending, na_position=na_position + ) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_series_sort_values_inplace(scalars_df_index, scalars_pandas_df_index): + # Test needs values to be unique + bf_series = scalars_df_index["int64_col"].copy() + bf_series.sort_values(ascending=False, inplace=True) + bf_result = bf_series.to_pandas() + pd_result = scalars_pandas_df_index["int64_col"].sort_values(ascending=False) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("ascending"), + [ + True, + False, + ], +) +def test_sort_index(scalars_df_index, scalars_pandas_df_index, ascending): + bf_result = ( + scalars_df_index["int64_too"].sort_index(ascending=ascending).to_pandas() + ) + pd_result = scalars_pandas_df_index["int64_too"].sort_index(ascending=ascending) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_series_sort_index_inplace(scalars_df_index, scalars_pandas_df_index): + bf_series = scalars_df_index["int64_too"].copy() + bf_series.sort_index(ascending=False, inplace=True) + bf_result = bf_series.to_pandas() + pd_result = scalars_pandas_df_index["int64_too"].sort_index(ascending=False) + + pd.testing.assert_series_equal( + bf_result, +
pd_result, + ) + + +def test_mask_default_value(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_col = scalars_df["int64_col"] + bf_col_masked = bf_col.mask(bf_col % 2 == 1) + bf_result = bf_col.to_frame().assign(int64_col_masked=bf_col_masked).to_pandas() + + pd_col = scalars_pandas_df["int64_col"] + pd_col_masked = pd_col.mask(pd_col % 2 == 1) + pd_result = pd_col.to_frame().assign(int64_col_masked=pd_col_masked) + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_mask_custom_value(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_col = scalars_df["int64_col"] + bf_col_masked = bf_col.mask(bf_col % 2 == 1, -1) + bf_result = bf_col.to_frame().assign(int64_col_masked=bf_col_masked).to_pandas() + + pd_col = scalars_pandas_df["int64_col"] + pd_col_masked = pd_col.mask(pd_col % 2 == 1, -1) + pd_result = pd_col.to_frame().assign(int64_col_masked=pd_col_masked) + + # TODO(shobs): There is a pd.NA value in the original series, which is not + # odd so should be left as is, but it is being masked in pandas. + # Accidentally the bigframes behavior matches, but it should be updated + # after the resolution of https://github.com/pandas-dev/pandas/issues/52955 + assert_pandas_df_equal(bf_result, pd_result) + + +def test_mask_with_callable(scalars_df_index, scalars_pandas_df_index): + def _ten_times(x): + return x * 10 + + # Both cond and other are callable.
+ bf_result = ( + scalars_df_index["int64_col"] + .mask(cond=lambda x: x > 0, other=_ten_times) + .to_pandas() + ) + pd_result = scalars_pandas_df_index["int64_col"].mask( + cond=lambda x: x > 0, other=_ten_times + ) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("lambda_",), + [ + pytest.param(lambda x: x > 0), + pytest.param( + lambda x: True if x > 0 else False, + marks=pytest.mark.xfail( + raises=ValueError, + ), + ), + ], + ids=[ + "lambda_arithmatic", + "lambda_arbitrary", + ], +) +def test_mask_lambda(scalars_dfs, lambda_): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_col = scalars_df["int64_col"] + bf_result = bf_col.mask(lambda_).to_pandas() + + pd_col = scalars_pandas_df["int64_col"] + pd_result = pd_col.mask(lambda_) + + # ignore dtype check, which are Int64 and object respectively + assert_series_equal(bf_result, pd_result, check_dtype=False) + + +def test_mask_simple_udf(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + def foo(x): + return x < 1000000 + + bf_col = scalars_df["int64_col"] + bf_result = bf_col.mask(foo).to_pandas() + + pd_col = scalars_pandas_df["int64_col"] + pd_result = pd_col.mask(foo) + + # ignore dtype check, which are Int64 and object respectively + assert_series_equal(bf_result, pd_result, check_dtype=False) + + +@pytest.mark.parametrize("errors", ["raise", "null"]) +@pytest.mark.parametrize( + ("column", "to_type"), + [ + ("int64_col", "Float64"), + ("int64_col", "Int64"), # No-op + ("int64_col", pd.Float64Dtype()), + ("int64_col", "string[pyarrow]"), + ("int64_col", "boolean"), + ("int64_col", pd.ArrowDtype(pa.decimal128(38, 9))), + ("int64_col", pd.ArrowDtype(pa.decimal256(76, 38))), + ("int64_col", pd.ArrowDtype(pa.timestamp("us"))), + ("int64_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))), + ("int64_col", "time64[us][pyarrow]"), + ("int64_col", pd.ArrowDtype(db_dtypes.JSONArrowType())), + ("bool_col", "Int64"), + ("bool_col", "string[pyarrow]"), + 
("bool_col", "Float64"), + ("bool_col", pd.ArrowDtype(db_dtypes.JSONArrowType())), + ("string_col", "binary[pyarrow]"), + ("bytes_col", "string[pyarrow]"), + # pandas actually doesn't let folks convert to/from naive timestamp and + # raises a deprecation warning to use tz_localize/tz_convert instead, + # but BigQuery always stores values as UTC and doesn't have to deal + # with timezone conversions, so we'll allow it. + ("timestamp_col", "date32[day][pyarrow]"), + ("timestamp_col", "time64[us][pyarrow]"), + ("timestamp_col", pd.ArrowDtype(pa.timestamp("us"))), + ("datetime_col", "date32[day][pyarrow]"), + pytest.param( + "datetime_col", + "string[pyarrow]", + marks=pytest.mark.skipif( + pd.__version__.startswith("2.2"), + reason="pandas 2.2 uses T as date/time separator whereas earlier versions use space", + ), + ), + ("datetime_col", "time64[us][pyarrow]"), + ("datetime_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))), + ("date_col", "string[pyarrow]"), + ("date_col", pd.ArrowDtype(pa.timestamp("us"))), + ("date_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))), + ("time_col", "string[pyarrow]"), + # TODO(bmil): fix Ibis bug: BigQuery backend rounds to nearest int + # ("float64_col", "Int64"), + # TODO(bmil): decide whether to fix Ibis bug: BigQuery backend + # formats floats with no decimal places if they have no fractional + # part, and does not switch to scientific notation for > 10^15 + # ("float64_col", "string[pyarrow]") + # TODO(bmil): add any other compatible conversions per + # https://cloud.google.com/bigquery/docs/reference/standard-sql/conversion_functions + ], +) +def test_astype(scalars_df_index, scalars_pandas_df_index, column, to_type, errors): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + bf_result = scalars_df_index[column].astype(to_type, errors=errors).to_pandas() + pd_result = scalars_pandas_df_index[column].astype(to_type) + 
pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_series_astype_python(session): + data = pd.Series(["hello", "world", "3.11", "4000"]) + expected = pd.Series( + [None, None, 3.11, 4000], + dtype="Float64", + index=pd.Index([0, 1, 2, 3], dtype="Int64"), + ) + result = session.read_pandas(data).astype(float, errors="null").to_pandas() + pd.testing.assert_series_equal(result, expected) + + +def test_astype_safe(session): + data = pd.Series(["hello", "world", "3.11", "4000"]) + expected = pd.Series( + [None, None, 3.11, 4000], + dtype="Float64", + index=pd.Index([0, 1, 2, 3], dtype="Int64"), + ) + result = session.read_pandas(data).astype("Float64", errors="null").to_pandas() + pd.testing.assert_series_equal(result, expected) + + +def test_series_astype_w_invalid_error(session): + data = pd.Series(["hello", "world", "3.11", "4000"]) + with pytest.raises(ValueError): + session.read_pandas(data).astype("Float64", errors="bad_value") + + +def test_astype_numeric_to_int(scalars_df_index, scalars_pandas_df_index): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + column = "numeric_col" + to_type = "Int64" + bf_result = scalars_df_index[column].astype(to_type).to_pandas() + # Truncate to int to avoid TypeError + pd_result = ( + scalars_pandas_df_index[column] + .apply(lambda x: None if pd.isna(x) else math.trunc(x)) + .astype(to_type) + ) + pd.testing.assert_series_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("column", "to_type"), + [ + ("timestamp_col", "int64[pyarrow]"), + ("datetime_col", "int64[pyarrow]"), + ("time_col", "int64[pyarrow]"), + ], +) +def test_date_time_astype_int( + scalars_df_index, scalars_pandas_df_index, column, to_type +): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + bf_result = scalars_df_index[column].astype(to_type).to_pandas() + pd_result =
scalars_pandas_df_index[column].astype(to_type) + pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + assert bf_result.dtype == "Int64" + + +def test_string_astype_int(): + pd_series = pd.Series(["4", "-7", "0", " -03"]) + bf_series = series.Series(pd_series) + + pd_result = pd_series.astype("Int64") + bf_result = bf_series.astype("Int64").to_pandas() + + pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + +def test_string_astype_float(): + pd_series = pd.Series( + ["1", "-1", "-0", "000", " -03.235", "naN", "-inf", "INf", ".33", "7.235e-8"] + ) + + bf_series = series.Series(pd_series) + + pd_result = pd_series.astype("Float64") + bf_result = bf_series.astype("Float64").to_pandas() + + pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + +def test_string_astype_date(): + if int(pa.__version__.split(".")[0]) < 15: + pytest.skip( + "Avoid pyarrow.lib.ArrowNotImplementedError: " + "Unsupported cast from string to date32 using function cast_date32." 
+ ) + + pd_series = pd.Series(["2014-08-15", "2215-08-15", "2016-02-29"]).astype( + pd.ArrowDtype(pa.string()) + ) + + bf_series = series.Series(pd_series) + + # TODO(b/340885567): fix type error + pd_result = pd_series.astype("date32[day][pyarrow]") # type: ignore + bf_result = bf_series.astype("date32[day][pyarrow]").to_pandas() + + pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + +def test_string_astype_datetime(): + pd_series = pd.Series( + ["2014-08-15 08:15:12", "2015-08-15 08:15:12.654754", "2016-02-29 00:00:00"] + ).astype(pd.ArrowDtype(pa.string())) + + bf_series = series.Series(pd_series) + + pd_result = pd_series.astype(pd.ArrowDtype(pa.timestamp("us"))) + bf_result = bf_series.astype(pd.ArrowDtype(pa.timestamp("us"))).to_pandas() + + pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + +def test_string_astype_timestamp(): + pd_series = pd.Series( + [ + "2014-08-15 08:15:12+00:00", + "2015-08-15 08:15:12.654754+05:00", + "2016-02-29 00:00:00+08:00", + ] + ).astype(pd.ArrowDtype(pa.string())) + + bf_series = series.Series(pd_series) + + pd_result = pd_series.astype(pd.ArrowDtype(pa.timestamp("us", tz="UTC"))) + bf_result = bf_series.astype( + pd.ArrowDtype(pa.timestamp("us", tz="UTC")) + ).to_pandas() + + pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + +def test_timestamp_astype_string(): + bf_series = series.Series( + [ + "2014-08-15 08:15:12+00:00", + "2015-08-15 08:15:12.654754+05:00", + "2016-02-29 00:00:00+08:00", + ] + ).astype(pd.ArrowDtype(pa.timestamp("us", tz="UTC"))) + + expected_result = pd.Series( + [ + "2014-08-15 08:15:12+00", + "2015-08-15 03:15:12.654754+00", + "2016-02-28 16:00:00+00", + ] + ) + bf_result = bf_series.astype(pa.string()).to_pandas() + + pd.testing.assert_series_equal( + bf_result, expected_result, check_index_type=False, check_dtype=False + ) + assert bf_result.dtype == "string[pyarrow]" + + +@pytest.mark.parametrize("errors", 
["raise", "null"]) +def test_float_astype_json(errors): + data = ["1.25", "2500000000", None, "-12323.24"] + bf_series = series.Series(data, dtype=dtypes.FLOAT_DTYPE) + + bf_result = bf_series.astype(dtypes.JSON_DTYPE, errors=errors) + assert bf_result.dtype == dtypes.JSON_DTYPE + + expected_result = pd.Series(data, dtype=dtypes.JSON_DTYPE) + expected_result.index = expected_result.index.astype("Int64") + pd.testing.assert_series_equal(bf_result.to_pandas(), expected_result) + + +def test_float_astype_json_str(): + data = ["1.25", "2500000000", None, "-12323.24"] + bf_series = series.Series(data, dtype=dtypes.FLOAT_DTYPE) + + bf_result = bf_series.astype("json") + assert bf_result.dtype == dtypes.JSON_DTYPE + + expected_result = pd.Series(data, dtype=dtypes.JSON_DTYPE) + expected_result.index = expected_result.index.astype("Int64") + pd.testing.assert_series_equal(bf_result.to_pandas(), expected_result) + + +@pytest.mark.parametrize("errors", ["raise", "null"]) +def test_string_astype_json(errors): + data = [ + "1", + None, + '["1","3","5"]', + '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}', + ] + bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE) + + bf_result = bf_series.astype(dtypes.JSON_DTYPE, errors=errors) + assert bf_result.dtype == dtypes.JSON_DTYPE + + pd_result = bf_series.to_pandas().astype(dtypes.JSON_DTYPE) + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + + +def test_string_astype_json_in_safe_mode(): + data = ["this is not a valid json string"] + bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE) + bf_result = bf_series.astype(dtypes.JSON_DTYPE, errors="null") + assert bf_result.dtype == dtypes.JSON_DTYPE + + expected = pd.Series([None], dtype=dtypes.JSON_DTYPE) + expected.index = expected.index.astype("Int64") + pd.testing.assert_series_equal(bf_result.to_pandas(), expected) + + +def test_string_astype_json_raise_error(): + data = ["this is not a valid json string"] + bf_series = series.Series(data, 
dtype=dtypes.STRING_DTYPE) + with pytest.raises( + google.api_core.exceptions.BadRequest, + match="syntax error while parsing value", + ): + bf_series.astype(dtypes.JSON_DTYPE, errors="raise").to_pandas() + + +@pytest.mark.parametrize("errors", ["raise", "null"]) +@pytest.mark.parametrize( + ("data", "to_type"), + [ + pytest.param(["1", "10.0", None], dtypes.INT_DTYPE, id="to_int"), + pytest.param(["0.0001", "2500000000", None], dtypes.FLOAT_DTYPE, id="to_float"), + pytest.param(["true", "false", None], dtypes.BOOL_DTYPE, id="to_bool"), + pytest.param(['"str"', None], dtypes.STRING_DTYPE, id="to_string"), + pytest.param( + ['"str"', None], + dtypes.TIME_DTYPE, + id="invalid", + marks=pytest.mark.xfail(raises=TypeError), + ), + ], +) +def test_json_astype_others(data, to_type, errors): + bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE) + + bf_result = bf_series.astype(to_type, errors=errors) + assert bf_result.dtype == to_type + + load_data = [json.loads(item) if item is not None else None for item in data] + expected = pd.Series(load_data, dtype=to_type) + expected.index = expected.index.astype("Int64") + pd.testing.assert_series_equal(bf_result.to_pandas(), expected) + + +@pytest.mark.parametrize( + ("data", "to_type"), + [ + pytest.param(["10.2", None], dtypes.INT_DTYPE, id="to_int"), + pytest.param(["false", None], dtypes.FLOAT_DTYPE, id="to_float"), + pytest.param(["10.2", None], dtypes.BOOL_DTYPE, id="to_bool"), + pytest.param(["true", None], dtypes.STRING_DTYPE, id="to_string"), + ], +) +def test_json_astype_others_raise_error(data, to_type): + bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE) + with pytest.raises(google.api_core.exceptions.BadRequest): + bf_series.astype(to_type, errors="raise").to_pandas() + + +@pytest.mark.parametrize( + ("data", "to_type"), + [ + pytest.param(["10.2", None], dtypes.INT_DTYPE, id="to_int"), + pytest.param(["false", None], dtypes.FLOAT_DTYPE, id="to_float"), + pytest.param(["10.2", None], dtypes.BOOL_DTYPE, 
id="to_bool"), + pytest.param(["true", None], dtypes.STRING_DTYPE, id="to_string"), + ], +) +def test_json_astype_others_in_safe_mode(data, to_type): + bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE) + bf_result = bf_series.astype(to_type, errors="null") + assert bf_result.dtype == to_type + + expected = pd.Series([None, None], dtype=to_type) + expected.index = expected.index.astype("Int64") + pd.testing.assert_series_equal(bf_result.to_pandas(), expected) + + +@pytest.mark.parametrize( + "index", + [0, 5, -2], +) +def test_iloc_single_integer(scalars_df_index, scalars_pandas_df_index, index): + bf_result = scalars_df_index.string_col.iloc[index] + pd_result = scalars_pandas_df_index.string_col.iloc[index] + + assert bf_result == pd_result + + +def test_iloc_single_integer_out_of_bound_error(scalars_df_index): + with pytest.raises(IndexError, match="single positional indexer is out-of-bounds"): + scalars_df_index.string_col.iloc[99] + + +def test_loc_bool_series_explicit_index(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.string_col.loc[scalars_df_index.bool_col].to_pandas() + pd_result = scalars_pandas_df_index.string_col.loc[scalars_pandas_df_index.bool_col] + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_loc_bool_series_default_index( + scalars_df_default_index, scalars_pandas_df_default_index +): + bf_result = scalars_df_default_index.string_col.loc[ + scalars_df_default_index.bool_col + ].to_pandas() + pd_result = scalars_pandas_df_default_index.string_col.loc[ + scalars_pandas_df_default_index.bool_col + ] + + assert_pandas_df_equal( + bf_result.to_frame(), + pd_result.to_frame(), + ) + + +def test_argmin(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.string_col.argmin() + pd_result = scalars_pandas_df_index.string_col.argmin() + assert bf_result == pd_result + + +def test_argmax(scalars_df_index, scalars_pandas_df_index): + bf_result = 
scalars_df_index.int64_too.argmax() + pd_result = scalars_pandas_df_index.int64_too.argmax() + assert bf_result == pd_result + + +def test_series_idxmin(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.string_col.idxmin() + pd_result = scalars_pandas_df_index.string_col.idxmin() + assert bf_result == pd_result + + +def test_series_idxmax(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.int64_too.idxmax() + pd_result = scalars_pandas_df_index.int64_too.idxmax() + assert bf_result == pd_result + + +def test_getattr_attribute_error_when_pandas_has(scalars_df_index): + # asof is implemented in pandas but not in bigframes + with pytest.raises(AttributeError): + scalars_df_index.string_col.asof() + + +def test_getattr_attribute_error(scalars_df_index): + with pytest.raises(AttributeError): + scalars_df_index.string_col.not_a_method() + + +def test_rename(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.string_col.rename("newname") + pd_result = scalars_pandas_df_index.string_col.rename("newname") + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_rename_nonstring(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.string_col.rename((4, 2)) + pd_result = scalars_pandas_df_index.string_col.rename((4, 2)) + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_rename_dict_same_type(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.string_col.rename({1: 100, 2: 200}) + pd_result = scalars_pandas_df_index.string_col.rename({1: 100, 2: 200}) + + pd_result.index = pd_result.index.astype("Int64") + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_rename_axis(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.string_col.rename_axis("newindexname") + pd_result = 
scalars_pandas_df_index.string_col.rename_axis("newindexname") + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_list_string_index(scalars_df_index, scalars_pandas_df_index): + index_list = scalars_pandas_df_index.string_col.iloc[[0, 1, 1, 5]].values + + scalars_df_index = scalars_df_index.set_index("string_col", drop=False) + scalars_pandas_df_index = scalars_pandas_df_index.set_index( + "string_col", drop=False + ) + + bf_result = scalars_df_index.string_col.loc[index_list] + pd_result = scalars_pandas_df_index.string_col.loc[index_list] + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_list_integer_index(scalars_df_index, scalars_pandas_df_index): + index_list = [3, 2, 1, 3, 2, 1] + + bf_result = scalars_df_index.bool_col.loc[index_list] + pd_result = scalars_pandas_df_index.bool_col.loc[index_list] + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_list_multiindex(scalars_df_index, scalars_pandas_df_index): + scalars_df_multiindex = scalars_df_index.set_index(["string_col", "int64_col"]) + scalars_pandas_df_multiindex = scalars_pandas_df_index.set_index( + ["string_col", "int64_col"] + ) + index_list = [("Hello, World!", -234892), ("Hello, World!", 123456789)] + + bf_result = scalars_df_multiindex.int64_too.loc[index_list] + pd_result = scalars_pandas_df_multiindex.int64_too.loc[index_list] + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_iloc_list(scalars_df_index, scalars_pandas_df_index): + index_list = [0, 0, 0, 5, 4, 7] + + bf_result = scalars_df_index.string_col.iloc[index_list] + pd_result = scalars_pandas_df_index.string_col.iloc[index_list] + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_iloc_list_nameless(scalars_df_index, scalars_pandas_df_index): + index_list = [0, 0, 0, 5, 4, 7] + + bf_series = 
scalars_df_index.string_col.rename(None) + bf_result = bf_series.iloc[index_list] + pd_series = scalars_pandas_df_index.string_col.rename(None) + pd_result = pd_series.iloc[index_list] + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_list_nameless(scalars_df_index, scalars_pandas_df_index): + index_list = [0, 0, 0, 5, 4, 7] + + bf_series = scalars_df_index.string_col.rename(None) + bf_result = bf_series.loc[index_list] + + pd_series = scalars_pandas_df_index.string_col.rename(None) + pd_result = pd_series.loc[index_list] + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_bf_series_string_index(scalars_df_index, scalars_pandas_df_index): + pd_string_series = scalars_pandas_df_index.string_col.iloc[[0, 5, 1, 1, 5]] + bf_string_series = scalars_df_index.string_col.iloc[[0, 5, 1, 1, 5]] + + scalars_df_index = scalars_df_index.set_index("string_col") + scalars_pandas_df_index = scalars_pandas_df_index.set_index("string_col") + + bf_result = scalars_df_index.date_col.loc[bf_string_series] + pd_result = scalars_pandas_df_index.date_col.loc[pd_string_series] + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_bf_series_multiindex(scalars_df_index, scalars_pandas_df_index): + pd_string_series = scalars_pandas_df_index.string_col.iloc[[0, 5, 1, 1, 5]] + bf_string_series = scalars_df_index.string_col.iloc[[0, 5, 1, 1, 5]] + + scalars_df_multiindex = scalars_df_index.set_index(["string_col", "int64_col"]) + scalars_pandas_df_multiindex = scalars_pandas_df_index.set_index( + ["string_col", "int64_col"] + ) + + bf_result = scalars_df_multiindex.int64_too.loc[bf_string_series] + pd_result = scalars_pandas_df_multiindex.int64_too.loc[pd_string_series] + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_bf_index_integer_index(scalars_df_index, scalars_pandas_df_index): + pd_index = 
scalars_pandas_df_index.iloc[[0, 5, 1, 1, 5]].index + bf_index = scalars_df_index.iloc[[0, 5, 1, 1, 5]].index + + bf_result = scalars_df_index.date_col.loc[bf_index] + pd_result = scalars_pandas_df_index.date_col.loc[pd_index] + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_single_index_with_duplicate(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.set_index("string_col", drop=False) + scalars_pandas_df_index = scalars_pandas_df_index.set_index( + "string_col", drop=False + ) + index = "Hello, World!" + bf_result = scalars_df_index.date_col.loc[index] + pd_result = scalars_pandas_df_index.date_col.loc[index] + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_single_index_no_duplicate(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.set_index("int64_too", drop=False) + scalars_pandas_df_index = scalars_pandas_df_index.set_index("int64_too", drop=False) + index = -2345 + bf_result = scalars_df_index.date_col.loc[index] + pd_result = scalars_pandas_df_index.date_col.loc[index] + assert bf_result == pd_result + + +def test_series_bool_interpretation_error(scalars_df_index): + with pytest.raises(ValueError): + True if scalars_df_index["string_col"] else False + + +def test_query_job_setters(scalars_dfs): + # if allow_large_results=False, might not create query job + with bigframes.option_context("compute.allow_large_results", True): + job_ids = set() + df, _ = scalars_dfs + series = df["int64_col"] + assert series.query_job is not None + repr(series) + job_ids.add(series.query_job.job_id) + series.to_pandas() + job_ids.add(series.query_job.job_id) + assert len(job_ids) == 2 + + +@pytest.mark.parametrize( + ("series_input",), + [ + ([1, 2, 3, 4, 5],), + ([1, 1, 3, 5, 5],), + ([1, pd.NA, 4, 5, 5],), + ([1, 3, 2, 5, 4],), + ([pd.NA, pd.NA],), + ([1, 1, 1, 1, 1],), + ], +) +def 
test_is_monotonic_increasing(series_input): + scalars_df = series.Series(series_input, dtype=pd.Int64Dtype()) + scalars_pandas_df = pd.Series(series_input, dtype=pd.Int64Dtype()) + assert ( + scalars_df.is_monotonic_increasing == scalars_pandas_df.is_monotonic_increasing + ) + + +@pytest.mark.parametrize( + ("series_input",), + [ + ([1],), + ([5, 4, 3, 2, 1],), + ([5, 5, 3, 1, 1],), + ([1, pd.NA, 4, 5, 5],), + ([5, pd.NA, 4, 2, 1],), + ([1, 1, 1, 1, 1],), + ], +) +def test_is_monotonic_decreasing(series_input): + scalars_df = series.Series(series_input) + scalars_pandas_df = pd.Series(series_input) + assert ( + scalars_df.is_monotonic_decreasing == scalars_pandas_df.is_monotonic_decreasing + ) + + +def test_map_dict_input(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + local_map = dict() + # construct a local map, incomplete to cover behavior + for s in scalars_pandas_df.string_col[:-3]: + if isinstance(s, str): + local_map[s] = ord(s[0]) + + pd_result = scalars_pandas_df.string_col.map(local_map) + pd_result = pd_result.astype("Int64") # pandas type differences + bf_result = scalars_df.string_col.map(local_map) + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_map_series_input(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + new_index = scalars_pandas_df.int64_too.drop_duplicates() + pd_map_series = scalars_pandas_df.string_col.iloc[0 : len(new_index)] + pd_map_series.index = new_index + bf_map_series = series.Series( + pd_map_series, session=scalars_df._get_block().expr.session + ) + + pd_result = scalars_pandas_df.int64_too.map(pd_map_series) + bf_result = scalars_df.int64_too.map(bf_map_series) + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_map_series_input_duplicates_error(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + new_index = scalars_pandas_df.int64_too + pd_map_series = scalars_pandas_df.string_col.iloc[0 : 
len(new_index)] + pd_map_series.index = new_index + bf_map_series = series.Series( + pd_map_series, session=scalars_df._get_block().expr.session + ) + + with pytest.raises(pd.errors.InvalidIndexError): + scalars_pandas_df.int64_too.map(pd_map_series) + with pytest.raises(pd.errors.InvalidIndexError): + scalars_df.int64_too.map(bf_map_series, verify_integrity=True) + + +@pytest.mark.parametrize( + ("frac", "n", "random_state"), + [ + (None, 4, None), + (0.5, None, None), + (None, 4, 10), + (0.5, None, 10), + (None, None, None), + ], + ids=[ + "n_wo_random_state", + "frac_wo_random_state", + "n_w_random_state", + "frac_w_random_state", + "n_default", + ], +) +def test_sample(scalars_dfs, frac, n, random_state): + scalars_df, _ = scalars_dfs + df = scalars_df.int64_col.sample(frac=frac, n=n, random_state=random_state) + bf_result = df.to_pandas() + + n = 1 if n is None else n + expected_sample_size = round(frac * scalars_df.shape[0]) if frac is not None else n + assert bf_result.shape[0] == expected_sample_size + + +def test_series_iter( + scalars_df_index, + scalars_pandas_df_index, +): + for bf_i, pd_i in zip( + scalars_df_index["int64_too"], scalars_pandas_df_index["int64_too"] + ): + assert bf_i == pd_i + + +@pytest.mark.parametrize( + ( + "col", + "lambda_", + ), + [ + pytest.param("int64_col", lambda x: x * x + x + 1), + pytest.param("int64_col", lambda x: x % 2 == 1), + pytest.param("string_col", lambda x: x + "_suffix"), + ], + ids=[ + "lambda_int_int", + "lambda_int_bool", + "lambda_str_str", + ], +) +def test_apply_lambda(scalars_dfs, col, lambda_): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_col = scalars_df[col] + + # Can't be applied to BigFrames Series without by_row=False + with pytest.raises(ValueError, match="by_row=False"): + bf_col.apply(lambda_) + + bf_result = bf_col.apply(lambda_, by_row=False).to_pandas() + + pd_col = scalars_pandas_df[col] + if pd.__version__[:3] in ("2.2", "2.3"): + pd_result = pd_col.apply(lambda_, by_row=False) + 
else: + pd_result = pd_col.apply(lambda_) + + # ignore dtype check, which are Int64 and object respectively + # Some columns implicitly convert to floating point. Use check_exact=False to ensure we're "close enough" + assert_series_equal( + bf_result, pd_result, check_dtype=False, check_exact=False, rtol=0.001 + ) + + +@pytest.mark.parametrize( + ("ufunc",), + [ + pytest.param(numpy.log), + pytest.param(numpy.sqrt), + pytest.param(numpy.sin), + ], + ids=[ + "log", + "sqrt", + "sin", + ], +) +def test_apply_numpy_ufunc(scalars_dfs, ufunc): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_col = scalars_df["int64_col"] + + # Can't be applied to BigFrames Series without by_row=False + with pytest.raises(ValueError, match="by_row=False"): + bf_col.apply(ufunc) + + bf_result = bf_col.apply(ufunc, by_row=False).to_pandas() + + pd_col = scalars_pandas_df["int64_col"] + pd_result = pd_col.apply(ufunc) + + assert_series_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("ufunc",), + [ + pytest.param(numpy.add), + pytest.param(numpy.divide), + ], + ids=[ + "add", + "divide", + ], +) +def test_combine_series_ufunc(scalars_dfs, ufunc): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_col = scalars_df["int64_col"].dropna() + bf_result = bf_col.combine(bf_col, ufunc).to_pandas() + + pd_col = scalars_pandas_df["int64_col"].dropna() + pd_result = pd_col.combine(pd_col, ufunc) + + assert_series_equal(bf_result, pd_result, check_dtype=False) + + +def test_combine_scalar_ufunc(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_col = scalars_df["int64_col"].dropna() + bf_result = bf_col.combine(2.5, numpy.add).to_pandas() + + pd_col = scalars_pandas_df["int64_col"].dropna() + pd_result = pd_col.combine(2.5, numpy.add) + + assert_series_equal(bf_result, pd_result, check_dtype=False) + + +def test_apply_simple_udf(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + def foo(x): + return x * x + 2 * x + 3 + + bf_col = scalars_df["int64_col"] + 
+ # Can't be applied to BigFrames Series without by_row=False + with pytest.raises(ValueError, match="by_row=False"): + bf_col.apply(foo) + + bf_result = bf_col.apply(foo, by_row=False).to_pandas() + + pd_col = scalars_pandas_df["int64_col"] + + if pd.__version__[:3] in ("2.2", "2.3"): + pd_result = pd_col.apply(foo, by_row=False) + else: + pd_result = pd_col.apply(foo) + + # ignore dtype check, which are Int64 and object respectively + # Some columns implicitly convert to floating point. Use check_exact=False to ensure we're "close enough" + assert_series_equal( + bf_result, pd_result, check_dtype=False, check_exact=False, rtol=0.001 + ) + + +@pytest.mark.parametrize( + ("col", "lambda_", "exception"), + [ + pytest.param("int64_col", {1: 2, 3: 4}, ValueError), + pytest.param("int64_col", numpy.square, TypeError), + pytest.param("string_col", lambda x: x.capitalize(), AttributeError), + ], + ids=[ + "not_callable", + "numpy_ufunc", + "custom_lambda", + ], +) +def test_apply_not_supported(scalars_dfs, col, lambda_, exception): + scalars_df, _ = scalars_dfs + + bf_col = scalars_df[col] + with pytest.raises(exception): + bf_col.apply(lambda_, by_row=False) + + +def test_series_pipe( + scalars_df_index, + scalars_pandas_df_index, +): + column = "int64_too" + + def foo(x: int, y: int, df): + return (df + x) % y + + bf_result = ( + scalars_df_index[column] + .pipe((foo, "df"), x=7, y=9) + .pipe(lambda x: x**2) + .to_pandas() + ) + + pd_result = ( + scalars_pandas_df_index[column] + .pipe((foo, "df"), x=7, y=9) + .pipe(lambda x: x**2) + ) + + assert_series_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("data"), + [ + pytest.param([1, 2, 3], id="int"), + pytest.param([[1, 2, 3], [], numpy.nan, [3, 4]], id="int_array"), + pytest.param( + [["A", "AA", "AAA"], ["BB", "B"], numpy.nan, [], ["C"]], id="string_array" + ), + pytest.param( + [ + {"A": {"x": 1.0}, "B": "b"}, + {"A": {"y": 2.0}, "B": "bb"}, + {"A": {"z": 4.0}}, + {}, + numpy.nan, + ], + 
id="struct_array", + ), + ], +) +def test_series_explode(data): + s = bigframes.pandas.Series(data) + pd_s = s.to_pandas() + pd.testing.assert_series_equal( + s.explode().to_pandas(), + pd_s.explode(), + check_index_type=False, + check_dtype=False, + ) + + +@pytest.mark.parametrize( + ("index", "ignore_index"), + [ + pytest.param(None, True, id="default_index"), + pytest.param(None, False, id="ignore_default_index"), + pytest.param([5, 1, 3, 2], True, id="unordered_index"), + pytest.param([5, 1, 3, 2], False, id="ignore_unordered_index"), + pytest.param(["z", "x", "a", "b"], True, id="str_index"), + pytest.param(["z", "x", "a", "b"], False, id="ignore_str_index"), + pytest.param( + pd.Index(["z", "x", "a", "b"], name="idx"), True, id="str_named_index" + ), + pytest.param( + pd.Index(["z", "x", "a", "b"], name="idx"), + False, + id="ignore_str_named_index", + ), + pytest.param( + pd.MultiIndex.from_frame( + pd.DataFrame({"idx0": [5, 1, 3, 2], "idx1": ["z", "x", "a", "b"]}) + ), + True, + id="multi_index", + ), + pytest.param( + pd.MultiIndex.from_frame( + pd.DataFrame({"idx0": [5, 1, 3, 2], "idx1": ["z", "x", "a", "b"]}) + ), + False, + id="ignore_multi_index", + ), + ], +) +def test_series_explode_w_index(index, ignore_index): + data = [[], [200.0, 23.12], [4.5, -9.0], [1.0]] + s = bigframes.pandas.Series(data, index=index) + pd_s = pd.Series(data, index=index) + # TODO(b/340885567): fix type error + pd.testing.assert_series_equal( + s.explode(ignore_index=ignore_index).to_pandas(), # type: ignore + pd_s.explode(ignore_index=ignore_index).astype(pd.Float64Dtype()), # type: ignore + check_index_type=False, + ) + + +@pytest.mark.parametrize( + ("ignore_index", "ordered"), + [ + pytest.param(True, True, id="include_index_ordered"), + pytest.param(True, False, id="include_index_unordered"), + pytest.param(False, True, id="ignore_index_ordered"), + ], +) +def test_series_explode_reserve_order(ignore_index, ordered): + data = [numpy.random.randint(0, 10, 10) for _ in 
range(10)] + s = bigframes.pandas.Series(data) + pd_s = pd.Series(data) + + # TODO(b/340885567): fix type error + res = s.explode(ignore_index=ignore_index).to_pandas(ordered=ordered) # type: ignore + # TODO(b/340885567): fix type error + pd_res = pd_s.explode(ignore_index=ignore_index).astype(pd.Int64Dtype()) # type: ignore + pd_res.index = pd_res.index.astype(pd.Int64Dtype()) + pd.testing.assert_series_equal( + res if ordered else res.sort_index(), + pd_res, + ) + + +def test_series_explode_w_aggregate(): + data = [[1, 2, 3], [], numpy.nan, [3, 4]] + s = bigframes.pandas.Series(data) + pd_s = pd.Series(data) + assert s.explode().sum() == pd_s.explode().sum() + + +def test_series_construct_empty_array(): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + s = bigframes.pandas.Series([[]]) + expected = pd.Series( + [[]], + dtype=pd.ArrowDtype(pa.list_(pa.float64())), + index=pd.Index([0], dtype=pd.Int64Dtype()), + ) + pd.testing.assert_series_equal( + expected, + s.to_pandas(), + ) + + +@pytest.mark.parametrize( + ("data"), + [ + pytest.param(numpy.nan, id="null"), + pytest.param([numpy.nan], id="null_array"), + pytest.param([[]], id="empty_array"), + pytest.param([numpy.nan, []], id="null_and_empty_array"), + ], +) +def test_series_explode_null(data): + s = bigframes.pandas.Series(data) + pd.testing.assert_series_equal( + s.explode().to_pandas(), + s.to_pandas().explode(), + check_dtype=False, + ) + + +@pytest.mark.parametrize( + ("append", "level", "col", "rule"), + [ + pytest.param(False, None, "timestamp_col", "75D"), + pytest.param(True, 1, "timestamp_col", "25W"), + pytest.param(False, None, "datetime_col", "3ME"), + pytest.param(True, "timestamp_col", "timestamp_col", "1YE"), + ], +) +def test__resample(scalars_df_index, scalars_pandas_df_index, append, level, col, rule): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") 
+ scalars_df_index = scalars_df_index.set_index(col, append=append)["int64_col"] + scalars_pandas_df_index = scalars_pandas_df_index.set_index(col, append=append)[ + "int64_col" + ] + bf_result = scalars_df_index._resample(rule=rule, level=level).min().to_pandas() + pd_result = scalars_pandas_df_index.resample(rule=rule, level=level).min() + pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_series_struct_get_field_by_attribute( + nested_structs_df, nested_structs_pandas_df +): + if Version(pd.__version__) < Version("2.2.0"): + pytest.skip("struct accessor is not supported before pandas 2.2") + + bf_series = nested_structs_df["person"] + df_series = nested_structs_pandas_df["person"] + + pd.testing.assert_series_equal( + bf_series.address.city.to_pandas(), + df_series.struct.field("address").struct.field("city"), + check_dtype=False, + check_index=False, + ) + pd.testing.assert_series_equal( + bf_series.address.country.to_pandas(), + df_series.struct.field("address").struct.field("country"), + check_dtype=False, + check_index=False, + ) + + +def test_series_struct_fields_in_dir(nested_structs_df): + series = nested_structs_df["person"] + + assert "age" in dir(series) + assert "address" in dir(series) + assert "city" in dir(series.address) + assert "country" in dir(series.address) + + +def test_series_struct_class_attributes_shadow_struct_fields(nested_structs_df): + series = nested_structs_df["person"] + + assert series.name == "person" + + +def test_series_to_pandas_dry_run(scalars_df_index): + bf_series = scalars_df_index["int64_col"] + + result = bf_series.to_pandas(dry_run=True) + + assert isinstance(result, pd.Series) + assert len(result) > 0 + + +def test_series_item(session): + # Test with a single item + bf_s_single = bigframes.pandas.Series([42], session=session) + pd_s_single = pd.Series([42]) + assert bf_s_single.item() == pd_s_single.item() + + +def test_series_item_with_multiple(session): + # Test with multiple items + bf_s_multiple = 
bigframes.pandas.Series([1, 2, 3], session=session) + pd_s_multiple = pd.Series([1, 2, 3]) + + try: + pd_s_multiple.item() + except ValueError as e: + expected_message = str(e) + else: + raise AssertionError("Expected ValueError from pandas, but didn't get one") + + with pytest.raises(ValueError, match=re.escape(expected_message)): + bf_s_multiple.item() + + +def test_series_item_with_empty(session): + # Test with an empty Series + bf_s_empty = bigframes.pandas.Series([], dtype="Int64", session=session) + pd_s_empty = pd.Series([], dtype="Int64") + + try: + pd_s_empty.item() + except ValueError as e: + expected_message = str(e) + else: + raise AssertionError("Expected ValueError from pandas, but didn't get one") + + with pytest.raises(ValueError, match=re.escape(expected_message)): + bf_s_empty.item() From 765b678b34a7976aef1017d2a1fdb34d7a4cfbe4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 21:38:20 +0000 Subject: [PATCH 20/36] restore a test --- tests/unit/test_series_polars.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_series_polars.py b/tests/unit/test_series_polars.py index 8c24a28f43..d26bdd93d2 100644 --- a/tests/unit/test_series_polars.py +++ b/tests/unit/test_series_polars.py @@ -376,9 +376,9 @@ def test_series_construct_w_dtype_for_array_struct(): ) -def test_series_construct_local_unordered_has_sequential_index(unordered_session): +def test_series_construct_local_unordered_has_sequential_index(session): series = bigframes.pandas.Series( - ["Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"], session=unordered_session + ["Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"], session=session ) expected: pd.Index = pd.Index([0, 1, 2, 3, 4, 5, 6], dtype=pd.Int64Dtype()) pd.testing.assert_index_equal(series.index.to_pandas(), expected) From 4aa47a865899292e930c33e015ee92d5c35919f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 21:39:19 +0000 
Subject: [PATCH 21/36] Revert "restore a test" This reverts commit 765b678b34a7976aef1017d2a1fdb34d7a4cfbe4. --- tests/unit/test_series_polars.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_series_polars.py b/tests/unit/test_series_polars.py index d26bdd93d2..8c24a28f43 100644 --- a/tests/unit/test_series_polars.py +++ b/tests/unit/test_series_polars.py @@ -376,9 +376,9 @@ def test_series_construct_w_dtype_for_array_struct(): ) -def test_series_construct_local_unordered_has_sequential_index(session): +def test_series_construct_local_unordered_has_sequential_index(unordered_session): series = bigframes.pandas.Series( - ["Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"], session=session + ["Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"], session=unordered_session ) expected: pd.Index = pd.Index([0, 1, 2, 3, 4, 5, 6], dtype=pd.Int64Dtype()) pd.testing.assert_index_equal(series.index.to_pandas(), expected) From f75f5bf46c10b9da4d89f763d0a0a0c9b749084b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 22:10:43 +0000 Subject: [PATCH 22/36] skip null --- tests/unit/test_series_polars.py | 42 ++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/tests/unit/test_series_polars.py b/tests/unit/test_series_polars.py index 8c24a28f43..b7a2d17022 100644 --- a/tests/unit/test_series_polars.py +++ b/tests/unit/test_series_polars.py @@ -75,6 +75,20 @@ def scalars_pandas_df_index() -> pd.DataFrame: return df.set_index("rowindex").sort_index() +@pytest.fixture(scope="module") +def scalars_df_default_index( + session: bigframes.Session, scalars_pandas_df_index +) -> bpd.DataFrame: + return session.read_pandas(scalars_pandas_df_index).reset_index(drop=False) + + +@pytest.fixture(scope="module") +def scalars_df_2_default_index( + session: bigframes.Session, scalars_pandas_df_index +) -> bpd.DataFrame: + return 
session.read_pandas(scalars_pandas_df_index).reset_index(drop=False) + + @pytest.fixture(scope="module") def scalars_df_index( session: bigframes.Session, scalars_pandas_df_index @@ -376,9 +390,9 @@ def test_series_construct_w_dtype_for_array_struct(): ) -def test_series_construct_local_unordered_has_sequential_index(unordered_session): +def test_series_construct_local_unordered_has_sequential_index(session): series = bigframes.pandas.Series( - ["Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"], session=unordered_session + ["Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"], session=session ) expected: pd.Index = pd.Index([0, 1, 2, 3, 4, 5, 6], dtype=pd.Int64Dtype()) pd.testing.assert_index_equal(series.index.to_pandas(), expected) @@ -469,13 +483,6 @@ def test_get_column(scalars_dfs, col_name, expected_dtype): assert series_pandas.shape[0] == scalars_pandas_df.shape[0] -def test_get_column_w_json(json_df, json_pandas_df): - series = json_df["json_col"] - series_pandas = series.to_pandas() - assert series.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType()) - assert series_pandas.shape[0] == json_pandas_df.shape[0] - - def test_series_get_column_default(scalars_dfs): scalars_df, _ = scalars_dfs result = scalars_df.get(123123123123123, "default_val") @@ -1062,7 +1069,22 @@ def test_series_pow_scalar_reverse(scalars_dfs): "xor", ], ) -@pytest.mark.parametrize(("other_scalar"), [True, False, pd.NA]) +@pytest.mark.parametrize( + ("other_scalar"), + [ + True, + False, + pytest.param( + pd.NA, + marks=[ + pytest.mark.skip( + reason="https://github.com/pola-rs/polars/issues/24809" + ) + ], + id="NULL", + ), + ], +) @pytest.mark.parametrize(("reverse_operands"), [True, False]) def test_series_bool_bool_operators_scalar( scalars_dfs, operator, other_scalar, reverse_operands From a7058acefe8abb6927a6ca59a42b86f4149ce70f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 22:29:43 +0000 Subject: [PATCH 23/36] skip unsupported tests --- 
tests/unit/test_series_polars.py | 123 ++++++++++++++++++++++++++++++- 1 file changed, 119 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_series_polars.py b/tests/unit/test_series_polars.py index b7a2d17022..3eb1a3c095 100644 --- a/tests/unit/test_series_polars.py +++ b/tests/unit/test_series_polars.py @@ -1579,6 +1579,7 @@ def test_isin_bigframes_index(scalars_dfs, session): ) +@pytest.mark.skip(reason="fixture 'scalars_dfs_maybe_ordered' not found") @pytest.mark.parametrize( ( "col_name", @@ -1946,14 +1947,42 @@ def test_mean(scalars_dfs): @pytest.mark.parametrize( ("col_name"), [ - "int64_col", + pytest.param( + "int64_col", + marks=[ + pytest.mark.skip( + reason="pyarrow.lib.ArrowInvalid: Float value 27778.500000 was truncated converting to int64" + ) + ], + ), # Non-numeric column - "bytes_col", + pytest.param( + "bytes_col", + marks=[ + pytest.mark.skip( + reason="polars.exceptions.InvalidOperationError: `median` operation not supported for dtype `binary`" + ) + ], + ), "date_col", "datetime_col", - "time_col", + pytest.param( + "time_col", + marks=[ + pytest.mark.skip( + reason="pyarrow.lib.ArrowInvalid: Casting from time64[ns] to time64[us] would lose data: 42651538080500" + ) + ], + ), "timestamp_col", - "string_col", + pytest.param( + "string_col", + marks=[ + pytest.mark.skip( + reason="polars.exceptions.InvalidOperationError: `median` operation not supported for dtype `str`" + ) + ], + ), ], ) def test_median(scalars_dfs, col_name): @@ -2146,6 +2175,9 @@ def test_groupby_mean(scalars_dfs): ) +@pytest.mark.skip( + reason="Aggregate op QuantileOp(q=0.5, should_floor_result=False) not yet supported in polars engine." 
+) def test_groupby_median_exact(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_too" @@ -2164,6 +2196,9 @@ def test_groupby_median_exact(scalars_dfs): ) +@pytest.mark.skip( + reason="pyarrow.lib.ArrowInvalid: Float value -1172.500000 was truncated converting to int64" +) def test_groupby_median_inexact(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_too" @@ -2204,6 +2239,7 @@ def test_groupby_prod(scalars_dfs): ) +@pytest.mark.skip(reason="AssertionError: Series are different") @pytest.mark.parametrize( ("operator"), [ @@ -2270,6 +2306,7 @@ def test_drop_label_list(scalars_df_index, scalars_pandas_df_index): ) +@pytest.mark.skip(reason="AssertionError: Series.index are different") @pytest.mark.parametrize( ("col_name",), [ @@ -2294,6 +2331,7 @@ def test_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, col_na ) +@pytest.mark.skip(reason="TypeError: boolean value of NA is ambiguous") @pytest.mark.parametrize( ("col_name",), [ @@ -2307,6 +2345,7 @@ def test_unique(scalars_df_index, scalars_pandas_df_index, col_name): numpy.array_equal(pd_uniq, bf_uniq) +@pytest.mark.skip(reason="AssertionError: Series are different") @pytest.mark.parametrize( ("col_name",), [ @@ -2639,6 +2678,9 @@ def test_cumsum_int_ordered(scalars_df_index, scalars_pandas_df_index): ) +@pytest.mark.skip( + reason="NotImplementedError: Aggregate op RankOp() not yet supported in polars engine." +) @pytest.mark.parametrize( ("keep",), [ @@ -2700,6 +2742,9 @@ def test_series_pct_change(scalars_df_index, scalars_pandas_df_index, periods): ) +@pytest.mark.skip( + reason="NotImplementedError: Aggregate op RankOp() not yet supported in polars engine." +) @pytest.mark.parametrize( ("keep",), [ @@ -2719,6 +2764,9 @@ def test_series_nsmallest(scalars_df_index, scalars_pandas_df_index, keep): ) +@pytest.mark.skip( + reason="NotImplementedError: Aggregate op DenseRankOp() not yet supported in polars engine." 
+) @pytest.mark.parametrize( ("na_option", "method", "ascending", "numeric_only", "pct"), [ @@ -2810,6 +2858,9 @@ def test_cumsum_nested(scalars_df_index, scalars_pandas_df_index): ) +@pytest.mark.skip( + reason="NotImplementedError: min_period not yet supported for polars engine" +) def test_nested_analytic_ops_align(scalars_df_index, scalars_pandas_df_index): # TODO: supply a reason why this isn't compatible with pandas 1.x pytest.importorskip("pandas", minversion="2.0.0") @@ -2941,6 +2992,9 @@ def test_value_counts_with_na(scalars_dfs): ) +@pytest.mark.skip( + reason="NotImplementedError: Aggregate op CutOp(bins=3, right=True, labels=False) not yet supported in polars engine." +) def test_value_counts_w_cut(scalars_dfs): if pd.__version__.startswith("1."): pytest.skip("value_counts results different in pandas 1.x.") @@ -3208,6 +3262,9 @@ def _is_positive(x): ) +@pytest.mark.skip( + reason="NotImplementedError: Polars compiler hasn't implemented ClipOp()" +) @pytest.mark.parametrize( ("ordered"), [ @@ -3229,6 +3286,9 @@ def test_clip(scalars_df_index, scalars_pandas_df_index, ordered): assert_series_equal(bf_result, pd_result, ignore_order=not ordered) +@pytest.mark.skip( + reason="NotImplementedError: Polars compiler hasn't implemented ClipOp()" +) def test_clip_int_with_float_bounds(scalars_df_index, scalars_pandas_df_index): col_bf = scalars_df_index["int64_too"] bf_result = col_bf.clip(-100, 3.14151593).to_pandas() @@ -3240,6 +3300,9 @@ def test_clip_int_with_float_bounds(scalars_df_index, scalars_pandas_df_index): assert_series_equal(bf_result, pd_result) +@pytest.mark.skip( + reason="NotImplementedError: Polars compiler hasn't implemented ClipOp()" +) def test_clip_filtered_two_sided(scalars_df_index, scalars_pandas_df_index): col_bf = scalars_df_index["int64_col"].iloc[::2] lower_bf = scalars_df_index["int64_too"].iloc[2:] - 1 @@ -3257,6 +3320,9 @@ def test_clip_filtered_two_sided(scalars_df_index, scalars_pandas_df_index): ) +@pytest.mark.skip( + 
reason="NotImplementedError: Polars compiler hasn't implemented maximum()" +) def test_clip_filtered_one_sided(scalars_df_index, scalars_pandas_df_index): col_bf = scalars_df_index["int64_col"].iloc[::2] lower_bf = scalars_df_index["int64_too"].iloc[2:] - 1 @@ -3302,6 +3368,7 @@ def test_between(scalars_df_index, scalars_pandas_df_index, left, right, inclusi ) +@pytest.mark.skip(reason="fixture 'scalars_dfs_maybe_ordered' not found") def test_series_case_when(scalars_dfs_maybe_ordered): pytest.importorskip( "pandas", @@ -3340,6 +3407,7 @@ def test_series_case_when(scalars_dfs_maybe_ordered): ) +@pytest.mark.skip(reason="fixture 'scalars_dfs_maybe_ordered' not found") def test_series_case_when_change_type(scalars_dfs_maybe_ordered): pytest.importorskip( "pandas", @@ -3394,6 +3462,7 @@ def test_to_frame_no_name(scalars_dfs): assert_pandas_df_equal(bf_result, pd_result) +@pytest.mark.skip(reason="fixture 'gcs_folder' not found") def test_to_json(gcs_folder, scalars_df_index, scalars_pandas_df_index): path = gcs_folder + "test_series_to_json*.jsonl" scalars_df_index["int64_col"].to_json(path, lines=True, orient="records") @@ -3407,6 +3476,7 @@ def test_to_json(gcs_folder, scalars_df_index, scalars_pandas_df_index): ) +@pytest.mark.skip(reason="fixture 'gcs_folder' not found") def test_to_csv(gcs_folder, scalars_df_index, scalars_pandas_df_index): path = gcs_folder + "test_series_to_csv*.csv" scalars_df_index["int64_col"].to_csv(path) @@ -3723,6 +3793,9 @@ def foo(x): assert_series_equal(bf_result, pd_result, check_dtype=False) +@pytest.mark.skip( + reason="polars.exceptions.InvalidOperationError: decimal precision should be <= 38 & >= 1" +) @pytest.mark.parametrize("errors", ["raise", "null"]) @pytest.mark.parametrize( ("column", "to_type"), @@ -3784,6 +3857,9 @@ def test_astype(scalars_df_index, scalars_pandas_df_index, column, to_type, erro pd.testing.assert_series_equal(bf_result, pd_result) +@pytest.mark.skip( + reason="AttributeError: 'DataFrame' object has no 
attribute 'dtype'. Did you mean: 'dtypes'?" +) def test_series_astype_python(session): input = pd.Series(["hello", "world", "3.11", "4000"]) exepcted = pd.Series( @@ -3795,6 +3871,9 @@ def test_series_astype_python(session): pd.testing.assert_series_equal(result, exepcted) +@pytest.mark.skip( + reason="AttributeError: 'DataFrame' object has no attribute 'dtype'. Did you mean: 'dtypes'?" +) def test_astype_safe(session): input = pd.Series(["hello", "world", "3.11", "4000"]) exepcted = pd.Series( @@ -3846,6 +3925,9 @@ def test_date_time_astype_int( assert bf_result.dtype == "Int64" +@pytest.mark.skip( + reason="polars.exceptions.InvalidOperationError: conversion from `str` to `i64` failed in column 'column_0' for 1 out of 4 values: [' -03']" +) def test_string_astype_int(): pd_series = pd.Series(["4", "-7", "0", " -03"]) bf_series = series.Series(pd_series) @@ -3856,6 +3938,9 @@ def test_string_astype_int(): pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) +@pytest.mark.skip( + reason="polars.exceptions.InvalidOperationError: conversion from `str` to `f64` failed in column 'column_0' for 1 out of 10 values: [' -03.235']" +) def test_string_astype_float(): pd_series = pd.Series( ["1", "-1", "-0", "000", " -03.235", "naN", "-inf", "INf", ".33", "7.235e-8"] @@ -3921,6 +4006,7 @@ def test_string_astype_timestamp(): pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) +@pytest.mark.skip(reason="AssertionError: Series are different") def test_timestamp_astype_string(): bf_series = series.Series( [ @@ -3945,6 +4031,7 @@ def test_timestamp_astype_string(): assert bf_result.dtype == "string[pyarrow]" +@pytest.mark.skip(reason="AssertionError: Series are different") @pytest.mark.parametrize("errors", ["raise", "null"]) def test_float_astype_json(errors): data = ["1.25", "2500000000", None, "-12323.24"] @@ -3958,6 +4045,7 @@ def test_float_astype_json(errors): pd.testing.assert_series_equal(bf_result.to_pandas(), 
expected_result) +@pytest.mark.skip(reason="AssertionError: Series are different") def test_float_astype_json_str(): data = ["1.25", "2500000000", None, "-12323.24"] bf_series = series.Series(data, dtype=dtypes.FLOAT_DTYPE) @@ -3987,6 +4075,7 @@ def test_string_astype_json(errors): pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) +@pytest.mark.skip(reason="AssertionError: Series NA mask are different") def test_string_astype_json_in_safe_mode(): data = ["this is not a valid json string"] bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE) @@ -3998,6 +4087,9 @@ def test_string_astype_json_in_safe_mode(): pd.testing.assert_series_equal(bf_result.to_pandas(), expected) +@pytest.mark.skip( + reason="Failed: DID NOT RAISE " +) def test_string_astype_json_raise_error(): data = ["this is not a valid json string"] bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE) @@ -4036,6 +4128,9 @@ def test_json_astype_others(data, to_type, errors): pd.testing.assert_series_equal(bf_result.to_pandas(), expected) +@pytest.mark.skip( + reason="Failed: DID NOT RAISE " +) @pytest.mark.parametrize( ("data", "to_type"), [ @@ -4051,6 +4146,7 @@ def test_json_astype_others_raise_error(data, to_type): bf_series.astype(to_type, errors="raise").to_pandas() +@pytest.mark.skip(reason="AssertionError: Series NA mask are different") @pytest.mark.parametrize( ("data", "to_type"), [ @@ -4096,6 +4192,7 @@ def test_loc_bool_series_explicit_index(scalars_df_index, scalars_pandas_df_inde ) +@pytest.mark.skip(reason="fixture 'scalars_pandas_df_default_index' not found") def test_loc_bool_series_default_index( scalars_df_default_index, scalars_pandas_df_default_index ): @@ -4350,6 +4447,9 @@ def test_series_bool_interpretation_error(scalars_df_index): True if scalars_df_index["string_col"] else False +@pytest.mark.skip( + reason="NotImplementedError: dry_run not implemented for this executor" +) def test_query_job_setters(scalars_dfs): # if allow_large_results=False, might not 
create query job with bigframes.option_context("compute.allow_large_results", True): @@ -4456,6 +4556,9 @@ def test_map_series_input_duplicates_error(scalars_dfs): scalars_df.int64_too.map(bf_map_series, verify_integrity=True) +@pytest.mark.skip( + reason="NotImplementedError: Polars compiler hasn't implemented hash()" +) @pytest.mark.parametrize( ("frac", "n", "random_state"), [ @@ -4533,6 +4636,9 @@ def test_apply_lambda(scalars_dfs, col, lambda_): ) +@pytest.mark.skip( + reason="NotImplementedError: Polars compiler hasn't implemented log()" +) @pytest.mark.parametrize( ("ufunc",), [ @@ -4812,6 +4918,9 @@ def test_series_explode_null(data): ) +@pytest.mark.skip( + reason="NotImplementedError: Polars compiler hasn't implemented IntegerLabelToDatetimeOp(freq=<75 * Days>, label=None, origin='start_day')" +) @pytest.mark.parametrize( ("append", "level", "col", "rule"), [ @@ -4833,6 +4942,7 @@ def test__resample(scalars_df_index, scalars_pandas_df_index, append, level, col pd.testing.assert_series_equal(bf_result, pd_result) +@pytest.mark.skip(reason="fixture 'nested_structs_df' not found") def test_series_struct_get_field_by_attribute( nested_structs_df, nested_structs_pandas_df ): @@ -4856,6 +4966,7 @@ def test_series_struct_get_field_by_attribute( ) +@pytest.mark.skip(reason="fixture 'nested_structs_df' not found") def test_series_struct_fields_in_dir(nested_structs_df): series = nested_structs_df["person"] @@ -4865,12 +4976,16 @@ def test_series_struct_fields_in_dir(nested_structs_df): assert "country" in dir(series.address) +@pytest.mark.skip(reason="fixture 'nested_structs_df' not found") def test_series_struct_class_attributes_shadow_struct_fields(nested_structs_df): series = nested_structs_df["person"] assert series.name == "person" +@pytest.mark.skip( + reason="NotImplementedError: dry_run not implemented for this executor" +) def test_series_to_pandas_dry_run(scalars_df_index): bf_series = scalars_df_index["int64_col"] From 
62d591130d9696e58e2f7fd8db662afcbf45cd67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 22:31:42 +0000 Subject: [PATCH 24/36] revert more docs changes --- bigframes/bigquery/_operations/array.py | 6 +++ bigframes/bigquery/_operations/datetime.py | 8 ++++ bigframes/bigquery/_operations/geo.py | 13 +++++++ bigframes/bigquery/_operations/json.py | 12 ++++++ bigframes/bigquery/_operations/search.py | 1 + bigframes/bigquery/_operations/sql.py | 3 ++ bigframes/bigquery/_operations/struct.py | 1 + third_party/bigframes_vendored/conftest.py | 44 ---------------------- 8 files changed, 44 insertions(+), 44 deletions(-) delete mode 100644 third_party/bigframes_vendored/conftest.py diff --git a/bigframes/bigquery/_operations/array.py b/bigframes/bigquery/_operations/array.py index 239bc9566a..4af1416127 100644 --- a/bigframes/bigquery/_operations/array.py +++ b/bigframes/bigquery/_operations/array.py @@ -40,6 +40,8 @@ def array_length(series: series.Series) -> series.Series: >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([[1, 2, 8, 3], [], [3, 4]]) >>> bbq.array_length(s) 0 4 @@ -76,6 +78,8 @@ def array_agg( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> import numpy as np + >>> bpd.options.display.progress_bar = None For a SeriesGroupBy object: @@ -124,6 +128,8 @@ def array_to_string(series: series.Series, delimiter: str) -> series.Series: >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> import numpy as np + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([["H", "i", "!"], ["Hello", "World"], np.nan, [], ["Hi"]]) >>> bbq.array_to_string(s, delimiter=", ") diff --git a/bigframes/bigquery/_operations/datetime.py b/bigframes/bigquery/_operations/datetime.py index e27a3de0c8..f8767336dd 100644 --- a/bigframes/bigquery/_operations/datetime.py +++ 
b/bigframes/bigquery/_operations/datetime.py @@ -21,8 +21,11 @@ def unix_seconds(input: series.Series) -> series.Series: **Examples:** + >>> import pandas as pd >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([pd.Timestamp("1970-01-02", tz="UTC"), pd.Timestamp("1970-01-03", tz="UTC")]) >>> bbq.unix_seconds(s) 0 86400 @@ -45,8 +48,11 @@ def unix_millis(input: series.Series) -> series.Series: **Examples:** + >>> import pandas as pd >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([pd.Timestamp("1970-01-02", tz="UTC"), pd.Timestamp("1970-01-03", tz="UTC")]) >>> bbq.unix_millis(s) 0 86400000 @@ -69,8 +75,10 @@ def unix_micros(input: series.Series) -> series.Series: **Examples:** + >>> import pandas as pd >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([pd.Timestamp("1970-01-02", tz="UTC"), pd.Timestamp("1970-01-03", tz="UTC")]) >>> bbq.unix_micros(s) diff --git a/bigframes/bigquery/_operations/geo.py b/bigframes/bigquery/_operations/geo.py index e5aa383779..9a92a8960d 100644 --- a/bigframes/bigquery/_operations/geo.py +++ b/bigframes/bigquery/_operations/geo.py @@ -53,6 +53,8 @@ def st_area( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Polygon, LineString, Point + >>> bpd.options.display.progress_bar = None + >>> series = bigframes.geopandas.GeoSeries( ... [ ... Polygon([(0.0, 0.0), (0.1, 0.1), (0.0, 0.1)]), @@ -123,6 +125,8 @@ def st_buffer( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Point + >>> bpd.options.display.progress_bar = None + >>> series = bigframes.geopandas.GeoSeries( ... [ ... 
Point(0, 0), @@ -191,6 +195,8 @@ def st_centroid( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Polygon, LineString, Point + >>> bpd.options.display.progress_bar = None + >>> series = bigframes.geopandas.GeoSeries( ... [ ... Polygon([(0.0, 0.0), (0.1, 0.1), (0.0, 0.1)]), @@ -244,6 +250,8 @@ def st_convexhull( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Polygon, LineString, Point + >>> bpd.options.display.progress_bar = None + >>> series = bigframes.geopandas.GeoSeries( ... [ ... Polygon([(0.0, 0.0), (0.1, 0.1), (0.0, 0.1)]), @@ -304,6 +312,7 @@ def st_difference( >>> import bigframes.bigquery as bbq >>> import bigframes.geopandas >>> from shapely.geometry import Polygon, LineString, Point + >>> bpd.options.display.progress_bar = None We can check two GeoSeries against each other, row by row: @@ -398,6 +407,7 @@ def st_distance( >>> import bigframes.bigquery as bbq >>> import bigframes.geopandas >>> from shapely.geometry import Polygon, LineString, Point + >>> bpd.options.display.progress_bar = None We can check two GeoSeries against each other, row by row. @@ -479,6 +489,7 @@ def st_intersection( >>> import bigframes.bigquery as bbq >>> import bigframes.geopandas >>> from shapely.geometry import Polygon, LineString, Point + >>> bpd.options.display.progress_bar = None We can check two GeoSeries against each other, row by row. @@ -572,6 +583,7 @@ def st_isclosed( >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Point, LineString, Polygon + >>> bpd.options.display.progress_bar = None >>> series = bigframes.geopandas.GeoSeries( ... [ @@ -638,6 +650,7 @@ def st_length( >>> import bigframes.bigquery as bbq >>> from shapely.geometry import Polygon, LineString, Point, GeometryCollection + >>> bpd.options.display.progress_bar = None >>> series = bigframes.geopandas.GeoSeries( ... 
[ diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py index 4e1f43aab0..656e59af0d 100644 --- a/bigframes/bigquery/_operations/json.py +++ b/bigframes/bigquery/_operations/json.py @@ -49,6 +49,8 @@ def json_set( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> import numpy as np + >>> bpd.options.display.progress_bar = None >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"] >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")]) @@ -99,6 +101,7 @@ def json_extract( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}']) >>> bbq.json_extract(s, json_path="$.class") @@ -138,6 +141,7 @@ def json_extract_array( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) >>> bbq.json_extract_array(s) @@ -200,6 +204,7 @@ def json_extract_string_array( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) >>> bbq.json_extract_string_array(s) @@ -267,6 +272,7 @@ def json_query( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}']) >>> bbq.json_query(s, json_path="$.class") @@ -297,6 +303,7 @@ def json_query_array( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) >>> bbq.json_query_array(s) @@ -348,6 +355,7 @@ def json_value( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['{"name": "Jakob", "age": 
"6"}', '{"name": "Jakob", "age": []}']) >>> bbq.json_value(s, json_path="$.age") @@ -384,6 +392,7 @@ def json_value_array( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) >>> bbq.json_value_array(s) @@ -430,6 +439,7 @@ def to_json( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3]) >>> bbq.to_json(s) @@ -463,6 +473,7 @@ def to_json_string( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3]) >>> bbq.to_json_string(s) @@ -501,6 +512,7 @@ def parse_json( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}']) >>> s diff --git a/bigframes/bigquery/_operations/search.py b/bigframes/bigquery/_operations/search.py index b65eed2475..c16c2af1a9 100644 --- a/bigframes/bigquery/_operations/search.py +++ b/bigframes/bigquery/_operations/search.py @@ -111,6 +111,7 @@ def vector_search( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None DataFrame embeddings for which to find nearest neighbors. 
The ``ARRAY`` column is used as the search query: diff --git a/bigframes/bigquery/_operations/sql.py b/bigframes/bigquery/_operations/sql.py index 295412fd75..a2de61fc21 100644 --- a/bigframes/bigquery/_operations/sql.py +++ b/bigframes/bigquery/_operations/sql.py @@ -36,6 +36,9 @@ def sql_scalar( >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq + >>> import pandas as pd + >>> import pyarrow as pa + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series(["1.5", "2.5", "3.5"]) >>> s = s.astype(pd.ArrowDtype(pa.decimal128(38, 9))) diff --git a/bigframes/bigquery/_operations/struct.py b/bigframes/bigquery/_operations/struct.py index a6304677ef..7cb826351c 100644 --- a/bigframes/bigquery/_operations/struct.py +++ b/bigframes/bigquery/_operations/struct.py @@ -39,6 +39,7 @@ def struct(value: dataframe.DataFrame) -> series.Series: >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> import bigframes.series as series + >>> bpd.options.display.progress_bar = None >>> srs = series.Series([{"version": 1, "project": "pandas"}, {"version": 2, "project": "numpy"},]) >>> df = srs.struct.explode() diff --git a/third_party/bigframes_vendored/conftest.py b/third_party/bigframes_vendored/conftest.py deleted file mode 100644 index cafd6a1b7c..0000000000 --- a/third_party/bigframes_vendored/conftest.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import annotations - -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest - -import bigframes._config - - -@pytest.fixture(autouse=True) -def default_doctest_imports(doctest_namespace): - """ - Avoid some boilerplate in pandas-inspired tests. - - See: https://docs.pytest.org/en/stable/how-to/doctest.html#doctest-namespace-fixture - """ - try: - from bigframes.testing import polars_session - - bpd = polars_session.TestSession() - except ImportError: - # Don't skip doctest if polars isn't available. - import bigframes.pandas as bpd - - doctest_namespace["np"] = np - doctest_namespace["pd"] = pd - doctest_namespace["pa"] = pa - doctest_namespace["bpd"] = bpd - bigframes._config.options.display.progress_bar = None From 70021f3b1e61b5bf11407a1eda9a755f3475f577 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 22:35:58 +0000 Subject: [PATCH 25/36] revert more docs --- bigframes/ml/compose.py | 1 + bigframes/operations/ai.py | 17 ++++++++++++----- bigframes/session/__init__.py | 8 ++++++++ 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 54ce7066cb..92c98695cd 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -69,6 +69,7 @@ class SQLScalarColumnTransformer: >>> from bigframes.ml.compose import ColumnTransformer, SQLScalarColumnTransformer >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'name': ["James", None, "Mary"], 'city': ["New York", "Boston", None]}) >>> col_trans = ColumnTransformer([ diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py index 253b838e90..ac294b0fbd 100644 --- a/bigframes/operations/ai.py +++ b/bigframes/operations/ai.py @@ -45,6 +45,7 @@ def filter( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.ai_operators = True >>> 
bpd.options.compute.ai_ops_confirmation_threshold = 25 @@ -114,7 +115,8 @@ def map( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -132,7 +134,8 @@ def map( >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -263,7 +266,8 @@ def classify( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -352,7 +356,8 @@ def join( **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm @@ -491,6 +496,7 @@ def search( ** Examples: ** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> import bigframes >>> bigframes.options.experiments.ai_operators = True @@ -602,7 +608,8 @@ def sim_join( ** Examples: ** >>> import bigframes.pandas as bpd - >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.ai_operators = True >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 0490152003..886072b884 100644 --- a/bigframes/session/__init__.py +++ 
b/bigframes/session/__init__.py @@ -618,6 +618,7 @@ def read_gbq_query( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None Simple query input: @@ -773,6 +774,7 @@ def read_gbq_table( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None Read a whole table, with arbitrary ordering or ordering corresponding to the primary key(s). @@ -851,6 +853,7 @@ def read_gbq_table_streaming( >>> import bigframes.streaming as bst >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> sdf = bst.read_gbq_table("bigquery-public-data.ml_datasets.penguins") @@ -879,6 +882,7 @@ def read_gbq_model(self, model_name: str): **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None Read an existing BigQuery ML model. @@ -948,6 +952,8 @@ def read_pandas( **Examples:** >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None >>> d = {'col1': [1, 2], 'col2': [3, 4]} >>> pandas_df = pd.DataFrame(data=d) @@ -1825,6 +1831,7 @@ def udf( >>> import bigframes.pandas as bpd >>> import datetime + >>> bpd.options.display.progress_bar = None Turning an arbitrary python function into a BigQuery managed python udf: @@ -1987,6 +1994,7 @@ def read_gbq_function( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None Use the [cw_lower_case_ascii_only](https://github.com/GoogleCloudPlatform/bigquery-utils/blob/master/udfs/community/README.md#cw_lower_case_ascii_onlystr-string) function from Community UDFs. 
From 93502094fa94de7f0fcca17f5b1cb3aa6e1aa7cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 8 Oct 2025 22:36:57 +0000 Subject: [PATCH 26/36] revert more docs --- bigframes/series.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bigframes/series.py b/bigframes/series.py index 337a796739..490298d8dd 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -533,6 +533,7 @@ def to_pandas( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([4, 3, 2]) Download the data from BigQuery and convert it into an in-memory pandas Series. @@ -660,6 +661,7 @@ def to_pandas_batches( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([4, 3, 2, 2, 3]) Iterate through the results in batches, limiting the total rows yielded @@ -2419,6 +2421,9 @@ def _resample( **Examples:** >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None + >>> data = { ... "timestamp_col": pd.date_range( ... 
start="2021-01-01 13:00:00", periods=30, freq="1s" From 23346b0947c9f38a29429df80c8502c1f155fb35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 9 Oct 2025 00:06:48 +0000 Subject: [PATCH 27/36] fix unit tests python 3.13 --- tests/unit/test_series_polars.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/tests/unit/test_series_polars.py b/tests/unit/test_series_polars.py index 3eb1a3c095..64814126ea 100644 --- a/tests/unit/test_series_polars.py +++ b/tests/unit/test_series_polars.py @@ -3491,6 +3491,7 @@ def test_to_csv(gcs_folder, scalars_df_index, scalars_pandas_df_index): def test_to_latex(scalars_df_index, scalars_pandas_df_index): + pytest.importorskip("jinja2") bf_result = scalars_df_index["int64_col"].to_latex() pd_result = scalars_pandas_df_index["int64_col"].to_latex() @@ -3891,21 +3892,6 @@ def test_series_astype_w_invalid_error(session): session.read_pandas(input).astype("Float64", errors="bad_value") -def test_astype_numeric_to_int(scalars_df_index, scalars_pandas_df_index): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - column = "numeric_col" - to_type = "Int64" - bf_result = scalars_df_index[column].astype(to_type).to_pandas() - # Truncate to int to avoid TypeError - pd_result = ( - scalars_pandas_df_index[column] - .apply(lambda x: None if pd.isna(x) else math.trunc(x)) - .astype(to_type) - ) - pd.testing.assert_series_equal(bf_result, pd_result) - - @pytest.mark.parametrize( ("column", "to_type"), [ From 03822d7bf761fd4e98f3b113c498a79bb8ad35dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 9 Oct 2025 00:20:52 +0000 Subject: [PATCH 28/36] add test to reproduce name error --- tests/unit/test_local_engine.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/unit/test_local_engine.py b/tests/unit/test_local_engine.py index 7d3d532d88..8c8c2dcf0d 100644 --- 
a/tests/unit/test_local_engine.py +++ b/tests/unit/test_local_engine.py @@ -42,6 +42,14 @@ def small_inline_frame() -> pd.DataFrame: return df +def test_polars_local_engine_series(polars_session: bigframes.Session): + bf_series = bpd.Series([1, 2, 3], session=polars_session) + pd_series = pd.Series([1, 2, 3], dtype=bf_series.dtype) + bf_result = bf_series.to_pandas() + pd_result = pd_series + pandas.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + def test_polars_local_engine_add( small_inline_frame: pd.DataFrame, polars_session: bigframes.Session ): From ddbb32dd2baab6f0fde38d940f685040525f2d63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 9 Oct 2025 15:52:05 +0000 Subject: [PATCH 29/36] revert new session methods --- bigframes/core/indexes/base.py | 11 +-- bigframes/core/indexes/multi.py | 48 +----------- bigframes/core/log_adapter.py | 4 +- bigframes/core/reshape/tile.py | 7 +- bigframes/core/tools/datetimes.py | 10 +-- bigframes/pandas/__init__.py | 17 ++-- bigframes/session/__init__.py | 124 ++---------------------------- tests/unit/test_pandas.py | 26 +++---- 8 files changed, 35 insertions(+), 212 deletions(-) diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index a258c01195..83dd11dacb 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -383,16 +383,9 @@ def to_series( name = self.name if name is None else name if index is None: - return bigframes.series.Series( - data=self, index=self, name=name, session=self._session - ) + return bigframes.series.Series(data=self, index=self, name=name) else: - return bigframes.series.Series( - data=self, - index=Index(index, session=self._session), - name=name, - session=self._session, - ) + return bigframes.series.Series(data=self, index=Index(index), name=name) def get_level_values(self, level) -> Index: level_n = level if isinstance(level, int) else self.names.index(level) diff --git 
a/bigframes/core/indexes/multi.py b/bigframes/core/indexes/multi.py index a611442b88..a8b4b7dffe 100644 --- a/bigframes/core/indexes/multi.py +++ b/bigframes/core/indexes/multi.py @@ -14,7 +14,7 @@ from __future__ import annotations -from typing import cast, Hashable, Iterable, Optional, Sequence, TYPE_CHECKING +from typing import cast, Hashable, Iterable, Sequence import bigframes_vendored.pandas.core.indexes.multi as vendored_pandas_multindex import pandas @@ -23,9 +23,6 @@ from bigframes.core import expression as ex from bigframes.core.indexes.base import Index -if TYPE_CHECKING: - import bigframes.session - class MultiIndex(Index, vendored_pandas_multindex.MultiIndex): __doc__ = vendored_pandas_multindex.MultiIndex.__doc__ @@ -36,12 +33,10 @@ def from_tuples( tuples: Iterable[tuple[Hashable, ...]], sortorder: int | None = None, names: Sequence[Hashable] | Hashable | None = None, - *, - session: Optional[bigframes.session.Session] = None, ) -> MultiIndex: pd_index = pandas.MultiIndex.from_tuples(tuples, sortorder, names) # Index.__new__ should detect multiple levels and properly create a multiindex - return cast(MultiIndex, Index(pd_index, session=session)) + return cast(MultiIndex, Index(pd_index)) @classmethod def from_arrays( @@ -49,12 +44,10 @@ def from_arrays( arrays, sortorder: int | None = None, names=None, - *, - session: Optional[bigframes.session.Session] = None, ) -> MultiIndex: pd_index = pandas.MultiIndex.from_arrays(arrays, sortorder, names) # Index.__new__ should detect multiple levels and properly create a multiindex - return cast(MultiIndex, Index(pd_index, session=session)) + return cast(MultiIndex, Index(pd_index)) def __eq__(self, other) -> Index: # type: ignore import bigframes.operations as ops @@ -78,38 +71,3 @@ def __eq__(self, other) -> Index: # type: ignore index_labels=[None], ) ) - - -class MultiIndexAccessor: - """Proxy to MultiIndex constructors to allow a session to be passed in.""" - - def __init__(self, session: 
bigframes.session.Session): - self._session = session - - def __call__(self, *args, **kwargs) -> MultiIndex: - """Construct a MultiIndex using the associated Session. - - See :class:`bigframes.pandas.MultiIndex`. - """ - return MultiIndex(*args, session=self._session, **kwargs) - - def from_arrays(self, *args, **kwargs) -> MultiIndex: - """Construct a MultiIndex using the associated Session. - - See :func:`bigframes.pandas.MultiIndex.from_arrays`. - """ - return MultiIndex.from_arrays(*args, session=self._session, **kwargs) - - def from_frame(self, *args, **kwargs) -> MultiIndex: - """Construct a MultiIndex using the associated Session. - - See :func:`bigframes.pandas.MultiIndex.from_frame`. - """ - return cast(MultiIndex, MultiIndex.from_frame(*args, **kwargs)) - - def from_tuples(self, *args, **kwargs) -> MultiIndex: - """Construct a MultiIndex using the associated Session. - - See :func:`bigframes.pandas.MultiIndex.from_tuples`. - """ - return MultiIndex.from_tuples(*args, session=self._session, **kwargs) diff --git a/bigframes/core/log_adapter.py b/bigframes/core/log_adapter.py index 8179ffbeed..3ec1e86dc7 100644 --- a/bigframes/core/log_adapter.py +++ b/bigframes/core/log_adapter.py @@ -155,9 +155,7 @@ def method_logger(method=None, /, *, custom_base_name: Optional[str] = None): def outer_wrapper(method): @functools.wraps(method) def wrapper(*args, **kwargs): - api_method_name = getattr( - method, LOG_OVERRIDE_NAME, method.__name__ - ).lower() + api_method_name = getattr(method, LOG_OVERRIDE_NAME, method.__name__) if custom_base_name is None: qualname_parts = getattr(method, "__qualname__", method.__name__).split( "." 
diff --git a/bigframes/core/reshape/tile.py b/bigframes/core/reshape/tile.py index a2efa8f927..74a941be54 100644 --- a/bigframes/core/reshape/tile.py +++ b/bigframes/core/reshape/tile.py @@ -15,7 +15,6 @@ from __future__ import annotations import typing -from typing import Optional, TYPE_CHECKING import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile @@ -32,9 +31,6 @@ import bigframes.operations.aggregations as agg_ops import bigframes.series -if TYPE_CHECKING: - import bigframes.session - def cut( x, @@ -46,7 +42,6 @@ def cut( *, right: typing.Optional[bool] = True, labels: typing.Union[typing.Iterable[str], bool, None] = None, - session: Optional[bigframes.session.Session] = None, ) -> bigframes.series.Series: if ( labels is not None @@ -70,7 +65,7 @@ def cut( raise ValueError("Cannot cut empty array.") if not isinstance(x, bigframes.series.Series): - x = bigframes.series.Series(x, session=session) + x = bigframes.series.Series(x) if isinstance(bins, int): if bins <= 0: diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py index 0e5594d498..7edf2fa2e4 100644 --- a/bigframes/core/tools/datetimes.py +++ b/bigframes/core/tools/datetimes.py @@ -12,11 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import annotations - from collections.abc import Mapping from datetime import date, datetime -from typing import Optional, TYPE_CHECKING, Union +from typing import Optional, Union import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.tools.datetimes as vendored_pandas_datetimes @@ -27,9 +25,6 @@ import bigframes.operations as ops import bigframes.series -if TYPE_CHECKING: - import bigframes.session - def to_datetime( arg: Union[ @@ -42,7 +37,6 @@ def to_datetime( utc: bool = False, format: Optional[str] = None, unit: Optional[str] = None, - session: Optional[bigframes.session.Session] = None, ) -> Union[pd.Timestamp, datetime, bigframes.series.Series]: if isinstance(arg, (int, float, str, datetime, date)): return pd.to_datetime( @@ -58,7 +52,7 @@ def to_datetime( f"to datetime is not implemented. {constants.FEEDBACK_LINK}" ) - arg = bigframes.series.Series(arg, session=session) + arg = bigframes.series.Series(arg) if format and unit and arg.dtype in (bigframes.dtypes.INT_DTYPE, bigframes.dtypes.FLOAT_DTYPE): # type: ignore raise ValueError("cannot specify both format and unit") diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 6fcb71f0d8..2455637b0a 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -16,8 +16,8 @@ from __future__ import annotations -import collections -import datetime +from collections import namedtuple +from datetime import date, datetime import inspect import sys import typing @@ -198,18 +198,18 @@ def to_datetime( @typing.overload def to_datetime( - arg: Union[int, float, str, datetime.datetime, datetime.date], + arg: Union[int, float, str, datetime, date], *, utc: bool = False, format: Optional[str] = None, unit: Optional[str] = None, -) -> Union[pandas.Timestamp, datetime.datetime]: +) -> Union[pandas.Timestamp, datetime]: ... 
def to_datetime( arg: Union[ - Union[int, float, str, datetime.datetime, datetime.date], + Union[int, float, str, datetime, date], vendored_pandas_datetimes.local_iterables, bigframes.series.Series, bigframes.dataframe.DataFrame, @@ -218,9 +218,8 @@ def to_datetime( utc: bool = False, format: Optional[str] = None, unit: Optional[str] = None, -) -> Union[pandas.Timestamp, datetime.datetime, bigframes.series.Series]: - return global_session.with_default_session( - bigframes.session.Session.to_datetime, +) -> Union[pandas.Timestamp, datetime, bigframes.series.Series]: + return bigframes.core.tools.to_datetime( arg, utc=utc, format=format, @@ -323,7 +322,7 @@ def clean_up_by_session_id( __version__ = bigframes.version.__version__ # Other public pandas attributes -NamedAgg = collections.namedtuple("NamedAgg", ["column", "aggfunc"]) +NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"]) options = config.options """Global :class:`~bigframes._config.Options` to configure BigQuery DataFrames.""" diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 886072b884..46fb56b88e 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -68,8 +68,6 @@ import bigframes.core from bigframes.core import blocks, log_adapter, utils import bigframes.core.events -import bigframes.core.indexes -import bigframes.core.indexes.multi import bigframes.core.pyformat import bigframes.formatting_helpers import bigframes.functions._function_session as bff_session @@ -81,6 +79,7 @@ # Avoid circular imports. if typing.TYPE_CHECKING: + import bigframes.core.indexes import bigframes.dataframe as dataframe import bigframes.series import bigframes.streaming.dataframe as streaming_dataframe @@ -321,15 +320,6 @@ def bqconnectionmanager(self): ) return self._bq_connection_manager - @property - def options(self) -> bigframes._config.Options: - """Options for configuring BigQuery DataFrames. - - Included for compatibility between bpd and Session. 
- """ - # TODO(tswast): Consider making a separate session-level options object. - return bigframes._config.options - @property def session_id(self): return self._session_id @@ -1836,7 +1826,7 @@ def udf( Turning an arbitrary python function into a BigQuery managed python udf: >>> bq_name = datetime.datetime.now().strftime("bigframes_%Y%m%d%H%M%S%f") - >>> @bpd.udf(dataset="bigfranes_testing", name=bq_name) # doctest: +SKIP + >>> @bpd.udf(dataset="bigfranes_testing", name=bq_name) ... def minutes_to_hours(x: int) -> float: ... return x/60 @@ -1849,8 +1839,8 @@ def udf( 4 120 dtype: Int64 - >>> hours = minutes.apply(minutes_to_hours) # doctest: +SKIP - >>> hours # doctest: +SKIP + >>> hours = minutes.apply(minutes_to_hours) + >>> hours 0 0.0 1 0.5 2 1.0 @@ -1863,7 +1853,7 @@ def udf( packages (optionally with the package version) via `packages` param. >>> bq_name = datetime.datetime.now().strftime("bigframes_%Y%m%d%H%M%S%f") - >>> @bpd.udf( # doctest: +SKIP + >>> @bpd.udf( ... dataset="bigfranes_testing", ... name=bq_name, ... packages=["cryptography"] @@ -1880,14 +1870,14 @@ def udf( ... 
return f.encrypt(input.encode()).decode() >>> names = bpd.Series(["Alice", "Bob"]) - >>> hashes = names.apply(get_hash) # doctest: +SKIP + >>> hashes = names.apply(get_hash) You can clean-up the BigQuery functions created above using the BigQuery client from the BigQuery DataFrames session: >>> session = bpd.get_global_session() - >>> session.bqclient.delete_routine(minutes_to_hours.bigframes_bigquery_function) # doctest: +SKIP - >>> session.bqclient.delete_routine(get_hash.bigframes_bigquery_function) # doctest: +SKIP + >>> session.bqclient.delete_routine(minutes_to_hours.bigframes_bigquery_function) + >>> session.bqclient.delete_routine(get_hash.bigframes_bigquery_function) Args: input_types (type or sequence(type), Optional): @@ -2307,104 +2297,6 @@ def read_gbq_object_table( s = self._loader.read_gbq_table(object_table)["uri"].str.to_blob(connection) return s.rename(name).to_frame() - # ========================================================================= - # bigframes.pandas attributes - # - # These are included so that Session and bigframes.pandas can be used - # interchangeably. - # ========================================================================= - def cut(self, *args, **kwargs) -> bigframes.series.Series: - """Cuts a BigQuery DataFrames object. - - Included for compatibility between bpd and Session. - - See :func:`bigframes.pandas.cut` for full documentation. - """ - import bigframes.core.reshape.tile - - return bigframes.core.reshape.tile.cut( - *args, - session=self, - **kwargs, - ) - - def DataFrame(self, *args, **kwargs): - """Constructs a DataFrame. - - Included for compatibility between bpd and Session. - - See :class:`bigframes.pandas.DataFrame` for full documentation. - """ - import bigframes.dataframe - - return bigframes.dataframe.DataFrame(*args, session=self, **kwargs) - - @property - def MultiIndex(self) -> bigframes.core.indexes.multi.MultiIndexAccessor: - """Constructs a MultiIndex. 
- - Included for compatibility between bpd and Session. - - See :class:`bigframes.pandas.MulitIndex` for full documentation. - """ - import bigframes.core.indexes.multi - - return bigframes.core.indexes.multi.MultiIndexAccessor(self) - - def Index(self, *args, **kwargs): - """Constructs a Index. - - Included for compatibility between bpd and Session. - - See :class:`bigframes.pandas.Index` for full documentation. - """ - import bigframes.core.indexes - - return bigframes.core.indexes.Index(*args, session=self, **kwargs) - - def Series(self, *args, **kwargs): - """Constructs a Series. - - Included for compatibility between bpd and Session. - - See :class:`bigframes.pandas.Series` for full documentation. - """ - import bigframes.series - - return bigframes.series.Series(*args, session=self, **kwargs) - - def to_datetime( - self, *args, **kwargs - ) -> Union[pandas.Timestamp, datetime.datetime, bigframes.series.Series]: - """Converts a BigQuery DataFrames object to datetime dtype. - - Included for compatibility between bpd and Session. - - See :func:`bigframes.pandas.to_datetime` for full documentation. - """ - import bigframes.core.tools - - return bigframes.core.tools.to_datetime( - *args, - session=self, - **kwargs, - ) - - def to_timedelta(self, *args, **kwargs): - """Converts a BigQuery DataFrames object to timedelta/duration dtype. - - Included for compatibility between bpd and Session. - - See :func:`bigframes.pandas.to_timedelta` for full documentation. 
- """ - import bigframes.pandas.core.tools.timedeltas - - return bigframes.pandas.core.tools.timedeltas.to_timedelta( - *args, - session=self, - **kwargs, - ) - def connect(context: Optional[bigquery_options.BigQueryOptions] = None) -> Session: return Session(context) diff --git a/tests/unit/test_pandas.py b/tests/unit/test_pandas.py index 5e75e6b20f..73e0b7f2d6 100644 --- a/tests/unit/test_pandas.py +++ b/tests/unit/test_pandas.py @@ -64,12 +64,8 @@ def test_method_matches_session(method_name: str): pandas_method = getattr(bigframes.pandas, method_name) pandas_doc = inspect.getdoc(pandas_method) assert pandas_doc is not None, "docstrings are required" - - pandas_doc_stripped = re.sub(leading_whitespace, "", pandas_doc) - session_doc_stripped = re.sub(leading_whitespace, "", session_doc) - assert ( - pandas_doc_stripped == session_doc_stripped - or ":`bigframes.pandas" in session_doc_stripped + assert re.sub(leading_whitespace, "", pandas_doc) == re.sub( + leading_whitespace, "", session_doc ) # Add `eval_str = True` so that deferred annotations are turned into their @@ -79,20 +75,18 @@ def test_method_matches_session(method_name: str): eval_str=True, globals={**vars(bigframes.session), **{"dataframe": bigframes.dataframe}}, ) - session_args = [ - # Kind includes position, which will be an offset. - parameter.replace(kind=inspect.Parameter.POSITIONAL_ONLY) - for parameter in session_signature.parameters.values() - # Don't include the first parameter, which is `self: Session` - ][1:] pandas_signature = inspect.signature(pandas_method, eval_str=True) - pandas_args = [ + assert [ # Kind includes position, which will be an offset. parameter.replace(kind=inspect.Parameter.POSITIONAL_ONLY) for parameter in pandas_signature.parameters.values() - ] - assert session_args == pandas_args or ["args", "kwargs"] == [ - parameter.name for parameter in session_args + ] == [ + # Kind includes position, which will be an offset. 
+ parameter.replace(kind=inspect.Parameter.POSITIONAL_ONLY) + for parameter in session_signature.parameters.values() + # Don't include the first parameter, which is `self: Session` + ][ + 1: ] assert pandas_signature.return_annotation == session_signature.return_annotation From d80bfcb717d4e8186b276738c7d01c24c17f6caa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 9 Oct 2025 16:08:59 +0000 Subject: [PATCH 30/36] fix TestSession read_pandas for Series --- bigframes/dataframe.py | 2 +- bigframes/operations/base.py | 27 ++++----------------------- bigframes/testing/polars_session.py | 12 +++++++++--- 3 files changed, 14 insertions(+), 27 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 3527b225e2..bc2bbb963b 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -688,7 +688,7 @@ def _getitem_label(self, key: blocks.Label): return DataFrame(block) if len(col_ids) == 1: - return bigframes.series.Series(block, name=key) + return bigframes.series.Series(block) return DataFrame(block) # Bool Series selects rows diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 7d6a1c3b68..f2bbcb3320 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -14,7 +14,6 @@ from __future__ import annotations -import enum import typing from typing import List, Sequence, Union @@ -36,18 +35,6 @@ import bigframes.session -class Default(enum.Enum): - """Sentinel that can disambiguate explicit None from missing. 
- - See https://stackoverflow.com/a/76606310/101923 - """ - - token = 0 - - -DEFAULT = Default.token - - class SeriesMethods: def __init__( self, @@ -56,7 +43,7 @@ def __init__( dtype: typing.Optional[ bigframes.dtypes.DtypeString | bigframes.dtypes.Dtype ] = None, - name: str | None | Default = DEFAULT, + name: str | None = None, copy: typing.Optional[bool] = None, *, session: typing.Optional[bigframes.session.Session] = None, @@ -120,7 +107,6 @@ def __init__( block = data_block if block: - # Data was a bigframes object. assert len(block.value_columns) == 1 assert len(block.column_labels) == 1 if index is not None: # reindexing operation @@ -129,27 +115,23 @@ def __init__( idx_cols = idx_block.index_columns block, _ = idx_block.join(block, how="left") block = block.with_index_labels(bf_index.names) - if name is not DEFAULT: + if name: block = block.with_column_labels([name]) if dtype: bf_dtype = bigframes.dtypes.bigframes_type(dtype) block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype)) else: - # Data was local. 
if isinstance(dtype, str) and dtype.lower() == "json": dtype = bigframes.dtypes.JSON_DTYPE pd_series = pd.Series( data=data, index=index, # type:ignore dtype=dtype, # type:ignore - name=name if name is not DEFAULT else None, + name=name, ) - name = pd_series.name # type: ignore block = read_pandas_func(pd_series)._get_block() # type:ignore - block = block.with_column_labels([name]) assert block is not None - self._block: blocks.Block = block @property @@ -178,8 +160,7 @@ def _apply_unary_op( block, result_id = self._block.apply_unary_op( self._value_column, op, result_label=self._name ) - result = series.Series(block.select_column(result_id), name=self._name) - return result + return series.Series(block.select_column(result_id)) def _apply_binary_op( self, diff --git a/bigframes/testing/polars_session.py b/bigframes/testing/polars_session.py index 4d3e6862b9..ba6d502fcc 100644 --- a/bigframes/testing/polars_session.py +++ b/bigframes/testing/polars_session.py @@ -94,17 +94,23 @@ def __init__(self): self._loader = None # type: ignore def read_pandas(self, pandas_dataframe, write_engine="default"): + original_input = pandas_dataframe + # override read_pandas to always keep data local-only if isinstance(pandas_dataframe, (pandas.Series, pandas.Index)): pandas_dataframe = pandas_dataframe.to_frame() + local_block = bigframes.core.blocks.Block.from_local(pandas_dataframe, self) bf_df = bigframes.dataframe.DataFrame(local_block) - if isinstance(pandas_dataframe, pandas.Series): + + if isinstance(original_input, pandas.Series): series = bf_df[bf_df.columns[0]] - series.name = pandas_dataframe.name + series.name = original_input.name return series - if isinstance(pandas_dataframe, pandas.Index): + + if isinstance(original_input, pandas.Index): return bf_df.index + return bf_df @property From 0a5a9353bd3e97cc74c9000af0cb4e5a379bc5b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 9 Oct 2025 16:10:56 +0000 Subject: [PATCH 31/36] revert more unnecessary 
changes --- scripts/publish_api_coverage.py | 3 --- tests/unit/test_dataframe_polars.py | 3 +-- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/scripts/publish_api_coverage.py b/scripts/publish_api_coverage.py index 181b8c3365..8f305bcc0f 100644 --- a/scripts/publish_api_coverage.py +++ b/scripts/publish_api_coverage.py @@ -204,9 +204,6 @@ def generate_pandas_api_coverage(): def generate_sklearn_api_coverage(): """Explore all SKLearn modules, and for each item contained generate a regex to detect it being imported, and record whether we implement it""" - - import sklearn # noqa - sklearn_modules = [ "sklearn", "sklearn.model_selection", diff --git a/tests/unit/test_dataframe_polars.py b/tests/unit/test_dataframe_polars.py index c95c647fa8..a6f5c3d1ef 100644 --- a/tests/unit/test_dataframe_polars.py +++ b/tests/unit/test_dataframe_polars.py @@ -2319,8 +2319,7 @@ def test_binop_with_self_aggregate(session, scalars_dfs): df_columns = ["int64_col", "float64_col", "int64_too"] bf_df = scalars_df[df_columns] - bf_deviation = bf_df - bf_df.mean() - bf_result = bf_deviation.to_pandas() + bf_result = (bf_df - bf_df.mean()).to_pandas() pd_df = scalars_pandas_df[df_columns] pd_result = pd_df - pd_df.mean() From 11262443537ca528d9d67c7ef93b296578837b99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 9 Oct 2025 16:12:24 +0000 Subject: [PATCH 32/36] even more --- bigframes/core/blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index cf3518ff29..f9896784bb 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -2471,7 +2471,7 @@ def _align_series_block_axis_1( def _align_pd_series_axis_1( self, other: pd.Series, how: str ) -> Tuple[Block, pd.Index, Sequence[Tuple[ex.RefOrConstant, ex.RefOrConstant]]]: - if self.column_labels.astype("object").equals(other.index.astype("object")): + if self.column_labels.equals(other.index): columns, lcol_indexer, 
rcol_indexer = self.column_labels, None, None else: if not (self.column_labels.is_unique and other.index.is_unique): From d63a95f72db7264847cf81504576175f064d0124 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 9 Oct 2025 18:23:32 +0000 Subject: [PATCH 33/36] add unit_noextras to improve code coverage --- noxfile.py | 1 + 1 file changed, 1 insertion(+) diff --git a/noxfile.py b/noxfile.py index 703937d453..095a10c1e2 100644 --- a/noxfile.py +++ b/noxfile.py @@ -124,6 +124,7 @@ # Sessions are executed in the order so putting the smaller sessions # ahead to fail fast at presubmit running. nox.options.sessions = [ + "unit_noextras", "system-3.9", # No extras. "system-3.11", "cover", From 6aadbaf229714efd3665b79a52a40111973d7b9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 9 Oct 2025 19:03:32 +0000 Subject: [PATCH 34/36] run system tests on latest fully supported --- noxfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index 095a10c1e2..f2d25103c8 100644 --- a/noxfile.py +++ b/noxfile.py @@ -126,7 +126,7 @@ nox.options.sessions = [ "unit_noextras", "system-3.9", # No extras. - "system-3.11", + f"system-{LATEST_FULLY_SUPPORTED_PYTHON}", # All extras. "cover", # TODO(b/401609005): remove "cleanup", From 95e4394fc39edf7c9b8447deda8c98c55ad1a2ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 9 Oct 2025 19:09:35 +0000 Subject: [PATCH 35/36] system-3.12 not found --- noxfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index f2d25103c8..e7f3299933 100644 --- a/noxfile.py +++ b/noxfile.py @@ -89,7 +89,7 @@ # 3.10 is needed for Windows tests as it is the only version installed in the # bigframes-windows container image. For more information, search # bigframes/windows-docker, internally. 
-SYSTEM_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.13"] +SYSTEM_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13"] SYSTEM_TEST_STANDARD_DEPENDENCIES = [ "jinja2", "mock", From d33147ae86e15156b96a1e6336879fb4b4f5fe7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 9 Oct 2025 22:25:28 +0000 Subject: [PATCH 36/36] cap polars version --- setup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index abc760b691..8072a3a3f5 100644 --- a/setup.py +++ b/setup.py @@ -77,7 +77,9 @@ "google-cloud-pubsub >=2.21.4", ], # used for local engine - "polars": ["polars >= 1.21.0"], + # TODO(tswast): relax upper pin when issue with test_engines_astype_int + # and test_divmods_series is resolved. + "polars": ["polars >= 1.21.0, <1.34.0"], "scikit-learn": ["scikit-learn>=1.2.2"], # Packages required for basic development flow. "dev": [