From 25d049c078693466905a19cc0954fafcac6c414c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Fri, 31 May 2024 13:35:19 -0500 Subject: [PATCH] feat: `merge` only generates a default index if both inputs already have an index (#733) * feat: `merge` only generates a default index if both inputs already have an index * add tests for merge with null index --- bigframes/_config/__init__.py | 6 +- bigframes/core/blocks.py | 22 +- bigframes/exceptions.py | 4 + notebooks/dataframes/index_col_null.ipynb | 1409 +++++++++++++++++++++ tests/system/conftest.py | 2 +- tests/system/small/test_empty_index.py | 227 ---- tests/system/small/test_null_index.py | 288 +++++ 7 files changed, 1723 insertions(+), 235 deletions(-) create mode 100644 notebooks/dataframes/index_col_null.ipynb delete mode 100644 tests/system/small/test_empty_index.py create mode 100644 tests/system/small/test_null_index.py diff --git a/bigframes/_config/__init__.py b/bigframes/_config/__init__.py index bf33420e6..4729532e9 100644 --- a/bigframes/_config/__init__.py +++ b/bigframes/_config/__init__.py @@ -61,10 +61,12 @@ def _init_bigquery_thread_local(self): @property def bigquery(self) -> bigquery_options.BigQueryOptions: """Options to use with the BigQuery engine.""" - if self._local.bigquery_options is not None: + if ( + bigquery_options := getattr(self._local, "bigquery_options", None) + ) is not None: # The only way we can get here is if someone called # _init_bigquery_thread_local. - return self._local.bigquery_options + return bigquery_options return self._bigquery_options diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 010eb96f7..9c567555f 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -124,7 +124,7 @@ def __init__( if len(index_columns) == 0: warnings.warn( "Creating object with Null Index. 
Null Index is a preview feature.", - category=bigframes.exceptions.PreviewWarning, + category=bigframes.exceptions.NullIndexPreviewWarning, ) self._index_columns = tuple(index_columns) # Index labels don't need complicated hierarchical access so can store as tuple @@ -1930,10 +1930,22 @@ def merge( coalesce_labels=matching_join_labels, suffixes=suffixes, ) - # Constructs default index - offset_index_id = guid.generate_guid() - expr = joined_expr.promote_offsets(offset_index_id) - return Block(expr, index_columns=[offset_index_id], column_labels=labels) + + # Construct a default index only if this object and the other both have + # indexes. In other words, joining anything to a NULL index object + # keeps everything as a NULL index. + # + # This keeps us from generating an index if the user joins a large + # BigQuery table against small local data, for example. + if len(self._index_columns) > 0 and len(other._index_columns) > 0: + offset_index_id = guid.generate_guid() + expr = joined_expr.promote_offsets(offset_index_id) + index_columns = [offset_index_id] + else: + expr = joined_expr + index_columns = [] + + return Block(expr, index_columns=index_columns, column_labels=labels) def join( self, diff --git a/bigframes/exceptions.py b/bigframes/exceptions.py index 1162217fc..bae239b6d 100644 --- a/bigframes/exceptions.py +++ b/bigframes/exceptions.py @@ -39,6 +39,10 @@ class PreviewWarning(Warning): """The feature is in preview.""" +class NullIndexPreviewWarning(PreviewWarning): + """Null index feature is in preview.""" + + class NullIndexError(ValueError): """Object has no index.""" diff --git a/notebooks/dataframes/index_col_null.ipynb b/notebooks/dataframes/index_col_null.ipynb new file mode 100644 index 000000000..de373050f --- /dev/null +++ b/notebooks/dataframes/index_col_null.ipynb @@ -0,0 +1,1409 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "eeec3428", + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2023 Google 
LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "47439dbd-4e54-4954-8b16-edc4bcd4f855", + "metadata": {}, + "source": [ + "# Operations with a \"NULL index\" DataFrame\n", + "\n", + "**Note**: This notebook describes a feature that is currently in [preview](https://cloud.google.com/blog/products/gcp/google-cloud-gets-simplified-product-launch-stages). There may be breaking changes to the functionality when using \"NULL index\" objects.\n", + "\n", + "Use the \"NULL\" index for more efficient query generation, but\n", + "some pandas-compatible methods may not be possible without an index."
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "96757c59-fc22-420e-a42f-c6cb956110ec", + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "import bigframes.enums\n", + "import bigframes.exceptions\n", + "import bigframes.pandas as bpd\n", + "\n", + "# Explicitly opt-in to the NULL index preview feature.\n", + "warnings.simplefilter(\n", + " \"ignore\",\n", + " bigframes.exceptions.NullIndexPreviewWarning,\n", + ")\n", + "\n", + "df = bpd.read_gbq(\n", + " \"bigquery-public-data.baseball.schedules\",\n", + " index_col=bigframes.enums.DefaultIndexKind.NULL,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "d15688e1", + "metadata": {}, + "source": [ + "Use `peek()` to view an arbitrary selection of rows from the DataFrame. This is much more efficient than `head()`, which requires a total ordering for determinism." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c93949fb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 1b8726ce-c4ea-47fe-a47c-d6fae50d8fb0 is DONE. 582.8 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gameIdgameNumberseasonIdyeartypedayNightdurationduration_minuteshomeTeamIdhomeTeamNameawayTeamIdawayTeamNamestartTimeattendancestatuscreated
0e14b6493-9e7f-404f-840a-8a680cc364bf1565de4be-dc80-4849-a7e1-54bc79156cc82016REGD3:0718703556285-bdbb-4576-a06d-42f71f46ddc5Marlins55714da8-fcaf-4574-8443-59bfb511a524Cubs2016-06-26 17:10:00+00:0027318closed2016-10-06 06:25:15+00:00
11f32b347-cbcb-4c31-a145-0e685306d1681565de4be-dc80-4849-a7e1-54bc79156cc82016REGD3:0918903556285-bdbb-4576-a06d-42f71f46ddc5Marlins55714da8-fcaf-4574-8443-59bfb511a524Cubs2016-06-25 20:10:00+00:0029457closed2016-10-06 06:25:15+00:00
20c2292d1-7398-48be-bf8e-b41dad5e1a431565de4be-dc80-4849-a7e1-54bc79156cc82016REGD2:4516512079497-e414-450a-8bf2-29f91de646bfBraves55714da8-fcaf-4574-8443-59bfb511a524Cubs2016-06-11 20:10:00+00:0043114closed2016-10-06 06:25:15+00:00
38fbec734-a15a-42ab-8d51-60790de7750b1565de4be-dc80-4849-a7e1-54bc79156cc82016REGD3:4222212079497-e414-450a-8bf2-29f91de646bfBraves55714da8-fcaf-4574-8443-59bfb511a524Cubs2016-06-12 17:35:00+00:0031625closed2016-10-06 06:25:15+00:00
489e514d5-fbf5-4b9d-bdac-6ca45bfd18dd1565de4be-dc80-4849-a7e1-54bc79156cc82016REGD2:441642142e1ba-3b40-445c-b8bb-f1f8b1054220Phillies55714da8-fcaf-4574-8443-59bfb511a524Cubs2016-06-08 17:05:00+00:0028650closed2016-10-06 06:25:15+00:00
\n", + "
" + ], + "text/plain": [ + " gameId gameNumber \\\n", + "0 e14b6493-9e7f-404f-840a-8a680cc364bf 1 \n", + "1 1f32b347-cbcb-4c31-a145-0e685306d168 1 \n", + "2 0c2292d1-7398-48be-bf8e-b41dad5e1a43 1 \n", + "3 8fbec734-a15a-42ab-8d51-60790de7750b 1 \n", + "4 89e514d5-fbf5-4b9d-bdac-6ca45bfd18dd 1 \n", + "\n", + " seasonId year type dayNight duration \\\n", + "0 565de4be-dc80-4849-a7e1-54bc79156cc8 2016 REG D 3:07 \n", + "1 565de4be-dc80-4849-a7e1-54bc79156cc8 2016 REG D 3:09 \n", + "2 565de4be-dc80-4849-a7e1-54bc79156cc8 2016 REG D 2:45 \n", + "3 565de4be-dc80-4849-a7e1-54bc79156cc8 2016 REG D 3:42 \n", + "4 565de4be-dc80-4849-a7e1-54bc79156cc8 2016 REG D 2:44 \n", + "\n", + " duration_minutes homeTeamId homeTeamName \\\n", + "0 187 03556285-bdbb-4576-a06d-42f71f46ddc5 Marlins \n", + "1 189 03556285-bdbb-4576-a06d-42f71f46ddc5 Marlins \n", + "2 165 12079497-e414-450a-8bf2-29f91de646bf Braves \n", + "3 222 12079497-e414-450a-8bf2-29f91de646bf Braves \n", + "4 164 2142e1ba-3b40-445c-b8bb-f1f8b1054220 Phillies \n", + "\n", + " awayTeamId awayTeamName \\\n", + "0 55714da8-fcaf-4574-8443-59bfb511a524 Cubs \n", + "1 55714da8-fcaf-4574-8443-59bfb511a524 Cubs \n", + "2 55714da8-fcaf-4574-8443-59bfb511a524 Cubs \n", + "3 55714da8-fcaf-4574-8443-59bfb511a524 Cubs \n", + "4 55714da8-fcaf-4574-8443-59bfb511a524 Cubs \n", + "\n", + " startTime attendance status created \n", + "0 2016-06-26 17:10:00+00:00 27318 closed 2016-10-06 06:25:15+00:00 \n", + "1 2016-06-25 20:10:00+00:00 29457 closed 2016-10-06 06:25:15+00:00 \n", + "2 2016-06-11 20:10:00+00:00 43114 closed 2016-10-06 06:25:15+00:00 \n", + "3 2016-06-12 17:35:00+00:00 31625 closed 2016-10-06 06:25:15+00:00 \n", + "4 2016-06-08 17:05:00+00:00 28650 closed 2016-10-06 06:25:15+00:00 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.peek()" + ] + }, + { + "cell_type": "markdown", + "id": "78e3d27d", + "metadata": {}, + "source": [ + "# Inspect the properties of 
the DataFrame\n", + "\n", + "Some properties, such as `dtypes`, can be retrieved without executing a query job." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "38f566c5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "gameId string[pyarrow]\n", + "gameNumber Int64\n", + "seasonId string[pyarrow]\n", + "year Int64\n", + "type string[pyarrow]\n", + "dayNight string[pyarrow]\n", + "duration string[pyarrow]\n", + "duration_minutes Int64\n", + "homeTeamId string[pyarrow]\n", + "homeTeamName string[pyarrow]\n", + "awayTeamId string[pyarrow]\n", + "awayTeamName string[pyarrow]\n", + "startTime timestamp[us, tz=UTC][pyarrow]\n", + "attendance Int64\n", + "status string[pyarrow]\n", + "created timestamp[us, tz=UTC][pyarrow]\n", + "dtype: object" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dtypes" + ] + }, + { + "cell_type": "markdown", + "id": "38a59ecc", + "metadata": {}, + "source": [ + "Other properties, such as `shape` require a query. In this case, `shape` runs a `COUNT(1)` query." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e3b43d37", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 0f85f12c-227c-4001-b851-6e9b9087ab7e is DONE. 0 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "(2431, 16)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "13861abc-120c-4db6-ad0c-e414b85d3443", + "metadata": {}, + "source": [ + "### Select a subset of the DataFrame\n", + "\n", + "Filter columns by selecting a list of columns from the DataFrame.\n", + "\n", + "**Note**: Even with `index_col=bigframes.enums.DefaultIndexKind.NULL`, it is more efficient to do this selection in `read_gbq` / `read_gbq_table` except in cases where the total ordering ID columns can be pruned." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "05cb36e9-bb75-4f6f-8eb6-e4219df6e1d2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job efa6b4be-cf60-4951-9125-7d77fb6b6b44 is DONE. 174.4 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gameIdyearhomeTeamNameawayTeamNameduration_minutes
0e14b6493-9e7f-404f-840a-8a680cc364bf2016MarlinsCubs187
11f32b347-cbcb-4c31-a145-0e685306d1682016MarlinsCubs189
20c2292d1-7398-48be-bf8e-b41dad5e1a432016BravesCubs165
38fbec734-a15a-42ab-8d51-60790de7750b2016BravesCubs222
489e514d5-fbf5-4b9d-bdac-6ca45bfd18dd2016PhilliesCubs164
\n", + "
" + ], + "text/plain": [ + " gameId year homeTeamName awayTeamName \\\n", + "0 e14b6493-9e7f-404f-840a-8a680cc364bf 2016 Marlins Cubs \n", + "1 1f32b347-cbcb-4c31-a145-0e685306d168 2016 Marlins Cubs \n", + "2 0c2292d1-7398-48be-bf8e-b41dad5e1a43 2016 Braves Cubs \n", + "3 8fbec734-a15a-42ab-8d51-60790de7750b 2016 Braves Cubs \n", + "4 89e514d5-fbf5-4b9d-bdac-6ca45bfd18dd 2016 Phillies Cubs \n", + "\n", + " duration_minutes \n", + "0 187 \n", + "1 189 \n", + "2 165 \n", + "3 222 \n", + "4 164 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "column_filtered = df[[\"gameId\", \"year\", \"homeTeamName\", \"awayTeamName\", \"duration_minutes\"]]\n", + "column_filtered.peek()" + ] + }, + { + "cell_type": "markdown", + "id": "d4d52c41", + "metadata": {}, + "source": [ + "Filter by rows using a boolean Series. This Series must be derived from the DataFrame being filtered so that the NULL index can still align correctly." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a6b8b3ac-1df8-46ff-ac4f-d6e7657fc80c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 0be8e44d-854a-45ca-950b-269280e3de41 is DONE. 582.8 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gameIdgameNumberseasonIdyeartypedayNightdurationduration_minuteshomeTeamIdhomeTeamNameawayTeamIdawayTeamNamestartTimeattendancestatuscreated
063f14670-c28e-432b-84ee-1a2c6ac295271565de4be-dc80-4849-a7e1-54bc79156cc82016REGN2:4316303556285-bdbb-4576-a06d-42f71f46ddc5Marlins55714da8-fcaf-4574-8443-59bfb511a524Cubs2016-06-23 23:10:00+00:0025291closed2016-10-06 06:25:15+00:00
1bf4e80d1-3125-44fa-8a89-de93d039d4651565de4be-dc80-4849-a7e1-54bc79156cc82016REGN3:2420403556285-bdbb-4576-a06d-42f71f46ddc5Marlins55714da8-fcaf-4574-8443-59bfb511a524Cubs2016-06-24 23:10:00+00:0024385closed2016-10-06 06:25:15+00:00
2e8af534c-36ed-4ff9-8511-780825fdd0411565de4be-dc80-4849-a7e1-54bc79156cc82016REGN2:5117112079497-e414-450a-8bf2-29f91de646bfBraves55714da8-fcaf-4574-8443-59bfb511a524Cubs2016-06-10 23:35:00+00:0030547closed2016-10-06 06:25:15+00:00
3e599c525-ac42-4b54-928d-7ee5fbe67dd91565de4be-dc80-4849-a7e1-54bc79156cc82016REGN2:451652142e1ba-3b40-445c-b8bb-f1f8b1054220Phillies55714da8-fcaf-4574-8443-59bfb511a524Cubs2016-06-07 23:05:00+00:0027381closed2016-10-06 06:25:15+00:00
4d80ffb65-57a4-42c9-ae1c-2c51d06503361565de4be-dc80-4849-a7e1-54bc79156cc82016REGN3:051852142e1ba-3b40-445c-b8bb-f1f8b1054220Phillies55714da8-fcaf-4574-8443-59bfb511a524Cubs2016-06-06 23:05:00+00:0022162closed2016-10-06 06:25:15+00:00
\n", + "
" + ], + "text/plain": [ + " gameId gameNumber \\\n", + "0 63f14670-c28e-432b-84ee-1a2c6ac29527 1 \n", + "1 bf4e80d1-3125-44fa-8a89-de93d039d465 1 \n", + "2 e8af534c-36ed-4ff9-8511-780825fdd041 1 \n", + "3 e599c525-ac42-4b54-928d-7ee5fbe67dd9 1 \n", + "4 d80ffb65-57a4-42c9-ae1c-2c51d0650336 1 \n", + "\n", + " seasonId year type dayNight duration \\\n", + "0 565de4be-dc80-4849-a7e1-54bc79156cc8 2016 REG N 2:43 \n", + "1 565de4be-dc80-4849-a7e1-54bc79156cc8 2016 REG N 3:24 \n", + "2 565de4be-dc80-4849-a7e1-54bc79156cc8 2016 REG N 2:51 \n", + "3 565de4be-dc80-4849-a7e1-54bc79156cc8 2016 REG N 2:45 \n", + "4 565de4be-dc80-4849-a7e1-54bc79156cc8 2016 REG N 3:05 \n", + "\n", + " duration_minutes homeTeamId homeTeamName \\\n", + "0 163 03556285-bdbb-4576-a06d-42f71f46ddc5 Marlins \n", + "1 204 03556285-bdbb-4576-a06d-42f71f46ddc5 Marlins \n", + "2 171 12079497-e414-450a-8bf2-29f91de646bf Braves \n", + "3 165 2142e1ba-3b40-445c-b8bb-f1f8b1054220 Phillies \n", + "4 185 2142e1ba-3b40-445c-b8bb-f1f8b1054220 Phillies \n", + "\n", + " awayTeamId awayTeamName \\\n", + "0 55714da8-fcaf-4574-8443-59bfb511a524 Cubs \n", + "1 55714da8-fcaf-4574-8443-59bfb511a524 Cubs \n", + "2 55714da8-fcaf-4574-8443-59bfb511a524 Cubs \n", + "3 55714da8-fcaf-4574-8443-59bfb511a524 Cubs \n", + "4 55714da8-fcaf-4574-8443-59bfb511a524 Cubs \n", + "\n", + " startTime attendance status created \n", + "0 2016-06-23 23:10:00+00:00 25291 closed 2016-10-06 06:25:15+00:00 \n", + "1 2016-06-24 23:10:00+00:00 24385 closed 2016-10-06 06:25:15+00:00 \n", + "2 2016-06-10 23:35:00+00:00 30547 closed 2016-10-06 06:25:15+00:00 \n", + "3 2016-06-07 23:05:00+00:00 27381 closed 2016-10-06 06:25:15+00:00 \n", + "4 2016-06-06 23:05:00+00:00 22162 closed 2016-10-06 06:25:15+00:00 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "night_games = df[df['dayNight'] == 'N']\n", + "night_games.peek()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + 
"id": "80e9a2e2-c4c9-4c17-bbd0-06882d7657fe", + "metadata": {}, + "source": [ + "### Join two DataFrames\n", + "\n", + "Even though pandas usually joins by the index, NULL index objects can still be manually joined by a column using the `on` parameter in `merge`." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "3f09ff32-ef43-4fab-a86b-8868afc34363", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 5d2c69d2-33fe-4513-923b-fd64f4da098b is DONE. 113.9 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gameIdhomeTeamName
0e14b6493-9e7f-404f-840a-8a680cc364bfMarlins
11f32b347-cbcb-4c31-a145-0e685306d168Marlins
20c2292d1-7398-48be-bf8e-b41dad5e1a43Braves
38fbec734-a15a-42ab-8d51-60790de7750bBraves
489e514d5-fbf5-4b9d-bdac-6ca45bfd18ddPhillies
\n", + "
" + ], + "text/plain": [ + " gameId homeTeamName\n", + "0 e14b6493-9e7f-404f-840a-8a680cc364bf Marlins\n", + "1 1f32b347-cbcb-4c31-a145-0e685306d168 Marlins\n", + "2 0c2292d1-7398-48be-bf8e-b41dad5e1a43 Braves\n", + "3 8fbec734-a15a-42ab-8d51-60790de7750b Braves\n", + "4 89e514d5-fbf5-4b9d-bdac-6ca45bfd18dd Phillies" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1 = df[[\"gameId\", \"homeTeamName\"]]\n", + "df1.peek()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "5331d2c8-7912-4d96-8da1-f64b57374df3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job b6b70d6d-a490-44d6-ba74-0ee32b4f0a1a is DONE. 582.8 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 68acd168-8b42-44f8-8702-99618935991e is DONE. 94 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gameIdawayTeamName
0af72a0b9-65f7-49fb-9b30-d505068bdf6dBrewers
1d60c6036-0ce1-4c90-8dd9-de3b403c92a8Brewers
\n", + "
" + ], + "text/plain": [ + " gameId awayTeamName\n", + "0 af72a0b9-65f7-49fb-9b30-d505068bdf6d Brewers\n", + "1 d60c6036-0ce1-4c90-8dd9-de3b403c92a8 Brewers" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2 = df[[\"gameId\", \"awayTeamName\"]].head(2)\n", + "df2.peek()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "a574ad3e-a219-454c-8bb5-c5ed6627f2c6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 0ac171dd-3859-4589-b7ff-59fd81ec3c3a is DONE. 582.9 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 034f8807-c128-444a-8033-0c95f34b0e32 is DONE. 111 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gameIdhomeTeamNameawayTeamName
0af72a0b9-65f7-49fb-9b30-d505068bdf6dRedsBrewers
1d60c6036-0ce1-4c90-8dd9-de3b403c92a8NationalsBrewers
\n", + "
" + ], + "text/plain": [ + " gameId homeTeamName awayTeamName\n", + "0 af72a0b9-65f7-49fb-9b30-d505068bdf6d Reds Brewers\n", + "1 d60c6036-0ce1-4c90-8dd9-de3b403c92a8 Nationals Brewers" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged = df1.merge(df2, on=\"gameId\", how=\"inner\")\n", + "merged.peek()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "288e7a95-a077-46c4-8fe6-802474c01f8b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 30fd5a60-772c-4ef0-a151-5ab390ff4322 is DONE. 582.9 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 701fa9a8-1ec6-49b9-ac41-228cb34d4c8c is DONE. 114.0 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gameIdhomeTeamNameawayTeamName
0039bb40e-7613-4674-a653-584b93e9b21bAmerican League<NA>
178000e12-2ef3-4246-adc1-c8a4d157631cAngels<NA>
2de5555dc-9228-4f7c-88ae-4451e3ffb980Angels<NA>
3f29a2754-004b-436c-91fe-3d86c0bb17a8Angels<NA>
48e5af008-8a07-4f9a-90cb-336ca4c84c71Angels<NA>
\n", + "
" + ], + "text/plain": [ + " gameId homeTeamName awayTeamName\n", + "0 039bb40e-7613-4674-a653-584b93e9b21b American League \n", + "1 78000e12-2ef3-4246-adc1-c8a4d157631c Angels \n", + "2 de5555dc-9228-4f7c-88ae-4451e3ffb980 Angels \n", + "3 f29a2754-004b-436c-91fe-3d86c0bb17a8 Angels \n", + "4 8e5af008-8a07-4f9a-90cb-336ca4c84c71 Angels " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged = df1.merge(df2, on=\"gameId\", how=\"outer\")\n", + "merged.peek()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "7ee87a01-2ff5-4021-855d-44b71cf2a225", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job e3d8168c-48e9-4ba9-a916-10259ad9c0ea is DONE. 582.9 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 82d2a5e4-66a8-4478-92de-57d3f806aa76 is DONE. 114.0 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gameIdhomeTeamNameawayTeamName
0039bb40e-7613-4674-a653-584b93e9b21bAmerican League<NA>
1f6fcd83c-e130-487c-a0cc-d00b2712d08bAngels<NA>
2fe401dd2-089c-4822-8657-4d510d460f38Angels<NA>
3c894bdee-5dda-49f4-87c8-53b9b9bfcd3bAngels<NA>
4bbda59d9-fd52-4bed-bcfb-2ceed4be997cAngels<NA>
\n", + "
" + ], + "text/plain": [ + " gameId homeTeamName awayTeamName\n", + "0 039bb40e-7613-4674-a653-584b93e9b21b American League \n", + "1 f6fcd83c-e130-487c-a0cc-d00b2712d08b Angels \n", + "2 fe401dd2-089c-4822-8657-4d510d460f38 Angels \n", + "3 c894bdee-5dda-49f4-87c8-53b9b9bfcd3b Angels \n", + "4 bbda59d9-fd52-4bed-bcfb-2ceed4be997c Angels " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged = df1.merge(df2, on=\"gameId\", how=\"left\")\n", + "merged.peek()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "330ed69c-f122-4af9-bf5e-96e309d3fa0c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 518ed511-606a-42b2-a28d-61a601eccfa7 is DONE. 582.9 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job bc381640-74e0-4885-9c32-87805a49f357 is DONE. 111 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gameIdhomeTeamNameawayTeamName
0af72a0b9-65f7-49fb-9b30-d505068bdf6dRedsBrewers
1d60c6036-0ce1-4c90-8dd9-de3b403c92a8NationalsBrewers
\n", + "
" + ], + "text/plain": [ + " gameId homeTeamName awayTeamName\n", + "0 af72a0b9-65f7-49fb-9b30-d505068bdf6d Reds Brewers\n", + "1 d60c6036-0ce1-4c90-8dd9-de3b403c92a8 Nationals Brewers" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged = df1.merge(df2, on=\"gameId\", how=\"right\")\n", + "merged.peek()" + ] + }, + { + "cell_type": "markdown", + "id": "162eede7", + "metadata": {}, + "source": [ + "### Download the result as (in-memory) pandas DataFrame\n", + "\n", + "Use the `ordered=False` argument for more efficient query execution." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "ab429fa5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 2d4fbd55-ba6a-46d2-87ae-5da416ad3642 is DONE. 159 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gameIdhomeTeamNameawayTeamName
0d60c6036-0ce1-4c90-8dd9-de3b403c92a8NationalsBrewers
1af72a0b9-65f7-49fb-9b30-d505068bdf6dRedsBrewers
\n", + "
" + ], + "text/plain": [ + " gameId homeTeamName awayTeamName\n", + "0 d60c6036-0ce1-4c90-8dd9-de3b403c92a8 Nationals Brewers\n", + "1 af72a0b9-65f7-49fb-9b30-d505068bdf6d Reds Brewers" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfp = merged.to_pandas(ordered=False)\n", + "dfp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "896212ab", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tests/system/conftest.py b/tests/system/conftest.py index f7fbd5f4b..250169308 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -395,7 +395,7 @@ def scalars_df_index( @pytest.fixture(scope="session") -def scalars_df_empty_index( +def scalars_df_null_index( scalars_table_id: str, session: bigframes.Session ) -> bigframes.dataframe.DataFrame: """DataFrame pointing at test data.""" diff --git a/tests/system/small/test_empty_index.py b/tests/system/small/test_empty_index.py deleted file mode 100644 index 3216264a8..000000000 --- a/tests/system/small/test_empty_index.py +++ /dev/null @@ -1,227 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import pandas as pd -import pytest - -import bigframes.exceptions -import bigframes.pandas as bpd -from tests.system.utils import skip_legacy_pandas - - -def test_empty_index_materialize( - scalars_df_empty_index, scalars_pandas_df_default_index -): - bf_result = scalars_df_empty_index.to_pandas() - pd.testing.assert_frame_equal( - bf_result, scalars_pandas_df_default_index, check_index_type=False - ) - - -def test_empty_index_series_repr( - scalars_df_empty_index, scalars_pandas_df_default_index -): - bf_result = scalars_df_empty_index["int64_too"].head(5).__repr__() - pd_result = ( - scalars_pandas_df_default_index["int64_too"] - .head(5) - .to_string(dtype=True, index=False, length=False, name=True) - ) - assert bf_result == pd_result - - -def test_empty_index_dataframe_repr( - scalars_df_empty_index, scalars_pandas_df_default_index -): - bf_result = scalars_df_empty_index[["int64_too", "int64_col"]].head(5).__repr__() - pd_result = ( - scalars_pandas_df_default_index[["int64_too", "int64_col"]] - .head(5) - .to_string(index=False) - ) - assert bf_result == pd_result + "\n\n[5 rows x 2 columns]" - - -def test_empty_index_reset_index( - scalars_df_empty_index, scalars_pandas_df_default_index -): - bf_result = scalars_df_empty_index.reset_index().to_pandas() - pd_result = scalars_pandas_df_default_index.reset_index(drop=True) - pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) - - -def test_empty_index_set_index(scalars_df_empty_index, scalars_pandas_df_default_index): - bf_result = 
scalars_df_empty_index.set_index("int64_col").to_pandas() - pd_result = scalars_pandas_df_default_index.set_index("int64_col") - pd.testing.assert_frame_equal(bf_result, pd_result) - - -def test_empty_index_concat(scalars_df_empty_index, scalars_pandas_df_default_index): - bf_result = bpd.concat( - [scalars_df_empty_index, scalars_df_empty_index], axis=0 - ).to_pandas() - pd_result = pd.concat( - [scalars_pandas_df_default_index, scalars_pandas_df_default_index], axis=0 - ) - pd.testing.assert_frame_equal(bf_result, pd_result.reset_index(drop=True)) - - -def test_empty_index_aggregate(scalars_df_empty_index, scalars_pandas_df_default_index): - bf_result = scalars_df_empty_index.count().to_pandas() - pd_result = scalars_pandas_df_default_index.count() - - pd_result.index = pd_result.index.astype("string[pyarrow]") - - pd.testing.assert_series_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False - ) - - -def test_empty_index_groupby_aggregate( - scalars_df_empty_index, scalars_pandas_df_default_index -): - bf_result = scalars_df_empty_index.groupby("int64_col").count().to_pandas() - pd_result = scalars_pandas_df_default_index.groupby("int64_col").count() - - pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) - - -@skip_legacy_pandas -def test_empty_index_analytic(scalars_df_empty_index, scalars_pandas_df_default_index): - bf_result = scalars_df_empty_index["int64_col"].cumsum().to_pandas() - pd_result = scalars_pandas_df_default_index["int64_col"].cumsum() - pd.testing.assert_series_equal( - bf_result, pd_result.reset_index(drop=True), check_dtype=False - ) - - -def test_empty_index_groupby_analytic( - scalars_df_empty_index, scalars_pandas_df_default_index -): - bf_result = ( - scalars_df_empty_index.groupby("bool_col")["int64_col"].cummax().to_pandas() - ) - pd_result = scalars_pandas_df_default_index.groupby("bool_col")[ - "int64_col" - ].cummax() - pd.testing.assert_series_equal( - bf_result, 
pd_result.reset_index(drop=True), check_dtype=False - ) - - -@skip_legacy_pandas -def test_empty_index_stack(scalars_df_empty_index, scalars_pandas_df_default_index): - stacking_cols = ["int64_col", "int64_too"] - bf_result = scalars_df_empty_index[stacking_cols].stack().to_pandas() - pd_result = ( - scalars_pandas_df_default_index[stacking_cols] - .stack(future_stack=True) - .droplevel(level=0, axis=0) - ) - pd_result.index = pd_result.index.astype(bf_result.index.dtype) - pd.testing.assert_series_equal( - bf_result, - pd_result, - check_dtype=False, - ) - - -def test_empty_index_series_self_aligns( - scalars_df_empty_index, scalars_pandas_df_default_index -): - bf_result = ( - scalars_df_empty_index["int64_col"] + scalars_df_empty_index["int64_too"] - ) - pd_result = ( - scalars_pandas_df_default_index["int64_col"] - + scalars_pandas_df_default_index["int64_too"] - ) - pd.testing.assert_series_equal( - bf_result.to_pandas(), pd_result.reset_index(drop=True), check_dtype=False - ) - - -def test_empty_index_df_self_aligns( - scalars_df_empty_index, scalars_pandas_df_default_index -): - bf_result = ( - scalars_df_empty_index[["int64_col", "float64_col"]] - + scalars_df_empty_index[["int64_col", "float64_col"]] - ) - pd_result = ( - scalars_pandas_df_default_index[["int64_col", "float64_col"]] - + scalars_pandas_df_default_index[["int64_col", "float64_col"]] - ) - pd.testing.assert_frame_equal( - bf_result.to_pandas(), pd_result.reset_index(drop=True), check_dtype=False - ) - - -def test_empty_index_setitem(scalars_df_empty_index, scalars_pandas_df_default_index): - bf_result = scalars_df_empty_index.copy() - bf_result["new_col"] = ( - scalars_df_empty_index["int64_col"] + scalars_df_empty_index["float64_col"] - ) - pd_result = scalars_pandas_df_default_index.copy() - pd_result["new_col"] = ( - scalars_pandas_df_default_index["int64_col"] - + scalars_pandas_df_default_index["float64_col"] - ) - pd.testing.assert_frame_equal( - bf_result.to_pandas(), 
pd_result.reset_index(drop=True), check_dtype=False - ) - - -def test_empty_index_df_concat(scalars_df_empty_index, scalars_pandas_df_default_index): - bf_result = bpd.concat([scalars_df_empty_index, scalars_df_empty_index]) - pd_result = pd.concat( - [scalars_pandas_df_default_index, scalars_pandas_df_default_index] - ) - pd.testing.assert_frame_equal( - bf_result.to_pandas(), pd_result.reset_index(drop=True), check_dtype=False - ) - - -def test_empty_index_align_error(scalars_df_empty_index): - with pytest.raises(bigframes.exceptions.NullIndexError): - _ = ( - scalars_df_empty_index["int64_col"] - + scalars_df_empty_index["int64_col"].cumsum() - ) - - -def test_empty_index_loc_error(scalars_df_empty_index): - with pytest.raises(bigframes.exceptions.NullIndexError): - scalars_df_empty_index["int64_col"].loc[1] - - -def test_empty_index_at_error(scalars_df_empty_index): - with pytest.raises(bigframes.exceptions.NullIndexError): - scalars_df_empty_index["int64_col"].at[1] - - -def test_empty_index_idxmin_error(scalars_df_empty_index): - with pytest.raises(bigframes.exceptions.NullIndexError): - scalars_df_empty_index[["int64_col", "int64_too"]].idxmin() - - -def test_empty_index_index_property(scalars_df_empty_index): - with pytest.raises(bigframes.exceptions.NullIndexError): - _ = scalars_df_empty_index.index - - -def test_empty_index_transpose(scalars_df_empty_index): - with pytest.raises(bigframes.exceptions.NullIndexError): - _ = scalars_df_empty_index.T diff --git a/tests/system/small/test_null_index.py b/tests/system/small/test_null_index.py new file mode 100644 index 000000000..27a3d8dff --- /dev/null +++ b/tests/system/small/test_null_index.py @@ -0,0 +1,288 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pandas as pd +import pytest + +import bigframes.exceptions +import bigframes.pandas as bpd +from tests.system.utils import skip_legacy_pandas + + +def test_null_index_materialize(scalars_df_null_index, scalars_pandas_df_default_index): + bf_result = scalars_df_null_index.to_pandas() + pd.testing.assert_frame_equal( + bf_result, scalars_pandas_df_default_index, check_index_type=False + ) + + +def test_null_index_series_repr(scalars_df_null_index, scalars_pandas_df_default_index): + bf_result = scalars_df_null_index["int64_too"].head(5).__repr__() + pd_result = ( + scalars_pandas_df_default_index["int64_too"] + .head(5) + .to_string(dtype=True, index=False, length=False, name=True) + ) + assert bf_result == pd_result + + +def test_null_index_dataframe_repr( + scalars_df_null_index, scalars_pandas_df_default_index +): + bf_result = scalars_df_null_index[["int64_too", "int64_col"]].head(5).__repr__() + pd_result = ( + scalars_pandas_df_default_index[["int64_too", "int64_col"]] + .head(5) + .to_string(index=False) + ) + assert bf_result == pd_result + "\n\n[5 rows x 2 columns]" + + +def test_null_index_reset_index(scalars_df_null_index, scalars_pandas_df_default_index): + bf_result = scalars_df_null_index.reset_index().to_pandas() + pd_result = scalars_pandas_df_default_index.reset_index(drop=True) + pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + +def test_null_index_set_index(scalars_df_null_index, scalars_pandas_df_default_index): + bf_result = scalars_df_null_index.set_index("int64_col").to_pandas() + pd_result = 
scalars_pandas_df_default_index.set_index("int64_col") + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_null_index_concat(scalars_df_null_index, scalars_pandas_df_default_index): + bf_result = bpd.concat( + [scalars_df_null_index, scalars_df_null_index], axis=0 + ).to_pandas() + pd_result = pd.concat( + [scalars_pandas_df_default_index, scalars_pandas_df_default_index], axis=0 + ) + pd.testing.assert_frame_equal(bf_result, pd_result.reset_index(drop=True)) + + +def test_null_index_aggregate(scalars_df_null_index, scalars_pandas_df_default_index): + bf_result = scalars_df_null_index.count().to_pandas() + pd_result = scalars_pandas_df_default_index.count() + + pd_result.index = pd_result.index.astype("string[pyarrow]") + + pd.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_null_index_groupby_aggregate( + scalars_df_null_index, scalars_pandas_df_default_index +): + bf_result = scalars_df_null_index.groupby("int64_col").count().to_pandas() + pd_result = scalars_pandas_df_default_index.groupby("int64_col").count() + + pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +@skip_legacy_pandas +def test_null_index_analytic(scalars_df_null_index, scalars_pandas_df_default_index): + bf_result = scalars_df_null_index["int64_col"].cumsum().to_pandas() + pd_result = scalars_pandas_df_default_index["int64_col"].cumsum() + pd.testing.assert_series_equal( + bf_result, pd_result.reset_index(drop=True), check_dtype=False + ) + + +def test_null_index_groupby_analytic( + scalars_df_null_index, scalars_pandas_df_default_index +): + bf_result = ( + scalars_df_null_index.groupby("bool_col")["int64_col"].cummax().to_pandas() + ) + pd_result = scalars_pandas_df_default_index.groupby("bool_col")[ + "int64_col" + ].cummax() + pd.testing.assert_series_equal( + bf_result, pd_result.reset_index(drop=True), check_dtype=False + ) + + +def test_null_index_merge_left_null_index_object( + 
scalars_df_null_index, scalars_df_default_index, scalars_pandas_df_default_index +): + df1 = scalars_df_null_index[scalars_df_null_index["int64_col"] > 0] + df1_pd = scalars_pandas_df_default_index[ + scalars_pandas_df_default_index["int64_col"] > 0 + ] + assert not df1._has_index + df2 = scalars_df_default_index[scalars_df_default_index["int64_col"] <= 55555] + df2_pd = scalars_pandas_df_default_index[ + scalars_pandas_df_default_index["int64_col"] <= 55555 + ] + assert df2._has_index + + got = df1.merge(df2, how="inner", on="bool_col") + expected = df1_pd.merge(df2_pd, how="inner", on="bool_col") + + # Combining any NULL index object should result in a NULL index. + # This keeps us from generating an index if the user joins a large + # BigQuery table against small local data, for example. + assert not got._has_index + assert got.shape == expected.shape + + +def test_null_index_merge_right_null_index_object( + scalars_df_null_index, scalars_df_default_index, scalars_pandas_df_default_index +): + df1 = scalars_df_default_index[scalars_df_default_index["int64_col"] > 0] + df1_pd = scalars_pandas_df_default_index[ + scalars_pandas_df_default_index["int64_col"] > 0 + ] + assert df1._has_index + df2 = scalars_df_null_index[scalars_df_null_index["int64_col"] <= 55555] + df2_pd = scalars_pandas_df_default_index[ + scalars_pandas_df_default_index["int64_col"] <= 55555 + ] + assert not df2._has_index + + got = df1.merge(df2, how="left", on="bool_col") + expected = df1_pd.merge(df2_pd, how="left", on="bool_col") + + # Combining any NULL index object should result in a NULL index. + # This keeps us from generating an index if the user joins a large + # BigQuery table against small local data, for example. 
+ assert not got._has_index + assert got.shape == expected.shape + + +def test_null_index_merge_two_null_index_objects( + scalars_df_null_index, scalars_pandas_df_default_index +): + df1 = scalars_df_null_index[scalars_df_null_index["int64_col"] > 0] + df1_pd = scalars_pandas_df_default_index[ + scalars_pandas_df_default_index["int64_col"] > 0 + ] + assert not df1._has_index + df2 = scalars_df_null_index[scalars_df_null_index["int64_col"] <= 55555] + df2_pd = scalars_pandas_df_default_index[ + scalars_pandas_df_default_index["int64_col"] <= 55555 + ] + assert not df2._has_index + + got = df1.merge(df2, how="outer", on="bool_col") + expected = df1_pd.merge(df2_pd, how="outer", on="bool_col") + + assert not got._has_index + assert got.shape == expected.shape + + +@skip_legacy_pandas +def test_null_index_stack(scalars_df_null_index, scalars_pandas_df_default_index): + stacking_cols = ["int64_col", "int64_too"] + bf_result = scalars_df_null_index[stacking_cols].stack().to_pandas() + pd_result = ( + scalars_pandas_df_default_index[stacking_cols] + .stack(future_stack=True) + .droplevel(level=0, axis=0) + ) + pd_result.index = pd_result.index.astype(bf_result.index.dtype) + pd.testing.assert_series_equal( + bf_result, + pd_result, + check_dtype=False, + ) + + +def test_null_index_series_self_aligns( + scalars_df_null_index, scalars_pandas_df_default_index +): + bf_result = scalars_df_null_index["int64_col"] + scalars_df_null_index["int64_too"] + pd_result = ( + scalars_pandas_df_default_index["int64_col"] + + scalars_pandas_df_default_index["int64_too"] + ) + pd.testing.assert_series_equal( + bf_result.to_pandas(), pd_result.reset_index(drop=True), check_dtype=False + ) + + +def test_null_index_df_self_aligns( + scalars_df_null_index, scalars_pandas_df_default_index +): + bf_result = ( + scalars_df_null_index[["int64_col", "float64_col"]] + + scalars_df_null_index[["int64_col", "float64_col"]] + ) + pd_result = ( + scalars_pandas_df_default_index[["int64_col", 
"float64_col"]] + + scalars_pandas_df_default_index[["int64_col", "float64_col"]] + ) + pd.testing.assert_frame_equal( + bf_result.to_pandas(), pd_result.reset_index(drop=True), check_dtype=False + ) + + +def test_null_index_setitem(scalars_df_null_index, scalars_pandas_df_default_index): + bf_result = scalars_df_null_index.copy() + bf_result["new_col"] = ( + scalars_df_null_index["int64_col"] + scalars_df_null_index["float64_col"] + ) + pd_result = scalars_pandas_df_default_index.copy() + pd_result["new_col"] = ( + scalars_pandas_df_default_index["int64_col"] + + scalars_pandas_df_default_index["float64_col"] + ) + pd.testing.assert_frame_equal( + bf_result.to_pandas(), pd_result.reset_index(drop=True), check_dtype=False + ) + + +def test_null_index_df_concat(scalars_df_null_index, scalars_pandas_df_default_index): + bf_result = bpd.concat([scalars_df_null_index, scalars_df_null_index]) + pd_result = pd.concat( + [scalars_pandas_df_default_index, scalars_pandas_df_default_index] + ) + pd.testing.assert_frame_equal( + bf_result.to_pandas(), pd_result.reset_index(drop=True), check_dtype=False + ) + + +def test_null_index_align_error(scalars_df_null_index): + with pytest.raises(bigframes.exceptions.NullIndexError): + _ = ( + scalars_df_null_index["int64_col"] + + scalars_df_null_index["int64_col"].cumsum() + ) + + +def test_null_index_loc_error(scalars_df_null_index): + with pytest.raises(bigframes.exceptions.NullIndexError): + scalars_df_null_index["int64_col"].loc[1] + + +def test_null_index_at_error(scalars_df_null_index): + with pytest.raises(bigframes.exceptions.NullIndexError): + scalars_df_null_index["int64_col"].at[1] + + +def test_null_index_idxmin_error(scalars_df_null_index): + with pytest.raises(bigframes.exceptions.NullIndexError): + scalars_df_null_index[["int64_col", "int64_too"]].idxmin() + + +def test_null_index_index_property(scalars_df_null_index): + with pytest.raises(bigframes.exceptions.NullIndexError): + _ = scalars_df_null_index.index + + +def 
test_null_index_transpose(scalars_df_null_index): + with pytest.raises(bigframes.exceptions.NullIndexError): + _ = scalars_df_null_index.T