In [1]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Integrating with BigQuery DataFrames

This notebook demonstrates operations for building applications that integrate with BigQuery DataFrames. Follow these samples to build an integration that accepts a BigQuery DataFrames object or returns one.

In [4]:
import bigframes.pandas as bpd

# Sample data: five rows with one integer, one float, and one string column.
# The "index" column is promoted to the DataFrame index by set_index() below.
df = bpd.DataFrame({
    "index": [0, 1, 2, 3, 4],
    "int_col": [1, 2, 3, 4, 5],
    "float_col": [1.0, -0.5, 0.25, -0.125, 0.0625],
    "string_col": ["a", "b", "c", "d", "e"],
}).set_index("index")

  return Session(context)


## Accepting a BigQuery DataFrames (bigframes) DataFrame

The recommended serialization format for a BigQuery DataFrames (bigframes) DataFrame is a BigQuery table. To write a DataFrame to a BigQuery table, use the `DataFrame.to_gbq()` method. With no `destination_table`, BigQuery DataFrames creates a table in the anonymous dataset corresponding to the BigQuery user & location and returns the corresponding table ID.

In [5]:
# No destination is passed, so to_gbq() chooses the table itself and returns
# its ID.
table_id = df.to_gbq()
# Bare last expression so the notebook displays the returned table ID.
table_id

'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240326_6742d434ead64fd4bfebed692dbc8c12'

### Sharing the table with your application's backend

Tables created in the user's anonymous dataset are only queryable by the user who created them. Many applications authenticate with a [service account](https://cloud.google.com/iam/docs/service-account-overview), which may be different from the end-user running BigQuery DataFrames (bigframes).

Give your application access to this table by granting the service account that your application uses for this customer the `roles/bigquery.dataViewer` role on the [BigQuery table with an IAM policy](https://cloud.google.com/bigquery/docs/control-access-to-resources-iam#grant_access_to_a_table_or_view).

In [10]:
# Placeholder value: replace with the email of the service account that your
# application authenticates as. Read as a global by df_to_gbq_plus_permissions.
your_service_account_email = "your-service-account@bigframes-samples.iam.gserviceaccount.com"


def df_to_gbq_plus_permissions(df):
    """Save ``df`` as a BigQuery table and let the application read it.

    The DataFrame is written with ``df.to_gbq()``. A
    ``roles/bigquery.dataViewer`` binding for the module-level
    ``your_service_account_email`` is then appended to the table's IAM policy,
    and the resulting table ID is handed to ``example_workload``.

    Args:
        df: Object providing ``to_gbq()`` and a ``_session.bqclient``
            attribute (a BigQuery DataFrames DataFrame in this notebook).
    """
    destination = df.to_gbq()

    # TODO(tswast): Expose a public property to get the session from a DataFrame.
    client = df._session.bqclient

    # Fetch the current policy, add the viewer binding, and write it back.
    iam_policy = client.get_iam_policy(destination)
    viewer_member = f"serviceAccount:{your_service_account_email}"
    iam_policy.bindings.append(
        {"role": "roles/bigquery.dataViewer", "members": {viewer_member}}
    )
    client.set_iam_policy(destination, iam_policy)

    # TODO(developer): Pass table_id to your application and start your workload.
    example_workload(destination)


def example_workload(table_id):
    """Stand-in for an application workload that consumes the shared table.

    Reads the rows of ``table_id`` through the BigQuery client library into a
    pandas DataFrame and prints it.

    Args:
        table_id: ID of the table to read, passed to ``Client.list_rows``.
    """
    from google.cloud import bigquery

    # For example, a one-node workload can use the client library to read the
    # table as a pandas DataFrame.
    rows = bigquery.Client().list_rows(table_id)
    print(rows.to_dataframe())


# Run the sample end to end on the DataFrame built earlier in this notebook:
# write the table, add the IAM binding, then print the rows via example_workload.
df_to_gbq_plus_permissions(df)


   index  int_col  float_col string_col
0      0        1     1.0000          a
1      2        3     0.2500          c
2      3        4    -0.1250          d
3      4        5     0.0625          e
4      1        2    -0.5000          b


### Preserving order

Depending on your use case, you may want to include the ordering so that it can be restored within your application.

In [11]:
# Name passed as ordering_id below; presumably the column that to_gbq() adds
# to record row order — confirm against the to_gbq() documentation. The long
# name is meant to avoid colliding with an existing column.
ordering_column = "ordering_id_maybe_with_some_random_text_to_avoid_collisions"
table_id = df.to_gbq(ordering_id=ordering_column)
# Bare last expression so the notebook displays the returned table ID.
table_id

'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240326_581100a67c924da696d72497ca934905'

### Creating clustered tables

Large tables can be optimized by passing in `clustering_columns` to create a [clustered table](https://cloud.google.com/bigquery/docs/clustered-tables).

In [12]:
# Request a table clustered on "index" (the DataFrame's index) and "int_col".
table_id = df.to_gbq(clustering_columns=("index", "int_col"))
# Bare last expression so the notebook displays the returned table ID.
table_id

'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240326_4e9c510e5f784970a50ac03c6b25e0f1'

## Returning a BigQuery DataFrames (bigframes) DataFrame