In [1]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Get hands-on experience with Gemini-powered AI operator APIs in this notebook. We'll start with clear examples of API syntax, ensuring you understand how to use these operators. Then, we'll dive into a real-world application, showcasing their performance on a large dataset and providing key statistics. 

# Preparation

First, import the BigFrames modules.

In [2]:
import bigframes
import bigframes.pandas as bpd

Make sure the BigFrames version is at least `1.38.0`

In [3]:
from packaging.version import Version

# Fail fast on an outdated installation, and report the installed version so
# the reader knows what to upgrade from instead of seeing a bare AssertionError.
assert Version(bigframes.__version__) >= Version("1.38.0"), (
    f"This notebook requires BigFrames >= 1.38.0, but found {bigframes.__version__}."
)

Connect to the test environment.

In [4]:
# Turn on the experimental blob feature flag.
# NOTE(review): presumably required for the `.blob` accessor used later — confirm.
bigframes.options.experiments.blob = True

# Point the BigQuery, BigQuery Connection and BigQuery Storage Read clients at
# the sandbox (test) endpoints.
# NOTE(review): `_bigquery_options` is a private attribute — confirm there is no
# public equivalent before relying on it outside of testing.
bigframes.options._bigquery_options.client_endpoints_override = {
    "bqclient": "https://test-bigquery.sandbox.google.com",
    "bqconnectionclient": "test-bigqueryconnection.sandbox.googleapis.com",
    "bqstoragereadclient": "test-bigquerystorage-grpc.sandbox.googleapis.com",
}



# PDF chunk

Retrieval of PDF URLs, text extraction, and chunking.

In [5]:
# Build a DataFrame with a single "pdf" column referencing the objects that
# match the Cloud Storage glob pattern.
chunks_df = bpd.from_glob_path("gs://shuowei_bucket/pdf/*", name="pdf")

  return func(get_global_session(), *args, **kwargs)


In [6]:
# copy files to generate more inputs, now we have 1000 PDF files
#copies = [chunks_df] * 20
#chunks_df = bpd.concat(copies, ignore_index=True)
#chunks_df = chunks_df.cache()

In [7]:
# Scale up the input by stacking 100 copies of the current frame on top of each
# other (the original note puts the result at 10,000 PDF files — TODO confirm
# against the number of source files).
# NOTE: this cell reassigns chunks_df from itself, so re-running it multiplies
# the row count again.
copies = [chunks_df for _ in range(100)]
chunks_df = bpd.concat(copies, ignore_index=True).cache()

In [8]:
# copy files again, now we have 1,000,000 PDF files
copies = [chunks_df] * 100
chunks_df = bpd.concat(copies, ignore_index=True)
chunks_df = chunks_df.cache()

In [9]:
# Display the replicated frame of PDF references.
chunks_df

Unnamed: 0,pdf
0,"uri: gs://shuowei_bucket/pdf/NeurIPS-2024-hydra-bidirectional-state-space-models-through-generalized-matrix-mixers-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection"
1,"uri: gs://shuowei_bucket/pdf/NeurIPS-2023-neural-latent-geometry-search-product-manifold-inference-via-gromov-hausdorff-informed-bayesian-optimization-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection"
2,"uri: gs://shuowei_bucket/pdf/NeurIPS-2024-a-robust-inlier-identification-algorithm-for-point-cloud-registration-via-mathbfell_0-minimization-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection"
3,"uri: gs://shuowei_bucket/pdf/NeurIPS-2024-can-an-ai-agent-safely-run-a-government-existence-of-probably-approximately-aligned-policies-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection"
4,"uri: gs://shuowei_bucket/pdf/2502.12961v1.pdf, authorizer: bigframes-dev.us.bigframes-default-connection"
5,"uri: gs://shuowei_bucket/pdf/NeurIPS-2024-inexact-augmented-lagrangian-methods-for-conic-optimization-quadratic-growth-and-linear-convergence-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection"
6,"uri: gs://shuowei_bucket/pdf/NeurIPS-2024-predicting-the-performance-of-foundation-models-via-agreement-on-the-line-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection"
7,"uri: gs://shuowei_bucket/pdf/NeurIPS-2024-prediction-with-action-visual-policy-learning-via-joint-denoising-process-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection"
8,"uri: gs://shuowei_bucket/pdf/NeurIPS-2023-look-ma-no-hands-agent-environment-factorization-of-egocentric-videos-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection"
9,"uri: gs://shuowei_bucket/pdf/NeurIPS-2024-cross-scale-self-supervised-blind-image-deblurring-via-implicit-neural-representation-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection"


In [10]:
# Using an explicit connection is not strictly necessary (the default connection
# would also work), but at the current stage a specific connection is granted
# more quota.
bq_connection = "bigframes-dev.us.bigframes-default-connection"

# Extract and chunk the text of every PDF; the result is stored per row in the
# new "chunk_text" column (presumably a list of chunks, since it is exploded
# later — TODO confirm).
chunks_df["chunk_text"] = chunks_df["pdf"].blob.pdf_chunk(
    connection=bq_connection,
    chunk_size=2000,
    overlap_size=200,
    max_batching_rows=1,
    container_cpu=2,
    container_memory="1Gi",
)



Explode column for future processing.

In [11]:
# Take only the "chunk_text" column and explode it; despite the "df" in its
# name, the result comes from a single column (presumably a Series — TODO confirm).
chunk_df_exploded = chunks_df["chunk_text"].explode()

Save to a temporary table

In [12]:
# Cache the exploded chunks before the downstream steps.
# NOTE(review): the recorded run of this cell failed with a backend
# CONNECTION_ERROR whose message says retrying the job may solve the problem.
chunk_df_exploded = chunk_df_exploded.cache()

BadRequest: 400 GET https://test-bigquery.sandbox.google.com/bigquery/v2/projects/bigframes-dev/queries/e671bba2-377c-45b9-9947-44f1914fae4e?maxResults=0&location=US&prettyPrint=false: The job encountered an error during execution. Retrying the job may solve the problem.

Location: US
Job ID: e671bba2-377c-45b9-9947-44f1914fae4e
 Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey.You are currently running BigFrames version 1.38.0 [{'@type': 'type.googleapis.com/google.rpc.DebugInfo', 'detail': '[CONNECTION_ERROR] debug=Dremel returned an error: generic::UNAVAILABLE: Reached maximum number of retriable errors. errorProto=code: "CONNECTION_ERROR"\n\n\tat com.google.cloud.helix.common.Exceptions$Public.connectionError(Exceptions.java:776)\n\tat com.google.cloud.helix.common.Exceptions$Public.connectionError(Exceptions.java:780)\n\tat com.google.cloud.helix.server.job.DremelErrorUtil.createHelixErrorFromDremelRpcException(DremelErrorUtil.java:60)\n\tat com.google.cloud.helix.common.dremel.QueryExecutorImpl$ConfiguredQueryMigration$StreamHandler.onMessage(QueryExecutorImpl.java:783)\n\tat com.google.cloud.helix.common.dremel.QueryExecutorImpl$ConfiguredQueryMigration$StreamHandler.onMessage(QueryExecutorImpl.java:697)\n\tat com.google.net.rpc3.stream.RpcMessageCallback$ForwardingRpcMessageCallback.onMessage(RpcMessageCallback.java:123)\n\tat com.google.net.rpc3.impl.RpcStreamInternalContext.processMessageUnlocked(RpcStreamInternalContext.java:1839)\n\tat com.google.net.rpc3.impl.RpcStreamInternalContext.invokeCallbacksInternalUnlocked(RpcStreamInternalContext.java:2877)\n\tat com.google.net.rpc3.impl.RpcStreamInternalContext.invokeCallbacksUnlocked(RpcStreamInternalContext.java:2801)\n\tat com.google.net.eventmanager.AbstractFutureTask$Sync.innerRun(AbstractFutureTask.java:259)\n\tat com.google.net.eventmanager.AbstractFutureTask.run(AbstractFutureTask.java:120)\n\tat com.google.net.eventmanager.EventManagerImpl.runTaskTraced(EventManagerImpl.java:901)\n\tat com.google.net.eventmanager.EventManagerImpl.runTask(EventManagerImpl.java:893)\n\tat com.google.net.eventmanager.EventManagerImpl.internalRunWorkerLoop(EventManagerImpl.java:1320)\n\tat com.google.net.eventmanager.EventManagerImpl.runWorkerLoop(EventManagerImpl.java:1211)\n\tat 
com.google.net.eventmanager.WorkerThreadInfo.runWorkerLoop(WorkerThreadInfo.java:153)\n\tat com.google.net.eventmanager.EventManagerImpl$WorkerThread.run(EventManagerImpl.java:2000)\n\tSuppressed: java.lang.Exception: Including call stack from HelixFutures\n\t\tat com.google.cloud.helix.common.HelixFutures.getHelixException(HelixFutures.java:76)\n\t\tat com.google.cloud.helix.common.HelixFutures.getDone(HelixFutures.java:55)\n\t\tat com.google.cloud.helix.server.job.LocalQueryJobController.handleQueryDone(LocalQueryJobController.java:2626)\n\t\tat com.google.cloud.helix.server.job.LocalQueryJobController.lambda$runJob$1(LocalQueryJobController.java:2539)\n\t\tat com.google.common.util.concurrent.CombinedFuture$CallableInterruptibleTask.runInterruptibly(CombinedFuture.java:196)\n\t\tat com.google.common.util.concurrent.InterruptibleTask.run(InterruptibleTask.java:74)\n\t\tat com.google.common.context.ContextRunnable.runInContext(ContextRunnable.java:83)\n\t\tat io.grpc.Context.run(Context.java:536)\n\t\tat com.google.tracing.GenericContextCallback.runInInheritedContext(GenericContextCallback.java:78)\n\t\tat com.google.common.context.ContextRunnable.run(ContextRunnable.java:74)\n\t\tat java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)\n\t\tat java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)\n\t\tat java.base/java.lang.Thread.run(Unknown Source)\n'}]

In [None]:
# Display the exploded text chunks.
chunk_df_exploded

# Generate Embeddings

Generation of embeddings within BigFrames.

In [None]:
from bigframes.ml import llm

# Create a text embedding generator backed by the "text-embedding-005" model
# and run it over the exploded text chunks.
text_embedding_model = llm.TextEmbeddingGenerator(model_name="text-embedding-005")
embeddings_df = text_embedding_model.predict(chunk_df_exploded)

Define the ID of the BigQuery table that will hold the embeddings.

In [10]:
# Components of the destination table for the embeddings.
test_project_id = "bigframes-dev"
test_dataset_id = "shuowei_test_us"
test_table_id = "pdf_chunk_embedding"

# Fully qualified table ID in "project.dataset.table" form.
embedding_table_id = ".".join((test_project_id, test_dataset_id, test_table_id))

Save the embeddings to a BigQuery table for downstream processing.

In [None]:
# Write the embeddings to the destination table, replacing it if it already exists.
embeddings_df.to_gbq(
    destination_table=embedding_table_id,
    if_exists="replace",
)

# Create vector search index

Construction of an index over these embeddings

In [None]:
import bigframes.bigquery as bbq
# Create an IVF vector index (cosine distance, num_lists=100) on the
# "ml_generate_embedding_result" column of the saved embedding table.
bbq.create_vector_index(
    table_id=embedding_table_id,
    column_name="ml_generate_embedding_result",
    distance_type="cosine",
    index_type="ivf",
    ivf_options={"num_lists": 100},
)

# Search with pointers to the original pdf

Execution of vector search, with results linked back to the original PDFs

In [None]:
# generate the embedding of the words for search
searched_words = ["reinforce"]
searched_words_embeddings = text_embedding_model.predict(searched_words)
embedding_result_column = "ml_generate_embedding_result"

In [None]:
# perform vector search
search_result = (
    bbq.vector_search(
        base_table=embedding_table_id,
        column_to_search=embedding_result_column,
        query=searched_words_embeddings,
        query_column_to_search=embedding_result_column,
        top_k=3,
    )
)

In [None]:
# Display the vector search matches.
search_result