In [1]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Get hands-on experience with Gemini-powered AI operator APIs in this notebook. We'll start with clear examples of API syntax, ensuring you understand how to use these operators. Then, we'll dive into a real-world application, showcasing their performance on a large dataset and providing key statistics. 

# Preparation

First, import the BigFrames modules.

In [2]:
import bigframes
import bigframes.pandas as bpd

Make sure the BigFrames version is at least `1.38.0`

In [3]:
from packaging.version import Version

assert Version(bigframes.__version__) >= Version("1.38.0")

Set blob to true for testing

In [4]:
bigframes.options.experiments.blob = True

to change in the future.[0m


# PDF chunk

Retrieval of PDF URLs, text extraction, and chunking.

In [5]:
chunks_df = bpd.from_glob_path("gs://shuowei_bucket/pdf/*", name="pdf")

  return func(get_global_session(), *args, **kwargs)


In [6]:
# copy files to genearte more inputs, now we have 1000 PDF files
#copies = [chunks_df] * 20
#chunks_df = bpd.concat(copies, ignore_index=True)
#chunks_df = chunks_df.cache()

In [7]:
# copy files to genearte more inputs, now we have 10,000 PDF files
copies = [chunks_df] * 100
chunks_df = bpd.concat(copies, ignore_index=True)
chunks_df = chunks_df.cache()

In [8]:
# copy files again, now we have 1,000,000 PDF files
copies = [chunks_df] * 100
chunks_df = bpd.concat(copies, ignore_index=True)
chunks_df = chunks_df.cache()

In [9]:
#chunks_df

In [10]:
bq_connection = "bigframes-dev.us.bigframes-default-connection"
chunks_df["chunk_text"] = chunks_df["pdf"].blob.pdf_chunk(
    connection=bq_connection, chunk_size=2000, overlap_size=200,
    max_batching_rows=1, container_cpu=2, container_memory="1Gi")
# notes: use connection is not necessary, we can use default connection.
# However, in current stage, using a specfic conneciton will grant more quota



Explode column for future processing.

In [11]:
chunk_df_exploded = chunks_df["chunk_text"].explode()

Save to a temporary table

In [12]:
chunk_df_exploded = chunk_df_exploded.cache()

In [13]:
chunk_df_exploded.head(5)

0    Integrating Reinforcement Learning, Action Mod...
0    Benyamin)
Preprint submitted to Artificial Int...
0    classical, discrete, environments.
Therefore, ...
0    setting we consider in this work isoffline lea...
0    more complex
problems that required longer-ter...
Name: chunk_text, dtype: string

# Generate Embeddings

Generation of embeddings within BigFrames.

In [14]:
from bigframes.ml import llm

text_embedding_model = llm.TextEmbeddingGenerator(model_name="text-embedding-005")
embeddings_df = text_embedding_model.predict(chunk_df_exploded)

is in preview; this behavior may change in future versions.[0m


detailed status. You may want to filter the failed rows and retry.[0m


Create Embedding table in Bigquery if not exist.

In [15]:
test_project_id = "bigframes-dev"
test_dataset_id = "shuowei_test_us"
test_table_id = "pdf_chunk_embedding"
embedding_table_id = f"{test_project_id}.{test_dataset_id}.{test_table_id}"

Save embedding into a BigQuery table for downstream processing.

In [16]:
embeddings_df.to_gbq(destination_table=embedding_table_id,if_exists="replace")

'bigframes-dev.shuowei_test_us.pdf_chunk_embedding'

# Create vector search index

Construction of an index over these embeddings

In [17]:
import bigframes.bigquery as bbq
bbq.create_vector_index(
    table_id=embedding_table_id,
    column_name="ml_generate_embedding_result",
    distance_type="cosine",
    index_type="ivf",
    ivf_options={"num_lists": 100},
)

BadRequest: 400 GET https://bigquery.googleapis.com/bigquery/v2/projects/bigframes-dev/queries/acfac823-c809-4928-8b1c-132f7f84ea11?maxResults=0&location=US&prettyPrint=false: Column 'ml_generate_embedding_result' must have the same array length, while the minimum length is 0 and the maximum length is 768.

Location: US
Job ID: acfac823-c809-4928-8b1c-132f7f84ea11
 Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey.You are currently running BigFrames version 1.39.0 [{'@type': 'type.googleapis.com/google.rpc.DebugInfo', 'detail': '[INVALID_INPUT] message=QUERY_ERROR: [Column \'ml_generate_embedding_result\' must have the same array length, while the minimum length is 0 and the maximum length is 768.] debug=code: \t BAD_QUERY\ndescription: "Column \\\'ml_generate_embedding_result\\\' must have the same array length, while the minimum length is 0 and the maximum length is 768."\ncause: USER_ERROR\naddress: "http://jdyd1.prod.google.com:4901/task?handle=logs.0.prod-ml-us.server.cloud-dataengine-ml.10584282029591"\nstatus_proto {\n  code: 3\n  space: "generic"\n  message: "Column \\\'ml_generate_embedding_result\\\' must have the same array length, while the minimum length is 0 and the maximum length is 768."\n}\nerror_details {\n  argument_error {\n    query_error {\n    }\n  }\n  debug_info {\n    error_message_template: "Column \\\'$0\\\' must have the same array length, while the minimum length is $1 and the maximum length is $2."\n    error_id: 3839077984\n  }\n}\n errorProto=code: "QUERY_ERROR"\nargument: "Column \\\'ml_generate_embedding_result\\\' must have the same array length, while the minimum length is 0 and the maximum length is 768."\nlocation_type: OTHER\nlocation: "query"\n\n\tat com.google.cloud.helix.common.Exceptions.fromProto(Exceptions.java:1993)\n\tat com.google.cloud.helix.common.dremel.QueryExecutorImpl.mapDremelErrorsTohelixException(QueryExecutorImpl.java:1206)\n\tat com.google.cloud.helix.common.dremel.QueryExecutorImpl$ConfiguredQueryMigration$StreamHandler.onMessage(QueryExecutorImpl.java:766)\n\tat com.google.cloud.helix.common.dremel.QueryExecutorImpl$ConfiguredQueryMigration$StreamHandler.onMessage(QueryExecutorImpl.java:693)\n\tat com.google.net.rpc3.stream.RpcMessageCallback$ForwardingRpcMessageCallback.onMessage(RpcMessageCallback.java:123)\n\tat com.google.net.rpc3.impl.RpcStreamInternalContext.processMessageUnlocked(RpcStreamInternalContext.java:1839)\n\tat com.google.net.rpc3.impl.RpcStreamInternalContext.invokeCallbacksInternalUnlocked(RpcStreamInternalContext.java:2877)\n\tat com.google.net.rpc3.impl.RpcStreamInternalContext.invokeCallbacksUnlocked(RpcStreamInternalContext.java:2801)\n\tat com.google.net.eventmanager.AbstractFutureTask$Sync.innerRun(AbstractFutureTask.java:259)\n\tat com.google.net.eventmanager.AbstractFutureTask.run(AbstractFutureTask.java:120)\n\tat com.google.net.eventmanager.EventManagerImpl.runTaskTraced(EventManagerImpl.java:901)\n\tat com.google.net.eventmanager.EventManagerImpl.runTask(EventManagerImpl.java:893)\n\tat com.google.net.eventmanager.EventManagerImpl.internalRunWorkerLoop(EventManagerImpl.java:1320)\n\tat com.google.net.eventmanager.EventManagerImpl.runWorkerLoop(EventManagerImpl.java:1211)\n\tat com.google.net.eventmanager.WorkerThreadInfo.runWorkerLoop(WorkerThreadInfo.java:153)\n\tat com.google.net.eventmanager.EventManagerImpl$WorkerThread.run(EventManagerImpl.java:2000)\n'}]

# Search with pointers to the original pdf

Execution of vector search, with results linked back to the original PDFs

In [None]:
# generate the embedding of the words for search
searched_words = ["reinforce"]
searched_words_embeddings = text_embedding_model.predict(searched_words)
embedding_result_column = "ml_generate_embedding_result"

In [None]:
# perform vector search
search_result = (
    bbq.vector_search(
        base_table=embedding_table_id,
        column_to_search=embedding_result_column,
        query=searched_words_embeddings,
        query_column_to_search=embedding_result_column,
        top_k=3,
    )
)

In [None]:
search_result