In [1]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Get hands-on experience with Gemini-powered AI operator APIs in this notebook. We'll start with clear examples of API syntax, ensuring you understand how to use these operators. Then, we'll dive into a real-world application, showcasing their performance on a large dataset and providing key statistics. 

# Preparation

First, import the BigFrames modules.

In [1]:
import bigframes
import bigframes.pandas as bpd

Make sure the installed BigFrames version is at least `1.38.0`.

In [2]:
from packaging.version import Version

# Fail fast, with an actionable message, when the installed BigFrames is older
# than the minimum version this notebook requires.
assert Version(bigframes.__version__) >= Version("1.38.0"), (
    f"BigFrames >= 1.38.0 is required, found {bigframes.__version__}"
)

Enable the experimental `blob` option for testing.

In [3]:
# Switch on the experimental blob option before any of the `.blob` accessor
# calls made later in this notebook.
bigframes.options.experiments.blob = True



# PDF chunk

Retrieval of PDF URLs, text extraction, and chunking.

In [4]:
# BigQuery connection handed to the PDF chunking call below.
bq_connection = "bigframes-dev.us.bigframes-default-connection"

# Build a DataFrame from the objects matching the glob and rename its column to "uri".
chunks_df = bpd.from_glob_path("gs://garrettwu_bucket/pdfs/*")
chunks_df.columns = ["uri"]

# Chunk every PDF using the chunk/overlap sizes and batching limit given here.
chunks_df["chunk_text"] = chunks_df["uri"].blob.pdf_chunk(
    connection=bq_connection,
    chunk_size=2000,
    overlap_size=200,
    max_batching_rows=1,
)

# Explode the chunk column, then cache the exploded result.
chunk_df_exploded = chunks_df["chunk_text"].explode()
chunk_df_exploded.cache()

  _global_session = bigframes.session.connect(


Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=262006177488-ka1m0ue4fptfmt9siejdd5lom7p39upa.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fpydata-google-auth.readthedocs.io%2Fen%2Flatest%2Foauth.html&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform&state=GE3CiB2iPQ32Mbcgug2H68pdMulb7j&prompt=consent&access_type=offline


ValueError: Project must be set to initialize BigQuery client. Try setting `bigframes.options.bigquery.project` first.

In [None]:
# Build a DataFrame from the objects matching the glob, passing name="pdf";
# later cells read the "pdf" column of this frame.
chunks_df = bpd.from_glob_path("gs://shuowei_bucket/pdf/*", name="pdf")

  return func(get_global_session(), *args, **kwargs)


In [5]:
# Keep only the first 5 rows of chunks_df.
# The 20x replication that was used to generate more inputs is disabled:
#copies = [chunks_df] * 20
#chunks_df = bpd.concat(copies, ignore_index=True)
#chunks_df = chunks_df.cache()
chunks_df = chunks_df.head(5)

In [None]:
# Replicate the rows 100x to generate more inputs, then cache the result.
# (The resulting row count depends on how many rows chunks_df holds when this runs.)
copies = [chunks_df for _ in range(100)]
chunks_df = bpd.concat(copies, ignore_index=True).cache()

In [None]:
# Replicate the rows again, this time 10x, and cache the result.
# (The resulting row count depends on how many rows chunks_df holds when this runs.)
copies = [chunks_df for _ in range(10)]
chunks_df = bpd.concat(copies, ignore_index=True).cache()

In [6]:
# Note: passing a connection is not required, since the default connection can be
# used. At the current stage, however, a specific connection is granted more quota.
bq_connection = "bigframes-dev.us.bigframes-default-connection"
chunks_df["chunk_text"] = chunks_df["pdf"].blob.pdf_chunk(connection=bq_connection)



Explode column for future processing.

In [7]:
# Explode the "chunk_text" column of chunks_df and keep the result as
# chunk_df_exploded for the following cells.
chunk_df_exploded = chunks_df["chunk_text"].explode()

Cache the exploded chunks by saving them to a temporary table.

In [8]:
# Cache the exploded chunks and continue with the object returned by cache().
chunk_df_exploded = chunk_df_exploded.cache()

In [9]:
# Display the exploded chunks as the cell output.
chunk_df_exploded

0    Integrating Reinforcement Learning, Action Mod...
0    Benyamin)
Preprint submitted to Artificial Int...
0    classical, discrete, environments.
Therefore, ...
0    setting we consider in this work isoffline lea...
0    more complex
problems that required longer-ter...
0    domain models for planning, and RL. We also pr...
0    means that a planning domain defines parameter...
0    which actions to perform in
order to collect n...
0    these
assumptions, NSAM is guaranteed to retur...
0    policy.
Off-policy algorithms are algorithms t...
0    the
environment, mining resources, collecting ...
0    must:
1. Harvest at least one wood block from ...
0    irreversible and the amount of resources in a ...
0    created by observing an expert solve different...
0    Moreover, most actions are TP TO actions, whic...
0    our RL models. Moreover, our gym environment i...
0    within that time limit,
we consider the run as...
0    length.
4https://imitation.readthedocs.io
5htt...
0    plann

# Generate Embeddings

Generation of embeddings within BigFrames.

In [None]:
from bigframes.ml import llm

# Create the embedding generator backed by the "text-embedding-005" model.
text_embedding_model = llm.TextEmbeddingGenerator(
    model_name="text-embedding-005",
)

# Run the model over the exploded chunks to obtain their embeddings.
embeddings_df = text_embedding_model.predict(chunk_df_exploded)

Define the ID of the BigQuery table that will hold the embeddings.

In [15]:
# Components of the destination table, joined below as project.dataset.table.
test_project_id = "bigframes-dev"
test_dataset_id = "shuowei_test_us"
test_table_id = "pdf_chunk_embedding"
embedding_table_id = ".".join((test_project_id, test_dataset_id, test_table_id))

Save the embeddings to a BigQuery table for downstream processing.

In [None]:
# Write the embeddings to the destination table, replacing it if it already exists.
embeddings_df.to_gbq(
    destination_table=embedding_table_id,
    if_exists="replace",
)

# Create vector search index

Construction of an index over these embeddings

In [None]:
import bigframes.bigquery as bbq

# Build an IVF vector index (cosine distance, 100 lists) over the embedding
# column of the table written above.
bbq.create_vector_index(
    table_id=embedding_table_id,
    column_name="ml_generate_embedding_result",
    index_type="ivf",
    ivf_options=dict(num_lists=100),
    distance_type="cosine",
)

# Search with pointers to the original pdf

Execution of vector search, with results linked back to the original PDFs

In [None]:
# Generate the embedding of the words to search for.
searched_words = ["reinforce"]

# predict() takes a DataFrame or Series rather than a plain Python list, so the
# search terms are wrapped in a BigFrames Series before being embedded.
searched_words_embeddings = text_embedding_model.predict(
    bpd.Series(searched_words, name="content")
)

# Name of the embedding column, used for both the stored table and the query
# when running the vector search.
embedding_result_column = "ml_generate_embedding_result"

In [None]:
# Perform the vector search: for each query embedding, return the 3 nearest
# rows of the embedding table. The distance metric is pinned to cosine so the
# search agrees with the metric the vector index was created with.
search_result = bbq.vector_search(
    base_table=embedding_table_id,
    column_to_search=embedding_result_column,
    query=searched_words_embeddings,
    query_column_to_search=embedding_result_column,
    top_k=3,
    distance_type="cosine",
)

In [None]:
# Display the vector search result as the cell output.
search_result