In [1]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Get hands-on experience with Gemini-powered AI operator APIs in this notebook. We'll start with clear examples of API syntax, ensuring you understand how to use these operators. Then, we'll dive into a real-world application, showcasing their performance on a large dataset and providing key statistics. 

# Preparation

First, import the BigFrames modules.

In [2]:
import bigframes
import bigframes.pandas as bpd

Make sure the BigFrames version is at least `1.36.0`

In [3]:
from packaging.version import Version

assert Version(bigframes.__version__) >= Version("1.36.0")

Connect to test environmet

In [4]:
bigframes.options.experiments.blob = True
bigframes.options._bigquery_options.client_endpoints_override = {"bqclient": "https://test-bigquery.sandbox.google.com", 
                                                           "bqconnectionclient": "test-bigqueryconnection.sandbox.googleapis.com", 
                                                           "bqstoragereadclient": "test-bigquerystorage-grpc.sandbox.googleapis.com"}

to change in the future.[0m
endpoints. Incorrect use may lead to unexpected behavior or system
instability. Proceed only if you fully understand its implications.[0m


# PDF chunk

Retrieval of PDF URLs

In [5]:
chunks_df = bpd.from_glob_path("gs://shuowei_bucket/pdf/*")
chunks_df.columns = ["uri"]

  return func(get_global_session(), *args, **kwargs)


In [6]:
#chunks_df = chunks_df.head(50)

In [7]:
# copy twice for testing
#copies = [chunks_df] * 10000
#chunks_df = bpd.concat(copies, ignore_index=True)

Text extraction, and chunking

In [8]:
bq_connection = "bigframes-dev.us.bigframes-default-connection"
chunks_df["chunk_text"] = chunks_df["uri"].blob.pdf_chunk(connection=bq_connection, chunk_size=2000, overlap_size=200)



Explode column for future processing.

In [9]:
chunk_df_exploded = chunks_df["chunk_text"].explode()

In [10]:
chunk_df_exploded.cache()

0    Hydra: Bidirectional State Space Models
Throug...
0    multiple domains, including language and visio...
0    Mixing
��ℳ��&
Figure 1: (Left) A schematic of ...
0    parameterizations underpin efficient sequence ...
0    more
coherent and theoretically grounded advan...
0    important characteristics of downstream sequen...
0    preprocessing function and the matrix construc...
0    Sequence Aligned Matrices (SAM) to
systematica...
0    Toeplitz matrix mixer; GSS [26] adds a data-de...
0    (FNet is a structured matrix mixer without seq...
0    each generated fromQand K. Specifically, each
...
0    ��$"��$:&×"��&"��$"��$:'×"��'"
��&"��&:!×"��!"...
0    represented within the matrix mixer framework,...
0    defined
as follows: a matrixM is N-quasisepara...
0    This generosity in the rank-based definition s...
0    consequence of the favorable mathematical prop...
0    84.1 88.2 69.1 91.0 85.9 47.6 83.9 78.4
Attent...
0    analyzing the matrix mixer framework through e...
0    rigor

# Generate Embeddings

Generation of embeddings within BigFrames.

In [11]:
from bigframes.ml import llm

text_embedding_model = llm.TextEmbeddingGenerator(model_name="text-embedding-005")

In [12]:
# generate embeddings
embeddings_df = text_embedding_model.predict(chunk_df_exploded)

is in preview; this behavior may change in future versions.[0m


Create Embedding table in Bigquery if not exist.

In [13]:
test_project_id = "bigframes-dev"
test_dataset_id = "shuowei_test_us"
test_table_id = "pdf_chunk_embedding_v10"
embedding_table_id = f"{test_project_id}.{test_dataset_id}.{test_table_id}"

Save embedding into a BigQuery table for downstream processing..

In [14]:

embeddings_df.to_gbq(destination_table=embedding_table_id,if_exists="replace")

'bigframes-dev.shuowei_test_us.pdf_chunk_embedding_v10'

# Create vector search index

Construction of an index over these embeddings

In [15]:
import bigframes.bigquery as bbq
bbq.create_vector_index(
    table_id=embedding_table_id_v11,
    column_name="ml_generate_embedding_result",
    distance_type="cosine",
    index_type="ivf",
    ivf_options={"num_lists": 100},
)

NameError: name 'embedding_table_id_v11' is not defined

# Search with pointers to the original pdf

Execution of semantic search, with results linked back to the original PDFs

In [None]:
# temp test code, reada from gbq
embeddings_df = bpd.read_gbq(embedding_table_id)
embedding_table_id_v11 = "bigframes-dev.shuowei_test_us.pdf_chunk_embedding_v11"
# copy twice for testing
copies = [embeddings_df] * 5
embeddings_df= bpd.concat(copies, ignore_index=True)
type(embeddings_df)
embeddings_df.to_gbq(destination_table=embedding_table_id_v11, if_exists="replace")

In [None]:
bigframes.options.experiments.semantic_operators = True

embeddings_df.semantics.search(
    "ml_generate_embedding_result", 
    "reinforce", 
    top_k=3, 
    model=text_embedding_model, 
    score_column="distance")

In [None]:
# generate embedding for the word for searching
searched_words = ["reinforce"]
searched_words_embeddings = text_embedding_model.predict(searched_words)


In [None]:

search_query = bpd.DataFrame({"query_id": ["dog", "cat"], embedding=})
result_df = bbq.vector_search(
    base_table=
    column_to_search=
    query=search_query,
    distance_type="cosine",
    top_k=5,
)