In [11]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# BigFrames Multimodal DataFrame

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/googleapis/python-bigquery-dataframes/blob/main/notebooks/multimodal/multimodal_dataframe.ipynb">
      <img src="https://raw.githubusercontent.com/googleapis/python-bigquery-dataframes/refs/heads/main/third_party/logo/colab-logo.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/googleapis/python-bigquery-dataframes/blob/main/notebooks/multimodal/multimodal_dataframe.ipynb">
      <img src="https://raw.githubusercontent.com/googleapis/python-bigquery-dataframes/refs/heads/main/third_party/logo/github-logo.png" width="32" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/bigquery/import?url=https://github.com/googleapis/python-bigquery-dataframes/blob/main/notebooks/multimodal/multimodal_dataframe.ipynb">
      <img src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTW1gvOovVlbZAIZylUtf5Iu8-693qS1w5NJw&s" alt="BQ logo" width="35">
      Open in BQ Studio
    </a>
  </td>
</table>


This notebook is introducing BigFrames Multimodal features:
1. Create Multimodal DataFrame
2. Combine unstructured data with structured data
3. Conduct image transformations
4. Use LLM models to ask questions and generate embeddings on images
5. PDF chunking function
6. Transcribe audio
7. Extract EXIF metadata from images

### Setup

Install the latest bigframes package if bigframes version < 2.4.0

In [12]:
# !pip install bigframes --upgrade

In [13]:
PROJECT = "bigframes-dev" # replace with your project. 
# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#required_roles for your required permissions

LOCATION = "us" # replace with your location.

# Dataset where the UDF will be created.
DATASET_ID = "bigframes_samples" # replace with your dataset ID.

OUTPUT_BUCKET = "bigframes_blob_test" # replace with your GCS bucket. 
# The connection (or bigframes-default-connection of the project) must have read/write permission to the bucket. 
# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#grant-permissions for setting up connection service account permissions.
# In this Notebook it uses bigframes-default-connection by default. You can also bring in your own connections in each method.

import bigframes
# Setup project
bigframes.options.bigquery.project = PROJECT
bigframes.options.bigquery.location = LOCATION

# Display options
bigframes.options.display.blob_display_width = 300
bigframes.options.display.progress_bar = None

import bigframes.pandas as bpd
import bigframes.bigquery as bbq

In [14]:
import bigframes.bigquery as bbq

def get_runtime_json_str(series, mode="R", with_metadata=False):
    """
    Get the runtime (contains signed URL to access gcs data) and apply the
    ToJSONSTring transformation.
    
    Args:
        series: bigframes.series.Series to operate on.
        mode: "R" for read, "RW" for read/write.
        with_metadata: Whether to fetch and include blob metadata.
    """
    # 1. Optionally fetch metadata
    s = (
        bbq.obj.fetch_metadata(series)
        if with_metadata
        else series
    )
    
    # 2. Retrieve the access URL runtime object
    runtime = bbq.obj.get_access_url(s, mode=mode)
    
    # 3. Convert the runtime object to a JSON string
    return bbq.to_json_string(runtime)

def get_metadata(series):
    # Fetch metadata and extract GCS metadata from the details JSON field
    metadata_obj = bbq.obj.fetch_metadata(series)
    return bbq.json_query(metadata_obj.struct.field("details"), "$.gcs_metadata")

def get_content_type(series):
    return bbq.json_value(get_metadata(series), "$.content_type")

def get_size(series):
    return bbq.json_value(get_metadata(series), "$.size").astype("Int64")

def get_updated(series):
    return bpd.to_datetime(bbq.json_value(get_metadata(series), "$.updated").astype("Int64"), unit="us", utc=True)

def display_blob(series, n=3):
    import IPython.display as ipy_display
    import pandas as pd
    import requests
    
    # Retrieve access URLs and content types
    runtime_json = bbq.to_json_string(bbq.obj.get_access_url(series, mode="R"))
    read_url = bbq.json_value(runtime_json, "$.access_urls.read_url")
    content_type = get_content_type(series)
    
    # Pull to pandas to display
    pdf = bpd.DataFrame({"read_url": read_url, "content_type": content_type}).head(n).to_pandas()
    
    width = bigframes.options.display.blob_display_width
    height = bigframes.options.display.blob_display_height
    
    for _, row in pdf.iterrows():
        if pd.isna(row["read_url"]):
            ipy_display.display("<NA>")
        elif pd.isna(row["content_type"]):
            ipy_display.display(requests.get(row["read_url"]).content)
        elif row["content_type"].casefold().startswith("image"):
            ipy_display.display(ipy_display.Image(url=row["read_url"], width=width, height=height))
        elif row["content_type"].casefold().startswith("audio"):
            ipy_display.display(ipy_display.Audio(requests.get(row["read_url"]).content))
        elif row["content_type"].casefold().startswith("video"):
            ipy_display.display(ipy_display.Video(row["read_url"], width=width, height=height))
        else:
            ipy_display.display(requests.get(row["read_url"]).content)

### 1. Create Multimodal DataFrame
There are several ways to create Multimodal DataFrame. The easiest way is from the wildcard paths.

In [15]:
# Create blob columns from wildcard path.
df_image = bpd.from_glob_path(
    "gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/images/*", name="image"
)

# From an existing object table
# df = bpd.read_gbq_object_table("<my_object_table>", name="blob_col")

In [16]:
# Take only the 5 images to deal with. Preview the content of the Mutimodal DataFrame
df_image = df_image.head(5)
df_image

instead of using `db_dtypes` in the future when available in pandas
(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.
  return prop(*args, **kwargs)


Unnamed: 0,image
0,
1,
2,
3,
4,


### 2. Combine unstructured data with structured data

Now you can put more information into the table to describe the files. Such as author info from inputs, or other metadata from the gcs object itself.

In [17]:
# Combine unstructured data with structured data
df_image = df_image.head(5)
df_image["author"] = ["alice", "bob", "bob", "alice", "bob"]  # type: ignore
df_image["content_type"] = get_content_type(df_image["image"])
df_image["size"] = get_size(df_image["image"])
df_image["updated"] = get_updated(df_image["image"])
df_image

instead of using `db_dtypes` in the future when available in pandas
(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.
  return prop(*args, **kwargs)


Unnamed: 0,image,author,content_type,size,updated
0,,alice,image/png,1591240,2025-03-20 17:45:04+00:00
1,,bob,image/png,1182951,2025-03-20 17:45:02+00:00
2,,bob,image/png,1520884,2025-03-20 17:44:55+00:00
3,,alice,image/png,1235401,2025-03-20 17:45:19+00:00
4,,bob,image/png,1591923,2025-03-20 17:44:47+00:00


Then you can filter the rows based on the structured data. And for different content types, you can display them respectively or together.

In [18]:
# filter images and display, you can also display audio and video types
display_blob(df_image[df_image["author"] == "alice"]["image"])

### 3. Conduct image transformations

This section demonstrates how to perform image transformations like blur, resize, and normalize using custom BigQuery Python UDFs and the `opencv-python` library.

In [19]:
# Construct the canonical connection ID
FULL_CONNECTION_ID = f"{PROJECT}.{LOCATION}.bigframes-default-connection"

@bpd.udf(
    input_types=[str, str, str, int, int, bool],
    output_type=str,
    dataset=DATASET_ID,
    name="image_blur",
    bigquery_connection=FULL_CONNECTION_ID,
    packages=["opencv-python", "numpy", "requests"],
)
def image_blur(src_rt: str, dst_rt: str, ext: str, kx: int, ky: int, verbose: bool) -> str:
    import json
    import cv2 as cv
    import numpy as np
    import requests
    from requests import adapters
    try:
        session = requests.Session()
        session.mount("https://", adapters.HTTPAdapter(max_retries=3))
        src_obj, dst_obj = json.loads(src_rt), json.loads(dst_rt)
        src_url, dst_url = src_obj["access_urls"]["read_url"], dst_obj["access_urls"]["write_url"]
        response = session.get(src_url, timeout=30)
        response.raise_for_status()
        img = cv.imdecode(np.frombuffer(response.content, np.uint8), cv.IMREAD_UNCHANGED)
        kx, ky = int(kx), int(ky)
        img_blurred = cv.blur(img, (kx, ky))
        ext = ext.lower()
        success, encoded = cv.imencode(ext, img_blurred)
        if not success:
            raise ValueError(f"cv.imencode failed for extension {ext}")
        session.put(dst_url, data=encoded.tobytes(), headers={"Content-Type": "image/" + ext.replace(".", "")}, timeout=30).raise_for_status()
        return json.dumps({"status": "", "content": dst_rt}) if verbose else dst_obj["objectref"]["uri"]
    except Exception as e:
        if verbose: return json.dumps({"status": str(e), "content": ""})
        raise e

@bpd.udf(
    input_types=[str, str, str, int, int, float, float, bool],
    output_type=str,
    dataset=DATASET_ID,
    name="image_resize",
    bigquery_connection=FULL_CONNECTION_ID,
    packages=["opencv-python", "numpy", "requests"],
)
def image_resize(src_rt: str, dst_rt: str, ext: str, dx: int, dy: int, fx: float, fy: float, verbose: bool) -> str:
    import json
    import cv2 as cv
    import numpy as np
    import requests
    from requests import adapters
    try:
        session = requests.Session()
        session.mount("https://", adapters.HTTPAdapter(max_retries=3))
        src_obj, dst_obj = json.loads(src_rt), json.loads(dst_rt)
        src_url, dst_url = src_obj["access_urls"]["read_url"], dst_obj["access_urls"]["write_url"]
        response = session.get(src_url, timeout=30)
        response.raise_for_status()
        img = cv.imdecode(np.frombuffer(response.content, np.uint8), cv.IMREAD_UNCHANGED)
        img_resized = cv.resize(img, dsize=(dx, dy), fx=fx, fy=fy)
        success, encoded = cv.imencode(ext, img_resized)
        session.put(dst_url, data=encoded.tobytes(), headers={"Content-Type": "image/" + ext.replace(".", "")}, timeout=30).raise_for_status()
        return json.dumps({"status": "", "content": dst_rt}) if verbose else dst_obj["objectref"]["uri"]
    except Exception as e:
        if verbose: return json.dumps({"status": str(e), "content": ""})
        raise e

@bpd.udf(
    input_types=[str, str, str, float, float, str, bool],
    output_type=str,
    dataset=DATASET_ID,
    name="image_normalize",
    bigquery_connection=FULL_CONNECTION_ID,
    packages=["opencv-python", "numpy", "requests"],
)
def image_normalize(src_rt: str, dst_rt: str, ext: str, alpha: float, beta: float, norm_type: str, verbose: bool) -> str:
    import json
    import cv2 as cv
    import numpy as np
    import requests
    from requests import adapters
    try:
        session = requests.Session()
        session.mount("https://", adapters.HTTPAdapter(max_retries=3))
        src_obj, dst_obj = json.loads(src_rt), json.loads(dst_rt)
        src_url, dst_url = src_obj["access_urls"]["read_url"], dst_obj["access_urls"]["write_url"]
        response = session.get(src_url, timeout=30)
        response.raise_for_status()
        img = cv.imdecode(np.frombuffer(response.content, np.uint8), cv.IMREAD_UNCHANGED)
        norm_map = {"inf": cv.NORM_INF, "l1": cv.NORM_L1, "l2": cv.NORM_L2, "minmax": cv.NORM_MINMAX}
        img_normalized = cv.normalize(img, None, alpha=alpha, beta=beta, norm_type=norm_map[norm_type])
        success, encoded = cv.imencode(ext, img_normalized)
        session.put(dst_url, data=encoded.tobytes(), headers={"Content-Type": "image/" + ext.replace(".", "")}, timeout=30).raise_for_status()
        return json.dumps({"status": "", "content": dst_rt}) if verbose else dst_obj["objectref"]["uri"]
    except Exception as e:
        if verbose: return json.dumps({"status": str(e), "content": ""})
        raise e

def apply_transformation(series, dst_folder, udf, *args, verbose=False):
    import os
    dst_folder = os.path.join(dst_folder, "")
    # Fetch metadata to get the URI
    metadata = bbq.obj.fetch_metadata(series)
    current_uri = metadata.struct.field("uri")
    dst_uri = current_uri.str.replace(r"^.*\/(.*)$", rf"{dst_folder}\1", regex=True)
    dst_blob = dst_uri.str.to_blob(connection=FULL_CONNECTION_ID)
    df_transform = bpd.DataFrame({
        "src_rt": get_runtime_json_str(series, mode="R"),
        "dst_rt": get_runtime_json_str(dst_blob, mode="RW"),
        "ext": dst_uri.str.extract(r"(\.[0-9a-zA-Z]+$)")[0]
    })
    res = df_transform.apply(udf, axis=1, args=(*args, verbose))
    return res if verbose else res.str.to_blob(connection=FULL_CONNECTION_ID)

# Apply transformations
df_image["blurred"] = apply_transformation(df_image["image"], f"gs://{OUTPUT_BUCKET}/image_blur_transformed/", image_blur, 20, 20)
df_image["resized"] = apply_transformation(df_image["image"], f"gs://{OUTPUT_BUCKET}/image_resize_transformed/", image_resize, 300, 200, 0.0, 0.0)
df_image["normalized"] = apply_transformation(df_image["image"], f"gs://{OUTPUT_BUCKET}/image_normalize_transformed/", image_normalize, 50.0, 150.0, "minmax")

  return global_session.with_default_session(


### You can also chain functions together

Now you can put more information into the table to describe the files. Such as author info from inputs, or other metadata from the gcs object itself.

In [20]:
df_image["blurred_verbose"] = apply_transformation(df_image["image"], f"gs://{OUTPUT_BUCKET}/image_blur_transformed_verbose/", image_blur, 20, 20, verbose=True)
df_image[["blurred_verbose"]]



Unnamed: 0,blurred_verbose
0,"{""status"": """", ""content"": ""{\""access_urls\"":{\..."
1,"{""status"": """", ""content"": ""{\""access_urls\"":{\..."
2,"{""status"": """", ""content"": ""{\""access_urls\"":{\..."
3,"{""status"": """", ""content"": ""{\""access_urls\"":{\..."
4,"{""status"": """", ""content"": ""{\""access_urls\"":{\..."


Then you can filter the rows based on the structured data. And for different content types, you can display them respectively or together.

In [21]:
# filter images and display, you can also display audio and video types
display_blob(df_image[df_image["author"] == "alice"]["image"])

### 3. Conduct image transformations
BigFrames Multimodal DataFrame provides image(and other) transformation functions. Such as image_blur, image_resize and image_normalize. The output can be saved to GCS folders or to BQ as bytes.

In [22]:
df_image["blurred"] = apply_transformation(df_image["image"], f"gs://{OUTPUT_BUCKET}/image_blur_transformed/", image_blur, 20, 20)
df_image["resized"] = apply_transformation(df_image["image"], f"gs://{OUTPUT_BUCKET}/image_resize_transformed/", image_resize, 300, 200, 0.0, 0.0)
df_image["normalized"] = apply_transformation(df_image["image"], f"gs://{OUTPUT_BUCKET}/image_normalize_transformed/", image_normalize, 50.0, 150.0, "minmax")



In [23]:
# You can also chain functions together
df_image["blur_resized"] = apply_transformation(df_image["blurred"], f"gs://{OUTPUT_BUCKET}/image_blur_resize_transformed/", image_resize, 300, 200, 0.0, 0.0)



### Using `verbose` mode for detailed output

All multimodal functions support a `verbose` parameter, which defaults to `False`.

*   When `verbose=False` (the default), the function will only return the main content of the result (e.g., the transformed image, the extracted text).
*   When `verbose=True`, the function returns a `STRUCT` containing two fields:
    *   `content`: The main result of the operation.
    *   `status`: An informational field. If the operation is successful, this will be empty. If an error occurs during the processing of a specific row, this field will contain the error message, allowing the overall job to complete without failing.

Using `verbose=True` is highly recommended for debugging and for workflows where you need to handle potential failures on a row-by-row basis. Let's see it in action with the `image_blur` function.

In [24]:
df_image["blurred_verbose"] = apply_transformation(df_image["image"], f"gs://{OUTPUT_BUCKET}/image_blur_transformed_verbose/", image_blur, 20, 20, verbose=True)
df_image[["blurred_verbose"]]



Unnamed: 0,blurred_verbose
0,"{""status"": """", ""content"": ""{\""access_urls\"":{\..."
1,"{""status"": """", ""content"": ""{\""access_urls\"":{\..."
2,"{""status"": """", ""content"": ""{\""access_urls\"":{\..."
3,"{""status"": """", ""content"": ""{\""access_urls\"":{\..."
4,"{""status"": """", ""content"": ""{\""access_urls\"":{\..."


In [25]:
df_image

instead of using `db_dtypes` in the future when available in pandas
(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.
  return prop(*args, **kwargs)
  return prop(*args, **kwargs)
  return prop(*args, **kwargs)
  return prop(*args, **kwargs)
  return prop(*args, **kwargs)


Unnamed: 0,image,author,content_type,size,updated,blurred,resized,normalized,blurred_verbose,blur_resized
0,,alice,image/png,1591240,2025-03-20 17:45:04+00:00,,,,"{""status"": """", ""content"": ""{\""access_urls\"":{\""expiry_time\"":\""2026-02-14T02:58:13Z\"",\""read_url\"":\""https://storage.googleapis.com/bigframes_blob_test/image_blur_transformed_verbose%2Fk9-guard-dog-paw-balm.png?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=bqcx-1084210331973-pcbl%40gcp-sa-bigquery-condel.iam.gserviceaccount.com%2F20260213%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20260213T205813Z&X-Goog-Expires=21600&X-Goog-SignedHeaders=host&X-Goog-Signature=8d2b714a76ac46c5af4a505f0867b0fcc3bd56b5190382ca42565dbd00e0a6e8aca32d8ac1db6003b847b92af4fcf6f2d2bf1175b3f5e804ed1c5505a095e4698462782a0695009c377d3c1d8ca00855d96bdeffc1b26c4d0cc4c3a6b58e343cb83c8eb0fcb206288be9b97b4759f2c40947f59f7d4816ad344192b18be3ce1a03e3cffed83d39f66b3572e542e7886ca46e5e8fd67b4ad2072ff3ab05cbc66bbb619109011ca70323237e3c97178d56c416bbf8c544b8beebfb53c5a3d9cf1513efd6b6edad603e41f21cb8375d25cddbc524a39213cf9d0da518292b6cbddce6a31468f554a6aa20e40752fb00b50b49fdabe77fa474e8fcb8e0b97280736e\"",\""write_url\"":\""https://storage.googleapis.com/bigframes_blob_test/image_blur_transformed_verbose%2Fk9-guard-dog-paw-balm.png?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=bqcx-1084210331973-pcbl%40gcp-sa-bigquery-condel.iam.gserviceaccount.com%2F20260213%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20260213T205813Z&X-Goog-Expires=21600&X-Goog-SignedHeaders=host&X-Goog-Signature=60dfe615382ea90d6af20427f885980794fe943f09b15457f1f26ed353f59b4abfcd0805578e9ba6c0b1688d1307c439b8294a3f74d2b0b43e89fc23d4b8802e06c713565394d6a962edbac9b874f309edde8c26c95c79d36d71f66d99dcae13d3da56044a4b9e995def8adfca7ce113e8c359cee784a3b99f582bace1271d4e0f9832fa2935d6cc847c7ffe99244257b3aa0920264027d8089566406c99792c19f933bb55db5b8f46d7755fc6aa868a2e9441b44c52839d99dd2ccc826a06c5b5d0b8575c4e836d2036e58d68dcaf41a7aefd9c3fbaa9b57bd6d6f20de38cbec71e66ec8f77c8470c69a8c001f36dd0a47717db408cc3ea7b747bec084a4d1f\""},\""objectref\"":{\""authorizer\"":\""bigframes-dev.us.bigframes-default-connection\"",\""uri\"":\""gs://bigframes_blob_test/image_blur_transformed_verbose/k9-guard-dog-paw-balm.png\""}}""}",
1,,bob,image/png,1182951,2025-03-20 17:45:02+00:00,,,,"{""status"": """", ""content"": ""{\""access_urls\"":{\""expiry_time\"":\""2026-02-14T02:58:13Z\"",\""read_url\"":\""https://storage.googleapis.com/bigframes_blob_test/image_blur_transformed_verbose%2Fk9-guard-dog-hot-spot-spray.png?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=bqcx-1084210331973-pcbl%40gcp-sa-bigquery-condel.iam.gserviceaccount.com%2F20260213%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20260213T205813Z&X-Goog-Expires=21600&X-Goog-SignedHeaders=host&X-Goog-Signature=0eb022977eec931fe33382899ed9f31ea6bc3b49b772a7e9f79a76e0a22c16267591fb28589a42dfe297361730acbd5391c2b1a28c07584e2be7f2e911eeae824f43dd85930a6f7f4ca07a0dba27af4be7750b000a4a2a961409d9cf6d9ab694cb2f5ef9b5747cbd569bb0689e384d3064636bc8a1c68a7a6a20aa5511d4fd51f496be35257b801c841f797a4a295c35827af35c6d2329676273248f101a02bb3a329561bcd5a8be6655f93b034a6338f3828e117b14e7786068887272ba42ff5f24f87c5f8b4a819a810af1d5c67d0e3c50fe0b26715603e74edadf5fc571d92f4a251758921cb3c5136286d621087eb2ceafb7a4a97eea6ae2670031cccebb\"",\""write_url\"":\""https://storage.googleapis.com/bigframes_blob_test/image_blur_transformed_verbose%2Fk9-guard-dog-hot-spot-spray.png?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=bqcx-1084210331973-pcbl%40gcp-sa-bigquery-condel.iam.gserviceaccount.com%2F20260213%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20260213T205813Z&X-Goog-Expires=21600&X-Goog-SignedHeaders=host&X-Goog-Signature=3b15d6570ebfd1dfbe8a6b0025a7eb80c9d7f6747a27c44586099f291d98ea40dcea90d6fd282fec0a9ce3e609372918e51b10c93515920642bc629129779e9dbd8dceef8cc72dac96e9da344cf44e0f5f1de4f6ba8a4e4c8fa4851df9203dd5904068d7b519b4740638ceade4f0bd70ac2f0f33cc7cd7770ea6363142c3d68c8ac59e845919f2f053d2818218416bff2edabf37d60f7954f6d55520a979700cbf61c186f21a607af48fbdeefb38953711d919efd3e4d5caee5fad36dbe798469f3c6bdd4d806cd7730ea448eee78fe94d3465528e2fe4bdc91369fd6de38f8c1d16bace9c4c898a91fc8c342fe9eb6c09b2f35aaed36910cedbda75605ea1b1\""},\""objectref\"":{\""authorizer\"":\""bigframes-dev.us.bigframes-default-connection\"",\""uri\"":\""gs://bigframes_blob_test/image_blur_transformed_verbose/k9-guard-dog-hot-spot-spray.png\""}}""}",
2,,bob,image/png,1520884,2025-03-20 17:44:55+00:00,,,,"{""status"": """", ""content"": ""{\""access_urls\"":{\""expiry_time\"":\""2026-02-14T02:58:13Z\"",\""read_url\"":\""https://storage.googleapis.com/bigframes_blob_test/image_blur_transformed_verbose%2Ffluffy-buns-chinchilla-food-variety-pack.png?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=bqcx-1084210331973-pcbl%40gcp-sa-bigquery-condel.iam.gserviceaccount.com%2F20260213%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20260213T205813Z&X-Goog-Expires=21600&X-Goog-SignedHeaders=host&X-Goog-Signature=19c6bdad06410bb165f07c62722ea7bc2aa417422c84b588bd63a24074bc8cfa9a0d01c7ed44203ca5ba4a368cacd7425f3525d0bb6db02eaada53711ad0f065865d2ce3c0e93d12218e887f6449b510e0b8fecf6c2ba27968b6b3c79d7453cfc514306bd9e07e745892d630a2b33a66be8eafaffea22a7fc79abd1526ca24efde5bfa1c467da87c8f8b86174cb37e8792e546659242b01ea82972bb5469cc5941862e4a222cdd57ae62d928eacdb20301ad049f44b6f97b35e0275a90b84af2a4e320a411908b16da0e1de7be894dbc1c88d3c828fd3924c359b61bfcdfc527a2e6338c44e46df68258ea714a84f800b3554d691c8e1f00965b6db8a130d39d\"",\""write_url\"":\""https://storage.googleapis.com/bigframes_blob_test/image_blur_transformed_verbose%2Ffluffy-buns-chinchilla-food-variety-pack.png?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=bqcx-1084210331973-pcbl%40gcp-sa-bigquery-condel.iam.gserviceaccount.com%2F20260213%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20260213T205813Z&X-Goog-Expires=21600&X-Goog-SignedHeaders=host&X-Goog-Signature=29eb2181bb94a9a2d33f02f8c1a00d98a399cfeb6fc344e47188e441fb5d5866f3ff67d79219f0bc8649918e08eb55cf9018aed588255428f25712570908ab4d7bdd5c855c5d9d472d749154cb6a1535d466ddf134f9cf373816fb511c77f19cc19421b8d20a19c3cfc79f197e09b0d9b9dcc8bdb02e823256f142a9b95a6b9425ed1b7696dc282ae814fb594e45e62c493334c4e3628836728fe396f33cc092fccd762d71e560223f488b76874cf8a21ee3dbaf2fe23faaab4062e3f42086bc9c46845e913bd8becd63e3877b1cfd2a806f85c082add4f34532bf91623b8eed95e902721516df4ba8fe715581ccdfbb0c69dc56652beee1434d08126064fae2\""},\""objectref\"":{\""authorizer\"":\""bigframes-dev.us.bigframes-default-connection\"",\""uri\"":\""gs://bigframes_blob_test/image_blur_transformed_verbose/fluffy-buns-chinchilla-food-variety-pack.png\""}}""}",
3,,alice,image/png,1235401,2025-03-20 17:45:19+00:00,,,,"{""status"": """", ""content"": ""{\""access_urls\"":{\""expiry_time\"":\""2026-02-14T02:58:13Z\"",\""read_url\"":\""https://storage.googleapis.com/bigframes_blob_test/image_blur_transformed_verbose%2Fpurrfect-perch-cat-scratcher.png?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=bqcx-1084210331973-pcbl%40gcp-sa-bigquery-condel.iam.gserviceaccount.com%2F20260213%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20260213T205813Z&X-Goog-Expires=21600&X-Goog-SignedHeaders=host&X-Goog-Signature=6d400c9b92869c7a979ff223125518ec9f7fb37197e25b7bfbc0e0376830ba344a1f5b3a9f105d45faf9138b2e67e18f778a144a6cf4bb832d73efdd49003c95c50123c4de9bde28c3b70695bfbfb655d16c2aa92a6fad861d071af509202b82b71e99e0aa992367fb266bda24074acbcfc7201b087d4fb62b9c42d47482406ad9b87dda7ca803274fceca26d7e20e899ba6b6c4a07f0af67ed5b12f7af72d311777dd99e276bda3e3b30e3d723e70d0bd4d6a53abdb02f1108319d6270ee9e5fe30c3782f31f33c6fafce71fb533609358ad970e7b6d35b8486700ba3e7d7f7bfb9fa2f13d5fe0edd83a8201837288d919c3da7ff967e0a1340233de4ee4bf2\"",\""write_url\"":\""https://storage.googleapis.com/bigframes_blob_test/image_blur_transformed_verbose%2Fpurrfect-perch-cat-scratcher.png?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=bqcx-1084210331973-pcbl%40gcp-sa-bigquery-condel.iam.gserviceaccount.com%2F20260213%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20260213T205813Z&X-Goog-Expires=21600&X-Goog-SignedHeaders=host&X-Goog-Signature=4f6c06e462a7faa5f1f3a341ba115d84cdb8a674f58d9678f3c6b1e69c668259d144dbe59b6a73739758a0080fe7931e13026f0346df9eecb3f4f808deb199e7b2edcb7c6d3933e591b9689cad95fe85a02aabc98a59c80fcd695fc655f9501840eebbcf1ebf03433dc9370985599c14a65f8f9b763f14571977688d5da6e56b6979cec3200499dbd8a81fe58786558257e93dfc159a866878468a36cbb8abecce2c4845ab930862f5cd74dce148fd5103279361fc0343735cda980311222c0710236002648d6b6b735d68e2fded9e1c96501a8f080a2c4e913183e752b61912ea1ab3f09586ef1e2564744a5f5d477526b7eb49702c173752cd15c91b0a283d\""},\""objectref\"":{\""authorizer\"":\""bigframes-dev.us.bigframes-default-connection\"",\""uri\"":\""gs://bigframes_blob_test/image_blur_transformed_verbose/purrfect-perch-cat-scratcher.png\""}}""}",
4,,bob,image/png,1591923,2025-03-20 17:44:47+00:00,,,,"{""status"": """", ""content"": ""{\""access_urls\"":{\""expiry_time\"":\""2026-02-14T02:58:13Z\"",\""read_url\"":\""https://storage.googleapis.com/bigframes_blob_test/image_blur_transformed_verbose%2Fchirpy-seed-deluxe-bird-food.png?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=bqcx-1084210331973-pcbl%40gcp-sa-bigquery-condel.iam.gserviceaccount.com%2F20260213%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20260213T205813Z&X-Goog-Expires=21600&X-Goog-SignedHeaders=host&X-Goog-Signature=ae50ee928cb422e14117cd4e6743c5501ec897036ee52e285ef18a65cb083f50c34fb7f7b859df5ef9cc05a522726cb89a86270a7b48a02082ec4de96fa351340b4b07bd5cd2b097aa2d86a9272ddb0ab61725041e7ddfeb5c3851df473ab3f9dbbcaa0956847f7da76ee2706b5b999e02c7d4058d66566573cbc9e15e6bf8c7ff92ef7470ad4730ab3214d33447a2fa29ac3f12909b9e68f77ab9e879b24b320d16ec7ebddd29cf482f79bfc62d06ea623868f6b79fc40b912efa78d81f1a8c3b368ca7f3b30881bc269e9a7ccc91ad312a41ff378c808e5ae9016711333708d1f87b090dce503ba37da918b0b0a776554375a9cf5be61a934c3f186e24da10\"",\""write_url\"":\""https://storage.googleapis.com/bigframes_blob_test/image_blur_transformed_verbose%2Fchirpy-seed-deluxe-bird-food.png?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=bqcx-1084210331973-pcbl%40gcp-sa-bigquery-condel.iam.gserviceaccount.com%2F20260213%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20260213T205813Z&X-Goog-Expires=21600&X-Goog-SignedHeaders=host&X-Goog-Signature=0046282ccd9e822e87f190d24935b0adab7f38ed6ecfd4c4c0b0c6c9049267684796306f0058e3d8aaf91c166500c25bed719d5953989262050faec7b4dd1f9d3e1c6a71ecdcf45a0e92e61cf962010a2268770a8e75d68849ba98742c874f7e49034fedb01fc1e5d8f37321b36f66f386b6d1de964de10977f6df5896b47451219b65c4f9a254855216402bacd2f88ebdc473dd95502d51f1b8b2b198ea1b1bb845bcd0eefa498f5e6486457d10e4d636c864d7c184bb03a03f66407401e556897109caeda270b32792f2ddff82d8ce3841b77ad993209803f2abaae7e8f78a3c4cfa9eca0ef9d7b920b79419572bb1c794759ef87b8b90b718f6454b8c4f74\""},\""objectref\"":{\""authorizer\"":\""bigframes-dev.us.bigframes-default-connection\"",\""uri\"":\""gs://bigframes_blob_test/image_blur_transformed_verbose/chirpy-seed-deluxe-bird-food.png\""}}""}",


### 4. Use LLM models to ask questions and generate embeddings on images

In [26]:
from bigframes.ml import llm
gemini = llm.GeminiTextGenerator()

default model will be removed in BigFrames 3.0. Please supply an
explicit model to avoid this message.
  return method(*args, **kwargs)


In [27]:
# Ask the same question on the images
answer = gemini.predict(df_image, prompt=["what item is it?", df_image["image"]])
answer[["ml_generate_text_llm_result", "image"]]

instead of using `db_dtypes` in the future when available in pandas
(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.
  return prop(*args, **kwargs)
instead of using `db_dtypes` in the future when available in pandas
(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.
  return prop(*args, **kwargs)


Unnamed: 0,ml_generate_text_llm_result,image
0,"Based on the image, the item is a tin of dog paw balm. It's labeled ""K9Guard Dog Paw Balm.""",
1,The item is a bottle of K9 Guard Dog Hot Spot Spray.,
2,"The item is rabbit food or treats from ""Fluffy Buns"". There are three varieties: ""Timoth Hay Lend Variety Blend"", ""Herbal Greeıs Mix Variety Blend"", and ""Berry & Blossom Treat Blend"".\n",
3,The item is a cat tree.\n,
4,"The item is a bag of ""Chirpy Seed"" Deluxe Bird Food.",


In [28]:
# Ask different questions
df_image["question"] = [
    "what item is it?",
    "what color is the picture?",
    "what is the product name?",
    "is it for pets?",
    "what is the weight of the product?",
]

In [29]:
answer_alt = gemini.predict(df_image, prompt=[df_image["question"], df_image["image"]])
answer_alt[["ml_generate_text_llm_result", "image"]]

instead of using `db_dtypes` in the future when available in pandas
(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.
  return prop(*args, **kwargs)
instead of using `db_dtypes` in the future when available in pandas
(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.
  return prop(*args, **kwargs)


Unnamed: 0,ml_generate_text_llm_result,image
0,The item is a tin of K9 Guard Dog Paw Balm.,
1,"The picture has multiple colors, including white, light blue, black, and green. The background is a light gray.\n",
2,Here are the three product names that are visible in the image:\n\n1. **Timothy Hay Blend Variety Blend**\n2. **Herbal Greens Mix Variety Blend**\n3. **Berry & Blossom Treat Blend**,
3,"Yes, the item in the image is a cat tree, which is a type of furniture designed for pets, specifically cats.",
4,The net weight of the product is 15 oz or 257g.,


In [30]:
# Generate embeddings.
embed_model = llm.MultimodalEmbeddingGenerator()
embeddings = embed_model.predict(df_image["image"])
embeddings

default model will be removed in BigFrames 3.0. Please supply an
explicit model to avoid this message.
  return method(*args, **kwargs)
instead of using `db_dtypes` in the future when available in pandas
(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.
  return prop(*args, **kwargs)
instead of using `db_dtypes` in the future when available in pandas
(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.


Unnamed: 0,ml_generate_embedding_result,ml_generate_embedding_status,ml_generate_embedding_start_sec,ml_generate_embedding_end_sec,content
0,[ 0.00638822 0.01666385 0.00451817 ... -0.02...,,,,"{""access_urls"":{""expiry_time"":""2026-02-14T03:0..."
1,[ 0.00973689 0.02148374 0.00244311 ... 0.00...,,,,"{""access_urls"":{""expiry_time"":""2026-02-14T03:0..."
2,[ 0.01197331 0.02138491 0.05967776 ... -0.01...,,,,"{""access_urls"":{""expiry_time"":""2026-02-14T03:0..."
3,[-0.02621007 0.02797794 0.04416854 ... -0.01...,,,,"{""access_urls"":{""expiry_time"":""2026-02-14T03:0..."
4,[ 0.05918613 0.01251376 0.01907326 ... 0.01...,,,,"{""access_urls"":{""expiry_time"":""2026-02-14T03:0..."


### 5. PDF extraction and chunking function

This section demonstrates how to extract text and chunk text from PDF files using custom BigQuery Python UDFs and the `pypdf` library.

In [31]:
# Construct the canonical connection ID
FULL_CONNECTION_ID = f"{PROJECT}.{LOCATION}.bigframes-default-connection"

@bpd.udf(
    input_types=[str],
    output_type=str,
    dataset=DATASET_ID,
    name="pdf_extract",
    bigquery_connection=FULL_CONNECTION_ID,
    packages=["pypdf", "requests", "cryptography"],
)
def pdf_extract(src_obj_ref_rt: str) -> str:
    import io
    import json
    from pypdf import PdfReader
    import requests
    from requests import adapters
    session = requests.Session()
    session.mount("https://", adapters.HTTPAdapter(max_retries=3))
    src_obj_ref_rt_json = json.loads(src_obj_ref_rt)
    src_url = src_obj_ref_rt_json["access_urls"]["read_url"]
    response = session.get(src_url, timeout=30, stream=True)
    response.raise_for_status()
    pdf_bytes = response.content
    pdf_file = io.BytesIO(pdf_bytes)
    reader = PdfReader(pdf_file, strict=False)
    all_text = ""
    for page in reader.pages:
        page_extract_text = page.extract_text()
        if page_extract_text:
            all_text += page_extract_text
    return all_text

@bpd.udf(
    input_types=[str, int, int],
    output_type=list[str],
    dataset=DATASET_ID,
    name="pdf_chunk",
    bigquery_connection=FULL_CONNECTION_ID,
    packages=["pypdf", "requests", "cryptography"],
)
def pdf_chunk(src_obj_ref_rt: str, chunk_size: int, overlap_size: int) -> list[str]:
    import io
    import json
    from pypdf import PdfReader
    import requests
    from requests import adapters
    session = requests.Session()
    session.mount("https://", adapters.HTTPAdapter(max_retries=3))
    src_obj_ref_rt_json = json.loads(src_obj_ref_rt)
    src_url = src_obj_ref_rt_json["access_urls"]["read_url"]
    response = session.get(src_url, timeout=30, stream=True)
    response.raise_for_status()
    pdf_bytes = response.content
    pdf_file = io.BytesIO(pdf_bytes)
    reader = PdfReader(pdf_file, strict=False)
    all_text_chunks = []
    curr_chunk = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            curr_chunk += page_text
            while len(curr_chunk) >= chunk_size:
                split_idx = curr_chunk.rfind(" ", 0, chunk_size)
                if split_idx == -1:
                    split_idx = chunk_size
                actual_chunk = curr_chunk[:split_idx]
                all_text_chunks.append(actual_chunk)
                overlap = curr_chunk[split_idx + 1 : split_idx + 1 + overlap_size]
                curr_chunk = overlap + curr_chunk[split_idx + 1 + overlap_size :]
    if curr_chunk:
        all_text_chunks.append(curr_chunk)
    return all_text_chunks

  return global_session.with_default_session(


In [32]:
df_pdf = bpd.from_glob_path("gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*", name="pdf")

# Generate a JSON string containing the runtime information (including signed read URLs)
access_urls = get_runtime_json_str(df_pdf["pdf"], mode="R")

# Apply PDF extraction
df_pdf["extracted_text"] = access_urls.apply(pdf_extract)

# Apply PDF chunking
df_pdf["chunked"] = access_urls.apply(pdf_chunk, args=(2000, 200))

df_pdf[["extracted_text", "chunked"]]

Unnamed: 0,extracted_text,chunked
0,CritterCuisine Pro 5000 - Automatic Pet Feeder...,"[""CritterCuisine Pro 5000 - Automatic Pet Feed..."


In [33]:
# Explode the chunks to see each chunk as a separate row
chunked = df_pdf["chunked"].explode()
chunked

### 6. Audio transcribe

In [34]:
audio_gcs_path = "gs://bigframes_blob_test/audio/*"
df = bpd.from_glob_path(audio_gcs_path, name="audio")

In [35]:
# The audio_transcribe function is a convenience wrapper around bigframes.bigquery.ai.generate.
# Here's how to perform the same operation directly:

audio_series = df['audio']
prompt_text = (
    "**Task:** Transcribe the provided audio. **Instructions:** - Your response "
    "must contain only the verbatim transcription of the audio. - Do not include "
    "any introductory text, summaries, or conversational filler in your response. "
    "The output should begin directly with the first word of the audio."
)

# Convert the audio series to the runtime representation required by the model.
# This involves fetching metadata and getting a signed access URL.
audio_metadata = bbq.obj.fetch_metadata(audio_series)
audio_runtime = bbq.obj.get_access_url(audio_metadata, mode="R")

transcribed_results = bbq.ai.generate(
    prompt=(prompt_text, audio_runtime),
    endpoint="gemini-2.0-flash-001",
    model_params={"generationConfig": {"temperature": 0.0}},
)

transcribed_series = transcribed_results.struct.field("result").rename("transcribed_content")
transcribed_series

instead of using `db_dtypes` in the future when available in pandas
(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.


In [36]:
# To get verbose results (including status), we can extract both fields from the result struct.
transcribed_content_series = transcribed_results.struct.field("result")
transcribed_status_series = transcribed_results.struct.field("status")

transcribed_series_verbose = bpd.DataFrame(
    {
        "status": transcribed_status_series,
        "content": transcribed_content_series,
    }
)
# Package as a struct for consistent display
transcribed_series_verbose = bbq.struct(transcribed_series_verbose).rename("transcription_results")
transcribed_series_verbose

### 7. Extract EXIF metadata from images

This section demonstrates how to extract EXIF metadata from images using a custom BigQuery Python UDF and the `Pillow` library.

In [37]:
# Construct the canonical connection ID
FULL_CONNECTION_ID = f"{PROJECT}.{LOCATION}.bigframes-default-connection"

@bpd.udf(
    input_types=[str],
    output_type=str,
    dataset=DATASET_ID,
    name="extract_exif",
    bigquery_connection=FULL_CONNECTION_ID,
    packages=["pillow", "requests"],
    max_batching_rows=8192,
    container_cpu=0.33,
    container_memory="512Mi"
)
def extract_exif(src_obj_ref_rt: str) -> str:
    import io
    import json
    from PIL import ExifTags, Image
    import requests
    from requests import adapters
    session = requests.Session()
    session.mount("https://", adapters.HTTPAdapter(max_retries=3))
    src_obj_ref_rt_json = json.loads(src_obj_ref_rt)
    src_url = src_obj_ref_rt_json["access_urls"]["read_url"]
    response = session.get(src_url, timeout=30)
    bts = response.content
    image = Image.open(io.BytesIO(bts))
    exif_data = image.getexif()
    exif_dict = {}
    if exif_data:
        for tag, value in exif_data.items():
            tag_name = ExifTags.TAGS.get(tag, tag)
            exif_dict[tag_name] = value
    return json.dumps(exif_dict)

  return global_session.with_default_session(


In [38]:
# Create a Multimodal DataFrame from the sample image URIs
exif_image_df = bpd.from_glob_path(
    "gs://bigframes_blob_test/images_exif/*",
    name="blob_col",
)

# Generate a JSON string containing the runtime information (including signed read URLs)
# This allows the UDF to download the images from Google Cloud Storage
access_urls = get_runtime_json_str(exif_image_df["blob_col"], mode="R")

# Apply the BigQuery Python UDF to the runtime JSON strings
# We cast to string to ensure the input matches the UDF's signature
exif_json = access_urls.astype(str).apply(extract_exif)

# Parse the resulting JSON strings back into a structured JSON type for easier access
exif_data = bbq.parse_json(exif_json)

exif_data

change in future versions.
