In [1]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# BigFrames LLM Output Schema

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/googleapis/python-bigquery-dataframes/blob/main/notebooks/generative_ai/bq_dataframes_llm_output_schema.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/googleapis/python-bigquery-dataframes/blob/main/notebooks/generative_ai/bq_dataframes_llm_output_schema.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/bigquery/import?url=https://github.com/googleapis/python-bigquery-dataframes/blob/main/notebooks/generative_ai/bq_dataframes_llm_output_schema.ipynb">
      <img src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTW1gvOovVlbZAIZylUtf5Iu8-693qS1w5NJw&s" alt="BQ logo" width="35">
      Open in BQ Studio
    </a>
  </td>
</table>


This Notebook introduces BigFrames LLM with output schema to generate structured output dataframes.

### Setup

In [2]:
PROJECT = "bigframes-dev" # replace with your project

import bigframes
# Setup project
bigframes.options.bigquery.project = PROJECT
bigframes.options.display.progress_bar = None

import bigframes.pandas as bpd
from bigframes.ml import llm

### 1. Create a BigFrames DataFrame and a Gemini model
Starting from creating a simple dataframe of several cities and a Gemini model in BigFrames

In [3]:
df = bpd.DataFrame({"city": ["Seattle", "New York", "Shanghai"]})
df

  _global_session = bigframes.session.connect(


Unnamed: 0,city
0,Seattle
1,New York
2,Shanghai


In [4]:
gemini = llm.GeminiTextGenerator()

default model will be removed in BigFrames 3.0. Please supply an
explicit model to avoid this message.
  return method(*args, **kwargs)


### 2. Generate structured output data
Before, llm models can only generate text output. Saying if you want to know whether the city is a US city, for example:

In [5]:
result = gemini.predict(df, prompt=[df["city"], "is a US city?"])
result[["city", "ml_generate_text_llm_result"]]

`db_dtypes` is a preview feature and subject to change.


Unnamed: 0,city,ml_generate_text_llm_result
0,Seattle,"Yes, Seattle is a city in the United States. I..."
1,New York,"Yes, New York City is a city in the United Sta..."
2,Shanghai,"No, Shanghai is not a US city. It is a major c..."


The outputs are text results that human can read. But if want the output data to be more useful for analysis, it is better to transfer to structured data like boolean, int or float values. Usually the process wasn't easy.

Now you can get structured output out-of-the-box by specifying the output_schema parameter in Gemini model predict method. In below example, the outputs are only boolean values.

In [6]:
result = gemini.predict(df, prompt=[df["city"], "is a US city?"], output_schema={"is_us_city": "bool"})
result[["city", "is_us_city"]]

`db_dtypes` is a preview feature and subject to change.


Unnamed: 0,city,is_us_city
0,Seattle,True
1,New York,True
2,Shanghai,False


You can also get float or int values, for example, to get populations in millions:

In [7]:
result = gemini.predict(df, prompt=["what is the population in millions of", df["city"]], output_schema={"population_in_millions": "float64"})
result[["city", "population_in_millions"]]

`db_dtypes` is a preview feature and subject to change.


Unnamed: 0,city,population_in_millions
0,Seattle,0.75
1,New York,19.68
2,Shanghai,26.32


And yearly rainy days:

In [8]:
result = gemini.predict(df, prompt=["how many rainy days per year in", df["city"]], output_schema={"rainy_days": "int64"})
result[["city", "rainy_days"]]

`db_dtypes` is a preview feature and subject to change.


Unnamed: 0,city,rainy_days
0,Seattle,152
1,New York,123
2,Shanghai,123


### 3. Generate all types of data in one prediction
You can get the different output columns and types in one prediction. 

Note it doesn't require dedicated prompts, as long as the output column names are informative to the model.

In [9]:
result = gemini.predict(df, prompt=[df["city"]], output_schema={"is_US_city": "bool", "population_in_millions": "float64", "rainy_days_per_year": "int64"})
result[["city", "is_US_city", "population_in_millions", "rainy_days_per_year"]]

`db_dtypes` is a preview feature and subject to change.


Unnamed: 0,city,is_US_city,population_in_millions,rainy_days_per_year
0,Seattle,True,0.75,152
1,New York,True,8.8,121
2,Shanghai,False,26.32,115


### 4. Generate composite data types

Composite datatypes like array and struct can also be generated. Here the example generates a places_to_visit column as array of strings and a gps_coordinates as struct of floats. Along with previous fields, all in one prediction.

In [10]:
result = gemini.predict(df, prompt=[df["city"]], output_schema={"is_US_city": "bool", "population_in_millions": "float64", "rainy_days_per_year": "int64", "places_to_visit": "array<string>", "gps_coordinates": "struct<latitude float64, longitude float64>"})
result[["city", "is_US_city", "population_in_millions", "rainy_days_per_year", "places_to_visit", "gps_coordinates"]]

`db_dtypes` is a preview feature and subject to change.


Unnamed: 0,city,is_US_city,population_in_millions,rainy_days_per_year,places_to_visit,gps_coordinates
0,Seattle,True,0.74,150,['Space Needle' 'Pike Place Market' 'Museum of...,"{'latitude': 47.6062, 'longitude': -122.3321}"
1,New York,True,8.4,121,['Times Square' 'Central Park' 'Statue of Libe...,"{'latitude': 40.7128, 'longitude': -74.006}"
2,Shanghai,False,26.32,115,['The Bund' 'Yu Garden' 'Shanghai Museum' 'Ori...,"{'latitude': 31.2304, 'longitude': 121.4737}"
