In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# BigQuery DataFrames: Synthetic Data Generation

In addition to BigQuery DataFrames (installing it also installs `pandas` as a dependency), we will use
the `faker` library as a building block for synthetic data generation.

In [None]:
!pip install faker

Collecting faker
  Downloading Faker-24.9.0-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faker
Successfully installed faker-24.9.0


In [2]:
import os

import bigframes.pandas as bpd

# PROJECT_ID is not defined by any other cell in this notebook, so on a fresh
# kernel the assignment below used to fail with a NameError. Reuse a value that
# is already present in the kernel namespace; otherwise fall back to the
# GOOGLE_CLOUD_PROJECT environment variable.
PROJECT_ID = globals().get("PROJECT_ID") or os.environ.get("GOOGLE_CLOUD_PROJECT")
if not PROJECT_ID:
  raise ValueError(
      "Set PROJECT_ID (or the GOOGLE_CLOUD_PROJECT environment variable) "
      "to your Google Cloud project id before running this notebook."
  )

# Project used by BigQuery DataFrames for the rest of the notebook.
bpd.options.bigquery.project = PROJECT_ID

Let's use `GeminiTextGenerator` for our purpose, which is BigQuery DataFrames' state-of-the-art LLM integration at the time of writing this notebook (Apr 16 2024).

In [3]:
from bigframes.ml.llm import GeminiTextGenerator

# Create the text generator with its default settings. `model` is the handle
# the later cells call `predict` on to send the prompt to the LLM.
model = GeminiTextGenerator()

  return Session(context)


Craft a prompt for the LLM to indicate the schema of the desired data and hints for the code that could generate such data. 

In [5]:
# Prompt describing the desired schema and the constraints on the generated
# code. The backslash right after the opening quotes keeps the prompt from
# starting with an empty line.
# NOTE: the "Generate 100 rows" figure here must stay in sync with the
# `batch_size` value used later when sizing the indicator dataframe.
prompt = """\
Write python code to generate a pandas dataframe based on the requirements:
  Column name: Name, type: string, Description: Latin American Names
  Column name: Age, type: int
  Column name: Gender, type: string, Description: Inclusive

Note:
  - Return the code only, no additional texts or comments
  - Use faker library
  - Generate 100 rows
  - The final dataframe should be named 'result_df'.
"""

# Wrap the prompt in a one-row dataframe with a single "prompt" column; this is
# the object handed to `model.predict` in the retry loop.
df_prompt = bpd.DataFrame({"prompt" : [prompt]})
df_prompt

Unnamed: 0,prompt
0,Write python code to generate a pandas datafra...


Keep in mind that the LLM may not produce runnable code on the first attempt and may need some nudging. We will retry, adding the failing code and the exception it raised as additional context in the prompt.

In [6]:
import re

max_tries = 5


def extract_python_code(llm_text):
  """Return the Python source contained in an LLM response.

  The model usually answers with a markdown code block (```python ... ```),
  but leading/trailing whitespace, a missing language tag or extra text
  around the fence are all possible, so fixed-offset slicing is fragile.

  Args:
    llm_text: Raw text returned by the model.

  Returns:
    The contents of the first fenced code block if one is found, otherwise
    the whole response with surrounding whitespace stripped.
  """
  fenced = re.search(
      r"```[ \t]*(?:python|py)?[ \t]*\r?\n(.*?)```",
      llm_text,
      flags=re.DOTALL | re.IGNORECASE,
  )
  if fenced:
    return fenced.group(1)
  return llm_text.strip()


# Accumulate the retry context in a plain string instead of mutating
# `df_prompt` in place, so re-running this cell always starts from the
# original prompt.
attempt_prompt = prompt
last_error = None
for attempt in range(max_tries):
  # Get LLM generated code
  df_attempt = bpd.DataFrame({"prompt": [attempt_prompt]})
  df_result = model.predict(df_attempt)
  llm_result = df_result['ml_generate_text_llm_result'].iloc[0]

  # Python code normally comes back as a markdown code block; keep only the code
  code = extract_python_code(llm_result)
  print(code)

  # Check if the generated code is runnable.
  # NOTE(security): exec() runs model-generated code with the privileges of
  # this kernel. Review the printed code before using it outside a sandbox.
  try:
    # Use a scratch namespace so the trial run does not leak names into the
    # notebook's globals.
    trial_namespace = {}
    exec(code, trial_namespace)
    # Later cells rely on the dataframe being exposed under this exact name.
    if "result_df" not in trial_namespace:
      raise NameError("The code did not define a dataframe named 'result_df'")
    break
  except Exception as ex:
    print(ex)
    # `ex` is unbound once the except block ends, so keep a reference for
    # exception chaining below.
    last_error = ex

    # Update the prompt to help LLM correct error
    attempt_prompt += f"""
Previous code:
{code}

Had this exception:
{ex}"""
else:
  # The loop finished without a successful run: we have exhausted max tries
  raise RuntimeError("Failed to generate runnable code") from last_error


import pandas as pd
from faker import Faker

fake = Faker('es_ES')
result_df = pd.DataFrame({
    'Name': [fake.name() for _ in range(100)],
    'Age': [fake.random_int(min=18, max=65) for _ in range(100)],
    'Gender': [fake.random_element(elements=['Male', 'Female', 'Non-binary']) for _ in range(100)]
})



Run the generated code and verify that it produced the desired data.

In [7]:
# Execute the generated code with a dedicated dict as its globals, so the
# names it defines are collected there rather than in the notebook namespace.
execution_context = {}
exec(code, execution_context)
# The prompt asked for the final dataframe to be named 'result_df';
# .get() yields None if the generated code did not define that name.
execution_context.get("result_df")

Unnamed: 0,Name,Age,Gender
0,Pastora Acuña Company,21,Male
1,León Reig-Salom,39,Non-binary
2,Aura Tomás Llobet,30,Female
3,Vicente Correa Palomar,64,Female
4,Benito del Fuster,34,Female
...,...,...,...
95,Eduardo Cabrera,27,Non-binary
96,Nazaret de Izaguirre,40,Non-binary
97,Manuela Agullo Bustamante,27,Female
98,Eugenio Mateo Naranjo Blazquez,36,Non-binary


We want to run this code at scale, since we want to generate a large amount of data. Let's deploy a `remote_function` for this purpose.

In [8]:
@bpd.remote_function([int], str, packages=['faker', 'pandas'])
def data_generator(id):
  """Run the generated code once and return its dataframe as JSON.

  Args:
    id: Integer input required by the declared signature; the body does not
      read it.

  Returns:
    The `result_df` produced by the generated code, serialized as a JSON
    string with one object per record.
  """
  # Fresh namespace for every call, so each run of the generated code starts
  # with no names carried over from a previous one.
  namespace = {}
  exec(code, namespace)
  return namespace.get("result_df").to_json(orient="records")

# Display the `bigframes_cloud_function` attribute of the deployed function.
data_generator.bigframes_cloud_function

'projects/bigframes-dev/locations/us-central1/functions/bigframes-19f2f35637098969770261a2974bef32'

Let’s say we want to generate 1 million rows of synthetic data. Since our generated code produces 100 rows in one run, we can initialize an indicator dataframe with 1M/100 = 10K indicator rows. Then we can apply the remote function to produce 100 synthetic data rows for each indicator row.

In [10]:
desired_num_rows = 1_000_000 # 1 million rows
batch_size = 100 # used in the prompt
# Integer ceiling division: avoids the float round-trip of int(a / b) and
# rounds up, so at least `desired_num_rows` rows are generated even when the
# total is not an exact multiple of `batch_size`.
num_batches = (desired_num_rows + batch_size - 1) // batch_size

# Indicator dataframe: one row per batch, with `row_id` running from 0 to
# num_batches - 1. Each row becomes one invocation of the remote function.
df = bpd.DataFrame({"row_id": range(num_batches)})

In [11]:
# Apply the remote function to every `row_id` and store the returned JSON
# string in a new `json_data` column (this adds the column to `df` in place).
df["json_data"] = df["row_id"].apply(data_generator)

At this point each item in `df["json_data"]` is a JSON-serialized array of 100 records. Let’s flatten that into one record per row using a direct SQL query.

In [12]:
# Build the flattening query:
#   T0 - the SQL behind `df`, embedded via `df.sql`
#   T1 - one row per element of the JSON array stored in `json_data`
# The final SELECT projects Name, Age and Gender as typed columns.
sql = f"""
WITH T0 AS ({df.sql}),
T1 AS (
  SELECT PARSE_JSON(json_row) AS json_row
  FROM T0, UNNEST(JSON_EXTRACT_ARRAY(json_data)) AS json_row
)
SELECT STRING(json_row.Name) AS Name,
       INT64(json_row.Age) AS Age,
       STRING(json_row.Gender) AS Gender
FROM T1
"""
# NOTE: `df_result` was bound to the LLM response in the retry loop earlier in
# the notebook; from here on it refers to the flattened synthetic data.
df_result = bpd.read_gbq(sql)
df_result

Unnamed: 0,Name,Age,Gender
0,Eloy Santiago-Aragón,31,Male
1,Amanda Mata Abril,20,Non-binary
2,Danilo Velázquez Salcedo,58,Male
3,Leyre Alba España,61,Female
4,Paulina Amores Pastor,41,Male
5,Jorge Cuadrado Mena,50,Female
6,Chucho Catalán,36,Non-binary
7,Vidal Benavente Lerma,38,Male
8,Clementina Álamo,32,Female
9,Petrona Roselló-Valls,61,Male


There you have it: 1 million synthetic data rows, ready to use directly or to save in a BigQuery table for future use.