# Create synthetic data from a Gretel Cloud Project

This blueprint walks you through consuming records from a Gretel Cloud Project and creating synthetic data from them. It assumes you already have a Gretel Cloud Project created with records uploaded. If you do not, feel free to create a new project from our console (https://console.gretel.cloud) and select this blueprint; sample data will automatically be uploaded for you.

In [None]:
%%capture

# Install (or upgrade, via -U) the gretel-client and gretel-synthetics packages.
# The %%capture cell magic must remain the first line of this cell.
!pip install -U gretel-client gretel-synthetics

In [None]:
# Resolve the Gretel URI, which is available from the Integration menu in the Console.
# A non-empty GRETEL_URI environment variable takes precedence; when it is
# unset or empty, the URI is requested interactively through getpass.

import getpass
import os

gretel_uri = os.environ.get("GRETEL_URI")
if not gretel_uri:
    gretel_uri = getpass.getpass("Your Gretel URI")

In [None]:
# Install Gretel SDKs
#
# Build a project handle from the URI collected earlier in the notebook, then
# ask that project's client to install its packages.
# NOTE(review): presumably this is what makes ``gretel_helpers`` importable
# in the cells that follow -- confirm against the gretel-client docs.

from gretel_client import project_from_uri

project = project_from_uri(gretel_uri)
project.client.install_packages()

In [None]:
# Capture transient import errors in Google Colab
#
# The first import attempt may raise FileNotFoundError; when it does, the
# very same import is retried exactly once. A failure on the retry is not
# caught and will surface as an error in the notebook.

try:
    from gretel_helpers.synthetics import SyntheticDataBundle
except FileNotFoundError:
    # Deliberate retry of the identical import, not a copy/paste mistake.
    from gretel_helpers.synthetics import SyntheticDataBundle

In [None]:
# Download records from Gretel Cloud and create a training DataFrame
#
# ``create_df`` is given the project URI and a row limit; its result is kept
# in ``training_df`` and handed to the model in a later cell.

from gretel_helpers.synthetics import create_df, SyntheticDataBundle

training_df = create_df(
    gretel_uri,
    num_rows=15000  # set to ``None`` to include all records
)

# Preview the data that will be synthesized
# (a bare expression on the last line of a cell is displayed by the notebook)
training_df

In [None]:
# Synthetic training configuration

from pathlib import Path

# Absolute path (as a string) of a "checkpoints" folder under the current
# working directory; it is passed on as the config's ``checkpoint_dir``.
checkpoint_dir = str(Path.cwd().joinpath("checkpoints"))

# All params: https://gretel-synthetics.readthedocs.io/en/stable/api/config.html
config_template = dict(
    checkpoint_dir=checkpoint_dir,
    vocab_size=20000,
)

In [None]:
# Create a model object, which provides high-level API interfaces for building / saving / generating synthetic data
#
# The bundle is built from the training DataFrame and the configuration
# dictionary prepared in the earlier cells.

model = SyntheticDataBundle(
    training_df=training_df,
    delimiter=None, # if ``None``, the delimiter is detected automatically; otherwise set it explicitly
    auto_validate=True, # build per-column record validators, used to ensure generated records have the same composition as the original
    synthetic_config=config_template, # the config for Synthetics
    synthetic_batch_size=30, # cluster up to this many fields per individual model
)

In [None]:
# Create model metadata
# NOTE(review): this cell precedes the training cell in the notebook;
# presumably build() must run before train() -- confirm.

model.build()

In [None]:
# Train the model
# NOTE(review): presumably uses the checkpoint directory and other settings
# passed as ``synthetic_config`` when the bundle was created -- confirm.

model.train()

In [None]:
# Generate some data
# NOTE(review): ``num_lines`` is presumably the number of synthetic records
# requested and ``max_invalid`` the number of invalid records tolerated
# before generation gives up -- confirm against the gretel_helpers docs.

model.generate(num_lines=5000, max_invalid=5000)

In [None]:
# Re-assemble synthetic data back into a DataFrame
# The result is not assigned to a variable; as the last expression of the
# cell it is only displayed by the notebook.

model.get_synthetic_df()

In [None]:
# Save your model, you can load this back into a Bundle later on
# The target is given as a relative file name ("my_model.tar.gz").

model.save("my_model.tar.gz")

In [None]:
# Generate a report that shows how the new synthetic data compares to the original training data
import IPython

# Relative location of the HTML report; the same path is used both to
# generate the report and to load it for display.
report_path = './report.html'
model.generate_report(report_path=report_path)
# Last expression of the cell: build an HTML display object from the file at
# report_path so the notebook shows the report inline.
IPython.display.HTML(filename=report_path)