# Create a synthetic version of your own CSV or DataFrame

This blueprint uses Gretel's premium SDKs to create a synthetic version of your own data. The SDKs automatically create data validators to help ensure that the generated data has the same semantics as the source data. Additionally, the SDKs perform automatic header clustering to help maintain statistical relations between columns.

In [None]:
%%capture

# Install or upgrade the packages this notebook depends on. The %%capture cell
# magic on the first line keeps the installer output out of the notebook.
!pip install -U gretel-client "gretel-synthetics>=0.14.0" pandas

In [None]:
# Load your Gretel API key. You can acquire this from the Gretel Console @ https://console.gretel.cloud

import getpass
import os

# Prefer the GRETEL_API_KEY environment variable; when it is unset or empty,
# fall back to a hidden interactive prompt.
gretel_api_key = os.getenv("GRETEL_API_KEY")
if not gretel_api_key:
    gretel_api_key = getpass.getpass("Your Gretel API Key")

In [None]:
# Install Gretel SDKs

from gretel_client import get_cloud_client

# Create a client for the Gretel API using the key loaded in the previous cell.
client = get_cloud_client("api", api_key=gretel_api_key)

# Install the Gretel packages through the authenticated client.
# NOTE(review): the later ``gretel_helpers`` import presumably depends on this
# step having run first; confirm.
client.install_packages()

In [None]:
# Load and preview dataset

import pandas as pd

# Sample CSV hosted publicly; point this at your own CSV path or URL to
# synthesize different data.
dataset_path = 'https://gretel-public-website.s3-us-west-2.amazonaws.com/datasets/healthcare-analytics-vidhya/train_data.csv'
nrows = 10000  # Number of rows to read for training; reused later as the number of rows to generate
training_df = pd.read_csv(dataset_path, nrows=nrows)
training_df.head()  # Preview the first rows of the training data

In [None]:
# Create the Gretel Synthetics training / model configuration

from pathlib import Path

# Checkpoints go in a "checkpoints" folder under the current working
# directory; the path is stored as a plain string.
checkpoint_dir = str(Path.cwd().joinpath("checkpoints"))

# Settings passed to the synthetic data bundle as its ``synthetic_config``.
config_template = dict(
    checkpoint_dir=checkpoint_dir,
    vocab_size=20000,
)

In [None]:
# Create a Gretel Synthetic Data Bundle

# NOTE(review): ``create_df`` is imported here but is not used in any cell of
# this notebook.
from gretel_helpers.synthetics import create_df, SyntheticDataBundle

bundle = SyntheticDataBundle(
    training_df=training_df,
    delimiter=None, # if ``None``, the delimiter is detected automatically; otherwise set it explicitly
    auto_validate=True, # build per-column record validators, used to ensure generated records have the same composition as the original
    synthetic_config=config_template, # the config for Synthetics, built in the previous cell
)

In [None]:
# Build the bundle before training.
# NOTE(review): presumably this is where the validators and header clusters
# described in the introduction are created; confirm against the SDK docs.
bundle.build()

In [None]:
# Train the bundle on the training DataFrame supplied when it was created.
# NOTE(review): run time presumably grows with the size of the dataset; confirm.
bundle.train()

In [None]:
# num_lines: how many rows to generate (here, as many as were read for training)
# max_invalid: the number of rows allowed to fail semantic validation; if this number is exceeded,
# generation will stop
bundle.generate(num_lines=nrows, max_invalid=nrows)

In [None]:
# Preview the first rows of the generated synthetic DataFrame
bundle.get_synthetic_df().head()

In [None]:
# Generate a report that compares the statistical performance of the synthetic data against the training data

bundle.generate_report()

In [None]:
# Optionally save your model
# The archive name is a relative path, so it is presumably written to the
# current working directory. TODO: confirm.

bundle.save("my_model.tar.gz")

In [None]:
# Export your new data
# Writes the synthetic DataFrame to synthetic-data.csv; index=False leaves the
# DataFrame index out of the file.

bundle.get_synthetic_df().to_csv('synthetic-data.csv', index=False)