<a href="https://colab.research.google.com/github/gretelai/gretel-blueprints/blob/AW%2Fadd-notebooks/Synthetic_Data_Walkthrough.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Create synthetic data with the Python SDK

This notebook utilizes Gretel's SDK and APIs to create a synthetic version of a popular machine learning financial dataset. 

To run this notebook, you will need an API key from the Gretel console at https://console.gretel.cloud.



In [None]:
%%capture
!pip install -U gretel-client

In [None]:
# Specify your Gretel API key

from getpass import getpass
import pandas as pd
from gretel_client.config import configure_session, ClientConfig

# Do not truncate long cell values when DataFrames are displayed. The fully
# qualified option name is used because pandas resolves abbreviated names by
# pattern matching, which raises an error if the pattern matches several options.
pd.set_option('display.max_colwidth', None)

# Prompt for the key interactively (input is not echoed) instead of hardcoding
# it in the notebook, then configure the client session against the Gretel API.
configure_session(ClientConfig(api_key=getpass(prompt="Enter Gretel API key"),
                               endpoint="https://api.gretel.cloud"))

Enter Gretel API key··········


In [None]:
# Create a project

from gretel_client.projects import get_project

# Obtain the project handle that the later cells use to create the model.
# NOTE(review): no project name is passed; with create=True this presumably
# creates a new project on every run — confirm against the gretel-client docs.
project = get_project(create=True)

## Create the synthetic data configuration
Load the default configuration template. This template will work well for most datasets. View other templates at https://github.com/gretelai/gretel-blueprints/tree/main/config_templates/gretel/synthetics

In [None]:
import json
from smart_open import open
import yaml

# Fetch the default synthetics template and parse the YAML into a dict.
# `open` here is smart_open's, which is used to read straight from the HTTPS URL.
template_url = "https://raw.githubusercontent.com/gretelai/gretel-blueprints/main/config_templates/gretel/synthetics/default.yml"
with open(template_url, 'r') as template_file:
    config = yaml.safe_load(template_file)

# Override the number of training epochs (50) on the first model entry.
synthetics_params = config['models'][0]['synthetics']['params']
synthetics_params['epochs'] = 50

# Pretty-print the resulting configuration for inspection.
print(json.dumps(config, indent=2))

{
  "schema_version": 1.0,
  "models": [
    {
      "synthetics": {
        "data_source": "__tmp__",
        "params": {
          "epochs": 50,
          "batch_size": 64,
          "vocab_size": 20000,
          "reset_states": false,
          "learning_rate": 0.001,
          "rnn_units": 256,
          "dropout_rate": 0.2,
          "overwrite": true,
          "early_stopping": true,
          "gen_temp": 1.0,
          "predict_batch_size": 64,
          "validation_split": true,
          "dp": false,
          "dp_noise_multiplier": 0.001,
          "dp_l2_norm_clip": 5.0,
          "dp_microbatches": 1
        },
        "validators": {
          "in_set_count": 10,
          "pattern_count": 10
        },
        "generate": {
          "num_records": 5000,
          "max_invalid": 5000
        },
        "privacy_filters": {
          "outliers": "medium",
          "similarity": "medium"
        }
      }
    }
  ]
}


## Load and preview the source dataset
Specify a data source to train the model on. This can be a local file, web location, or HDFS file.


In [None]:
# Create a model in the project from the configuration, point it at the
# training dataset, and preview that dataset.
import pandas as pd

# Training is not started here; that happens in the later cell calling model.create().
model = project.create_model(model_config=config)
# Public CSV used as training data (US Adult Income, 5k rows per the file name).
model.data_source = 'https://gretel-public-website.s3-us-west-2.amazonaws.com/datasets/USAdultIncome5k.csv'

# Last expression in the cell, so the DataFrame is rendered as the cell output.
pd.read_csv(model.data_source)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,42,Private,255847,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,4386,0,48,United-States,>50K
1,34,Private,111567,HS-grad,9,Never-married,Transport-moving,Own-child,White,Male,0,0,40,United-States,<=50K
2,34,Private,263307,Bachelors,13,Never-married,Sales,Unmarried,Black,Male,0,0,45,?,<=50K
3,69,Private,174474,10th,6,Separated,Machine-op-inspct,Not-in-family,White,Female,0,0,28,Peru,<=50K
4,26,Private,260614,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,42,Self-emp-inc,287037,12th,8,Divorced,Craft-repair,Not-in-family,White,Male,0,0,10,United-States,<=50K
4996,48,Private,236858,11th,7,Divorced,Other-service,Not-in-family,White,Female,0,0,31,United-States,<=50K
4997,53,Private,317313,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,60,United-States,>50K
4998,23,Private,113601,Some-college,10,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,30,United-States,<=50K


## Train the synthetic model
In this step, we will task a worker, running either in the Gretel cloud or locally, with training a synthetic model on the source dataset.

In [None]:
from gretel_client.config import RunnerMode
from gretel_client.helpers import poll

# Submit the model for creation using the cloud runner mode.
# NOTE(review): upload_data_source=True presumably uploads the dataset
# referenced by model.data_source along with the job — confirm in the SDK docs.
model.create(runner_mode=RunnerMode.CLOUD, upload_data_source=True)

# Follow the job's status and log messages.
# NOTE(review): presumably blocks until the job reaches a final state — confirm.
poll(model)

{
    "uid": "60de2dae6b12aa118f5a3730",
    "runner_mode": "cloud",
    "user_id": "5f3c5f8f492fbf247e0726cc",
    "project_id": "60de2da65192b9b14f9e9aec",
    "logs": null,
    "status_history": {
        "created": "2021-07-01T21:03:42.832573Z"
    },
    "last_modified": "2021-07-01T21:03:42.880370Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "traceback": null,
    "container_image": "074762682575.dkr.ecr.us-east-2.amazonaws.com/gretelai/synthetics@sha256:c67c9bc6601cef6ca7e4abceb0dbfe4e05fba13f8a3ce1d2902f92b0006873aa",
    "model_type": "synthetics",
    "config": {
        "schema_version": "1.0",
        "name": null,
        "models": [
            {
                "synthetics": {
                    "params": {
                        "field_delimiter": null,
                        "epochs": 50,
                        "batch_size": 64,
                        "vocab_size": 20000,
                        "

[32mINFO: [0mStatus is created. Model creation has been queued.
[32mINFO: [0mStatus is pending. A Gretel Cloud worker is being allocated to begin model creation.
[32mINFO: [0mStatus is active. A worker has started creating your model!
2021-07-01T21:08:24.491359Z  Starting synthetic model training
2021-07-01T21:08:24.493234Z  Loading training data
2021-07-01T21:08:24.730460Z  Training data loaded
{
    "record_count": 5000,
    "field_count": 15
}
2021-07-01T21:08:26.584046Z  Creating semantic validators and preparing training data
2021-07-01T21:08:31.851391Z  Beginning ML model training
2021-07-01T21:08:42.151139Z  Training epoch completed
{
    "epoch": 0,
    "accuracy": 0.2057812511920929,
    "loss": 4.527383327484131,
    "val_accuracy": 0.21019886434078217,
    "val_loss": 4.171525001525879,
    "batch": 0
}
2021-07-01T21:08:44.402823Z  Training epoch completed
{
    "epoch": 1,
    "accuracy": 0.21437890827655792,
    "loss": 4.0911784172058105,
    "val_accuracy": 0.23761

## View the generated synthetic data

In [None]:
# Fetch the model's "data_preview" artifact (a gzip-compressed CSV),
# load it into a DataFrame, and display it.

preview_link = model.get_artifact_link("data_preview")
synthetic_df = pd.read_csv(preview_link, compression='gzip')

synthetic_df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,19,Private,413986.0,9th,7,Never-married,?,Own-child,White,Female,0,0,25,United-States,<=50K
1,27,Private,211027.0,9th,5,Never-married,?,Own-child,White,Male,0,0,40,Mexico,<=50K
2,68,Private,347902.0,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,50,United-States,>50K
3,66,Private,213187.0,HS-grad,9,Widowed,Sales,Unmarried,White,Female,0,0,45,United-States,<=50K
4,42,Private,187761.0,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,0,0,42,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,72,?,346465.0,HS-grad,9,Widowed,Machine-op-inspct,Not-in-family,White,Female,0,0,40,United-States,<=50K
4996,28,Private,188069.0,Bachelors,13,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,70,?,<=50K
4997,22,State-gov,203673.0,9th,7,Never-married,Craft-repair,Unmarried,White,Female,0,0,36,United-States,>50K
4998,53,Private,144190.0,Bachelors,13,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,50,United-States,>50K


## View the synthetic data quality report

In [None]:
# Display the report comparing the statistical properties of the training and synthetic data

import IPython
from smart_open import open

# Read the report inside a context manager so the remote stream is closed
# once the HTML has been loaded, instead of being left open.
with open(model.get_artifact_link("report")) as report_file:
    report_html = report_file.read()

# Last expression in the cell, so the HTML report is rendered as the output.
IPython.display.HTML(data=report_html)

0,1,2,3,4,5
Synthetic Data Use Cases,Excellent,Good,Moderate,Poor,Very Poor
Significant tuning required to improve model,,,,,
Improve your model using our tips and advice,,,,,
Demo environments or mock data,,,,,
Pre-production testing environments,,,,,
Balance or augment machine learning data sources,,,,,
Machine learning or statistical analysis,,,,,

Unnamed: 0,Training Data,Synthetic Data
Row Count,5000,5000
Column Count,15,15
Training Lines Duplicated,--,0

Field,Unique,Missing,Ave. Length,Type,Distribution Stability
education,16,0,8.43,Categorical,Good
education_num,16,0,1.55,Numeric,Good
fnlwgt,4557,0,5.83,Numeric,Good
native_country,40,0,12.3,Categorical,Excellent
occupation,15,0,12.18,Categorical,Excellent
hours_per_week,82,0,1.98,Numeric,Excellent
age,70,0,2.0,Numeric,Excellent
relationship,6,0,9.1,Categorical,Excellent
race,5,0,5.54,Categorical,Excellent
workclass,8,0,7.89,Categorical,Excellent


## Generate unlimited synthetic data
You can now use the trained synthetic model to generate as much synthetic data as you like.

In [None]:
# Ask the trained model for additional synthetic records.

# Settings passed to the generation job.
# NOTE(review): max_invalid is presumably the number of invalid records
# tolerated before the job gives up — confirm in the SDK docs.
generation_params = {"num_records": 100, "max_invalid": 500}

record_handler = model.create_record_handler()
record_handler.create(
    action="generate",
    params=generation_params,
    runner_mode=RunnerMode.CLOUD,
)

# Follow the generation job's status and log messages.
poll(record_handler)

{
    "uid": "60de2fa8f0e10454f1075f93",
    "runner_mode": "cloud",
    "user_id": "5f3c5f8f492fbf247e0726cc",
    "project_id": "60de2da65192b9b14f9e9aec",
    "logs": null,
    "status_history": {
        "created": "2021-07-01T21:12:08.348000Z"
    },
    "last_modified": "2021-07-01T21:12:08.395000Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "traceback": null,
    "container_image": "074762682575.dkr.ecr.us-east-2.amazonaws.com/gretelai/synthetics@sha256:c67c9bc6601cef6ca7e4abceb0dbfe4e05fba13f8a3ce1d2902f92b0006873aa",
    "model_id": "60de2dae6b12aa118f5a3730",
    "action": "generate",
    "config": {
        "data_source": null,
        "params": {
            "num_records": 100,
            "max_invalid": 500
        }
    }
}


[32mINFO: [0mStatus is created. A Record generation job has been queued.
[32mINFO: [0mStatus is pending. A Gretel Cloud worker is being allocated to begin generating synthetic records.
[32mINFO: [0mStatus is active. A worker has started!
2021-07-01T21:12:27.395533Z  Downloading model to worker
2021-07-01T21:12:27.912501Z  Checking for synthetic smart seeds
2021-07-01T21:12:27.912909Z  No smart seeds provided, will attempt generation without them
2021-07-01T21:12:27.913141Z  Loading model
2021-07-01T21:12:30.216895Z  Generating records
{
    "num_records": 100
}
2021-07-01T21:12:35.222667Z  Generation in progress
{
    "current_valid_count": 0,
    "current_invalid_count": 0,
    "new_valid_count": 0,
    "new_invalid_count": 0,
    "completion_percent": 0.0
}
2021-07-01T21:12:40.229547Z  Generation in progress
{
    "current_valid_count": 0,
    "current_invalid_count": 15,
    "new_valid_count": 0,
    "new_invalid_count": 15,
    "completion_percent": 0.0
}
2021-07-01T21:12:42.

In [None]:
# Load the record handler's "data" artifact (a gzip-compressed CSV)
# into a DataFrame and display it.
generated_link = record_handler.get_artifact_link("data")
synthetic_df = pd.read_csv(generated_link, compression='gzip')

synthetic_df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,49,Private,21525,Bachelors,13,Married-civ-spouse,Adm-clerical,Husband,White,Male,7688,0,40,?,>50K
1,30,Private,233711,9th,5,Never-married,Other-service,Own-child,White,Male,0,0,20,United-States,<=50K
2,61,Private,504544,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,50,United-States,>50K
3,47,Private,33193,Bachelors,13,Divorced,Transport-moving,Not-in-family,White,Male,0,0,54,?,<=50K
4,26,Private,266041,HS-grad,9,Separated,Adm-clerical,Other-relative,White,Female,0,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,31,Local-gov,165946,HS-grad,9,Never-married,Craft-repair,Not-in-family,White,Male,0,0,40,United-States,<=50K
96,25,Federal-gov,413299,Assoc-acdm,12,Never-married,Exec-managerial,Not-in-family,White,Male,0,0,40,United-States,<=50K
97,60,Self-emp-not-inc,211015,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,>50K
98,33,Self-emp-not-inc,187262,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K
