# Synthesize high-dimensional data from your own DataFrame with CTGAN

This Blueprint demonstrates how to create synthetic data with Gretel-CTGAN.

In [1]:
%%capture

!pip install numpy matplotlib pandas
!pip install -U gretel-client

In [12]:
# Specify your Gretel API key
# NOTE(review): api_key="prompt" is expected to request the key interactively
# instead of storing it in the notebook -- confirm against gretel-client docs.

from gretel_client import configure_session
import pandas as pd

# Show full cell contents when displaying DataFrames (no column-width cap).
pd.options.display.max_colwidth = None

configure_session(
    api_key="prompt",
    validate=True,
    clear=True,
)


Using endpoint https://api.gretel.cloud
Logged in as andrew@gretel.ai ✅


## Load our data containing shopping information for a grocery store

In [18]:
import numpy as np
import matplotlib.pyplot as plt

# Location of the grocery-orders CSV that serves as the training data.
GROCERY_ORDERS_URL = "https://gretel-blueprints-pub.s3.us-west-2.amazonaws.com/CTGAN/grocery_orders.csv"

# Read the remote CSV straight into a DataFrame.
train_df = pd.read_csv(GROCERY_ORDERS_URL)


In [20]:
from gretel_client.helpers import poll
from gretel_client.projects import create_or_get_unique_project
from gretel_client.projects.models import read_model_config

# Look up (or create) the project, then load the model configuration template.
project = create_or_get_unique_project(name="ctgan-synthetics")
config = read_model_config("synthetics/high-dimensionality")

# The model takes a file as its data source, so write train_df out as a CSV.
train_df.to_csv("train.csv", index=False)

model = project.create_model_obj(
    model_config=config,
    data_source="train.csv",
)

# Submit the job to Gretel Cloud (uploads the training data) and follow its
# progress until polling returns.
model.submit_cloud()
poll(model)

# Read the gzip-compressed "data_preview" artifact into a DataFrame and show it.
preview_link = model.get_artifact_link("data_preview")
synthetic = pd.read_csv(preview_link, compression="gzip")
synthetic


[32mINFO: [0mStarting poller


{
    "uid": "62d878a4503e01bb5537276c",
    "guid": "model_2CE3HTOtm8i3j9iFYYtgfDSN7OQ",
    "model_name": "high-dimensionality",
    "runner_mode": "cloud",
    "user_id": "61d5c57dbff621712241f583",
    "user_guid": "user_26hlZsMtIJvkFPd4AZ7x27CRP44",
    "billing_domain": "gretel.ai",
    "billing_domain_guid": "domain_28eujAnf9EFme26oSFok8xCUT4n",
    "project_id": "62c5e78bc473ac6b478bdf7b",
    "project_guid": "proj_2BaGyMvQ9DpshnGPk5HVqLEQ39t",
    "status_history": {
        "created": "2022-07-20T21:50:28.270048Z"
    },
    "last_modified": "2022-07-20T21:50:28.510515Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "annotations": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/models/ctgan@sha256:b59c22d0ec852177a9eb0a05d2857e0bd7b5baea6cde5de1cf375923fcb41d08",
    "model_type": "ctgan",
    "config": {
        "schema_version": "1.0",
   

[32mINFO: [0mStatus is created. Model creation has been queued.
[32mINFO: [0mStatus is pending. A Gretel Cloud worker is being allocated to begin model creation.
[32mINFO: [0mStatus is active. A worker has started creating your model!
2022-07-20T21:51:01.095141Z  Starting CTGAN model training...
2022-07-20T21:59:12.045224Z  CTGAN model training complete.
2022-07-20T21:59:12.045969Z  Sampling 5000 records for data preview...
2022-07-20T21:59:13.596357Z  Preparing privacy filters
2022-07-20T21:59:14.101376Z  Loaded 1 privacy filters
2022-07-20T21:59:14.101776Z  Starting privacy filtering
2022-07-20T21:59:14.659255Z  Privacy filtering complete
2022-07-20T21:59:14.665980Z  Sampled 5000 records.
2022-07-20T21:59:15.801362Z  Creating synthetic quality report (SQS)...
2022-07-20T22:01:32.692917Z  Finished creating SQS
2022-07-20T22:01:36.320610Z  Uploading artifacts to Gretel Cloud


Unnamed: 0,order_id,order_dow,order_hour_of_day,days_since_prior_order,air fresheners candles,asian foods,baby accessories,baby bath body care,baby food formula,bakery desserts,...,spreads,tea,tofu meat alternatives,tortillas flat bread,trail mix snack mix,trash bags liners,vitamins supplements,water seltzer sparkling water,white wines,yogurt
0,1625546,6,14,29,0,0,0,0,1,0,...,1,0,0,0,0,0,0,1,0,5
1,1199367,1,20,29,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,3420909,1,12,29,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6
3,1183786,2,21,30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
4,2787649,2,20,29,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1673080,2,17,30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,5
4996,2087594,0,12,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4997,3082775,2,20,29,0,0,0,0,1,0,...,1,0,0,1,0,0,0,0,0,2
4998,2120038,1,20,30,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,2


In [22]:
from pprint import pprint

# Fetch the summary of the model's quality report and pretty-print it.
report_summary = model.peek_report()
pprint(report_summary)

{'field_correlation_stability': {'grade': 'Excellent',
                                 'raw_score': 0.009746851049648004,
                                 'score': 100},
 'field_distribution_stability': {'grade': 'Good',
                                  'raw_score': 0.1959363726179225,
                                  'score': 73},
 'principal_component_stability': {'grade': 'Good',
                                   'raw_score': 0.25910572808562765,
                                   'score': 77},
 'privacy_protection_level': None,
 'synthetic_data_quality_score': {'grade': 'Excellent',
                                  'raw_score': 84.24444444444443,
                                  'score': 84}}


In [26]:
# Build the seed frame: a single column, "order_hour_of_day", holding the
# value 17 repeated for 500 rows.
seed_hours = [17 for _ in range(500)]
seed_df = pd.DataFrame({"order_hour_of_day": seed_hours})

In [27]:
# Write the seed frame to disk; the record handler takes it as a file.
seed_df.to_csv("seeds.csv", index=False)

# Use the model to generate more synthetic data.
# num_records is taken from the seed frame itself so the requested record
# count always matches the number of seed rows, even if seed_df is resized.
record_handler = model.create_record_handler_obj(
    params={"num_records": len(seed_df), "max_invalid": 5000},
    data_source="seeds.csv",  # The data_source parameter determines the seed data
)

# Submit the generation job to Gretel Cloud and follow it until polling returns.
record_handler.submit_cloud()

poll(record_handler)

# Create a second synthetic dataframe from the gzip-compressed "data" artifact
synthetic_2 = pd.read_csv(record_handler.get_artifact_link("data"), compression="gzip")
synthetic_2


[32mINFO: [0mStarting poller


{
    "uid": "62d88020c5b6650fdd7b5a9e",
    "guid": "model_run_2CE7ADGNN0LKHiSt4ny5vztIM1J",
    "model_name": null,
    "runner_mode": "cloud",
    "user_id": "61d5c57dbff621712241f583",
    "user_guid": "user_26hlZsMtIJvkFPd4AZ7x27CRP44",
    "billing_domain": "gretel.ai",
    "billing_domain_guid": "domain_28eujAnf9EFme26oSFok8xCUT4n",
    "project_id": "62c5e78bc473ac6b478bdf7b",
    "project_guid": "proj_2BaGyMvQ9DpshnGPk5HVqLEQ39t",
    "status_history": {
        "created": "2022-07-20T22:22:24.361000Z"
    },
    "last_modified": "2022-07-20T22:22:24.529000Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "annotations": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/models/ctgan@sha256:b59c22d0ec852177a9eb0a05d2857e0bd7b5baea6cde5de1cf375923fcb41d08",
    "model_id": "62d878a4503e01bb5537276c",
    "model_guid": "model_2CE3HTOtm8i3j9iFYYtgfDS

[32mINFO: [0mStatus is created. A job has been queued.
[32mINFO: [0mStatus is pending. A Gretel Cloud worker is being allocated
[32mINFO: [0mStatus is active. A worker has started!
2022-07-20T22:22:50.666330Z  Loading CTGAN model...
2022-07-20T22:22:50.693848Z  Sampling 500 records from conditioning input...
2022-07-20T22:22:54.716415Z  Preparing privacy filters
2022-07-20T22:22:54.721056Z  Loaded 1 privacy filters
2022-07-20T22:22:54.721288Z  Starting privacy filtering
2022-07-20T22:22:54.804505Z  Privacy filtering complete
2022-07-20T22:22:54.908146Z  Uploading artifacts to Gretel Cloud


Unnamed: 0,order_id,order_dow,order_hour_of_day,days_since_prior_order,air fresheners candles,asian foods,baby accessories,baby bath body care,baby food formula,bakery desserts,...,spreads,tea,tofu meat alternatives,tortillas flat bread,trail mix snack mix,trash bags liners,vitamins supplements,water seltzer sparkling water,white wines,yogurt
0,3420909,2,17,30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
1,2005197,0,17,30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7
2,840436,1,17,30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,2
3,620237,4,17,12,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,6
4,2957240,4,17,29,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,1513161,1,17,30,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,2
496,3182332,6,17,30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
497,1502079,4,17,30,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,8
498,3420909,2,17,30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6


In [5]:
# Save the seeded synthetic records to a local CSV without the index column.
output_path = "synthetic_2.csv"
synthetic_2.to_csv(output_path, index=False)