## Installation

In [None]:
%%capture
!pip install numpy pandas matplotlib pycaret
!pip install -U gretel-client

## Log in to gretel using out API key

In [4]:
import pandas as pd
from gretel_client import configure_session

pd.set_option("max_colwidth", None)

configure_session(api_key="prompt", validate=True, clear=True)

Using endpoint https://api.gretel.cloud
Logged in as andrew@gretel.ai ✅


## Load data

We're going to explore using synthetic data as input to a downstream classification task. 

In [5]:
import matplotlib.pyplot as plt
import numpy as np

df = pd.read_csv("https://gretel-blueprints-pub.s3.us-west-2.amazonaws.com/CTGAN/grocery_orders.csv")

In [7]:
df.head()

Unnamed: 0,order_id,order_dow,order_hour_of_day,days_since_prior_order,air fresheners candles,asian foods,baby accessories,baby bath body care,baby food formula,bakery desserts,...,spreads,tea,tofu meat alternatives,tortillas flat bread,trail mix snack mix,trash bags liners,vitamins supplements,water seltzer sparkling water,white wines,yogurt
0,1597,1,8,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2011,4,10,30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2822,0,8,29,0,0,0,0,1,0,...,0,0,0,2,0,0,0,0,0,2
3,2889,1,15,8,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
4,3971,2,18,8,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Train a synthetic model and look at the generated data

In [6]:
from gretel_client.projects import create_or_get_unique_project
from gretel_client.helpers import poll
from gretel_client.projects.models import read_model_config


# Create a project and model configuration.
project = create_or_get_unique_project(name="ctgan-synthetics-example")

config = read_model_config("synthetics/high-dimensionality")

# Get a csv to work with, just dump out the train_df.
df.to_csv("train.csv", index=False)

model = project.create_model_obj(model_config=config, data_source="train.csv")

# Upload the training data. Train the model.
model.submit_cloud()
poll(model)

synthetic = pd.read_csv(model.get_artifact_link("data_preview"), compression="gzip")
synthetic.head()

INFO: Starting poller


{
    "uid": "6311143a5a5a0844b8000b16",
    "guid": "model_2EBKjmTQtx1TaF1cCspoumhCIE8",
    "model_name": "high-dimensionality",
    "runner_mode": "cloud",
    "user_id": "61d5c57dbff621712241f583",
    "user_guid": "user_26hlZsMtIJvkFPd4AZ7x27CRP44",
    "billing_domain": "gretel.ai",
    "billing_domain_guid": "domain_28eujAnf9EFme26oSFok8xCUT4n",
    "project_id": "6311143235726ff1e8d9fd90",
    "project_guid": "proj_2EBKilBNK4voUE7OviQUnafKU0H",
    "status_history": {
        "created": "2022-09-01T20:21:14.273524Z"
    },
    "last_modified": "2022-09-01T20:21:14.536016Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "annotations": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/models/ctgan@sha256:8a0661b464e5d0209e905a3b27d2642c65cadc8caf6fb703838dc535af44da68",
    "model_type": "ctgan",
    "config": {
        "schema_version": "1.0",
   

INFO: Status is created. Model creation has been queued.
INFO: Status is pending. A Gretel Cloud worker is being allocated to begin model creation.
INFO: Status is active. A worker has started creating your model!
2022-09-01T20:21:44.456667Z  Starting CTGAN model training...
2022-09-01T20:21:44.641330Z  Training data loaded
{
    "record_count": 5000,
    "field_count": 138,
    "upsample_count": 0
}
2022-09-01T20:22:36.597703Z  Training epoch completed
{
    "epoch": 1,
    "loss_g": 1.924,
    "loss_d": -15.4327
}
2022-09-01T20:22:37.521454Z  Training epoch completed
{
    "epoch": 2,
    "loss_g": -14.9947,
    "loss_d": -3.7928
}
2022-09-01T20:22:38.434825Z  Training epoch completed
{
    "epoch": 3,
    "loss_g": -12.6132,
    "loss_d": -1.8457
}
2022-09-01T20:22:39.331073Z  Training epoch completed
{
    "epoch": 4,
    "loss_g": -2.9733,
    "loss_d": -0.396
}
2022-09-01T20:22:40.226898Z  Training epoch completed
{
    "epoch": 5,
    "loss_g": 2.9032,
    "loss_d": -0.938
}
202

Unnamed: 0,order_id,order_dow,order_hour_of_day,days_since_prior_order,air fresheners candles,asian foods,baby accessories,baby bath body care,baby food formula,bakery desserts,...,spreads,tea,tofu meat alternatives,tortillas flat bread,trail mix snack mix,trash bags liners,vitamins supplements,water seltzer sparkling water,white wines,yogurt
0,1597,2,22,5,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,460872,0,14,9,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1597,4,11,30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3420909,6,23,2,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2456962,4,6,30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,3420909,5,11,6,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4996,1754718,3,21,21,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4997,2308112,1,20,12,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
4998,2412216,5,8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


In [8]:
from pprint import pprint 
pprint(model.peek_report()['synthetic_data_quality_score'])

{'grade': 'Excellent', 'raw_score': 91.85925925925925, 'score': 91}


## Downstream usecase

One huge benefit of synthetic data, outside of privacy preservation, is utility. The data isn't fake, it has all the same correlations as the original data - which means it can be used as input to a machine learning model. We train several classifiers and observe performance on various folds of the data

In [9]:
from pycaret.classification import setup, compare_models, evaluate_model, predict_model, create_model, plot_model

In [10]:
synthetic_df = synthetic.drop(['order_id'], axis=1)

In [11]:
synthetic_train_data, synthetic_test_data = synthetic_df.iloc[:int(len(synthetic_df) * 0.8)], synthetic_df.iloc[int(len(synthetic_df) * 0.8):]
original_train_data, original_test_data = df.iloc[:int(len(df) * 0.8)], df.iloc[int(len(df) * 0.8):]

We want to predict whether a customer will buy frozen pizza (and how many). This turns into a multi-class classifiation problem. We use the Pycaret library to test a huge number of hypothesis classes. This will take a few minutes to fit many different models on a variety of folds

In [13]:
s = setup(synthetic_train_data, target='frozen pizza')
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.9275,0.5143,0.25,0.8602,0.8926,0.0,0.0,0.186
rf,Random Forest Classifier,0.9275,0.7732,0.25,0.8602,0.8926,0.0,0.0,0.089
dummy,Dummy Classifier,0.9275,0.5,0.25,0.8602,0.8926,0.0,0.0,0.013
et,Extra Trees Classifier,0.9271,0.767,0.2499,0.8602,0.8924,-0.0006,-0.0015,0.096
lightgbm,Light Gradient Boosting Machine,0.9264,0.6364,0.2497,0.8604,0.8922,0.0021,0.0053,0.079
ridge,Ridge Classifier,0.9207,0.0,0.2555,0.8699,0.8924,0.0342,0.0501,0.016
lr,Logistic Regression,0.91,0.6571,0.274,0.8759,0.8908,0.0802,0.0895,0.752
gbc,Gradient Boosting Classifier,0.9093,0.7267,0.2569,0.8709,0.8885,0.0529,0.0603,0.555
svm,SVM - Linear Kernel,0.9039,0.0,0.2599,0.8696,0.8852,0.0503,0.0529,0.047
ada,Ada Boost Classifier,0.8782,0.5114,0.2663,0.8747,0.8738,0.0547,0.0593,0.055


INFO:logs:create_model_container: 14
INFO:logs:master_model_container: 14
INFO:logs:display_container: 2
INFO:logs:KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform')
INFO:logs:compare_models() succesfully completed......................................


We then see how our "Best" classification model performs on the original data when trained on the synthetic data

In [19]:
print(predict_model(best, data=original_test_data))

INFO:logs:Initializing predict_model()
INFO:logs:predict_model(estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform'), probability_threshold=None, encoded_labels=False, drift_report=False, raw_score=False, round=4, verbose=True, ml_usecase=MLUsecase.CLASSIFICATION, display=None, drift_kwargs=None)
INFO:logs:Checking exceptions
INFO:logs:Preloading libraries
INFO:logs:Preparing display monitor


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.95,0,0.25,0.9025,0.9256,0.0,0.0


      order_id  order_dow  order_hour_of_day  days_since_prior_order  \
4000   2713918          6                 13                       8   
4001   2714450          0                 14                       7   
4002   2715580          0                 15                      30   
4003   2715728          1                 12                      30   
4004   2716989          5                 11                       1   
...        ...        ...                ...                     ...   
4995   3419893          5                 19                      30   
4996   3420158          0                 14                       0   
4997   3420769          2                 17                      10   
4998   3420894          0                 14                      27   
4999   3420909          0                 19                      13   

      air fresheners candles  asian foods  baby accessories  \
4000                       0            0                 0   
4001     

In [20]:
print(predict_model(best, data=synthetic_test_data))

INFO:logs:Initializing predict_model()
INFO:logs:predict_model(estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform'), probability_threshold=None, encoded_labels=False, drift_report=False, raw_score=False, round=4, verbose=True, ml_usecase=MLUsecase.CLASSIFICATION, display=None, drift_kwargs=None)
INFO:logs:Checking exceptions
INFO:logs:Preloading libraries
INFO:logs:Preparing display monitor


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.933,0,0.2,0.8705,0.9007,0.0,0.0


      order_dow  order_hour_of_day  days_since_prior_order  \
4000          5                  6                      13   
4001          0                 15                       4   
4002          6                  6                      29   
4003          6                  5                       0   
4004          0                 16                      10   
...         ...                ...                     ...   
4995          5                 11                       6   
4996          3                 21                      21   
4997          1                 20                      12   
4998          5                  8                       1   
4999          6                  8                       7   

      air fresheners candles  asian foods  baby accessories  \
4000                       0            0                 0   
4001                       0            1                 0   
4002                       0            1                 0   
400