# A differentially private, synthetic ride-share dataset

This blueprint uses Gretel's premium SDKs to create a synthetic version of your own data. Our SDKs create automatic data validators to help ensure that the generated data has the same semantics as the source data. Additionally, the SDKs perform automatic header clustering to help maintain statistical relationships between columns.

In [1]:
%%capture

# %pip (rather than !pip) installs into the environment of the running kernel,
# which is not necessarily the first `pip` found on the shell PATH.
# NOTE(review): versions are left unpinned as before; pin them once the
# known-good versions for this blueprint are confirmed.
%pip install -U gretel-client gretel-synthetics pandas

In [12]:
# Authenticate against Gretel Cloud. API keys are issued from the Gretel
# Console at https://console.gretel.cloud

from getpass import getpass
import pandas as pd
from gretel_client import ClientConfig, configure_session

# Do not truncate wide cell values when DataFrames are displayed.
pd.set_option("max_colwidth", None)

# getpass prompts for the key at run time, so it never appears in the
# notebook source.
configure_session(
    ClientConfig(
        api_key=getpass(prompt="Enter Gretel API key"),
        endpoint="https://api.gretel.cloud",
    )
)

Enter Gretel API key··········


In [13]:
# Read the ride-share CSV straight from its public URL, round every numeric
# column to 5 decimal places, and show the first rows.
dataset_path = "https://gretel-public-website.s3.amazonaws.com/datasets/uber_dataset_with_canaries.csv"
training_df = (
    pd.read_csv(dataset_path)
    .round(5)
)
training_df.head()

Unnamed: 0,hour,bike_id,src_lat,src_lon,dst_lat,dst_lon
0,23,27018,34.01698,-118.50102,34.0265,-118.49686
1,11,55026,47.55661,-122.2713,47.57012,-122.29086
2,21,50241,38.93048,-77.03244,38.94392,-77.03337
3,2,31898,37.79193,-122.40047,37.79389,-122.42464
4,16,XEY338,33.99552,-118.44952,34.00123,-118.43805


In [14]:
from gretel_client.projects.models import read_model_config


# Create model configuration, starting from the "synthetics/default" template
# and overriding selected settings below.
config = read_model_config("synthetics/default")

# Alias the nested sections once so each override reads as a single key
# assignment; these are references into `config`, not copies.
synthetics_config = config["models"][0]["synthetics"]
params = synthetics_config["params"]

params["vocab_size"] = 0
params["epochs"] = 50
params["learning_rate"] = 0.001  # set low to demonstrate gradient clipping
params["batch_size"] = 4
# Generation batch size is a separate key from the training batch size.
# Assigning "batch_size" a second time here would silently replace the 4 above.
# NOTE(review): confirm "predict_batch_size" against the Gretel synthetics
# config reference for the client version in use.
params["predict_batch_size"] = 1

# Enable Differential Privacy:
params["dp"] = True
params["dp_noise_multiplier"] = 0.001
params["dp_l2_norm_clip"] = 1.5

# Setting the privacy filters off, since we are already using DP.
synthetics_config["privacy_filters"]["outliers"] = None
synthetics_config["privacy_filters"]["similarity"] = None

# Configure a "seed" task over the hour and bike_id fields.
seed_columns = ["hour", "bike_id"]
task = {"type": "seed", "attrs": {"fields": seed_columns}}
synthetics_config["task"] = task

In [16]:
# Create a project
from gretel_client import create_project, projects
from gretel_client.helpers import poll


# Open a new project and define a model in it from the configuration and the
# dataset URL prepared in the earlier cells.
project = create_project(display_name="Scooter_DPModel")
model = project.create_model_obj(
    model_config=config,
    data_source=dataset_path,
)

# Submit the job (uploading the data source with it), then poll it; the
# poller prints status and training log lines as the job progresses.
model.submit(upload_data_source=True)
poll(model)

[32mINFO: [0mStarting poller


{
    "uid": "626b28349d7d9de7ec74a986",
    "guid": "model_28Rqaz6XjkX6ZHOtjDU74wSrUNf",
    "model_name": "capricious-brave-wombat",
    "runner_mode": "cloud",
    "user_id": "621eafb8bff6215ff5786d9d",
    "user_guid": "user_25o0fF0dofQvUKJdRFuMDBJFz8F",
    "billing_domain": "gretel.ai",
    "billing_domain_guid": null,
    "project_id": "626b281ba1f1a0aba2e760d1",
    "project_guid": "proj_28RqXq0qVQJghcu9fG2xE7PRDq9",
    "status_history": {
        "created": "2022-04-28T23:50:12.010741Z"
    },
    "last_modified": "2022-04-28T23:50:12.274233Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/gretelai/synthetics@sha256:752367ff31f3c57308527d7ff6773497bc5ae7a350c7a6ca882ed13922324381",
    "model_type": "synthetics",
    "config": {
        "schema_version": "1.0",
        "name": null,
        "models": [
  

[32mINFO: [0mStatus is pending. A Gretel Cloud worker is being allocated to begin model creation.
[32mINFO: [0mStatus is active. A worker has started creating your model!
2022-04-28T23:50:42.418592Z  Starting synthetic model training
2022-04-28T23:50:42.421405Z  Loading training data
2022-04-28T23:50:42.653670Z  Training data loaded, detected format: 'csv'
2022-04-28T23:50:42.683558Z  Training data loaded
{
    "record_count": 27386,
    "field_count": 6,
    "upsample_count": 0
}
2022-04-28T23:50:51.974981Z  Creating semantic validators and preparing training data
2022-04-28T23:51:04.068524Z  Beginning ML model training
2022-04-28T23:51:04.069194Z  Running training on 1 batches.
{
    "batch_sizes": "[6]"
}
2022-04-28T23:51:04.806001Z  Tokenizing input data
2022-04-28T23:51:05.048064Z  Shuffling input data
2022-04-28T23:51:07.867977Z  Initializing synthetic model
2022-04-28T23:54:57.090310Z  Training epoch completed
{
    "epoch": 0,
    "accuracy": 0.5002,
    "loss": 1.4526,
   

In [17]:
# Load the model's "data_preview" artifact (a gzip-compressed CSV of records
# produced by the conditioned synthetic data model) and show the first rows.

synthetic_ds = pd.read_csv(
    model.get_artifact_link("data_preview"),
    compression="gzip",
)
synthetic_ds.head()

Unnamed: 0,hour,bike_id,src_lat,src_lon,dst_lat,dst_lon
0,23,27018,38.89514,-77.05248,38.93119,-77.04524
1,11,55026,47.60005,-122.33972,47.61223,-122.34351
2,21,50241,38.91766,-77.02627,38.91984,-77.03694
3,2,31898,38.5734,-121.48947,38.54828,-121.4857
4,16,XEY338,38.57205,-121.50636,38.56737,-121.49542


In [18]:
# Canary values to look for in the training and synthetic datasets.
secrets = [
    85.31243,
    80.71705,
    84.98992,
    63.20242,
]

# Find the canaries that were replayed by our model
def find_canaries(df, secrets):
    """Print how many cells of ``df`` hold each canary value in ``secrets``.

    Cells are compared by value instead of searching the frame's text
    rendering. A canary such as ``85.31243`` is therefore not also counted
    inside ``185.31243`` or ``-85.31243``, and the count does not depend on
    how pandas happens to format floats for display.

    Args:
        df: DataFrame to scan. Every column is checked; the index is not.
        secrets: Iterable of canary values. Values that convert to ``float``
            are matched numerically; anything else is matched against the
            string form of each cell (exact cell match).

    Returns:
        None. One ``secret ... : found N times`` line is printed per secret.
    """
    # Coerce column by column so mixed columns (e.g. ids such as "XEY338")
    # become NaN instead of raising; NaN never compares equal to a secret.
    numeric_cells = df.apply(pd.to_numeric, errors="coerce")
    text_cells = None  # built lazily, only needed for non-numeric secrets

    for secret in secrets:
        try:
            target = float(secret)
        except (TypeError, ValueError):
            if text_cells is None:
                text_cells = df.astype(str)
            hits = int(text_cells.eq(str(secret)).sum().sum())
        else:
            # A tiny absolute tolerance absorbs float round-trip noise while
            # staying far below the 5-decimal resolution of the coordinates.
            hits = int(numeric_cells.sub(target).abs().le(1e-9).sum().sum())
        print(f"secret {secret} : found {hits} times")

# Run the same canary search over the training data and the synthetic data.
for label, frame in (("training", training_df), ("synthetic", synthetic_ds)):
    print(f"searching for canaries in {label} set...")
    find_canaries(frame, secrets)


searching for canaries in training set...
secret 85.31243 : found 7 times
secret 80.71705 : found 30 times
secret 84.98992 : found 93 times
secret 63.20242 : found 141 times
searching for canaries in synthetic set...
secret 85.31243 : found 0 times
secret 80.71705 : found 0 times
secret 84.98992 : found 0 times
secret 63.20242 : found 0 times
