# A differentially private, synthetic ride-share dataset

This blueprint uses Gretel's premium SDKs to create a synthetic version of your own data. Our SDKs create automatic data validators to help ensure that the generated data has the same semantics as the source data. Additionally, the SDKs perform automatic header clustering to help maintain the statistical relationships between columns.

In [None]:
%%capture

# Install or upgrade (-U) the Gretel client, gretel-synthetics and pandas.
# The %%capture cell magic above keeps pip's output out of the notebook.
# NOTE(review): versions are not pinned, so a re-run may pull newer releases.
!pip install -U gretel-client gretel-synthetics pandas

In [None]:
# Load your Gretel API key. You can acquire this from the Gretel Console 
# @ https://console.gretel.cloud

import pandas as pd
from gretel_client import get_cloud_client


# Do not truncate long cell values when DataFrames are displayed.
pd.set_option('max_colwidth', None)

# api_key="prompt" asks for the key interactively, so it is never stored in the notebook.
client = get_cloud_client(prefix="api", api_key="prompt")
# Authenticates with Gretel's package manager and pip-installs the premium
# packages (e.g. gretel-helpers) that a later cell imports.
client.install_packages()

Enter Gretel API key: ··········


INFO pkg_installers.py: Authenticating with package manager
INFO pkg_installers.py: Installing packages (this might take a while)
ERROR pkg_installers.py: /usr/bin/python3 -m pip --disable-pip-version-check install https://gretel-opt-prod-usw2.s3.amazonaws.com/priv/pip/gretel-helpers/0.8.2/gretel_helpers-0.8.2-py3-none-any.whl?AWSAccessKeyId=ASIARC2BUADHWQNNPWPH&Signature=fgkcseuoHN8WT%2BRUHaDAPFiEJSQ%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEKf%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJHMEUCIQD5i4JXTHxf76VePnP08YG6Do%2BoBXjAZ1%2BjKKESM1xlGgIgIgCUURqBPwxV8ObphN9BYd7ygpoXx0mTzJNvRyWqgTsq3gEIIBACGgwwNzQ3NjI2ODI1NzUiDJO7ocKPryKX3%2FZrGyq7AemytcJJVrrpUFgKRCtpEiqEHoH5IyGDSVicc5viQFcwfZu0%2FG6lcPnXjuSzNM6AjVgP1hJEklIOQ3NIFmSisbsID0Bw69p%2BbjHJD3AnQzVYjutX7HBU7Zt94mATAOP6TXXgrrCA8h2sh81YF2KrHjuAFWrdRCwu8VBtOmBJl2pOHI6otCVlXOB1%2Fd8ni2i513ZusjdzspFtzOgm5D%2FuDz2VBYOUsMuM5he2qUWTqmRNbvonGfaqyVNq%2BpcwreCXhAY64AEjU5Z7sXRgh3X7Ipe4FCpkxrNNwL5py8PgwN9tkxrqha1oTlpFZ5Fi%2FoY11GqNuCXMAriYcP6%2FgEl

In [None]:
# Load and preview dataset

import logging
import pandas as pd


# Configure root logging at DEBUG level so library log messages show up in the cell output.
logging.basicConfig(level=logging.DEBUG)

# Public CSV with the ride-share records; per its file name it includes canary values.
dataset_path = 'https://gretel-public-website.s3.amazonaws.com/datasets/uber_dataset_with_canaries.csv'
# Read the CSV and round numeric columns to 5 decimal places.
# NOTE(review): presumably this matches the 5-decimal canary values searched for later -- confirm.
training_df = pd.read_csv(dataset_path).round(5)
# Bare last expression: the notebook renders a preview of the frame.
training_df

Unnamed: 0,hour,bike_id,src_lat,src_lon,dst_lat,dst_lon
0,23,27018,34.01698,-118.50102,34.0265,-118.49686
1,11,55026,47.55661,-122.2713,47.57012,-122.29086
2,21,50241,38.93048,-77.03244,38.94392,-77.03337
3,2,31898,37.79193,-122.40047,37.79389,-122.42464
4,16,XEY338,33.99552,-118.44952,34.00123,-118.43805
...,...,...,...,...,...,...
27381,20,XIT762,30.29091,-97.74907,30.29081,-97.74548
27382,7,BQN803,38.57168,-121.46315,38.56798,-121.46044
27383,0,GUF685,38.91714,-77.04085,38.92376,-77.04086
27384,21,SYR196,38.55243,-121.4696,38.57831,-121.48649


In [None]:
# Create the Gretel Synthetics training / model configuration

from pathlib import Path

# Checkpoints go to a "checkpoints-dp" folder under the current working
# directory; the path is stored as a plain string.
checkpoint_dir = str(Path.cwd().joinpath("checkpoints-dp"))

# Template of training / model settings handed to the synthetic model.
config_template = dict(
    checkpoint_dir=checkpoint_dir,
    # Training hyperparameters.
    vocab_size=0,
    epochs=50,
    early_stopping=True,
    learning_rate=0.001,
    rnn_units=256,
    batch_size=4,
    predict_batch_size=1,
    # Differential-privacy settings.
    dp=True,
    dp_noise_multiplier=0.001,  # set low to demonstrate gradient clipping
    dp_l2_norm_clip=1.5,
    dp_microbatches=1,
    # NOTE(review): presumably allows replacing an existing checkpoint directory -- confirm.
    overwrite=True,
)

In [None]:
# Create a Gretel Synthetic Data Bundle

# Import SeriesModel, retrying once if the first attempt raises FileNotFoundError.
try:
    # Capture transient import errors in Google Colab
    from gretel_helpers.series_models import SeriesModel
except FileNotFoundError:
    # Second attempt; if this one fails too, the exception propagates.
    from gretel_helpers.series_models import SeriesModel
    

# Use these values as a prompt to seed each record versus random generation
seed_columns = ["hour", "bike_id"]

# Build the model from the training frame, the seed columns and the
# configuration template defined in the previous cell.
model = SeriesModel(
    training_df=training_df,
    seed_columns=seed_columns,
    synthetic_config=config_template
)

# Train, then generate synthetic records.
model.train()
# NOTE(review): max_invalid looks like the cap on invalid records tolerated
# during generation (the progress bar shows "Invalid record count" with
# max=100000) -- confirm against the gretel_helpers documentation.
# As the last expression, the call's return value is displayed by the notebook.
model.generate(max_invalid=1e5)

INFO model.py: Detecting record field delimiter...
INFO model.py: Analyzing DataFrame for optimal column batches and ordering...
INFO model.py: Creating model and data storage directories...
INFO batch.py: Creating directory structure for batch jobs...
INFO model.py: Generating training data from source dataset...
INFO batch.py: Generating training DF and CSV for batch 0
INFO model.py: Creating data validators...
INFO model.py: Creating validator for synthetic batch 0


  0%|          | 0/27386 [00:00<?, ?it/s][A[A

 43%|████▎     | 11657/27386 [00:00<00:00, 116567.33it/s][A[A

100%|██████████| 27386/27386 [00:00<00:00, 108339.90it/s]


Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (4, None, 256)            16896     
_________________________________________________________________
dropout_6 (Dropout)          (4, None, 256)            0         
_________________________________________________________________
lstm_4 (LSTM)                (4, None, 256)            525312    
_________________________________________________________________
dropout_7 (Dropout)          (4, None, 256)            0         
_________________________________________________________________
lstm_5 (LSTM)                (4, None, 256)            525312    
_________________________________________________________________
dropout_8 (Dropout)          (4, None, 256)            0         
_________________________________________________________________
dense_2 (Dense)              (4, None, 66)            



Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


HBox(children=(FloatProgress(value=0.0, description='Valid record count ', max=27386.0, style=ProgressStyle(de…

HBox(children=(FloatProgress(value=0.0, description='Invalid record count ', max=100000.0, style=ProgressStyle…



Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (1, None, 256)            16896     
_________________________________________________________________
dropout_9 (Dropout)          (1, None, 256)            0         
_________________________________________________________________
lstm_6 (LSTM)                (1, None, 256)            525312    
_________________________________________________________________
dropout_10 (Dropout)         (1, None, 256)            0         
_________________________________________________________________
lstm_7 (LSTM)                (1, None, 256)            525312    
_________________________________________________________________
dropout_11 (Dropout)         (1, None, 256)            0         
_________________________________________________________________
dense_3 (Dense)              (1, None, 66)            

<gretel_helpers.series_models.SeriesModel at 0x7fa1459a00d0>

In [None]:
# Keep a reference to the model's synthetic DataFrame and write it to a local
# CSV file (index=False leaves the row index out of the file).

df = model.df
df.to_csv('synthetic-data.csv', index=False)

In [None]:
# Canary values to look for in both the training and the synthetic data.
secrets = [85.31243, 80.71705, 84.98992, 63.20242]

# Find the canaries that were replayed by our model
def find_canaries(df, secrets):
    """Print how many times each secret's text occurs in a DataFrame.

    The frame is rendered once with ``DataFrame.to_string()`` and every
    secret is converted with ``str()`` and counted in that text.

    Note that this is a plain substring count over the rendered text (which
    also contains the header and index labels), so a secret that is part of a
    longer number is counted as well.

    Args:
        df: DataFrame to search.
        secrets: Iterable of values to look for.

    Returns:
        None. One line per secret is written to stdout.
    """
    rendered = df.to_string()
    report_lines = (
        f"secret {canary} : found {rendered.count(str(canary))} times"
        for canary in secrets
    )
    for line in report_lines:
        print(line)

# Baseline: count each canary in the source data used for training.
print("searching for canaries in training set...")        
find_canaries(training_df, secrets)
# Then run the same search over the generated data held in `df`.
print("searching for canaries in synthetic set...")        
find_canaries(df, secrets)


searching for canaries in training set...
secret 85.31243 : found 7 times
secret 80.71705 : found 30 times
secret 84.98992 : found 93 times
secret 63.20242 : found 141 times
searching for canaries in synthetic set...
secret 85.31243 : found 0 times
secret 80.71705 : found 0 times
secret 84.98992 : found 0 times
secret 63.20242 : found 0 times
