ENGPROD-29: Amending privacy accounting
* Amend input to compute_epsilon and expand orders for RDP

* Amend input and expand orders for RDP - styled

* Updating RDP input to remove duplicates

* Sorted inputs

* Add ValueError, correct function call, change variable name

* move constants out into module

* Typing for constant

* Fix import sort

* Changed how num_batches_train is calculated after a .filter() operation on a tf dataset object

* Refine Typing and correct steps calculation

* Added tests for compute_dp_sgd_privacy output

* assert values in test

* assert values in test

* Style for test

GitOrigin-RevId: 2e6a5b49e85a48cf48d25b44a4dd6f6afc5f9dc3
lipikaramaswamy committed Feb 22, 2022
1 parent 160ce91 · commit 1e6fd38
Showing 3 changed files with 78 additions and 16 deletions.
src/gretel_synthetics/tensorflow/dp_model.py (44 additions, 6 deletions)
@@ -1,12 +1,13 @@
 import importlib
 import logging
+import math
 
-from typing import Tuple, TYPE_CHECKING
+from typing import List, Tuple, TYPE_CHECKING, Union
 
 import tensorflow as tf
 
 from tensorflow.keras.optimizers import RMSprop
-from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy
+from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy_lib
 from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import (
     make_keras_optimizer_class,
 )
@@ -16,6 +17,8 @@
 else:
     TensorFlowConfig = None
 
+ORDERS = [1 + x / 20 for x in range(1, 100)] + list(range(6, 64)) + [128, 256, 512]
+
 
 def loss(labels, logits):
     return tf.keras.losses.sparse_categorical_crossentropy(
@@ -93,8 +96,42 @@ def build_dp_model(store, batch_size, vocab_size) -> tf.keras.Sequential:
     return model
 
 
+def compute_dp_sgd_privacy(
+    n: int,
+    batch_size: int,
+    noise_multiplier: float,
+    epochs: int,
+    delta: float,
+    orders: List[Union[float, int]] = ORDERS,
+) -> Tuple[float, float]:
+    """Compute epsilon based on the given hyperparameters.
+    Adaptation of TensorFlow Privacy with expanded RDP orders.
+    Args:
+        n: Number of examples in the training data
+        batch_size: Batch size used in training
+        noise_multiplier: Noise multiplier used in training
+        epochs: Number of epochs in training
+        delta: Value of delta for which to compute epsilon
+    Returns:
+        Tuple of eps, opt_order
+    """
+    if n <= 0:
+        raise ValueError("Number of examples in the training data must be positive.")
+    q = batch_size / n  # q - the sampling ratio.
+    if q > 1:
+        raise ValueError(
+            "Number of training examples must be larger than the batch size."
+        )
+    steps = int(math.ceil(epochs * n / batch_size))
+    return compute_dp_sgd_privacy_lib.apply_dp_sgd_analysis(
+        q, noise_multiplier, steps, orders, delta
+    )
+
+
 def compute_epsilon(
-    steps: int, store: TensorFlowConfig, epoch_number: int = None
+    n: int, store: TensorFlowConfig, epoch_number: int = None
 ) -> Tuple[float, float]:
     """
     Calculate epsilon and delta values for differential privacy
@@ -106,10 +143,11 @@ def compute_epsilon(
     # delta in differential privacy
     if epoch_number is None:
         epoch_number = store.epochs - 1
-    return compute_dp_sgd_privacy.compute_dp_sgd_privacy(
-        n=steps,
+
+    return compute_dp_sgd_privacy(
+        n=n,
         batch_size=store.batch_size,
         noise_multiplier=store.dp_noise_multiplier,
         epochs=epoch_number,
-        delta=1.0 / float(steps),
+        delta=1.0 / float(n),
     )
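
The substantive fix above: compute_epsilon previously passed the training step count as n and set delta = 1/steps, whereas the accountant expects the number of training examples; both now use n. A minimal sketch of calling the new helper directly, with illustrative hyperparameters (not taken from the commit):

    from gretel_synthetics.tensorflow.dp_model import ORDERS, compute_dp_sgd_privacy

    # Illustrative values; any n > 0 with batch_size <= n is accepted.
    eps, opt_order = compute_dp_sgd_privacy(
        n=10_000,            # number of training examples
        batch_size=64,
        noise_multiplier=1.1,
        epochs=30,
        delta=1.0 / 10_000,  # compute_epsilon pairs delta with 1/n
        orders=ORDERS,       # expanded RDP orders introduced by this commit
    )
    print(f"eps = {eps:.3f} at optimal RDP order {opt_order}")
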
src/gretel_synthetics/tensorflow/train.py (14 additions, 10 deletions)
@@ -51,8 +51,8 @@ class _ModelHistory(tf.keras.callbacks.Callback):
     Callback class to compute loss and accuracy during model training
     """
 
-    def __init__(self, total_token_count: int, config: TensorFlowConfig):
-        self.total_token_count = total_token_count
+    def __init__(self, num_examples_train: int, config: TensorFlowConfig):
+        self.num_examples_train = num_examples_train
         self.config = config
         self.loss = []
         self.accuracy = []
@@ -71,12 +71,12 @@ def on_epoch_end(self, epoch, logs: dict = None):
         if self.config.dp:
             # Account for tf-privacy library writing to stdout
            with redirect_stdout(io.StringIO()):
-                eps, _ = compute_epsilon(self.total_token_count, self.config, epoch)
+                eps, _ = compute_epsilon(self.num_examples_train, self.config, epoch)
                 logs[METRIC_EPSILON] = eps
 
             # NOTE: this is just a list of the same value, but
             # is simpler for creating the history csv
-            delta = 1 / float(self.total_token_count)
+            delta = 1 / float(self.num_examples_train)
             logs[METRIC_DELTA] = delta
 
         self.epsilon.append(logs.get(METRIC_EPSILON, 0))
@@ -238,7 +238,7 @@ def train_rnn(params: TrainingParams):
         "overwrite mode or delete the checkpoints first."
     )
 
-    total_token_count, validation_dataset, training_dataset = _create_dataset(
+    num_batches_train, validation_dataset, training_dataset = _create_dataset(
         store, text_iter, num_lines, tokenizer
     )
     logging.info("Initializing synthetic model")
@@ -259,7 +259,10 @@ def train_rnn(params: TrainingParams):
         monitor=store.best_model_metric,
         save_best_only=store.save_best_model,
     )
-    history_callback = _ModelHistory(total_token_count, store)
+
+    num_examples_train = store.batch_size * num_batches_train
+
+    history_callback = _ModelHistory(num_examples_train, store)
 
     _callbacks = [checkpoint_callback, history_callback]
 
@@ -331,11 +334,9 @@ def _create_dataset(
     """
     logging.info("Tokenizing input data")
     ids = []
-    total_token_count = 0
    for line in tqdm(text_iter, total=num_lines):
         _tokens = tokenizer.encode_to_ids(line)
         ids.extend(_tokens)
-        total_token_count += len(_tokens)
 
     logging.info("Shuffling input data")
     char_dataset = tf.data.Dataset.from_tensor_slices(ids)
@@ -374,9 +375,12 @@ def recover(x, y):
             .filter(is_train)
             .map(recover, num_parallel_calls=tf.data.AUTOTUNE)
         )
-        return total_token_count, validation_dataset, train_dataset
+
+        num_batches_train = len(list(train_dataset.as_numpy_iterator()))
+        return num_batches_train, validation_dataset, train_dataset
     else:
-        return total_token_count, None, full_dataset
+        num_batches_train = tf.data.experimental.cardinality(full_dataset).numpy()
+        return num_batches_train, None, full_dataset
 
 
 @tf.autograph.experimental.do_not_convert
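
The iteration-based count above is needed because tf.data.Dataset.filter() leaves the dataset with unknown cardinality, so the number of training batches cannot be read from the pipeline metadata. A standalone sketch of the same pattern on a toy dataset (not the project's pipeline):

    import tensorflow as tf

    # 100 elements in batches of 8: 12 full batches plus one partial batch of 4.
    ds = tf.data.Dataset.range(100).batch(8).filter(lambda b: tf.size(b) == 8)

    # After .filter(), TensorFlow reports UNKNOWN_CARDINALITY (-2)...
    print(tf.data.experimental.cardinality(ds).numpy())  # -2

    # ...so batches must be counted by iterating, as _create_dataset now does.
    num_batches = len(list(ds.as_numpy_iterator()))
    print(num_batches)  # 12 (the partial batch was filtered out)
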
tests/tensorflow/test_dp_model.py (20 additions, 0 deletions)
@@ -0,0 +1,20 @@
+from typing import Tuple
+
+import numpy as np
+
+from gretel_synthetics.tensorflow.dp_model import compute_dp_sgd_privacy, ORDERS
+
+
+def test_compute_dp_sgd_privacy():
+    out = compute_dp_sgd_privacy(
+        n=2000,
+        batch_size=128,
+        noise_multiplier=0.01,
+        epochs=50,
+        delta=1 / 2000,
+        orders=ORDERS,
+    )
+    assert np.isclose(out[0], 4060510)
+    assert out[1] == 1.05
+    assert len(out) == 2
+    assert isinstance(out, Tuple)

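As a sanity check on the asserted values: with n=2000, batch_size=128, and epochs=50, the helper computes q = 128 / 2000 = 0.064 and steps = ceil(50 * 2000 / 128) = 782. The epsilon of roughly 4.06e6 is expected rather than a typo: a noise multiplier of 0.01 adds almost no noise, so the resulting guarantee is vacuous, and the value presumably serves as a regression target for the accounting rather than a meaningful privacy bound.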