ENGPROD-29: Amending privacy accounting
* Amend input to compute_epsilon and expand orders for RDP

* Amend input and expand orders for RDP - styled

* Updating RDP input to remove duplicates

* Sorted inputs

* Add ValueError, correct function call, change variable name

* move constants out into module

* Typing for constant

* Fix import sort

* Changed how num_batches_train is calculated after a .filter() operation on a tf dataset object

* Refine Typing and correct steps calculation

* Added tests for compute_dp_sgd_privacy output

* assert values in test

* assert values in test

* Style for test

GitOrigin-RevId: 2e6a5b49e85a48cf48d25b44a4dd6f6afc5f9dc3
lipikaramaswamy committed Feb 22, 2022
1 parent 160ce91 · commit 1e6fd38
Showing 3 changed files with 78 additions and 16 deletions.
src/gretel_synthetics/tensorflow/dp_model.py (44 additions, 6 deletions)
@@ -1,12 +1,13 @@
 import importlib
 import logging
+import math
 
-from typing import Tuple, TYPE_CHECKING
+from typing import List, Tuple, TYPE_CHECKING, Union
 
 import tensorflow as tf
 
 from tensorflow.keras.optimizers import RMSprop
-from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy
+from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy_lib
 from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import (
     make_keras_optimizer_class,
 )
@@ -16,6 +17,8 @@
 else:
     TensorFlowConfig = None
 
+ORDERS = [1 + x / 20 for x in range(1, 100)] + list(range(6, 64)) + [128, 256, 512]
+
 
 def loss(labels, logits):
     return tf.keras.losses.sparse_categorical_crossentropy(
@@ -93,8 +96,42 @@ def build_dp_model(store, batch_size, vocab_size) -> tf.keras.Sequential:
     return model
 
 
+def compute_dp_sgd_privacy(
+    n: int,
+    batch_size: int,
+    noise_multiplier: float,
+    epochs: int,
+    delta: float,
+    orders: List[Union[float, int]] = ORDERS,
+) -> Tuple[float, float]:
+    """Compute epsilon based on the given hyperparameters.
+    Adaptation of TensorFlow Privacy with expanded RDP orders.
+    Args:
+        n: Number of examples in the training data
+        batch_size: Batch size used in training
+        noise_multiplier: Noise multiplier used in training
+        epochs: Number of epochs in training
+        delta: Value of delta for which to compute epsilon
+    Returns:
+        Tuple of eps, opt_order
+    """
+    if n <= 0:
+        raise ValueError("Number of examples in the training data must be positive.")
+    q = batch_size / n  # q - the sampling ratio.
+    if q > 1:
+        raise ValueError(
+            "Number of training examples must be larger than the batch size."
+        )
+    steps = int(math.ceil(epochs * n / batch_size))
+    return compute_dp_sgd_privacy_lib.apply_dp_sgd_analysis(
+        q, noise_multiplier, steps, orders, delta
+    )
+
+
 def compute_epsilon(
-    steps: int, store: TensorFlowConfig, epoch_number: int = None
+    n: int, store: TensorFlowConfig, epoch_number: int = None
 ) -> Tuple[float, float]:
     """
     Calculate epsilon and delta values for differential privacy
@@ -106,10 +143,11 @@ def compute_epsilon(
     # delta in differential privacy
     if epoch_number is None:
         epoch_number = store.epochs - 1
-    return compute_dp_sgd_privacy.compute_dp_sgd_privacy(
-        n=steps,
+
+    return compute_dp_sgd_privacy(
+        n=n,
         batch_size=store.batch_size,
         noise_multiplier=store.dp_noise_multiplier,
         epochs=epoch_number,
-        delta=1.0 / float(steps),
+        delta=1.0 / float(n),
     )
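
The substantive fix above: compute_epsilon previously passed the training step count as n and set delta = 1/steps, whereas the accountant expects the number of training examples; both now use n. A minimal sketch of calling the new helper directly, with illustrative hyperparameters (not taken from the commit):

    from gretel_synthetics.tensorflow.dp_model import ORDERS, compute_dp_sgd_privacy

    # Illustrative values; any n > 0 with batch_size <= n is accepted.
    eps, opt_order = compute_dp_sgd_privacy(
        n=10_000,            # number of training examples
        batch_size=64,
        noise_multiplier=1.1,
        epochs=30,
        delta=1.0 / 10_000,  # compute_epsilon pairs delta with 1/n
        orders=ORDERS,       # expanded RDP orders introduced by this commit
    )
    print(f"eps = {eps:.3f} at optimal RDP order {opt_order}")
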
src/gretel_synthetics/tensorflow/train.py (14 additions, 10 deletions)
@@ -51,8 +51,8 @@ class _ModelHistory(tf.keras.callbacks.Callback):
     Callback class to compute loss and accuracy during model training
     """
 
-    def __init__(self, total_token_count: int, config: TensorFlowConfig):
-        self.total_token_count = total_token_count
+    def __init__(self, num_examples_train: int, config: TensorFlowConfig):
+        self.num_examples_train = num_examples_train
         self.config = config
         self.loss = []
         self.accuracy = []
@@ -71,12 +71,12 @@ def on_epoch_end(self, epoch, logs: dict = None):
         if self.config.dp:
             # Account for tf-privacy library writing to stdout
            with redirect_stdout(io.StringIO()):
-                eps, _ = compute_epsilon(self.total_token_count, self.config, epoch)
+                eps, _ = compute_epsilon(self.num_examples_train, self.config, epoch)
                 logs[METRIC_EPSILON] = eps
 
             # NOTE: this is just a list of the same value, but
             # is simpler for creating the history csv
-            delta = 1 / float(self.total_token_count)
+            delta = 1 / float(self.num_examples_train)
             logs[METRIC_DELTA] = delta
 
         self.epsilon.append(logs.get(METRIC_EPSILON, 0))
@@ -238,7 +238,7 @@ def train_rnn(params: TrainingParams):
         "overwrite mode or delete the checkpoints first."
     )
 
-    total_token_count, validation_dataset, training_dataset = _create_dataset(
+    num_batches_train, validation_dataset, training_dataset = _create_dataset(
         store, text_iter, num_lines, tokenizer
     )
     logging.info("Initializing synthetic model")
@@ -259,7 +259,10 @@ def train_rnn(params: TrainingParams):
         monitor=store.best_model_metric,
         save_best_only=store.save_best_model,
     )
-    history_callback = _ModelHistory(total_token_count, store)
+
+    num_examples_train = store.batch_size * num_batches_train
+
+    history_callback = _ModelHistory(num_examples_train, store)
 
     _callbacks = [checkpoint_callback, history_callback]
 
@@ -331,11 +334,9 @@ def _create_dataset(
     """
     logging.info("Tokenizing input data")
     ids = []
-    total_token_count = 0
    for line in tqdm(text_iter, total=num_lines):
         _tokens = tokenizer.encode_to_ids(line)
         ids.extend(_tokens)
-        total_token_count += len(_tokens)
 
     logging.info("Shuffling input data")
     char_dataset = tf.data.Dataset.from_tensor_slices(ids)
@@ -374,9 +375,12 @@ def recover(x, y):
             .filter(is_train)
             .map(recover, num_parallel_calls=tf.data.AUTOTUNE)
         )
-        return total_token_count, validation_dataset, train_dataset
+
+        num_batches_train = len(list(train_dataset.as_numpy_iterator()))
+        return num_batches_train, validation_dataset, train_dataset
     else:
-        return total_token_count, None, full_dataset
+        num_batches_train = tf.data.experimental.cardinality(full_dataset).numpy()
+        return num_batches_train, None, full_dataset
 
 
 @tf.autograph.experimental.do_not_convert
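
The iteration-based count above is needed because tf.data.Dataset.filter() leaves the dataset with unknown cardinality, so the number of training batches cannot be read from the pipeline metadata. A standalone sketch of the same pattern on a toy dataset (not the project's pipeline):

    import tensorflow as tf

    # 100 elements in batches of 8: 12 full batches plus one partial batch of 4.
    ds = tf.data.Dataset.range(100).batch(8).filter(lambda b: tf.size(b) == 8)

    # After .filter(), TensorFlow reports UNKNOWN_CARDINALITY (-2)...
    print(tf.data.experimental.cardinality(ds).numpy())  # -2

    # ...so batches must be counted by iterating, as _create_dataset now does.
    num_batches = len(list(ds.as_numpy_iterator()))
    print(num_batches)  # 12 (the partial batch was filtered out)
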
tests/tensorflow/test_dp_model.py (20 additions, 0 deletions)
@@ -0,0 +1,20 @@
+from typing import Tuple
+
+import numpy as np
+
+from gretel_synthetics.tensorflow.dp_model import compute_dp_sgd_privacy, ORDERS
+
+
+def test_compute_dp_sgd_privacy():
+    out = compute_dp_sgd_privacy(
+        n=2000,
+        batch_size=128,
+        noise_multiplier=0.01,
+        epochs=50,
+        delta=1 / 2000,
+        orders=ORDERS,
+    )
+    assert np.isclose(out[0], 4060510)
+    assert out[1] == 1.05
+    assert len(out) == 2
+    assert isinstance(out, Tuple)

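As a sanity check on the asserted values: with n=2000, batch_size=128, and epochs=50, the helper computes q = 128 / 2000 = 0.064 and steps = ceil(50 * 2000 / 128) = 782. The epsilon of roughly 4.06e6 is expected rather than a typo: a noise multiplier of 0.01 adds almost no noise, so the resulting guarantee is vacuous, and the value presumably serves as a regression target for the accounting rather than a meaningful privacy bound.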