Clean-ups in the C++ library and Privacy-on-Beam

C++:
- Formatting changes to partition selection code
- Fix overflow in automatic bounds approximation
- Stability and security improvements

Privacy-on-Beam:
- Small cleanup improvements.

GitOrigin-RevId: fae985e84bb8ed01502e5995520fa9a7aa4ee93e
Change-Id: I906bc8074eb4bfeee9ea73bbbb91a140f291d63b
google · Aug 24, 2020 · ab1b003 · ab1b003
1 parent 1324cfe
commit ab1b003
Show file tree

Hide file tree

Showing 37 changed files with 961 additions and 661 deletions.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -40,6 +40,14 @@ frameworks.
 * **Fine building blocks over large aggregates:** one should be able to
 use sub-operations like noise generation and bounds approximation separately
 from aggregation functions.
+* **Unbiasedness:** aggregations should be unbiased if possible. In particular,
+we prefer unbiased aggregations over aggregations that post-process results for
+consistency reasons (e.g. we do not clip negative count values to 0 as this
+would introduce bias). However, we may use biased aggregations if an unbiased
+solution is not known, provides inferior utility, does not support distributed
+computation or is significantly more complex to understand/implement/maintain.
+The library should clearly indicate which aggregations are unbiased and test for
+this property.
 * **Robust Testing:** each feature must come with a full set of unit tests, and
 the privacy guarantees must be tested end-to-end.
 * Markdown is preferred for explaining complex concepts and math over lengthy

diff --git a/accounting/python/BUILD.bazel b/accounting/python/BUILD.bazel
@@ -29,6 +29,7 @@ py_library(
     srcs = [
         "privacy_loss_distribution.py",
     ],
+    srcs_version = "PY3",
     deps = [
         requirement("numpy"),
         requirement("scipy"),

diff --git a/accounting/python/privacy_loss_distribution.py b/accounting/python/privacy_loss_distribution.py
@@ -253,17 +253,11 @@ def from_two_probability_mass_functions(
     # Discretize the probability mass so that the values are integer multiples
     # of value_discretization_interval
     rounded_probability_mass_function = collections.defaultdict(lambda: 0)
+    round_fn = math.ceil if pessimistic_estimate else math.floor
     for val in probability_mass_function:
-      if pessimistic_estimate:
-        # When we would like a pessimistic estimate, round the value up.
-        rounded_probability_mass_function[int(
-            math.ceil(val / value_discretization_interval)
-        )] += probability_mass_function[val]
-      else:
-        # When we would like an optimistic estimate, round the value down.
-        rounded_probability_mass_function[int(
-            math.floor(val / value_discretization_interval)
-        )] += probability_mass_function[val]
+      rounded_probability_mass_function[
+          round_fn(val / value_discretization_interval)
+          ] += probability_mass_function[val]
 
     return cls(rounded_probability_mass_function, value_discretization_interval,
                infinity_mass)

diff --git a/cc/algorithms/BUILD b/cc/algorithms/BUILD
@@ -41,6 +41,7 @@ cc_test(
     copts = ["-Wno-sign-compare"],
     deps = [
         ":algorithm",
+        "//base:statusor",
         "//base/testing:status_matchers",
         "@com_google_googletest//:gtest_main",
     ],
@@ -282,6 +283,7 @@ cc_test(
     deps = [
         ":count",
         ":numerical-mechanisms-testing",
+        "//base:statusor",
         "//base/testing:proto_matchers",
         "//base/testing:status_matchers",
         "@com_google_googletest//:gtest_main",
@@ -300,6 +302,7 @@ cc_library(
         "//base:logging",
         "//base:status",
         "//base:statusor",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -327,6 +330,7 @@ cc_library(
     deps = [
         ":rand",
         ":util",
+        "//base:logging",
         "//base:status",
         "//base:statusor",
         "@com_google_absl//absl/memory",
@@ -359,8 +363,14 @@ cc_library(
     deps = [
         ":distributions",
         ":util",
+        "//base:logging",
         "//base:status",
+        "//base:statusor",
         "@com_google_differential_privacy//proto:confidence_interval_cc_proto",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -372,6 +382,7 @@ cc_test(
     deps = [
         ":distributions",
         ":numerical-mechanisms",
+        "//base:statusor",
         "@com_google_googletest//:gtest_main",
     ],
 )
@@ -451,6 +462,7 @@ cc_test(
         ":algorithm",
         ":approx-bounds",
         ":bounded-algorithm",
+        "//base:statusor",
         "//base/testing:status_matchers",
         "@com_google_googletest//:gtest_main",
     ],
@@ -497,10 +509,10 @@ cc_library(
     copts = ["-Wno-sign-compare"],
     deps = [
         ":numerical-mechanisms",
-        ":util",
         ":rand",
+        ":util",
         "//base:status",
-        "//base:statusor"
+        "//base:statusor",
     ],
 )
 
@@ -509,8 +521,8 @@ cc_test(
     srcs = ["partition-selection_test.cc"],
     copts = ["-Wno-sign-compare"],
     deps = [
-        ":partition-selection",
         ":numerical-mechanisms-testing",
+        ":partition-selection",
         "@com_google_googletest//:gtest_main",
     ],
-)
+)
diff --git a/cc/algorithms/algorithm_test.cc b/cc/algorithms/algorithm_test.cc
@@ -23,6 +23,7 @@
 #include "base/testing/status_matchers.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
+#include "base/statusor.h"
 
 namespace differential_privacy {
 namespace {

diff --git a/cc/algorithms/approx-bounds.h b/cc/algorithms/approx-bounds.h
@@ -263,6 +263,10 @@ class ApproxBounds : public Algorithm<T> {
       return 0;
     }
 
+    // Clamp infinities to highest and lowest value.
+    value = Clamp(std::numeric_limits<T>::lowest(),
+                  std::numeric_limits<T>::max(), value);
+
     // Sometimes the minimum numeric limit has greater magnitude than the
     // maximum. In this case clamp its magnitude at the maximum numeric limit to
     // find msb. In reality our negative bin will accommodate the value.
@@ -538,7 +542,7 @@ class ApproxBounds : public Algorithm<T> {
     for (int i = 0; i < bins.size(); ++i) {
       double noised_dbl =
           mechanism_->AddNoise(static_cast<double>(bins[i]), privacy_budget);
-      noisy_bins[i] = SafeCastFromDouble<T>(noised_dbl);
+      SafeCastFromDouble<T>(noised_dbl, noisy_bins[i]);
     }
     return noisy_bins;
   }

diff --git a/cc/algorithms/approx-bounds_test.cc b/cc/algorithms/approx-bounds_test.cc
@@ -241,6 +241,27 @@ TEST(ApproxBoundsTest, DropNanEntries) {
   EXPECT_EQ(result.elements(1).value().float_value(), 1);
 }
 
+TEST(ApproxBoundsTest, HandleInfinityEntries) {  // Suite name matches siblings.
+  std::vector<double> a = {1, 1, 1, INFINITY, INFINITY};  // 3 finite, 2 inf.
+  const double bins = 13;
+  const double base = 2;
+  const double scale = 7;
+  std::unique_ptr<ApproxBounds<double>> bounds =
+      ApproxBounds<double>::Builder()
+          .SetNumBins(bins)
+          .SetBase(base)
+          .SetScale(scale)
+          .SetThreshold(2)
+          .SetLaplaceMechanism(absl::make_unique<ZeroNoiseMechanism::Builder>())
+          .Build()
+          .ValueOrDie();
+  bounds->AddEntries(a.begin(), a.end());
+  auto result = bounds->PartialResult().ValueOrDie();
+  EXPECT_EQ(result.elements(0).value().float_value(), 0);
+  const double max_result = scale * std::pow(base, bins - 1);  // 7 * 2^12.
+  EXPECT_EQ(result.elements(1).value().float_value(), max_result);
+}
+
 TEST(ApproxBoundsTest, NumPositiveBins) {
   std::unique_ptr<ApproxBounds<double>> bounds = ApproxBounds<double>::Builder()
                                                      .SetNumBins(2)

diff --git a/cc/algorithms/bounded-sum.h b/cc/algorithms/bounded-sum.h
@@ -285,7 +285,9 @@ class BoundedSum : public Algorithm<T> {
     // Add noise to sum. Use the remaining privacy budget.
     double noisy_sum = mechanism_->AddNoise(sum, remaining_budget);
     if (std::is_integral<T>::value) {
-      AddToOutput<T>(&output, std::round(noisy_sum));
+      T value;
+      SafeCastFromDouble<T>(std::round(noisy_sum), value);
+      AddToOutput<T>(&output, value);
     } else {
       AddToOutput<T>(&output, noisy_sum);
     }

diff --git a/cc/algorithms/bounded-variance.h b/cc/algorithms/bounded-variance.h
@@ -459,7 +459,7 @@ class BoundedVariance : public Algorithm<T> {
     return mechanism_builder->SetEpsilon(epsilon)
         .SetL0Sensitivity(l0_sensitivity)
         .SetLInfSensitivity(max_contributions_per_partition *
-                            static_cast<double>((upper - lower) / 2))
+                            static_cast<double>(upper - lower) / 2.0)
         .Build();
   }
 

diff --git a/cc/algorithms/count.h b/cc/algorithms/count.h
@@ -82,8 +82,9 @@ class Count : public Algorithm<T> {
   base::StatusOr<Output> GenerateResult(double privacy_budget,
                                         double noise_interval_level) override {
     Output output;
-    int64_t countWithNoise = SafeCastFromDouble<int64_t>(
-        std::round(mechanism_->AddNoise(count_, privacy_budget)));
+    int64_t countWithNoise;
+    SafeCastFromDouble(std::round(mechanism_->AddNoise(count_, privacy_budget)),
+                       countWithNoise);
     AddToOutput<int64_t>(&output, countWithNoise);
 
     base::StatusOr<ConfidenceInterval> interval =

diff --git a/cc/algorithms/count_test.cc b/cc/algorithms/count_test.cc
@@ -23,6 +23,7 @@
 #include "base/testing/status_matchers.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
+#include "base/statusor.h"
 #include "algorithms/numerical-mechanisms-testing.h"
 #include "proto/data.pb.h"
 #include "proto/summary.pb.h"

diff --git a/cc/algorithms/distributions.cc b/cc/algorithms/distributions.cc
@@ -16,9 +16,11 @@
 #include "algorithms/distributions.h"
 
 #include <cmath>
+#include <limits>
 
 #include "absl/memory/memory.h"
 #include "absl/random/random.h"
+#include "base/statusor.h"
 #include "absl/strings/string_view.h"
 #include "algorithms/rand.h"
 #include "algorithms/util.h"
@@ -52,30 +54,32 @@ double ApproximateBinomialProbability(double sqrt_n, int64_t m) {
 
 }  // namespace
 
-GaussianDistribution::GaussianDistribution(double stddev)
-    : stddev_(stddev),
-      granularity_(GetNextPowerOfTwo(2 * stddev / kBinomialBound)) {
+GaussianDistribution::GaussianDistribution(double stddev) : stddev_(stddev) {
   DCHECK_GE(stddev, 0.0);
 }
 
 double GaussianDistribution::Sample(double scale) {
   DCHECK_GT(scale, 0);
   // TODO: make graceful behaviour when sigma is too big.
   double sigma = scale * stddev_;
+  double granularity = GetGranularity(scale);
 
   // The square root of n is chosen in a way that ensures that the respective
   // binomial distribution approximates a Gaussian distribution close enough.
   // The sqrt(n) is taken instead of n, to ensure that all results of arithmetic
   // operations fit in 64 bit integer range.
-  double sqrt_n = 2.0 * sigma / granularity_;
-  return SampleBinomial(sqrt_n) * granularity_;
+  double sqrt_n = 2.0 * sigma / granularity;
+  return SampleBinomial(sqrt_n) * granularity;
 }
 
 double GaussianDistribution::Sample() { return Sample(1.0); }
 
 double GaussianDistribution::Stddev() { return stddev_; }
 
-double GaussianDistribution::GetGranularity() { return granularity_; }
+double GaussianDistribution::GetGranularity(double scale) const {
+  double sigma = scale * stddev_;  // Same sigma that Sample(scale) computes.
+  return GetNextPowerOfTwo(2 * sigma / kBinomialBound);  // Depends on scale.
+}
 
 GeometricDistribution::GeometricDistribution(double lambda) : lambda_(lambda) {
   DCHECK_GE(lambda, 0);

diff --git a/cc/algorithms/distributions.h b/cc/algorithms/distributions.h
@@ -17,6 +17,9 @@
 #ifndef DIFFERENTIAL_PRIVACY_ALGORITHMS_DISTRIBUTIONS_H_
 #define DIFFERENTIAL_PRIVACY_ALGORITHMS_DISTRIBUTIONS_H_
 
+#include <memory>
+
+#include <cstdint>
 #include "base/statusor.h"
 
 namespace differential_privacy {
@@ -44,7 +47,10 @@ class GaussianDistribution {
   // Returns the standard deviation of this distribution.
   double Stddev();
 
-  double GetGranularity();
+  // Returns the granularity that is also used when calculating Sample(). Be
+  // careful when using GetGranularity() together with Sample() and make sure to
+  // use the same parameter for scale in such cases.
+  double GetGranularity(double scale) const;
 
  private:
   // Sample from geometric distribution with probability 0.5. It is much faster
@@ -53,7 +59,6 @@ class GaussianDistribution {
   double SampleBinomial(double sqrt_n);
 
   double stddev_;
-  double granularity_;
 };
 
 // Returns a sample drawn from the geometric distribution of probability