<a href="https://colab.research.google.com/github/ghosh-sarbajit/DifferentialPrivacy/blob/main/DPSgd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
!pip install opacus
!pip install opendp==0.11
from IPython.display import clear_output
clear_output()

# Chapter 9: Differentially Private Machine Learning (p. 219)

# Differentially Private Gradient


In [17]:
import numpy as np
import opendp.prelude as dp

In [18]:
def make_nabla_loss_i(w):
    """Build a transformation that maps each (x, y) row to its loss gradient.

    The model is the line ``y_hat = w[0] + w[1] * x``. For every input row the
    transformation emits ``(y_hat - y) * [1, x]``, i.e. the gradient of
    ``0.5 * (y_hat - y) ** 2`` with respect to ``(w[0], w[1])``.

    :param w: pair ``(w_0, w_1)`` of intercept and slope; the two values are
        read once, when the transformation is constructed.
    :return: a user-defined OpenDP transformation from a 2-D float array to a
        2-D float array under the symmetric distance. It produces one output
        row per input row, hence the identity stability map.
    """
    dp.assert_features("contrib", "floating-point")
    intercept, slope = w

    def per_row_gradients(data):
        # (N, 2) -> two column vectors of shape (N, 1)
        features, targets = data[np.newaxis].T
        residuals = (intercept + slope * features) - targets  # y^ - y
        design = np.column_stack([np.ones(features.size), features])
        return residuals * design

    array_domain = dp.numpy.array2_domain(T=float)
    row_metric = dp.symmetric_distance()
    return dp.t.make_user_transformation(
        input_domain=array_domain,
        input_metric=row_metric,
        output_domain=array_domain,
        output_metric=row_metric,
        function=per_row_gradients,
        stability_map=lambda b_in: b_in)

In [19]:
# public metadata
N = 100_000  # number of records
max_contributions = 1  # input distance bound (used as d_in for the compositor)

# "load" the data: synthetic draws from y = 3 + 2x + standard-normal noise.
# A seeded generator makes the sample identical on every Restart & Run All.
SEED = 0
rng = np.random.default_rng(SEED)
x = rng.uniform(-5, 5, size=N)
y = 3 + 2 * x + rng.normal(size=x.size)
data = np.column_stack((x, y))

In [20]:
# model hyperparameters
w = np.zeros(2)    # initial choice of params: (intercept, slope) = (0, 0)
gamma = 0.3        # step-size factor applied to every gradient update
num_steps = 20     # number of gradient updates (one private query each)
norm = 2.0         # clipping radius: assumes most grads have magnitude lte 2
noise_std = 100.0  # scale handed to the Gaussian mechanism

In [32]:
# CHAPTER 3 Stable Transformations 79p
def make_np_sum(norm, p, origin=None):
    """Build a transformation that sums the rows of a norm-bounded 2-D array.

    :param norm: radius ``R`` of the p-norm ball every input row lies in;
        must be non-negative.
    :param p: which norm bounds the rows. ``1`` selects the L1 output metric
        and ``2`` the L2 output metric; any other value raises ``KeyError``.
    :param origin: centre ``O`` of the ball (scalar or vector); ``None``
        means zero.
    :return: a user-defined OpenDP transformation whose output is the
        column-wise sum and whose stability map is ``b_in * (||O||_p + R)``.
    """
    dp.assert_features("contrib", "floating-point")
    assert norm >= 0, "norm must not be negative"
    # assume the origin is at zero if not specified
    if origin is None:
        origin = 0.0

    # A single row can be at most this far from zero:  C = ||O||_p + R
    origin_norm = np.linalg.norm(np.atleast_1d(origin), ord=p)
    constant = origin_norm + norm

    def column_sums(data):
        return data.sum(axis=0)

    def scale_distance(b_in):
        return b_in * constant

    bounded_rows = dp.numpy.array2_domain(norm=norm, p=p, origin=origin)
    row_metric = dp.symmetric_distance()
    float_vector = dp.vector_domain(dp.atom_domain(T=float))
    metric_by_p = {1: dp.l1_distance, 2: dp.l2_distance}
    return dp.t.make_user_transformation(
        input_domain=bounded_rows,
        input_metric=row_metric,
        output_domain=float_vector,
        output_metric=metric_by_p[p](T=float),
        function=column_sums,
        stability_map=scale_distance)

In [33]:
# CHAPTER 3 Stable Transformations 80p
def make_np_clamp(norm, p, origin=None):
    """Build a transformation that clips every row into a p-norm ball.

    Each row is shifted by ``-origin``, scaled down (never up) so its p-norm
    is at most ``norm``, and shifted back. The input array is not modified.

    :param norm: radius of the ball; must be non-negative. A radius of zero
        maps every row exactly onto ``origin``.
    :param p: order of the row norm, forwarded to ``np.linalg.norm``.
    :param origin: centre of the ball (scalar or vector); ``None`` means zero.
    :return: a user-defined OpenDP transformation from an unconstrained 2-D
        float array to a norm-bounded one, with identity stability map.
    """
    dp.assert_features("contrib", "floating-point")
    assert norm >= 0., "norm must not be negative"
    # assume the origin is at zero if not specified
    origin = 0.0 if origin is None else origin

    def clamp_row_norms(data):
        data = data.copy()
        # shift the data around zero
        data -= origin

        if norm == 0:
            # A ball of radius zero contains only its centre. Dividing by
            # `row_norms / 0` would produce NaN for rows already at the
            # centre (0 / 0), which lies outside the declared output domain.
            data[:] = 0.0
        else:
            # compute the p-norm of each row
            row_norms = np.linalg.norm(data, ord=p, axis=1, keepdims=True)
            # scale each row down to have norm at most `norm`
            data /= np.maximum(row_norms / norm, 1)

        # shift the normed data around zero back to `origin`
        data += origin
        return data

    return dp.t.make_user_transformation(
        input_domain=dp.numpy.array2_domain(T=float),  # input data is unconstrained
        input_metric=dp.symmetric_distance(),
        output_domain=dp.numpy.array2_domain(norm=norm, p=p, origin=origin),
        output_metric=dp.symmetric_distance(),
        function=clamp_row_norms,
        stability_map=lambda b_in: b_in)  # norm clamping is 1-stable row-by-row

In [30]:
# https://github.com/opendp/opendp/discussions/304
from opendp.mod import enable_features
# Opt in to the OpenDP feature flags this notebook needs, in a single call.
# "contrib" and "floating-point" are asserted by the constructors defined in
# this notebook; "honest-but-curious" is presumably what permits the
# user-defined transformations they build (confirm against the OpenDP docs).
enable_features("contrib", "floating-point", "honest-but-curious")

In [34]:
# Private-sum pipeline: clip each row -> sum the rows -> add Gaussian noise.
sum_meas = (
    make_np_clamp(norm, p=2)
    >> make_np_sum(norm, p=2)
    >> dp.m.then_gaussian(scale=noise_std)
    >> np.array  # a postprocessor: load the release into a numpy array
)

In [36]:
# Privacy cost of a single noisy-sum release, as reported by its privacy map.
per_query_cost = sum_meas.map(max_contributions)

# Sequential compositor budgeted for `num_steps` releases of that cost.
meas_comp = dp.c.make_sequential_composition(
    input_domain=sum_meas.input_domain,
    input_metric=sum_meas.input_metric,
    output_measure=dp.zero_concentrated_divergence(T=float),
    d_in=max_contributions,
    d_mids=[per_query_cost] * num_steps,
)

# qbl is an instance of the compositor that allows up to `num_steps` queries
qbl = meas_comp(data)

# now the only way to access the data is through the compositor
# (re-running this cell therefore requires re-running the data cell first)
del data

NameError: name 'data' is not defined

In [37]:
# Total zCDP cost of all composed queries.
total_rho = meas_comp.map(max_contributions)
print(total_rho)  # -> 0.004 = ρ

# Convert the zCDP guarantee into an (ε, δ) curve and read ε off at δ = 1e-8.
approx_dp_comp = dp.c.make_zCDP_to_approxDP(meas_comp)
εδ_curve = approx_dp_comp.map(max_contributions)
print(εδ_curve.epsilon(1e-8))  # -> (0.4659, 1e-8) = (ε, δ)

0.004000000000000002
0.46596519652756707


In [38]:
# Loop-invariant factor of the update: step size times 2/N.
step_scale = gamma * 2 / N
for _ in range(num_steps):
    # make a mechanism that computes the gradient at the current parameters
    meas_nabla_loss = make_nabla_loss_i(w) >> sum_meas
    # privately release the summed gradient by querying the compositor
    noisy_grad_sum = qbl(meas_nabla_loss)
    # gradient-descent step, updating `w` in place
    w -= step_scale * noisy_grad_sum

AttributeError: module 'opendp.prelude' has no attribute 'np_array2_domain'

In [None]:
# (Removed) This cell repeated the training loop above verbatim. The compositor
# `qbl` accepts at most `num_steps` queries in total, and the first loop already
# issues all of them, so a second pass cannot succeed on a fresh
# Restart & Run All. `w` is already trained at this point.

In [None]:
# The data were generated as y = 3 + 2x + noise, so `w` should land near [3, 2].
print("params:", w) # ~> [3.00183246 1.97430499]

# Stochastic Batching (DP-SGD)

In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import LabelEncoder, StandardScaler, \
OrdinalEncoder, Normalizer
from opacus import PrivacyEngine

In [None]:
class AdultDataSet(Dataset):
    def __init__(self, adult_data_file):
    header = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship',
    'race', 'sex', 'capital_gain', 'capital_loss',
    'hours_per_week', 'native_country', 'income']
    df = pd.read_csv(adult_data_file, header=None, names=header,
    sep=',\\s', na_values=['?'], engine='python')
    df = df.dropna()
    df = df.reset_index(drop=True)
    df['income'] = df['income'].apply(lambda x: x.replace('.', ''))
    categorical_columns = ['workclass', 'education', 'marital_status',
    'occupation', 'relationship', 'race', 'sex',
    'native_country']
    numerical_columns = ['age', 'capital_gain',
    'capital_loss', 'hours_per_week']
    column_transformer = make_column_transformer(
    (OrdinalEncoder(), categorical_columns),
    (StandardScaler(), numerical_columns),
    )
    self.y = LabelEncoder().fit_transform(df['income']).astype(float)
    self.X = column_transformer.fit_transform(df)
    self.X = Normalizer().fit_transform(self.X)
    def __len__(self):
    return len(self.y)