# DP-SGD (Differentially-Private Stochastic Gradient Descent)
Reference: https://medium.com/pytorch/differential-privacy-series-part-1-dp-sgd-algorithm-explained-12512c3959a3

In [None]:
# Naive first attempt: ordinary mini-batch SGD where Gaussian noise is added
# to the batch gradient before the update. No per-sample clipping happens here.
# The optimizer must be given the parameters it is responsible for; the manual
# update below spells out what optimizer.step() would do.
optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)

for batch in torch.utils.data.DataLoader(train_dataset, batch_size=32):
    x, y = batch
    y_hat = model(x)
    loss = criterion(y_hat, y)
    loss.backward()

    # Now p.grad is filled for every parameter that took part in the loss.
    # The weights are modified in place, so autograd tracking is switched off.
    with torch.no_grad():
        for p in model.parameters():
            if p.grad is None:
                # Parameter did not contribute to the loss; nothing to update.
                continue

            # Add our differential privacy magic here: Gaussian noise with
            # standard deviation args.sigma and the same shape as the gradient.
            p.grad += args.sigma * torch.randn_like(p.grad)

            # This is what optimizer.step() does. The update is in place so
            # the model's own tensor changes, not just the loop variable.
            p -= args.lr * p.grad
            p.grad.zero_()

In [None]:
# Collect one gradient per training sample by running a separate
# forward/backward pass for every sample of the batch.
optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)

for batch in torch.utils.data.DataLoader(train_dataset, batch_size=32):
    # The loader yields one (inputs, targets) pair per batch; iterating over
    # `batch` directly would walk over that pair, not over the samples.
    x_batch, y_batch = batch

    all_per_sample_gradients = []  # will have len = batch_size
    for x, y in zip(x_batch, y_batch):
        # Re-add a batch dimension of size 1 for the single sample.
        # NOTE(review): assumes model/criterion expect a leading batch
        # dimension -- confirm for the model in use.
        y_hat = model(x.unsqueeze(0))
        loss = criterion(y_hat, y.unsqueeze(0))
        loss.backward()  # Now p.grad for this x is filled

        # Need to clone it to save it: p.grad is reset just below.
        per_sample_gradients = [p.grad.detach().clone() for p in model.parameters()]

        all_per_sample_gradients.append(per_sample_gradients)
        model.zero_grad()  # p.grad is cumulative so we'd better reset it

<img src = './figures/dp-sgd-algorithm.png' width=600>

1. Compute the per-sample gradients

2. Clip them to a fixed maximum norm

3. Aggregate them back into a single parameter gradient

4. Add noise to it

In [5]:
from torch.nn.utils import clip_grad_norm_

# Hand-written DP-SGD step: per-sample gradients are clipped, summed, noised
# and averaged before the weights are updated.
# The optimizer must be given the parameters it is responsible for; the manual
# update at the end spells out what optimizer.step() would do.
optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)

for batch in torch.utils.data.DataLoader(train_dataset, batch_size=32):
    # The loader yields one (inputs, targets) pair per batch.
    x_batch, y_batch = batch

    for param in model.parameters():
        param.accumulated_grads = []

    # Run the microbatches (one sample each)
    for x, y in zip(x_batch, y_batch):
        # p.grad is cumulative, so start every sample from a clean slate.
        model.zero_grad()

        # 1. Compute the per-sample gradients.
        # NOTE(review): assumes model/criterion expect a leading batch
        # dimension -- confirm for the model in use.
        y_hat = model(x.unsqueeze(0))
        loss = criterion(y_hat, y.unsqueeze(0))
        loss.backward()

        # 2. Clip them to a fixed maximum norm. clip_grad_norm_ rescales the
        # .grad of the parameters it is given (in place), so it has to be
        # called on the parameters themselves, not on a detached copy.
        clip_grad_norm_(model.parameters(), max_norm=args.max_grad_norm)

        for param in model.parameters():
            if param.grad is not None:
                param.accumulated_grads.append(param.grad.detach().clone())

    # Now we are ready to add noise and update!
    # The weights are modified in place, so autograd tracking is switched off.
    with torch.no_grad():
        for param in model.parameters():
            if not param.accumulated_grads:
                # Parameter never received a gradient in this batch.
                continue

            # 3. Aggregate back: sum over the sample dimension so the result
            # has the same shape as the parameter.
            summed_grad = torch.stack(param.accumulated_grads, dim=0).sum(dim=0)

            # 4. Add noise to the aggregated gradient, then average over the
            # number of samples.
            noise_std = args.noise_multiplier * args.max_grad_norm
            noise = noise_std * torch.randn_like(summed_grad)
            param.grad = (summed_grad + noise) / len(param.accumulated_grads)

            # In-place update so the model's own tensor changes.
            param -= args.lr * param.grad

            param.grad = None  # Reset for next iteration

In [3]:
# !pip install opacus

In [None]:
from opacus import PrivacyEngine
# define your components as usual
model = Net()
# Spelled torch.optim.SGD, matching the other cells; a bare `SGD` is never imported.
optimizer = torch.optim.SGD(model.parameters(), lr=0.05)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=1024)

# enter PrivacyEngine
privacy_engine = PrivacyEngine()
# make_private hands back a (model, optimizer, data_loader) triple; all three
# names are rebound so the rest of the training code uses the returned objects.
model, optimizer, data_loader = privacy_engine.make_private(
    module=model,
    optimizer=optimizer,
    data_loader=data_loader,
    noise_multiplier=1.1,
    max_grad_norm=1.0,
)
# Now it's business as usual