To try:
turn off noise - 87% accuracy after one epoch (actually better than what I got for DPSGD, which is ~70% at best)
regularizer only (no public) - gets ~75% after one epoch w/ no noise, ~63% with noise
try adam
larger public batches - similar performance w/o noise (87%), memory issues w/ opacus (should ask Shuang)
without clipping, if that's the limitation
linear regression

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.datasets as dsets
import torchvision.transforms as transforms
import numpy as np
import random
import copy
import time
from torch.optim.optimizer import Optimizer, required
from opacus.privacy_engine import PrivacyEngine
from opacus.utils.tensor_utils import calc_sample_norms

# Seed every RNG this notebook imports (Python, NumPy, PyTorch) so that the random
# public/private split, the weight initialisation and the shuffled batch order are
# repeatable across "Restart Kernel -> Run All".
# NOTE(review): a fixed seed is for experiment reproducibility only; do not rely on
# seeded noise for any real privacy guarantee.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Compute device: the first CUDA GPU when one is available, otherwise the CPU.
#devicestring = "cpu"  # uncomment (and comment out the next line) to force CPU
devicestring = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(devicestring)
print("Running on "+devicestring)

Running on cuda:0


In [22]:
# MNIST loading: a small random slice of the training set plays the role of
# "public" data, the remainder is the "private" data.
publicratio = 1.0/100   # fraction of the training set treated as public
batch_size = 500        # mini-batch size for the private and test loaders

# Convert images to tensors, then normalise with mean 0.1307 and std 0.3081.
transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.1307,), (0.3081,)),
])

# Whole training set and its size.
trainset = torchvision.datasets.MNIST(root="./data", train=True, download=True, transform=transform)
dataset_size = len(trainset)

# Random public/private partition whose sizes follow publicratio.
public_size = int(dataset_size*publicratio)
private_size = dataset_size - public_size
publicset, privateset = torch.utils.data.random_split(trainset, [public_size, private_size])

# The public loader's batch size equals the public set size, so a single batch
# yields the full public gradient.
publicloader = torch.utils.data.DataLoader(
    publicset, batch_size=public_size, shuffle=True, num_workers=2)
privateloader = torch.utils.data.DataLoader(
    privateset, batch_size=batch_size, shuffle=True, num_workers=2)

# Held-out MNIST test split, batched like the private data.
testset = torchvision.datasets.MNIST(root="./data", train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=batch_size, shuffle=True, num_workers=2)
        

In [14]:
# Sanity check on the split: print the number of samples in the public, private
# and test sets, one per line and in that order.
for subset in (publicset, privateset, testset):
    print(len(subset))

600
59400
10000


In [15]:
#MNIST net used in DPSGD experiments
class Net(nn.Module):
    """Small convolutional classifier with tanh activations.

    Two conv + max-pool stages are followed by two linear layers. The flatten
    step hard-codes 32*3*3 features per sample, which is what the conv/pool
    settings below produce for single-channel 28x28 inputs.

    forward(x) returns per-class log-probabilities with 10 columns.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=5, stride=(2, 2))
        self.pool1 = nn.MaxPool2d((2, 2), (1, 1))
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=(2, 2))
        self.pool2 = nn.MaxPool2d((2, 2), (1, 1))
        self.fc1 = nn.Linear(32*3*3, 32)
        self.fc2 = nn.Linear(32, 10)

    def forward(self, x):
        # torch.tanh replaces the deprecated F.tanh alias (same function).
        x = self.pool1(torch.tanh(self.conv1(x)))
        x = self.pool2(torch.tanh(self.conv2(x)))
        # Flatten each sample to the 32*3*3 features expected by fc1.
        x = x.view(-1, 32*3*3)
        x = self.fc1(x)
        x = self.fc2(torch.tanh(x))
        # Normalise over the class dimension explicitly. For this 2-D tensor the
        # implicit choice was also dim=1, so the values are unchanged and only
        # the deprecation warning goes away.
        return F.log_softmax(x, dim=1)

model = Net()

In [16]:
def _accuracy_percent(model, loader):
    """Return the percentage of samples in `loader` that `model` labels correctly.

    Each batch must be indexable as (inputs, labels). Both are moved to the
    notebook-level `device`, and the predicted class is the arg-max over
    dimension 1 of the model output. Gradients are disabled during evaluation.

    Returns NaN when the loader yields no samples, instead of dividing by zero.
    """
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in loader:
            images, labels = batch[0].to(device), batch[1].to(device)
            _, predicted = torch.max(model(images), 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    if total == 0:
        return float("nan")
    return 100.0 * correct / total


def printaccuracy(model):
    """Print and return `model`'s accuracy on the private and test data.

    Training accuracy is computed on the notebook-level `privateloader`, test
    accuracy on `testloader`.

    Returns:
        (trainacc, testacc): both as percentages.
    """
    #Training accuracy computed wrt private data
    trainacc = _accuracy_percent(model, privateloader)
    print('Training accuracy: %f %%' % trainacc)

    #Test accuracy
    testacc = _accuracy_percent(model, testloader)
    print('Test accuracy: %f %%' % testacc)
    return trainacc, testacc

In [20]:
# ---- Pre-training on public data ----
publicepochs = 1000   # number of passes over the public loader
publiceta = 1e-3      # learning rate of the public SGD optimizer

# ---- Mirror descent on private data ----
privateepochs = 5       # passes over the private loader
privatesubepochs = 15   # inner gradient steps taken per private batch
privateeta = 1          # The multiplier in the mirror descent step; what eta would be if the public loss was l_2^2
gradienteta = 1.0 / max(1, privatesubepochs)   # The eta used in gradient descent to approximately apply the mirror descent step
alpha = 1e-2            # Multiplier for regularizer of public loss
max_grad_norm = 1.0     # bound for the optional public-gradient scaling in the mirror-descent cell (currently disabled there)

criterion = nn.CrossEntropyLoss()   # loss used for every forward pass in the notebook
PATH = "./mirrorMNIST.pt"           # where the public pre-training cell saves its state dict

In [21]:
#Benchmark: SGD on private data (from warm start / with public-based clipping)
# Trains a fresh Net on the private loader with an SGD optimizer that has an Opacus
# PrivacyEngine attached. Before every private step it also runs the public batch
# through the model and prints the 90th percentile of the public per-sample
# gradient norms.
# As written, both the warm-start load and the public-based clipping override are
# commented out, so this runs from random initialisation with the fixed
# max_grad_norm passed to PrivacyEngine below.
# NOTE(review): the output saved under this cell ends in a RuntimeError about a
# [500, 3, 32, 32] input, so the loaders in the kernel at that time were not the
# MNIST loaders from the data cell. Re-run the notebook top to bottom before
# trusting these numbers.

epochs = 5
model = Net()
#model.load_state_dict(torch.load(PATH))
model = model.to(device)
optimizer = optim.SGD(model.parameters(), lr = 1.0)
# sample_rate is the private batch size divided by the private set size.
accountant = PrivacyEngine(model, sample_rate = 1.0*batch_size/len(privateset), epochs = epochs,
                           max_grad_norm = 1.0, noise_multiplier = 1.0, target_delta = .00001)
accountant.attach(optimizer)

for epoch in range(epochs):
    for i, data in enumerate(privateloader, 0):
        # Public pass: used here only to measure per-sample gradient norms.
        for ipub, datapub in enumerate(publicloader, 0):
            model.zero_grad()
            images, labels = datapub[0].to(device), datapub[1].to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            # NOTE(review): oldpubgrad is never read in this cell.
            oldpubgrad = [param.grad.clone() for param in list(model.parameters())]
            # Uses a private Opacus accessor; presumably this yields the per-sample
            # gradients recorded by the backward pass above -- confirm against the
            # installed Opacus version.
            all_norms = calc_sample_norms(accountant.clipper._named_grad_samples(),
                                          flat = not accountant.clipper.norm_clipper.is_per_layer,)
            # 90th percentile of the first entry of all_norms.
            batchmaxnorm = np.percentile(np.array(all_norms[0].tolist()), 90)
            print(batchmaxnorm)
# Disabled: would replace the clipping threshold with the public percentile above.
#             accountant.clipper.norm_clipper.flat_value = batchmaxnorm
#             accountant.max_grad_norm = batchmaxnorm
            
        # Private step on the current batch.
        # NOTE(review): model.zero_grad() is the plain nn.Module call. Check whether
        # per-sample gradients gathered during the public pass above are also
        # cleared before the private backward below.
        model.zero_grad()
        images, labels = data[0].to(device), data[1].to(device)
        outputs = model(images)
        # reg accumulates 0.5 * (sum of squared entries) over all parameters.
        reg = 0.0
        for param in model.parameters():
            reg += 0.5 * (param ** 2).sum()
        # NOTE(review): reg already carries the 0.5 factor and is halved again here,
        # so the penalty is alpha/4 * ||w||^2, whereas regularizer() in the
        # mirror-descent cell returns alpha/2 * ||w||^2. Confirm which is intended.
        # NOTE(review): presumably Opacus builds the update from per-sample
        # gradients; verify that the regulariser's gradient actually reaches the step.
        loss = criterion(outputs, labels) + alpha * reg / 2.0
        loss.backward()
        optimizer.step()
    print("Epoch "+str(epoch))
    printaccuracy(model)
    # Privacy spent so far, reported against the engine's target delta.
    eps, _ = accountant.get_privacy_spent()
    print("Epsilon: "+str(eps)+", Delta: "+str(accountant.target_delta))

5.440585565567017
5.395879077911377
5.417283725738526
5.446928071975708
5.486677646636963
5.547674751281739
5.685459613800049
5.776526689529419
5.904817533493042
6.006949186325073
6.205539798736573
6.387717390060425
6.582955837249756
6.790817403793335
7.017303562164307
7.258028841018676
7.425089979171752
7.831024312973023
7.949736118316651
8.23702630996704
8.30252161026001
8.666950225830078
8.657898616790773
8.99028205871582
9.251654529571534
9.462717533111572
9.58435640335083
9.826332664489746
10.078086566925048
10.138338184356689
10.266630935668946
9.970556259155273
10.782227516174316
10.061916065216066
10.664013957977296
10.193002700805666
10.653643798828131
10.549207210540773
10.745615196228027
10.8653244972229
10.817012691497805
11.108688831329346
10.988939189910889
11.633142852783203
11.050825214385986
11.701076602935794
11.192299461364748
11.71213312149048
11.573079204559328
11.729810428619386
11.72019557952881
11.903264713287355
12.162795734405517
12.154720306396484
12.26750459

RuntimeError: Given groups=1, weight of size [16, 1, 5, 5], expected input[500, 3, 32, 32] to have 1 channels, but got 3 channels instead

In [None]:
# Pre-train on the public data and save the weights as the warm start.
#
# Start from a fresh network rather than whatever `model` is left in the kernel:
# in top-to-bottom order that is the benchmark model, which has already been
# trained on private data and has a PrivacyEngine attached to it.
model = Net()
model = model.to(device)
publicoptimizer = optim.SGD(model.parameters(), lr = publiceta)

#Pre-train with public data
for epoch in range(publicepochs):
    # Progress marker every 100 epochs.
    if epoch % 100 == 0:
        print(epoch)
    for data in publicloader:
        model.zero_grad()
        images, labels = data[0].to(device), data[1].to(device)
        outputs = model(images)
        # L2 penalty alpha * 0.5 * ||w||^2, the same scaling as regularizer() in
        # the mirror-descent cell. This cell previously halved the 0.5-weighted
        # sum a second time, so the warm start minimised a differently
        # regularised public loss.
        reg = 0.0
        for param in model.parameters():
            reg += 0.5 * (param ** 2).sum()
        loss = criterion(outputs, labels) + alpha * reg
        loss.backward()
        publicoptimizer.step()

# Persist the pre-trained weights; the later cells can load them from PATH.
torch.save(model.state_dict(), PATH)

In [25]:
# Fresh network for the mirror-descent run.
model = Net()
#model.load_state_dict(torch.load(PATH)) #Toggle this to load warm start
model = model.to(device)

# Optimizer with a zero learning rate. It is the one the privacy engine is
# attached to, and the training loop steps it to clip and noise the private gradient.
privateoptimizer = optim.SGD(model.parameters(), lr=0.0)
dp_settings = dict(
    sample_rate=1.0*batch_size/len(privateset),
    epochs=privateepochs,
    max_grad_norm=100.0,
    noise_multiplier=1.0,
    target_delta=.00001,
)
accountant = PrivacyEngine(model, **dp_settings)
accountant.attach(privateoptimizer)

# Plain SGD over the same parameters; it applies the inner mirror-descent steps.
mdoptimizer = optim.SGD(model.parameters(), lr=gradienteta)
#mdoptimizer = optim.Adam(model.parameters(), lr = .001)

def regularizer(model):
    """Return alpha * 0.5 * (sum of squared entries over all parameters of `model`).

    `alpha` is read from the notebook's global scope. The result is built from
    the parameters themselves, so it can be back-propagated.
    """
    half_squares = (0.5 * (weights ** 2).sum() for weights in model.parameters())
    return alpha * sum(half_squares, 0.0)
    

#Training accuracy computed wrt private data
print("Initial model:")
printaccuracy(model)
    
# Mirror-descent training loop. For every private batch it:
#   1. snapshots the gradient of the regulariser and of the regularised public loss
#      at the current weights, and sets the clipping threshold from the public
#      per-sample gradient norms;
#   2. back-propagates the private loss and steps privateoptimizer (lr = 0.0), then
#      copies param.grad as the private gradient;
#   3. runs privatesubepochs steps of mdoptimizer on a hand-assembled gradient.
# After each epoch it prints timing, accuracy and the privacy spent.
for epoch in range(privateepochs):
    starttime = time.time()
    for i, data in enumerate(privateloader, 0):
        #Store old gradient of regularizer
        model.zero_grad()
        loss = regularizer(model)
        loss.backward()
        oldreggrad = [param.grad.clone() for param in list(model.parameters())]
                
        #Store gradient of old regularized public loss
        # zero_grad() runs on every iteration, so oldpubgrad ends up holding the
        # gradient of the last public batch only. The data cell builds publicloader
        # with a single full-size batch, in which case that is the full public gradient.
        for ipub, datapub in enumerate(publicloader, 0):
            model.zero_grad()
            images, labels = datapub[0].to(device), datapub[1].to(device)
            outputs = model(images)
            loss = criterion(outputs, labels) + regularizer(model)
            loss.backward()
            oldpubgrad = [param.grad.clone() for param in list(model.parameters())]
            # Uses a private Opacus accessor; presumably this yields the per-sample
            # gradients recorded by the public backward pass above -- confirm against
            # the installed Opacus version.
            all_norms = calc_sample_norms(accountant.clipper._named_grad_samples(),
                                          flat = not accountant.clipper.norm_clipper.is_per_layer,)
            # Clipping threshold := 90th percentile of the first entry of all_norms.
            batchmaxnorm = np.percentile(np.array(all_norms[0].tolist()), 90)
            accountant.clipper.norm_clipper.flat_value = batchmaxnorm
            accountant.max_grad_norm = batchmaxnorm
        # Printed once per private batch (this line is outside the public loop).
        print(batchmaxnorm)
        
        #Store gradient of private loss in privgrad
        # NOTE(review): model.zero_grad() is the plain nn.Module call. Check whether
        # per-sample gradients gathered during the public passes are also cleared
        # before the private backward below.
        model.zero_grad()
        images, labels = data[0].to(device), data[1].to(device)
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        #Compute the norm to use for clipping
        # (nothing is computed here; the threshold was already set from the public
        # per-sample norms above)
        
        #Clip and add noise, also do first gradient step
        # privateoptimizer was created with lr = 0.0; the step is taken for the
        # clipping and noise applied by the attached privacy engine.
        privateoptimizer.step()
        privgrad = [param.grad.clone() for param in list(model.parameters())] #Storing the clipped, noisy gradient

        # Inner loop: privatesubepochs SGD steps (mdoptimizer) on the assembled gradient.
        for subepoch in range(privatesubepochs):
            #Store new gradient of regularizer
            model.zero_grad()
            loss = regularizer(model)
            loss.backward()
            newreggrad = [param.grad.clone() for param in list(model.parameters())]
            
            #Store gradient of new regularized public loss
            # Progress line, overwritten in place via "\r".
            # NOTE(review): the displayed total, 1 + int(len(privateset)/batch_size),
            # is one too many whenever len(privateset) is a multiple of batch_size.
            print(str(i+1)+"/"+str(1+int(len(privateset)/batch_size))+" "+str(subepoch+1)+"/"+str(privatesubepochs)+"          ", end="\r")
            # After this loop param.grad holds the gradient of the last public batch
            # at the current weights (see the single-batch remark above).
            for ipub, datapub in enumerate(publicloader, 0):
                model.zero_grad()
                images, labels = datapub[0].to(device), datapub[1].to(device)
                outputs = model(images)
                loss = criterion(outputs, labels) + regularizer(model)
                loss.backward()
            #Manually compute l_2 norm of public gradient 
            # More precisely: the l_2 norm of (current public gradient - oldpubgrad),
            # taken over all parameters.
            j = 0
            normsquared = 0.0
            for param in model.parameters():
                normsquared += ((param.grad.clone() - oldpubgrad[j])**2).sum()
                j += 1
            norm = normsquared ** 0.5
            # With scale fixed to 1.0 the norm computed above is not used.
            #scale = min(1.0, max_grad_norm/norm) #Toggle this and next line to disable/enable public gradient scaling
            scale = 1.0
            #Accumulate the mirror descent gradient
            # Per parameter:
            #   (new public grad - old public grad) * scale
            #   + (new regulariser grad - old regulariser grad)
            #   + privateeta * private grad
            # The result overwrites param.grad, which mdoptimizer then steps on.
            j = 0
            for param in model.parameters():
                mirrorgrad = (param.grad.clone() - oldpubgrad[j])*scale+newreggrad[j]-oldreggrad[j]+privgrad[j]*privateeta
                param.grad = mirrorgrad
                j += 1
            mdoptimizer.step()
    
    endtime = time.time()
    #Print training/test accuracy
    # The trailing spaces overwrite what is left of the "\r" progress line.
    print("Epoch "+str(epoch+1)+"                    ")
    print("Time taken:"+str(endtime-starttime))
    printaccuracy(model)
    # Privacy spent so far, reported against the engine's target delta.
    eps, _ = accountant.get_privacy_spent()
    print("Epsilon: "+str(eps)+", Delta: "+str(accountant.target_delta))

Initial model:
Training accuracy: 10.294613 %
Test accuracy: 10.490000 %
5.033283042907715
5.061603212356568    
5.424864864349365    
6.049815893173219    
6.904062271118164    
7.658203506469727    
8.338461780548096    
8.945199871063233    
9.412149715423583    
9.706239700317385    
9.792721366882326     
9.773414325714114     
9.753943347930909     
9.556483936309817     
9.327620792388917     
8.990334606170654     
8.626467037200928     
8.329613494873048     
7.8722510814666755    
7.633558940887453     
7.26455240249634      
6.913193464279176     
6.600597095489503     
6.280859661102296     
6.040863609313965     
5.839288949966431     
5.664372158050537     
5.415504980087281     
5.195626354217531     
5.08429651260376      
4.9632229804992685    
4.7866282939910905    
4.731948184967041     
4.595897579193116     
4.469481468200686     
4.34823899269104      
4.264409017562889     
4.16892666816716      
4.128041791915908     
4.088994503021249     
4.036056327819829    

KeyboardInterrupt: 