<a href="https://colab.research.google.com/github/eazydammy/private-ai/blob/master/course-projects/Differential%20Privacy%20Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# install PySyft
!pip install syft

# import required libraries
import torch
import random
import numpy as np
from torch import nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Subset
from torchvision import datasets, transforms
from syft.frameworks.torch.differential_privacy import pate

# select the compute device: CUDA when torch reports a usable GPU, otherwise the CPU
# (the train/validation/labelling helpers below move models and batches onto this device)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [0]:
# preprocessing applied to every MNIST image: convert to a tensor, then
# normalise the single channel with mean 0.5 and std 0.5
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

# training split (downloaded into ./data on first use)
train_data = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)

# held-out split, later divided into evaluation and private subsets
test_data = datasets.MNIST(
    root='./data',
    train=False,
    download=True,
    transform=transform,
)

In [0]:
# define function to split training data into sub-datasets for a number of teachers
# returns one data loader per teacher, each covering a disjoint slice of the training data
# TODO: split into train and test 
def split_train_data(train_data, num_teachers, batch_size=64):
    ''' Partitions `train_data` into consecutive shards, one DataLoader per teacher.

        The dataset is cut into `num_teachers` contiguous index ranges whose
        sizes differ by at most one: when the length is not evenly divisible,
        the first `len(train_data) % num_teachers` shards get one extra sample.

        Arguments
        ---------
        train_data: indexable dataset that supports len()
        num_teachers: integer, number of shards / loaders to create
        batch_size: integer, batch size used by every loader (default 64)

        Returns
        -------
        list of `num_teachers` shuffling DataLoaders, in shard order

        Raises
        ------
        ZeroDivisionError: if `num_teachers` is 0
    '''
    base, extra = divmod(len(train_data), num_teachers)

    teacher_loaders = []
    for i in range(num_teachers):
        # the first `extra` shards absorb the remainder, one sample each
        start = i * base + min(i, extra)
        stop = (i + 1) * base + min(i + 1, extra)
        shard = Subset(train_data, range(start, stop))
        teacher_loaders.append(
            torch.utils.data.DataLoader(shard, batch_size=batch_size, shuffle=True)
        )
    return teacher_loaders

In [0]:
# define function to take a chunk of the test data as private dataset
# returns reduced test data and private dataset in ratio: 0 < ratio < 1
# to split reduced_test_data:private_data in 80:20, use ratio 0.8
def split_test_data(test_data, ratio, batch_size=64):
    ''' Splits `test_data` into an evaluation part and a private part.

        The first `int(len(test_data) * ratio)` samples form the reduced test
        set; the remaining samples form the private set.

        Arguments
        ---------
        test_data: indexable dataset that supports len()
        ratio: float with 0 < ratio < 1, fraction kept as reduced test data
        batch_size: integer, batch size used by both loaders (default 64)

        Returns
        -------
        (test_loader, private_loader): DataLoaders over the two subsets
    '''
    total = len(test_data)
    divide = int(total * ratio)

    reduced_subset = Subset(test_data, range(0, divide))
    private_subset = Subset(test_data, range(divide, total))

    test_loader = torch.utils.data.DataLoader(
        reduced_subset, batch_size=batch_size, shuffle=True
    )
    # The private set is served in a fixed order: labels produced by iterating
    # this loader are only meaningful if row k always refers to the same
    # sample, which a reshuffling loader cannot guarantee between passes.
    private_loader = torch.utils.data.DataLoader(
        private_subset, batch_size=batch_size, shuffle=False
    )

    return test_loader, private_loader

In [0]:
# define class to build feedforward (fully connected) classifier models for each teacher
class Classifier(nn.Module):
    def __init__(self, input_size, output_size, hidden_layers, drop_p=0.5):
        ''' Fully connected network with a configurable stack of hidden layers.

            Arguments
            ---------
            input_size: integer, number of input features
            output_size: integer, number of output classes
            hidden_layers: non-empty list of integers, width of each hidden layer
            drop_p: float, dropout probability applied after every hidden layer

        '''
        super().__init__()

        # Widths of consecutive layers: input -> hidden[0] -> ... -> hidden[-1]
        widths = [input_size, *hidden_layers]
        stack = [nn.Linear(fan_in, fan_out)
                 for fan_in, fan_out in zip(widths[:-1], widths[1:])]
        self.hidden_layers = nn.ModuleList(stack)

        # Final projection from the last hidden layer to the class scores
        self.output = nn.Linear(hidden_layers[-1], output_size)
        self.dropout = nn.Dropout(p=drop_p)

    def forward(self, x):
        ''' Runs x through every hidden layer (ReLU then dropout) and returns
            the log-softmax of the output layer along dim 1 '''
        for layer in self.hidden_layers:
            x = self.dropout(F.relu(layer(x)))

        return F.log_softmax(self.output(x), dim=1)

In [0]:
# define function to train model given train and test datasets
def train(model, trainloader, testloader, criterion, optimizer, epochs=5, print_every=50):
    ''' Trains `model` on `trainloader` and periodically reports progress.

        Every batch is moved to the module-level `device` and flattened to
        (batch, features) before the forward pass. Every `print_every`
        optimisation steps the model is evaluated on `testloader` with
        `validation` and the running training loss, test loss and test
        accuracy are printed.

        Arguments
        ---------
        model: nn.Module to optimise; moved to `device` in place
        trainloader: iterable of (images, labels) training batches
        testloader: iterable of (images, labels) batches used for the reports
        criterion: loss function called as criterion(output, labels)
        optimizer: optimiser already bound to the model's parameters
        epochs: integer, number of passes over `trainloader`
        print_every: integer, number of steps between two progress reports
    '''
    steps = 0
    running_loss = 0
    model.to(device)
    for e in range(epochs):
        model.train() # Model in training mode, grad & dropout is on
        for images, labels in trainloader:
            images, labels = images.to(device), labels.to(device)
            steps += 1
            # Flatten each sample; reshape derives the feature count from the
            # data instead of reinterpreting storage in place with a fixed 784.
            images = images.reshape(images.size(0), -1)
            optimizer.zero_grad()
            # Call the module itself (not .forward) so registered hooks run.
            output = model(images)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

            if steps % print_every == 0:
                model.eval() # Model in evaluation mode, grad & dropout is off
                with torch.no_grad():
                    test_loss, accuracy = validation(model, testloader, criterion)

                print("Epoch: {}/{}.. ".format(e+1, epochs),
                      "Training Loss: {:.3f}.. ".format(running_loss/print_every),
                      "Test Loss: {:.3f}.. ".format(test_loss/len(testloader)),
                      "Test Accuracy: {:.3f}".format(accuracy/len(testloader)))

                running_loss = 0
                model.train() # Model in training mode, grad & dropout is on

In [0]:
# define function to validate model using the reduced test data set
def validation(model, testloader, criterion):
    ''' Accumulates loss and accuracy of `model` over `testloader`.

        Batches are moved to the module-level `device` and flattened to
        (batch, features). The model's train/eval mode is not changed here;
        the caller is expected to set it.

        Arguments
        ---------
        model: nn.Module producing one row of class scores per sample
        testloader: iterable of (images, labels) batches
        criterion: loss function called as criterion(output, labels)

        Returns
        -------
        (test_loss, accuracy): the SUM over batches of the batch loss (float)
        and of the per-batch mean accuracy (tensor, or the int 0 when the
        loader is empty); divide both by the number of batches for averages.
    '''
    accuracy = 0
    test_loss = 0
    # Nothing here needs gradients, so skip building the autograd graph.
    with torch.no_grad():
        for images, labels in testloader:
            images, labels = images.to(device), labels.to(device)
            images = images.reshape(images.size(0), -1)
            # Call the module itself (not .forward) so registered hooks run.
            output = model(images)
            test_loss += criterion(output, labels).item()
            # The highest score marks the predicted class; exponentiating the
            # log-probabilities first would not change which index wins.
            predicted = output.argmax(dim=1)
            # Fraction of correct predictions in this batch
            accuracy += (predicted == labels).float().mean()
    return test_loss, accuracy

In [0]:
# define function to build teacher models
def build_teacher_models(num_teachers, teacher_dropout):
    ''' Creates `num_teachers` independent, untrained Classifier instances.

        Each one maps 784 inputs to 10 outputs through hidden layers of
        512, 256 and 128 units, with dropout probability `teacher_dropout`.

        Returns
        -------
        list of Classifier models
    '''
    return [
        Classifier(784, 10, [512, 256, 128], drop_p=teacher_dropout)
        for _ in range(num_teachers)
    ]

In [0]:
# define function to train teacher models
def train_teacher_models(teacher_models, teacher_loaders, test_loader, teacher_criterion):
    ''' Trains every teacher on the loader that shares its position.

        Teacher i is fitted on teacher_loaders[i] for 50 epochs with a fresh
        SGD optimiser (lr=1e-3, momentum=0.9) and evaluated on `test_loader`
        by `train`. Progress messages are printed before and after each one.
    '''
    for position, teacher in enumerate(teacher_models):
        number = position + 1
        print("Begin training Teacher", number)
        sgd = optim.SGD(teacher.parameters(), lr=1e-3, momentum=0.9)
        train(teacher, teacher_loaders[position], test_loader, teacher_criterion, sgd, epochs=50)
        print("Teacher", number, "trained successfully! \n")

In [0]:
# define function to perform PATE analysis
def run_pate_analysis(noised_teacher_labels, raw_teacher_labels, epsilon):
    ''' Runs PySyft's PATE analysis and prints both epsilon values.

        Arguments
        ---------
        noised_teacher_labels: forwarded as `teacher_preds`
        raw_teacher_labels: forwarded as `indices`
        epsilon: forwarded as `noise_eps`

        delta is fixed at 1e-5. Nothing is returned.
        NOTE(review): expected array shapes are defined by
        pate.perform_analysis - confirm against the PySyft documentation.
    '''
    analysis = pate.perform_analysis(
        teacher_preds=noised_teacher_labels,
        indices=raw_teacher_labels,
        noise_eps=epsilon,
        delta=1e-5,
    )
    # perform_analysis yields the data-dependent value first
    dependent_eps, independent_eps = analysis
    print("Data Independent Epsilon:", independent_eps)
    print("Data Dependent Epsilon:", dependent_eps)

In [0]:
# define function to save model training results
def save_trained_models():
    ''' Writes the state_dict of every model in the module-level
        `teacher_models` to checkpoint<N>.pth in the working directory,
        where N is the 1-based position of the teacher in that list.
    '''
    for number, teacher in enumerate(teacher_models, start=1):
        torch.save(teacher.state_dict(), 'checkpoint%d.pth'%number)
        print("Saved model results for Teacher", number)

In [0]:
# define function to load pre-trained models
def load_trained_models():
    ''' Restores every model in the module-level `teacher_models` from
        checkpoint<N>.pth, where N is the teacher's 1-based position.

        Tensors are remapped onto the module-level `device` while loading, so
        checkpoints written on a GPU machine can be read on a CPU-only one.
    '''
    for number, teacher in enumerate(teacher_models, start=1):
        state_dict = torch.load('checkpoint%d.pth' % number, map_location=device)
        teacher.load_state_dict(state_dict)

In [0]:
# define function to classify private dataset
def label_private_data(teacher_models, private_loader):
    ''' Lets every teacher predict a label for every private sample.

        Arguments
        ---------
        teacher_models: list of trained nn.Module teachers
        private_loader: iterable of (images, labels) batches; labels are ignored

        Returns
        -------
        integer array of shape (num_samples, num_teachers); entry [k, t] is
        teacher t's predicted class for the k-th sample yielded by the loader.
        An empty array is returned when there are no teachers or no batches.

        Side effects: every teacher is moved to the module-level `device` and
        left in evaluation mode.
    '''
    if not teacher_models:
        return np.array([])

    for teacher in teacher_models:
        teacher.to(device)
        # Inference must run with dropout disabled; training leaves it on.
        teacher.eval()

    batches = []
    with torch.no_grad():
        # Iterate the loader ONCE and query all teachers on the same batch, so
        # row k refers to the same sample for every teacher even if the loader
        # reshuffles between passes.
        for images, _ in private_loader:
            images = images.to(device)
            images = images.reshape(images.size(0), -1)
            votes = torch.stack(
                [teacher(images).argmax(dim=1) for teacher in teacher_models],
                dim=1,
            )
            batches.append(votes.cpu().numpy())

    if not batches:
        return np.array([])
    return np.concatenate(batches, axis=0)

In [0]:
# define function to condense teacher labels for each image
def condense_teacher_labels(teacher_labels):
    ''' Reduces each row of teacher votes to its most frequent label.

        Arguments
        ---------
        teacher_labels: iterable of 1-D arrays of non-negative integer labels,
                        one row per sample

        Returns
        -------
        array with one winning label per row; ties go to the smallest label
    '''
    winners = [
        np.argmax(np.bincount(votes, minlength = 10))
        for votes in teacher_labels
    ]
    return np.array(winners)

In [0]:
# define function to add Laplacian noise scaled by epsilon value
def add_laplacian_noise(raw_teacher_labels, epsilon):
    ''' Returns a copy of `raw_teacher_labels` with Laplace noise added.

        One independent sample from Laplace(0, 1/epsilon) is added to every
        element. The input is NOT modified.

        Arguments
        ---------
        raw_teacher_labels: array-like of numbers
        epsilon: non-zero number; the noise scale is 1 / epsilon

        Returns
        -------
        new array with the same shape and dtype as the input. For integer
        input the noisy values are cast back to that integer dtype, i.e.
        truncated toward zero.

        Raises
        ------
        ZeroDivisionError: if `epsilon` is 0

        NOTE(review): this perturbs the label values themselves. PATE's noisy
        aggregation is usually described as adding noise to per-class vote
        counts before the arg-max - confirm this is the intended mechanism.
    '''
    beta = 1 / epsilon
    labels = np.asarray(raw_teacher_labels)
    # Draw all samples at once instead of one size-1 array per element.
    noise = np.random.laplace(0, beta, size=labels.shape)
    # `labels + noise` allocates a new array, so the caller's data is untouched;
    # astype keeps the output dtype identical to the input dtype.
    noised_teacher_labels = (labels + noise).astype(labels.dtype)
    return noised_teacher_labels

In [0]:
# define function to create student model
def create_student_model():
  ''' Placeholder for building the student model; no construction logic yet.

      The body only returns the name `student_model`, which is neither a
      parameter nor assigned inside this function, so a call raises NameError
      unless a module-level `student_model` exists at call time.
  '''
  
  return student_model

In [0]:
# define function to train student model
def train_student_model():
  ''' Placeholder for training the student model; no training logic yet.

      The body only returns the name `trained_student_model`, which is neither
      a parameter nor assigned inside this function, so a call raises
      NameError unless a module-level `trained_student_model` exists.
  '''
  
  return trained_student_model

In [0]:
# define function to validate student model
def validate_student_model():
  ''' Placeholder for evaluating the student model; no evaluation logic yet.

      The body only returns the name `accuracy`, which is neither a parameter
      nor assigned inside this function, so a call raises NameError unless a
      module-level `accuracy` exists at call time.
  '''
  
  return accuracy

In [0]:
## DEFINE HYPERPARAMETERS
# number of teacher models; the training set is split into this many shards
num_teachers = 100
# dropout probability used by every teacher network
teacher_dropout = 0.25
# negative log-likelihood loss, matching the log-softmax output of Classifier.forward
teacher_criterion = nn.NLLLoss()
# the Laplace noise scale is 1/epsilon; the same value is passed to the PATE analysis as noise_eps
epsilon = 0.1

## CREATE DATALOADERS
# one loader per teacher, each over a disjoint slice of the training data
teacher_loaders = split_train_data(train_data, num_teachers)
# first 80% of the test set is kept for evaluation, the remaining 20% becomes the private set
test_loader, private_loader = split_test_data(test_data, ratio=0.8)

## MODELING FOR TEACHERS
teacher_models = build_teacher_models(num_teachers, teacher_dropout)
train_teacher_models(teacher_models, teacher_loaders, test_loader, teacher_criterion)

## SAVE TRAINING RESULTS FOR TEACHERS
# reads the module-level teacher_models defined above
save_trained_models()

## USE TEACHER MODELS TO CLASSIFY PRIVATE DATASET
# raw_teacher_labels holds one row per private sample and one column per teacher
raw_teacher_labels = label_private_data(teacher_models, private_loader)
# per-sample majority vote, computed from the labels before any noise is added
condensed_labels = condense_teacher_labels(raw_teacher_labels)

## ADD LAPLACIAN NOISE TO PREDICTIONS BY TEACHERS
noised_teacher_labels = add_laplacian_noise(raw_teacher_labels, epsilon)

## PERFORM PATE ANALYSIS TO CHECK PRIVACY LEAKAGE
# transposed so that teachers index the first axis; the majority-vote labels are
# passed in the slot that run_pate_analysis forwards as `indices`
# NOTE(review): confirm the array layout and label range expected by pate.perform_analysis
run_pate_analysis(noised_teacher_labels.T, condensed_labels, epsilon)

## MODELING FOR STUDENT
#student_model = 
#train_student_model = 

## VALIDATE ACCURACY OF STUDENT MODEL
