<a href="https://colab.research.google.com/github/ewotawa/secure_private_ai/blob/master/Section_2_Federated_Learning_Final_Project_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Federated Learning Final Project

## Overview
* See  <a href="https://classroom.udacity.com/nanodegrees/nd185/parts/3fe1bb10-68d7-4d84-9c99-9539dedffad5/modules/28d685f0-0cb1-4f94-a8ea-2e16614ab421/lessons/c8fe481d-81ea-41be-8206-06d2deeb8575/concepts/a5fb4b4c-e38a-48de-b2a7-4e853c62acbe">video</a> for additional details. 
* Do Federated Learning where the central server is not trusted with the raw gradients.  
* In the final project notebook, you'll receive a dataset.  
* Train on the dataset using Federated Learning.  
* The gradients should not come up to the server in raw form.  
* Instead, use the new .move() command to move all of the gradients to one of the workers, sum them up there, and then bring that batch up to the central server and then bring that batch up 
* Idea: the central server never actually sees the raw gradient for any person.  
* We'll look at secure aggregation in course 3.  
* For now, do a larger-scale Federated Learning case where you handle the gradients in a special way.

## Approach
* Reviewing methods of classmates for Federated Learning. 

## References
*  <a href = "https://github.com/edgarinvillegas/private-ai/blob/master/Section%203%20-%20Final%20project.ipynb/">GitHub Notebook</a>
* <a href = "https://github.com/OpenMined/PySyft/blob/dev/examples/tutorials/Part%2010%20-%20Federated%20Learning%20with%20Secure%20Aggregation.ipynb">Part 10: Federated Learning with Encrypted Gradient Aggregation</a>


### Install libraries and dependencies

In [1]:
# PySyft

!pip install syft

import syft as sy

# PyTorch

!pip install torch
!pip install torchvision

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

import torchvision
from torchvision import datasets, transforms

# Numpy

import numpy as np

# time

import time



W0722 23:10:22.735807 140675696850816 secure_random.py:26] Falling back to insecure randomness since the required custom op could not be found for the installed version of TensorFlow. Fix this by compiling custom ops. Missing file was '/usr/local/lib/python3.6/dist-packages/tf_encrypted/operations/secure_random/secure_random_module_tf_1.14.0.so'
W0722 23:10:22.750314 140675696850816 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/tf_encrypted/session.py:26: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.





### Normal Federated Learning

In [0]:
# Establish a parser to contain parameters for training

class Parser: 
  def __init__(self):
    self.epochs = 10
    self.lr = 0.001
    self.test_batch_size = 1
    self.batch_size = 1
    self.log_interval = 10
    self.seed = 1
    
args = Parser()

torch.manual_seed(args.seed)

kwargs = {}

In [0]:
# A Toy Dataset

data = torch.randn(50, 2, requires_grad=True) 
target = torch.randn(50, 1, requires_grad=True) 

data_test = torch.randn(50, 2, requires_grad=True) 
target_test = torch.randn(50, 1, requires_grad=True) 

In [0]:
train = TensorDataset(data, target)
test = TensorDataset(data_test, target_test)

trainloader = DataLoader(train, batch_size=args.batch_size, shuffle=True, **kwargs)
testloader = DataLoader(test, batch_size=args.test_batch_size, shuffle=True, **kwargs)

In [5]:
# A Toy Model

class Net(nn.Module):
  def __init__(self):
    super(Net, self).__init__()
    self.fc1 = nn.Linear(2,1)

  def forward(self, x):
    x = self.fc1(x)
    return x

# model = nn.Linear(2,1)

model = Net() 

print(model)

Net(
  (fc1): Linear(in_features=2, out_features=1, bias=True)
)


In [6]:
# Optimizer
opt = optim.SGD(params=model.parameters(), lr=args.lr)
print(opt)

SGD (
Parameter Group 0
    dampening: 0
    lr: 0.001
    momentum: 0
    nesterov: False
    weight_decay: 0
)


Set up hook, virtual workers, and virtual aggregator

In [0]:
hook = sy.TorchHook(torch)
vw00 = sy.VirtualWorker(hook, id="vw00")
vw01 = sy.VirtualWorker(hook, id="vw01")

aggr = sy.VirtualWorker(hook, id="aggr")

compute_nodes = [vw00, vw01]

In [8]:
vw00.clear_objects()
vw01.clear_objects()
aggr.clear_objects()

<VirtualWorker id:aggr #objects:0>

In [9]:
# Send data to the workers
train_dist_dataset = []

for batch_idx, (data, target) in enumerate(trainloader):
  data = data.send(compute_nodes[batch_idx % len(compute_nodes)])
  target = target.send(compute_nodes[batch_idx % len(compute_nodes)])
  train_dist_dataset.append((data, target))
  
print(train_dist_dataset)

[((Wrapper)>[PointerTensor | me:84986651634 -> vw00:9736110717], (Wrapper)>[PointerTensor | me:28922064506 -> vw00:79239569856]), ((Wrapper)>[PointerTensor | me:91960528364 -> vw01:14849918809], (Wrapper)>[PointerTensor | me:47627793448 -> vw01:12761496262]), ((Wrapper)>[PointerTensor | me:73105945937 -> vw00:3638353867], (Wrapper)>[PointerTensor | me:91887967349 -> vw00:54943560023]), ((Wrapper)>[PointerTensor | me:23581797819 -> vw01:21710482536], (Wrapper)>[PointerTensor | me:23349144114 -> vw01:90651078906]), ((Wrapper)>[PointerTensor | me:54854929432 -> vw00:82710441413], (Wrapper)>[PointerTensor | me:75836206845 -> vw00:81188717192]), ((Wrapper)>[PointerTensor | me:29587680328 -> vw01:99436922736], (Wrapper)>[PointerTensor | me:16339480525 -> vw01:38537620918]), ((Wrapper)>[PointerTensor | me:65998914431 -> vw00:11070956072], (Wrapper)>[PointerTensor | me:8069036501 -> vw00:23946814010]), ((Wrapper)>[PointerTensor | me:30923284894 -> vw01:74647737767], (Wrapper)>[PointerTensor | 

In [0]:
# define loss function

def loss_func(pred, target):
  loss = ((pred - target)**2).sum()
  return loss

In [0]:
# training function

def train(epoch):
  model.train()
  
  for batch_idx, (data, target) in enumerate(train_dist_dataset):
    # determine the active worker
    worker = data.location
    # send the model to the active worker
    model.send(worker)
    
    # do normal training
    opt.zero_grad()
    pred = model(data)
    loss = loss_func(pred, target)
    loss.backward()
    opt.step()
    model.get()

    if batch_idx % args.log_interval == 0:
      loss = loss.get()
      print('Train Epoch: {} [{}/{} ({:.0f}%)]\tloss: {:.6f}'.format(
        epoch, batch_idx, len(trainloader),
          100. * batch_idx / len(trainloader), loss.item()))

In [0]:
# testing function

def test():
  model.eval()
  test_loss = 0
  for data, target in testloader:
    pred = model(data)
    test_loss += loss_func(pred, target).item() # sum up batch loss
    
  test_loss /= len(testloader.dataset)
  print('\nTest Set: Average Loss: {:.4f}\n'.format(test_loss))

In [13]:
# train the model

t = time.time()

for epoch in range(1, args.epochs + 1):
  train(epoch)
  
total_time = time.time() - t
print('total', round(total_time, 2), 'sec')

total 2.68 sec


In [14]:
# calculating performance
test()


Test Set: Average Loss: 0.8006



### Add Encrypted Aggregation

In [15]:
# Send data to the workers
remote_dataset = (list(), list())

train_dist_dataset = []

for batch_idx, (data, target) in enumerate(trainloader):
  data = data.send(compute_nodes[batch_idx % len(compute_nodes)])
  target = target.send(compute_nodes[batch_idx % len(compute_nodes)])
  remote_dataset[batch_idx % len(compute_nodes)].append((data, target))
  

print(remote_dataset)

print(train_dist_dataset)

([((Wrapper)>[PointerTensor | me:82925490801 -> vw00:58106901474], (Wrapper)>[PointerTensor | me:29269605164 -> vw00:10994019681]), ((Wrapper)>[PointerTensor | me:77539390988 -> vw00:24317123840], (Wrapper)>[PointerTensor | me:7820972731 -> vw00:83437010019]), ((Wrapper)>[PointerTensor | me:23040771618 -> vw00:79590846742], (Wrapper)>[PointerTensor | me:98537957199 -> vw00:49993631202]), ((Wrapper)>[PointerTensor | me:40347384483 -> vw00:29895684361], (Wrapper)>[PointerTensor | me:96608799219 -> vw00:37541652217]), ((Wrapper)>[PointerTensor | me:94995372736 -> vw00:57199813825], (Wrapper)>[PointerTensor | me:2976383923 -> vw00:3290409406]), ((Wrapper)>[PointerTensor | me:46471665959 -> vw00:2818563227], (Wrapper)>[PointerTensor | me:30049133190 -> vw00:76489210412]), ((Wrapper)>[PointerTensor | me:953423876 -> vw00:65863077940], (Wrapper)>[PointerTensor | me:52218564031 -> vw00:15857973164]), ((Wrapper)>[PointerTensor | me:74175956840 -> vw00:46622251441], (Wrapper)>[PointerTensor | me

In [0]:
# need to be able to send distinct model to each worker to train.

def update(data, target, model, optimizer):
  model.send(data.location)
  optimizer.zero_grad()
  pred = model(data)
  loss = loss_func(pred, target)
  loss.backward()
  optimizer.step()
  return model

In [0]:
# define worker-based models and optimizer

vw00_model = Net()
vw01_model = Net()

vw00_optimizer = optim.SGD(vw00_model.parameters(), lr=args.lr)
vw01_optimizer = optim.SGD(vw01_model.parameters(), lr=args.lr)

models = [vw00_model, vw01_model]
params = [list(vw00_model.parameters()), list(vw01_model.parameters())]
optimizers = [vw00_optimizer, vw01_optimizer]

#### Train

In [0]:
# select batch on which to train
data_index = 0

# update remote models
# here, iterate once per model

for remote_index in range(len(compute_nodes)):
  data, target = remote_dataset[remote_index][data_index]
  models[remote_index] = update(data, target, models[remote_index], optimizers[remote_index])

#### Encrypted Aggregation

In [0]:
# list to store encrypted model average
new_params = list()

In [20]:
# visualize the parameters
for i in range(len(params[0])):
  
  # for each worker
  spdz2_params = list()
  for remote_index in range(len(compute_nodes)):
    
    lcop = params[remote_index][i].copy().get() 
    
    print("\nremote index: ", remote_index, "\n")
    print(lcop)
    print(lcop.type())
    
    lcop_th = lcop * 10000
    print(lcop_th)
    
    lcop_fx = lcop_th.type(torch.LongTensor)
    print(lcop_fx)
    print(lcop_fx.type())
    
    lcop_th_fl = lcop_fx.type(torch.FloatTensor)
    lcop_fl = lcop_th_fl / 10000
    print(lcop_fl)
    print(lcop_fl.type())


remote index:  0 

tensor([[-0.6983, -0.5240]], requires_grad=True)
torch.FloatTensor
tensor([[-6983.4028, -5239.7515]], grad_fn=<MulBackward0>)
tensor([[-6983, -5239]])
torch.LongTensor
tensor([[-0.6983, -0.5239]])
torch.FloatTensor

remote index:  1 

tensor([[-0.2316, -0.2829]], requires_grad=True)
torch.FloatTensor
tensor([[-2316.2527, -2829.3748]], grad_fn=<MulBackward0>)
tensor([[-2316, -2829]])
torch.LongTensor
tensor([[-0.2316, -0.2829]])
torch.FloatTensor

remote index:  0 

tensor([0.3239], requires_grad=True)
torch.FloatTensor
tensor([3239.4980], grad_fn=<MulBackward0>)
tensor([3239])
torch.LongTensor
tensor([0.3239])
torch.FloatTensor

remote index:  1 

tensor([-0.5834], requires_grad=True)
torch.FloatTensor
tensor([-5834.4795], grad_fn=<MulBackward0>)
tensor([-5834])
torch.LongTensor
tensor([-0.5834])
torch.FloatTensor


In [21]:
# iterate through each parameter
for i in range(len(params[0])):
  
  # for each worker
  spdz_params_th = list()
  for remote_index in range(len(compute_nodes)):
    
    # copy of parameter (cop): copy same parameter from each worker's model, floating type tensor
    cop = params[remote_index][i].copy()
    
    # copy of parameter, thousands (cop_th): copy of parameter scaled up so that precision is not lost
    # fixed precision parameter, thousands (fpp_th): fixed precision version of parameter scaled up so that precision is not lost
    fpp_th = cop * 10000
    
    # encrypt on the remote machine. Note: fixed_precision_param is already a pointer. 
    # calling share encrypts data to which pointer is pointing. Returns a pointer to the MPC secret shared object. Need to fetch object.
    encrypted_param_th = fpp_th.share(vw00, vw01, crypto_provider=aggr)
    
    # fetch the pointer to the MPC shared value
    param_th = encrypted_param_th.get()
    
    # save parameter so can average with same parameter from other workers.
    spdz_params_th.append(param_th)
    
  # average param from multiple workers, fetch back to local machine. Decrypt and decode from fixed precision back to float. 
  new_param_th = (spdz_params_th[0] + spdz_params_th[1]).get() / 2 #.float_precision() / 2 
  new_param_float_th = new_param_th.type(torch.FloatTensor) 
  new_param_float = new_param_float_th / 10000
  
  # save new averaged parameter
  new_params.append(new_param_float)

print(new_params)

[tensor([[-0.4649, -0.4034]]), tensor([-0.1297])]


#### Cleanup

In [0]:
with torch.no_grad():
  for model in params:
    for param in model:
      param *= 0
  
  for model in models:
    model.get()
    
  for remote_index in range(len(compute_nodes)):
    for param_index in range(len(params[remote_index])):
      params[remote_index][param_index].set_(new_params[param_index])

### Bring components together

In [23]:
vw00.clear_objects()
vw01.clear_objects()
aggr.clear_objects()

<VirtualWorker id:aggr #objects:0>

In [24]:
# Send data to the workers
remote_dataset = (list(), list())

train_dist_dataset = []

for batch_idx, (data, target) in enumerate(trainloader):
  data = data.send(compute_nodes[batch_idx % len(compute_nodes)])
  target = target.send(compute_nodes[batch_idx % len(compute_nodes)])
  remote_dataset[batch_idx % len(compute_nodes)].append((data, target))
  

print(remote_dataset)

print(train_dist_dataset)

([((Wrapper)>[PointerTensor | me:4123999739 -> vw00:5431085165], (Wrapper)>[PointerTensor | me:58315752382 -> vw00:8723786173]), ((Wrapper)>[PointerTensor | me:22099175038 -> vw00:20626012743], (Wrapper)>[PointerTensor | me:4690546354 -> vw00:58771177119]), ((Wrapper)>[PointerTensor | me:40550134044 -> vw00:99008222936], (Wrapper)>[PointerTensor | me:20968524747 -> vw00:90109189516]), ((Wrapper)>[PointerTensor | me:76869575886 -> vw00:72968744763], (Wrapper)>[PointerTensor | me:63865429116 -> vw00:3678453272]), ((Wrapper)>[PointerTensor | me:57349627461 -> vw00:21577710447], (Wrapper)>[PointerTensor | me:42807941661 -> vw00:20612847089]), ((Wrapper)>[PointerTensor | me:25291463134 -> vw00:44532892051], (Wrapper)>[PointerTensor | me:70257934229 -> vw00:51605873455]), ((Wrapper)>[PointerTensor | me:75944110813 -> vw00:66112569301], (Wrapper)>[PointerTensor | me:60131188431 -> vw00:77377266206]), ((Wrapper)>[PointerTensor | me:25121896235 -> vw00:83845377390], (Wrapper)>[PointerTensor | m

In [25]:
print('length of remote dataset:\t', len(remote_dataset))
print('length of one item in remote dataset:\t', len(remote_dataset[0]))

length of remote dataset:	 2
length of one item in remote dataset:	 25


In [0]:
# define worker-based models and optimizer

vw00_model = Net().send(vw00)
vw01_model = Net().send(vw01)

vw00_optimizer = optim.SGD(vw00_model.parameters(), lr=args.lr)
vw01_optimizer = optim.SGD(vw01_model.parameters(), lr=args.lr)

models = [vw00_model, vw01_model]
params = [list(vw00_model.parameters()), list(vw01_model.parameters())]
optimizers = [vw00_optimizer, vw01_optimizer]

In [0]:
# need to be able to send distinct model to each worker to train.

def update2(data, target, model, optimizer):
  # model.send(data.location)
  optimizer.zero_grad()
  pred = model(data)
  loss = loss_func(pred, target)
  loss.backward()
  optimizer.step()
  return model

In [28]:
# figure out the guts of the training mechanism before putting into function.

for data_index in range(len(remote_dataset[0])):
  
  # update remote models
  
  for remote_index in range(len(compute_nodes)):
    print('data index: \t', data_index, '\tremote index:\t', remote_index)
    data, target = remote_dataset[remote_index][data_index]
    model = models[remote_index]
    opt = optimizers[remote_index]
    
    models[remote_index] = update2(data, target, model, opt)
    
  # encrypted aggregation
  new_params = list() # list to store encrypted model average
  
  for i in range(len(params[0])): # iterate through each parameter

    # for each worker
    spdz_params_th = list()
    for remote_index in range(len(compute_nodes)):

      # copy of parameter (cop): copy same parameter from each worker's model, floating type tensor
      cop = params[remote_index][i].copy()

      # copy of parameter, thousands (cop_th): copy of parameter scaled up so that precision is not lost
      # fixed precision parameter, thousands (fpp_th): fixed precision version of parameter scaled up so that precision is not lost
      fpp_th = cop * 10000

      # encrypt on the remote machine. Note: fixed_precision_param is already a pointer. 
      # calling share encrypts data to which pointer is pointing. Returns a pointer to the MPC secret shared object. Need to fetch object.
      encrypted_param_th = fpp_th.share(vw00, vw01, crypto_provider=aggr)

      # fetch the pointer to the MPC shared value
      param_th = encrypted_param_th.get()

      # save parameter so can average with same parameter from other workers.
      spdz_params_th.append(param_th)

    # average param from multiple workers, fetch back to local machine. Decrypt and decode from fixed precision back to float. 
    new_param_th = (spdz_params_th[0] + spdz_params_th[1]).get() / 2 #.float_precision() / 2 
    new_param_float_th = new_param_th.type(torch.FloatTensor) 
    new_param_float = new_param_float_th / 10000

    # save new averaged parameter
    new_params.append(new_param_float)

  print(new_params)  
  
  # cleanup
  with torch.no_grad():
    for model in params:
      for param in model:
        param *= 0

    for model in models:
      model.get()

    for remote_index in range(len(compute_nodes)):
      for param_index in range(len(params[remote_index])):
        params[remote_index][param_index].set_(new_params[param_index])

  

data index: 	 0 	remote index:	 0
data index: 	 0 	remote index:	 1
[tensor([[0.4501, 0.0806]]), tensor([0.1653])]
data index: 	 1 	remote index:	 0


RuntimeError: ignored