In [1]:
# Reload edited project modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import sys
import logging

from neoglia.workers.connect_workers import connect
from neoglia.learn.utils import setup_logging
from neoglia.learn.config import LearnConfig
from neoglia.learn.losses import cross_entropy, binary_cross_entropy
from neoglia.learn.models import ConvNet, FFNet
from neoglia.learn.learner import Learner

W0902 02:21:48.554007 140494841968448 secure_random.py:26] Falling back to insecure randomness since the required custom op could not be found for the installed version of TensorFlow. Fix this by compiling custom ops. Missing file was '/home/danielhomola/.virtualenvs/tf/lib/python3.6/site-packages/tf_encrypted/operations/secure_random/secure_random_module_tf_1.14.0.so'
W0902 02:21:48.562424 140494841968448 deprecation_wrapper.py:119] From /home/danielhomola/.virtualenvs/tf/lib/python3.6/site-packages/tf_encrypted/session.py:26: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.



In [3]:
# Root logger: let INFO-and-above records through
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Each record is rendered as "<logger name> - <level> - <message>"
formatter = logging.Formatter('%(name)s - %(levelname)s - %(message)s')

# Send records to STDERR using the format above
handler = logging.StreamHandler(stream=sys.stderr)
handler.setFormatter(formatter)

# Replace whatever handlers were attached before, so STDERR is the only destination
logger.handlers = [handler]

## Connect to data nodes

In this demo, we have 3 distinct hospitals. Each is an independent EC2 instance on AWS.

In [4]:
# Open a connection to each of the three hospital workers; every successful connection is logged.
h1, h2, h3 = connect()

neoglia.workers.connect_workers - INFO - Connected to worker h1.
neoglia.workers.connect_workers - INFO - Connected to worker h2.
neoglia.workers.connect_workers - INFO - Connected to worker h3.


Check which datasets each worker holds, along with their dimensions.

In [6]:
# Names of the datasets available on worker h1.
h1.datasets

['mnist_train',
 'mnist_test',
 'eicu_class_train',
 'eicu_class_test',
 'eicu_reg_train',
 'eicu_reg_test']

In [7]:
# Input dimensions of each dataset on h1 (the leading None is presumably the batch axis — TODO confirm).
h1.dataset_input_dims

{'mnist_train': (None, 28, 28),
 'mnist_test': (None, 28, 28),
 'eicu_class_train': (None, 103),
 'eicu_class_test': (None, 103),
 'eicu_reg_train': (None, 103),
 'eicu_reg_test': (None, 103)}

In [6]:
# Output (target) dimensions of each dataset on h1.
h1.dataset_output_dims

{'mnist_train': (None, 10),
 'mnist_test': (None, 10),
 'eicu_class_train': (None, 1),
 'eicu_class_test': (None, 1),
 'eicu_reg_train': (None, 1),
 'eicu_reg_test': (None, 1)}

In [7]:
# Size of each dataset on h1 (presumably the number of samples — TODO confirm).
h1.dataset_sizes

{'mnist_train': 12000,
 'mnist_test': 10000,
 'eicu_class_train': 4778,
 'eicu_class_test': 5421,
 'eicu_reg_train': 4778,
 'eicu_reg_test': 5421}

In [8]:
# Same check for worker h2: its eicu training sets differ in size from h1's, the test sets do not.
h2.dataset_sizes

{'mnist_train': 12000,
 'mnist_test': 10000,
 'eicu_class_train': 3981,
 'eicu_class_test': 5421,
 'eicu_reg_train': 3981,
 'eicu_reg_test': 5421}

## Train a convolutional neural network on the mnist dataset with federated averaging

Each hospital holds a subset of the training data but they all share the same test data.

## Define the config file for this experiment

This holds everything from the learning rate to the batch size. 

First, let's check the available parameters. Note that this object can either load a YAML config file (good for reproducible experiments) or be parametrised directly when instantiated.

In [8]:
# IPython help lookup: shows the LearnConfig init signature with its default values.
?LearnConfig

Init signature:
LearnConfig(
    config_file=None,
    train_dataset_name=None,
    test_dataset_name=None,
    train_batch_size=64,
    test_batch_size=128,
    train_epochs=40,
    fed_after_n_batches=10,
    lr=0.1,
    cuda=False,
    seed=42,
    save_model=True,

In [9]:
# Load the experiment settings from the YAML file, then display the resulting configuration.
# NOTE(review): the displayed config reports test_dataset_name='test_train', which is not one of
# the datasets listed by h1.datasets — presumably it should be 'mnist_test'; verify mnist_config.yml.
config = LearnConfig("mnist_config.yml")
config

{'config_file': 'mnist_config.yml',
 'train_dataset_name': 'mnist_train',
 'test_dataset_name': 'test_train',
 'train_batch_size': 64,
 'test_batch_size': 128,
 'train_epochs': 40,
 'fed_after_n_batches': 10,
 'lr': 0.1,
 'cuda': False,
 'seed': 42,
 'save_model': True,
 'verbose': True}

## Define model architecture and loss function

Define a model architecture in PyTorch, or simply load one of NeoGlia's predefined ones.

In [10]:
# Instantiate the predefined ConvNet from neoglia.learn.models and display its layers.
model = ConvNet()
model

ConvNet(
  (conv1): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(20, 50, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=800, out_features=500, bias=True)
  (fc2): Linear(in_features=500, out_features=10, bias=True)
)

We'll use cross entropy as the loss function in this example, since this is a multi-class classification problem.

## Start training and evaluating the model in a federated manner

In [11]:
# Build the federated learner from the config, the model, the cross-entropy loss and the three workers.
fed_learner = Learner(config, model, cross_entropy, (h1, h2, h3))

In [12]:
# Run federated training and evaluation; the average loss is logged per training round and worker.
# NOTE(review): the saved run aborted during round 3 with "RuntimeError: Websocket connection
# closed and creation of new connection failed." — re-run on a stable connection before sharing.
# NOTE(review): the log reports "epoch 1/41" although train_epochs is 40 — possible off-by-one
# in the learner's epoch counter or log message; verify in neoglia.learn.learner.
fed_learner.train_eval()

neoglia.learn.learner - INFO - Starting epoch 1/41
neoglia.learn.learner - INFO - Training round: 1, worker: h1, avg_loss: tensor(0.8722, grad_fn=<MeanBackward1>)
neoglia.learn.learner - INFO - Training round: 1, worker: h2, avg_loss: tensor(1.4286, grad_fn=<MeanBackward1>)
neoglia.learn.learner - INFO - Training round: 1, worker: h3, avg_loss: tensor(1.2797, grad_fn=<MeanBackward1>)
neoglia.learn.learner - INFO - Starting epoch 2/41
neoglia.learn.learner - INFO - Training round: 2, worker: h2, avg_loss: tensor(0.8743, grad_fn=<MeanBackward1>)
neoglia.learn.learner - INFO - Training round: 2, worker: h1, avg_loss: tensor(0.7070, grad_fn=<MeanBackward1>)
neoglia.learn.learner - INFO - Training round: 2, worker: h3, avg_loss: tensor(1.6277, grad_fn=<MeanBackward1>)
neoglia.learn.learner - INFO - Starting epoch 3/41
neoglia.learn.learner - INFO - Training round: 3, worker: h2, avg_loss: tensor(0.4867, grad_fn=<MeanBackward1>)
neoglia.learn.learner - INFO - Training round: 3, worker: h1, a

RuntimeError: Websocket connection closed and creation of new connection failed.