# Performing Testing at Each Round of Training 

Use for developing (autoreloads changes made across packages)

In [1]:
%load_ext autoreload
%autoreload 2

## Start the network
Before running this notebook, start the network with `./scripts/fedbiomed_run network`

## Setting the node up
It is necessary to previously configure a node:
1. `./scripts/fedbiomed_run node add`
  * Select option 2 (default) to add MNIST to the node
  * Confirm default tags by hitting "y" and ENTER
  * Pick the folder where MNIST is downloaded (this is due to torch issue https://github.com/pytorch/vision/issues/3549)
  * Data must have been added (if you get a warning saying that data must be unique, it is because the dataset has already been added)
  
2. Check that your data has been added by executing `./scripts/fedbiomed_run node list`
3. Run the node using `./scripts/fedbiomed_run node run`. Wait until you get `Starting task manager`; it means the node is online.

## 1. Testing a PyTorch Model Using Predefined Evaluation Metrics at Each Round of Federated Training

Declare a torch.nn MyTrainingPlan class to send for training on the node.

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms

from fedbiomed.common.data import DataManager
from fedbiomed.common.training_plans import TorchTrainingPlan

# Here we define the model to be used. 
# You can use any class name (here 'MyTrainingPlan')
class MyTrainingPlan(TorchTrainingPlan):
    """Convolutional MNIST classifier sent to the nodes as the training plan.

    The network is two 3x3 convolutions followed by two fully connected
    layers. `forward` returns log-probabilities over the 10 output classes and
    `training_step` scores them with the negative log-likelihood loss.
    """

    def __init__(self, model_args: dict = None):
        """Build the layers and declare the imports needed by the plan.

        Args:
            model_args: model arguments forwarded to `TorchTrainingPlan`.
                An empty dict is used when the argument is omitted.
        """
        # `None` replaces the former `{}` default: a mutable default is created
        # once and then shared by every call that omits the argument.
        if model_args is None:
            model_args = {}
        super(MyTrainingPlan, self).__init__(model_args)

        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        # 9216 = 64 channels * 12 * 12: a 28x28 input shrinks to 24x24 after the
        # two 3x3 convolutions and to 12x12 after the 2x2 max-pooling.
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

        # Here we define the custom dependencies that will be needed by this plan.
        # Since we will train on MNIST, we need datasets and transforms from torchvision.
        # `forward` and `training_step` use `F`, so it is declared explicitly as well
        # instead of relying on an import made elsewhere.
        deps = ["from torchvision import datasets, transforms",
                "import torch.nn.functional as F"]

        self.add_dependency(deps)

    def forward(self, x):
        """Return the log-softmax class scores for a batch of images."""
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        # Keep the batch dimension, flatten everything else for the linear layers.
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)

        output = F.log_softmax(x, dim=1)
        return output

    def training_data(self, batch_size = 48):
        """Wrap the MNIST training split stored on the node in a `DataManager`.

        Args:
            batch_size: number of samples per batch, passed on to `DataManager`.

        Returns:
            DataManager built from the MNIST dataset with shuffling enabled.
        """
        # (0.1307, 0.3081) are the usual MNIST mean / standard deviation.
        transform = transforms.Compose([transforms.ToTensor(),
                                        transforms.Normalize((0.1307,), (0.3081,))])
        # download=False: the data is read from `self.dataset_path` as is.
        dataset1 = datasets.MNIST(self.dataset_path, train=True, download=False, transform=transform)
        train_kwargs = {'batch_size': batch_size, 'shuffle': True}
        return DataManager(dataset=dataset1, **train_kwargs)

    def training_step(self, data, target):
        """Return the negative log-likelihood loss of the model on one batch."""
        output = self.forward(data)
        # `forward` already returns log-probabilities, which is what nll_loss expects.
        loss = F.nll_loss(output, target)
        return loss


### 1.1 Declare and run the experiment
The model is trained on the **MNIST dataset** for classification. For testing, we will be using the **F1-Score**  as a metric. Testing will be performed on both **local updates and global updates**.

In [3]:
from fedbiomed.researcher.experiment import Experiment
from fedbiomed.researcher.aggregators.fedavg import FedAverage


# No model-specific arguments are needed for this training plan.
model_args = {}

# Arguments sent to the nodes with each training request.
training_args = {
    'batch_size': 48, 
    'lr': 1e-3, 
    'epochs': 1, 
    'dry_run': False,  
    'batch_maxnum': 100, # Fast pass for development : only use ( batch_maxnum * batch_size ) samples
  
}


# Tags used to search for the matching dataset on the nodes.
tags =  ['#MNIST', '#dataset']
# Number of rounds, passed below as `round_limit`.
rounds = 2

# Creating the experiment searches the nodes for the tagged dataset and
# uploads the training plan (see the log output below this cell).
# `tensorboard=True`: results are inspected with TensorBoard later in this notebook.
exp = Experiment(tags=tags,
                 model_args=model_args,
                 model_class=MyTrainingPlan,
                 training_args=training_args,
                 round_limit=rounds,
                 aggregator=FedAverage(),
                 node_selection_strategy=None,
                tensorboard=True)

2022-04-05 17:02:18,878 fedbiomed INFO - Component environment:
2022-04-05 17:02:18,880 fedbiomed INFO - type = ComponentType.RESEARCHER
2022-04-05 17:02:19,371 fedbiomed INFO - Messaging researcher_00aba3da-4c77-4047-832c-ee956d08535c successfully connected to the message broker, object = <fedbiomed.common.messaging.Messaging object at 0x138197160>
2022-04-05 17:02:19,413 fedbiomed INFO - Searching dataset with data tags: ['#MNIST', '#dataset'] for all nodes
2022-04-05 17:02:29,449 fedbiomed INFO - Node selected for training -> node_cebf1f87-fc00-42ca-9142-ec9226084a94
2022-04-05 17:02:29,493 fedbiomed DEBUG - Model file has been saved: /Users/jls/Development/fedbiomed/fedbiomed/var/experiments/Experiment_0050/my_model_e6b0ed0a-f068-4bb6-82bb-f7dd683adc6c.py
2022-04-05 17:02:29,820 fedbiomed DEBUG - upload (HTTP POST request) of file /Users/jls/Development/fedbiomed/fedbiomed/var/experiments/Experiment_0050/my_model_e6b0ed0a-f068-4bb6-82bb-f7dd683adc6c.py successful, with status code 

#### Declaring Testing Arguments 

- **test_ratio:** The ratio for testing partition 
- **test_metric:** The metric that is going to be used for evaluation
- **Testing on local updates:** Testing is performed after the node has trained locally, starting from the aggregated parameters.
- **Testing on global updates:** Testing is performed on the aggregated parameters, before the local training is performed.


You can display all the default metrics that are supported in Fed-BioMed. They are all based on sklearn metrics

In [4]:
from fedbiomed.common.metrics import MetricTypes
MetricTypes.get_all_metrics()

['ACCURACY',
 'F1_SCORE',
 'PRECISION',
 'RECALL',
 'MEAN_SQUARE_ERROR',
 'MEAN_ABSOLUTE_ERROR',
 'EXPLAINED_VARIANCE']

In [5]:
# Ratio of the data used as the testing partition.
exp.set_test_ratio(0.1)
# Test after the local training of each round.
exp.set_test_on_local_updates(True)
# Test the aggregated parameters before the local training.
exp.set_test_on_global_updates(True)
# Metric used for the evaluation; being the last expression, its return value
# is what the cell displays.
exp.set_test_metric(MetricTypes.F1_SCORE)

2022-04-05 17:02:39,489 fedbiomed DEBUG - Experimentation training_args updated for `job`
2022-04-05 17:02:39,489 fedbiomed DEBUG - Experimentation training_args updated for `job`
2022-04-05 17:02:39,490 fedbiomed DEBUG - Experimentation training_args updated for `job`
2022-04-05 17:02:39,492 fedbiomed DEBUG - Experimentation training_args updated for `job`


(<MetricTypes.F1_SCORE: (1, <_MetricCategory.CLASSIFICATION_LABELS: 0>)>, {})

Launch tensorboard

In [6]:
from fedbiomed.researcher.environ import environ
tensorboard_dir = environ['TENSORBOARD_RESULTS_DIR']

In [7]:
%load_ext tensorboard

In [8]:
tensorboard --logdir "$tensorboard_dir"

Let's start the experiment.

By default, this function doesn't stop until all the `round_limit` rounds are done for all the nodes

In [9]:
exp.run()

2022-04-05 17:04:29,751 fedbiomed INFO - Sampled nodes in round 0 ['node_cebf1f87-fc00-42ca-9142-ec9226084a94']
2022-04-05 17:04:29,752 fedbiomed INFO - [1mSending request[0m 
					[1m To[0m: node_cebf1f87-fc00-42ca-9142-ec9226084a94 
					[1m Reqeust: [0m: Perform training with the arguments: {'researcher_id': 'researcher_00aba3da-4c77-4047-832c-ee956d08535c', 'job_id': 'a2a6ad3b-2895-43a4-9642-71563d0bcdb0', 'training_args': {'test_ratio': 0.1, 'test_on_local_updates': True, 'test_on_global_updates': True, 'test_metric': <MetricTypes.F1_SCORE: (1, <_MetricCategory.CLASSIFICATION_LABELS: 0>)>, 'test_metric_args': {}, 'batch_size': 48, 'lr': 0.001, 'epochs': 1, 'dry_run': False, 'batch_maxnum': 100}, 'training': True, 'model_args': {}, 'command': 'train', 'model_url': 'http://localhost:8844/media/uploads/2022/04/05/my_model_e6b0ed0a-f068-4bb6-82bb-f7dd683adc6c.py', 'params_url': 'http://localhost:8844/media/uploads/2022/04/05/aggregated_params_init_f651eff4-b835-408c-9c7a-d8ffd34

2022-04-05 17:05:13,114 fedbiomed INFO - [1mINFO[0m
					[1m NODE[0m node_cebf1f87-fc00-42ca-9142-ec9226084a94
					[1m MESSAGE:[0m No `testing_step` method found in TrainingPlan: using defined metric F1_SCORE for model evaluation.[0m
-----------------------------------------------------------------
2022-04-05 17:05:18,360 fedbiomed INFO - [1mINFO[0m
					[1m NODE[0m node_cebf1f87-fc00-42ca-9142-ec9226084a94
					[1m MESSAGE:[0m Actual/True values (y_true) has more than two levels, using multiclass `weighted` calculation for the metric F1_SCORE[0m
-----------------------------------------------------------------
2022-04-05 17:05:18,386 fedbiomed INFO - [1mTESTING ON GLOBAL UPDATES[0m 
					 NODE_ID: node_cebf1f87-fc00-42ca-9142-ec9226084a94 
					 Completed: 6000/6000 (100%) 
 					 F1_SCORE: [1m0.937988[0m 
					 ---------
2022-04-05 17:05:19,321 fedbiomed INFO - [1mTRAINING[0m 
					 NODE_ID: node_cebf1f87-fc00-42ca-9142-ec9226084a94 
					 Epoch: 1 | Completed: 4

2



## 2. Training and Testing with sklearn Perceptron model


Now we will use the testing facility on a scikit-learn training plan

In [None]:
from fedbiomed.common.training_plans import SGDSkLearnModel
from fedbiomed.common.data import DataManager
import numpy as np


class SkLearnClassifierTrainingPlan(SGDSkLearnModel):
    """scikit-learn training plan trained on flattened MNIST images.

    No `testing_step` is defined here, so testing relies on the metric
    configured on the experiment.
    """

    def __init__(self, model_args):
        """Forward `model_args` to the base plan and declare the plan's imports."""
        super(SkLearnClassifierTrainingPlan,self).__init__(model_args)
        self.add_dependency(['import torch',
                            "from sklearn.linear_model import Perceptron",
                            "from torchvision import datasets, transforms",
                           "from torch.utils.data import DataLoader"])


    def training_data(self):
        """Load the MNIST training split as NumPy arrays wrapped in a `DataManager`.

        Returns:
            DataManager whose dataset is an array with one flattened image
            (28*28 values) per row and whose target is the array of labels.
        """
        # No transform is passed: the arrays below are read straight from
        # `dataset.data`, and torchvision transforms only run on item access,
        # so a transform would never be applied to them.
        dataset = datasets.MNIST(self.dataset_path, train=True, download=False)

        # One row per image: 28x28 pixels flattened into 784 features.
        X_train = dataset.data.numpy().reshape(-1, 28*28)
        Y_train = dataset.targets.numpy()

        return DataManager(dataset=X_train, target=Y_train)

It is also possible to define testing options in the training arguments (see the `training_args` of section 3.1 for an example).

In [None]:
# Arguments describing the scikit-learn model; 'model' names the estimator.
model_args = dict(
    max_iter=1000,
    tol=1e-4,
    model='Perceptron',
    n_features=28 * 28,  # one feature per pixel of a flattened 28x28 image
    n_classes=10,
    eta0=1e-6,
    random_state=1234,
    alpha=0.1,
)

# Arguments applied to the training performed at each round.
training_args = dict(epochs=5)




In [None]:
from fedbiomed.researcher.experiment import Experiment
from fedbiomed.researcher.aggregators.fedavg import FedAverage

# Tags used to search for the matching dataset on the nodes.
tags =  ['#MNIST', '#dataset']
# Number of rounds, passed below as `round_limit`.
rounds = 10

# Select the nodes participating in this experiment (those holding a dataset
# with the tags above) and set up the scikit-learn training plan.
exp = Experiment(tags=tags,
                 model_args=model_args,
                 model_class=SkLearnClassifierTrainingPlan,
                 training_args=training_args,
                 round_limit=rounds,
                 aggregator=FedAverage(),
                 node_selection_strategy=None, 
                 tensorboard=True)


# Use 20% of the data as the testing partition.
exp.set_test_ratio(.2)
# Optional: choose an explicit metric and its arguments, e.g.
#exp.set_test_metric(MetricTypes.PRECISION, average='macro')
# Test the aggregated parameters before the local training.
exp.set_test_on_global_updates(True)

In [None]:
exp.run(increase=True)

Feel free to run other sample notebooks or try your own models :D

# 3. Testing facility using your own testing metric

If users want to define their own testing metric, they can do so by defining the `testing_step` method in the training plan.

`testing_step` is defined the same way as `training_step`:

When defining a `testing_step` method in the TrainingPlan, user has to:
- predict classes or probabilities from model
- compute a scalar or a list of scalars

Method `testing_step` can return either a scalar or a list of scalars: in Tensorboard, list of scalars will be seen as the output of several metrics


## 3.1 PyTorch Training Plan

Below we showcase an example of a TorchTrainingPlan with a `testing_step` computing 3 metrics: a negative log likelihood loss, a cross entropy loss, and a custom accuracy metric

In [None]:
import torch
import torch.nn as nn
from fedbiomed.common.training_plans import TorchTrainingPlan
from fedbiomed.common.data import DataManager
from torchvision import datasets, transforms

# Here we define the model to be used. 
# You can use any class name (here 'MyTrainingPlanCM')
class MyTrainingPlanCM(TorchTrainingPlan):
    """Convolutional MNIST classifier with a custom `testing_step`.

    Same network as the plain MNIST training plan of this notebook; in
    addition, `testing_step` returns a list of three values (two losses and an
    accuracy) instead of relying on a predefined metric.
    """

    def __init__(self, model_args: dict = None):
        """Build the layers and declare the imports needed by the plan.

        Args:
            model_args: model arguments forwarded to `TorchTrainingPlan`.
                An empty dict is used when the argument is omitted.
        """
        # `None` replaces the former `{}` default: a mutable default is created
        # once and then shared by every call that omits the argument.
        if model_args is None:
            model_args = {}
        super(MyTrainingPlanCM, self).__init__(model_args)

        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        # 9216 = 64 channels * 12 * 12: a 28x28 input shrinks to 24x24 after the
        # two 3x3 convolutions and to 12x12 after the 2x2 max-pooling.
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

        # Here we define the custom dependencies that will be needed by this plan.
        # Since we will train on MNIST, we need datasets and transforms from torchvision.
        # The methods below use `F`, so it is declared explicitly as well
        # instead of relying on an import made elsewhere.
        deps = ["from torchvision import datasets, transforms",
                "import torch.nn.functional as F"]

        self.add_dependency(deps)

    def forward(self, x):
        """Return the log-softmax class scores for a batch of images."""
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        # Keep the batch dimension, flatten everything else for the linear layers.
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)

        output = F.log_softmax(x, dim=1)
        return output

    def training_data(self, batch_size = 48):
        """Wrap the MNIST training split stored on the node in a `DataManager`.

        Args:
            batch_size: number of samples per batch, passed on to `DataManager`.

        Returns:
            DataManager built from the MNIST dataset with shuffling enabled.
        """
        # (0.1307, 0.3081) are the usual MNIST mean / standard deviation.
        transform = transforms.Compose([transforms.ToTensor(),
                                        transforms.Normalize((0.1307,), (0.3081,))])
        # download=False: the data is read from `self.dataset_path` as is.
        dataset1 = datasets.MNIST(self.dataset_path, train=True, download=False, transform=transform)
        train_kwargs = {'batch_size': batch_size, 'shuffle': True}
        return DataManager(dataset=dataset1, **train_kwargs)

    def training_step(self, data, target):
        """Return the negative log-likelihood loss of the model on one batch."""
        output = self.forward(data)
        loss = F.nll_loss(output, target)
        return loss

    def testing_step(self, data, target):
        """Evaluate one batch and return three values.

        Returns:
            list: `[negative log-likelihood loss, cross-entropy loss, accuracy]`,
            each a zero-dimensional tensor.
        """
        output = self.forward(data)

        # negative log likelihood loss
        nll = F.nll_loss(output, target)

        # cross entropy
        # NOTE: `output` is already log-softmax normalised. `cross_entropy`
        # applies log_softmax once more, which leaves log-probabilities
        # unchanged, so this value equals the NLL above.
        cross_entropy = F.cross_entropy(output, target)

        # accuracy: share of samples whose highest-scoring class is the target.
        # The boolean matches are averaged as floats, which avoids dividing an
        # integer tensor.
        predicted = torch.argmax(output, dim=1)
        accuracy = (predicted == target).float().mean()

        # Returning results as list
        return [nll, cross_entropy, accuracy]

In [None]:
# The torch training plan takes no model-specific arguments.
model_args = dict()

# Training arguments, including the testing options of the experiment.
training_args = dict(
    batch_size=48,
    lr=1e-3,
    epochs=1,
    dry_run=False,
    # Fast pass for development : only use ( batch_maxnum * batch_size ) samples
    batch_maxnum=100,
    # Testing options set directly through the training arguments
    test_ratio=.3,
    test_on_local_updates=True,
    test_on_global_updates=True,
)

In [None]:
from fedbiomed.researcher.experiment import Experiment
from fedbiomed.researcher.aggregators.fedavg import FedAverage

# Tags used to search for the matching dataset on the nodes.
tags =  ['#MNIST', '#dataset']
# Number of rounds, passed below as `round_limit`.
rounds = 2

# The testing options are carried by `training_args`; the metrics themselves
# come from the `testing_step` method of `MyTrainingPlanCM`.
exp = Experiment(tags=tags,
                 model_args=model_args,
                 model_class=MyTrainingPlanCM,
                 training_args=training_args,
                 round_limit=rounds,
                 aggregator=FedAverage(),
                 node_selection_strategy=None, 
                tensorboard=True)

In [None]:
exp.run()

## 3.2 Sklearn Training Plan

Below we showcase an example of a SklearnTrainingPlan with a `testing_step` computing several metrics

In [None]:
from fedbiomed.common.training_plans import SGDSkLearnModel
from fedbiomed.common.data import DataManager
import numpy as np


class SkLearnClassifierTrainingPlan(SGDSkLearnModel):
    """scikit-learn training plan on flattened MNIST images with a custom `testing_step`.

    `testing_step` reports the hinge loss and the accuracy restricted to the
    images of digit 1.

    NOTE: this reuses the class name of the scikit-learn plan declared in
    section 2 of the notebook; running this cell replaces that definition.
    """

    def __init__(self, model_args):
        """Forward `model_args` to the base plan and declare the plan's imports."""
        super(SkLearnClassifierTrainingPlan,self).__init__(model_args)
        # `np` and `hinge_loss` are used by the methods below, so both are
        # declared explicitly instead of relying on imports made elsewhere.
        self.add_dependency(['import torch',
                            "import numpy as np",
                            "from sklearn.linear_model import Perceptron",
                            "from torchvision import datasets, transforms",
                           "from torch.utils.data import DataLoader",
                            "from sklearn.metrics import hinge_loss"])


    def compute_accuracy_for_specific_digit(self, data, target, digit: int):
        """Return the share of samples labelled `digit` that the model predicts as `digit`.

        Args:
            data: samples to predict on, one row per sample.
            target: true labels aligned with `data`.
            digit: label on which the accuracy is computed.

        Returns:
            The ratio of correctly predicted samples among those labelled
            `digit`, or NaN when no sample carries that label.
        """
        is_digit = target == digit
        n_digit = np.sum(is_digit)
        if n_digit == 0:
            # No sample of this digit: scikit-learn estimators reject an empty
            # input and the ratio is undefined, so report NaN instead of failing.
            return float('nan')

        predicted = self.model.predict(data[is_digit])
        return np.sum(predicted == digit) / n_digit

    def training_data(self):
        """Load the MNIST training split as NumPy arrays wrapped in a `DataManager`.

        Returns:
            DataManager whose dataset is an array with one flattened image
            (28*28 values) per row and whose target is the array of labels.
        """
        # No transform is passed: the arrays below are read straight from
        # `dataset.data`, and torchvision transforms only run on item access,
        # so a transform would never be applied to them.
        dataset = datasets.MNIST(self.dataset_path, train=True, download=False)

        # One row per image: 28x28 pixels flattened into 784 features.
        X_train = dataset.data.numpy().reshape(-1, 28*28)
        Y_train = dataset.targets.numpy()

        return DataManager(dataset=X_train, target=Y_train)

    def testing_step(self, data, target):
        """Compute the custom testing metrics for the given data.

        Returns:
            dict mapping each metric name to its value.
        """
        # hinge loss, computed from the signed distances to the decision hyperplanes
        distance_from_hyperplan = self.model.decision_function(data)
        loss = hinge_loss(target, distance_from_hyperplan)

        # get the accuracy only on images representing digit 1
        well_predicted_label_1 = self.compute_accuracy_for_specific_digit(data, target, 1)

        # Returning results as dict
        return {'Hinge Loss': loss, 'Well Predcited Label 1' : well_predicted_label_1}

In [None]:
# Arguments describing the scikit-learn model; 'model' names the estimator.
model_args = dict(
    max_iter=1000,
    tol=1e-4,
    model='Perceptron',
    n_features=28 * 28,  # one feature per pixel of a flattened 28x28 image
    n_classes=10,
    eta0=1e-6,
    random_state=1234,
    alpha=0.1,
)

# Arguments applied to the training performed at each round.
training_args = dict(epochs=5)


In [None]:
from fedbiomed.researcher.experiment import Experiment
from fedbiomed.researcher.aggregators.fedavg import FedAverage

# Tags used to search for the matching dataset on the nodes.
tags =  ['#MNIST', '#dataset']
# Number of rounds, passed below as `round_limit`.
rounds = 10

# Select the nodes participating in this experiment (those holding a dataset
# with the tags above) and set up the scikit-learn training plan.
exp = Experiment(tags=tags,
                 model_args=model_args,
                 model_class=SkLearnClassifierTrainingPlan,
                 training_args=training_args,
                 round_limit=rounds,
                 aggregator=FedAverage(),
                 node_selection_strategy=None, 
                 tensorboard=True)


# Use 20% of the data as the testing partition.
exp.set_test_ratio(.2)
# No predefined metric is selected here: the training plan defines its own `testing_step`.
#exp.set_test_metric(MetricTypes.PRECISION, average='macro')
# Test the aggregated parameters before the local training...
exp.set_test_on_global_updates(True)
# ...and again after the local training.
exp.set_test_on_local_updates(True)

In [None]:
exp.run(increase=True)