# Fed-BioMed Researcher to train a model on a CSV dataset

This example shows how to use a CSV file as a node dataset. The example CSV file contains synthetic data whose format is inspired by the ADNI dataset.

This example uses the Pseudo ADNI dataset. Please check the `README.md` file in the `notebooks` directory for instructions on how to load the Pseudo ADNI dataset and configure the nodes.

## Create an experiment to train a model on the data found

Declare a Torch training plan class, `MyTrainingPlan`, which will be sent to the node for training.

In [None]:
import torch
import torch.nn as nn
from fedbiomed.common.training_plans import TorchTrainingPlan
from fedbiomed.common.data import DataManager
from torch.utils.data import Dataset
import pandas as pd

# Here we define the model to be used. 
# You can use any class name (here 'MyTrainingPlan')
class MyTrainingPlan(TorchTrainingPlan):
    """Torch training plan fitting a small regression network on a CSV dataset.

    The plan reads a ';'-separated CSV file, uses its first
    ``model_args['in_features']`` columns as inputs and its last column as
    the target, and trains a two-layer fully connected network with an
    MSE loss.
    """

    def init_model(self, model_args):
        """Build the network trained by this plan.

        Args:
            model_args: dict providing the ``'in_features'`` and
                ``'out_features'`` entries read by ``Net``.

        Returns:
            A new ``Net`` instance.
        """
        return self.Net(model_args)

    def init_dependencies(self):
        """Return the extra import statements this plan declares.

        Returns:
            A list of import statements given as strings: the torch
            ``Dataset`` class, and pandas, which is needed to read the
            local .csv file at the node side.
        """
        return [
            "from torch.utils.data import Dataset",
            "import pandas as pd",
        ]

    class Net(nn.Module):
        """Fully connected network with a single 5-unit hidden layer."""

        def __init__(self, model_args):
            """Create the two linear layers.

            Args:
                model_args: dict with ``'in_features'`` and
                    ``'out_features'``; it should match the model
                    arguments dict passed to the experiment class.
            """
            super().__init__()
            self.fc1 = nn.Linear(model_args['in_features'], 5)
            self.fc2 = nn.Linear(5, model_args['out_features'])

        def forward(self, x):
            """Apply linear -> ReLU -> linear to ``x`` and return the result."""
            x = self.fc1(x)
            # `torch.relu` rather than `F.relu`: `F` (torch.nn.functional) is
            # never imported in this file, so `F.relu` raises NameError.
            x = torch.relu(x)
            return self.fc2(x)

    def training_step(self, data, target):
        """Compute the MSE loss of the model on one batch.

        Args:
            data: batch of input features.
            target: batch of target values; a trailing dimension is added
                with ``unsqueeze(1)`` before it is compared to the output.

        Returns:
            The ``torch.nn.MSELoss`` value for the batch.
        """
        # Call the module itself instead of `.forward` so that any hooks
        # registered on the model are run.
        output = self.model()(data).float()
        criterion = torch.nn.MSELoss()
        return criterion(output, target.unsqueeze(1))

    def training_data(self):
        """Load the CSV dataset and wrap it in a ``DataManager``.

        Returns:
            A ``DataManager`` built from the feature and target arrays,
            with ``shuffle=True``.
        """
        df = pd.read_csv(self.dataset_path, sep=';', index_col=False)
        x_dim = self.model_args()['in_features']
        # Features: the first `x_dim` columns; target: the last column.
        x_train = df.iloc[:, :x_dim].values
        y_train = df.iloc[:, -1].values
        return DataManager(dataset=x_train, target=y_train, shuffle=True)

In [None]:
# Model arguments: input and output sizes read by the network constructor
model_args = dict(in_features=15, out_features=1)

# Training arguments: data loader and optimizer settings, number of epochs,
# and the dry-run switch
training_args = dict(
    loader_args=dict(batch_size=20),
    optimizer_args=dict(lr=1e-3),
    epochs=10,
    dry_run=False,
)

In [None]:
from fedbiomed.researcher.federated_workflows import Experiment
from fedbiomed.researcher.aggregators.fedavg import FedAverage

# Tags used to call the training data, and the number of training rounds
tags = ['#test_data']
rounds = 5

# Build the experiment from the training plan class and the model and
# training arguments; results are aggregated with FedAverage
exp = Experiment(
    tags=tags,
    training_plan_class=MyTrainingPlan,
    model_args=model_args,
    training_args=training_args,
    round_limit=rounds,
    aggregator=FedAverage(),
    node_selection_strategy=None,
)

Let's start the experiment.

By default, this function does not return until all `round_limit` rounds are done for all the nodes.

In [None]:
# Start the experiment; by default this does not stop until all the
# `round_limit` rounds are done for all the nodes.
exp.run()

Save the trained model to a file.

In [None]:
# Export the model held by the experiment's training plan to './trained_model'.
exp.training_plan().export_model('./trained_model')

In [None]:
# Show which rounds have training replies
print("\nList the training rounds : ", exp.training_replies().keys())

# Report, for each node of the last round, the timings found in its reply
print("\nList the nodes for the last training round and their timings : ")
round_data = exp.training_replies()[rounds - 1]
for reply in round_data.values():
    node_id = reply['node_id']
    rtraining = reply['timing']['rtime_training']
    ptraining = reply['timing']['ptime_training']
    rtotal = reply['timing']['rtime_total']
    # The four spaces before each "\n" reproduce the original output exactly
    print(
        f"\t- {node_id} :    \n"
        f"\t\trtime_training={rtraining:.2f} seconds    \n"
        f"\t\tptime_training={ptraining:.2f} seconds    \n"
        f"\t\trtime_total={rtotal:.2f} seconds"
    )
print('\n')


In [None]:
# Show which rounds have aggregated parameters
print(
    "\nList the training rounds : ",
    exp.aggregated_params().keys(),
)

# Show the parameter names stored for the last training round
print("\nAccess the federated params for the last training round :")
print(
    "\t- parameter data: ",
    exp.aggregated_params()[rounds - 1]['params'].keys(),
)
