# Data Preprocessing : Download Used Cars Dataset 
https://www.kaggle.com/adityadesai13/used-car-dataset-ford-and-mercedes

In [None]:
import pandas as pd

# Dataset Details
The data consists of used cars listings. 100,000 listings, which have been separated into files corresponding to each car manufacturer. Each file will simulate data for each node.

In [None]:
audi = pd.read_csv("___PATH to audi.csv___")
bmw = pd.read_csv("___PATH to bmw.csv___")
ford = pd.read_csv("___PATH to ford.csv___")
cclass = pd.read_csv("___PATH to cclass.csv___")
focus = pd.read_csv("___PATH to focus.csv___")
hyundai = pd.read_csv("___PATH to huyndai.csv___")
merc = pd.read_csv("___PATH to merc.csv___")
skoda = pd.read_csv("___PATH to skoda.csv___")
toyata = pd.read_csv("___PATH to toyata.csv___")
vauxhall = pd.read_csv("___PATH to vauxhall.csv___")
vw = pd.read_csv("__PATH to vw.csv___")

Drop columns model & fuelType as labels are not consistent across files. A better solution could be vertical federated learning

In [None]:
audi.drop(columns = ['model','fuelType'],inplace = True)
bmw.drop(columns = ['model','fuelType'],inplace = True)
ford.drop(columns = ['model','fuelType'],inplace = True)
cclass.drop(columns = ['model','fuelType'],inplace = True)
focus.drop(columns = ['model','fuelType'],inplace = True)
hyundai.drop(columns = ['model','fuelType'],inplace = True)
merc.drop(columns = ['model','fuelType'],inplace = True)
skoda.drop(columns = ['model','fuelType'],inplace = True)
toyata.drop(columns = ['model','fuelType'],inplace = True)
vauxhall.drop(columns = ['model','fuelType'],inplace = True)
vw.drop(columns = ['model','fuelType'],inplace = True)

# Fedbiomed Researcher to train a model on a Used Cars dataset

Use for developing (autoreloads changes made across packages)

In [1]:
%load_ext autoreload
%autoreload 2

## Start the network and setting the client up
Before running this notebook, you shoud start the network from fedbiomed-network, as detailed in https://gitlab.inria.fr/fedbiomed/fedbiomed-network
Therefore, it is necessary to previously configure a node:
Also ensure that you have run data-preprocessing-used-cars-dataset.ipynb notebook to preprocess the used cars dataset
1. `./scripts/fedbiomed_run node add`
  * Select option 1 to add a csv file to the client
  * Choose the name, tags and description of the dataset
  * Run the Data Preprcess Used Cars notebook for generating csv files
  * Spin as many nodes as you want(max nodes 11 for 11 csv files in used cars dataset). Hold out one file for testing.
  * Load the .csv file generated using above mentioned notebook to individual nodes
2. Check that your data has been added by executing `./scripts/fedbiomed_run node list`
3. Run the node using `./scripts/fedbiomed_run node start`. Wait until you get `Connected with result code 0`. it means you are online.

## Create an experiment to train a model on the data found

Declare a torch.nn MyTrainingPlan class to send for training on the node

In [2]:
from fedbiomed.researcher.environ import TMP_DIR
import tempfile
tmp_dir_model = tempfile.TemporaryDirectory(dir=TMP_DIR+'/')
model_file = tmp_dir_model.name + '/class_export_csv.py'

Note : write **only** the code to export in the following cell

In [3]:
%%writefile "$model_file"

import torch
import torch.nn as nn
from fedbiomed.common.torchnn import TorchTrainingPlan
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd

# Here we define the model to be used. 
# You can use any class name (here 'MyTrainingPlan')
class MyTrainingPlan(TorchTrainingPlan):       
    def __init__(self, kwargs):
        super(MyTrainingPlan, self).__init__()
        # kwargs should match the model arguments to be passed below to the experiment class
        self.in_features = kwargs['in_features']
        self.out_features = kwargs['out_features']
        self.fc1 = nn.Linear(self.in_features, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 16)
        self.fc4 = nn.Linear(16, 8)
        self.fc5 = nn.Linear(8, 4)
        self.fc6 = nn.Linear(4, self.out_features)
        
        # Here we define the custom dependencies that will be needed by our custom Dataloader
        # In this case, we need the torch Dataset and DataLoader classes
        # We need pandas to read the local .csv file at the client side
        deps = ["from torch.utils.data import Dataset, DataLoader",
                "import pandas as pd"]
        self.add_dependency(deps)

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)
        x = F.relu(x)
        x = self.fc4(x)
        x = F.relu(x)
        x = self.fc5(x)
        x = F.relu(x)
        x = self.fc6(x)
        return x

    def training_step(self, data, target):
        output = self.forward(data).float()
        criterion = torch.nn.MSELoss()
        loss   = torch.sqrt(criterion(output, target.unsqueeze(1)))
        return loss

    class csv_Dataset(Dataset):
    # Here we define a custom Dataset class inherited from the general torch Dataset class
    # This class takes as argument a .csv file path and creates a torch Dataset 
        def __init__(self, dataset_path, x_dim):
            self.input_file = pd.read_csv(dataset_path,sep=',',index_col=False)
            x_train = self.input_file.loc[:,('year','transmission','mileage','tax','mpg','engineSize')].values
            y_train = self.input_file.loc[:,'price'].values
            self.X_train = torch.from_numpy(x_train).float()
            self.Y_train = torch.from_numpy(y_train).float()

        def __len__(self):            
            return len(self.Y_train)

        def __getitem__(self, idx):

            return (self.X_train[idx], self.Y_train[idx])
        
    def training_data(self,  batch_size = 48):
    # The training_data creates the Dataloader to be used for training in the general class TorchTrainingPlan of fedbiomed
        dataset = self.csv_Dataset(self.dataset_path, self.in_features)
        train_kwargs = {'batch_size': batch_size, 'shuffle': True}
        data_loader = DataLoader(dataset, **train_kwargs)
        return data_loader

Writing /home/fedbiomed/var/tmp/tmpqzvqfnt8/class_export_csv.py


In [4]:
# model parameters 
model_args = {
    'in_features': 6, 
    'out_features': 1
}

# training parameters 
training_args = {
    'batch_size': 20, 
    'lr': 1e-3, 
    'epochs': 10, 
    'dry_run': False,  
    #'batch_maxnum': 100 # Fast pass for development : only use ( batch_maxnum * batch_size ) samples
}

Define an experiment
- search nodes serving data for these `tags`, optionally filter on a list of client ID with `clients`
- run a round of local training on nodes with model defined in `model_path` + federation with `aggregator`
- run for `rounds` rounds, applying the `client_selection_strategy` between the rounds

In [5]:
from fedbiomed.researcher.experiment import Experiment
from fedbiomed.researcher.aggregators.fedavg import FedAverage

# Calling the training data with specified tags. Change the following tag accordingly
tags =  ['UsedCars']
rounds = 5

exp = Experiment(tags=tags,
                 #clients=None,
                 model_path=model_file,
                 model_class='MyTrainingPlan',
                 model_args=model_args,
                 training_args=training_args,
                 rounds=rounds,
                 aggregator=FedAverage(),
                 client_selection_strategy=None)

Messaging researcher_482350d3-a296-42f1-8f84-66c8c12dcd9d connected with result code 0
Searching for clients with data tags: ['UsedCars'] ...
2021-09-03 07:00:54.504172 [ RESEARCHER ] message received. {'researcher_id': 'researcher_482350d3-a296-42f1-8f84-66c8c12dcd9d', 'success': True, 'databases': [{'name': 'temp', 'data_type': 'csv', 'tags': ['UsedCars'], 'description': 'temp', 'shape': [10668, 6], 'dataset_id': 'dataset_87a505c6-48a6-40a4-bdf6-d7a178daf7f7'}], 'count': 1, 'client_id': 'client_5c0bb3c3-adb2-4fe8-9ed1-00c64110c8e5', 'command': 'search'}
2021-09-03 07:00:54.542004 [ RESEARCHER ] message received. {'researcher_id': 'researcher_482350d3-a296-42f1-8f84-66c8c12dcd9d', 'success': True, 'databases': [{'name': 'temp', 'data_type': 'csv', 'tags': ['UsedCars'], 'description': 'temp', 'shape': [10781, 6], 'dataset_id': 'dataset_ec752b95-24fc-4106-bacd-70dc5b665b4a'}], 'count': 1, 'client_id': 'client_9335c64d-df9b-4a9e-b9c7-f262369cb07a', 'command': 'search'}


Let's start the experiment.

By default, this function doesn't stop until all the `rounds` are done for all the clients

In [6]:
exp.run()

Sampled clients in round  0   ['client_5c0bb3c3-adb2-4fe8-9ed1-00c64110c8e5', 'client_9335c64d-df9b-4a9e-b9c7-f262369cb07a']
[ RESEARCHER ] Send message to client  client_5c0bb3c3-adb2-4fe8-9ed1-00c64110c8e5 {'researcher_id': 'researcher_482350d3-a296-42f1-8f84-66c8c12dcd9d', 'job_id': 'c3fb2c54-5f75-43c0-b93f-ea3c620fcad0', 'training_args': {'batch_size': 20, 'lr': 0.001, 'epochs': 10, 'dry_run': False}, 'model_args': {'in_features': 6, 'out_features': 1}, 'command': 'train', 'model_url': 'http://localhost:8844/media/uploads/2021/09/03/my_model_tP1OyDI.py', 'params_url': 'http://localhost:8844/media/uploads/2021/09/03/my_model_mO38dIx.pt', 'model_class': 'MyTrainingPlan', 'training_data': {'client_5c0bb3c3-adb2-4fe8-9ed1-00c64110c8e5': ['dataset_87a505c6-48a6-40a4-bdf6-d7a178daf7f7']}}
researcher_482350d3-a296-42f1-8f84-66c8c12dcd9d
[ RESEARCHER ] Send message to client  client_9335c64d-df9b-4a9e-b9c7-f262369cb07a {'researcher_id': 'researcher_482350d3-a296-42f1-8f84-66c8c12dcd9d', 'j

2021-09-03 07:02:40.414120 [ RESEARCHER ] message received. {'researcher_id': 'researcher_482350d3-a296-42f1-8f84-66c8c12dcd9d', 'job_id': 'c3fb2c54-5f75-43c0-b93f-ea3c620fcad0', 'success': True, 'client_id': 'client_5c0bb3c3-adb2-4fe8-9ed1-00c64110c8e5', 'dataset_id': 'dataset_87a505c6-48a6-40a4-bdf6-d7a178daf7f7', 'params_url': 'http://localhost:8844/media/uploads/2021/09/03/node_params_110bd5f7-e7f9-4124-a85c-ebf21a8d643d.pt', 'timing': {'rtime_training': 9.53348030000052, 'ptime_training': 9.593045499999988}, 'msg': '', 'command': 'train'}
Downloading model params after training on  client_9335c64d-df9b-4a9e-b9c7-f262369cb07a 
	- from http://localhost:8844/media/uploads/2021/09/03/node_params_423bf1d6-c6c4-4610-932f-e8c6710f9384.pt
Downloading model params after training on  client_5c0bb3c3-adb2-4fe8-9ed1-00c64110c8e5 
	- from http://localhost:8844/media/uploads/2021/09/03/node_params_110bd5f7-e7f9-4124-a85c-ebf21a8d643d.pt
Clients that successfully reply in round  2   ['client_933

Local training results for each round and each node are available in `exp.training_replies` (index 0 to (`rounds` - 1) ).

For example you can view the training results for the last round below.

Different timings (in seconds) are reported for each dataset of a node participating in a round :
- `rtime_training` real time (clock time) spent in the training function on the node
- `ptime_training` process time (user and system CPU) spent in the training function on the node
- `rtime_total` real time (clock time) spent in the researcher between sending the request and handling the response, at the `Job()` layer

In [7]:
print("\nList the training rounds : ", exp.training_replies.keys())

print("\nList the clients for the last training round and their timings : ")
round_data = exp.training_replies[rounds - 1].data
for c in range(len(round_data)):
    print("\t- {id} :\
    \n\t\trtime_training={rtraining:.2f} seconds\
    \n\t\tptime_training={ptraining:.2f} seconds\
    \n\t\trtime_total={rtotal:.2f} seconds".format(id = round_data[c]['client_id'],
        rtraining = round_data[c]['timing']['rtime_training'],
        ptraining = round_data[c]['timing']['ptime_training'],
        rtotal = round_data[c]['timing']['rtime_total']))
print('\n')
    
exp.training_replies[rounds - 1].dataframe


List the training rounds :  dict_keys([0, 1, 2, 3, 4])

List the clients for the last training round and their timings : 
	- client_5c0bb3c3-adb2-4fe8-9ed1-00c64110c8e5 :    
		rtime_training=8.52 seconds    
		ptime_training=8.57 seconds    
		rtime_total=25.03 seconds
	- client_9335c64d-df9b-4a9e-b9c7-f262369cb07a :    
		rtime_training=9.60 seconds    
		ptime_training=9.66 seconds    
		rtime_total=26.04 seconds




Unnamed: 0,success,msg,dataset_id,client_id,params_path,params,timing
0,True,,dataset_87a505c6-48a6-40a4-bdf6-d7a178daf7f7,client_5c0bb3c3-adb2-4fe8-9ed1-00c64110c8e5,/home/fedbiomed/var/tmp/my_model_e37770e1-e111...,"{'fc1.weight': [[tensor(-0.1343), tensor(0.021...","{'rtime_training': 8.515194999999949, 'ptime_t..."
1,True,,dataset_ec752b95-24fc-4106-bacd-70dc5b665b4a,client_9335c64d-df9b-4a9e-b9c7-f262369cb07a,/home/fedbiomed/var/tmp/my_model_4fb6bbc3-b413...,"{'fc1.weight': [[tensor(-0.1343), tensor(0.021...","{'rtime_training': 9.602519799998845, 'ptime_t..."


Federated parameters for each round are available in `exp.aggregated_params` (index 0 to (`rounds` - 1) ).

For example you can view the federated parameters for the last round of the experiment :

In [8]:
print("\nList the training rounds : ", exp.aggregated_params.keys())

print("\nAccess the federated params for the last training round :")
print("\t- params_path: ", exp.aggregated_params[rounds - 1]['params_path'])
print("\t- parameter data: ", exp.aggregated_params[rounds - 1]['params'].keys())



List the training rounds :  dict_keys([0, 1, 2, 3, 4])

Access the federated params for the last training round :
	- params_path:  /home/fedbiomed/var/tmp/researcher_params_e2687e7e-375c-471b-9d3a-21d6e6d978bd.pt
	- parameter data:  odict_keys(['fc1.weight', 'fc1.bias', 'fc2.weight', 'fc2.bias', 'fc3.weight', 'fc3.bias', 'fc4.weight', 'fc4.bias', 'fc5.weight', 'fc5.bias', 'fc6.weight', 'fc6.bias'])


# Test Function

In [9]:
fed_model = exp.model_instance
fed_model.load_state_dict(exp.aggregated_params[rounds - 1]['params'])

<All keys matched successfully>

In [10]:
 fed_model

MyTrainingPlan(
  (fc1): Linear(in_features=6, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=16, bias=True)
  (fc4): Linear(in_features=16, out_features=8, bias=True)
  (fc5): Linear(in_features=8, out_features=4, bias=True)
  (fc6): Linear(in_features=4, out_features=1, bias=True)
)

In [23]:
# Hold one file for testing the fed model
test_dataset_path ="__PATH to Test File__"

In [24]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd

def cal_rmse(actual, prediction):
    return ((actual- prediction)**2).mean()**0.5

def testing_rmse(model, data_loader):
    model.eval()
    test_loss = 0
    correct = 0
    device = 'cpu'
    preds = []
    with torch.no_grad():
        for data, target in data_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            preds.append(output.numpy().flatten())
    rmse = cal_rmse(data_loader.dataset.Y_train.numpy(),np.hstack(preds))
    return rmse

In [25]:
class csv_Dataset(Dataset):
        def __init__(self, dataset_path):
            self.input_file = pd.read_csv(dataset_path,sep=',',index_col=False)
            x_train = self.input_file.loc[:,('year','transmission','mileage','tax','mpg','engineSize')].values
            y_train = self.input_file.loc[:,'price'].values
            self.X_train = torch.from_numpy(x_train).float()
            self.Y_train = torch.from_numpy(y_train).float()

        def __len__(self):            
            return len(self.Y_train)

        def __getitem__(self, idx):

            return (self.X_train[idx], self.Y_train[idx])

In [26]:
dataset = csv_Dataset(test_dataset_path)
train_kwargs = {'batch_size': 64, 'shuffle': True}
data_loader = DataLoader(dataset, **train_kwargs)

In [27]:
rmse = testing_rmse(fed_model, data_loader)

In [28]:
rmse

13138.38589781865