# Data Preprocessing : Download Used Cars Dataset 
https://www.kaggle.com/adityadesai13/used-car-dataset-ford-and-mercedes

In [1]:
import pandas as pd

# Dataset Details
The data consists of used cars listings. 100,000 listings, which have been separated into files corresponding to each car manufacturer. Each file will simulate data for each node.

# Goal

The goal of this tutorial is to build a federated regression model on Non-IID dataset and generate the best model by performing validation on hold out dataset and tuning hyperparameters.The metric used to decide best model is RMSE.

In [None]:
audi = pd.read_csv("___PATH to audi.csv___")
bmw = pd.read_csv("___PATH to bmw.csv___")
# Use Ford for final validation at central researcher
ford = pd.read_csv("___PATH to ford.csv___")

# Use the following csvs if you want to run more than 2 nodes. Uncomment Corresponding lines in the following cell blocks
# cclass = pd.read_csv("___PATH to cclass.csv___")
# focus = pd.read_csv("___PATH to focus.csv___")
# hyundai = pd.read_csv("___PATH to huyndai.csv___")
# merc = pd.read_csv("___PATH to merc.csv___")
# skoda = pd.read_csv("___PATH to skoda.csv___")
# toyata = pd.read_csv("___PATH to toyata.csv___")
# vauxhall = pd.read_csv("___PATH to vauxhall.csv___")
# vw = pd.read_csv("__PATH to vw.csv___")

Drop columns model & fuelType as labels are not consistent across files. A better solution could be vertical federated learning

In [None]:
audi.drop(columns = ['model','fuelType'],inplace = True)
bmw.drop(columns = ['model','fuelType'],inplace = True)
ford.drop(columns = ['model','fuelType'],inplace = True)

# cclass.drop(columns = ['model','fuelType'],inplace = True)
# focus.drop(columns = ['model','fuelType'],inplace = True)
# hyundai.drop(columns = ['model','fuelType'],inplace = True)
# merc.drop(columns = ['model','fuelType'],inplace = True)
# skoda.drop(columns = ['model','fuelType'],inplace = True)
# toyata.drop(columns = ['model','fuelType'],inplace = True)
# vauxhall.drop(columns = ['model','fuelType'],inplace = True)
# vw.drop(columns = ['model','fuelType'],inplace = True)

Label encode transmission column

In [None]:
audi['transmission'] = audi['transmission'].map({'Automatic':0,'Manual':1,'Semi-Auto':2,'Other':3})
bmw['transmission'] = bmw['transmission'].map({'Automatic':0,'Manual':1,'Semi-Auto':2,'Other':3})
ford['transmission'] = ford['transmission'].map({'Automatic':0,'Manual':1,'Semi-Auto':2,'Other':3})

# cclass['transmission'] = cclass['transmission'].map({'Automatic':0,'Manual':1,'Semi-Auto':2,'Other':3})
# focus['transmission'] = focus['transmission'].map({'Automatic':0,'Manual':1,'Semi-Auto':2,'Other':3})
# hyundai['transmission'] = hyundai['transmission'].map({'Automatic':0,'Manual':1,'Semi-Auto':2,'Other':3})
# merc['transmission'] = merc['transmission'].map({'Automatic':0,'Manual':1,'Semi-Auto':2,'Other':3})
# skoda['transmission'] = skoda['transmission'].map({'Automatic':0,'Manual':1,'Semi-Auto':2,'Other':3})
# toyata['transmission'] = toyata['transmission'].map({'Automatic':0,'Manual':1,'Semi-Auto':2,'Other':3})
# vauxhall['transmission'] = vauxhall['transmission'].map({'Automatic':0,'Manual':1,'Semi-Auto':2,'Other':3})
# vw['transmission'] = vw['transmission'].map({'Automatic':0,'Manual':1,'Semi-Auto':2,'Other':3})

In [None]:
audi.to_csv('audi_transformed.csv',header = True,index= False)
bmw.to_csv('bmw_transformed.csv',header = True,index= False)
ford.to_csv('ford_transformed.csv',header = True,index= False)

# cclass.to_csv('cclass_transformed.csv',header = True,index= False)
# focus.to_csv('focus_transformed.csv',header = True,index= False)
# hyundai.to_csv('huydai_transformed.csv',header = True,index= False)
# merc.to_csv('merc_transformed.csv',header = True,index= False)
# skoda.to_csv('skoda_transformed.csv',header = True,index= False)
# toyata.to_csv('toyata_transformed.csv',header = True,index= False)
# vauxhall.to_csv('vaxhall_transformed.csv',header = True,index= False)
# vw.to_csv('vw_transformed.csv',header = True,index= False)

# Fedbiomed Researcher to train a model on a Used Cars dataset

Use for developing (autoreloads changes made across packages)

In [2]:
%load_ext autoreload
%autoreload 2

## Start the network and setting the client up
Before running this notebook, you shoud start the network from fedbiomed-network, as detailed in https://gitlab.inria.fr/fedbiomed/fedbiomed-network
Therefore, it is necessary to previously configure a node:
Also ensure that you have run data-preprocessing-used-cars-dataset.ipynb notebook to preprocess the used cars dataset
1. `./scripts/fedbiomed_run node add`
  * Select option 1 to add a csv file to the client
  * Choose the name, tags and description of the dataset
  * Spin as many nodes as you want(max nodes 11 for 11 csv files in used cars dataset). Hold out one file for testing.
  * Load the .csv file generated using above mentioned notebook to individual nodes
2. Check that your data has been added by executing `./scripts/fedbiomed_run node list`
3. Run the node using `./scripts/fedbiomed_run node start`. Wait until you get `Connected with result code 0`. it means you are online.

## Create an experiment to train a model on the data found

Declare a torch.nn MyTrainingPlan class to send for training on the node

In [3]:
from fedbiomed.researcher.environ import TMP_DIR
import tempfile
tmp_dir_model = tempfile.TemporaryDirectory(dir=TMP_DIR+'/')
model_file = tmp_dir_model.name + '/class_export_csv.py'

Note : write **only** the code to export in the following cell

In [4]:
%%writefile "$model_file"

import torch
import torch.nn as nn
from fedbiomed.common.torchnn import TorchTrainingPlan
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd

# Here we define the model to be used. 
# You can use any class name (here 'MyTrainingPlan')
class MyTrainingPlan(TorchTrainingPlan):       
    def __init__(self, kwargs):
        super(MyTrainingPlan, self).__init__()
        # kwargs should match the model arguments to be passed below to the experiment class
        self.in_features = kwargs['in_features']
        self.out_features = kwargs['out_features']
        self.fc1 = nn.Linear(self.in_features, 5)
        self.fc2 = nn.Linear(5, self.out_features)
        
        # Here we define the custom dependencies that will be needed by our custom Dataloader
        # In this case, we need the torch Dataset and DataLoader classes
        # We need pandas to read the local .csv file at the client side
        deps = ["from torch.utils.data import Dataset, DataLoader",
                "import pandas as pd"]
        self.add_dependency(deps)

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        return x

    def training_step(self, data, target):
        output = self.forward(data).float()
        criterion = torch.nn.MSELoss()
        loss   = torch.sqrt(criterion(output, target.unsqueeze(1)))
        return loss

    class csv_Dataset(Dataset):
    # Here we define a custom Dataset class inherited from the general torch Dataset class
    # This class takes as argument a .csv file path and creates a torch Dataset 
        def __init__(self, dataset_path, x_dim):
            self.input_file = pd.read_csv(dataset_path,sep=',',index_col=False)
            x_train = self.input_file.loc[:,('year','transmission','mileage','tax','mpg','engineSize')].values
            y_train = self.input_file.loc[:,'price'].values
            self.X_train = torch.from_numpy(x_train).float()
            self.Y_train = torch.from_numpy(y_train).float()

        def __len__(self):            
            return len(self.Y_train)

        def __getitem__(self, idx):

            return (self.X_train[idx], self.Y_train[idx])
        
    def training_data(self,  batch_size = 48):
    # The training_data creates the Dataloader to be used for training in the general class TorchTrainingPlan of fedbiomed
        dataset = self.csv_Dataset(self.dataset_path, self.in_features)
        train_kwargs = {'batch_size': batch_size, 'shuffle': True}
        data_loader = DataLoader(dataset, **train_kwargs)
        return data_loader

Writing /home/fedbiomed/fedbiomed/var/tmp/tmplcbohsuc/class_export_csv.py


In [5]:
# model parameters 
model_args = {
    'in_features': 6, 
    'out_features': 1
}

# training parameters 
training_args = {
    'batch_size': 40, 
    'lr': 1e-3, 
    'epochs': 2, 
    'dry_run': False,  
    #'batch_maxnum': 100 # Fast pass for development : only use ( batch_maxnum * batch_size ) samples
}

Define an experiment
- search nodes serving data for these `tags`, optionally filter on a list of client ID with `clients`
- run a round of local training on nodes with model defined in `model_path` + federation with `aggregator`
- run for `rounds` rounds, applying the `client_selection_strategy` between the rounds

In [6]:
from fedbiomed.researcher.experiment import Experiment
from fedbiomed.researcher.aggregators.fedavg import FedAverage

# Calling the training data with specified tags. Change the following tag accordingly
tags =  ['UsedCars']
rounds = 3

exp = Experiment(tags=tags,
                 #clients=None,
                 model_path=model_file,
                 model_class='MyTrainingPlan',
                 model_args=model_args,
                 training_args=training_args,
                 rounds=rounds,
                 aggregator=FedAverage(),
                 client_selection_strategy=None)

Messaging 6309fcbb-bae7-4c2a-adb9-631e5b9db5b4  connected with result code 0
Searching for clients with data tags: ['UsedCars'] ...
2021-09-13 16:57:12.241643 [ RESEARCHER ] message received. {'researcher_id': 'researcher_702f019d-9c48-47c8-8811-14cb5d5560db', 'success': True, 'databases': [{'name': 'UsedCars', 'data_type': 'csv', 'tags': ['UsedCars'], 'description': 'UsedCars', 'shape': [10781, 6], 'dataset_id': 'dataset_3843dd0a-ea4e-4281-b955-e68e71065448'}], 'count': 1, 'node_id': 'client_293f2b07-c5c0-4ac1-9ea2-28fe09fd9f67', 'command': 'search'}
2021-09-13 16:57:12.284651 [ RESEARCHER ] message received. {'researcher_id': 'researcher_702f019d-9c48-47c8-8811-14cb5d5560db', 'success': True, 'databases': [{'name': 'UsedCars', 'data_type': 'csv', 'tags': ['UsedCars'], 'description': 'UsedCars', 'shape': [10668, 6], 'dataset_id': 'dataset_92d53d50-548d-4fc6-a98d-5ca7233a2c8b'}], 'count': 1, 'node_id': 'client_41d55762-b0d3-4050-9ee1-07537fcf7abe', 'command': 'search'}


Let's start the experiment.

By default, this function doesn't stop until all the `rounds` are done for all the clients

In [7]:
exp.run()

Sampled clients in round  0   ['client_293f2b07-c5c0-4ac1-9ea2-28fe09fd9f67', 'client_41d55762-b0d3-4050-9ee1-07537fcf7abe']
[ RESEARCHER ] Send message to client  client_293f2b07-c5c0-4ac1-9ea2-28fe09fd9f67 {'researcher_id': 'researcher_702f019d-9c48-47c8-8811-14cb5d5560db', 'job_id': '40227b54-d516-46ce-b94f-77166035feaf', 'training_args': {'batch_size': 40, 'lr': 0.001, 'epochs': 2, 'dry_run': False}, 'model_args': {'in_features': 6, 'out_features': 1}, 'command': 'train', 'model_url': 'http://localhost:8844/media/uploads/2021/09/13/my_model_cac1f991-2efd-4780-8878-d0d70e2ac890.py', 'params_url': 'http://localhost:8844/media/uploads/2021/09/13/my_model_4e8c25ac-9f49-4027-b5f6-798d2afd0394.pt', 'model_class': 'MyTrainingPlan', 'training_data': {'client_293f2b07-c5c0-4ac1-9ea2-28fe09fd9f67': ['dataset_3843dd0a-ea4e-4281-b955-e68e71065448']}}
researcher_702f019d-9c48-47c8-8811-14cb5d5560db
[ RESEARCHER ] Send message to client  client_41d55762-b0d3-4050-9ee1-07537fcf7abe {'researcher_i

2021-09-13 16:57:45.916037 [ RESEARCHER ] message received. {'researcher_id': 'researcher_702f019d-9c48-47c8-8811-14cb5d5560db', 'job_id': '40227b54-d516-46ce-b94f-77166035feaf', 'success': True, 'node_id': 'client_41d55762-b0d3-4050-9ee1-07537fcf7abe', 'dataset_id': 'dataset_92d53d50-548d-4fc6-a98d-5ca7233a2c8b', 'params_url': 'http://localhost:8844/media/uploads/2021/09/13/node_params_f852c488-f912-43bb-bf2f-688516f9e572.pt', 'timing': {'rtime_training': 0.7563033000333235, 'ptime_training': 0.7666934000000083}, 'msg': '', 'command': 'train'}
2021-09-13 16:57:45.986156 [ RESEARCHER ] message received. {'researcher_id': 'researcher_702f019d-9c48-47c8-8811-14cb5d5560db', 'job_id': '40227b54-d516-46ce-b94f-77166035feaf', 'success': True, 'node_id': 'client_293f2b07-c5c0-4ac1-9ea2-28fe09fd9f67', 'dataset_id': 'dataset_3843dd0a-ea4e-4281-b955-e68e71065448', 'params_url': 'http://localhost:8844/media/uploads/2021/09/13/node_params_3fdb8a71-4f9a-49c5-9c4c-ae2f398b5a67.pt', 'timing': {'rtime

Local training results for each round and each node are available in `exp.training_replies` (index 0 to (`rounds` - 1) ).

For example you can view the training results for the last round below.

Different timings (in seconds) are reported for each dataset of a node participating in a round :
- `rtime_training` real time (clock time) spent in the training function on the node
- `ptime_training` process time (user and system CPU) spent in the training function on the node
- `rtime_total` real time (clock time) spent in the researcher between sending the request and handling the response, at the `Job()` layer

In [8]:
print("\nList the training rounds : ", exp.training_replies.keys())

print("\nList the clients for the last training round and their timings : ")
round_data = exp.training_replies[rounds - 1].data
for c in range(len(round_data)):
    print("\t- {id} :\
    \n\t\trtime_training={rtraining:.2f} seconds\
    \n\t\tptime_training={ptraining:.2f} seconds\
    \n\t\trtime_total={rtotal:.2f} seconds".format(id = round_data[c]['node_id'],
        rtraining = round_data[c]['timing']['rtime_training'],
        ptraining = round_data[c]['timing']['ptime_training'],
        rtotal = round_data[c]['timing']['rtime_total']))
print('\n')
    
exp.training_replies[rounds - 1].dataframe


List the training rounds :  dict_keys([0, 1, 2])

List the clients for the last training round and their timings : 
	- client_41d55762-b0d3-4050-9ee1-07537fcf7abe :    
		rtime_training=0.76 seconds    
		ptime_training=0.77 seconds    
		rtime_total=10.01 seconds
	- client_293f2b07-c5c0-4ac1-9ea2-28fe09fd9f67 :    
		rtime_training=0.79 seconds    
		ptime_training=0.80 seconds    
		rtime_total=10.03 seconds




Unnamed: 0,success,msg,dataset_id,node_id,params_path,params,timing
0,True,,dataset_92d53d50-548d-4fc6-a98d-5ca7233a2c8b,client_41d55762-b0d3-4050-9ee1-07537fcf7abe,/home/fedbiomed/fedbiomed/var/tmp/my_model_af3...,"{'fc1.weight': [[tensor(2.2414), tensor(1.4931...","{'rtime_training': 0.7563033000333235, 'ptime_..."
1,True,,dataset_3843dd0a-ea4e-4281-b955-e68e71065448,client_293f2b07-c5c0-4ac1-9ea2-28fe09fd9f67,/home/fedbiomed/fedbiomed/var/tmp/my_model_367...,"{'fc1.weight': [[tensor(2.2512), tensor(1.5125...","{'rtime_training': 0.7873878999962471, 'ptime_..."


Federated parameters for each round are available in `exp.aggregated_params` (index 0 to (`rounds` - 1) ).

For example you can view the federated parameters for the last round of the experiment :

In [9]:
print("\nList the training rounds : ", exp.aggregated_params.keys())

print("\nAccess the federated params for the last training round :")
print("\t- params_path: ", exp.aggregated_params[rounds - 1]['params_path'])
print("\t- parameter data: ", exp.aggregated_params[rounds - 1]['params'].keys())



List the training rounds :  dict_keys([0, 1, 2])

Access the federated params for the last training round :
	- params_path:  /home/fedbiomed/fedbiomed/var/tmp/researcher_params_b06b09d0-e859-4579-861e-4273f33ca468.pt
	- parameter data:  odict_keys(['fc1.weight', 'fc1.bias', 'fc2.weight', 'fc2.bias'])


# Test Function

In [19]:
fed_model = exp.model_instance
fed_model.load_state_dict(exp.aggregated_params[rounds - 1]['params'])

<All keys matched successfully>

In [20]:
 fed_model

MyTrainingPlan(
  (fc1): Linear(in_features=6, out_features=5, bias=True)
  (fc2): Linear(in_features=5, out_features=1, bias=True)
)

In [25]:
# Hold one file for testing the fed model
test_dataset_path ="__PATH to ford_transformed.csv___"

In [26]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd

def cal_rmse(actual, prediction):
    return ((actual- prediction)**2).mean()**0.5

def testing_rmse(model, data_loader):
    model.eval()
    test_loss = 0
    correct = 0
    device = 'cpu'
    preds = []
    with torch.no_grad():
        for data, target in data_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            preds.append(output.numpy().flatten())
    rmse = cal_rmse(data_loader.dataset.Y_train.numpy(),np.hstack(preds))
    return rmse

In [27]:
class csv_Dataset(Dataset):
        def __init__(self, dataset_path):
            self.input_file = pd.read_csv(dataset_path,sep=',',index_col=False)
            x_train = self.input_file.loc[:,('year','transmission','mileage','tax','mpg','engineSize')].values
            y_train = self.input_file.loc[:,'price'].values
            self.X_train = torch.from_numpy(x_train).float()
            self.Y_train = torch.from_numpy(y_train).float()

        def __len__(self):            
            return len(self.Y_train)

        def __getitem__(self, idx):

            return (self.X_train[idx], self.Y_train[idx])

In [28]:
dataset = csv_Dataset(test_dataset_path)
train_kwargs = {'batch_size': 64, 'shuffle': True}
data_loader = DataLoader(dataset, **train_kwargs)

In [29]:
rmse = testing_rmse(fed_model, data_loader)

In [30]:
rmse

7471.202045186571