# Fedbiomed Researcher to train a federated PPCA (Probabilistic PCA) model.

## Description of the exercise :

Three datasets `n1.csv`, `n2.csv` and `n3.csv` will be randomly generated using a 3-view PPCA model with a 4-dimensional latent space, view dimensions [15, 8, 10] and 2 groups. We will then distribute the 3 datasets to 3 distinct nodes and use Fed-mv-PPCA. In each center we check the evolution of the expected log-likelihood (LL) during training.

## Data Generation

We will generate three datasets using mv-PPCA.
Then save them in a path of your choice on your machine.

In [3]:
import numpy as np
import pandas as pd
from typing import List, Union, Dict

def sample_x_n(N:int, q:int, random_state:int=None):
    """Draw a matrix of independent standard Gaussian samples.

    Args:
        N: number of rows of the returned array.
        q: number of columns of the returned array.
        random_state: seed given to `np.random.RandomState`; with None the
            generator is seeded by NumPy itself.

    Returns:
        np.ndarray of shape (N, q).
    """
    generator = np.random.RandomState(random_state)
    samples = generator.randn(N, q)
    return samples

def generate_data(N_g: List[int],
                  W: np.ndarray,
                  a_g: np.ndarray,
                  mu:float,
                  sigma2:float,
                  x_n,
                  view:int,
                  random_state=None):
    """Generate a Gaussian dataset for one view, made of several groups of samples.

    For each group g the rows are produced with the generative process

        y = W (x + a_g[g]) + mu + epsilon,   epsilon ~ N(0, sigma2)

    where x is the latent variable of the sample.

    Params:
    :N_g: number of samples to generate per group (list or 1-D array, one
        entry per group)
    :W: reconstruction matrix, of size (n_features, q)
    :a_g: array indexed by group; a_g[g] is the shift added to the latent
        variables of group g (must broadcast against rows of size q)
    :mu: offset added to every sample (scalar, or anything broadcastable
        against a row of n_features values)
    :sigma2: variance of the additive Gaussian noise
    :x_n: latent variables, indexable as an array of shape (n_samples, q)
    :view: (int) id of the view, only used to build the column names
    :random_state: random seed of the noise, for reproducibility

    Returns:
    :Y (pd.DataFrame): synthetic dataset of size (n_samples, n_features),
        with columns named 'var_<view>,<feature index starting at 1>'
    """
    rnd = np.random.RandomState(random_state)

    # The annotation allows a plain list, so convert before using array methods.
    N_g = np.asarray(N_g)
    N = N_g.sum()
    d, _ = W.shape
    sigma = np.sqrt(sigma2)
    G = len(N_g)

    # g_ind[g]:g_ind[g+1] is the row range occupied by group g
    g_ind = np.concatenate((np.zeros(1, dtype=np.int64), np.cumsum(N_g)))
    print(g_ind)

    y_n = np.empty((N, d))

    for g in range(G):
        rows = slice(g_ind[g], g_ind[g + 1])
        # noise-free part: W (x + shift of the group) + mu
        y_n[rows] = np.einsum("dq,nq->nd", W, x_n[rows] + a_g[g]) + mu

    y_n = pd.DataFrame(data=y_n,
                       columns=[f'var_{view},{i + 1}' for i in range(d)])

    # add the isotropic Gaussian noise of standard deviation sqrt(sigma2)
    return y_n + sigma * rnd.randn(N, d)



def generate_ppca_nodes_dataset(n_nodes: int,
                                n_features: Union[List[int], int],
                                n_components: int,
                                n_group: int=2,
                                absent_view: Dict[str, int]=None,
                                W_init: List[np.ndarray]=None,
                                mu_init: List[np.ndarray]=None,
                                sigma_init : List[np.ndarray]=None,
                                is_validation: bool=True,
                                n_sample_validation: int=None):
    """
    Prepare the PPCA parameters used to generate a synthetic dataset for each node.

    Missing parameters are drawn at random: one W of shape
    (n_features[i], n_components) and one mu of size n_features[i] per node
    (uniform in [-10, 10]), sigma set to 1 for every node, and one latent shift
    per group (the first group has a zero shift).

    NOTE(review): this function only builds the parameters and currently returns
    None; `absent_view`, `is_validation` and `n_sample_validation` are not used
    yet. The dataset generation itself still has to be written.

    :n_nodes: number of nodes
    :n_features: number of features per node, or a single int used for all nodes
    :n_components: dimension of the latent space
    :n_group: number of groups of samples
    """
    # a single int means "same number of features for every node"
    if isinstance(n_features, int):
        n_features = [n_features] * n_nodes

    # generate PPCA parameters if not defined
    ## case where W parameter is not defined
    if W_init is None:
        W_init = [np.random.uniform(-10, 10, (n_features[i], n_components))
                  for i in range(n_nodes)]
    ## case where mu is not defined
    if mu_init is None:
        mu_init = [np.random.uniform(-10, 10, n_features[i])
                   for i in range(n_nodes)]

    ## case where sigma is not defined (we set sigma = 1 for each node)
    if sigma_init is None:
        sigma_init = [1] * n_nodes

    # use the function's own `n_group` argument (not a notebook-level global)
    shift = np.concatenate((np.zeros((1, n_components)),
                            np.random.uniform(-10, 10, (n_group - 1, n_components))))

In [4]:
from typing import Dict, Union

def create_multi_view_dataframe(datasets: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    """Concatenate view-specific dataframes column-wise under a two-level header.

    The resulting columns are a `pd.MultiIndex` whose first level ('views') is
    the key of each entry in `datasets` and whose second level ('feature_name')
    is the original column name inside that view.

    :param datasets: mapping view name -> dataframe; all dataframes must have
        the same number of rows
    :return: dataframe holding the values of all views side by side
    :raise ValueError: if the views cannot be concatenated (e.g. different
        number of samples)
    """
    _header_labels = ['views', 'feature_name']
    # 1. create multiindex header

    _feature_name_array = np.array([])  # store all feature names
    _view_name_array = []  # store all views (ie modalities) names

    # None means "no view processed yet"; testing the length instead would
    # wrongly treat a view with zero rows as "nothing seen so far"
    _concatenated_datasets = None  # store dataframe values

    for key, _view_df in datasets.items():
        _feature_name_array = np.concatenate([_feature_name_array,
                                              _view_df.columns.values])
        if _concatenated_datasets is None:
            # first pass
            _concatenated_datasets = _view_df.values
        else:
            try:
                _concatenated_datasets = np.concatenate([_concatenated_datasets,
                                                         _view_df.to_numpy()],
                                                        axis=1)
            except ValueError as val_err:
                # catching case where nb_samples are different
                raise ValueError('Cannot create multi view dataset: different number of samples for each modality have been detected') from val_err
        # one view label per feature of this view
        _view_name_array.extend([key] * len(_view_df.columns))
        print(_feature_name_array)

    if _concatenated_datasets is None:
        # no view at all: keep the empty array the function used to start with
        _concatenated_datasets = np.array([])

    _header = pd.MultiIndex.from_arrays([_view_name_array, _feature_name_array],
                                        names=_header_labels)

    print(_concatenated_datasets)

    # 2. create multi index dataframe
    multi_view_df = pd.DataFrame(_concatenated_datasets,
                                 columns=_header)
    return multi_view_df


def save_multi_view_dataframe(dataframe: pd.DataFrame, file_name: str):
    """Write `dataframe` (index and header included) as CSV to `file_name`."""
    dataframe.to_csv(path_or_buf=file_name)
    
def load_multi_view_dataframe(file_name: str) -> pd.DataFrame:
    """Read a CSV whose first two rows are the column header and whose first column is the index."""
    return pd.read_csv(file_name, sep=',', header=[0, 1], index_col=0)




In [5]:
# Seed NumPy's global generator so that the parameters drawn below are reproducible.
np.random.seed(100)

D_i = [15, 8, 10]  # number of features of each view
nb_group = 2  # number of groups of samples
n_centers = 3  # number of centers (nodes) for which a dataset is generated
testing_samples = 40  # total number of samples of the test dataset

n_components_generated = 4  # dimension of the latent space used for generation


# initializing PPCA variables (one noise variance, W and mu per view)
sigma2_gen1, sigma2_gen2, sigma2_gen3 = 2, 1, 3
W_gen1 = np.random.uniform(-10, 10, (D_i[0], n_components_generated))
W_gen2 = np.random.uniform(-5, 5, (D_i[1], n_components_generated))
W_gen3 = np.random.uniform(-15, 15, (D_i[2], n_components_generated))
mu_gen1 = np.random.uniform(-10, 10, D_i[0])
mu_gen2 = np.random.uniform(-5, 5, D_i[1])
mu_gen3 = np.random.uniform(-15, 15, D_i[2])

# latent shift of each group: zero for the first group, random for the others
a_g_gen = np.concatenate((np.zeros((1, n_components_generated)),
                          np.random.uniform(-10, 10, (nb_group - 1, n_components_generated))))

W = [W_gen1,W_gen2,W_gen3]
mu = [mu_gen1,mu_gen2,mu_gen3]
# note: `sigma` holds the variances (sigma2); generate_data takes their square root
sigma = [sigma2_gen1,sigma2_gen2,sigma2_gen3]

# absent_views contains as key the id of a center in which we want to simulate absent views,
# and as value the id of the missing view.
absent_views = {'2': 2}

for i in range(n_centers):
    # number of samples of each group in this center
    N_g = np.array([np.random.randint(25,300) for _ in range(nb_group)]) 
    N = N_g.sum()
    # latent variables of this center
    # NOTE(review): the seed (150) is the same for every center and for the test
    # set below, and the noise seed (250) is the same for every view - confirm
    # this is intended.
    x_n_gen = sample_x_n(N, n_components_generated, random_state=150)  # randomly generate a Gaussian
    # Y maps 'view_<id>' to the dataframe of that view for the current center
    Y = {}
    for d in range(len(D_i)):
        y_t = generate_data(N_g, W[d], a_g_gen, mu[d], sigma[d], x_n_gen, view = d+1, random_state=250)
        # view d+1 is declared absent for center i+1: keep the generated data in
        # absent_views (which replaces the int view id) and store a NaN-filled
        # dataframe with the same columns instead
        if ((str(i+1) in absent_views.keys()) \
            and (type(absent_views[str(i+1)])== int) \
            and (absent_views[str(i+1)]==d+1)):
            absent_views.update({str(i+1): y_t})
            # the `i` of the comprehension below is local to it and does not
            # modify the center index `i` of the outer loop
            y_abs=pd.DataFrame(np.nan, index = np.arange(N_g.sum()), \
                               columns = [f'var_{d+1},{i + 1}' for i in range(D_i[d])])
            Y['view_' + str(d+1)] = y_abs
        else:
            Y['view_' + str(d+1)] = y_t

    # group label of every sample, stored as an extra one-column "view"
    gr = []
    for g in range(nb_group):
       gr += [int(g) for _ in range(N_g[g])]
    gr = pd.Series(gr)
    Y['Label'] = pd.DataFrame(gr, columns=['Labels'])

    # build the multi-view dataframe of this center (views side by side, labels
    # as last column) and save it as CSV; the path prefix is a placeholder to
    # replace with a local path
    t_i = create_multi_view_dataframe(Y)
    t_i.to_csv('== Local path to node' + str(i+1) + '.csv',sep=',')
               
# building the test dataset (two groups of testing_samples//2 samples each)
N_g_test = np.array([testing_samples//2,testing_samples//2])
g_ind_test = np.concatenate((np.zeros(1, dtype=np.int64), np.cumsum(N_g_test)))
N_test = N_g_test.sum()
x_n_gen = sample_x_n(N_test, n_components_generated, random_state=150)
Y_test = []
for d in range(len(D_i)):
    y_t = generate_data(N_g_test, W[d], a_g_gen, mu[d], sigma[d], x_n_gen, view = d+1, random_state=250)
    Y_test.append(y_t)

# labels of the test samples: first group 0, then group 1
gr_test = [0 for _ in range(N_g_test[0])]+[1 for _ in range(N_g_test[1])]
gr_test = pd.Series(gr_test)
Y_test.append(gr_test)

# test dataframe with flat column names: all views side by side, labels last
t_test = pd.concat(Y_test, axis=1)

[  0 148 215]
[  0 148 215]
[  0 148 215]
['var_1,1' 'var_1,2' 'var_1,3' 'var_1,4' 'var_1,5' 'var_1,6' 'var_1,7'
 'var_1,8' 'var_1,9' 'var_1,10' 'var_1,11' 'var_1,12' 'var_1,13'
 'var_1,14' 'var_1,15']
['var_1,1' 'var_1,2' 'var_1,3' 'var_1,4' 'var_1,5' 'var_1,6' 'var_1,7'
 'var_1,8' 'var_1,9' 'var_1,10' 'var_1,11' 'var_1,12' 'var_1,13'
 'var_1,14' 'var_1,15' 'var_2,1' 'var_2,2' 'var_2,3' 'var_2,4' 'var_2,5'
 'var_2,6' 'var_2,7' 'var_2,8']
['var_1,1' 'var_1,2' 'var_1,3' 'var_1,4' 'var_1,5' 'var_1,6' 'var_1,7'
 'var_1,8' 'var_1,9' 'var_1,10' 'var_1,11' 'var_1,12' 'var_1,13'
 'var_1,14' 'var_1,15' 'var_2,1' 'var_2,2' 'var_2,3' 'var_2,4' 'var_2,5'
 'var_2,6' 'var_2,7' 'var_2,8' 'var_3,1' 'var_3,2' 'var_3,3' 'var_3,4'
 'var_3,5' 'var_3,6' 'var_3,7' 'var_3,8' 'var_3,9' 'var_3,10']
['var_1,1' 'var_1,2' 'var_1,3' 'var_1,4' 'var_1,5' 'var_1,6' 'var_1,7'
 'var_1,8' 'var_1,9' 'var_1,10' 'var_1,11' 'var_1,12' 'var_1,13'
 'var_1,14' 'var_1,15' 'var_2,1' 'var_2,2' 'var_2,3' 'var_2,4' 'var_2,5'
 'var

In [None]:

load_multi_view_dataframe('== Local path to node1.csv')

## Start the network and setting the client up
Before running this notebook:
1. You should start the network from fedbiomed-network, as detailed in :
https://gitlab.inria.fr/fedbiomed/fedbiomed

2. You need to configure at least 2 nodes: <br/>
* **Node 1 :** `./scripts/fedbiomed_run node add`
  * Select option 1 to add a csv file to the node
  * Choose the name, tags and description of the dataset. The tag must be `ppca_data`, since this is the tag the experiment below searches for.
  * Pick the .csv file generated for node 1 in the Data Generation section.
  * Check that your data has been added in node 1 by executing `./scripts/fedbiomed_run node list`
  * Run the node using `./scripts/fedbiomed_run node start`. <br/>

* **Node 2 :** Open a second terminal and run `./scripts/fedbiomed_run node add config n2.ini`
  * Select option 1 to add a csv file to the node
  * Choose the name, tags and description of the dataset. The tag must again be `ppca_data`.
  * Pick the .csv file generated for node 2 in the Data Generation section.
  * Check that your data has been added in node 2 by executing `./scripts/fedbiomed_run node config n2.ini list`
  * Run the node using `./scripts/fedbiomed_run node config n2.ini start`.
  


 Wait until you get `Connected with result code 0`. It means the node is online.


In [24]:
%load_ext autoreload
%autoreload 2

In [1]:
import numpy as np
from fedbiomed.researcher.environ import TMP_DIR
import tempfile
# Temporary directory, created under TMP_DIR, that holds the model file.
tmp_dir_model = tempfile.TemporaryDirectory(dir=TMP_DIR+'/')
# Path of the Python file that the `%%writefile "$model_file"` cell writes the model class to.
model_file = tmp_dir_model.name + '/fed_mv_ppca.py'

Hereafter the template of the class you should provide to Fedbiomed :
       
**training_data** : you must return here a tuple (X,X_k,ViewsX,y) or (X,X_k,ViewsX). Note that all centers should provide a dataset with the same view-specific columns. If a view has not been observed in a specific center, the corresponding columns are filled with NaN. The training_data method takes care of identifying view-specific sub-datasets and collecting information about non-available observations. Data can also be normalized here.

In [2]:
%%writefile "$model_file"

from fedbiomed.common.ppca import PpcaPlan
import numpy as np
import pandas as pd


class Fed_MV_PPCA(PpcaPlan):
    """Training plan for federated multi-view PPCA, built on `PpcaPlan`.

    NOTE(review): `self.dataset_path`, `self.K`, `self.dim_views`, `self.is_norm`
    and `self.normalize_data` are not defined in this class; they are presumably
    provided by `PpcaPlan` (from `model_args`) - confirm against the base class.
    """

    def __init__(self, kwargs):
        super(Fed_MV_PPCA, self).__init__(kwargs)
        # import statements declared as dependencies of this training plan
        deps = ['import numpy as np', 
               'import pandas as pd']
        self.add_dependency(deps)
        self.multi_view = True
    
    def training_data(self):
        """
            Perform in this method all data reading and data transformations you need.
            At the end you should provide a tuple (X_obs,Xk,ViewsX,y), where: 
            X_obs is the training dataset (observed views only, side by side), 
            Xk is a list containing the k-specific dataframe if it exists or 'NaN' otherwise,
            ViewsX is the indicator function for observed views (ViewsX[k]=1 if view k is observed, 0 otherwise)
            y the corresponding labels (optional), taken from the last column of the dataset
            Each observed view is passed through self.normalize_data when self.is_norm is true
            Note: since labels are not needed for the optimization, 
            training_data can also simply return (X_obs,Xk,ViewsX)
            :raise NotImplementedError if researcher do not implement this method.
        """
        dataset = self.load_multi_view_dataframe(self.dataset_path)
        # every column but the last holds features; the last column holds the labels
        X = dataset.iloc[:,:-1]
        y = dataset[dataset.columns[-1]]
        
        # Xk is a list containing the view-specific local datasets
        Xk = []
        ViewsX = []
        ind = 0  # index of the first column of the current view
        for k in range(self.K):
            # a view is considered absent as soon as its first column contains a NaN
            if X.iloc[:, ind].isnull().values.any():
                Xk.append(np.nan)
                ViewsX.append(0)
            else:
                # if norm = true, data are normalized with self.normalize_data
                X_k = X.iloc[:, ind:ind + self.dim_views[k]]
                if self.is_norm:
                    X_k = self.normalize_data(X_k) 
                    
                Xk.append(X_k)
                ViewsX.append(1)
            ind += self.dim_views[k]
        
        # The entire dataset is re-built without empty columns
        # (absent views are the `np.nan` placeholders appended above)
        Xk_obs = [item for item in Xk if item is not np.nan]
        X_obs = pd.concat(Xk_obs, axis=1)
        
        return (X_obs,Xk,ViewsX,y)
    
    def load_multi_view_dataframe(self, file_name: str) -> pd.DataFrame:
        """Read a CSV with a two-row column header and the index in its first column."""
        df = pd.read_csv(file_name, delimiter=',', index_col=0, header=[0,1])
        return df
    

Writing /home/ybouilla/fedbiomed/var/tmp/tmp7x42vv1s/fed_mv_ppca.py


**model_args** is a dictionary containing the mv-ppca model arguments: the total number of views across all datasets (`tot_views`), the dimension of each view (`dim_views`), the latent space size (`n_components`), and a boolean (`is_norm`) for data preprocessing. Additionally, the researcher can provide priors for one or more global parameters.

**training_args** contains here the number of local iterations for EM/MAP. 

In [3]:
tot_views = 3  # total number of views across all datasets
dim_views = [15, 8, 10]  # number of features of each view
n_components = 4  # size of the latent space
norm = True  # whether the data are normalized before training

# arguments of the mv-ppca model (the normalization flag is passed as 'is_norm')
model_args = {'tot_views': tot_views, 'dim_views': dim_views, 'n_components': n_components, 'is_norm': norm}

# number of local iterations performed by each node in a round
training_args = {'n_iterations': 15}

In [4]:
from fedbiomed.researcher.experiment import Experiment
from fedbiomed.researcher.aggregators.ppca_aggregator import MLaggregator

# tag of the datasets to search for on the nodes
tags =  ['ppca_data']
# number of federated training rounds
rounds = 5

# select nodes participating in this experiment
exp = Experiment(tags=tags,
                 #clients=None,
                 model_path=model_file,
                 model_args=model_args,
                 model_class='Fed_MV_PPCA',
                 training_args=training_args,
                 rounds=rounds,
                 aggregator=MLaggregator(),
                 client_selection_strategy=None)

2021-11-03 14:05:56,643 fedbiomed INFO - Messaging researcher_0c1392f2-9255-4162-90ac-385747ea3987 successfully connected to the message broker, object = <fedbiomed.common.messaging.Messaging object at 0x7f609b7bdac0>
2021-11-03 14:05:56,684 fedbiomed INFO - Searching dataset with data tags: ['ppca_data'] for all nodes
2021-11-03 14:05:56,686 fedbiomed INFO - log from: client_9bf02431-d71d-41e6-a2b2-c091316c5572 - DEBUG Message received: {'researcher_id': 'researcher_0c1392f2-9255-4162-90ac-385747ea3987', 'tags': ['ppca_data'], 'command': 'search'}
2021-11-03 14:06:06,969 fedbiomed INFO - Messaging NodeTrainingFeedbackClient successfully connected to the message broker, object = <fedbiomed.common.messaging.Messaging object at 0x7f609b764f70>


In [8]:
from fedbiomed.researcher.requests import Requests

# List the datasets available on the nodes (the recorded output below shows
# one table per node); presumably `verbose=True` is what prints the table.
req = Requests()
datasets = req.list(verbose=True)

2021-11-03 13:21:29,132 fedbiomed INFO - Listing available datasets in all nodes... 
2021-11-03 13:21:29,136 fedbiomed INFO - log from: client_07c4f022-82cb-4db2-9d26-78b4b977ef6e - DEBUG Message received: {'researcher_id': 'researcher_a6efeab7-24c5-4c4e-99d6-627a0ff67a78', 'command': 'list'}
2021-11-03 13:21:39,148 fedbiomed INFO - 
 Node: client_07c4f022-82cb-4db2-9d26-78b4b977ef6e | Number of Datasets: 1 
+--------+-------------+---------------+---------------+-----------+--------------+
| name   | data_type   | tags          | description   | shape     | multi_view   |
|        | csv         | ['ppca_data'] |               | [215, 35] | multi_view   |
+--------+-------------+---------------+---------------+-----------+--------------+



In [5]:
# start federated training
exp.run()

2021-11-03 14:06:16,476 fedbiomed INFO - Sampled clients in round 0 ['client_9bf02431-d71d-41e6-a2b2-c091316c5572']
2021-11-03 14:06:16,480 fedbiomed INFO - Send message to client client_9bf02431-d71d-41e6-a2b2-c091316c5572 - {'researcher_id': 'researcher_0c1392f2-9255-4162-90ac-385747ea3987', 'job_id': '1794a818-8f9e-40e7-8594-051525a7ff12', 'training_args': {'n_iterations': 15}, 'model_args': {'tot_views': 3, 'dim_views': [15, 8, 10], 'n_components': 4, 'is_norm': True}, 'command': 'train', 'model_url': 'http://localhost:8844/media/uploads/2021/11/03/my_model_284e093f-1fca-43bb-9383-de2ecbc09f3f.py', 'params_url': 'http://localhost:8844/media/uploads/2021/11/03/my_model_aba23f4e-411e-464e-a76f-f5761ce7f29f.pt', 'model_class': 'Fed_MV_PPCA', 'training_data': {'client_9bf02431-d71d-41e6-a2b2-c091316c5572': ['dataset_95356dc6-347a-4f78-b6e2-135be6e0abaf']}}
2021-11-03 14:06:16,482 fedbiomed DEBUG - researcher_0c1392f2-9255-4162-90ac-385747ea3987
2021-11-03 14:06:16,488 fedbiomed INFO - 

2021-11-03 14:06:26,702 fedbiomed INFO - log from: client_9bf02431-d71d-41e6-a2b2-c091316c5572 - DEBUG Dataset_path/home/ybouilla/fedbiomed/notebooks/== Local path to node1.csv
2021-11-03 14:06:26,703 fedbiomed INFO - log from: client_9bf02431-d71d-41e6-a2b2-c091316c5572 - DEBUG Dataset is multi view ? True
2021-11-03 14:06:26,889 fedbiomed INFO - log from: client_9bf02431-d71d-41e6-a2b2-c091316c5572 - INFO Iteration: 1/15	Expected LL: 7151.926600
2021-11-03 14:06:26,994 fedbiomed INFO - log from: client_9bf02431-d71d-41e6-a2b2-c091316c5572 - INFO Iteration: 2/15	Expected LL: 7030.532824
2021-11-03 14:06:27,088 fedbiomed INFO - log from: client_9bf02431-d71d-41e6-a2b2-c091316c5572 - INFO Iteration: 3/15	Expected LL: 7023.989809
2021-11-03 14:06:27,199 fedbiomed INFO - log from: client_9bf02431-d71d-41e6-a2b2-c091316c5572 - INFO Iteration: 4/15	Expected LL: 7022.232409
2021-11-03 14:06:27,315 fedbiomed INFO - log from: client_9bf02431-d71d-41e6-a2b2-c091316c5572 - INFO Iteration: 5/15	E

2021-11-03 14:06:46,770 fedbiomed INFO - Downloading model params after training on client_9bf02431-d71d-41e6-a2b2-c091316c5572 - from http://localhost:8844/media/uploads/2021/11/03/node_params_3143fcb2-2306-49fa-98f1-549b7c7e3a95.pt
2021-11-03 14:06:46,797 fedbiomed INFO - Clients that successfully reply in round 2 ['client_9bf02431-d71d-41e6-a2b2-c091316c5572']
2021-11-03 14:06:46,831 fedbiomed INFO - Sampled clients in round 3 ['client_9bf02431-d71d-41e6-a2b2-c091316c5572']
2021-11-03 14:06:46,832 fedbiomed INFO - Send message to client client_9bf02431-d71d-41e6-a2b2-c091316c5572 - {'researcher_id': 'researcher_0c1392f2-9255-4162-90ac-385747ea3987', 'job_id': '1794a818-8f9e-40e7-8594-051525a7ff12', 'training_args': {'n_iterations': 15}, 'model_args': {'tot_views': 3, 'dim_views': [15, 8, 10], 'n_components': 4, 'is_norm': True}, 'command': 'train', 'model_url': 'http://localhost:8844/media/uploads/2021/11/03/my_model_284e093f-1fca-43bb-9383-de2ecbc09f3f.py', 'params_url': 'http://lo

2021-11-03 14:06:56,940 fedbiomed INFO - log from: client_9bf02431-d71d-41e6-a2b2-c091316c5572 - INFO {'monitor': <fedbiomed.node.history_monitor.HistoryMonitor object at 0x7f5c1a2f5e50>, 'n_iterations': 15}
2021-11-03 14:06:56,943 fedbiomed INFO - log from: client_9bf02431-d71d-41e6-a2b2-c091316c5572 - DEBUG Dataset_path/home/ybouilla/fedbiomed/notebooks/== Local path to node1.csv
2021-11-03 14:06:56,946 fedbiomed INFO - log from: client_9bf02431-d71d-41e6-a2b2-c091316c5572 - DEBUG Dataset is multi view ? True
2021-11-03 14:06:57,111 fedbiomed INFO - log from: client_9bf02431-d71d-41e6-a2b2-c091316c5572 - INFO Iteration: 1/15	Expected LL: 7446.123382
2021-11-03 14:06:57,217 fedbiomed INFO - log from: client_9bf02431-d71d-41e6-a2b2-c091316c5572 - INFO Iteration: 2/15	Expected LL: 7304.298923
2021-11-03 14:06:57,314 fedbiomed INFO - log from: client_9bf02431-d71d-41e6-a2b2-c091316c5572 - INFO Iteration: 3/15	Expected LL: 7295.368097
2021-11-03 14:06:57,499 fedbiomed INFO - log from: cli

In [17]:
# exp.aggregated_params is indexed by round number
print("\nList the training rounds : ", exp.aggregated_params.keys())

# the entry of the last round (rounds - 1) holds the path of the saved
# parameters and the parameters themselves
print("\nAccess the federated params for the last training round :")
print("\t- params_path: ", exp.aggregated_params[rounds - 1]['params_path'])
print("\t- parameter data: ", exp.aggregated_params[rounds - 1]['params'].keys())


List the training rounds :  dict_keys([0, 1, 2, 3, 4])

Access the federated params for the last training round :
	- params_path:  /home/ybouilla/fedbiomed/var/tmp/researcher_params_c519e47f-9d93-4ee3-9f17-334d35319a56.pt
	- parameter data:  dict_keys(['tilde_muk', 'tilde_Wk', 'tilde_Sigma2k', 'Alpha', 'Beta', 'sigma_til_muk', 'sigma_til_Wk', 'sigma_til_sigma2k'])


## Test

Hereafter we test the performance of the aggregated parameters on a test dataset. In particular, for each round we use the global parameters to evaluate the mean absolute error and the separation in the latent space using LDA. Note that we have already defined the test dataset at the beginning of this notebook.

In [35]:
from sklearn import preprocessing
import numpy as np
import pandas as pd
from numpy.linalg import solve

from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error


def normalize_data(X):
    """
    Rescale every column of X with a min-max scaler fitted on X itself.
    The index and the column labels of X are kept unchanged.
    :param X: pandas dataframe
    :return pandas dataframe
    """
    scaler = preprocessing.MinMaxScaler()
    scaled_values = scaler.fit_transform(X.values)

    # the original column labels are reused as they are
    col_name = X.columns
    print(scaled_values.shape, col_name.shape, )

    return pd.DataFrame(scaled_values, index=X.index, columns=col_name)

def eval_MB(Wk, q, D_i, K, Sigma2, ViewsX):
    """
    Computes matrices M:=inv(I_q+sum_k Wk.T Wk/sigma2k) and
    B:=[W1.T/sigma2_1,...,WK.T/sigma2_K], restricted to the observed views
    (those with ViewsX[k] == 1).
    :param Wk: list of matrices, Wk[k] is reshaped to (D_i[k] x q)
    :param q: latent space dimension
    :param D_i: list with the number of features of each view
    :param K: number of views
    :param Sigma2: list of float > 0
    :param ViewsX: list of 0/1 flags, 1 if the view is observed
    :return np.arrays
    """
    first = ViewsX.index(1)
    observed = [first] + [k for k in range(first + 1, K) if ViewsX[k] == 1]

    gram_sum = None
    blocks = []
    for k in observed:
        W_k = Wk[k].reshape(D_i[k], q)
        gram_k = W_k.T.dot(W_k) / Sigma2[k]
        gram_sum = gram_k if gram_sum is None else gram_sum + gram_k
        blocks.append(W_k.T / Sigma2[k])

    B = blocks[0] if len(blocks) == 1 else np.concatenate(blocks, axis=1)

    # solving against the identity yields the inverse of (I_q + sum of Gram terms)
    M = solve(np.eye(q) + gram_sum, np.eye(q))

    return M, B

def concat_params(park, K, ViewsX):
    """
    Stacks along axis 0 the entries of park that belong to observed views
    (ViewsX[k] == 1), in view order. With a single observed view its entry is
    returned as is.
    :param park: list of vectors/matrices to concatenate
    :param K: number of views
    :param ViewsX: list of 0/1 flags, 1 if the view is observed
    :return np.array
    """
    first = ViewsX.index(1)
    selected = [park[first]]
    selected.extend(park[k] for k in range(first + 1, K) if ViewsX[k] == 1)

    if len(selected) == 1:
        return selected[0]
    return np.concatenate(selected, axis=0)
[]
def simu_latent(q,dataset,ViewsX,global_params):
    """
    This function maps every row t_n of dataset to the latent space with the
    global parameters, as x_n = M B (t_n - mu). No random draw is involved.
    The view dimensions are read from the notebook-level list D_i.
    :param q: latent space dimension
    :param dataset: pandas dataframe with the observed views side by side
    :param ViewsX: list of 0/1 flags, 1 if the view is observed
    :param global_params: dict with keys 'tilde_muk', 'tilde_Wk', 'tilde_Sigma2k'
    :return pandas dataframe with one row of q latent coordinates per sample
    """
    d = dataset.shape[1]
    N = dataset.shape[0]
    K = len(ViewsX)

    # assumes the concatenated mu has shape (d, 1) - TODO confirm
    mu = concat_params(global_params['tilde_muk'], K, ViewsX)
    M, B = eval_MB(global_params['tilde_Wk'], q, D_i, K, global_params['tilde_Sigma2k'],ViewsX)

    # M.dot(B) does not depend on the sample: compute it once, outside the loop
    MB = M.dot(B)

    Xn = [(MB.dot(dataset.iloc[n].values.reshape(d, 1) - mu)).reshape(1, q) for n in range(N)]

    df = pd.DataFrame(np.vstack(Xn), index=dataset.index)

    return df

def MAE(dataset,ViewsX,q,global_params):
    """
    This function evaluates the MAE between dataset and its reconstruction
    using global parameters: every row t_n is mapped to the latent space as
    x_n = M B (t_n - mu) and rebuilt as W x_n + mu.
    The view dimensions are read from the notebook-level list D_i.
    :param dataset: pandas dataframe with the observed views side by side
    :param ViewsX: list of 0/1 flags, 1 if the view is observed
    :param q: latent space dimension
    :param global_params: dict with keys 'tilde_muk', 'tilde_Wk', 'tilde_Sigma2k'
    :return float
    """
    d = dataset.shape[1]
    N = dataset.shape[0]
    K = len(ViewsX)

    # assumes the concatenated mu has shape (d, 1) - TODO confirm
    mu = concat_params(global_params['tilde_muk'], K, ViewsX)
    W = concat_params(global_params['tilde_Wk'], K, ViewsX)
    M, B = eval_MB(global_params['tilde_Wk'], q, D_i, K, global_params['tilde_Sigma2k'],ViewsX)

    # M.dot(B) does not depend on the sample: compute it once, outside the loop
    MB = M.dot(B)

    T_true = dataset.values.tolist()

    T_pred = []
    for n in range(N):
        Xng = (MB.dot(dataset.iloc[n].values.reshape(d, 1) - mu)).reshape(q, 1)
        T_pred.append((W.dot(Xng) + mu).reshape(d))

    # the result must be returned: callers store it and format it as a float
    return mean_absolute_error(T_true, T_pred)



In [36]:
# Split the last generated center dataset (t_i) into its views, the same way
# training_data does on the nodes.
K = 3
dim_views = [15, 8, 10]

X = t_i.iloc[:, :-1]          # feature columns
y = t_i[t_i.columns[-1]]      # labels are stored in the last column

Xk, ViewsX = [], []
ind = 0
for k in range(K):
    first_column_has_nan = X.iloc[:, ind].isnull().values.any()
    if first_column_has_nan:
        # absent view: placeholder and indicator set to 0
        Xk.append(np.nan)
        ViewsX.append(0)
    else:
        # observed view: always normalized with min max scaler here
        X_k = normalize_data(X.iloc[:, ind:ind + dim_views[k]])
        Xk.append(X_k)
        ViewsX.append(1)
    ind += dim_views[k]

# The entire dataset is re-built without empty columns
Xk_obs = [view for view in Xk if view is not np.nan]
X_obs = pd.concat(Xk_obs, axis=1)

(324, 15) (15,)
(324, 8) (8,)
(324, 10) (10,)


In [32]:
X_k = normalize_data(X_k) 

In [16]:
(X_obs,Xk,ViewsX,y)

(views           view_1                                                    \
 feature_name   var_1,1   var_1,2   var_1,3   var_1,4   var_1,5   var_1,6   
 0             0.269406  0.626926  0.791682  0.364060  0.420920  0.741968   
 1             0.087613  0.421347  0.819334  0.128131  0.376296  0.892917   
 2             0.271479  0.534076  0.711473  0.403702  0.363166  0.639258   
 3             0.000000  0.504321  1.000000  0.002266  0.551466  1.000000   
 4             0.206283  0.412658  0.719225  0.186896  0.792503  0.623314   
 ..                 ...       ...       ...       ...       ...       ...   
 319           0.853870  0.280192  0.077036  0.628759  0.728695  0.198696   
 320           0.657090  0.249640  0.153975  0.604943  0.494459  0.216128   
 321           0.624006  0.254716  0.245067  0.523445  0.325065  0.397745   
 322           0.846120  0.553332  0.231588  0.821875  0.526248  0.192887   
 323           0.687008  0.279843  0.208965  0.533276  0.548129  0.301200   

In [19]:
# Test dataset


def test_data(dataset,norm,K,dim_views):
    """
        Equivalent to training_data, for the test dataset.

        The last column of dataset is taken as labels, the others are split
        into K consecutive views of dim_views[k] columns each. A view whose
        first column contains a NaN is reported as absent.

        :param dataset: pandas dataframe, features followed by one label column
        :param norm: if true, each observed view goes through normalize_data
        :param K: number of views
        :param dim_views: list with the number of columns of each view
        :return tuple (X_obs,Xk,ViewsX,y): observed views side by side, list of
            per-view dataframes (np.nan for absent views), 0/1 indicator list,
            labels
    """
    X = dataset.iloc[:, :-1]
    y = dataset[dataset.columns[-1]]

    Xk, ViewsX = [], []
    offset = 0  # index of the first column of the current view
    for k in range(K):
        view_missing = X.iloc[:, offset].isnull().values.any()
        if view_missing:
            Xk.append(np.nan)
            ViewsX.append(0)
        else:
            view_block = X.iloc[:, offset:offset + dim_views[k]]
            if norm:
                # data are normalized with min max scaler
                view_block = normalize_data(view_block)
            Xk.append(view_block)
            ViewsX.append(1)
        offset += dim_views[k]

    # The entire dataset is re-built without the absent views
    observed_views = [view for view in Xk if view is not np.nan]
    X_obs = pd.concat(observed_views, axis=1)

    return (X_obs, Xk, ViewsX, y)

In [23]:
from sklearn import preprocessing

# For every round but the first, evaluate the aggregated (global) parameters:
# reconstruction MAE on each center's training data and on the test dataset,
# plus the latent representations and labels of both.
for r in range(1,rounds):
    global_params = exp.aggregated_params[r]['params']
    ######## Train data
    MAE_train = []
    latent_train_parts = []
    label_train_parts = []
    for c in range(n_centers):
        # the node files are written from a multi-view dataframe, so their
        # column header spans two rows (views, feature_name)
        dataset_c = pd.read_csv('== Local path to node' + str(c+1) + '.csv', delimiter=',',
                                index_col=0, header=[0, 1])
        X_obs_c,Xk_c,ViewsX_c,y_c = test_data(dataset_c,norm,tot_views,dim_views)
        # Dataframe of latent space for LDA
        latent_train_parts.append(simu_latent(n_components,X_obs_c,ViewsX_c,global_params))
        label_train_parts.append(y_c)
        # MAE Train
        MAE_train.append(MAE(X_obs_c,ViewsX_c,n_components,global_params))

    # pd.concat replaces DataFrame.append / Series.append (removed in pandas 2.0)
    Latent_Train = pd.concat(latent_train_parts) if latent_train_parts else pd.DataFrame()
    Label_Train = pd.concat(label_train_parts) if label_train_parts else pd.Series(dtype='int64')

    ######## Test data
    X_obs_test,Xk_test,ViewsX_test,y_test = test_data(t_test,norm,tot_views,dim_views)

    # Dataframe of latent space for LDA
    Latent_Test = simu_latent(n_components,X_obs_test,ViewsX_test,global_params)
    Label_Test = y_test.copy()
    # MAE Test
    MAE_test = MAE(X_obs_test,ViewsX_test,n_components,global_params)


    print('Round {}:'.format(r))
    print('MAE train (mean,std) = ({:.4f},{:.4f}) \
    \t MAE test = {:.4f} \
    '.format(np.mean(np.array(MAE_train)), \
                                                 np.std(np.array(MAE_train)), MAE_test))

Round 1:
MAE train (mean,std) = (0.0458,0.0044)     	 MAE test = 0.0588     
Round 2:
MAE train (mean,std) = (0.0434,0.0045)     	 MAE test = 0.0554     
Round 3:
MAE train (mean,std) = (0.0415,0.0046)     	 MAE test = 0.0527     
Round 4:
MAE train (mean,std) = (0.0400,0.0047)     	 MAE test = 0.0508     


In [24]:
conf_LDA_Test

array([[20,  0],
       [ 0, 20]])