# Fedbiomed Researcher to train a federated PPCA (Probabilistic PCA) model.

## Description of the exercise :

Three datasets `n1.csv`, `n2.csv` and `n3.csv` will be generated randomly using a 3-view PPCA from a 4-dimensional latent space, with view dimensions [15, 8, 10] and 2 groups. We will then distribute the 3 datasets to 3 distinct nodes and use Fed-mv-PPCA. In each center we check the evolution of the expected log-likelihood (LL) during training.

## Data Generation

We will generate three datasets using mv-PPCA.
Then save them in a path of your choice on your machine.

In [1]:
import numpy as np
import pandas as pd
from typing import List, Union, Dict, Iterator

def sample_x_n(N:int, q:int, random_state:int=None):
    """Draw a matrix of independent standard Gaussian samples.

    Params:
    :N: number of rows of the returned array
    :q: number of columns of the returned array
    :random_state: seed given to `np.random.RandomState`; the same seed
    always yields the same matrix

    Returns:
    :np.ndarray of shape (N, q)
    """
    generator = np.random.RandomState(random_state)
    return generator.randn(N, q)

def generate_data(N_g: List[int],
                  W: np.ndarray,
                  a_g: np.ndarray,
                  mu:float,
                  sigma2:float,
                  x_n,
                  view:int,
                  random_state=None):
    """Generate one view of a synthetic Gaussian dataset made of several groups.

    For every sample n belonging to group g the observation is

        y_n = W (x_n + a_g[g]) + mu + epsilon,   epsilon ~ N(0, sigma2)

    i.e. the latent sample is shifted by the group offset, mapped through W,
    offset by mu, and perturbed with isotropic Gaussian noise.

    Params:
    :N_g: number of samples per group (list or 1-D array, one entry per group);
    groups occupy consecutive row ranges of `x_n`
    :W: reconstruction matrix, of size (n_features, q)
    :a_g: array indexed by group; `a_g[g]` is added to the latent samples of
    group g (must broadcast against rows of `x_n`)
    :mu: offset added to every sample (scalar, or anything broadcasting
    against a (n_samples, n_features) array)
    :sigma2: noise variance
    :x_n: latent samples; rows are sliced per group, so it needs at least
    sum(N_g) rows and q columns
    :view: (int) view number, only used to build the column names
    :random_state: seed of the noise generator, for reproducibility

    Returns:
    :pd.DataFrame of size (n_samples, n_features) whose columns are named
    'var_{view},{feature_index}' with feature_index starting at 1
    """
    rnd = np.random.RandomState(random_state)

    # Accept plain Python lists as advertised by the type hint
    # (a list has no `.sum()`).
    group_sizes = np.asarray(N_g, dtype=np.int64)
    N = int(group_sizes.sum())
    d, _ = W.shape
    sigma = np.sqrt(sigma2)

    # Row boundaries of each group: group g spans rows g_ind[g]:g_ind[g + 1].
    g_ind = np.concatenate((np.zeros(1, dtype=np.int64), np.cumsum(group_sizes)))

    y_n = np.empty((N, d))
    for g, (start, stop) in enumerate(zip(g_ind[:-1], g_ind[1:])):
        # "dq,nq->nd" is (x + shift) @ W.T for the rows of this group
        y_n[start:stop] = np.einsum("dq,nq->nd", W, x_n[start:stop] + a_g[g]) + mu

    y_n = pd.DataFrame(data=y_n,
                       columns=[f'var_{view},{i + 1}' for i in range(d)])

    return y_n + sigma * rnd.randn(N, d)



def generate_ppca_nodes_dataset(n_nodes: int,
                           n_features: Union[List[int],int],
                           n_components: int,
                           n_group: int=2,
                           absent_view: Dict[str, int]=None,
                           W_init: List[np.ndarray]=None,
                           mu_init: List[np.ndarray]=None,
                           sigma_init : List[np.ndarray]=None,
                           is_validation: bool=True,
                           n_sample_validation: int=None):
    """
    Prepare the PPCA parameters used to generate a synthetic dataset per node.

    Missing parameters are drawn from the global NumPy random stream:
    W and mu uniformly in [-10, 10), sigma fixed to 1, and one latent shift
    per group (the first group is never shifted).

    Params:
    :n_nodes: number of parameter sets to create
    :n_features: number of features of each parameter set; a single int is
    reused for all of them
    :n_components: latent space dimension
    :n_group: number of groups
    :W_init: optional list of reconstruction matrices
    :mu_init: optional list of offsets
    :sigma_init: optional list of noise parameters
    :absent_view, is_validation, n_sample_validation: accepted but not used
    by the code below

    NOTE(review): the function stops after building the parameters and
    returns None; the dataset generation itself looks unfinished.
    """
    # a scalar feature count applies to every parameter set
    if np.ndim(n_features) == 0:
        n_features = [int(n_features)] * n_nodes

    # generate PPCA parameters if not defined
    if W_init is None:
        W_init = [np.random.uniform(-10, 10, (n_features[i], n_components))
                  for i in range(n_nodes)]
    if mu_init is None:
        mu_init = [np.random.uniform(-10, 10, n_features[i])
                   for i in range(n_nodes)]
    # sigma defaults to 1 for every client
    if sigma_init is None:
        sigma_init = [1] * n_nodes

    # Use the `n_group` argument: the global `nb_group` may not exist (or may
    # hold another value) when this function is called.
    shift = np.concatenate((np.zeros((1, n_components)),
                            np.random.uniform(-10, 10, (n_group - 1, n_components))))

In [2]:
from typing import Dict, Union

def create_multi_view_dataframe(datasets: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    """Concatenate per-view dataframes column-wise under a two-level header.

    Params:
    :datasets: mapping view name -> dataframe; every dataframe must have the
    same number of rows. Views are laid out in the mapping's iteration order.

    Returns:
    :pd.DataFrame whose columns are a MultiIndex with levels
    ('views', 'feature_name'); the row index is a fresh default index.

    Raises:
    :ValueError if the views cannot be concatenated (different number of
    samples); the underlying NumPy error is attached as the cause.
    """
    _header_labels = ['views', 'feature_name']

    # 1. create multiindex header
    _feature_name_array = np.array([])  # all feature names, in column order
    _view_name_array = []  # view (ie modality) name of each column
    _concatenated_datasets = None  # values of all views, side by side

    for key, _view_df in datasets.items():
        _feature_name_array = np.concatenate([_feature_name_array,
                                              _view_df.columns.values])
        _view_name_array.extend([key] * len(_view_df.columns))

        # An explicit sentinel marks the first pass: testing the length of
        # the accumulated values would misfire for a view with zero rows.
        if _concatenated_datasets is None:
            _concatenated_datasets = _view_df.to_numpy()
            continue
        try:
            _concatenated_datasets = np.concatenate([_concatenated_datasets,
                                                     _view_df.to_numpy()],
                                                    axis=1)
        except ValueError as val_err:
            # catching case where nb_samples are different
            raise ValueError('Cannot create multi view dataset: different number of samples for each modality have been detected') from val_err

    if _concatenated_datasets is None:
        # no view at all: keep an empty value array
        _concatenated_datasets = np.array([])

    _header = pd.MultiIndex.from_arrays([_view_name_array, _feature_name_array],
                                        names=_header_labels)

    # 2. create multi index dataframe
    return pd.DataFrame(_concatenated_datasets, columns=_header)


def save_multi_view_dataframe(dataframe: pd.DataFrame, file_name: str):
    """Write `dataframe` to `file_name` as CSV, using pandas' default options
    (the row index and every header level are written)."""
    dataframe.to_csv(path_or_buf=file_name)
    
def load_multi_view_dataframe(file_name: str) -> pd.DataFrame:
    """Read a comma-separated multi-view CSV file.

    The first two lines of the file are parsed as a two-level column header
    and the first column as the row index.
    """
    return pd.read_csv(file_name, sep=',', header=[0, 1], index_col=0)




In [3]:
# Seed the global NumPy stream so the parameters drawn below are reproducible.
np.random.seed(100)

# Layout of the simulation: number of centers (one CSV file each) and of
# sample groups inside every center.
n_centers, nb_group = 3, 2

# Number of features of each of the three views.
D_i = [15, 8, 10]

# Dimension of the latent space used to generate the data.
n_components_generated = 4

# Total number of samples of the held-out test dataset.
testing_samples = 40

In [4]:



# initializing PPCA variables
# Noise variance of each view (passed to generate_data as `sigma2`).
sigma2_gen1, sigma2_gen2, sigma2_gen3 = 2, 1, 3
# Reconstruction matrices, one per view, of size (D_i[k], n_components_generated).
# The order of the random draws below is kept fixed so that the seeded
# global stream yields the same parameters.
W_gen1 = np.random.uniform(-10, 10, (D_i[0], n_components_generated))
W_gen2 = np.random.uniform(-5, 5, (D_i[1], n_components_generated))
W_gen3 = np.random.uniform(-15, 15, (D_i[2], n_components_generated))
# Offsets, one vector of D_i[k] entries per view.
mu_gen1 = np.random.uniform(-10, 10, D_i[0])
mu_gen2 = np.random.uniform(-5, 5, D_i[1])
mu_gen3 = np.random.uniform(-15, 15, D_i[2])

# Latent shift of each group: the first group is not shifted, the other
# ones are shifted by a random vector.
a_g_gen = np.concatenate((np.zeros((1, n_components_generated)),
                          np.random.uniform(-10, 10, (nb_group - 1, n_components_generated))))

# Parameters are looked up by view name from here on.
views_name = ['view_1', 'view_2', 'view_3']
W = {'view_1': W_gen1,
     'view_2': W_gen2,
     'view_3': W_gen3}
mu = {'view_1': mu_gen1,
      'view_2': mu_gen2,
      'view_3': mu_gen3}
# NOTE: despite its name, `sigma` holds the noise variances (sigma squared).
sigma = {'view_1': sigma2_gen1,
         'view_2': sigma2_gen2,
         'view_3': sigma2_gen3}
# absent_views contains as key the id of a center in which we want to simulate absent views,
# and as value the id of the missing view (both ids are 1-based).
absent_views = {'2': 2}

# Generate and save one multi-view dataset per center.
# NOTE(review): the latent samples (random_state=150) and the noise
# (random_state=250) are drawn with the same fixed seeds for every center and
# every view, so all centers share the same leading rows of random draws;
# only the number of samples differs. Confirm this is intended.
for i in range(n_centers):
    center_id = str(i + 1)
    # number of samples of each group in this center
    N_g = np.array([np.random.randint(25,300) for _ in range(nb_group)])
    N = N_g.sum()
    # latent samples shared by all the views of this center
    x_n_gen = sample_x_n(N, n_components_generated, random_state=150)

    Y = {}
    for d, d_name in enumerate(views_name):
        y_t = generate_data(N_g, W[d_name], a_g_gen, mu[d_name], sigma[d_name], x_n_gen, view = d+1, random_state=250)
        # The entry of `absent_views` is an int (view id) until it has been
        # processed; it is then replaced by the data that were withheld.
        if isinstance(absent_views.get(center_id), int) and absent_views[center_id] == d + 1:
            absent_views[center_id] = y_t
            # an absent view keeps its columns, filled with NaN
            Y['view_' + str(d+1)] = pd.DataFrame(np.nan, index=np.arange(N),
                                                 columns=[f'var_{d+1},{j + 1}' for j in range(D_i[d])])
        else:
            Y['view_' + str(d+1)] = y_t

    # group label of every sample, in the same row order as the views
    gr = pd.Series([int(g) for g in range(nb_group) for _ in range(N_g[g])])
    Y['Label'] = pd.DataFrame(gr, columns=['Labels'])

    # one CSV file per center; the label is the last column
    t_i = create_multi_view_dataframe(Y)
    t_i.to_csv('== Local path to node' + str(i+1) + '.csv',sep=',')
    #np.savetxt('== Local path to node' + str(i+1) + '.csv',t_i,delimiter=',')
               
# building the test dataset
# Two groups of equal size (half of `testing_samples` each).
N_g_test = np.array([testing_samples // 2] * 2)
g_ind_test = np.concatenate((np.zeros(1, dtype=np.int64), np.cumsum(N_g_test)))
N_test = N_g_test.sum()
x_n_gen = sample_x_n(N_test, n_components_generated, random_state=150)

# One generated dataframe per view, keyed 'view_1', 'view_2', ...
Y_test = {
    'view_' + str(view_id): generate_data(N_g_test, W[view_key], a_g_gen, mu[view_key],
                                          sigma[view_key], x_n_gen, view=view_id,
                                          random_state=250)
    for view_id, view_key in enumerate(views_name, start=1)
}

# Group label of every test sample: 0 for the first group, 1 for the second.
gr_test = pd.Series([label for label, count in enumerate(N_g_test) for _ in range(count)])
Y_test['Label'] = pd.DataFrame(gr_test, columns=['Labels'])

# Views and labels side by side, with the dictionary keys as first header level.
t_test = pd.concat(Y_test, axis=1)

In [None]:
# Print the distinct first-level column labels of the test dataset.
# (`t_test2` is not defined anywhere in this notebook; the test dataframe
# built above is `t_test`.)
for v in sorted(set(t_test.columns.get_level_values(0))):
    print(v)

## Start the network and setting the client up
Before running this notebook:
1. You should start the network from fedbiomed-network, as detailed in :
https://gitlab.inria.fr/fedbiomed/fedbiomed

2. You need to configure at least 2 nodes: <br/>
* **Node 1 :** `./scripts/fedbiomed_run node add`
  * Select option 1 to add a csv file to the client
  * Choose the name, tags and description of the dataset (you can write 'sk' always and it will be good)
  * Pick the .csv file you stored the couple X[0],y[0].
  * Check that your data has been added in node 1 by executing `./scripts/fedbiomed_run node list`
  * Run the node using `./scripts/fedbiomed_run node start`. <br/>

* **Node 2 :** Open a second terminal and run ./scripts/fedbiomed_run node add config n2.ini
  * Select option 1 to add a csv file to the client
  * Choose the name, tags and description of the dataset (you can write 'sk' always and it will be good)
  * Pick the .csv file you stored the couple X[1],y[1].
  * Check that your data has been added in node 2 by executing `./scripts/fedbiomed_run node config n2.ini list`
  * Run the node using `./scripts/fedbiomed_run node config n2.ini start`.
  


* **Node 3 :** Open a third terminal and run ./scripts/fedbiomed_run node add config n3.ini
  * Select option 1 to add a csv file to the client
  * Choose the name, tags and description of the dataset (you can write 'sk' always and it will be good)
  * Pick the .csv file you stored the couple X[2],y[2].
  * Check that your data has been added in node 3 by executing `./scripts/fedbiomed_run node config n3.ini list`
  * Run the node using `./scripts/fedbiomed_run node config n3.ini start `.

 Wait until you get `Connected with result code 0`. it means node is online.


In [None]:
%load_ext autoreload
%autoreload 2

In [5]:
import numpy as np
from fedbiomed.researcher.environ import TMP_DIR
import tempfile
# Temporary directory created under the researcher's TMP_DIR; keep a
# reference to `tmp_dir_model`, the directory is removed once the object is
# cleaned up.
tmp_dir_model = tempfile.TemporaryDirectory(dir=TMP_DIR+'/')
# Path of the Python file that will hold the model class (written by the
# `%%writefile "$model_file"` cell and given to the experiment as model_path).
model_file = tmp_dir_model.name + '/fed_mv_ppca.py'

Hereafter the template of the class you should provide to Fedbiomed :
       
**training_data**: you must return here a tuple (X, X_k, ViewsX, y) or (X, X_k, ViewsX). Note that all centers should provide a dataset with the same view-specific columns. If a view has not been observed in a specific center, the corresponding columns are filled with NaN. The training_data method takes care of identifying the view-specific sub-datasets and of collecting the information about non-available observations. Data can also be normalized here.

In [6]:
%%writefile "$model_file"

from fedbiomed.common.ppca import PpcaPlan

import numpy as np
import pandas as pd


class Fed_MV_PPCA(PpcaPlan):
    """Training plan for federated multi-view PPCA reading a multi-view CSV file."""

    def __init__(self, kwargs):
        super().__init__(kwargs)
        # import statements declared as dependencies of this plan
        self.add_dependency(['import numpy as np',
                             'import pandas as pd'])
        self.multi_view = True

    def training_data(self):
        """
            Perform in this method all data reading and data transformations you need.

            The dataset stored at `self.dataset_path` is read as a multi-view
            dataframe (two-level column header) and every column but the last
            one is returned; the last column (the labels in this notebook's
            datasets) is dropped since it is not needed for the optimization.

            NOTE(review): the text of this notebook describes a returned tuple
            (X_obs, Xk, ViewsX, y); this implementation returns the feature
            dataframe only. Presumably the multi-view PpcaPlan splits the
            views itself - to be confirmed against PpcaPlan.

            :return pandas dataframe of features
        """
        dataset = self.load_multi_view_dataframe(self.dataset_path)
        return dataset.iloc[:, :-1]

    def load_multi_view_dataframe(self, file_name: str) -> pd.DataFrame:
        """
            Read a comma-separated file whose first two lines form a two-level
            column header and whose first column is the row index.
            :return pandas dataframe
        """
        return pd.read_csv(file_name, delimiter=',', index_col=0, header=[0,1])
    

Writing /home/ybouilla/fedbiomed/var/tmp/tmpq50bovi8/fed_mv_ppca.py


        # Xk is a list containing the view-specific local datasets
        Xk = []
        ViewsX = []
        ind = 0
        for k in range(self.K):
            if X.iloc[:, ind].isnull().values.any():
                Xk.append(np.nan)
                #Xk.append('NaN')
                ViewsX.append(0)
            else:
                # if norm = true, data are normalized with min max scaler
                X_k = X.iloc[:, ind:ind + self.dim_views[k]]
                if self.is_norm:
                    X_k = self.normalize_data(X_k) 
                    
                    
                Xk.append(X_k)
                
                ViewsX.append(1)
            ind += self.dim_views[k]
            
        
        # The entire dataset is re-built without empty columns
        Xk_obs = [item for item in Xk if item is not np.nan]
        #Xk_obs = [item for item in Xk if type(item) is not str]
        X_obs = pd.concat(Xk_obs, axis=1)
        
        return (X_obs,Xk,ViewsX,y)

**model_args** is a dictionary containing the mv-ppca model arguments: the total number of views across all datasets (tot_views), the dimension of each view (dim_views), the latent space size (n_components), and a boolean (norm) for data preprocessing. Additionally, the researcher can provide priors for one or more global parameters.

**training_args** contains here the number of local iterations for EM/MAP. 

In [7]:
# Description of the views: names, number of features of each one, and count.
views_names = ['view_1', 'view_2', 'view_3']
dim_views = [15, 8, 10]
tot_views = 3

# Latent space size and normalization switch.
n_components = 4
norm = True

# Arguments of the mv-ppca model.
model_args = dict(tot_views=tot_views,
                  dim_views=dim_views,
                  views_names=views_names,
                  n_components=n_components,
                  is_norm=norm)

# better to increase log interval if number of iteration is higher
training_args = dict(n_iterations=15, log_interval=1)

# Tags used to select the datasets, and number of federated rounds.
tags = ['ppca_data']
rounds = 5

In [8]:
from fedbiomed.researcher.experiment import Experiment
from fedbiomed.researcher.aggregators.ppca_aggregator import MLaggregator



# select nodes participating to this experiment: the datasets are searched
# on the nodes by `tags`; the model class written to `model_file` is trained
# for `rounds` rounds and the node results are combined by MLaggregator
exp = Experiment(tags=tags,
                 #clients=None,
                 model_path=model_file,
                 model_args=model_args,
                 model_class='Fed_MV_PPCA',
                 training_args=training_args,
                 rounds=rounds,
                 aggregator=MLaggregator(),
                 client_selection_strategy=None,
                 tensorboard=True)

2021-11-09 15:54:03,066 fedbiomed INFO - Messaging researcher_77c643fd-ce6e-42c3-8a2e-79b96ff91e24 successfully connected to the message broker, object = <fedbiomed.common.messaging.Messaging object at 0x7f1f1fd7dac0>
2021-11-09 15:54:03,077 fedbiomed INFO - Searching dataset with data tags: ['ppca_data'] for all nodes
2021-11-09 15:54:03,079 fedbiomed INFO - log from: node_66231194-c0a2-45f7-ac58-ebaa938ffdfc - DEBUG Message received: {'researcher_id': 'researcher_77c643fd-ce6e-42c3-8a2e-79b96ff91e24', 'tags': ['ppca_data'], 'command': 'search'}
2021-11-09 15:54:13,159 fedbiomed INFO - Removing tensorboard logs from previous experiment


In [9]:
from fedbiomed.researcher.requests import Requests

# Ask the nodes for the datasets they expose; with verbose=True the result
# is also logged as a table (see the output below).
req = Requests()
datasets = req.list(verbose=True)

2021-11-09 15:54:16,858 fedbiomed INFO - Listing available datasets in all nodes... 
2021-11-09 15:54:16,862 fedbiomed INFO - log from: node_66231194-c0a2-45f7-ac58-ebaa938ffdfc - DEBUG Message received: {'researcher_id': 'researcher_77c643fd-ce6e-42c3-8a2e-79b96ff91e24', 'command': 'list'}
2021-11-09 15:54:26,874 fedbiomed INFO - 
 Node: node_66231194-c0a2-45f7-ac58-ebaa938ffdfc | Number of Datasets: 1 
+--------+-------------+---------------+---------------+-----------+--------------+
| name   | data_type   | tags          | description   | shape     | multi_view   |
|        | csv         | ['ppca_data'] |               | [215, 35] | multi_view   |
+--------+-------------+---------------+---------------+-----------+--------------+



In [10]:
from fedbiomed.researcher.environ import TENSORBOARD_RESULTS_DIR

In [None]:
ind = self.view[x]%load_ext tensorboard

The following cell launches tensorboard on `TENSORBOARD_RESULTS_DIR`. Since the experiment has not started yet, it will not show any results. After running the experiment, you can click the refresh button to see the changes.

In [None]:
tensorboard --logdir "$TENSORBOARD_RESULTS_DIR"

In [11]:
# start federated training
exp.run()

2021-11-09 15:54:31,492 fedbiomed INFO - Sampled clients in round 0 ['node_66231194-c0a2-45f7-ac58-ebaa938ffdfc']
2021-11-09 15:54:31,497 fedbiomed INFO - Send message to client node_66231194-c0a2-45f7-ac58-ebaa938ffdfc - {'researcher_id': 'researcher_77c643fd-ce6e-42c3-8a2e-79b96ff91e24', 'job_id': '9825b4ae-111a-4083-851a-1278a0603a4b', 'training_args': {'n_iterations': 15, 'log_interval': 1}, 'model_args': {'tot_views': 3, 'dim_views': [15, 8, 10], 'views_names': ['view_1', 'view_2', 'view_3'], 'n_components': 4, 'is_norm': True}, 'command': 'train', 'model_url': 'http://localhost:8844/media/uploads/2021/11/09/my_model_ada533c7-979e-4e4d-9aac-79653355cad8.py', 'params_url': 'http://localhost:8844/media/uploads/2021/11/09/my_model_b9069c57-c9ef-4a92-ac0a-a03ff02908a9.pt', 'model_class': 'Fed_MV_PPCA', 'training_data': {'node_66231194-c0a2-45f7-ac58-ebaa938ffdfc': ['dataset_607b91c2-e92c-4703-82d5-b9c24687f40b']}}
2021-11-09 15:54:31,499 fedbiomed DEBUG - researcher_77c643fd-ce6e-42c3

2021-11-09 15:54:41,609 fedbiomed INFO - log from: node_66231194-c0a2-45f7-ac58-ebaa938ffdfc - INFO {'monitor': <fedbiomed.node.history_monitor.HistoryMonitor object at 0x7f5d2d06d1c0>, 'n_iterations': 15, 'log_interval': 1}
2021-11-09 15:54:41,611 fedbiomed INFO - log from: node_66231194-c0a2-45f7-ac58-ebaa938ffdfc - DEBUG Dataset_path/home/ybouilla/fedbiomed/notebooks/== Local path to node1.csv
2021-11-09 15:54:41,612 fedbiomed INFO - log from: node_66231194-c0a2-45f7-ac58-ebaa938ffdfc - DEBUG is Dataset  multi view ? True
2021-11-09 15:54:41,763 fedbiomed INFO - log from: node_66231194-c0a2-45f7-ac58-ebaa938ffdfc - INFO Iteration: [0/15]	Expected LL: 7568.893796
2021-11-09 15:54:41,861 fedbiomed INFO - log from: node_66231194-c0a2-45f7-ac58-ebaa938ffdfc - INFO Iteration: [1/15]	Expected LL: 7640.148580
2021-11-09 15:54:41,958 fedbiomed INFO - log from: node_66231194-c0a2-45f7-ac58-ebaa938ffdfc - INFO Iteration: [2/15]	Expected LL: 7663.325062
2021-11-09 15:54:42,082 fedbiomed INFO -

2021-11-09 15:54:53,184 fedbiomed INFO - log from: node_66231194-c0a2-45f7-ac58-ebaa938ffdfc - INFO Iteration: [13/15]	Expected LL: 7773.583931
2021-11-09 15:54:53,279 fedbiomed INFO - log from: node_66231194-c0a2-45f7-ac58-ebaa938ffdfc - INFO Iteration: [14/15]	Expected LL: 7773.583933
2021-11-09 15:54:53,309 fedbiomed INFO - log from: node_66231194-c0a2-45f7-ac58-ebaa938ffdfc - INFO results uploaded successfully 
2021-11-09 15:55:01,675 fedbiomed INFO - Downloading model params after training on node_66231194-c0a2-45f7-ac58-ebaa938ffdfc - from http://localhost:8844/media/uploads/2021/11/09/node_params_b7322f8f-1f8b-42ae-9ed2-c83ef7bf03ce.pt
2021-11-09 15:55:01,703 fedbiomed INFO - Clients that successfully reply in round 2 ['node_66231194-c0a2-45f7-ac58-ebaa938ffdfc']
2021-11-09 15:55:01,737 fedbiomed INFO - Sampled clients in round 3 ['node_66231194-c0a2-45f7-ac58-ebaa938ffdfc']
2021-11-09 15:55:01,738 fedbiomed INFO - Send message to client node_66231194-c0a2-45f7-ac58-ebaa938ffdfc

2021-11-09 15:55:11,806 fedbiomed INFO - log from: node_66231194-c0a2-45f7-ac58-ebaa938ffdfc - DEBUG [TASKS QUEUE] Item:{'researcher_id': 'researcher_77c643fd-ce6e-42c3-8a2e-79b96ff91e24', 'job_id': '9825b4ae-111a-4083-851a-1278a0603a4b', 'params_url': 'http://localhost:8844/media/uploads/2021/11/09/researcher_params_6c32ee5c-17a3-49f9-a49e-58b2f99225f7.pt', 'training_args': {'n_iterations': 15, 'log_interval': 1}, 'training_data': {'node_66231194-c0a2-45f7-ac58-ebaa938ffdfc': ['dataset_607b91c2-e92c-4703-82d5-b9c24687f40b']}, 'model_args': {'tot_views': 3, 'dim_views': [15, 8, 10], 'views_names': ['view_1', 'view_2', 'view_3'], 'n_components': 4, 'is_norm': True}, 'model_url': 'http://localhost:8844/media/uploads/2021/11/09/my_model_ada533c7-979e-4e4d-9aac-79653355cad8.py', 'model_class': 'Fed_MV_PPCA', 'command': 'train'}
2021-11-09 15:55:11,818 fedbiomed INFO - log from: node_66231194-c0a2-45f7-ac58-ebaa938ffdfc - INFO {'monitor': <fedbiomed.node.history_monitor.HistoryMonitor objec

In [12]:
# Aggregated parameters are stored per round, keyed by the round number.
_aggregated = exp.aggregated_params
print("\nList the training rounds : ", _aggregated.keys())

# Entry of the last round: path of the saved parameters and the parameters.
_last_round = _aggregated[rounds - 1]
print("\nAccess the federated params for the last training round :")
print("\t- params_path: ", _last_round['params_path'])
print("\t- parameter data: ", _last_round['params'].keys())


List the training rounds :  dict_keys([0, 1, 2, 3, 4])

Access the federated params for the last training round :
	- params_path:  /home/ybouilla/fedbiomed/var/tmp/researcher_params_8f28b090-387b-41d5-8064-4a65b0d60512.pt
	- parameter data:  dict_keys(['tilde_muk', 'tilde_Wk', 'tilde_Sigma2k', 'Alpha', 'Beta', 'sigma_til_muk', 'sigma_til_Wk', 'sigma_til_sigma2k'])


## Test

Hereafter we test the performance of the aggregated parameters on a test dataset. In particular, for each round we use the global parameters to evaluate the mean absolute error and the separation in the latent space using LDA. Note that we have already defined the test dataset at the beginning of this notebook.

In [13]:

import numpy as np
import pandas as pd
from numpy.linalg import solve

from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error




def eval_MB(Wk, q, D_i, K, Sigma2, ViewsX):
    """
    Computes matrices M:=inv(I_q+sum_k Wk.TWk/sigma2k) and
    B:= [W1.T/sigma21,...,WK.T/sigma2K], restricted to the observed views.
    :param Wk: list of matrices, Wk[k] is reshaped to (D_i[k] x q)
    :param q: latent space dimension
    :param D_i: list with the number of features of each view
    :param K: number of views
    :param Sigma2: list of float > 0
    :param ViewsX: list of 0/1 flags, 1 when view k is observed
    :return np.arrays M (q x q) and B (q x total number of observed features)
    :raise ValueError if no view is observed
    """
    first = ViewsX.index(1)
    # the first observed view is always used, later ones only when flagged
    observed = [first] + [k for k in range(first + 1, K) if ViewsX[k] == 1]

    gram_sum = None
    scaled_loadings = []
    for k in observed:
        W_k = Wk[k].reshape(D_i[k], q)
        gram_k = W_k.T.dot(W_k) / Sigma2[k]
        gram_sum = gram_k if gram_sum is None else gram_sum + gram_k
        scaled_loadings.append(W_k.T / Sigma2[k])

    B = np.concatenate(scaled_loadings, axis=1)
    # inverse obtained by solving against the identity
    M = solve(np.eye(q) + gram_sum, np.eye(q))

    return M, B

def concat_params(park: Union[Dict[Union[str, int], np.ndarray],
                              List[np.ndarray]],
                  K, ViewsX: List[int],
                  views_iterator: Iterator =None) -> np.ndarray:
    """
    This function stacks, along the first axis, the parameters of the
    observed views.
    :param park: vectors/matrices to concatenate, indexed by the entries of
    views_iterator (or by view position when views_iterator is None)
    :param K: number of views
    :param ViewsX: list of 0/1 flags, 1 when view k is observed
    :param views_iterator: sequence of the keys of park, in view order
    :return np.array (the stored array itself when a single view is observed)
    :raise ValueError if no view is observed
    """
    keys = range(K) if views_iterator is None else views_iterator
    first = ViewsX.index(1)

    stacked = park[keys[first]]
    later_observed = [park[key]
                      for position, key in zip(range(first + 1, K), keys[first + 1:])
                      if ViewsX[position] == 1]
    if later_observed:
        stacked = np.concatenate([stacked, *later_observed], axis=0)

    return stacked

def simu_latent(ppca, q,dataset,ViewsX,global_params, views_iterator: Iterator = None):
    """
    This function maps every row t_n of the dataset to the latent space with
    the global parameters, as x_n = M.B.(t_n - mu), where M and B are given
    by ppca.eval_MB (presumably the posterior mean of the latent variable).
    :param ppca: object providing eval_MB(Wk, Sigma2, ViewsX)
    :param q: latent space dimension
    :param dataset: pandas dataframe holding the observed views only
    :param ViewsX: list of 0/1 flags, 1 when view k is observed
    :param global_params: dictionary with entries 'tilde_muk', 'tilde_Wk'
    and 'tilde_Sigma2k'
    :param views_iterator: sequence of the view keys, in view order
    :return pandas dataframe with one row per sample and q columns, indexed
    like the dataset
    """
    n_samples, n_features = dataset.shape
    n_views = len(ViewsX)

    offset = concat_params(global_params['tilde_muk'], n_views, ViewsX, views_iterator)
    M, B = ppca.eval_MB(global_params['tilde_Wk'],  global_params['tilde_Sigma2k'], ViewsX)

    # the projection matrix is the same for every sample
    projection = M.dot(B)
    latent_rows = []
    for n in range(n_samples):
        centered = dataset.iloc[n].values.reshape(n_features, 1) - offset
        latent_rows.append(projection.dot(centered).reshape(1, q))

    return pd.DataFrame(np.vstack(latent_rows), index=dataset.index)

def MAE(ppca, dataset,ViewsX,q,global_params, views_iterator):
    """
    This function evaluates the MAE between the dataset and its
    reconstruction W.x_n + mu, where x_n = M.B.(t_n - mu) is computed with
    the global parameters.
    :param ppca: object providing eval_MB(Wk, Sigma2, ViewsX)
    :param dataset: pandas dataframe holding the observed views only
    :param ViewsX: list of 0/1 flags, 1 when view k is observed
    :param q: latent space dimension
    :param global_params: dictionary with entries 'tilde_muk', 'tilde_Wk'
    and 'tilde_Sigma2k'
    :param views_iterator: sequence of the view keys, in view order
    :return float
    """
    n_samples, n_features = dataset.shape
    n_views = len(ViewsX)

    mu = concat_params(global_params['tilde_muk'], n_views, ViewsX, views_iterator)
    W = concat_params(global_params['tilde_Wk'], n_views, ViewsX, views_iterator)
    M, B = ppca.eval_MB(global_params['tilde_Wk'],  global_params['tilde_Sigma2k'], ViewsX)

    # the projection matrix is the same for every sample
    projection = M.dot(B)
    reconstructions = []
    for n in range(n_samples):
        observed_row = dataset.iloc[n].values.reshape(n_features, 1)
        latent = projection.dot(observed_row - mu).reshape(q, 1)
        reconstructions.append((W.dot(latent) + mu).reshape(n_features))

    return mean_absolute_error(dataset.values.tolist(), reconstructions)


In [14]:
from fedbiomed.common.ppca import PpcaPlan

# Local plan instance, used below only for its helpers (eval_MB in
# simu_latent / MAE, normalize_data in test_data), not for training.
ppca = PpcaPlan(model_args)
# set by hand here; NOTE(review): presumably done by the framework on the nodes
ppca.is_multi_view = True
ppca.views_iterator = views_names

In [15]:
# Test dataset
# The seed call had been glued to the comment above and was no longer run.
np.random.seed(100)

# Same layout as the one used for the data generation.
D_i = [15, 8, 10]
nb_group = 2
n_centers = 3
testing_samples = 40

n_components_generated = 4


def test_data(ppca, dataset,norm,K,dim_views):
    """
        Equivalent to training_data, for the test dataset.

        The last column of the dataset is taken as the labels; the other
        columns are cut into K consecutive views of dim_views[k] columns.
        A view is considered absent when its first column contains a
        missing value.

        :param ppca: object providing normalize_data (used only if norm)
        :param dataset: pandas dataframe, features first and labels last
        :param norm: if true, each observed view goes through
        ppca.normalize_data
        :param K: number of views
        :param dim_views: list with the number of columns of each view
        :return tuple (X_obs, Xk, ViewsX, y): X_obs the observed views side
        by side, Xk the list of per-view dataframes (np.nan for an absent
        view), ViewsX the list of 0/1 observation flags, y the labels
    """
    features = dataset.iloc[:, :-1]
    y = dataset[dataset.columns[-1]]

    Xk, ViewsX = [], []
    start = 0
    for k in range(K):
        first_column_has_nan = features.iloc[:, start].isnull().values.any()
        stop = start + dim_views[k]
        if first_column_has_nan:
            Xk.append(np.nan)
            ViewsX.append(0)
        else:
            view_block = features.iloc[:, start:stop]
            if norm:
                view_block = ppca.normalize_data(view_block)
            Xk.append(view_block)
            ViewsX.append(1)
        start = stop

    # The entire dataset is re-built without the absent views
    X_obs = pd.concat([block for block, seen in zip(Xk, ViewsX) if seen], axis=1)

    return (X_obs,Xk,ViewsX,y)

In [16]:
from sklearn import preprocessing

# Evaluate the aggregated parameters of each round on the training datasets
# of the centers and on the test dataset.
# NOTE(review): the loop starts at 1, so the parameters of round 0 are not
# evaluated - confirm this is intended.
for r in range(1,rounds):
    global_params = exp.aggregated_params[r]['params']

    ######## Train data
    MAE_train = []
    # DataFrame.append / Series.append no longer exist in pandas 2: the
    # per-center pieces are collected and concatenated once.
    _latent_parts, _label_parts = [], []
    for c in range(n_centers):
        dataset_c = load_multi_view_dataframe('== Local path to node' + str(c+1) + '.csv')

        X_obs_c,Xk_c,ViewsX_c,y_c = test_data(ppca, dataset_c,norm,tot_views,dim_views)
        # Dataframe of latent space for LDA
        _latent_parts.append(simu_latent(ppca, n_components,X_obs_c,ViewsX_c,global_params,
                                         views_name))
        _label_parts.append(y_c)
        # MAE Train
        MAE_train.append(MAE(ppca, X_obs_c,ViewsX_c,n_components,global_params, views_name))

    Latent_Train = pd.concat(_latent_parts) if _latent_parts else pd.DataFrame()
    Label_Train = pd.concat(_label_parts) if _label_parts else pd.Series(dtype='int64')

    ######## Test data
    X_obs_test,Xk_test,ViewsX_test,y_test = test_data(ppca, t_test,norm,tot_views,dim_views)

    # Dataframe of latent space for LDA
    Latent_Test = pd.concat([simu_latent(ppca, n_components,X_obs_test,ViewsX_test,global_params,views_name)])
    Label_Test = pd.concat([y_test])
    # MAE Test
    MAE_test = MAE(ppca, X_obs_test,ViewsX_test,n_components,global_params, views_name)

    print('Round {}:'.format(r))
    # the three pieces reproduce the spacing of the report line exactly
    _report = ('MAE train (mean,std) = ({:.4f},{:.4f}) '
               '    \t MAE test = {:.4f} '
               '    ')
    print(_report.format(np.mean(np.array(MAE_train)),
                         np.std(np.array(MAE_train)), MAE_test))

Round 1:
MAE train (mean,std) = (0.0361,0.0049)     	 MAE test = 0.0460     
Round 2:
MAE train (mean,std) = (0.0357,0.0049)     	 MAE test = 0.0455     
Round 3:
MAE train (mean,std) = (0.0354,0.0049)     	 MAE test = 0.0451     
Round 4:
MAE train (mean,std) = (0.0351,0.0049)     	 MAE test = 0.0448     


In [None]:
ppca.views_iterator

In [None]:
ppca.eval_MB(global_params['tilde_Wk'],  global_params['tilde_Sigma2k'],ViewsX_c)

In [None]:
ViewsX_c[1]

In [None]:
class MyPPCA:
    """Stand-alone copy of the helpers needed for evaluation (eval_MB and
    normalize_data), with the model settings of this notebook hard-coded."""

    def __init__(self):
        self.n_components = 4
        self.dim_views = [15, 8, 10]
        self.views_names = ['view_1', 'view_2', 'view_3']
        self.is_multi_view = True
        self.K = 3
        self.views_iterator = self.views_names

    def eval_MB(self, Wk, Sigma2, ViewsX):
        """
        Computes matrices M:=inv(I_q+sum_k Wk.TWk/sigma2k) and
        B:= [W1.T/sigma21,...,WK.T/sigma2K], restricted to the observed views.
            :param Wk: matrices indexed by view key, reshaped to (d_k x q)
            :param Sigma2: floats > 0 indexed by view key
            :param ViewsX: list of 0/1 flags, 1 when view k is observed
            :return np.arrays M and B
            :raise ValueError if no view is observed
        """
        q = self.n_components
        D_i = self.dim_views
        index = ViewsX.index(1)  # position of the first observed view
        # TODO: self.index has not been defined (to be solved with self.ViewsX)
        # TODO: handle case where there is only one view

        # in multi view mode the parameters are keyed by view name
        index_name = self.views_names[index] if self.is_multi_view else index

        def _view_terms(position, key):
            # contribution of one view to the sum defining M, and its block of B
            W_k = Wk[key].reshape(D_i[position], q)
            return W_k.T.dot(W_k) / Sigma2[key], W_k.T / Sigma2[key]

        # first computation of M and B
        M1, B = _view_terms(index, index_name)

        # NOTE(review): the following views are always keyed through
        # views_iterator, also when is_multi_view is False.
        for k, k_name in zip(range(index + 1, self.K),
                             self.views_iterator[index+1:]):
            if ViewsX[k] == 1:
                gram_k, B_k = _view_terms(k, k_name)
                M1 += gram_k
                B = np.concatenate((B, B_k), axis=1)

        # inverse obtained by solving against the identity
        M = solve(np.eye(q) + M1, np.eye(q))
        return M, B

    def normalize_data(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """
        This function normalize the dataset X using min max scaler.
            :param dataframe: pandas dataframe to normalize
            :return normalized pandas dataframe norm_dataset, with the same
            index; column names are stripped in single view mode
            :raise ValueError if the values cannot be scaled (the original
            error is attached as the cause)
        """
        if self.is_multi_view:
            col_name = dataframe.columns
        else:
            col_name = [col.strip() for col in list(dataframe.columns)]
        x = dataframe.values  # returns a numpy array

        min_max_scaler = preprocessing.MinMaxScaler()
        try:
            x_scaled = min_max_scaler.fit_transform(x)
        except ValueError as value_error:
            raise ValueError(str(value_error) + "\nHint: this error can occur if headers are badly parsed"\
                             + "(eg using multiview datasests in single view mode)") from value_error
        norm_dataset = pd.DataFrame(x_scaled,
                                    index=dataframe.index,
                                    columns=col_name)
        return norm_dataset

In [None]:
ppca = MyPPCA()


In [None]:
i1 = {'view_1': np.array([[-0.13802021,  0.32325862,  0.02833382,  0.22127967],
       [-0.06671564, -0.14654803, -0.06791536, -0.13937345],
       [ 0.09402419, -0.36234396, -0.07688558, -0.26407766],
       [-0.16543944,  0.20462959,  0.03944648,  0.10994949],
       [ 0.04664261,  0.16706105, -0.03427558,  0.06172403],
       [ 0.10766845, -0.34412815, -0.07302516, -0.19504473],
       [ 0.00201343, -0.07822788,  0.10469188, -0.12583385],
       [-0.07453472,  0.3735997 ,  0.01634467,  0.21436581],
       [-0.15595287,  0.09756261,  0.07502573,  0.05922501],
       [-0.03591935,  0.31898374, -0.02451741,  0.19517161],
       [ 0.17276584, -0.1106362 ,  0.00502202, -0.09095508],
       [ 0.09350993, -0.35664213, -0.07014446, -0.26297889],
       [ 0.01240203, -0.26298395, -0.10201349, -0.12131298],
       [ 0.08251615, -0.29594432,  0.03587987, -0.21626601],
       [-0.09750903,  0.08573494,  0.10886528, -0.01879524]]), 'view_2': np.array([[ 0.1181572 , -0.31047394, -0.03793795, -0.23783581],
       [ 0.08186203,  0.06473819,  0.00565074,  0.13703523],
       [ 0.01005903,  0.04468327,  0.0960204 , -0.11309241],
       [ 0.06230421, -0.34993475, -0.08847052, -0.19965678],
       [-0.06335369,  0.33786985,  0.1006056 ,  0.21450211],
       [-0.14181009,  0.29836918,  0.00047697,  0.19113181],
       [-0.06458798, -0.08084305, -0.13499445, -0.05638773],
       [ 0.09553599, -0.35066934, -0.03705256, -0.24508008]]), 'view_3': np.array([[-0.0592363 , -0.18025771, -0.02871543, -0.16276167],
       [-0.07577988, -0.09387132,  0.06195105, -0.0332227 ],
       [ 0.15634336, -0.05330637, -0.01126681,  0.03777153],
       [-0.0799387 ,  0.13625053, -0.02817348,  0.21059086],
       [-0.0221396 ,  0.30382939,  0.08784197,  0.17423392],
       [ 0.02302336,  0.17093075,  0.08027848,  0.17710022],
       [-0.12722318,  0.344386  ,  0.02886857,  0.21335018],
       [ 0.14454919, -0.2734921 , -0.00795633, -0.1992943 ],
       [ 0.07323909, -0.22176914,  0.03608349, -0.20353649],
       [-0.00463341,  0.17679149,  0.14989779,  0.06862934]])}

i2 = {'view_1': 0.045311805078432264, 'view_2': 0.049040873444794096, 'view_3': 0.035635074892257916}

In [None]:
ppca.eval_MB(i1, i2, [1,1,1])

In [None]:
global_params

In [None]:
global_params['tilde_Wk']

In [None]:
test_data(ppca, dataset_c,norm,tot_views,dim_views)