# Fedbiomed Researcher to train a federated PPCA (Probabilistic PCA) model.

## Description of the exercise :

Three datasets `n1.csv`, `n2.csv` and `n3.csv` will be generated randomly using 3-view PPCA from a 4-dimensional latent space, with view dimensions [15, 8, 10] and 2 groups. We will then distribute the 3 datasets to 3 distinct nodes and use Fed-mv-PPCA. In each center we check the evolution of the expected log-likelihood (LL) during training.

## Data Generation

We will generate three datasets using mv-PPCA.
Then save them in a path of your choice on your machine.

In [1]:
import numpy as np
import pandas as pd
from typing import List, Union, Dict

def sample_x_n(N:int, q:int, random_state:int=None):
    """Draw a matrix of independent standard Gaussian samples.

    Params:
    :N: number of rows of the returned array
    :q: number of columns of the returned array
    :random_state: seed of the dedicated random generator (None means unseeded)

    Returns:
    :np.ndarray of shape (N, q)
    """
    # a private generator is used so the global numpy random state is untouched
    generator = np.random.RandomState(random_state)
    samples = generator.randn(N, q)
    return samples

def generate_data(N_g: List[int],
                  W: np.ndarray,
                  a_g: np.ndarray,
                  mu: Union[float, np.ndarray],
                  sigma2: float,
                  x_n: np.ndarray,
                  view: int,
                  random_state=None):
    """Generate a Gaussian dataset for one view, made of several groups of samples.

    For every sample n belonging to group g the generative process is:

        y_n = W (x_n + a_g[g]) + mu + epsilon,   epsilon ~ N(0, sigma2)

    where x_n is the latent variable of the sample and a_g[g] the latent shift
    of its group.

    Params:
    :N_g: number of samples to generate per group (list or array of length nb_group)
    :W: reconstruction matrix, of size (n_features, q)
    :a_g: array of size (nb_group, q); row g is the latent shift applied to group g
    :mu: offset of the dataset (scalar, or anything broadcastable to (n_samples, n_features))
    :sigma2: variance of the additive Gaussian noise
    :x_n: latent variables, one row per sample; rows are consumed group after group
    :view: (int) id of the view, only used to build the column names
    :random_state: random seed of the noise generator, for reproducibility

    Returns:
    :Y (pd.DataFrame): synthetic dataset of size (n_samples, n_features), with
    columns named 'var_<view>,<feature index starting at 1>'
    """
    rnd = np.random.RandomState(random_state)

    # accept plain lists as advertised by the annotation (a list has no .sum())
    N_g = np.asarray(N_g, dtype=np.int64)

    N = N_g.sum()
    d, q = W.shape
    sigma = np.sqrt(sigma2)
    G = len(N_g)

    # g_ind[g]:g_ind[g+1] is the row range occupied by group g
    g_ind = np.concatenate((np.zeros(1, dtype=np.int64), np.cumsum(N_g)))
    print(g_ind)

    y_n = np.empty((N, d))

    for g in range(G):
        rows = slice(g_ind[g], g_ind[g + 1])
        # noiseless part: W applied to the shifted latent variables, plus offset
        y_n[rows] = np.einsum("dq,nq->nd", W, x_n[rows] + a_g[g]) + mu

    y_n = pd.DataFrame(data=y_n,
                       columns=[f'var_{view},{i + 1}' for i in range(d)])

    # additive isotropic Gaussian noise with standard deviation sqrt(sigma2)
    return y_n + sigma * rnd.randn(N, d)



def generate_ppca_nodes_dataset(n_nodes: int,
                                n_features: Union[List[int], int],
                                n_components: int,
                                n_group: int = 2,
                                absent_view: Dict[str, int] = None,
                                W_init: List[np.ndarray] = None,
                                mu_init: List[np.ndarray] = None,
                                sigma_init: List[np.ndarray] = None,
                                is_validation: bool = True,
                                n_sample_validation: int = None):
    """
    Generate a synthetic dataset for each node.

    Only the initialisation of the PPCA parameters is implemented so far.

    Params:
    :n_nodes: number of parameter sets to draw when none are supplied
    :n_features: number of features per entry; a single int is reused for every entry
    :n_components: size of the latent space
    :n_group: number of groups; one latent shift is drawn per group (first one is zero)
    :absent_view: not used by the visible code
    :W_init: reconstruction matrices; drawn uniformly in [-10, 10) when None
    :mu_init: offsets; drawn uniformly in [-10, 10) when None
    :sigma_init: noise parameters; set to 1 for every entry when None
    :is_validation: not used by the visible code
    :n_sample_validation: not used by the visible code

    NOTE(review): the body looks unfinished - the parameters drawn below are
    neither used nor returned, so the function currently returns None.
    """
    # the annotation allows a single int: use the same dimension everywhere
    if isinstance(n_features, int):
        n_features = [n_features] * n_nodes

    # generate PPCA parameters if not defined
    ## case where W parameter is not defined
    if W_init is None:
        W_init = [np.random.uniform(-10, 10, (n_features[i], n_components))
                  for i in range(n_nodes)]
    ## case where mu is not defined
    if mu_init is None:
        mu_init = [np.random.uniform(-10, 10, n_features[i])
                   for i in range(n_nodes)]

    ## case where sigma is not defined (sigma is set to 1 for each entry)
    if sigma_init is None:
        sigma_init = [1] * n_nodes

    # group shifts in the latent space: the first group is the un-shifted reference.
    # The parameter is `n_group`; the previous code read an undefined `nb_group`.
    shift = np.concatenate((np.zeros((1, n_components)),
                            np.random.uniform(-10, 10, (n_group - 1, n_components))))

In [10]:
from typing import Dict, Union

def create_multi_view_dataframe(datasets: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    """Concatenate several view-specific dataframes into one multi-index dataframe.

    The columns of the result form a two-level header: level 'views' holds the
    key of the view in `datasets`, level 'feature_name' holds the original
    column name inside that view.

    Params:
    :datasets: mapping view name -> dataframe of that view; all dataframes must
    have the same number of rows

    Returns:
    :pd.DataFrame with the views side by side, in the iteration order of `datasets`

    Raises:
    :ValueError: if the views do not all have the same number of samples
    """
    _header_labels = ['views', 'feature_name']
    # 1. create multiindex header

    _feature_name_array = np.array([])  # store all feature names
    _view_name_array = []  # store all views (ie modalities) names

    # None until the first view has been seen. Testing the length of the array
    # instead would mistake a first view with zero rows for "nothing seen yet".
    _concatenated_datasets = None

    for key, view_df in datasets.items():
        _feature_name_array = np.concatenate([_feature_name_array,
                                              view_df.columns.values])
        if _concatenated_datasets is None:
            # first pass
            _concatenated_datasets = view_df.to_numpy()
        else:
            try:
                _concatenated_datasets = np.concatenate([_concatenated_datasets,
                                                         view_df.to_numpy()],
                                                        axis=1)
            except ValueError as val_err:
                # catching case where the numbers of samples differ; the numpy
                # error is kept as the cause to ease debugging
                raise ValueError('Cannot create multi view dataset: different number of samples have been detected') from val_err
        # one view label per feature of this view
        _view_name_array.extend([key] * len(view_df.columns.values))
        print(_feature_name_array)

    if _concatenated_datasets is None:
        # no view at all: same empty payload as before
        _concatenated_datasets = np.array([])

    _header = pd.MultiIndex.from_arrays([_view_name_array, _feature_name_array],
                                        names=_header_labels)

    print(_concatenated_datasets)

    # 2. create multi index dataframe
    multi_view_df = pd.DataFrame(_concatenated_datasets,
                                 columns=_header)
    return multi_view_df


def save_multi_view_dataframe(dataframe: pd.DataFrame, file_name: str):
    """Write `dataframe` to `file_name` as CSV, using pandas' default options
    (the index and every header level are written)."""
    dataframe.to_csv(path_or_buf=file_name)
    
def load_multi_view_dataframe(file_name: str) -> pd.DataFrame:
    """Read a comma-separated file whose first two rows are the column header
    levels and whose first column is the row index."""
    return pd.read_csv(file_name, sep=',', header=[0, 1], index_col=0)




In [5]:
# Seed the global numpy generator: every np.random.* draw below (parameters,
# group shifts, group sizes) depends on it and on the order of the calls.
np.random.seed(100)

D_i = [15, 8, 10]  # number of features of each view
nb_group = 2  # number of groups of samples
n_centers = 3  # number of per-center CSV files written by the loop below
testing_samples = 40  # total number of samples of the test dataset

n_components_generated = 4  # latent dimension used for the generation


# initializing PPCA variables
# one noise variance, one reconstruction matrix and one offset per view
sigma2_gen1, sigma2_gen2, sigma2_gen3 = 2, 1, 3
W_gen1 = np.random.uniform(-10, 10, (D_i[0], n_components_generated))
W_gen2 = np.random.uniform(-5, 5, (D_i[1], n_components_generated))
W_gen3 = np.random.uniform(-15, 15, (D_i[2], n_components_generated))
mu_gen1 = np.random.uniform(-10, 10, D_i[0])
mu_gen2 = np.random.uniform(-5, 5, D_i[1])
mu_gen3 = np.random.uniform(-15, 15, D_i[2])

# latent shift of each group: the first group is not shifted, the others get
# a random shift
a_g_gen = np.concatenate((np.zeros((1, n_components_generated)),
                          np.random.uniform(-10, 10, (nb_group - 1, n_components_generated))))

# per-view parameter lists; note that `sigma` holds the variances (sigma2_gen*),
# which is what generate_data expects as its `sigma2` argument
W = [W_gen1,W_gen2,W_gen3]
mu = [mu_gen1,mu_gen2,mu_gen3]
sigma = [sigma2_gen1,sigma2_gen2,sigma2_gen3]

# absent_views contains as key the id of a center in which we want to simulate absent views
# (1-based, as a string), and as value the id of the missing view (1-based).
absent_views = {'2': 2}

for i in range(n_centers):
    # group sizes of this center, each drawn in [25, 300)
    N_g = np.array([np.random.randint(25,300) for _ in range(nb_group)]) 
    # N_g = np.array([np.random.randint(25,300) for _ in range(nb_group)])?
    #g_ind = np.concatenate((np.zeros(1, dtype=np.int64), np.cumsum(N_g)))
    N = N_g.sum()
    # NOTE(review): the seeds passed here (150) and to generate_data (250) are
    # the same for every center, so all centers draw from identical random
    # streams - confirm this is intended.
    x_n_gen = sample_x_n(N, n_components_generated, random_state=150)  # randomly generate a Gaussian
    # dataset
    #Y = []
    Y = {}  # view name -> dataframe of that view for the current center
    for d in range(len(D_i)):
        y_t = generate_data(N_g, W[d], a_g_gen, mu[d], sigma[d], x_n_gen, view = d+1, random_state=250)
        # view d+1 is declared absent for center i+1: keep the generated data in
        # absent_views (the entry is then no longer an int, so this branch is
        # taken only once) and store an all-NaN frame with the same columns
        if ((str(i+1) in absent_views.keys()) \
            and (type(absent_views[str(i+1)])== int) \
            and (absent_views[str(i+1)]==d+1)):
            absent_views.update({str(i+1): y_t})
            y_abs=pd.DataFrame(np.nan, index = np.arange(N_g.sum()), \
                               columns = [f'var_{d+1},{i + 1}' for i in range(D_i[d])])
            #Y.append(y_abs)
            Y['view_' + str(d+1)] = y_abs
        else:
            # Y.append(y_t)
            Y['view_' + str(d+1)] = y_t

    #gr = []
    #for g in range(nb_group):
    #   gr += [int(g) for _ in range(N_g[g])]
    #gr = pd.Series(gr)
    #Y.append(gr)

###
# the output will be a list of n_centers dataset containing: dataframe for each centers,of 
# different dimensions
# 
    #t_i = pd.concat(Y, axis=1)
    t_i = create_multi_view_dataframe(Y)
    # NOTE(review): presumably a leftover from the version that appended the
    # group labels as last column (commented out above); the CSV shown further
    # down still ends with 'var_3,10', so this line seems to have no effect -
    # confirm and remove.
    t_i.columns.values[-1] = 'Label'
    # the string below is a placeholder: replace it by a real local path
    t_i.to_csv('== Local path to node' + str(i+1) + '.csv',sep=',')
    #np.savetxt('== Local path to node' + str(i+1) + '.csv',t_i,delimiter=',')
               
# building the test dataset
# two groups of equal size
N_g_test = np.array([testing_samples//2,testing_samples//2])
# NOTE(review): g_ind_test is not used anywhere in the visible code
g_ind_test = np.concatenate((np.zeros(1, dtype=np.int64), np.cumsum(N_g_test)))
N_test = N_g_test.sum()
x_n_gen = sample_x_n(N_test, n_components_generated, random_state=150)
Y_test = []
for d in range(len(D_i)):
    y_t = generate_data(N_g_test, W[d], a_g_gen, mu[d], sigma[d], x_n_gen, view = d+1, random_state=250)
    Y_test.append(y_t)

# group label of every test sample (0 for the first group, 1 for the second),
# appended as the last column of the test dataset
gr_test = [0 for _ in range(N_g_test[0])]+[1 for _ in range(N_g_test[1])]
gr_test = pd.Series(gr_test)
Y_test.append(gr_test)

# all views side by side, followed by the label column
t_test = pd.concat(Y_test, axis=1)

[  0 148 215]
[  0 148 215]
[  0 148 215]
['var_1,1' 'var_1,2' 'var_1,3' 'var_1,4' 'var_1,5' 'var_1,6' 'var_1,7'
 'var_1,8' 'var_1,9' 'var_1,10' 'var_1,11' 'var_1,12' 'var_1,13'
 'var_1,14' 'var_1,15']
['var_1,1' 'var_1,2' 'var_1,3' 'var_1,4' 'var_1,5' 'var_1,6' 'var_1,7'
 'var_1,8' 'var_1,9' 'var_1,10' 'var_1,11' 'var_1,12' 'var_1,13'
 'var_1,14' 'var_1,15' 'var_2,1' 'var_2,2' 'var_2,3' 'var_2,4' 'var_2,5'
 'var_2,6' 'var_2,7' 'var_2,8']
['var_1,1' 'var_1,2' 'var_1,3' 'var_1,4' 'var_1,5' 'var_1,6' 'var_1,7'
 'var_1,8' 'var_1,9' 'var_1,10' 'var_1,11' 'var_1,12' 'var_1,13'
 'var_1,14' 'var_1,15' 'var_2,1' 'var_2,2' 'var_2,3' 'var_2,4' 'var_2,5'
 'var_2,6' 'var_2,7' 'var_2,8' 'var_3,1' 'var_3,2' 'var_3,3' 'var_3,4'
 'var_3,5' 'var_3,6' 'var_3,7' 'var_3,8' 'var_3,9' 'var_3,10']
[[   9.31076846   -6.4619142    -5.62068783 ...   -4.40096329
    10.17890808    0.16178023]
 [  -5.68292263  -29.50322166   -1.63453443 ...   23.22751469
    13.87959692    5.1737088 ]
 [   9.4817346   -16.8685935

In [11]:
# Reload the file written for node 1, parsing its first two rows as the
# (views, feature_name) header, to check the save/load round trip.
load_multi_view_dataframe('== Local path to node1.csv')

views,view_1,view_1,view_1,view_1,view_1,view_1,view_1,view_1,view_1,view_1,...,view_3,view_3,view_3,view_3,view_3,view_3,view_3,view_3,view_3,view_3
feature_name,"var_1,1","var_1,2","var_1,3","var_1,4","var_1,5","var_1,6","var_1,7","var_1,8","var_1,9","var_1,10",...,"var_3,1","var_3,2","var_3,3","var_3,4","var_3,5","var_3,6","var_3,7","var_3,8","var_3,9","var_3,10"
0,9.310768,-6.461914,-5.620688,-4.694115,-0.722527,15.265315,1.472637,7.819969,-4.907187,3.072993,...,-4.181252,3.615410,-0.257516,5.521795,-21.100074,-10.294829,-4.684357,-4.400963,10.178908,0.161780
1,-5.682923,-29.503222,-1.634534,-29.912643,-4.597543,33.058772,5.594790,-8.081894,-25.082754,-12.439787,...,-24.561108,2.569041,23.109628,4.612879,-16.967085,20.207414,-18.979548,23.227515,13.879597,5.173709
2,9.481735,-16.868594,-17.183134,-0.456770,-5.737701,3.158270,27.514549,6.222885,2.348626,-9.865517,...,-3.113561,12.566323,-3.256145,-21.943875,18.388225,3.157862,-1.418104,7.724204,17.130163,27.672049
3,-12.908970,-20.203461,24.409138,-43.366352,10.613558,45.681332,-14.135197,-4.561476,-44.835105,3.176304,...,-18.333662,-18.565325,25.181532,3.373526,-54.505386,-6.545974,-24.415089,33.304721,12.621899,-21.927617
4,4.104555,-30.477031,-16.065672,-23.631253,31.544356,1.278780,13.459440,31.989547,-34.999132,30.678031,...,-28.910695,-25.503408,13.981502,-37.072069,58.368088,5.190957,-1.770741,18.387552,15.571225,23.468098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,56.716275,-46.795837,-114.814538,28.537304,22.161511,-60.706484,-4.124133,74.366579,6.862298,69.597393,...,-46.316921,-10.529428,-0.258633,40.528049,136.386689,57.843989,49.571218,-71.988456,-11.437536,34.187132
211,41.911085,-64.198269,-105.309973,3.433542,22.578804,-40.912546,-14.363287,62.974529,-13.114022,58.777903,...,-66.805704,-14.473441,21.031061,51.821591,129.504104,85.280532,36.338416,-46.494829,-9.573619,34.502193
212,69.567830,-20.632233,-108.057262,46.532057,29.620095,-66.700842,-18.037534,92.022962,12.599330,89.746195,...,-32.308038,-18.587620,-19.555819,52.181370,116.159689,19.318592,67.211073,-98.492724,-19.684344,14.283126
213,61.829960,-42.909880,-118.346178,39.220589,29.023209,-72.195385,-2.184096,90.834560,8.600463,81.686865,...,-43.258037,-16.850819,-7.178427,30.154897,162.881319,49.457235,66.297061,-82.172018,-14.232245,39.503518


In [6]:
# Display the multi-view dataframe left over from the last iteration of the
# generation loop (i.e. the last center).
t_i

views,view_1,view_1,view_1,view_1,view_1,view_1,view_1,view_1,view_1,view_1,...,view_3,view_3,view_3,view_3,view_3,view_3,view_3,view_3,view_3,view_3
feature_name,"var_1,1","var_1,2","var_1,3","var_1,4","var_1,5","var_1,6","var_1,7","var_1,8","var_1,9","var_1,10",...,"var_3,1","var_3,2","var_3,3","var_3,4","var_3,5","var_3,6","var_3,7","var_3,8","var_3,9","var_3,10"
0,9.310768,-6.461914,-5.620688,-4.694115,-0.722527,15.265315,1.472637,7.819969,-4.907187,3.072993,...,-4.181252,3.615410,-0.257516,5.521795,-21.100074,-10.294829,-4.684357,-4.400963,10.178908,0.161780
1,-5.682923,-29.503222,-1.634534,-29.912643,-4.597543,33.058772,5.594790,-8.081894,-25.082754,-12.439787,...,-24.561108,2.569041,23.109628,4.612879,-16.967085,20.207414,-18.979548,23.227515,13.879597,5.173709
2,9.481735,-16.868594,-17.183134,-0.456770,-5.737701,3.158270,27.514549,6.222885,2.348626,-9.865517,...,-3.113561,12.566323,-3.256145,-21.943875,18.388225,3.157862,-1.418104,7.724204,17.130163,27.672049
3,-12.908970,-20.203461,24.409138,-43.366352,10.613558,45.681332,-14.135197,-4.561476,-44.835105,3.176304,...,-18.333662,-18.565325,25.181532,3.373526,-54.505386,-6.545974,-24.415089,33.304721,12.621899,-21.927617
4,4.104555,-30.477031,-16.065672,-23.631253,31.544356,1.278780,13.459440,31.989547,-34.999132,30.678031,...,-28.910695,-25.503408,13.981502,-37.072069,58.368088,5.190957,-1.770741,18.387552,15.571225,23.468098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319,57.515524,-45.323824,-108.639868,23.599623,26.003503,-48.773703,-23.325352,77.301718,-0.261275,76.871558,...,-51.503040,-17.692983,7.737003,57.142940,114.017658,54.959433,49.337181,-73.962755,-16.587895,15.503366
320,41.285701,-48.748085,-97.548714,21.053953,5.663261,-46.718907,12.997327,49.486652,15.037893,36.406533,...,-40.744581,11.411611,2.313515,28.258345,121.411161,63.964155,33.056180,-48.248519,0.814188,46.766966
321,38.557053,-48.179177,-84.417492,12.342526,-9.046257,-25.310550,-8.117842,33.570963,10.087624,26.868781,...,-42.444966,12.110693,10.400322,64.846437,67.072904,65.470088,26.906109,-48.490353,-6.972979,20.445101
322,56.876268,-14.710380,-86.360473,44.241869,8.423738,-49.458546,-0.023092,63.668705,24.065679,52.503210,...,-15.173121,6.246647,-23.373962,34.195696,79.966724,13.621734,49.069197,-81.697565,-8.195492,23.699873


In [12]:
# Same file read with a single header row: the second header level (feature
# names) then shows up as the first data row, hence the need for
# load_multi_view_dataframe.
pd.read_csv('== Local path to node1.csv', header=0)

Unnamed: 0.1,Unnamed: 0,view_1,view_1.1,view_1.2,view_1.3,view_1.4,view_1.5,view_1.6,view_1.7,view_1.8,...,view_3,view_3.1,view_3.2,view_3.3,view_3.4,view_3.5,view_3.6,view_3.7,view_3.8,view_3.9
0,,"var_1,1","var_1,2","var_1,3","var_1,4","var_1,5","var_1,6","var_1,7","var_1,8","var_1,9",...,"var_3,1","var_3,2","var_3,3","var_3,4","var_3,5","var_3,6","var_3,7","var_3,8","var_3,9","var_3,10"
1,0.0,9.310768462017958,-6.461914197276625,-5.620687827816583,-4.694114680162956,-0.7225271123461006,15.265314801886715,1.4726368908849716,7.81996875948183,-4.907186593797548,...,-4.181252168028951,3.615409543463824,-0.25751617079234235,5.52179472950479,-21.10007407204525,-10.294828849767857,-4.684356781988975,-4.400963294800104,10.178908083240731,0.16178022849753004
2,1.0,-5.682922632270812,-29.50322166009444,-1.6345344318786574,-29.912642931876608,-4.597542736427378,33.05877238401702,5.5947903919193465,-8.081893755711596,-25.082753980753235,...,-24.5611084731604,2.569040626582712,23.10962773644023,4.612878535052369,-16.967084842173115,20.20741374606829,-18.97954775494416,23.22751469257653,13.879596915542708,5.173708796643757
3,2.0,9.481734598324717,-16.86859351243419,-17.183134121425528,-0.4567701276548075,-5.737700814358999,3.1582699825407765,27.514549136187853,6.222884806073197,2.348625898241953,...,-3.1135606520323456,12.566322574609977,-3.2561447901911764,-21.94387451629146,18.38822465242519,3.157862051602825,-1.4181035866354579,7.724204279922166,17.130163067820583,27.672048955767117
4,3.0,-12.908969637186573,-20.203461016638123,24.409138101432262,-43.36635164590533,10.613558060800413,45.68133190460415,-14.135197182933005,-4.561476360515851,-44.8351053035544,...,-18.333661678884418,-18.56532537937793,25.181531592420473,3.3735263496110752,-54.505386174935175,-6.545973840337101,-24.41508876591354,33.304721091650684,12.62189911483032,-21.927616936338506
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211,210.0,56.71627522855958,-46.79583736398122,-114.81453793403885,28.537304032307933,22.161511111013894,-60.70648388385722,-4.124132693167143,74.36657868549494,6.862297808753874,...,-46.31692113131656,-10.529428032157865,-0.25863343507594727,40.52804920494149,136.38668863385385,57.8439887144524,49.57121828241282,-71.98845579915621,-11.437536207049781,34.187132290185126
212,211.0,41.911085215219714,-64.19826881205148,-105.30997253326136,3.4335422102938775,22.57880421060195,-40.91254552112634,-14.36328713404069,62.97452872858298,-13.114022042322006,...,-66.80570384205987,-14.473441006664522,21.03106134475242,51.82159067089561,129.50410422391673,85.28053208830801,36.338416342368006,-46.49482909797055,-9.573618715974273,34.50219312385135
213,212.0,69.5678297095221,-20.632233311582002,-108.05726162858939,46.53205695919303,29.620095481934364,-66.70084239696382,-18.037533940225874,92.02296233790985,12.599330267759653,...,-32.30803793014368,-18.587619727612406,-19.555818852478627,52.181369865646566,116.15968920042138,19.318592426007836,67.21107338618309,-98.49272412627367,-19.6843441510884,14.2831260422221
214,213.0,61.82996020726665,-42.90988004431731,-118.34617802900418,39.22058926883936,29.023208771583143,-72.195384710036,-2.18409621084941,90.83455950280691,8.60046325799761,...,-43.25803732455479,-16.850818958212905,-7.178426996782701,30.154896631257657,162.88131860874216,49.457235411642706,66.29706055154033,-82.17201805095529,-14.232244943620406,39.5035177419949


In [10]:
import pandas as pd
# Scratch check of the NaN test used to detect an absent view: column 'www'
# contains only NaN while column 'qq' contains none.
tt = pd.DataFrame({"www": [np.nan, np.nan, np.nan], 'qq': [1, 2, 4]})

# column at position 1 is 'qq', so the expression evaluates to False
tt.iloc[:, 1].isnull().values.any()

False

## Start the network and setting the client up
Before running this notebook:
1. You should start the network from fedbiomed-network, as detailed in :
https://gitlab.inria.fr/fedbiomed/fedbiomed

2. You need to configure at least 2 nodes: <br/>
* **Node 1 :** `./scripts/fedbiomed_run node add`
  * Select option 1 to add a csv file to the client
  * Choose the name, tags and description of the dataset (you can write 'sk' always and it will be good)
  * Pick the .csv file you stored the couple X[0],y[0].
  * Check that your data has been added in node 1 by executing `./scripts/fedbiomed_run node list`
  * Run the node using `./scripts/fedbiomed_run node start`. <br/>

* **Node 2 :** Open a second terminal and run `./scripts/fedbiomed_run node config n2.ini add`
  * Select option 1 to add a csv file to the client
  * Choose the name, tags and description of the dataset (you can write 'sk' always and it will be good)
  * Pick the .csv file you stored the couple X[1],y[1].
  * Check that your data has been added in node 2 by executing `./scripts/fedbiomed_run node config n2.ini list`
  * Run the node using `./scripts/fedbiomed_run node config n2.ini start`.
  


 Wait until you get `Connected with result code 0`: it means the node is online.


In [11]:
%load_ext autoreload
%autoreload 2

In [12]:
import numpy as np
from fedbiomed.researcher.environ import TMP_DIR
import tempfile
# Temporary directory created under TMP_DIR; it holds the model file written
# by the %%writefile cell below. Keep a reference to `tmp_dir_model`: the
# directory is removed when the TemporaryDirectory object is cleaned up.
tmp_dir_model = tempfile.TemporaryDirectory(dir=TMP_DIR+'/')
# path of the python file that will contain the training plan class
model_file = tmp_dir_model.name + '/fed_mv_ppca.py'

Hereafter the template of the class you should provide to Fedbiomed :
       
**training_data** : you must return here a tuple (X,X_k,ViewsX,y) or (X,X_k,ViewsX). Note that all centers should provide a dataset with the same view-specific columns. If in a specific center a view has not been observed, then the corresponding columns will be filled with NaN. The training_data method takes care of identifying view-specific sub-datasets and collecting information concerning non-available observations. Data can also be normalized here.

In [13]:
%%writefile "$model_file"

import numpy as np
import pandas as pd

from fedbiomed.common.ppca import PpcaPlan


class Fed_MV_PPCA(PpcaPlan):
    """Training plan for federated multi-view PPCA: it only defines how the
    local dataset of a node is read and split into views."""

    def __init__(self, kwargs):
        super(Fed_MV_PPCA, self).__init__(kwargs)
        # pandas is used by the methods below, so it is declared next to numpy
        deps = ['import numpy as np',
                'import pandas as pd']
        self.add_dependency(deps)

    def training_data(self):
        """
            Perform in this method all data reading and data transformations you need.
            At the end you should provide a tuple (X_obs,Xk,ViewsX,y), where:
            X_obs is the training dataset, rebuilt from the observed views only,
            Xk is a list containing the k-specific dataframe if it exists or NaN otherwise,
            ViewsX is the indicator function for observed views (ViewsX[k]=1 if view k is observed, 0 otherwise)
            y the corresponding labels (optional)
            Each observed view is passed through self.normalize_data when self.norm is true.
            Note: since labels are not needed for the optimization,
            training_data can also simply return (X_obs,Xk,ViewsX)
            :raise NotImplementedError if researcher do not implement this method.
        """
        # NOTE(review): the file is read with a single header row, whereas the
        # CSV files generated earlier in this notebook carry a two-row header
        # (see load_multi_view_dataframe below) - confirm which format is expected.
        dataset = pd.read_csv(self.dataset_path,delimiter=',', index_col=0)
        # the last column is taken as the labels, all the others as features
        X = dataset.iloc[:,:-1]
        y = dataset[dataset.columns[-1]]

        # Xk is a list containing the view-specific local datasets
        Xk = []
        ViewsX = []
        ind = 0  # position of the first column of the current view
        for k in range(self.K):
            # a view is considered absent as soon as its first column holds a NaN
            if X.iloc[:, ind].isnull().values.any():
                Xk.append(np.nan)
                ViewsX.append(0)
            else:
                # if norm = true, data are normalized by self.normalize_data
                X_k = self.normalize_data(X.iloc[:, ind:ind + self.dim_views[k]]) if self.norm \
                    else X.iloc[:, ind:ind + self.dim_views[k]]
                Xk.append(X_k)
                ViewsX.append(1)
            ind += self.dim_views[k]

        # The entire dataset is re-built without empty columns
        Xk_obs = [item for item in Xk if item is not np.nan]
        X_obs = pd.concat(Xk_obs, axis=1)

        return (X_obs,Xk,ViewsX,y)

    @staticmethod
    def load_multi_view_dataframe(file_name: str) -> pd.DataFrame:
        """Read a comma-separated file whose first two rows are the column
        header levels and whose first column is the row index.

        Declared static because it takes no `self`: without the decorator a
        call through an instance would receive the instance as `file_name`.
        """
        df = pd.read_csv(file_name, delimiter=',', index_col=0, header=[0,1])
        return df
    

Writing /home/ybouilla/fedbiomed/var/tmp/tmpcot1vz4y/fed_mv_ppca.py


**model_args** is a dictionary containing the mv-ppca model arguments: the total number of views across all datasets (tot_views), the dimension of each view (dim_views), the latent space size (n_components), and a boolean (norm) for data preprocessing. Additionally, the researcher can provide priors for one or more global parameters.

**training_args** contains here the number of local iterations for EM/MAP. 

In [14]:
tot_views = 3  # total number of views across all datasets
dim_views = [15, 8, 10]  # number of features of each view
n_components = 4  # size of the latent space
norm = True  # whether the data are normalized before training

# NOTE(review): the flag is sent under the key 'is_norm', while the recorded
# logs below and the surrounding text use 'norm' - confirm which key the
# training plan expects.
model_args = {'tot_views': tot_views, 'dim_views': dim_views, 'n_components': n_components, 'is_norm': norm}

# number of local iterations performed by each node at every round
training_args = {'n_iterations': 15}

In [15]:
from fedbiomed.researcher.experiment import Experiment
from fedbiomed.researcher.aggregators.ppca_aggregator import MLaggregator

# nodes are searched for datasets registered with these tags
tags =  ['ppca_data']
# number of federated rounds
rounds = 5

# select nodes participating in this experiment
exp = Experiment(tags=tags,
                 #clients=None,
                 model_path=model_file,
                 model_args=model_args,
                 model_class='Fed_MV_PPCA',
                 training_args=training_args,
                 rounds=rounds,
                 aggregator=MLaggregator(),
                 client_selection_strategy=None)

2021-10-28 14:12:31,194 fedbiomed INFO - Messaging researcher_a4eaa540-4fe0-477c-b7cc-47216483f50a successfully connected to the message broker, object = <fedbiomed.common.messaging.Messaging object at 0x7f888a185250>
2021-10-28 14:12:31,236 fedbiomed INFO - Searching dataset with data tags: ['ppca_data'] for all nodes
2021-10-28 14:12:31,238 fedbiomed INFO - log from: client_170ef23c-cf03-45a8-98a2-dff418afe652 - DEBUG Message received: {'researcher_id': 'researcher_a4eaa540-4fe0-477c-b7cc-47216483f50a', 'tags': ['ppca_data'], 'command': 'search'}
2021-10-28 14:12:41,609 fedbiomed INFO - Messaging NodeTrainingFeedbackClient successfully connected to the message broker, object = <fedbiomed.common.messaging.Messaging object at 0x7f888aa2e2b0>


In [16]:
# start federated training: runs the experiment configured above
# (the node logs are echoed in the output of this cell)
exp.run()

2021-10-28 14:13:13,865 fedbiomed INFO - Sampled clients in round 0 ['client_170ef23c-cf03-45a8-98a2-dff418afe652']
2021-10-28 14:13:13,866 fedbiomed INFO - Send message to client client_170ef23c-cf03-45a8-98a2-dff418afe652 - {'researcher_id': 'researcher_a4eaa540-4fe0-477c-b7cc-47216483f50a', 'job_id': '4c7dc518-4981-495e-b23a-2f7262ef7bed', 'training_args': {'n_iterations': 15}, 'model_args': {'tot_views': 3, 'dim_views': [15, 8, 10], 'n_components': 4, 'norm': True}, 'command': 'train', 'model_url': 'http://localhost:8844/media/uploads/2021/10/28/my_model_ee40d0fe-d28f-4fa8-9110-65978b14945b.py', 'params_url': 'http://localhost:8844/media/uploads/2021/10/28/my_model_ef839ee8-6872-45fe-b4cf-c7ed6068afe2.pt', 'model_class': 'Fed_MV_PPCA', 'training_data': {'client_170ef23c-cf03-45a8-98a2-dff418afe652': ['dataset_e39d0757-f2ea-400d-b1e9-8c09913dc1a7']}}
2021-10-28 14:13:13,867 fedbiomed DEBUG - researcher_a4eaa540-4fe0-477c-b7cc-47216483f50a
2021-10-28 14:13:13,880 fedbiomed INFO - log

2021-10-28 14:13:24,212 fedbiomed INFO - log from: client_170ef23c-cf03-45a8-98a2-dff418afe652 - DEBUG Dataset_path/home/ybouilla/fedbiomed/notebooks/== Local path to node1.csv
2021-10-28 14:13:24,360 fedbiomed INFO - log from: client_170ef23c-cf03-45a8-98a2-dff418afe652 - INFO Iteration: 1/15	Expected LL: 7411.338880
2021-10-28 14:13:24,457 fedbiomed INFO - log from: client_170ef23c-cf03-45a8-98a2-dff418afe652 - INFO Iteration: 2/15	Expected LL: 7144.311812
2021-10-28 14:13:24,566 fedbiomed INFO - log from: client_170ef23c-cf03-45a8-98a2-dff418afe652 - INFO Iteration: 3/15	Expected LL: 7100.275396
2021-10-28 14:13:24,671 fedbiomed INFO - log from: client_170ef23c-cf03-45a8-98a2-dff418afe652 - INFO Iteration: 4/15	Expected LL: 7086.115278
2021-10-28 14:13:24,816 fedbiomed INFO - log from: client_170ef23c-cf03-45a8-98a2-dff418afe652 - INFO Iteration: 5/15	Expected LL: 7081.343709
2021-10-28 14:13:24,969 fedbiomed INFO - log from: client_170ef23c-cf03-45a8-98a2-dff418afe652 - INFO Iterat

2021-10-28 14:13:44,206 fedbiomed INFO - Clients that successfully reply in round 2 ['client_170ef23c-cf03-45a8-98a2-dff418afe652']
2021-10-28 14:13:44,261 fedbiomed INFO - Sampled clients in round 3 ['client_170ef23c-cf03-45a8-98a2-dff418afe652']
2021-10-28 14:13:44,262 fedbiomed INFO - Send message to client client_170ef23c-cf03-45a8-98a2-dff418afe652 - {'researcher_id': 'researcher_a4eaa540-4fe0-477c-b7cc-47216483f50a', 'job_id': '4c7dc518-4981-495e-b23a-2f7262ef7bed', 'training_args': {'n_iterations': 15}, 'model_args': {'tot_views': 3, 'dim_views': [15, 8, 10], 'n_components': 4, 'norm': True}, 'command': 'train', 'model_url': 'http://localhost:8844/media/uploads/2021/10/28/my_model_ee40d0fe-d28f-4fa8-9110-65978b14945b.py', 'params_url': 'http://localhost:8844/media/uploads/2021/10/28/researcher_params_68ec4da5-7610-4266-a61f-f7949b54267e.pt', 'model_class': 'Fed_MV_PPCA', 'training_data': {'client_170ef23c-cf03-45a8-98a2-dff418afe652': ['dataset_e39d0757-f2ea-400d-b1e9-8c09913dc1

2021-10-28 14:13:54,360 fedbiomed INFO - log from: client_170ef23c-cf03-45a8-98a2-dff418afe652 - DEBUG Dataset_path/home/ybouilla/fedbiomed/notebooks/== Local path to node1.csv
2021-10-28 14:13:54,508 fedbiomed INFO - log from: client_170ef23c-cf03-45a8-98a2-dff418afe652 - INFO Iteration: 1/15	Expected LL: 7449.415743
2021-10-28 14:13:54,606 fedbiomed INFO - log from: client_170ef23c-cf03-45a8-98a2-dff418afe652 - INFO Iteration: 2/15	Expected LL: 7308.781965
2021-10-28 14:13:54,712 fedbiomed INFO - log from: client_170ef23c-cf03-45a8-98a2-dff418afe652 - INFO Iteration: 3/15	Expected LL: 7304.668597
2021-10-28 14:13:54,824 fedbiomed INFO - log from: client_170ef23c-cf03-45a8-98a2-dff418afe652 - INFO Iteration: 4/15	Expected LL: 7303.218550
2021-10-28 14:13:54,943 fedbiomed INFO - log from: client_170ef23c-cf03-45a8-98a2-dff418afe652 - INFO Iteration: 5/15	Expected LL: 7302.826706
2021-10-28 14:13:55,037 fedbiomed INFO - log from: client_170ef23c-cf03-45a8-98a2-dff418afe652 - INFO Iterat

In [17]:
# exp.aggregated_params is keyed by round index
print("\nList the training rounds : ", exp.aggregated_params.keys())

# entry of the last round (rounds - 1): path of the saved parameters and the
# parameters themselves
print("\nAccess the federated params for the last training round :")
print("\t- params_path: ", exp.aggregated_params[rounds - 1]['params_path'])
print("\t- parameter data: ", exp.aggregated_params[rounds - 1]['params'].keys())


List the training rounds :  dict_keys([0, 1, 2, 3, 4])

Access the federated params for the last training round :
	- params_path:  /home/ybouilla/fedbiomed/var/tmp/researcher_params_c519e47f-9d93-4ee3-9f17-334d35319a56.pt
	- parameter data:  dict_keys(['tilde_muk', 'tilde_Wk', 'tilde_Sigma2k', 'Alpha', 'Beta', 'sigma_til_muk', 'sigma_til_Wk', 'sigma_til_sigma2k'])


## Test

Hereafter we test the performance of the aggregated parameters on a test dataset. In particular, for each round we use the global parameters to evaluate the mean absolute error and the separation in the latent space using LDA. Note that we have already defined the test dataset at the beginning of this notebook.

In [18]:
import numpy as np
import pandas as pd
from numpy.linalg import solve

from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error

def normalize_data(X):
    """
    This function normalizes the dataset X with a min-max scaler fitted on X itself.
    Column names are stripped of surrounding whitespace; the index is kept.
    :param X: pandas dataframe (column names must be strings)
    :return pandas dataframe
    """
    # imported here because no visible cell of the notebook imports
    # `preprocessing`, which made this function fail with a NameError
    from sklearn import preprocessing

    col_name = [col.strip() for col in X.columns]
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(X.values)

    return pd.DataFrame(x_scaled, index=X.index, columns=col_name)

def eval_MB(Wk, q, D_i, K, Sigma2, ViewsX):
    """
    Computes matrices M:=inv(I_q+sum_k Wk.TWk/sigma2k) and B:= [W1.T/sigma21,...,WK.T/sigma2K],
    where only the observed views (ViewsX[k] == 1) contribute.
    :param Wk: list of matrices, Wk[k] is reshaped to (D_i[k] x q)
    :param q: latent space size
    :param D_i: list of the dimensions of the views
    :param K: total number of views
    :param Sigma2: list of float > 0
    :param ViewsX: list of indicators, 1 when the view is observed
    :return np.arrays
    """
    # first observed view, then every later observed one, in increasing order
    first = ViewsX.index(1)
    observed = [first] + [k for k in range(first + 1, K) if ViewsX[k] == 1]

    gram_sum = None
    scaled_blocks = []
    for k in observed:
        W_k = Wk[k].reshape(D_i[k], q)
        gram = W_k.T.dot(W_k) / Sigma2[k]
        gram_sum = gram if gram_sum is None else gram_sum + gram
        scaled_blocks.append(W_k.T / Sigma2[k])

    B = np.concatenate(scaled_blocks, axis=1)

    # solving against the identity yields the inverse of (I_q + gram_sum)
    identity = np.eye(q)
    M = solve(identity + gram_sum, identity)

    return M, B

def concat_params(park, K, ViewsX):
    """
    Stacks, along axis 0, the entries of park that belong to observed views
    (those with ViewsX[k] == 1).
    :param park: list of vectors/matrices to concatenate
    :param K: int, views are scanned up to index K - 1
    :param ViewsX: list of flags, 1 when the view is observed
    :return np.array (park's own entry when a single view is selected)
    """
    first = ViewsX.index(1)
    later = [park[k] for k in range(first + 1, K) if ViewsX[k] == 1]

    if not later:
        return park[first]

    return np.concatenate([park[first]] + later, axis=0)

def simu_latent(q,dataset,ViewsX,global_params):
    """
    This function maps every row t_n of the dataset to the latent space using the
    global parameters: x_n = M B (t_n - mu). The result is deterministic; no random
    draw is performed.
    :param q: int, dimension of the latent space
    :param dataset: pandas dataframe holding the observed views side by side
    :param ViewsX: list of flags, 1 when the view is observed
    :param global_params: dict with keys 'tilde_muk', 'tilde_Wk' and 'tilde_Sigma2k'
    :return pandas dataframe (one row per row of dataset, q columns, same index)
    """
    d = dataset.shape[1]
    K = len(ViewsX)

    Wk = global_params['tilde_Wk']
    # eval_MB reshapes Wk[k] to (dim_k, q), so dim_k is necessarily size // q.
    # Deriving it here avoids depending on a module-level `D_i` variable.
    view_dims = [np.asarray(W).size // q for W in Wk]

    # mu is used as a (d, 1) column so that it broadcasts over all samples.
    mu = np.reshape(concat_params(global_params['tilde_muk'], K, ViewsX), (d, 1))
    M, B = eval_MB(Wk, q, view_dims, K, global_params['tilde_Sigma2k'], ViewsX)

    # M.dot(B) is the same for every row: compute it once and project all
    # centred samples (stored as columns) in a single product.
    projection = M.dot(B)
    latent = projection.dot(dataset.values.T - mu).T

    return pd.DataFrame(latent, index=dataset.index)

def MAE(dataset,ViewsX,q,global_params):
    """
    This function evaluates the MAE between the dataset and its reconstruction
    W x_n + mu, where x_n = M B (t_n - mu) is obtained with the global parameters.
    :param dataset: pandas dataframe holding the observed views side by side
    :param ViewsX: list of flags, 1 when the view is observed
    :param q: int, dimension of the latent space
    :param global_params: dict with keys 'tilde_muk', 'tilde_Wk' and 'tilde_Sigma2k'
    :return float
    """
    d = dataset.shape[1]
    K = len(ViewsX)

    Wk = global_params['tilde_Wk']
    # eval_MB reshapes Wk[k] to (dim_k, q), so dim_k is necessarily size // q.
    # Deriving it here avoids depending on a module-level `D_i` variable.
    view_dims = [np.asarray(W_view).size // q for W_view in Wk]

    # mu is used as a (d, 1) column so that it broadcasts over all samples.
    mu = np.reshape(concat_params(global_params['tilde_muk'], K, ViewsX), (d, 1))
    W = concat_params(Wk, K, ViewsX)
    M, B = eval_MB(Wk, q, view_dims, K, global_params['tilde_Sigma2k'], ViewsX)

    T_true = dataset.values

    # Samples are handled as columns: project to the latent space, then map back.
    latent = M.dot(B).dot(T_true.T - mu)
    T_pred = (W.dot(latent) + mu).T

    # A distinct local name avoids shadowing the function itself.
    mae = mean_absolute_error(T_true, T_pred)

    return mae

In [19]:
# Test dataset


def test_data(dataset,norm,K,dim_views):
    """
    Splits a dataset into its K views and rebuilds it from the observed ones.

    The last column of dataset is taken as the label; the remaining columns are
    cut into K consecutive groups of widths dim_views[k]. A view is flagged as
    missing when its first column contains at least one null value.

    :param dataset: pandas dataframe, features followed by one label column
    :param norm: if true, every observed view goes through normalize_data
    :param K: int, number of views
    :param dim_views: list of int, number of columns of each view
    :return tuple (X_obs, Xk, ViewsX, y): dataframe of the observed views side by
        side, list with one dataframe (or np.nan when missing) per view, list of
        0/1 flags per view, and the label column
    """
    features = dataset.iloc[:, :-1]
    y = dataset[dataset.columns[-1]]

    # Per-view data and observation flags, filled in view order
    Xk, ViewsX = [], []
    start = 0
    for k in range(K):
        view_is_missing = features.iloc[:, start].isnull().values.any()
        if view_is_missing:
            Xk.append(np.nan)
            ViewsX.append(0)
        else:
            view = features.iloc[:, start:start + dim_views[k]]
            # Observed views are optionally rescaled before being stored
            Xk.append(normalize_data(view) if norm else view)
            ViewsX.append(1)
        start += dim_views[k]

    # Missing views were stored as the np.nan object itself: drop them
    observed_views = [view for view in Xk if view is not np.nan]
    X_obs = pd.concat(observed_views, axis=1)

    return (X_obs,Xk,ViewsX,y)

In [23]:
from sklearn import preprocessing

# For every aggregation round after the first one, evaluate the global
# parameters on each node's training data and on the held-out test data.
for r in range(1,rounds):
    global_params = exp.aggregated_params[r]['params']
    ######## Train data
    MAE_train = []
    # DataFrame.append / Series.append were removed in pandas 2.0, so the
    # per-center results are collected in lists and concatenated once.
    latent_train_parts = []
    label_train_parts = []
    for c in range(n_centers):
        dataset_c = pd.read_csv('== Local path to node' + str(c+1) + '.csv', delimiter=',', index_col=0)
        X_obs_c,Xk_c,ViewsX_c,y_c = test_data(dataset_c,norm,tot_views,dim_views)
        # Dataframe of latent space for LDA
        latent_train_parts.append(simu_latent(n_components,X_obs_c,ViewsX_c,global_params))
        label_train_parts.append(y_c)
        # MAE Train
        MAE_train.append(MAE(X_obs_c,ViewsX_c,n_components,global_params))

    # Fall back to the empty containers when there is no center at all.
    Latent_Train = pd.concat(latent_train_parts) if latent_train_parts else pd.DataFrame()
    Label_Train = pd.concat(label_train_parts) if label_train_parts else pd.Series(dtype='int64')

    ######## Test data
    X_obs_test,Xk_test,ViewsX_test,y_test = test_data(t_test,norm,tot_views,dim_views)

    # Dataframe of latent space for LDA
    Latent_Test = simu_latent(n_components,X_obs_test,ViewsX_test,global_params)
    # Copy so that later changes to Label_Test do not touch y_test
    Label_Test = y_test.copy()
    # MAE Test
    MAE_test = MAE(X_obs_test,ViewsX_test,n_components,global_params)


    print('Round {}:'.format(r))
    print('MAE train (mean,std) = ({:.4f},{:.4f}) \
    \t MAE test = {:.4f} \
    '.format(np.mean(MAE_train), np.std(MAE_train), MAE_test))

Round 1:
MAE train (mean,std) = (0.0458,0.0044)     	 MAE test = 0.0588     
Round 2:
MAE train (mean,std) = (0.0434,0.0045)     	 MAE test = 0.0554     
Round 3:
MAE train (mean,std) = (0.0415,0.0046)     	 MAE test = 0.0527     
Round 4:
MAE train (mean,std) = (0.0400,0.0047)     	 MAE test = 0.0508     


In [24]:
conf_LDA_Test

array([[20,  0],
       [ 0, 20]])