# Peer-to-peer Learning Contribution Measure Model (Ebola dataset)
This notebook implements a simple contribution model based on the example notebook of the DeAI repository. More precisely, it emulates a framework where `NUM_CLIENTS` clients learn Ebola diagnosis (or prognosis) and average their models in a peer-to-peer learning setting (P2PL, i.e. without the orchestration of a central server).

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from sklearn.preprocessing import LabelEncoder

import time

# Custom functions
from helpers import *

# Visualization
from matplotlib import pyplot as plt
from visualization import *
# Global matplotlib settings: small legend font and the 'bmh' style sheet
plt.rc('legend', fontsize='small')
plt.style.use('bmh')

# Reproducibility: seed both the PyTorch and the NumPy random number generators
SEED = 123
torch.manual_seed(SEED)
np.random.seed(SEED)

# Console printing options for NumPy arrays
np.set_printoptions(precision=3, suppress=False, sign=' ')

### Model 
`Ebola_Net` was developed by ... (link to repo?)

In [None]:
# Import the model class itself (no call parentheses in an import statement);
# instances are created later with Ebola_Net().
from models import Ebola_Net

### Peer-to-peer Learning Contribution Measure
The following cells implement a peer-to-peer learning setting, i.e. where each user initializes its own model and then,
for $r=1,...,$ `NUM_ROUNDS`:

1. Each client trains the model on its own data (`EPOCHS` times using batches of size `BATCH_SIZE`),
2. Each client sends its model to a subset of the other users participating in the task (according to a communication graph),
3. Each client receives the models from the other users,
4. Each client evaluates the performance of the models on its own test dataset,
5. Each client aggregates the models and updates the contributions of each neighbouring user (in the communication graph).

### Parameters (to be specified)

In [None]:
# Topology
NUM_CLIENTS = 4
# Communication matrix. Column i is read as the set of clients whose model
# reaches client i (its non-zero entries). The default is a fully connected
# graph with uniform weights 1 / NUM_CLIENTS; it is derived from NUM_CLIENTS
# so that changing the number of clients cannot leave the matrix out of sync.
# To use another topology, replace it with an explicit
# (NUM_CLIENTS x NUM_CLIENTS) array.
COMM_MATRIX = np.ones((NUM_CLIENTS, NUM_CLIENTS)) / NUM_CLIENTS

# Fail early (with a clear message) if a custom matrix has the wrong shape.
if COMM_MATRIX.shape != (NUM_CLIENTS, NUM_CLIENTS):
    raise ValueError(
        "COMM_MATRIX must have shape ({0}, {0}), got {1}".format(NUM_CLIENTS, COMM_MATRIX.shape)
    )


# Training parameters
NUM_ROUNDS = 100         # number of train / exchange / aggregate rounds
EPOCHS = 1               # local epochs per round
LR = 1e-3                # learning rate of the per-client SGD optimizer
BATCH_SIZE = 32
THRESHOLD = 0.5          # decision threshold for binary classification
CRITERION = nn.BCELoss()
METRIC = 'loss_norm'     # key of the performance histories used for reporting and contributions
VERBOSE = True           # print detailed per-round information

### Data Loading

In [None]:
# Creating the dataloaders
# NOTE: the three assignments below are placeholders; each right-hand side must
# be filled in before this cell can run.

train_loaders = # list of train dataloaders (1 per user) with batch_size=BATCH_SIZE
test_loaders = # list of test dataloaders (1 per user) with batch_size=10*BATCH_SIZE
test_loader_s = # dataloader containing the whole test dataset, with batch_size=10*BATCH_SIZE (only used for the ROC curve at the end)

### Marginal losses and Shapley values for P2PL

In [None]:
# Reference model: every client model and every coalition model starts from its weights
initial_model = Ebola_Net()
client_models = [Ebola_Net() for _ in range(NUM_CLIENTS)]

# Containers, all keyed by the receiving client i:
#   coal_models_agg[i, coal] -> model associated with coalition `coal` at client i
#   global_coals[i]          -> every client that sends its model to i (i included)
#   marg_coals[i, j]         -> global_coals[i] with client j left out
coal_models_agg = {}
global_coals = {}
marg_coals = {}

# Column i of COMM_MATRIX lists the weights of the models received by client i
for i, incoming in enumerate(COMM_MATRIX.T):

    client_models[i].load_state_dict(initial_model.state_dict())

    # senders = clients with a non-zero weight in column i
    senders = tuple(np.nonzero(incoming)[0])
    global_coals[i] = senders

    # leave-one-out coalitions client i can form from the models it received
    marg_coals.update(
        ((i, j), tuple(k for k in senders if k != j))
        for j in range(NUM_CLIENTS)
    )

    # one model per sub-coalition of the senders, initialised like the reference model
    for coal in powerset(senders):
        coal_model = Ebola_Net()
        coal_model.load_state_dict(initial_model.state_dict())
        coal_models_agg[i, coal] = coal_model

# Contribution arrays and marginal performances share the same layout:
# [round, evaluating client i, contributing client j]
contrib_shape = (NUM_ROUNDS, NUM_CLIENTS, NUM_CLIENTS)

# Performance histories; perf has one extra leading slot because
# index 0 holds the performance measured before any training
perf = initialize_perf((NUM_ROUNDS + 1, NUM_CLIENTS))
marg_perf_agg = initialize_perf(contrib_shape)

# Evaluate every (still untrained) client model on its own test loader
for i in range(NUM_CLIENTS):
    prior = evaluate_model(client_models[i], test_loaders[i], CRITERION, THRESHOLD)
    fill_perf_history(prior, perf, (0, i))

print('Prior performances ({}): {}'.format(METRIC, perf[METRIC][0, :]))

# Contributions: Shapley values (SVa) and marginal losses (MLa) at aggregation
SVa = np.zeros(contrib_shape)
MLa = np.zeros(contrib_shape)


# Training rounds
for r in range(NUM_ROUNDS):
    t_start = time.time()
    # Round header: a single self-overwriting progress line when not verbose
    if not VERBOSE:
        print("Round {}/{} (current {}: {})".format(r+1, NUM_ROUNDS, METRIC, perf[METRIC][r, :]), end='\r')
    else:
        print("------------------------------------\nRound {}:".format(r+1), end='')

    # Local training: each client updates its own model with a fresh SGD optimizer
    loss = {}
    for i, model in enumerate(client_models):
        loss[i] = client_update(model, optim.SGD(model.parameters(), lr=LR), CRITERION,
                                train_loaders[i], epoch=EPOCHS)

    # Diffuse the parameters across neighbours, for every coalition
    diffuse_params(client_models, coal_models_agg, COMM_MATRIX)

    # Each client adopts the aggregated model of its global coalition
    for i, model in enumerate(client_models):
        aggregated = coal_models_agg[i, global_coals[i]]
        model.load_state_dict(aggregated.state_dict())

    # Evaluation and contribution update, one receiving client at a time
    for i in range(NUM_CLIENTS):
        own_loader = test_loaders[i]

        # performance of the freshly aggregated client model (stored at index r+1)
        fill_perf_history(evaluate_model(client_models[i], own_loader, CRITERION, THRESHOLD),
                          perf, (r+1, i))

        # Shapley values of the clients, as measured by client i
        SVa[r, i, :] = SV_P2PL(NUM_CLIENTS, i, global_coals[i], coal_models_agg, own_loader,
                               METRIC, CRITERION, THRESHOLD)

        # performance of each leave-one-out aggregate on client i's test data
        for j in range(NUM_CLIENTS):
            loo_model = coal_models_agg[i, marg_coals[i, j]]
            fill_perf_history(evaluate_model(loo_model, own_loader, CRITERION, THRESHOLD),
                              marg_perf_agg, (r, i, j))

        # marginal loss: metric without client j minus metric of the full aggregate
        MLa[r, i, :] = marg_perf_agg[METRIC][r, i, :] - perf[METRIC][r+1, i]

    t_round = time.time() - t_start

    if VERBOSE:
        print(" ({:.1f}s)".format(t_round))
        print("Performance:\n{} ".format(perf[METRIC][r+1]))
        # same report layout for both contribution measures
        for label, contrib in (("SVa:", SVa), ("MLa:", MLa)):
            print(label)
            print(" -Total:\n{}".format(contrib[r]))
            print(" -Cumulative:\n{}".format(contrib.sum(0)))


print("------------------------------------\nFinal performance ({}): {}".format(METRIC, perf[METRIC][-1]))

### Training history visualization

In [None]:
# Metrics to plot (presumably keys of the `perf` histories, like METRIC)
plotmetrics = ['loss_norm', 'accuracy', 'f1']

# Model performance
# NOTE(review): `legends` is not defined in any visible cell; presumably it is
# provided by one of the star imports (helpers / visualization) -- confirm, or
# define it before running this cell.
# NOTE(review): the title mentions "(server)" although `perf` holds one history
# per client and this setting has no central server -- confirm the wording.
fig_perf = perfplots(perf, suptitle='Model Performance (server)', legends=legends, metrics=plotmetrics)


In [None]:
# ROC curve of client 0's aggregated model (global coalition) on the full test loader
fig_ROC = ROC(coal_models_agg[0, global_coals[0]], test_loader_s)

In [None]:
# Contribution plots
# Only the per-round and cumulative views are drawn. Other mode names that were
# listed here before being overridden:
# 'round_maxmax', 'round_minmax', 'minmax', 'maxmax'.
modes = ['round', 'cum']

# One Shapley-value figure and one marginal-loss figure per receiving client,
# created in the order SVa0, MLa0, SVa1, MLa1, ... The client count is fixed
# to 4 here because the figures are bound to the individual names below.
contrib_figs = []
for client in range(4):
    for values, title in ((SVa, "Shapley Values at Aggregation"),
                          (MLa, "Marginal Losses at Aggregation")):
        contrib_figs.append(
            contriplot(values[:, client, :], legends=legends, normalize=True,
                       modes=modes, suptitle=title)
        )

(fig_SVa0, fig_MLa0,
 fig_SVa1, fig_MLa1,
 fig_SVa2, fig_MLa2,
 fig_SVa3, fig_MLa3) = contrib_figs

### Saving

In [None]:
from datetime import datetime
import dill
import os

### Saving the relevant objects
# Timestamped sub-directory name (day_month_hour'h'minute).
# The timestamp is deliberately not stored under the name `time`: that name is
# the `time` module used by the training loop (time.time()), and rebinding it
# to a string would break re-running the loop and would also be persisted in
# the saved session.
timestamp = datetime.now().strftime('%d_%m_%Hh%M')
subdir = "P2PL_Ebola_{}/".format(timestamp)

save_dir = './saves/' + subdir
fig_dir = save_dir + 'figures/'

# Creates save_dir as well, since fig_dir is nested inside it; no error if it already exists
os.makedirs(fig_dir, exist_ok=True)

print(save_dir)

Figures

In [None]:
# Write every figure produced above into fig_dir, one PNG file per figure
for figure, filename in (
    (fig_perf, 'perf.png'),
    (fig_ROC, 'ROC.png'),
    (fig_SVa0, 'SVa0.png'),
    (fig_MLa0, 'MLa0.png'),
    (fig_SVa1, 'SVa1.png'),
    (fig_MLa1, 'MLa1.png'),
    (fig_SVa2, 'SVa2.png'),
    (fig_MLa2, 'MLa2.png'),
    (fig_SVa3, 'SVa3.png'),
    (fig_MLa3, 'MLa3.png'),
):
    figure.savefig(fig_dir + filename)

Variables

In [None]:
# Serialize the current interpreter session with dill into save_dir
dill.dump_session(save_dir + 'variables.pckl')

### Loading

In [None]:
import dill
# NOTE(review): hard-coded path to an earlier run; its "FL_n4_r100_..." name does
# not follow the "P2PL_Ebola_<timestamp>/" pattern produced by the saving cell --
# point it to the run that should be restored.
directory = './saves/FL_n4_r100_25_05_23h42/'
# Restores a session written by dill.dump_session. Loading unpickles arbitrary
# objects, so only load session files from a trusted source.
dill.load_session(directory + 'variables.pckl')