# FLDetector for Fashion MNIST


In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

  from IPython.core.display import display, HTML


In [2]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
import torch.optim as optim
from torch.autograd import Variable
import numpy as np
import random
import copy
import time
from functools import reduce
from torchsummary import summary

import os
import sys
import pickle
sys.path.insert(0,'./utils/')
from logger import *
from eval import *
from misc import *

from sklearn.metrics import roc_auc_score
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from collections import defaultdict

from SGD import *
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

  warn(f"Failed to load image Python extension: {e}")


cuda


In [3]:
transform = transforms.Compose([transforms.ToTensor()])
trainset = torchvision.datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)
testset = torchvision.datasets.FashionMNIST(root='./data', train=False, download=True, transform=transform)

In [19]:
from torch.nn.utils import parameters_to_vector, vector_to_parameters

def train(train_data, labels, model, optimizer, batch_size=20):
    model.train()
    criterion = nn.CrossEntropyLoss()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    len_t = (len(train_data) // batch_size)
    if len(train_data)%batch_size:
        len_t += 1

    r=np.arange(len(train_data))
    np.random.shuffle(r)
    
    train_data = train_data[r]
    labels = labels[r]
    
    for ind in range(len_t):

        inputs = train_data[ind * batch_size:(ind + 1) * batch_size]
        targets = labels[ind * batch_size:(ind + 1) * batch_size]

        inputs, targets = inputs.cuda(), targets.cuda()

        inputs, targets = torch.autograd.Variable(inputs), torch.autograd.Variable(targets)
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(outputs.data, targets.data, topk=(1, 5))
        losses.update(loss.item(), inputs.size(0))
        top1.update(prec1.item(), inputs.size(0))
        top5.update(prec5.item(), inputs.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return (losses.avg, top1.avg)


def test(test_data, labels, model, criterion, use_cuda, debug_='MEDIUM', batch_size=64):
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    model.eval()
    len_t = (len(test_data) // batch_size)
    if len(test_data)%batch_size:
        len_t += 1

    with torch.no_grad():
        for ind in range(len_t):
            # measure data loading time
            inputs = test_data[ind * batch_size:(ind + 1) * batch_size]
            targets = labels[ind * batch_size:(ind + 1) * batch_size]

            inputs, targets = inputs.cuda(), targets.cuda()

            inputs, targets = torch.autograd.Variable(inputs), torch.autograd.Variable(targets)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            
            prec1, prec5 = accuracy(outputs.data, targets.data, topk=(1, 5))
            losses.update(loss.item(), inputs.size(0))
            top1.update(prec1.item(), inputs.size(0))
            top5.update(prec5.item(), inputs.size(0))

    return (losses.avg, top1.avg)

In [4]:
def sample_dirichlet_train_data(trainset, no_participants, alpha=0.9, force=False):
        """
            Input: Number of participants and alpha (param for distribution)
            Output: A list of indices denoting data in CIFAR training set.
            Requires: cifar_classes, a preprocessed class-indice dictionary.
            Sample Method: take a uniformly sampled 10-dimension vector as parameters for
            dirichlet distribution to sample number of images in each class.
        """
        if not os.path.exists('./dirichlet_a_%.1f_nusers_%d.pkl'%(alpha, no_participants)) or force:
            print('generating participant indices for alpha %.1f'%alpha)
            np.random.seed(0)
            cifar_classes = {}
            for ind, x in enumerate(trainset):
                _, label = x
                if label in cifar_classes:
                    cifar_classes[label].append(ind)
                else:
                    cifar_classes[label] = [ind]

            per_participant_list = defaultdict(list)
            no_classes = len(cifar_classes.keys())
            for n in range(no_classes):
                random.shuffle(cifar_classes[n])
                sampled_probabilities = len(cifar_classes[n]) * np.random.dirichlet(
                    np.array(no_participants * [alpha]))
                for user in range(no_participants):
                    no_imgs = int(round(sampled_probabilities[user]))
                    sampled_list = cifar_classes[n][:min(len(cifar_classes[n]), no_imgs)]
                    per_participant_list[user].extend(sampled_list)
                    cifar_classes[n] = cifar_classes[n][min(len(cifar_classes[n]), no_imgs):]
            with open('./dirichlet_a_%.1f_nusers_%d.pkl'%(alpha, no_participants), 'wb') as f:
                pickle.dump(per_participant_list, f)
        else:
            per_participant_list = pickle.load(open('./dirichlet_a_%.1f_nusers_%d.pkl'%(alpha, no_participants), 'rb'))
            
        return per_participant_list

In [7]:
def get_fang_train_data(trainset, num_workers=100, bias=0.5, force=False):
    dist_file = 'fang_nworkers%d_bias%.1f.pkl' % (num_workers, bias)
    if not force and os.path.exists(dist_file):
        print('Loading fang distribution for num_workers %d and bias %.1f from memory' % (num_workers, bias))
        return pickle.load(open(dist_file, 'rb'))
    bias_weight = bias
    other_group_size = (1 - bias_weight) / 9.
    worker_per_group = num_workers / 10
    each_worker_data = [[] for _ in range(num_workers)]
    each_worker_label = [[] for _ in range(num_workers)]
    per_participant_list = defaultdict(list)
    for i, (x, y) in enumerate(trainset):
        # assign a data point to a group
        upper_bound = (y) * (1 - bias_weight) / 9. + bias_weight
        lower_bound = (y) * (1 - bias_weight) / 9.
        rd = np.random.random_sample()
        if rd > upper_bound:
            worker_group = int(np.floor((rd - upper_bound) / other_group_size) + y + 1)
        elif rd < lower_bound:
            worker_group = int(np.floor(rd / other_group_size))
        else:
            worker_group = y
        rd = np.random.random_sample()
        selected_worker = int(worker_group * worker_per_group + int(np.floor(rd * worker_per_group)))
        per_participant_list[selected_worker].extend([i])
    
    print('Saving fang distribution for num_workers %d and bias %.1f from memory' % (num_workers, bias))
    pickle.dump(per_participant_list, open(dist_file, 'wb'))
    return per_participant_list

In [15]:
def get_federated_data(trainset, num_workers, distribution='fang', param=1, force=False):
    if distribution == 'fang':
        per_participant_list = get_fang_train_data(trainset, num_workers, bias=param, force=force)
    elif distribution == 'dirichlet':
        per_participant_list = sample_dirichlet_train_data(trainset, num_workers, alpha=param, force=force)

    each_worker_idx = [[] for _ in range(num_workers)]
    each_worker_val_idx = [[] for _ in range(num_workers)]
    each_worker_te_idx = [[] for _ in range(num_workers)]
    
    each_worker_data = [[] for _ in range(num_workers)]
    each_worker_label = [[] for _ in range(num_workers)]
    
    each_worker_val_data = [[] for _ in range(num_workers)]
    each_worker_val_label = [[] for _ in range(num_workers)]
    
    each_worker_te_data = [[] for _ in range(num_workers)]
    each_worker_te_label = [[] for _ in range(num_workers)]
    
    np.random.seed(0)
    for worker_idx in range(len(per_participant_list)):
        w_indices = np.array(per_participant_list[worker_idx])
        w_len = len(w_indices)
        len_tr = int(5 * w_len/7)
        len_val = int(w_len/7)
        np.random.shuffle(w_indices)
        
        tr_idx = w_indices[:len_tr]
        val_idx = w_indices[len_tr: len_tr+len_val]
        te_idx = w_indices[len_tr+len_val:]

        for idx in tr_idx:
            each_worker_data[worker_idx].append(trainset[idx][0])
            each_worker_label[worker_idx].append(trainset[idx][1])
        each_worker_data[worker_idx] = torch.stack(each_worker_data[worker_idx])
        each_worker_label[worker_idx] = torch.Tensor(each_worker_label[worker_idx]).long()
        
        for idx in val_idx:
            each_worker_val_data[worker_idx].append(trainset[idx][0])
            each_worker_val_label[worker_idx].append(trainset[idx][1])
        each_worker_val_data[worker_idx] = torch.stack(each_worker_val_data[worker_idx])
        each_worker_val_label[worker_idx] = torch.Tensor(each_worker_val_label[worker_idx]).long()
        
        for idx in te_idx:
            each_worker_te_data[worker_idx].append(trainset[idx][0])
            each_worker_te_label[worker_idx].append(trainset[idx][1])
        each_worker_te_data[worker_idx] = torch.stack(each_worker_te_data[worker_idx])
        each_worker_te_label[worker_idx] = torch.Tensor(each_worker_te_label[worker_idx]).long()
    
    global_val_data = torch.concatenate(each_worker_val_data)
    global_val_label = torch.concatenate(each_worker_val_label)
    
    global_te_data = torch.concatenate(each_worker_te_data)
    global_te_label = torch.concatenate(each_worker_te_label)
    
    return each_worker_data, each_worker_label, each_worker_val_data, each_worker_val_label, each_worker_te_data, each_worker_te_label, global_val_data, global_val_label, global_te_data, global_te_label

In [10]:
class cnn(nn.Module):
    def __init__(self):
        super(cnn, self).__init__()
        self.conv1 = nn.Conv2d(1, 30, 3)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(30, 50, 3)
        self.pool2 = nn.MaxPool2d(2,2)
        self.fc1 = nn.Linear(1250, 200)
        self.fc2 = nn.Linear(200, 10)

    def forward(self, x):
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [11]:
model = cnn()
sum(p.numel() for p in model.parameters())

266060

In [16]:
all_data = torch.utils.data.ConcatDataset((trainset, testset))
num_workers = 100
distribution='fang'
param = .5
force = False

each_worker_data, each_worker_label, each_worker_val_data, each_worker_val_label, each_worker_te_data, each_worker_te_label, global_val_data, global_val_label, global_te_data, global_te_label = get_federated_data(
    all_data, num_workers=num_workers, distribution=distribution, param=param, force=force)

Loading fang distribution for num_workers 100 and bias 0.5 from memory


In [17]:
len(global_te_label), len(global_val_label)

(8664, 8526)

In [20]:
criterion = nn.CrossEntropyLoss()
use_cuda = torch.cuda.is_available()
def init_weights(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.1)

ft_lrs = [0.01, 0.005, 0.001, 0.0001]
ft_epochs = [5, 10, 15, 20]
batch_sizes = [10, 20, 30, 40]
te_accs, val_accs = [], []

results = {}
for ft_lr in ft_lrs:
    results[ft_lr] = {}
    for ft_ne in ft_epochs:
        results[ft_lr][ft_ne] = {}
        for bs in batch_sizes:
            acc_losses = []
            te_acc_losses = []
            for i in range(len(each_worker_data)):
                model = cnn().to(device)
                model.apply(init_weights)
                optimizer = optim.SGD(model.parameters(), lr = ft_lr, momentum=.9, weight_decay=5e-5)
                for epoch in range(ft_ne):
                    train_loss, train_acc = train(each_worker_data[i].reshape(-1, 1, 28, 28), torch.Tensor(each_worker_label[i]).long(), model, optimizer, batch_size=bs)
                val_loss, val_acc = test(each_worker_val_data[i], each_worker_val_label[i], model, criterion, use_cuda, batch_size=100)
                te_loss, te_acc = test(each_worker_te_data[i], each_worker_te_label[i], model, criterion, use_cuda, batch_size=100)

                val_accs.append(val_acc)
                te_accs.append(te_acc)

            acc_losses = np.array(acc_losses)
            print('lr %.4f ne %d bs %d | val acc %.2f std %.2f | test acc %.2f std %.2f' % (
                ft_lr, ft_ne, bs, np.array(val_accs).mean(), np.array(val_accs).std(), np.array(te_accs).mean(), np.array(te_accs).std()))

lr 0.0100 ne 5 bs 10 | val acc 71.11 std 5.61 | test acc 71.20 std 5.39
lr 0.0100 ne 5 bs 20 | val acc 69.58 std 6.15 | test acc 70.34 std 5.53
lr 0.0100 ne 5 bs 30 | val acc 68.24 std 6.43 | test acc 69.17 std 5.81
lr 0.0100 ne 5 bs 40 | val acc 66.67 std 7.02 | test acc 67.71 std 6.65
lr 0.0100 ne 10 bs 10 | val acc 68.14 std 7.26 | test acc 69.14 std 6.96
lr 0.0100 ne 10 bs 20 | val acc 68.99 std 7.14 | test acc 69.95 std 6.83
lr 0.0100 ne 10 bs 30 | val acc 69.48 std 7.03 | test acc 70.31 std 6.63
lr 0.0100 ne 10 bs 40 | val acc 69.65 std 6.84 | test acc 70.39 std 6.44
lr 0.0100 ne 15 bs 10 | val acc 70.28 std 6.88 | test acc 70.98 std 6.50
lr 0.0100 ne 15 bs 20 | val acc 70.79 std 6.93 | test acc 71.40 std 6.47
lr 0.0100 ne 15 bs 30 | val acc 71.08 std 6.86 | test acc 71.70 std 6.41
lr 0.0100 ne 15 bs 40 | val acc 71.20 std 6.77 | test acc 71.83 std 6.33
lr 0.0100 ne 20 bs 10 | val acc 71.61 std 6.77 | test acc 72.26 std 6.36
lr 0.0100 ne 20 bs 20 | val acc 71.91 std 6.76 | test a

In [21]:
criterion = nn.CrossEntropyLoss()
use_cuda = torch.cuda.is_available()
def init_weights(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.1)

ft_lrs = [0.01]
ft_epochs = [20]
batch_sizes = [50, 60, 70, 80, 90, 100]
te_accs, val_accs = [], []

results = {}
for ft_lr in ft_lrs:
    results[ft_lr] = {}
    for ft_ne in ft_epochs:
        results[ft_lr][ft_ne] = {}
        for bs in batch_sizes:
            acc_losses = []
            te_acc_losses = []
            for i in range(len(each_worker_data)):
                model = cnn().to(device)
                model.apply(init_weights)
                optimizer = optim.SGD(model.parameters(), lr = ft_lr, momentum=.9, weight_decay=5e-5)
                for epoch in range(ft_ne):
                    train_loss, train_acc = train(each_worker_data[i].reshape(-1, 1, 28, 28), torch.Tensor(each_worker_label[i]).long(), model, optimizer, batch_size=bs)
                val_loss, val_acc = test(each_worker_val_data[i], each_worker_val_label[i], model, criterion, use_cuda, batch_size=100)
                te_loss, te_acc = test(each_worker_te_data[i], each_worker_te_label[i], model, criterion, use_cuda, batch_size=100)

                val_accs.append(val_acc)
                te_accs.append(te_acc)

            acc_losses = np.array(acc_losses)
            print('lr %.4f ne %d bs %d | val acc %.2f std %.2f | test acc %.2f std %.2f' % (
                ft_lr, ft_ne, bs, np.array(val_accs).mean(), np.array(val_accs).std(), np.array(te_accs).mean(), np.array(te_accs).std()))

lr 0.0100 ne 20 bs 50 | val acc 73.52 std 5.11 | test acc 74.11 std 5.15
lr 0.0100 ne 20 bs 60 | val acc 73.19 std 5.19 | test acc 74.01 std 4.93
lr 0.0100 ne 20 bs 70 | val acc 72.72 std 5.32 | test acc 73.67 std 4.88
lr 0.0100 ne 20 bs 80 | val acc 72.47 std 5.30 | test acc 73.54 std 4.90
lr 0.0100 ne 20 bs 90 | val acc 72.17 std 5.36 | test acc 73.36 std 4.92
lr 0.0100 ne 20 bs 100 | val acc 71.90 std 5.47 | test acc 73.02 std 5.00


In [24]:
criterion = nn.CrossEntropyLoss()
use_cuda = torch.cuda.is_available()
def init_weights(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.1)

ft_lrs = [0.01]
ft_epochs = [20]
batch_sizes = [50]
te_accs, val_accs = [], []
home_dir = '/home/vshejwalkar_umass_edu/fedrecover/'
results = {}
for ft_lr in ft_lrs:
    results[ft_lr] = {}
    for ft_ne in ft_epochs:
        results[ft_lr][ft_ne] = {}
        for bs in batch_sizes:
            acc_losses = []
            te_acc_losses = []
            for i in range(len(each_worker_data)):
                model = cnn().to(device)
                model.apply(init_weights)
                optimizer = optim.SGD(model.parameters(), lr = ft_lr, momentum=.9, weight_decay=5e-5)
                for epoch in range(ft_ne):
                    train_loss, train_acc = train(each_worker_data[i].reshape(-1, 1, 28, 28), torch.Tensor(each_worker_label[i]).long(), model, optimizer, batch_size=bs)
                val_loss, val_acc = test(each_worker_val_data[i], each_worker_val_label[i], model, criterion, use_cuda, batch_size=100)
                te_loss, te_acc = test(each_worker_te_data[i], each_worker_te_label[i], model, criterion, use_cuda, batch_size=100)

                val_accs.append(val_acc)
                te_accs.append(te_acc)

            acc_losses = np.array(acc_losses)
            print('lr %.4f ne %d bs %d | val acc %.2f std %.2f | test acc %.2f std %.2f' % (
                ft_lr, ft_ne, bs, np.array(val_accs).mean(), np.array(val_accs).std(), np.array(te_accs).mean(), np.array(te_accs).std()))

pickle.dump([val_accs, te_accs], open(os.path.join(home_dir, 'FLDetector_plots_data/fashion_personalized_eval_local_train.pkl'), 'wb'))

lr 0.0100 ne 20 bs 50 | val acc 73.70 std 5.75 | test acc 74.02 std 4.79


# Good FedAvg baseline Fashion MNIST + Fang distribution + 80 clients

# Mean without any attack

In [59]:
torch.cuda.empty_cache()
use_cuda = torch.cuda.is_available()

local_epochs = 2
batch_size = 16
num_workers = 100
local_lr = 0.01
global_lr = 1
nepochs = 50
nbyz = 20

best_global_acc=0
epoch_num = 0
def init_weights(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)
fed_model = cnn().to(device)
fed_model.apply(init_weights)
model_received = []
for i, (name, param) in enumerate(fed_model.state_dict().items()):
    model_received = param.view(-1).data.type(torch.cuda.FloatTensor) if len(model_received) == 0 else torch.cat((model_received, param.view(-1).data.type(torch.cuda.FloatTensor)))

best_accs_per_round = []
accs_per_round = []
loss_per_round = []
home_dir = '/home/vshejwalkar_umass_edu/fedrecover/'

while epoch_num <= nepochs:
    torch.cuda.empty_cache()
    round_clients = np.arange(nbyz, num_workers)
    round_benign = round_clients
    user_updates=[]
    benign_norm = 0
    
    for i in round_benign:
        model = copy.deepcopy(fed_model)
        optimizer = optim.SGD(model.parameters(), lr = lr, momentum=0.9, weight_decay=1e-4)
        for epoch in range(local_epochs):
            train_loss, train_acc = train(
                each_worker_data[i], torch.Tensor(each_worker_label[i]).long(), model, optimizer, batch_size)

        params = []
        for i, (name, param) in enumerate(model.state_dict().items()):
            params = param.view(-1).data.type(torch.cuda.FloatTensor) if len(params) == 0 else torch.cat(
                (params, param.view(-1).data.type(torch.cuda.FloatTensor)))

        update =  (params - model_received)
        benign_norm += torch.norm(update)/len(round_benign)
        user_updates = update[None,:] if len(user_updates) == 0 else torch.cat((user_updates, update[None,:]), 0)

    agg_update = torch.mean(user_updates, 0)
    del user_updates

    model_received = model_received + global_lr * agg_update
    fed_model = cnn().to(device)
    fed_model.apply(init_weights)
    
    start_idx=0
    state_dict = {}
    previous_name = 'none'
    for i, (name, param) in enumerate(fed_model.state_dict().items()):
        start_idx = 0 if i == 0 else start_idx + len(fed_model.state_dict()[previous_name].data.view(-1))
        start_end = start_idx + len(fed_model.state_dict()[name].data.view(-1))
        params = model_received[start_idx:start_end].reshape(fed_model.state_dict()[name].data.shape)
        state_dict[name] = params
        previous_name = name
    fed_model.load_state_dict(state_dict)
    val_loss, val_acc = test(global_test_data, global_test_label.long(), fed_model, criterion, use_cuda)
    is_best = best_global_acc < val_acc
    best_global_acc = max(best_global_acc, val_acc)
    best_accs_per_round.append(best_global_acc)
    accs_per_round.append(val_acc)
    loss_per_round.append(val_loss)
    if epoch_num%10==0 or epoch_num==nepochs-1:
        print('e %d val loss %.3f val acc %.3f best val_acc %.3f'% (epoch_num, val_loss, val_acc, best_global_acc))
    epoch_num+=1

final_accs_per_client=[]
for i in range(num_workers):
    client_loss, client_acc = test(each_worker_te_data[i], each_worker_te_label[i].long(),
                                   fed_model, criterion, use_cuda)
    final_accs_per_client.append(client_acc)
results = collections.OrderedDict(
    final_accs_per_client=np.array(final_accs_per_client),
    accs_per_round=np.array(accs_per_round),
    best_accs_per_round=np.array(best_accs_per_round),
    loss_per_round=np.array(loss_per_round)
)
pickle.dump(results, open(os.path.join(home_dir, 'FLDetector_plots_data/rq1_baseline_fashion_fast_mean.pkl'), 'wb'))

e 0 val loss 1.402 val acc 69.700 best val_acc 69.700
e 10 val loss 0.442 val acc 83.979 best val_acc 83.979
e 20 val loss 0.362 val acc 87.016 best val_acc 87.016
e 30 val loss 0.327 val acc 88.270 best val_acc 88.290
e 40 val loss 0.306 val acc 89.246 best val_acc 89.266
e 49 val loss 0.295 val acc 89.585 best val_acc 89.585
e 50 val loss 0.293 val acc 89.565 best val_acc 89.585
