In [1]:
import numpy as np
import time
import torch
import torch.nn as nn
from torchvision import datasets
from torchvision import transforms
from collections import namedtuple
from torch.utils.data.sampler import SubsetRandomSampler
import math


# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [2]:
def data_loader(data_dir,
                batch_size,
                random_seed=42,
                valid_size=0.1,
                shuffle=True,
                test=False):

    normalize = transforms.Normalize(
        mean=[0.4914, 0.4822, 0.4465],
        std=[0.2023, 0.1994, 0.2010],
    )

    # define transforms
    transform = transforms.Compose([
            transforms.Resize((224,224)),
            transforms.ToTensor(),
            normalize,
    ])

    if test:
        dataset = datasets.CIFAR10(
          root=data_dir, train=False,
          download=True, transform=transform,
        )

        data_loader = torch.utils.data.DataLoader(
            dataset, batch_size=batch_size, shuffle=shuffle
        )

        return data_loader

    # load the dataset
    train_dataset = datasets.CIFAR10(
        root=data_dir, train=True,
        download=True, transform=transform,
    )

    valid_dataset = datasets.CIFAR10(
        root=data_dir, train=True,
        download=True, transform=transform,
    )

    num_train = len(train_dataset)
    indices = list(range(num_train))
    split = int(np.floor(valid_size * num_train))

    if shuffle:
        np.random.seed(42)
        np.random.shuffle(indices)

    train_idx, valid_idx = indices[split:], indices[:split]
    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=train_sampler)

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=batch_size, sampler=valid_sampler)

    return (train_loader, valid_loader)


# CIFAR10 dataset
train_loader, valid_loader = data_loader(data_dir='./data',
                                         batch_size=128)

test_loader = data_loader(data_dir='./data',
                              batch_size=128,
                              test=True)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:02<00:00, 70153144.57it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified
Files already downloaded and verified


#Residual block

In [3]:
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride = 1, downsample = None):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Sequential(
                        nn.Conv2d(in_channels, out_channels, kernel_size = 3, stride = stride, padding = 1),
                        nn.BatchNorm2d(out_channels),
                        nn.ReLU())
        self.conv2 = nn.Sequential(
                        nn.Conv2d(out_channels, out_channels, kernel_size = 3, stride = 1, padding = 1),
                        nn.BatchNorm2d(out_channels))
        self.downsample = downsample
        self.relu = nn.ReLU()
        self.out_channels = out_channels

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.conv2(out)
        if self.downsample:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out

#Resnet


In [4]:
class ResNet(nn.Module):
    def __init__(self, block, layers, num_classes = 10):
        super(ResNet, self).__init__()
        self.inplanes = 64
        self.conv1 = nn.Sequential(
                        nn.Conv2d(3, 64, kernel_size = 7, stride = 2, padding = 3),
                        nn.BatchNorm2d(64),
                        nn.ReLU())
        self.maxpool = nn.MaxPool2d(kernel_size = 3, stride = 2, padding = 1)
        self.layer0 = self._make_layer(block, 64, layers[0], stride = 1)
        self.layer1 = self._make_layer(block, 128, layers[1], stride = 2)
        self.layer2 = self._make_layer(block, 256, layers[2], stride = 2)
        self.layer3 = self._make_layer(block, 512, layers[3], stride = 2)
        self.avgpool = nn.AvgPool2d(7, stride=1)
        self.fc = nn.Linear(512, num_classes)

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes:

            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes, kernel_size=1, stride=stride),
                nn.BatchNorm2d(planes),
            )
        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)


    def forward(self, x):
        x = self.conv1(x)
        x = self.maxpool(x)
        x = self.layer0(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x

#Model resnet18

In [5]:
def resnet18(num_classes=10):
    return ResNet(ResidualBlock, [2, 2, 2, 2], num_classes=num_classes)

#Quantization

In [6]:
# Compressor
class IdenticalCompressor(object):
    def __init__(self, size=None, shape=None, args=None):
        pass

    @staticmethod
    def compress(vec):
        return vec.clone()

    @staticmethod
    def decompress(signature):
        return signature

class PSQuantizer():
    def __init__(self, Compressor, parameters, args):
        self.parameters = list(parameters)
        self.num_layers = len(self.parameters)
        self.compressors = list()
        self.compressed_gradients = [list() for _ in range(self.num_layers)]
        self.args = args
        self.error_feedback = args.ef
        self.two_phase = self.args.two_phase
        for param in self.parameters:
            param_size = param.flatten().shape[0]
            self.compressors.append(
                Compressor(param_size, param.shape, args) if param_size > 1000
                else IdenticalCompressor()
            )
            if self.error_feedback:
                param.error = [torch.zeros_like(param)
                               for _ in range(args.num_users)]
            if self.error_feedback and self.two_phase:
                param.server_error = torch.zeros_like(param)

    def record(self, user, epoch):
        if self.args.scale == 'exp':
            scale = (2 / (math.exp(-epoch) + 1) - 1)
        else:
            scale = float(self.args.scale)

        for i, param in enumerate(self.parameters):
            if self.error_feedback:
                param.grad.data.add_(scale * param.error[user])
                decompressed_g = self.compressors[i].decompress(
                    self.compressors[i].compress(param.grad.data)
                )
                param.error[user].data = param.grad.data - decompressed_g
            else:
                decompressed_g = self.compressors[i].decompress(
                    self.compressors[i].compress(param.grad.data)
                )
            self.compressed_gradients[i].append(decompressed_g)

    def apply(self):
        for i, param in enumerate(self.parameters):
            g = torch.stack(self.compressed_gradients[i], dim=0).mean(dim=0)

            # if compress gradient on two phase, i.e.,
            # compress the sum of decompressed gradient
            if self.two_phase:
                if self.error_feedback:
                    g.add_(param.server_error)
                    decompressed_g = self.compressors[i].decompress(
                        self.compressors[i].compress(g))
                    param.server_error = g - decompressed_g
                    g = decompressed_g
                else:
                    g = self.compressors[i].decompress(
                        self.compressors[i].compress(g))

            param.grad.data = g
        for compressed in self.compressed_gradients:
            compressed.clear()

In [7]:

class QSGDCompressor(object):
    def __init__(self, size, shape, args):
        self.random = args.random
        self.bit = args.n_bit
        c_dim = args.c_dim
        assert self.bit > 0

        self.cuda = not args.no_cuda
        self.s = 2 ** self.bit
        self.size = size
        self.shape = shape


        self.code_dtype = torch.int32


    def compress(self, vec):
        """
        :param vec: torch tensor
        :return: norm, signs, quantized_intervals
        """
        vec = vec.view(-1)
        # norm = torch.norm(vec, dim=1, keepdim=True)
        norm = torch.max(torch.abs(vec), dim=0, keepdim=True)[0]
        normalized_vec = vec / norm

        scaled_vec = torch.abs(normalized_vec) * self.s
        l = torch.clamp(scaled_vec, 0, self.s-1).type(self.code_dtype)

        if self.random:
            # l[i] <- l[i] + 1 with probability |v_i| / ||v|| * s - l
            probabilities = scaled_vec - l.type(torch.float32)
            r = torch.rand(l.size())
            if self.cuda:
                r = r.cuda()
            l[:] += (probabilities > r).type(self.code_dtype)

        signs = torch.sign(vec) > 0
        return [norm, signs.view(self.shape), l.view(self.shape)]

    def decompress(self, signature):
        [norm, signs, l] = signature
        assert l.shape == signs.shape
        scaled_vec = l.type(torch.float32) * (2 * signs.type(torch.float32) - 1)
        compressed = (scaled_vec.view(-1)) * norm / self.s
        return compressed.view(self.shape)

#Early stoppage

In [8]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta

    def __call__(self, val_loss, model):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), 'checkpoint.pt')  # save checkpoint
        self.val_loss_min = val_loss


#Training Validation code

In [9]:
def train(train_loader, model, criterion, optimizer, epoch, nodes_number, quantizer):


    """
        Run one train epoch
    """

    # switch to train mode
    nodes = nodes_number

    lossNodes = []

    for i, (input, target) in enumerate(train_loader):


        input_var, target = input.cuda(), target.cuda()


        images, labels = torch.chunk(input_var, nodes), torch.chunk(target, nodes)

        optimizer.zero_grad()



        for node in range(nodes):
          outputnode = model(images[node])
          lossNode = criterion(outputnode, labels[node])
          lossNodes.append(lossNode)

          lossNode.backward()
          quantizer.record(node, epoch=epoch)

        quantizer.apply()
        optimizer.step()


    loss_values = [loss.item() for loss in lossNodes]
    loss_value = sum(loss_values) / len(loss_values)
    print ('Epoch [{}/{}],  Total loss : {:.4f}'
                   .format(epoch+1, num_epochs, loss_value ))


    with torch.no_grad():
        correct = 0
        total = 0
        losses = []
        for images, labels in valid_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)  # Compute validation loss
            losses.append(loss.item())
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            del images, labels, outputs

        print('Accuracy of the network on the {} validation images: {} %'.format(5000, 100 * correct / total))

    # Calculate average validation loss
    average_validation_loss = sum(losses) / len(losses)
    return average_validation_loss

    # Early stopping check


#Setting Hyperparameters

In [10]:
### Default model setup ###
num_classes = 10
num_epochs = 40
batch_size = 128
learning_rate = 0.01
# args used for quantization with defaults set
arguments = namedtuple('arguments', ['ef', 'two_phase', 'n_bit', 'c_dim', 'random', 'no_cuda', 'num_users', 'scale'])
error_feedback = True
two_phase = False
random = True
no_cuda = False
c_dim = 0
scale = 'exp'



#Emilien

In [18]:
n = 0
for p in model.parameters():
  n += len(p.view(-1))
print(n)
norm = n * 32
q = 8
res = 32 + n + n * q
print("8 bits : " + str(norm/res))
q = 4
res = 32 + n + n * q
print("4 bits : " + str(norm/res))
q = 2
res = 32 + n + n * q
print("2 bits : " + str(norm/res))


11186442
8 bits : 3.5555544254400737
4 bits : 6.39999633842677
2 bits : 10.666656495633797


#Baseline

In [None]:
#2 nodes 32 bits

num_nodes = 2
num_bits = 32

# Model
model = resnet18(num_classes).to(device)

#Early stopping
early_stopping = EarlyStopping(patience=17, verbose=True)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay = 0.001, momentum = 0.9)




print(f'###########################################################################\n')
print(f'Running Resnet18 for {num_nodes} node(s) and {num_bits} quantization bits:')
start = time.time()

args = arguments(ef=error_feedback, two_phase=two_phase, n_bit=num_bits, c_dim=c_dim, random=random, no_cuda=no_cuda, num_users=num_nodes, scale=scale)
quantizer = PSQuantizer(QSGDCompressor, model.parameters(), args)
model.train()
for epoch in range(num_epochs):
    average_validation_loss = train(train_loader, model, criterion, optimizer, epoch, num_nodes,  quantizer)

    early_stopping(average_validation_loss, model)
    if early_stopping.early_stop:
        print("Early stopping")
        break




end = time.time()
print(f'Elapsed time: {(end - start) / 60} minutes')
print(f'\n\n')
# initialize the model
model = resnet18(num_classes).to(device)

# Load the best model
model.load_state_dict(torch.load('checkpoint.pt'))

# Put model in evaluation mode
model.eval()

correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        del images, labels, outputs

print('Accuracy of the model on the test images: {} %'.format(100 * correct / total))



#2 nodes 2 bits

In [11]:
#2 nodes 2 bits

num_nodes = 2
num_bits = 2
# Model
model = resnet18(num_classes).to(device)

#Early stopping
early_stopping = EarlyStopping(patience=15, verbose=True)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay = 0.001, momentum = 0.9)




print(f'###########################################################################\n')
print(f'Running Resnet18 for {num_nodes} node(s) and {num_bits} quantization bits:')
start = time.time()

args = arguments(ef=error_feedback, two_phase=two_phase, n_bit=num_bits, c_dim=c_dim, random=random, no_cuda=no_cuda, num_users=num_nodes, scale=scale)
quantizer = PSQuantizer(QSGDCompressor, model.parameters(), args)
model.train()
for epoch in range(num_epochs):
    average_validation_loss = train(train_loader, model, criterion, optimizer, epoch, num_nodes,  quantizer)

    early_stopping(average_validation_loss, model)
    if early_stopping.early_stop:
        print("Early stopping")
        break




end = time.time()
print(f'Elapsed time: {(end - start) / 60} minutes')
print(f'\n\n')
# initialize the model
model = resnet18(num_classes).to(device)

# Load the best model
model.load_state_dict(torch.load('checkpoint.pt'))

# Put model in evaluation mode
model.eval()

correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        del images, labels, outputs

print('Accuracy of the model on the test images: {} %'.format(100 * correct / total))



###########################################################################

Running Resnet18 for 2 node(s) and 2 quantization bits:
Epoch [1/40],  Total loss : 1.4854
Accuracy of the network on the 5000 validation images: 59.96 %
Validation loss decreased (inf --> 1.114754).  Saving model ...
Epoch [2/40],  Total loss : 0.9264
Accuracy of the network on the 5000 validation images: 72.7 %
Validation loss decreased (1.114754 --> 0.762367).  Saving model ...
Epoch [3/40],  Total loss : 0.6816
Accuracy of the network on the 5000 validation images: 77.02 %
Validation loss decreased (0.762367 --> 0.671238).  Saving model ...
Epoch [4/40],  Total loss : 0.5381
Accuracy of the network on the 5000 validation images: 79.84 %
Validation loss decreased (0.671238 --> 0.573482).  Saving model ...
Epoch [5/40],  Total loss : 0.4405
Accuracy of the network on the 5000 validation images: 81.06 %
Validation loss decreased (0.573482 --> 0.564173).  Saving model ...
Epoch [6/40],  Total loss : 0.3505
Acc

In [11]:
#2 nodes 4 bits

num_nodes = 2
num_bits = 4
# Model
model = resnet18(num_classes).to(device)

#Early stopping
early_stopping = EarlyStopping(patience=15, verbose=True)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay = 0.001, momentum = 0.9)




print(f'###########################################################################\n')
print(f'Running Resnet18 for {num_nodes} node(s) and {num_bits} quantization bits:')
start = time.time()

args = arguments(ef=error_feedback, two_phase=two_phase, n_bit=num_bits, c_dim=c_dim, random=random, no_cuda=no_cuda, num_users=num_nodes, scale=scale)
quantizer = PSQuantizer(QSGDCompressor, model.parameters(), args)
model.train()
for epoch in range(num_epochs):
    average_validation_loss = train(train_loader, model, criterion, optimizer, epoch, num_nodes,  quantizer)

    early_stopping(average_validation_loss, model)
    if early_stopping.early_stop:
        print("Early stopping")
        break




end = time.time()
print(f'Elapsed time: {(end - start) / 60} minutes')
print(f'\n\n')
# initialize the model
model = resnet18(num_classes).to(device)

# Load the best model
model.load_state_dict(torch.load('checkpoint.pt'))

# Put model in evaluation mode
model.eval()

correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        del images, labels, outputs

print('Accuracy of the model on the test images: {} %'.format(100 * correct / total))



###########################################################################

Running Resnet18 for 2 node(s) and 4 quantization bits:
Epoch [1/40],  Total loss : 1.4764
Accuracy of the network on the 5000 validation images: 59.82 %
Validation loss decreased (inf --> 1.095513).  Saving model ...
Epoch [2/40],  Total loss : 0.9141
Accuracy of the network on the 5000 validation images: 74.38 %
Validation loss decreased (1.095513 --> 0.741983).  Saving model ...
Epoch [3/40],  Total loss : 0.6692
Accuracy of the network on the 5000 validation images: 77.2 %
Validation loss decreased (0.741983 --> 0.684616).  Saving model ...
Epoch [4/40],  Total loss : 0.5322
Accuracy of the network on the 5000 validation images: 77.34 %
Validation loss decreased (0.684616 --> 0.648937).  Saving model ...
Epoch [5/40],  Total loss : 0.4337
Accuracy of the network on the 5000 validation images: 80.52 %
Validation loss decreased (0.648937 --> 0.584483).  Saving model ...
Epoch [6/40],  Total loss : 0.3585
Acc

#2 nodes 8 bits

In [11]:
#2 nodes 8 bits

num_nodes = 2
num_bits = 8
# Model
model = resnet18(num_classes).to(device)

#Early stopping
early_stopping = EarlyStopping(patience=17, verbose=True)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay = 0.001, momentum = 0.9)




print(f'###########################################################################\n')
print(f'Running Resnet18 for {num_nodes} node(s) and {num_bits} quantization bits:')
start = time.time()

args = arguments(ef=error_feedback, two_phase=two_phase, n_bit=num_bits, c_dim=c_dim, random=random, no_cuda=no_cuda, num_users=num_nodes, scale=scale)
quantizer = PSQuantizer(QSGDCompressor, model.parameters(), args)
model.train()
for epoch in range(num_epochs):
    average_validation_loss = train(train_loader, model, criterion, optimizer, epoch, num_nodes,  quantizer)

    early_stopping(average_validation_loss, model)
    if early_stopping.early_stop:
        print("Early stopping")
        break




end = time.time()
print(f'Elapsed time: {(end - start) / 60} minutes')
print(f'\n\n')
# initialize the model
model = resnet18(num_classes).to(device)

# Load the best model
model.load_state_dict(torch.load('checkpoint.pt'))

# Put model in evaluation mode
model.eval()

correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        del images, labels, outputs

print('Accuracy of the model on the test images: {} %'.format(100 * correct / total))



###########################################################################

Running Resnet18 for 2 node(s) and 8 quantization bits:
Epoch [1/40],  Total loss : 1.5110
Accuracy of the network on the 5000 validation images: 59.62 %
Validation loss decreased (inf --> 1.117321).  Saving model ...
Epoch [2/40],  Total loss : 0.9354
Accuracy of the network on the 5000 validation images: 70.34 %
Validation loss decreased (1.117321 --> 0.832653).  Saving model ...
Epoch [3/40],  Total loss : 0.6936
Accuracy of the network on the 5000 validation images: 76.66 %
Validation loss decreased (0.832653 --> 0.664794).  Saving model ...
Epoch [4/40],  Total loss : 0.5472
Accuracy of the network on the 5000 validation images: 79.54 %
Validation loss decreased (0.664794 --> 0.603012).  Saving model ...
Epoch [5/40],  Total loss : 0.4516
Accuracy of the network on the 5000 validation images: 81.98 %
Validation loss decreased (0.603012 --> 0.553483).  Saving model ...
Epoch [6/40],  Total loss : 0.3638
Ac