In [1]:
import numpy as np
import copy
import torch
import torch.nn as nn
import torch.nn.utils as utils
from torchvision import datasets
from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Mounted at /content/drive


In [2]:
### LOADING DATA ###

def get_train_valid_loader(data_dir, batch_size, augment, random_seed, valid_size=0.1, shuffle=True):
    normalize = transforms.Normalize(
        mean=[0.4914, 0.4822, 0.4465],
        std=[0.2023, 0.1994, 0.2010],
    )

    # Define transform
    transform = transforms.Compose([
        transforms.Resize((227, 227)),
        transforms.ToTensor(),
        normalize,
    ])

    # Load the dataset
    train_dataset = datasets.CIFAR10(
        root=data_dir, train=True,
        download=True, transform=transform,
    )
    valid_dataset = datasets.CIFAR10(
        root=data_dir, train=True,
        download=True, transform=transform,
    )

    num_train = len(train_dataset)
    indices = list(range(num_train))
    split = int(np.floor(valid_size * num_train))

    if shuffle:
        np.random.seed(random_seed)
        np.random.shuffle(indices)

    train_idx1, train_idx2 = indices[split:(split + num_train) // 2], indices[(split + num_train) // 2:]
    valid_idx = indices[:split]
    train_sampler1 = SubsetRandomSampler(train_idx1)
    train_sampler2 = SubsetRandomSampler(train_idx2)
    valid_sampler = SubsetRandomSampler(valid_idx)

    train_loader1 = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=train_sampler1
    )
    train_loader2 = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=train_sampler2
    )

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=batch_size, sampler=valid_sampler
    )

    return (train_loader1, train_loader2, valid_loader)


def get_test_loader(data_dir, batch_size, shuffle=True):
    normalize = transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    )
    # define transform
    transform = transforms.Compose([
        transforms.Resize((227,227)),
        transforms.ToTensor(),
        normalize,
    ])
    dataset = datasets.CIFAR10(
        root=data_dir, train=False,
        download=True, transform=transform,
    )
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle
    )
    return data_loader


# CIFAR10 dataset
train_loader1, train_loader2, valid_loader = get_train_valid_loader(data_dir = './data', batch_size = 64, augment = False, random_seed = 1)

test_loader = get_test_loader(data_dir = './data', batch_size = 64)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:12<00:00, 13301791.79it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified
Files already downloaded and verified


In [3]:
## taken from https://blog.paperspace.com/alexnet-pytorch/

class AlexNet(nn.Module):
    def __init__(self, num_classes=10):
        super(AlexNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 96, kernel_size=11, stride=4, padding=0),
            nn.BatchNorm2d(96),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 3, stride = 2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(96, 256, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 3, stride = 2))
        self.layer3 = nn.Sequential(
            nn.Conv2d(256, 384, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(384),
            nn.ReLU())
        self.layer4 = nn.Sequential(
            nn.Conv2d(384, 384, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(384),
            nn.ReLU())
        self.layer5 = nn.Sequential(
            nn.Conv2d(384, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 3, stride = 2))
        self.fc = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(9216, 4096),
            nn.ReLU())
        self.fc1 = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU())
        self.fc2= nn.Sequential(
            nn.Linear(4096, num_classes))

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = self.layer5(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        out = self.fc1(out)
        out = self.fc2(out)
        return out

In [4]:
### QUANTIZATION CODE ###

def quantize(vec, num_bits, stochastic):
    """
    :param : torch tensor, number of bits
    :return: quantized vector
    """
    s = 2 ** num_bits
    shape = vec.shape

    # Normalized vector
    vec = vec.view(-1)
    # norm = torch.norm(vec, dim=1, keepdim=True)
    norm = torch.max(torch.abs(vec), dim=0, keepdim=True)[0]
    normalized_vec = vec / norm

    # Epsilon
    scaled_vec = (torch.abs(normalized_vec) * s).to(device)
    l = torch.clamp(scaled_vec, 0, s-1).type(torch.int32)

    if stochastic:
      probabilities = scaled_vec - l.type(torch.float32)
      r = torch.rand(l.size()).to(device)
      l[:] += (probabilities > r).type(torch.int32) - (l == s-1).type(torch.int32)

    # Sign
    signs = torch.where(torch.sign(vec) > 0, torch.tensor(1), torch.tensor(-1)).to(device)

    m = torch.mul(torch.mul(l.view(shape), signs.view(shape)), norm / (s-1.0))
    return m

def map_gradients(model, num_bits, stochastic, previous):
  quantized_grads = []
  prev_grads = []
  for p, prev in zip(model.parameters(), previous):
    gradients = quantize(p.grad.data.clone() + prev, num_bits, stochastic) # compute quantized gradient
    p.grad.data.copy_(gradients) # set gradient to quantized gradient
    quantized_grads.append(gradients)
    prev_grads.append(p.grad.data - gradients)
  return quantized_grads, prev_grads

In [5]:
def test_and_save_best(mem_models, num_bits):
  # Sort the models based on accuracy in descending order
  sorted_models = sorted(mem_models, key=lambda x: x[0], reverse=True)

  # Select the top 3 models
  top_models = sorted_models[:3]

  for acc, model, epoch in top_models:
    print("Epoch : " + str(epoch) + ", Validation Accuracy : " + str(acc))

  best_accuracy = 0.0
  best_model = None
  best_epoch = 0

  # Evaluate the top models on the test data
  for accuracy, model, epoch in top_models:
      model.eval()  # Set the model to evaluation mode
      with torch.no_grad():
          correct = 0
          total = 0
          for images, labels in test_loader:
              images = images.to(device)
              labels = labels.to(device)
              outputs = model(images)
              _, predicted = torch.max(outputs.data, 1)
              total += labels.size(0)
              correct += (predicted == labels).sum().item()
          accuracy = 100 * correct / total
          print('Accuracy of the network on the {} test images: {} %'.format(10000, accuracy))

          # Check if this model has the best accuracy so far
          if accuracy > best_accuracy:
              best_accuracy = accuracy
              best_model = model
              best_epoch = epoch

  # Save the best model to disk
  torch.save(best_model.state_dict(), '/content/drive/MyDrive/OptML/best_model_{}_{}_{}.pth'.format(num_bits, best_accuracy, best_epoch))
  print('Best model saved with accuracy: {} %, at epoch : {}'.format(best_accuracy, best_epoch))

In [6]:
### BASELINE MODEL ###

def get_train_valid_loader(data_dir,batch_size,augment,random_seed,valid_size=0.1,shuffle=True):
    normalize = transforms.Normalize(
        mean=[0.4914, 0.4822, 0.4465],
        std=[0.2023, 0.1994, 0.2010],
    )

    # define transforms
    transform = transforms.Compose([
            transforms.Resize((227,227)),
            transforms.ToTensor(),
            normalize,
    ])

    # load the dataset
    train_dataset = datasets.CIFAR10(
        root=data_dir, train=True,
        download=True, transform=transform,
    )

    valid_dataset = datasets.CIFAR10(
        root=data_dir, train=True,
        download=True, transform=transform,
    )

    num_train = len(train_dataset)
    indices = list(range(num_train))
    split = int(np.floor(valid_size * num_train))

    if shuffle:
        np.random.seed(random_seed)
        np.random.shuffle(indices)

    train_idx, valid_idx = indices[split:], indices[:split]
    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=train_sampler)

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=batch_size, sampler=valid_sampler)

    return (train_loader, valid_loader)


def get_test_loader(data_dir,batch_size,shuffle=True):
    normalize = transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    )

    # define transform
    transform = transforms.Compose([
        transforms.Resize((227,227)),
        transforms.ToTensor(),
        normalize,
    ])

    dataset = datasets.CIFAR10(
        root=data_dir, train=False,
        download=True, transform=transform,
    )

    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle
    )

    return data_loader


# CIFAR10 dataset
train_loader, valid_loader = get_train_valid_loader(data_dir = './data',batch_size = 64,
                       augment = False,random_seed = 1)

test_loader = get_test_loader(data_dir = './data',
                              batch_size = 64)

num_classes = 10
num_epochs = 20
batch_size = 64
learning_rate = 0.005

mem_models = []

model = AlexNet(num_classes).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay = 0.005, momentum = 0.9)

# Train the model
total_step = len(train_loader)


for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Move tensors to the configured device
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

    # Validation
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in valid_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            del images, labels, outputs

        print('Accuracy of the network on the {} validation images: {} %'.format(5000, 100 * correct / total))
        if epoch > 8:
          mem_models += [(correct / total, copy.deepcopy(model), epoch+1)]

test_and_save_best(mem_models, 32)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Epoch [1/20], Step [704/704], Loss: 1.5231
Accuracy of the network on the 5000 validation images: 61.1 %
Epoch [2/20], Step [704/704], Loss: 1.6057
Accuracy of the network on the 5000 validation images: 67.9 %
Epoch [3/20], Step [704/704], Loss: 0.6121
Accuracy of the network on the 5000 validation images: 72.14 %
Epoch [4/20], Step [704/704], Loss: 0.5621
Accuracy of the network on the 5000 validation images: 76.22 %
Epoch [5/20], Step [704/704], Loss: 0.2092
Accuracy of the network on the 5000 validation images: 76.68 %
Epoch [6/20], Step [704/704], Loss: 1.0236
Accuracy of the network on the 5000 validation images: 75.66 %
Epoch [7/20], Step [704/704], Loss: 1.1628
Accuracy of the network on the 5000 validation images: 77.14 %
Epoch [8/20], Step [704/704], Loss: 0.2319
Accuracy of the network on the 5000 validation images: 79.24 %
Epoch [9/20], Step [704/704], Loss: 0.03

In [None]:
##### 2 NODES, 8 BIT QUANT, ERROR CORRECTION, STOCHASTIC ######  3h50min, 84.2%

num_classes = 10
num_epochs = 25
learning_rate = 0.005
nodes = 2

num_bits = 8
stochastic = True

mem_models = []

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
model = AlexNet(num_classes).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay = 0.005, momentum = 0.9)
total_step = len(train_loader1)

prev1 = [torch.zeros(p.shape).to(device) for p in model.parameters()]
prev2 = [torch.zeros(p.shape).to(device) for p in model.parameters()]

for epoch in range(num_epochs):
    for i, (samples1, samples2) in enumerate(zip(train_loader1, train_loader2)):

        # Separate data per node
        images1, labels1 = samples1[0].to(device), samples1[1].to(device)
        images2, labels2 = samples2[0].to(device), samples2[1].to(device)

        optimizer.zero_grad()

        # First forward and backward pass
        outs = model(images1)
        loss1 = criterion(outs, labels1)
        loss1.backward()
        grad1, prev1 = map_gradients(model, num_bits, stochastic, prev1)

        # Clear gradients from first pass (not necessary)
        model.zero_grad()

        # Second forward and backward pass
        outs = model(images2)
        loss2 = criterion(outs, labels2)
        loss2.backward()
        _, prev2 = map_gradients(model, num_bits, stochastic, prev2)

        # Average the quantized gradients
        for p, g in zip(model.parameters(), grad1):
          p.grad.data.copy_(torch.add(p.grad.data, g))
          p.grad.data.mul_(1.0 / nodes)

        optimizer.step()

    print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                   .format(epoch+1, num_epochs, i+1, total_step, loss2.item()))

    # Validation
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in valid_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            del images, labels, outputs

        print('Accuracy of the network on the {} validation images: {} %'.format(5000, 100 * correct / total))

        if epoch > 8:
          mem_models += [(correct / total, copy.deepcopy(model), epoch+1)]

test_and_save_best(mem_models, num_bits)

Epoch [1/25], Step [352/352], Loss: 1.2077
Accuracy of the network on the 5000 validation images: 57.08 %
Epoch [2/25], Step [352/352], Loss: 1.1147
Accuracy of the network on the 5000 validation images: 68.56 %
Epoch [3/25], Step [352/352], Loss: 0.8194
Accuracy of the network on the 5000 validation images: 70.88 %
Epoch [4/25], Step [352/352], Loss: 0.5518
Accuracy of the network on the 5000 validation images: 74.96 %
Epoch [5/25], Step [352/352], Loss: 0.4984
Accuracy of the network on the 5000 validation images: 77.14 %
Epoch [6/25], Step [352/352], Loss: 0.5769
Accuracy of the network on the 5000 validation images: 78.52 %
Epoch [7/25], Step [352/352], Loss: 0.4584
Accuracy of the network on the 5000 validation images: 79.36 %
Epoch [8/25], Step [352/352], Loss: 0.4030
Accuracy of the network on the 5000 validation images: 78.72 %
Epoch [9/25], Step [352/352], Loss: 0.4573
Accuracy of the network on the 5000 validation images: 80.84 %
Epoch [10/25], Step [352/352], Loss: 0.6513
Ac

In [None]:
##### 2 NODES, 4 BIT QUANT, ERROR CORRECTION, STOCHASTIC ######  4h9min, 85.19%

num_classes = 10
num_epochs = 25
learning_rate = 0.005
nodes = 2

num_bits = 4
stochastic = True

mem_models = []

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
model = AlexNet(num_classes).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay = 0.005, momentum = 0.9)
total_step = len(train_loader1)

prev1 = [torch.zeros(p.shape).to(device) for p in model.parameters()]
prev2 = [torch.zeros(p.shape).to(device) for p in model.parameters()]

for epoch in range(num_epochs):
    for i, (samples1, samples2) in enumerate(zip(train_loader1, train_loader2)):

        # Separate data per node
        images1, labels1 = samples1[0].to(device), samples1[1].to(device)
        images2, labels2 = samples2[0].to(device), samples2[1].to(device)

        optimizer.zero_grad()

        # First forward and backward pass
        outs = model(images1)
        loss1 = criterion(outs, labels1)
        loss1.backward()
        grad1, prev1 = map_gradients(model, num_bits, stochastic, prev1)

        # Clear gradients from first pass (not necessary)
        model.zero_grad()

        # Second forward and backward pass
        outs = model(images2)
        loss2 = criterion(outs, labels2)
        loss2.backward()
        _, prev2 = map_gradients(model, num_bits, stochastic, prev2)

        # Average the quantized gradients
        for p, g in zip(model.parameters(), grad1):
          p.grad.data.copy_(torch.add(p.grad.data, g))
          p.grad.data.mul_(1.0 / nodes)

        optimizer.step()

    print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                   .format(epoch+1, num_epochs, i+1, total_step, loss2.item()))

    # Validation
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in valid_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            del images, labels, outputs

        print('Accuracy of the network on the {} validation images: {} %'.format(5000, 100 * correct / total))

        if epoch > 8:
          mem_models += [(correct / total, copy.deepcopy(model), epoch+1)]

test_and_save_best(mem_models, num_bits)

Epoch [1/25], Step [352/352], Loss: 1.5242
Accuracy of the network on the 5000 validation images: 57.58 %
Epoch [2/25], Step [352/352], Loss: 0.7497
Accuracy of the network on the 5000 validation images: 69.14 %
Epoch [3/25], Step [352/352], Loss: 0.6480
Accuracy of the network on the 5000 validation images: 73.86 %
Epoch [4/25], Step [352/352], Loss: 0.6750
Accuracy of the network on the 5000 validation images: 75.86 %
Epoch [5/25], Step [352/352], Loss: 0.6826
Accuracy of the network on the 5000 validation images: 76.16 %
Epoch [6/25], Step [352/352], Loss: 0.5798
Accuracy of the network on the 5000 validation images: 77.02 %
Epoch [7/25], Step [352/352], Loss: 0.8281
Accuracy of the network on the 5000 validation images: 78.34 %
Epoch [8/25], Step [352/352], Loss: 0.2373
Accuracy of the network on the 5000 validation images: 78.66 %
Epoch [9/25], Step [352/352], Loss: 0.5098
Accuracy of the network on the 5000 validation images: 79.56 %
Epoch [10/25], Step [352/352], Loss: 0.5335
Ac

In [13]:
test_and_save_best(mem_models, num_bits) # Took 4th best

Epoch : 25, Validation Accuracy : 0.853
Epoch : 18, Validation Accuracy : 0.8426
Epoch : 19, Validation Accuracy : 0.8426
Epoch : 22, Validation Accuracy : 0.842
Accuracy of the network on the 10000 test images: 84.53 %
Accuracy of the network on the 10000 test images: 84.09 %
Accuracy of the network on the 10000 test images: 84.65 %
Accuracy of the network on the 10000 test images: 85.19 %
Best model saved with accuracy: 85.19 %, at epoch : 22


In [6]:
##### 2 NODES, 2 BIT QUANT, ERROR CORRECTION, STOCHASTIC ######  4h37min, 84.3%

num_classes = 10
num_epochs = 30
learning_rate = 0.005
nodes = 2

num_bits = 2
stochastic = True

mem_models = []

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
model = AlexNet(num_classes).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay = 0.005, momentum = 0.9)
total_step = len(train_loader1)

prev1 = [torch.zeros(p.shape).to(device) for p in model.parameters()]
prev2 = [torch.zeros(p.shape).to(device) for p in model.parameters()]

for epoch in range(num_epochs):
    for i, (samples1, samples2) in enumerate(zip(train_loader1, train_loader2)):

        # Separate data per node
        images1, labels1 = samples1[0].to(device), samples1[1].to(device)
        images2, labels2 = samples2[0].to(device), samples2[1].to(device)

        optimizer.zero_grad()

        # First forward and backward pass
        outs = model(images1)
        loss1 = criterion(outs, labels1)
        loss1.backward()
        grad1, prev1 = map_gradients(model, num_bits, stochastic, prev1)

        # Clear gradients from first pass (not necessary)
        model.zero_grad()

        # Second forward and backward pass
        outs = model(images2)
        loss2 = criterion(outs, labels2)
        loss2.backward()
        _, prev2 = map_gradients(model, num_bits, stochastic, prev2)

        # Average the quantized gradients
        for p, g in zip(model.parameters(), grad1):
          p.grad.data.copy_(torch.add(p.grad.data, g))
          p.grad.data.mul_(1.0 / nodes)

        optimizer.step()

    print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                   .format(epoch+1, num_epochs, i+1, total_step, loss2.item()))

    # Validation
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in valid_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            del images, labels, outputs

        print('Accuracy of the network on the {} validation images: {} %'.format(5000, 100 * correct / total))

        if epoch > 8:
          mem_models += [(correct / total, copy.deepcopy(model), epoch+1)]

test_and_save_best(mem_models, num_bits)

Epoch [1/30], Step [352/352], Loss: 0.8602
Accuracy of the network on the 5000 validation images: 57.12 %
Epoch [2/30], Step [352/352], Loss: 0.7568
Accuracy of the network on the 5000 validation images: 67.46 %
Epoch [3/30], Step [352/352], Loss: 0.9965
Accuracy of the network on the 5000 validation images: 72.76 %
Epoch [4/30], Step [352/352], Loss: 0.5374
Accuracy of the network on the 5000 validation images: 75.24 %
Epoch [5/30], Step [352/352], Loss: 0.6035
Accuracy of the network on the 5000 validation images: 74.48 %
Epoch [6/30], Step [352/352], Loss: 0.5272
Accuracy of the network on the 5000 validation images: 78.2 %
Epoch [7/30], Step [352/352], Loss: 0.6479
Accuracy of the network on the 5000 validation images: 78.32 %
Epoch [8/30], Step [352/352], Loss: 0.4776
Accuracy of the network on the 5000 validation images: 80.12 %
Epoch [9/30], Step [352/352], Loss: 0.5016
Accuracy of the network on the 5000 validation images: 79.42 %
Epoch [10/30], Step [352/352], Loss: 0.6351
Acc