In [0]:
import torch
import torchvision
import torch.nn as nn
import copy
import torch.optim as optim
import numpy as np
import torch.nn.functional as F
#from networks import *
from iCIFAR import *

%matplotlib inline
import matplotlib.pyplot as plt
import random
import time

In [0]:
######### Modifiable Settings ##########
batch_size = 128  # Mini-batch size
n = 5  # Depth setting from He et al. (n = 5 -> 32 layers); not referenced by the cells shown here
nb_cl = 10  # Classes per incremental group
nb_protos = 20  # Number of prototypes per class at the end: total protoset memory / total number of classes
epochs = 10  # Epochs trained for every group of classes
lr_old = 2.  # Initial learning rate
lr_strat = [49, 63]  # Epochs where the learning rate is decreased (never reached while epochs = 10)
lr_factor = 5.  # Learning rate decrease factor
wght_decay = 0.00001  # Weight decay
nb_runs = 10  # Number of runs (random ordering of classes at each run)

# Seed every RNG in play: NumPy alone does not cover the torch-based weight initialisation.
np.random.seed(42)
random.seed(42)
torch.manual_seed(42)

########################################

# Fall back to the CPU so the notebook still runs on machines without a GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'


def save_checkpoint(state, filename):
    """Serialize ``state`` to ``filename`` with ``torch.save``."""
    torch.save(obj=state, f=filename)

# Number of training samples per class. It sizes the per-class axis of the herding
# table and the uniform weights used for the plain nearest-class-mean estimate.
dictionary_size = 500

## Network

In [0]:
import math
from torch.nn import init

class DownsampleA(nn.Module):
    """Parameter-free shortcut for a stride-2 stage.

    The input is spatially subsampled (average pooling with a 1x1 window) and
    an all-zero copy is concatenated along the channel axis, which doubles the
    channel count. ``nIn`` and ``nOut`` are accepted but not used.
    """

    def __init__(self, nIn, nOut, stride):
        super().__init__()
        assert stride == 2
        self.avg = nn.AvgPool2d(kernel_size=1, stride=stride)

    def forward(self, x):
        pooled = self.avg(x)
        zero_channels = pooled.mul(0)
        return torch.cat((pooled, zero_channels), 1)

class ResNetBasicblock(nn.Module):
    """
    ResNet basic block (https://github.com/facebook/fb.resnet.torch/blob/master/models/resnet.lua)

    Two 3x3 conv + batch-norm layers with a shortcut connection. The final
    ReLU is applied only when ``relu`` is true.
    """
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, relu=True):
        super().__init__()

        # First conv carries the stride; the second keeps the resolution.
        self.conv_a = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn_a = nn.BatchNorm2d(planes)

        self.conv_b = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn_b = nn.BatchNorm2d(planes)

        # Optional module applied to the input before it is added back.
        self.downsample = downsample
        # Whether the block output goes through a final ReLU.
        self.relu = relu

    def forward(self, x):
        out = F.relu(self.bn_a(self.conv_a(x)), inplace=True)
        out = self.bn_b(self.conv_b(out))

        shortcut = x if self.downsample is None else self.downsample(x)
        out = shortcut + out

        return F.relu(out) if self.relu else out


class CifarResNet(nn.Module):
    """
    ResNet optimized for the Cifar Dataset, as specified in
    https://arxiv.org/abs/1512.03385.pdf

    ``forward`` returns only the pooled, flattened feature embedding; pass it
    to ``predict`` to obtain the class logits of the linear head. The fixed
    ``AvgPool2d(8)`` and the 64-input linear layer require an 8x8 feature map
    after stage 3 (e.g. 32x32 inputs, since two stages use stride 2).
    """

    def __init__(self, block=ResNetBasicblock, depth=32, num_classes=100, channels=3):
        """ Constructor
        Args:
          block: residual block class used to build the three stages
          depth: number of layers; must satisfy (depth - 2) % 6 == 0
          num_classes: number of outputs of the linear head
          channels: number of channels of the input images
        """
        super(CifarResNet, self).__init__()

        # Model type specifies number of layers for CIFAR-10 and CIFAR-100 model
        assert (depth - 2) % 6 == 0, 'depth should be one of 20, 32, 44, 56, 110'
        layer_blocks = (depth - 2) // 6

        self.num_classes = num_classes

        self.conv_1_3x3 = nn.Conv2d(channels, 16, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn_1 = nn.BatchNorm2d(16)

        # _make_layer reads and updates self.inplanes as the stages are chained.
        self.inplanes = 16
        self.stage_1 = self._make_layer(block, 16, layer_blocks, 1)
        self.stage_2 = self._make_layer(block, 32, layer_blocks, 2)
        self.stage_3 = self._make_layer(block, 64, layer_blocks, 2, last=True)
        self.avgpool = nn.AvgPool2d(8)
        self.linear = nn.Linear(64, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # Convolutions are bias-free, so only the weights are initialised.
                init.kaiming_normal_(m.weight, nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, math.sqrt(1. / 64.))
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1, last=False):
        """Build one stage of ``blocks`` residual blocks producing ``planes`` channels.

        The first block carries ``stride`` and, when the shape changes, a
        ``DownsampleA`` shortcut. With ``last=True`` the final block is built
        with ``relu=False`` so the stage output is not rectified.
        NOTE: with ``last=True`` and ``blocks == 1`` two blocks are still created.
        """
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = DownsampleA(self.inplanes, planes * block.expansion, stride)

        layers = [block(self.inplanes, planes, stride, downsample)]
        self.inplanes = planes * block.expansion
        if last:
            for _ in range(1, blocks - 1):
                layers.append(block(self.inplanes, planes))
            layers.append(block(self.inplanes, planes, relu=False))
        else:
            for _ in range(1, blocks):
                layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Return the flattened feature embedding of ``x`` (the linear head is NOT applied)."""
        x = self.conv_1_3x3(x)
        x = self.bn_1(x)
        x = F.relu(x)
        x = self.stage_1(x)
        x = self.stage_2(x)
        x = self.stage_3(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)

        return x

    def update_means(self, x, y):
        # NOTE(review): self.linear is a plain nn.Linear, which has no
        # update_means method, so this call raises AttributeError unless the
        # head is swapped for a layer that provides it. Not called in the cells shown.
        self.linear.update_means(x, y)

    def predict(self, x):
        """Apply the linear head to an embedding produced by ``forward``; returns raw logits."""
        out = self.linear(x)
        return out


# Training

In [0]:
def fit_incremental(lr):
    """Train the global ``network`` on the current group of classes.

    Runs ``epochs`` epochs of SGD over ``icifar.minibatches(augment=True)`` and,
    after every epoch, evaluates on ``icifar.minibatches(train=False)`` and
    prints the loss and accuracy. When the global ``iteration`` is > 0, the
    targets of the previously seen classes are replaced by the sigmoid outputs
    of the frozen copy ``network2`` (distillation).

    Reads the notebook globals ``network``, ``network2``, ``icifar``, ``loss``,
    ``iteration``, ``nb_cl``, ``epochs``, ``wght_decay``, ``lr_strat``,
    ``lr_factor`` and ``device``.

    Args:
        lr: initial learning rate; divided by ``lr_factor`` after each epoch
            listed in ``lr_strat``.
    """
    new_lr = lr
    optimizer = optim.SGD(filter(lambda p: p.requires_grad, network.parameters()), lr=new_lr, momentum=0.9,
                          weight_decay=wght_decay, nesterov=False)

    for epoch in range(epochs):
        network.train()

        # In each epoch, we do a full pass over the training data:
        for inputs, targets_prep in icifar.minibatches(augment=True):
            inputs = inputs.to(device)

            optimizer.zero_grad()
            features = network.forward(inputs)  # feature vector only
            prediction = network.predict(features)  # raw logits; the sigmoid is applied inside the loss
            targets, _ = _one_hot_targets(prediction, targets_prep)

            if iteration > 0:  # apply distillation
                old_classes = torch.as_tensor(np.array(icifar.order[range(0, iteration * nb_cl)]),
                                              dtype=torch.long, device=prediction.device)
                # The old network only provides fixed targets: run it without
                # autograd so no gradient flows into (or accumulates on) it.
                with torch.no_grad():
                    prediction_old = network2.predict(network2.forward(inputs))
                targets[:, old_classes] = torch.sigmoid(prediction_old[:, old_classes])

            loss_bx = loss(prediction, targets)  # joins classification and distillation losses
            loss_bx.backward()
            optimizer.step()

        # END loop minibatches
        network.eval()
        test_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():  # evaluation only: do not build autograd graphs
            for inputs, targets_prep in icifar.minibatches(train=False):
                inputs = inputs.to(device)

                logits = network.predict(network.forward(inputs))
                targets, labels = _one_hot_targets(logits, targets_prep)

                test_loss += loss(logits, targets).item()

                _, predicted = logits.max(1)
                correct += predicted.eq(labels).sum().item()
                total += targets.size(0)

        acc = 100. * correct / total
        # NOTE: test_loss is a sum of per-batch losses divided by the number of
        # samples; kept as-is so the numbers stay comparable with earlier logs.
        print(f"Epoch {epoch} : Loss {test_loss/total:.8f} - Accuracy {acc:.2f}")

        # adjust learning rate
        if (epoch + 1) in lr_strat:
            new_lr = new_lr / lr_factor
            print("New LR:" + str(new_lr))
            # Update the existing optimizer instead of rebuilding it, so the
            # momentum buffers survive the learning-rate change.
            for group in optimizer.param_groups:
                group['lr'] = new_lr


def _one_hot_targets(logits, labels):
    """Return (float one-hot matrix shaped like ``logits``, labels as a long tensor on the same device)."""
    labels = torch.as_tensor(labels).to(logits.device).long()
    one_hot = torch.zeros_like(logits)
    one_hot[torch.arange(labels.shape[0], device=logits.device), labels] = 1.
    return one_hot, labels


In [0]:
def update_exemplars():
    """Select exemplars for the current group by herding and rebuild the protoset.

    For each class of the current ``iteration`` the samples are embedded with
    the global ``network``, L2-normalised, and ranked by herding; the ranks
    (1 = picked first) are written into ``alpha_dr_herding[iteration]``. The
    protoset is then rebuilt for every group seen so far, keeping the samples
    whose rank is at most the current per-class budget.

    Reads the notebook globals ``network``, ``icifar``, ``alpha_dr_herding``,
    ``iteration``, ``nb_cl``, ``nb_protos`` and ``device``.

    Returns:
        (X_protoset_cumuls, Y_protoset_cumuls): two lists with one array per
        class seen so far, holding the selected samples and their labels.
    """
    # Per-class budget; the literal 100 is the total number of classes.
    nb_protos_cl = int(np.ceil(nb_protos * 100. / nb_cl / (iteration + 1)))
    # Candidates per class, taken from the herding table instead of a magic 500.
    nb_candidates = alpha_dr_herding.shape[1]
    wanted = min(nb_protos_cl, nb_candidates)

    # Herding
    print('Updating exemplar set...')
    network.eval()

    for iter_dico in range(nb_cl):
        class_id = icifar.order[iteration * nb_cl + iter_dico]
        # Possible exemplars in the feature space and projected on the L2 sphere
        pinput = torch.tensor(icifar.get_X_of_class(class_id)).to(device)
        with torch.no_grad():  # features only: no autograd graph needed
            mapped_prototypes = network.forward(pinput).cpu().numpy()
        D = mapped_prototypes.T
        D = D / np.linalg.norm(D, axis=0)

        # Herding procedure : ranking of the potential exemplars
        mu = np.mean(D, axis=1)
        # Reset the ranks of this class before re-ranking.
        alpha_dr_herding[iteration, :, iter_dico] = 0
        w_t = mu
        iter_herding = 0
        iter_herding_eff = 0
        # Pick the sample best aligned with w_t until enough distinct samples
        # are ranked; the 1000-step cap bounds the loop when picks keep repeating.
        while np.sum(alpha_dr_herding[iteration, :, iter_dico] != 0) != wanted and iter_herding_eff < 1000:
            ind_max = np.argmax(np.dot(w_t, D))
            iter_herding_eff += 1
            if alpha_dr_herding[iteration, ind_max, iter_dico] == 0:
                alpha_dr_herding[iteration, ind_max, iter_dico] = 1 + iter_herding
                iter_herding += 1
            w_t = w_t + mu - D[:, ind_max]

    # Prepare the protoset
    X_protoset_cumuls = []
    Y_protoset_cumuls = []

    # Storing the selected exemplars in the protoset
    for iteration2 in range(iteration + 1):
        for iter_dico in range(nb_cl):
            class_id = icifar.order[iteration2 * nb_cl + iter_dico]
            ranks = alpha_dr_herding[iteration2, :, iter_dico]
            # Keep the samples ranked within the current per-class budget.
            selected = np.where((ranks > 0) & (ranks < nb_protos_cl + 1))[0]

            X_protoset_cumuls.append(icifar.get_X_of_class(class_id)[selected])
            Y_protoset_cumuls.append(class_id * np.ones(len(selected), dtype=np.int32))

    return X_protoset_cumuls, Y_protoset_cumuls
  

In [0]:
def compute_means(iteration=10):
    """Compute the per-class mean embeddings used by the nearest-mean classifiers.

    For every class of the first ``iteration`` groups, the samples and their
    horizontally flipped versions are embedded with the global ``network`` and
    L2-normalised. Two unit-norm means are stored per class:
    index 0 averages only the current exemplars (iCaRL), index 1 averages all
    ``dictionary_size`` samples (plain NCM). The result is also saved to
    ``cl_means.npy``.

    Reads the notebook globals ``network``, ``icifar``, ``alpha_dr_herding``,
    ``nb_cl``, ``nb_protos``, ``dictionary_size`` and ``device``.

    Args:
        iteration: number of class groups to cover; must be >= 1 because it
            divides the per-class exemplar budget.

    Returns:
        Array of shape (64, 100, 2): feature dim x class id x {iCaRL, NCM}.
    """
    class_means = np.zeros((64, 100, 2))
    nb_protos_cl = int(np.ceil(nb_protos * 100. / nb_cl / (iteration)))  # num of exemplars per class

    for iteration2 in range(iteration):
        current_cl = icifar.order[range(iteration2 * nb_cl, (iteration2 + 1) * nb_cl)]
        for iter_dico in range(nb_cl):
            class_id = current_cl[iter_dico]
            images = icifar.get_X_of_class(class_id)

            # Collect data in the feature space for each class
            D = _unit_feature_columns(torch.tensor(images).to(device))

            # Flipped version also.
            # TODO: unclear why the flipped version is used as well; check the performance without it.
            # NOTE(review): pixel_means is subtracted here but not from the
            # un-flipped batch above - confirm what get_X_of_class already applies.
            inverted = np.array(images[:, :, :, ::-1])
            pinput2 = torch.tensor(np.array((inverted - icifar.pixel_means), dtype=np.float32)).to(device)
            D2 = _unit_feature_columns(pinput2)

            # iCaRL
            alph = alpha_dr_herding[iteration2, :, iter_dico]  # herding rank of each image of this class
            alph = (alph > 0) * (alph < nb_protos_cl + 1) * 1.  # 1 if in the current herd
            alph = alph / np.sum(alph)  # average over the current exemplars only
            # The dot product weights each embedding f(x_i) by alph.
            class_means[:, class_id, 0] = (np.dot(D, alph) + np.dot(D2, alph)) / 2
            class_means[:, class_id, 0] /= np.linalg.norm(class_means[:, class_id, 0])

            # Normal NCM
            alph = np.ones(dictionary_size) / dictionary_size  # average over all samples
            class_means[:, class_id, 1] = (np.dot(D, alph) + np.dot(D2, alph)) / 2
            class_means[:, class_id, 1] /= np.linalg.norm(class_means[:, class_id, 1])

    np.save('cl_means', class_means)
    return class_means


def _unit_feature_columns(batch):
    """Embed ``batch`` with the global ``network`` and return the features as unit-norm columns."""
    with torch.no_grad():  # features only: no autograd graph needed
        features = network.forward(batch).cpu().numpy()
    columns = features.T
    return columns / np.linalg.norm(columns, axis=0)


# Validation

In [0]:
from scipy.spatial.distance import cdist

def test(network, iteration, class_means=None):
  
    if class_means is None:
      class_means = compute_means(iteration)
    
    top1_acc_list = np.zeros(3)
    
    stat_hb1 = []
    stat_icarl = []
    stat_ncm = []

    # TODO converto to icifar.minibatches
    for inputs, targets_prep in icifar.minibatches_for_test(iteration):
        
        inputs = inputs.to(device)
        
        # compute prediction
        outputs = network.forward(inputs) #returns embeddings
        pred = network.predict(outputs).cpu().detach().numpy() #return classes of Hybrid1
        outputs = outputs.cpu().detach().numpy()

        outputs = (outputs.T / np.linalg.norm(outputs.T, axis=0)).T # normalize output

        # Compute score for iCaRL
        sqd = cdist(class_means[:, :, 0].T, outputs, 'sqeuclidean') # Squared euclidean distance
        score_icarl = (-sqd).T
        # Compute score for NCM
        sqd = cdist(class_means[:, :, 1].T, outputs, 'sqeuclidean') # Squared euclidean distance
        score_ncm = (-sqd).T

        # Compute the accuracy over the batch
        targets_prep = targets_prep.numpy()
        
        stat_hb1 += ([ll in best for ll, best in zip(targets_prep, np.argsort(pred, axis=1)[:, -1:])])
        stat_icarl += ([ll in best for ll, best in zip(targets_prep, np.argsort(score_icarl, axis=1)[:, -1:])])
        stat_ncm += ([ll in best for ll, best in zip(targets_prep, np.argsort(score_ncm, axis=1)[:, -1:])])

    top1_acc_list[0] = np.average(stat_icarl) * 100 # ICarl
    top1_acc_list[1] = np.average(stat_hb1) * 100   # Hybrid 1
    top1_acc_list[2] = np.average(stat_ncm) * 100   # NCM

    return top1_acc_list


# Main.py

In [22]:
# Get the data. ICIFAR comes from the star-import of the iCIFAR module.
# NOTE(review): the arguments are presumably (data directory, batch size,
# classes per group, class-order file) - confirm against iCIFAR.
icifar = ICIFAR('data',batch_size, nb_cl,'fixed_order.npy')

Files already downloaded and verified
Files already downloaded and verified


In [0]:
# define network
network = CifarResNet().to(device)
# reduction='mean' is the current spelling of the deprecated size_average=True.
loss = nn.BCEWithLogitsLoss(reduction='mean')

icifar.set_run(0)

lr = lr_old

# --- Initialization of the variables for this run
X_protoset_cumuls = []
Y_protoset_cumuls = []

# One list per classifier, in the order returned by test(): iCaRL, Hybrid 1, NCM.
acc_cumuls = [[], [], []]    # accuracy on all classes seen so far, per iteration
acc_original = [[], [], []]  # accuracy on the first group of classes, per iteration
# 10, 500, 10 -> 100 (class) * 500 (img per class)
# Herding rank of every training image, indexed [group, sample, class within group].
alpha_dr_herding = np.zeros((100 // nb_cl, dictionary_size, nb_cl), np.float32)

# fit_incremental / update_exemplars / compute_means read `iteration`,
# `network`, `network2` and `alpha_dr_herding` as globals, so this loop must
# stay at top level.
for iteration in range(100 // nb_cl):

  # Add the stored exemplars to the training data
  if iteration > 0:
      X_protoset = np.concatenate(X_protoset_cumuls)
      Y_protoset = np.concatenate(Y_protoset_cumuls)
  else:
      X_protoset = None
      Y_protoset = None

  # Prepare the training data for the current batch of classes
  icifar.next_iteration(X_protoset, Y_protoset)

  ## TRAIN THIS ITERATION ##
  print('Batch of classes number {0} arrives ...'.format(iteration + 1))

  fit_incremental(lr)  # train for N epochs (after each epoch validate)

  # Duplicate the current network; the copy provides the distillation targets next iteration
  network2 = copy.deepcopy(network)
  network2.eval()

  # Save the network
  save_checkpoint({
      'iteration': iteration,
      'state_dict': network.state_dict()
  }, "iter_" + str(iteration) + "_checkpoint.pth.tar")
  ## END OF TRAINING FOR THIS ITERATION ##

  ## UPDATE EXEMPLARS ##
  X_protoset_cumuls, Y_protoset_cumuls = update_exemplars()

  acc_cum = test(network, iteration+1)

  print("Cumulative results")
  print("  top 1 accuracy iCaRL          :\t\t{:.2f} %".format(acc_cum[0]))
  print("  top 1 accuracy Hybrid 1       :\t\t{:.2f} %".format(acc_cum[1]))
  print("  top 1 accuracy NCM            :\t\t{:.2f} %".format(acc_cum[2]))

  acc_base = test(network, 1)

  print("First batch results")
  print("  top 1 accuracy iCaRL          :\t\t{:.2f} %".format(acc_base[0]))
  print("  top 1 accuracy Hybrid 1       :\t\t{:.2f} %".format(acc_base[1]))
  print("  top 1 accuracy NCM            :\t\t{:.2f} %".format(acc_base[2]))

  # Keep the history so the accuracy curves can be inspected after the run.
  for k in range(3):
      acc_cumuls[k].append(acc_cum[k])
      acc_original[k].append(acc_base[k])


Batch of classes number 1 arrives ...
Epoch 0 : Loss 0.00022935 - Accuracy 31.58
Epoch 1 : Loss 0.00019274 - Accuracy 42.41
Epoch 2 : Loss 0.00024587 - Accuracy 36.83
Epoch 3 : Loss 0.00020382 - Accuracy 42.41
Epoch 4 : Loss 0.00018578 - Accuracy 47.99
Epoch 5 : Loss 0.00016403 - Accuracy 53.57
Epoch 6 : Loss 0.00015605 - Accuracy 56.25
Epoch 7 : Loss 0.00015694 - Accuracy 57.03
Epoch 8 : Loss 0.00013885 - Accuracy 64.73
Epoch 9 : Loss 0.00015215 - Accuracy 59.71
Updating exemplar set...
Computing mean-of_exemplars and theoretical mean...
Cumulative results
  top 1 accuracy iCaRL          :		6.36 %
  top 1 accuracy Hybrid 1       :		59.71 %
  top 1 accuracy NCM            :		6.25 %
First batch results
  top 1 accuracy iCaRL          :		6.36 %
  top 1 accuracy Hybrid 1       :		59.71 %
  top 1 accuracy NCM            :		6.25 %
Batch of classes number 2 arrives ...




Epoch 0 : Loss 0.00027126 - Accuracy 22.32
Epoch 1 : Loss 0.00023607 - Accuracy 38.39
Epoch 2 : Loss 0.00021935 - Accuracy 47.99
Epoch 3 : Loss 0.00022347 - Accuracy 47.66
Epoch 4 : Loss 0.00020678 - Accuracy 54.69
Epoch 5 : Loss 0.00020632 - Accuracy 54.58
Epoch 6 : Loss 0.00020562 - Accuracy 57.70
Epoch 7 : Loss 0.00020171 - Accuracy 59.26
Epoch 8 : Loss 0.00020149 - Accuracy 59.26
Epoch 9 : Loss 0.00019685 - Accuracy 63.28
Updating exemplar set...
Computing mean-of_exemplars and theoretical mean...
Cumulative results
  top 1 accuracy iCaRL          :		1.61 %
  top 1 accuracy Hybrid 1       :		50.99 %
  top 1 accuracy NCM            :		1.51 %
First batch results
  top 1 accuracy iCaRL          :		5.25 %
  top 1 accuracy Hybrid 1       :		38.06 %
  top 1 accuracy NCM            :		5.47 %
Batch of classes number 3 arrives ...
Epoch 0 : Loss 0.00033172 - Accuracy 14.17
Epoch 1 : Loss 0.00034507 - Accuracy 18.42
Epoch 2 : Loss 0.00033184 - Accuracy 23.44
Epoch 3 : Loss 0.00031998 - Accur