# Training and testing of GradNet

### Imports

In [1]:
from __future__ import print_function
import numpy as np
try:
    from chainer.cuda import cupy as cp
except:
    pass
import chainer
import chainer.links as L
from chainer import training
from chainer.training import extensions
from chainer.datasets import TupleDataset
from copy import deepcopy
from chainer.datasets import get_cifar10
from linalg import maxpercentile
import time
import net
import augmentation
import os

  from ._conv import register_converters as _register_converters


### Utility functions

In [2]:
def conv_distortion(databatch, device, test=False):
    """Convert a batch of (image, label) tuples into augmented arrays.

    Args:
        databatch: Sequence of tuples; element 0 of each tuple is the input
            image and element 1 is its label.
        device: Device id. A non-negative value selects the GPU with that id
            and the data is moved there; a negative value keeps the data on
            the CPU.
        test: Forwarded unchanged to ``augmentation.distortion_batch``.

    Returns:
        Tuple ``(images, labels)``: ``images`` is whatever
        ``augmentation.distortion_batch`` returns for the batch inputs and
        ``labels`` is an array holding the batch labels.
    """
    on_gpu = device >= 0
    xp = cp if on_gpu else np
    inputdata = [datatuple[0] for datatuple in databatch]
    labels = xp.array([datatuple[1] for datatuple in databatch])
    if on_gpu:
        inputdata = chainer.cuda.to_gpu(inputdata, device)
        labels = chainer.cuda.to_gpu(labels, device)
    distorted_input = augmentation.distortion_batch(inputdata, device, test=test)
    return distorted_input, labels

def grads(optimizer, data, label, mins=None, ranges=None, xp=np, percentage=15, exponent=0.5):
    """Build the normalized gradient array for one (data, label) pair.

    The function works on a deep copy of ``optimizer``, so the update step
    performed here is applied to the copy and not to the caller's optimizer.

    Args:
        optimizer: Optimizer whose ``target`` is the classifier; the network
            itself is ``target.predictor``.
        data: Input array passed to the optimizer's update step.
        label: Label array passed to the optimizer's update step.
        mins: Array subtracted from the gradients before scaling. Despite
            the ``None`` default it is required, because the scaling step
            does arithmetic with it.
        ranges: Array the shifted gradients are divided by; entries equal
            to zero are replaced by 1 to avoid division by zero. Required,
            like ``mins``.
        xp: Array module to use (``numpy`` or ``cupy``).
        percentage: Cut-off passed to ``maxpercentile`` for each link's
            flattened weight gradient (per the original description, it
            determines where small gradient values are thrown away).
        exponent: Exponent of the signed power normalization.

    Returns:
        1-D array: the per-link results of ``maxpercentile`` concatenated,
        scaled with ``mins``/``ranges`` and power-normalized. Entries whose
        filtered gradient is exactly zero stay zero.
    """
    opt = deepcopy(optimizer)
    model = opt.target
    predictor = model.predictor
    model.cleargrads()
    # One update step on the copy populates ``link.W.grad`` for every link.
    opt.update(model, data, label)
    gradients = xp.concatenate([
        maxpercentile(xp.ravel(link.W.grad), percentage, xp)
        for link in predictor.links(skipself=True)
    ])
    gradients = gradients.astype(xp.float32)
    # scale norm + power norm:
    normalized = xp.divide(gradients - mins, xp.where(ranges != 0, ranges, 1))
    # Keep exact zeros at zero instead of mapping them to -mins/ranges.
    n = xp.where(gradients != 0, normalized, 0)
    return xp.sign(n) * xp.power(xp.abs(n), exponent)

def grad_gen_new(updater, iterator, mins=None, ranges=None, test=False, xp=np, realbatchsize=25, percentage=15, exponent=0.5, n_classes=10):
    """Yield normalized gradient arrays computed with random labels.

    For every batch produced by ``iterator`` each example is paired with a
    label drawn uniformly from ``range(n_classes)``, and ``grads`` is
    evaluated for that (example, random label) pair.

    Args:
        updater: Updater providing the ``converter``, the ``device`` and the
            ``'main'`` optimizer (which holds the neural net).
        iterator: Dataset iterator yielding data batches.
        mins: Forwarded to ``grads``.
        ranges: Forwarded to ``grads``.
        test: Forwarded to ``updater.converter`` as the ``test`` keyword.
        xp: Array module to use (``numpy`` or ``cupy``).
        realbatchsize: Unused; kept so existing callers keep working.
        percentage: Forwarded to ``grads``.
        exponent: Forwarded to ``grads``.
        n_classes: Number of distinct labels to draw the random labels from.

    Yields:
        Tuple ``(gradients, truelabels)``: ``gradients`` is a float32 array
        with one row per example of the batch, ``truelabels`` are the labels
        returned by the converter.
    """
    optimizer = updater.get_optimizer('main')
    for batch in iterator:
        batchsize = len(batch)
        in_arr, truelabels = updater.converter(batch, updater.device, test=test)
        random_labels = xp.random.randint(n_classes, size=(batchsize,))
        gradients = xp.stack([
            grads(
                optimizer,
                xp.expand_dims(in_arr[j], 0),
                xp.expand_dims(random_labels[j], 0),
                mins, ranges, xp, percentage, exponent,
            )
            for j in range(batchsize)
        ])
        yield gradients.astype(xp.float32), truelabels


def grad_gen_all(updater, iterator, mins=None, ranges=None, test=False, xp=np, percentage=15, exponent=0.5, n_classes=10):
    """Yield normalized gradient arrays for every candidate label.

    For every batch produced by ``iterator`` each example is paired with
    each label in ``range(n_classes)``, and ``grads`` is evaluated for every
    such pair.

    Args:
        updater: Updater providing the ``converter``, the ``device`` and the
            ``'main'`` optimizer (which holds the neural net).
        iterator: Dataset iterator yielding data batches.
        mins: Forwarded to ``grads``.
        ranges: Forwarded to ``grads``.
        test: Forwarded to ``updater.converter`` as the ``test`` keyword.
        xp: Array module to use (``numpy`` or ``cupy``).
        percentage: Forwarded to ``grads``.
        exponent: Forwarded to ``grads``.
        n_classes: Number of candidate labels tried for each example.

    Yields:
        Tuple ``(gradients, truelabels)``: ``gradients`` is a float32 array
        with ``len(batch) * n_classes`` rows, ordered example-major (all
        labels of example 0 first, then example 1, ...); ``truelabels`` are
        the labels returned by the converter.
    """
    optimizer = updater.get_optimizer('main')
    for batch in iterator:
        in_arr, truelabels = updater.converter(batch, updater.device, test=test)
        gradients = xp.stack([
            grads(
                optimizer,
                xp.expand_dims(in_arr[j], 0),
                xp.array([k]),
                mins, ranges, xp, percentage, exponent,
            )
            for j in range(len(batch))
            for k in range(n_classes)
        ])
        yield gradients.astype(xp.float32), truelabels


### Loading the Cifar-10 dataset. 

We split the training data into two equally sized sets. We use the first one to train the original CNN network and the second one to train the GradNet.

In [3]:
# Load CIFAR-10 and cut the training set into two halves: the first half is
# used for the original CNN, the second half for the GradNet.
train, test = get_cifar10()
n_half = len(train) // 2
train_CNN = train[:n_half]
train_gradnet = train[n_half:]

# Testing our pre-trained GradNet using Cifar-10 or other datasets

In the following section, we measure the performance of our pre-trained GradNet on the Cifar-10 dataset. 

To use your own image dataset, convert your images into a numpy or cupy array 'images' of shape (number_of_images, 3, 32, 32) and your labels into a numpy or cupy array 'labels' of shape (number_of_images, ). Then, create a TupleDataset:

test = TupleDataset(images, labels)

In [None]:
gpu = 0  # set this to -1 if GPU is not available
model_CNN = L.Classifier(net.cnn_cifar())
if gpu >= 0:
    chainer.cuda.get_device_from_id(gpu).use()
    model_CNN.to_gpu()  # Copy the model to the GPU
    xp = cp
else:
    xp = np

# Recreate the CNN's optimizer, updater and trainer. They serve as the
# container the trainer snapshot is loaded into, and the gradient generator
# reads the optimizer and converter off the updater. trainer_CNN.run() is
# not called in this cell.
optimizer_CNN = chainer.optimizers.SGD(0.05)
optimizer_CNN.setup(model_CNN)
optimizer_CNN.add_hook(chainer.optimizer.WeightDecay(5e-4))

train_iter_CNN = chainer.iterators.SerialIterator(train_CNN, 64)
test_iter_CNN = chainer.iterators.SerialIterator(test, 64,
                                                 repeat=False, shuffle=False)
# Set up a trainer
updater_CNN = training.StandardUpdater(train_iter_CNN, optimizer_CNN, converter=conv_distortion, device=gpu)
trainer_CNN = training.Trainer(updater_CNN, (100, 'epoch'), out="result_CNN")

chainer.serializers.load_npz('./result_CNN/snapshot_iter_91016', trainer_CNN, strict=False)

# Arrays used by grads() to rescale the gradients (presumably the
# element-wise minimum / maximum gradient values of the CNN - confirm
# against the script that wrote these files).
mins = xp.load("CNNgradmin_91016.npy")
maxes = xp.load("CNNgradmax_91016.npy")
ranges = maxes - mins

# Cumulative weight counts of the CNN links (last one dropped), passed to
# the GradNet as input dividers.
linksizes = [link.W.size for link in list(model_CNN.predictor.links(skipself=True))]
dividers = xp.cumsum(xp.array(linksizes))[:-1].tolist()

model = net.gradnet(input_dividers=dividers, middle_sizes=[5, 100, 25])

optimizer = chainer.optimizers.SGD()
optimizer.setup(model)

if gpu >= 0:
    chainer.cuda.get_device_from_id(gpu).use()
    model.to_gpu(gpu)

chainer.serializers.load_npz('./result_gradnet/snapshot_iter_200000', model, strict=False)

# Batch size 1: every item yielded by the generator belongs to one test image
# and holds one gradient row per candidate label.
test_iter_new = chainer.iterators.SerialIterator(test, 1, repeat=False, shuffle=False)
gg_test = grad_gen_all(updater_CNN, test_iter_new, mins=mins, ranges=ranges, test=True, xp=xp)

correct = 0
for (i, databatch) in enumerate(gg_test):
    gradients, truelabel = databatch
    # Sum the GradNet outputs over all candidate-label rows, then pick the
    # class with the largest total.
    guessed_label = xp.argmax(xp.sum(model(gradients).data, axis=0))
    if guessed_label == truelabel:
        correct += 1
    if i % 500 == 0:
        # Report against the actual dataset size so custom datasets are
        # reported correctly as well.
        print("{}/{}".format(i, len(test)), "images done")
print(correct / len(test))

0/10000 images done
500/10000 images done
1000/10000 images done
1500/10000 images done
2000/10000 images done
2500/10000 images done
3000/10000 images done
3500/10000 images done
4000/10000 images done
4500/10000 images done
5000/10000 images done
5500/10000 images done
6000/10000 images done
6500/10000 images done
7000/10000 images done


# Training the original CNN network.

This can take a while. A pre-trained model can be found in the repository. To skip this training, jump to the "Training the GradNet" section.

In [15]:
# Settings for training the original CNN.
batchsize_CNN = 64    # mini-batch size of the train and test iterators
learnrate_CNN = 0.05  # initial SGD learning rate
epoch_CNN = 100       # number of training epochs
gpu = 0               # GPU id

In [16]:
# Build the original CNN classifier and train it on the first half of the
# training data.
model_CNN = L.Classifier(net.cnn_cifar())
if gpu >= 0:
    # Select the requested GPU and move the model onto it.
    chainer.cuda.get_device_from_id(gpu).use()
    model_CNN.to_gpu()

# SGD with weight decay.
optimizer_CNN = chainer.optimizers.SGD(learnrate_CNN)
optimizer_CNN.setup(model_CNN)
optimizer_CNN.add_hook(chainer.optimizer.WeightDecay(5e-4))

train_iter_CNN = chainer.iterators.SerialIterator(train_CNN, batchsize_CNN)
test_iter_CNN = chainer.iterators.SerialIterator(
    test, batchsize_CNN, repeat=False, shuffle=False)

# Updater and trainer; conv_distortion converts and augments each batch.
updater_CNN = training.StandardUpdater(
    train_iter_CNN, optimizer_CNN, converter=conv_distortion, device=gpu)
trainer_CNN = training.Trainer(
    updater_CNN, (epoch_CNN, 'epoch'), out="result_CNN")

# Extensions are registered in this order, each with its own trigger:
#   1. evaluation on the test set after every epoch,
#   2. halving of the learning rate every 25 epochs,
#   3. a trainer snapshot after every epoch,
#   4. a log entry after every epoch,
#   5. printing of selected log entries after every epoch ("main" is the
#      target link of the "main" optimizer, "validation" is the default
#      name of the Evaluator extension).
cnn_extensions = [
    (extensions.Evaluator(test_iter_CNN, model_CNN,
                          converter=conv_distortion, device=gpu),
     (1, "epoch")),
    (extensions.ExponentialShift('lr', 0.5), (25, 'epoch')),
    (extensions.snapshot(), (1, 'epoch')),
    (extensions.LogReport(), (1, 'epoch')),
    (extensions.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss',
         'main/accuracy', 'validation/main/accuracy', 'elapsed_time']),
     (1, 'epoch')),
]
for cnn_extension, cnn_trigger in cnn_extensions:
    trainer_CNN.extend(cnn_extension, trigger=cnn_trigger)

trainer_CNN.run()

epoch       main/loss   validation/main/loss  main/accuracy  validation/main/accuracy  elapsed_time
1           1.48478     1.63433               0.421875       0.403165                  232.612
2           1.27392     1.48166               0.609375       0.46865                   462.268
3           1.25437     1.38152               0.546875       0.503583                  679.968
4           1.42943     1.29438               0.515625       0.542596                  903.339
5           1.19399     1.26791               0.59375        0.554439                  1124.06
6           1.24174     1.24069               0.59375        0.562898                  1357.5
7           1.14403     1.22494               0.53125        0.583002                  1574.54
8           1.00215     1.1674                0.671875       0.592357                  1812.38
9           1.02669     1.18634               0.625

KeyboardInterrupt: 

# Training the GradNet

This can take a while. A pre-trained model can be found in the repository.

In [4]:
# Settings for training the GradNet.
batchsize = 50         # batch size of the (old) CNN test iterator
learnrate = 0.05       # learning rate of the recreated CNN optimizer
out = "result_CNN"     # output directory given to the recreated CNN trainer
epoch = 200            # number of passes over the GradNet training data
gpu = 0                # GPU id
original = 91016       # iteration number of the CNN snapshot to load
percentage = 15        # cut-off forwarded to the gradient generator
exponent = 0.5         # exponent of the power normalization
sizes = [5, 100, 25]   # middle layer sizes of the GradNet

In [None]:
# Recreate the original CNN with its optimizer, updater and trainer so that
# the trainer snapshot can be loaded into them. The CNN itself is not trained
# in this cell; it is only used to produce gradients.
model_old = L.Classifier(net.cnn_cifar())

optimizer_old = chainer.optimizers.SGD(learnrate)
optimizer_old.setup(model_old)
optimizer_old.add_hook(chainer.optimizer.WeightDecay(5e-4))

train_iter_old = chainer.iterators.SerialIterator(train_CNN, 1, repeat=False, shuffle=False)
test_iter_old = chainer.iterators.SerialIterator(test, batchsize,
                                                 repeat=False, shuffle=False)
# Set up a trainer
updater_old = training.StandardUpdater(train_iter_old, optimizer_old, converter=conv_distortion, device=gpu)
trainer_old = training.Trainer(updater_old, (1000, 'epoch'), out=out)

chainer.serializers.load_npz('./result_CNN/snapshot_iter_{}'.format(original), trainer_old, strict=False)

# Directory for the GradNet snapshots; no error if it already exists.
os.makedirs('./result_gradnet/', exist_ok=True)

if gpu >= 0:
    # Make a specified GPU current
    chainer.cuda.get_device_from_id(gpu).use()
    model_old.to_gpu(gpu)  # Copy the model to the GPU
    xp = cp
else:
    xp = np

# Arrays used by grads() to rescale the gradients.
mins = xp.load("CNNgradmin_{}.npy".format(original))
maxes = xp.load("CNNgradmax_{}.npy".format(original))
ranges = maxes - mins

# Cumulative weight counts of the CNN links (last one dropped), passed to
# the GradNet as input dividers.
linksizes = [link.W.size for link in list(model_old.predictor.links(skipself=True))]
dividers = xp.cumsum(xp.array(linksizes))[:-1].tolist()

model = net.gradnet(input_dividers=dividers, middle_sizes=sizes)

optimizer = chainer.optimizers.SGD()
optimizer.setup(model)

if gpu >= 0:
    chainer.cuda.get_device_from_id(gpu).use()
    model.to_gpu(gpu)

# The GradNet is trained on the second half of the training data, with
# gradients computed for randomly drawn labels.
train_iter_new = chainer.iterators.SerialIterator(train_gradnet, 25, repeat=True, shuffle=True)
gg = grad_gen_new(updater_old, train_iter_new, mins, ranges, test=False, xp=xp, realbatchsize=25, percentage=percentage, exponent=exponent)

# Test generator: batch size 1, one gradient row per candidate label.
test_iter_new = chainer.iterators.SerialIterator(test, 1, repeat=False, shuffle=False)
gg_test = grad_gen_all(updater_old, test_iter_new, mins=mins, ranges=ranges, test=True, xp=xp)

t0 = time.time()
while train_iter_new.epoch < epoch:
    grads_train, target_train = next(gg)
    grads_train = chainer.Variable(grads_train)
    target_train = chainer.Variable(target_train)
    if gpu >= 0:
        grads_train.to_gpu(gpu)
        target_train.to_gpu(gpu)

    # Calculate the prediction of the network
    prediction_train = model(grads_train)
    # Calculate the loss with softmax_cross_entropy
    loss = chainer.functions.softmax_cross_entropy(prediction_train, target_train)
    # Calculate the gradients in the network
    model.cleargrads()
    loss.backward()
    # Update all the trainable parameters
    optimizer.update()
    if optimizer.t % 50 == 0:
        print(optimizer.t)

    if optimizer.t % 1000 == 0:
        # Every 1000 iterations: save a snapshot and evaluate on the whole
        # test set.
        chainer.serializers.save_npz('result_gradnet/snapshot_iter_{}'.format(optimizer.t), model)
        t1 = time.time()
        print("elapsed time:", t1 - t0)
        correct = 0
        for (i, databatch) in enumerate(gg_test):
            gradients, truelabel = databatch
            # Sum the GradNet outputs over all candidate-label rows, then
            # pick the class with the largest total.
            guessed_label = xp.argmax(xp.sum(model(gradients).data, axis=0))
            if guessed_label == truelabel:
                correct += 1
        print("accuracy =", correct / len(test))
        # The test generator is exhausted now; rewind the iterator and build
        # a fresh generator for the next evaluation.
        test_iter_new.reset()
        gg_test = grad_gen_all(updater_old, test_iter_new, mins=mins, ranges=ranges, test=True, xp=xp)
      

50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
elapsed time: 549.2684609889984
accuracy = 0.7716
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
elapsed time: 2741.32288312912
accuracy = 0.7822
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
elapsed time: 4951.305458784103
accuracy = 0.7847
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
elapsed time: 7073.333925962448
accuracy = 0.7855
4050
4100
4150
4200
4250
4300
4350
4400
4450
4500
4550
4600
4650
4700
4750
4800
4850
4900
4950
5000
elapsed time: 9261.793446063995
accuracy = 0.7881
5050
5100
5150
5200
5250
5300
5350
5400
5450
5500
5550
5600
5650
5700
5750
5800
5850
5900
5950
6000
elapsed time: 11539.685985088348
