In [1]:
import torch.nn as nn
import torch
import numpy as np
import matplotlib.pyplot as plt 
import random

In [2]:
from torchsummary import summary

In [44]:
random.seed('Mudit')

## EXPERIMENTS -- Morphism Type I

<img src="images/morph1.png" width=700>

In [3]:


class Net(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        """
        In the constructor we construct three nn.Linear instances that we will use
        in the forward pass.
        """
        super(Net, self).__init__()
        self.inp = D_in
        self.out = D_out
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)
        self.layers = [self.input_linear, self.middle_linear,self.output_linear] 
        
        
    
    def add_layer(self,pos) :
        #insert after pos
        #patchwork
        #pass from x to pos-1 to get output shape
        tempx = torch.randn(1, self.inp)
        resx = self.forward(tempx,pause=pos)
        print resx.shape
        #create pos shaped layer
        templayer = torch.nn.Linear(resx.shape[-1],resx.shape[-1])
        self.layers.insert(pos,templayer)
        
        #call for update params

   
    def forward(self, x,pause=-1):
        print 'Layers :',len(self.layers)
        
        cnt = 0
        out = self.layers[0](x)
        cnt += 1
        if(pause == 1):
            return out
        
        
        counter = 1
        for layer in self.layers[1:] :
            counter+=1
            cnt += 1
            out = layer(out)
            if pause != -1 :
                if(counter==pause) :
                    print 'Count of Layers = ',cnt
                    return out
            print 'Count of Layers = ',cnt
        return out



N, D_in, H, D_out = 64, 1000, 100, 10


x = torch.randn(N, D_in)
y = torch.randn(N, D_out)


1. Attempt is to change optimizer dynamically

2. Criterion failed

In [4]:
model = Net(D_in, H, D_out)
summary(model,(1,D_in))

Layers : 3
Count of Layers =  2
Count of Layers =  3
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1               [-1, 1, 100]         100,100
            Linear-2               [-1, 1, 100]          10,100
            Linear-3                [-1, 1, 10]           1,010
Total params: 111,210
Trainable params: 111,210
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.42
Estimated Total Size (MB): 0.43
----------------------------------------------------------------


In [5]:
model.add_layer(2)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9) 
summary(model,(1,D_in))

Layers : 3
Count of Layers =  2
torch.Size([1, 100])
Layers : 4
Count of Layers =  2
Count of Layers =  3
Count of Layers =  4
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1               [-1, 1, 100]         100,100
            Linear-2               [-1, 1, 100]          10,100
            Linear-3                [-1, 1, 10]           1,010
Total params: 111,210
Trainable params: 111,210
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.42
Estimated Total Size (MB): 0.43
----------------------------------------------------------------


In [6]:
model.layers

[Linear(in_features=1000, out_features=100, bias=True),
 Linear(in_features=100, out_features=100, bias=True),
 Linear(in_features=100, out_features=100, bias=True),
 Linear(in_features=100, out_features=10, bias=True)]

In [7]:
model = Net(D_in, H, D_out)


criterion = torch.nn.MSELoss(reduction='sum')

optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)

num = 10
for t in range(num):
    y_pred = model(x)
    loss = criterion(y_pred, y)
    print(t, loss.item())
    
    if(t==num/2) :
        optimizer = torch.optim.Adam(model.parameters(),lr=1e-4)
        # with NLL gives certain Long/Float err
        print 'New optim at work'
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Layers : 3
Count of Layers =  2
Count of Layers =  3
(0, 675.8677368164062)
Layers : 3
Count of Layers =  2
Count of Layers =  3
(1, 638.0596923828125)
Layers : 3
Count of Layers =  2
Count of Layers =  3
(2, 572.4498291015625)
Layers : 3
Count of Layers =  2
Count of Layers =  3
(3, 490.211669921875)
Layers : 3
Count of Layers =  2
Count of Layers =  3
(4, 400.2493591308594)
Layers : 3
Count of Layers =  2
Count of Layers =  3
(5, 308.8610534667969)
New optim at work
Layers : 3
Count of Layers =  2
Count of Layers =  3
(6, 298.4310302734375)
Layers : 3
Count of Layers =  2
Count of Layers =  3
(7, 288.119384765625)
Layers : 3
Count of Layers =  2
Count of Layers =  3
(8, 277.9320373535156)
Layers : 3
Count of Layers =  2
Count of Layers =  3
(9, 267.874755859375)


In [8]:
summary(model,(1,D_in))

Layers : 3
Count of Layers =  2
Count of Layers =  3
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1               [-1, 1, 100]         100,100
            Linear-2               [-1, 1, 100]          10,100
            Linear-3                [-1, 1, 10]           1,010
Total params: 111,210
Trainable params: 111,210
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.42
Estimated Total Size (MB): 0.43
----------------------------------------------------------------


In [9]:
for e in model.parameters() :
    print e.shape

torch.Size([100, 1000])
torch.Size([100])
torch.Size([100, 100])
torch.Size([100])
torch.Size([10, 100])
torch.Size([10])


In [10]:
model.layers

[Linear(in_features=1000, out_features=100, bias=True),
 Linear(in_features=100, out_features=100, bias=True),
 Linear(in_features=100, out_features=10, bias=True)]

In [11]:
from collections import OrderedDict

In [12]:
pos = 2
sz = model.forward([1,D_in]).shape[-1]
m = nn.Sequential(for x in model.layers[:pos],nn.Linear(sz,sz),for x in model.layers[pos:])

SyntaxError: invalid syntax (<ipython-input-12-68ef22944adf>, line 3)

In [13]:
nn.Sequential?

In [14]:
class nnet(nn.Module) :
    def __init__(self,inp,out) :
        super(nnet, self).__init__()
        self.fc1 = nn.Linear(inp,64)
        self.fc2 = nn.Linear(64,128)
        self.fc3 = nn.Linear(128,32)
    
    def forward(self,x):
        return self.fc3(self.fc2(self.fc1(x)))

In [15]:
model = nnet(16,32)

In [16]:
x = torch.randn(1,16)

In [17]:
t = model(x)
t.shape

torch.Size([1, 32])

In [18]:
model.fc1.out_features

64

In [19]:
model.fc2 = nn.Sequential(nn.Linear(model.fc1.out_features,model.fc1.out_features),model.fc2)

In [20]:
model

nnet(
  (fc1): Linear(in_features=16, out_features=64, bias=True)
  (fc2): Sequential(
    (0): Linear(in_features=64, out_features=64, bias=True)
    (1): Linear(in_features=64, out_features=128, bias=True)
  )
  (fc3): Linear(in_features=128, out_features=32, bias=True)
)

In [21]:
model.fc2[1] = nn.Sequential(model.fc2[1],nn.Linear(model.fc2[1].out_features,model.fc2[1].out_features))

In [22]:
model

nnet(
  (fc1): Linear(in_features=16, out_features=64, bias=True)
  (fc2): Sequential(
    (0): Linear(in_features=64, out_features=64, bias=True)
    (1): Sequential(
      (0): Linear(in_features=64, out_features=128, bias=True)
      (1): Linear(in_features=128, out_features=128, bias=True)
    )
  )
  (fc3): Linear(in_features=128, out_features=32, bias=True)
)

In [23]:
summary(model,(1,16))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                [-1, 1, 64]           1,088
            Linear-2                [-1, 1, 64]           4,160
            Linear-3               [-1, 1, 128]           8,320
            Linear-4               [-1, 1, 128]          16,512
            Linear-5                [-1, 1, 32]           4,128
Total params: 34,208
Trainable params: 34,208
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.13
Estimated Total Size (MB): 0.13
----------------------------------------------------------------


In [24]:
model.fc2[0].out_features

64

Mechanism required to map all the layers, some sort of dict to call each layer may be stored there itself. We can add a final layer to pytorch module this way.

## Experiment 2 -- Network Morph II
<img src="images/morph2.png" width=700>

In [None]:
# https://discuss.pytorch.org/t/possible-to-add-initialize-new-nodes-to-hidden-layer-partway-through-training/3809/2
# https://github.com/mortezamg63/Accessing-and-modifying-different-layers-of-a-pretrained-model-in-pytorch

In [27]:
class Model(nn.Module):
    def __init__(self, layer_size, hidden, input_size, output_size):
        super(Model, self).__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.layer_size = layer_size
        self.relu = nn.ReLU()
        
        self.device = 'cpu'

        # initialize weights
        self.fcs = nn.ModuleList([nn.Linear(self.input_size, self.layer_size)])
        self.fcs.append(nn.Linear(self.layer_size, self.output_size))

    def forward(self, x):
        print 'Forward...'
        print 'X:',x.shape
        print 'weight shapes...'
        print 'fcs-0:',self.fcs[0].weight.shape
        print 'fcs-1:',self.fcs[1].weight.shape
        print 'goes in ', self.fcs[0], 'and gives out'
        out = self.fcs[0](x)
        print 'out-0:',out.shape, 'and will go in'
#         print self.fcs[1]
        out = self.fcs[1](out)
        print 'out-2:', out.shape
#         return self.fcs[1](self.fcs[0](x))
    
    def add_units(self, addneurons):
        # take a copy of the current weights stored in self.fcs
        current = [ix.weight.data for ix in self.fcs]
        print 'Current:',len(current), ' 1:',current[0].shape,' 2:',current[1].shape

        # make the new weights in and out of hidden layer you are adding neurons to
        tmp_i = torch.zeros([addneurons, current[0].shape[1]])
        print 'tmp_i:',tmp_i.shape
        nn.init.xavier_uniform_(tmp_i, gain=nn.init.calculate_gain('relu'))
        tmp_o = torch.zeros([current[1].shape[0], addneurons])
        print 'tmp_o:',tmp_o.shape
        nn.init.xavier_uniform_(tmp_i, gain=nn.init.calculate_gain('relu'))

        # concatenate the old weights with the new weights
        new_wi = torch.cat([current[0], tmp_i], dim=0)
        print 'new_wi:',new_wi.shape
        new_wo = torch.cat([current[1], tmp_o], dim=1)
        print 'new_wo:',new_wo.shape

        # reset weight and grad variables to new size

        #TODO------ Pl have a look!
        
        # should chnage self.layer_size to previous layer's neuron number 
        self.fcs[0] = nn.Linear(current[0].shape[1], self.layer_size+addneurons)
        print 'fcs-0:',self.fcs[0]
        self.fcs[1] = nn.Linear(addneurons+self.layer_size, current[1].shape[0])
        print 'fcs-1:',self.fcs[1]
        
        
#         self.fcs[0] = nn.Linear(current[0].shape[1], self.layer_size)
#         print 'fcs-0:',self.fcs[0]
#         self.fcs[1] = nn.Linear(self.layer_size, current[1].shape[0])
#         print 'fcs-1:',self.fcs[1]

        # set the weight data to new values
        self.fcs[0].weight.data = torch.tensor(new_wi, requires_grad=True, device=self.device)
        self.fcs[1].weight.data = torch.tensor(new_wo, requires_grad=True, device=self.device)
        print 'self.fcs[0] and 1:',self.fcs[0].weight.shape, ' ', self.fcs[1].weight.shape

In [28]:
# will be buggy when layers increased to 3

In [29]:
nm = Model(5,4,3,2)

In [30]:
nm(torch.randn(1,3))

Forward...
X: torch.Size([1, 3])
weight shapes...
fcs-0: torch.Size([5, 3])
fcs-1: torch.Size([2, 5])
goes in  Linear(in_features=3, out_features=5, bias=True) and gives out
out-0: torch.Size([1, 5]) and will go in
out-2: torch.Size([1, 2])


In [31]:
nm.add_units(7)

Current: 2  1: torch.Size([5, 3])  2: torch.Size([2, 5])
tmp_i: torch.Size([7, 3])
tmp_o: torch.Size([2, 7])
new_wi: torch.Size([12, 3])
new_wo: torch.Size([2, 12])
fcs-0: Linear(in_features=3, out_features=12, bias=True)
fcs-1: Linear(in_features=12, out_features=2, bias=True)
self.fcs[0] and 1: torch.Size([12, 3])   torch.Size([2, 12])


In [32]:
nm(torch.randn(1,3))

Forward...
X: torch.Size([1, 3])
weight shapes...
fcs-0: torch.Size([12, 3])
fcs-1: torch.Size([2, 12])
goes in  Linear(in_features=3, out_features=12, bias=True) and gives out
out-0: torch.Size([1, 12]) and will go in
out-2: torch.Size([1, 2])


In [33]:
summary(nm,(1,3))

Forward...
X: torch.Size([2, 1, 3])
weight shapes...
fcs-0: torch.Size([12, 3])
fcs-1: torch.Size([2, 12])
goes in  Linear(in_features=3, out_features=12, bias=True) and gives out
out-0: torch.Size([2, 1, 12]) and will go in
out-2: torch.Size([2, 1, 2])
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                [-1, 1, 12]              48
            Linear-2                 [-1, 1, 2]              26
Total params: 74
Trainable params: 74
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00
----------------------------------------------------------------


In [34]:
nm.add_units(10)

Current: 2  1: torch.Size([12, 3])  2: torch.Size([2, 12])
tmp_i: torch.Size([10, 3])
tmp_o: torch.Size([2, 10])
new_wi: torch.Size([22, 3])
new_wo: torch.Size([2, 22])
fcs-0: Linear(in_features=3, out_features=15, bias=True)
fcs-1: Linear(in_features=15, out_features=2, bias=True)
self.fcs[0] and 1: torch.Size([22, 3])   torch.Size([2, 22])


In [35]:
# Error Reason listed in the class above, line 52
summary(nm,(1,3))

Forward...
X: torch.Size([2, 1, 3])
weight shapes...
fcs-0: torch.Size([22, 3])
fcs-1: torch.Size([2, 22])
goes in  Linear(in_features=3, out_features=15, bias=True) and gives out


RuntimeError: The expanded size of the tensor (22) must match the existing size (15) at non-singleton dimension 2

In [36]:
for i in nm.children():
    print i

ReLU()
ModuleList(
  (0): Linear(in_features=3, out_features=15, bias=True)
  (1): Linear(in_features=15, out_features=2, bias=True)
)


#### Setting up different learning rates for different layers

Advancement over other archs

In [37]:
# ignored_params = list(map(id, model.fc.parameters()))
# base_params = filter(lambda p: id(p) not in ignored_params,
#                      model.parameters())

# optimizer = torch.optim.SGD([
#             {'params': base_params},
#             {'params': model.fc.parameters(), 'lr': opt.lr}
#         ], lr=opt.lr*0.1, momentum=0.9)


## Experiment -- Network Morphism - III
<img src="images/morph3.png" width=700>

In [38]:
# Read idempotent functions

## Experiment -- Network Morphism IV
<img src="images/morph4b.png" width=700>
<img src="images/morph4a.png" width=700>

In [39]:
# Choosing h(x) is not trivial, atleast from first look.

## Implementing NASH

<img src="images/algo.png" width="700">

Total Training epochs => epoch_total = epoch_neigh.n_neigh.n_steps +epoch_final.

With the setting 

n_steps =5

epoch_neigh =17

epoch_final =100

and pretraining the starting network for 20 epochs, the models that are returned by the algorithm are trained for a total number of of 20 + 17 · 5 + 100 = 205 epochs.

In [42]:
# some start model_0
model_0 = 0
n_steps = 10
n_neigh = 4
n_nm = 3
epoch_neigh = 30
epoch_final = 50
lr_start = 0.01
lr_end = 0.001  # annealed via SGDR

model_best = model_0

In [43]:
# hill climbing

for i in range(n_steps) :
    
    #NEIGHBOURS ---------------------
    
    # get n_neigh neighbours of model_best
    for j in range(n_neigh-1) :
        model_j = applyMorph(model_best,n_nm)
        
        #train this model_j for a few epochs.
        
        model_j = Train(SGDR,model_j,epoch_neigh,lr_start,lr_end)
     
    # paper says  : "last model obtained is infact the best model therefore via hillclimbing we choose this."
    model_n_neigh = Train(SGDR,model_best,epoch_neigh,lr_start,lr_end)
    
    #best model on validation set.
    
    #SELECT MAX ---------------------
    
    model_best = argMax([ValidationPerformance(model_j) for model_j in models_1__n_neigh])
    
    #train final model.
    model_best = Train(SGDR,model_best,epoch_neigh,lr_start,lr_end)
        

NameError: name 'applyMorph' is not defined

## ApplyNetMorph 

Algorithm 1 provides full details for the algorithm. In the implementation, the function ApplyNetMorph(model,n) (line 15) applies n network morphisms, each of them sampled uniformly at random from the following three:


<img src="images/img1.png" width=500> <img src="images/img2.png" width=500>

In [71]:
# Testing random number generator bias.

cnt = {}

cnt[0] = 0
cnt[1] = 0 
cnt [2] = 0

n = 10

for _ in range (n) :
    cnt[random.choice(range(3))] += 1

print cnt[0]/float(n) , cnt[1]/float(n) , cnt[2]/float(n)

0.4 0.5 0.1


In [79]:
def widen(model) :
    #sample which conv layer to be widened.
    #NetMorph Type 2
    #Choose widening faction {2,4} uniformly.
    pass

def deepen(model) :
    #create a Conv-BatchNorm-Relu Block
    #Position this block -- 1
    #Kernel Size {3,5} -- 2
    #Perform 1 & 2 uniformly
    
    #Number of channels = Channels of closest preceeding channels.
    pass


def skip_1(model) :
    pass

def skip_2(model) :
    pass

skip_operations = [skip_1,skip_2]

def skip(model) :
    op = skip_operations[random.choice(range(2))]
    return op(model)

operations = [widen,deepen,skip]

def applyMorph(model,n) :
    '''
    @params :
    model : Input Model.
    n : Number of network morphisms applied.
    '''
    
    op = operations[random.choice(range(3))]
    print 'Choosen Operation: ',op
    op(model)
    return model

applyMorph(0,1)

Choosen Operation:  <function deepen at 0x11a7a3de8>


0

## Evolutionary ideas presented :

While the method is presented as a simple hill-climbing method, it can also be interpreted as a very simple evolutionary algorithm with a population size of n_neigh, no cross-over, network morphisms as mutations, and a selection mechanism that only considers the best-performing population member as the parent for the next generation. This interpretation also suggests several promising possibilities for extending this simple method.