### Pruning after every 50 iterations

In [1]:
import math
import random
from itertools import repeat
import numpy as np
import torch
from torch.nn import Parameter
from torch.nn.modules.module import Module
import torch.nn.functional as F
import torch.nn.init as init
import torch.nn as nn
import torchvision
import torch.optim as optim

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
n_epochs=2
batch_size_train=64
batch_size_test=1000
learning_rate=0.01
momentum=0.5 #momentum will use the gradient of past steps also instead of just the current gradient
log_interval=1000

random_seed=1 #seed means that the random number generation will start from this point only and hence same numbers will be generated
torch.backends.cudnn.enabled=False
torch.manual_seed(random_seed)

#loading the dataset
train_loader=torch.utils.data.DataLoader(torchvision.datasets.MNIST(root='./data/files_MNIST/',train=True,download=False,transform=torchvision.transforms.Compose([torchvision.transforms.ToTensor(),torchvision.transforms.Normalize((0.1307,),(0.3081,))])),batch_size=batch_size_train,shuffle=True)
test_loader = torch.utils.data.DataLoader(torchvision.datasets.MNIST(root='./data/files_MNIST/', train=False, download=False,transform=torchvision.transforms.Compose([torchvision.transforms.ToTensor(),torchvision.transforms.Normalize((0.1307,),(0.3081,))])),batch_size=batch_size_test,shuffle=True)

In [3]:
#creating a class for the Pruning model from concentration
class ConcPrune(nn.Module):
    #these are the idiotypic data
    def __init__(self):
        super(ConcPrune,self).__init__()
        self.channels_conv1=64
        self.channels_conv2=64
        self.ksize_conv1=3
        self.ksize_conv2=3
        self.fc_postpool=14*14*64
        self.h3ln=256
        self.h4ln=256
        self.out_classes=10
        self.init_conc=100.0
        
        
    def prune_by_conc(self):
        print('pruning by conc')


        #identifying the non hot least concentrated neurons here - only one population - not multiple
        #h1 and h2 are convolutional layers and h3 is a linear layer

        #add other functionalities here - send the conc from the other func
        #calculate the min and max conc weights - that is - determine hot and non hot neurons
    def prune_by_lth(self,reinit=False):
        print('Pruning by LTH')
        s=0.25
        self.conv1.lth_prune(reinit)
        self.conv2.lth_prune(reinit)
        self.fc1.lth_prune(reinit)
        self.fc2.lth_prune(reinit)
        self.fc3.lth_prune(reinit)
        
        return self

In [4]:
#creating a class for the Masked Layers - conv
class MaskConv(nn.Module):
    def __init__(self, inc, outc, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=False):
        super(MaskConv,self).__init__()
        self.inc=inc
        self.outc=outc
        self.kernel_size= kernel_size
        self.stride=stride
        self.padding=padding
        self.dilation=dilation
        self.groups=groups
        self.pruning_fraction=0.2
        
        self.weights_cust=Parameter(torch.Tensor(self.outc,self.inc,*self.kernel_size))
        self.mask=Parameter(torch.ones([self.outc,self.inc,*self.kernel_size]),requires_grad=False)
        if bias:
            self.bias=Parameter(torch.Tensor(outc))
        else:
            self.register_parameter('bias',None)
        self.reset_parameters()
    
    #this is to reset the parameters - initialise the weights
    def reset_parameters(self):
        init.kaiming_uniform_(self.weights_cust,a=math.sqrt(5))
        
    def forward(self, input):
        return F.conv2d(input, self.weights_cust*self.mask, self.bias, self.stride, self.padding, self.dilation, self.groups)
            
    
    #here pruning by the lottery ticket hypothesis
    def lth_prune(self,reinit=False):
        prune_val=0.1 #this is equivalent to 10%
        weights = self.weights_cust.data.clone().cpu().detach().numpy()
        tensor_mask=self.mask.data.clone().cpu().detach().numpy()
        
        number_of_remaining_weights = np.sum(tensor_mask)
        number_of_weights_to_prune = np.ceil(prune_val * number_of_remaining_weights).astype(int)
        weight_vector=np.concatenate([weights[tensor_mask == 1]])
        threshold = np.sort(np.abs(weight_vector))[number_of_weights_to_prune]
        new_mask =np.where(np.abs(weights) > threshold, tensor_mask, np.zeros_like(weights))

        self.mask=torch.nn.Parameter(torch.from_numpy(new_mask),requires_grad=False)
        print(self.mask)
        
        #prune_val=10
        #prune_percentile=np.percentile(abs(alive), prune_val)
        #threshold=prune_percentile
        # Convert Tensors to numpy and calculate
        
        
        if reinit:
            nn.init.xavier_uniform_(self.weights_cust)
            self.weights_cust.data = self.weights_cust.data * self.mask.data
        
                    

In [5]:
#creating a class for the masked layers - linear
class MaskLin(Module):
    def __init__(self, inf, outf, bias=True):
        super(MaskLin,self).__init__()
        self.inf=inf
        self.outf=outf
        self.weight=Parameter(torch.Tensor(outf, inf))
        self.mask = Parameter(torch.ones([outf, inf]), requires_grad=False)
        if bias:
            self.bias = Parameter(torch.empty(outf))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()
        
    def reset_parameters(self):
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in)
            init.uniform_(self.bias, -bound, bound)

    def forward(self, input):
        return F.linear(input,self.weight*self.mask, self.bias)
    
    
    def lth_prune(self,reinit=False):
        prune_val=0.12 #this is equivalent to 20% for the fc layers
        weights = self.weight.data.clone().cpu().detach().numpy()
        tensor_mask=self.mask.data.clone().cpu().detach().numpy()
        
        number_of_remaining_weights = np.sum(tensor_mask)
        number_of_weights_to_prune = np.ceil(prune_val * number_of_remaining_weights).astype(int)
        weight_vector=np.concatenate([weights[tensor_mask == 1]])
        threshold = np.sort(np.abs(weight_vector))[number_of_weights_to_prune]
        new_mask =np.where(np.abs(weights) > threshold, tensor_mask, np.zeros_like(weights))

        self.mask=torch.nn.Parameter(torch.from_numpy(new_mask),requires_grad=False)
        
        if reinit:
            nn.init.xavier_uniform_(self.weight)
            self.weight.data = self.weight.data * self.mask.data

In [6]:
#creating a class for the conv network
class Net(ConcPrune):
    def __init__(self,mask=True):
        super(Net,self).__init__()
        conv=MaskConv if mask else nn.Conv2d
        lin=MaskLin if mask else nn.Linear
        self.conv1 = conv(1, 64, kernel_size=(3, 3))
        self.conv2 = conv(64, 64, kernel_size=(3, 3))
        #self.conv2_drop = nn.Dropout2d()
        self.fc1 = lin(12*12*64,256)
        self.fc2 = lin(256, 256)
        self.fc3 = lin(256, 10)

        #all others are left out as of now        

        ##check whether you have to manually initialise the weights or not

    def forward(self,x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x=F.max_pool2d(x,kernel_size=2)
        #x = F.relu(F.max_pool2d(self.conv2(x), 2))
        x = x.view(-1,12*12*64)
        x = F.relu(self.fc1(x))
        #x = F.dropout(x, training=self.training)
        x = F.relu(self.fc2(x))
        x= self.fc3(x)
        return F.log_softmax(x)

In [7]:
#initialising the network and the optimizer
nnetwork = Net()
#network.to(device)
noptimizer = optim.SGD(nnetwork.parameters(), lr=learning_rate,
                      momentum=momentum)

In [None]:
print(nnetwork.fc1.weight.shape)

In [None]:
masterfile_init='../Logs/initial_T1/common/'
torch.save(nnetwork.state_dict(),masterfile_init+'init_weights_first.pth')

In [8]:
masterfile_init='../Logs/initial_T1/common/'
nnetwork.load_state_dict(torch.load(masterfile_init+'init_weights_first.pth'))

<All keys matched successfully>

In [9]:
def test(network):
  network.eval()
  test_loss = 0
  correct = 0
  with torch.no_grad():
    for data, target in test_loader:
      #data=data.to(device)
      #target=target.to(device)
      output = network(data)
      test_loss += F.nll_loss(output, target, size_average=False).item()
      pred = output.data.max(1, keepdim=True)[1]
      #print(pred)
      correct += pred.eq(target.data.view_as(pred)).sum()
  test_loss /= len(test_loader.dataset)
  test_losses.append(test_loss)
  print('\nTest set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.6f}%)\n'.format(
    test_loss, correct, len(test_loader.dataset),
    100. * correct / len(test_loader.dataset)))
  test_log_list.append([test_loss,correct,100. * correct / len(test_loader.dataset)])

In [10]:
def train(epoch,network,optimizer):
    network.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        #print(batch_idx)
        #data=data.to(device)
        #target=target.to(device)
        #if batch_idx==100:
        #    break
        optimizer.zero_grad()
        output = network(data)
        loss = F.nll_loss(output, target)

        #this is where the backpropagation carries out
        loss.backward()
        #l1grad=network.conv1.weight.grad.clone()
        #print(l1grad)
        #pickle.dump(l1grad,testout)
        network.conv1.weights_cust.grad=network.conv1.weights_cust.grad*network.conv1.mask
        network.conv2.weights_cust.grad=network.conv2.weights_cust.grad*network.conv2.mask
        network.fc1.weight.grad=network.fc1.weight.grad*network.fc1.mask
        network.fc2.weight.grad=network.fc2.weight.grad*network.fc2.mask

        '''l1grad=network.conv1.weights_cust.grad.clone()
        l2grad=network.conv2.weights_cust.grad.clone()
        #print('l2grad is ',l2grad.shape)
        l3grad=network.fc1.weight.grad.clone()
        #print('l3grad is ',l3grad.shape)
        l4grad=network.fc2.weight.grad.clone()

        #print(grad_dic)
        #torch.save(grad_dic,'grad2.pt')
        gradconv1_list.append(l1grad)
        gradconv2_list.append(l2grad)
        gradfc1_list.append(l3grad)
        gradfc2_list.append(l4grad)'''
        train_losses.append(loss.item())
        #buffer = io.BytesIO()
        #torch.save(l1grad, buffer)
        optimizer.step()
        
        if batch_idx==35 and prune:
            network=network.prune_by_lth(reinit=False)
            network.load_state_dict(torch.load(masterfile_init+'init_weights_first.pth'))
            
        if batch_idx==36 and prune:
            print(network.conv1.mask)
        
        if batch_idx % log_interval == 0:
          print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, batch_idx * len(data), len(train_loader.dataset),
            100. * batch_idx / len(train_loader), loss.item()))
          train_counter.append(
            (batch_idx*64) + ((epoch-1)*len(train_loader.dataset)))
    return network,optimizer

In [11]:
#Training the model
train_losses = []
train_counter = []
test_losses = []
test_counter = [i*len(train_loader.dataset) for i in range(n_epochs + 1)]
test_log_list=[]

gradconv1_list=[]
gradconv2_list=[]
gradfc1_list=[]
gradfc2_list=[]
prune=0

In [14]:
test(nnetwork)




Test set: Avg. loss: 2.3041, Accuracy: 767/10000 (7.670000%)



In [15]:
for epoch in range(1, 1+1):
  nnetwork,noptimizer=train(epoch,nnetwork,noptimizer)
  test(nnetwork)




Test set: Avg. loss: 0.1480, Accuracy: 9522/10000 (95.220001%)



In [16]:
masterfile_init='../Logs/initial_T1/common/'
torch.save(train_losses,masterfile_init+'train_losses_lth.pt')

In [13]:
np.count_nonzero(nnetwork.conv1.mask)

517

In [None]:
masterfile_init='../Logs/initial_T1/common/'
torch.save(nnetwork.state_dict(),masterfile_init+'trained_weights_nostrategy.pth')

In [None]:
#loading the old initialiased weights
masterfile_init='../Logs/initial_T1/common/'
nnetwork.load_state_dict(torch.load(masterfile_init+'init_weights_first.pth'))

In [12]:
nnetwork.prune_by_lth(reinit=False)

Pruning by LTH
Parameter containing:
tensor([[[[1., 1., 1.],
          [1., 1., 1.],
          [1., 1., 1.]]],


        [[[0., 1., 0.],
          [1., 1., 0.],
          [0., 1., 0.]]],


        [[[1., 1., 1.],
          [1., 1., 1.],
          [1., 0., 1.]]],


        [[[1., 1., 1.],
          [1., 1., 1.],
          [1., 1., 1.]]],


        [[[1., 1., 1.],
          [1., 1., 1.],
          [1., 0., 1.]]],


        [[[1., 1., 1.],
          [1., 0., 1.],
          [1., 1., 1.]]],


        [[[1., 1., 0.],
          [1., 1., 1.],
          [1., 1., 1.]]],


        [[[1., 1., 1.],
          [0., 1., 1.],
          [1., 1., 1.]]],


        [[[1., 0., 1.],
          [1., 1., 1.],
          [1., 1., 1.]]],


        [[[1., 0., 1.],
          [1., 1., 1.],
          [1., 1., 1.]]],


        [[[1., 1., 1.],
          [1., 1., 1.],
          [1., 1., 1.]]],


        [[[1., 1., 1.],
          [1., 1., 1.],
          [1., 1., 0.]]],


        [[[1., 1., 1.],
          [1., 1., 1.],
   

Net(
  (conv1): MaskConv()
  (conv2): MaskConv()
  (fc1): MaskLin()
  (fc2): MaskLin()
  (fc3): MaskLin()
)

In [None]:
#THIS WORKED FOR CONV - THIS IS AS PER ORIGINAL LTH FROM FRANKLE

weights = nnetwork.conv1.weights_cust.data.clone().cpu().detach().numpy()
tensor_mask=nnetwork.conv1.mask.data.clone().cpu().detach().numpy()
number_of_remaining_weights = np.sum(tensor_mask)
print(number_of_remaining_weights)
number_of_weights_to_prune = np.ceil(0.2 * number_of_remaining_weights).astype(int)
print(number_of_weights_to_prune)
weight_vector=np.concatenate([weights[tensor_mask == 1]])
print(weight_vector.shape)
threshold = np.sort(np.abs(weight_vector))[number_of_weights_to_prune]
print(threshold)
new_mask =np.where(np.abs(weights) > threshold, tensor_mask, np.zeros_like(weights))

number_of_remaining_weights_after = np.sum(new_mask)
print(number_of_remaining_weights_after)

nnetwork.conv1.mask=torch.nn.Parameter(torch.from_numpy(new_mask),requires_grad=False)

In [None]:
#THIS WORKED FOR FC - THIS IS AS PER ORIGINAL LTH FROM FRANKLE

weights = nnetwork.fc1.weight.data.clone().cpu().detach().numpy()
tensor_mask=nnetwork.fc1.mask.data.clone().cpu().detach().numpy()
number_of_remaining_weights = np.sum(tensor_mask)
print(number_of_remaining_weights)
number_of_weights_to_prune = np.ceil(0.2 * number_of_remaining_weights).astype(int)
print(number_of_weights_to_prune)
weight_vector=np.concatenate([weights[tensor_mask == 1]])
print(weight_vector.shape)
threshold = np.sort(np.abs(weight_vector))[number_of_weights_to_prune]
print(threshold)
new_mask =np.where(np.abs(weights) > threshold, tensor_mask, np.zeros_like(weights))

number_of_remaining_weights_after = np.sum(new_mask)
print(number_of_remaining_weights_after)

In [None]:
test(nnetwork)

In [None]:
for epoch in range(1, 1+1):
  nnetwork,noptimizer=train(epoch,nnetwork,noptimizer)
  test(nnetwork)