In [1]:
### Import Libraries
import numpy as np
import torch.nn as nn
import torch
from torch.autograd.variable import Variable
import torchvision
from torchvision import datasets as dset
from torchvision import models
from torchvision import transforms
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import sampler
import timeit
import os
from PIL import Image
import matplotlib.pyplot as plt

In [2]:
### Retrieve the Pretrained Model
model = models.vgg11(pretrained=True)

In [3]:
### Define a new model, only work with half of VGG
### Turn new model into gpu type, call in model_gpu
import copy
gpu_dtype = torch.cuda.FloatTensor

class new_model(nn.Module):
    def __init__(self):
        super(new_model, self).__init__()
        
        self.vismodel = nn.Sequential(*list(model.children())[0])
        self.fc = nn.Linear(512,10)
    
    def forward(self,x):
        x = self.vismodel(x)
        x = torch.squeeze(x)
        x = self.fc(x)
        return x

new_model = new_model()
print(new_model)
model_gpu = copy.deepcopy(new_model).type(gpu_dtype)

new_model (
  (vismodel): Sequential (
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU (inplace)
    (2): MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU (inplace)
    (5): MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
    (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU (inplace)
    (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU (inplace)
    (10): MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
    (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): ReLU (inplace)
    (13): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (14): ReLU (inplace)
    (15): MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
    (16): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (17): ReLU 

In [4]:
### Test the model out
x_gpu = torch.randn(64, 3, 32, 32).type(gpu_dtype)
x_var_gpu = Variable(x_gpu) # Construct a PyTorch Variable out of your input data
ans = model_gpu(x_var_gpu)        # Feed it through the model!
print(ans[0])

Variable containing:
-0.1119
-0.7000
-0.2815
-0.6935
 0.3994
 2.0697
 0.3658
 0.3497
 0.8487
 0.1171
[torch.cuda.FloatTensor of size 10 (GPU 0)]



In [5]:
### Retrieve dataset
class ChunkSampler(sampler.Sampler):
    """Samples elements sequentially from some offset. 
    Arguments:
        num_samples: # of desired datapoints
        start: offset where we should start selecting from
    """
    def __init__(self, num_samples, start = 0):
        self.num_samples = num_samples
        self.start = start

    def __iter__(self):
        return iter(range(self.start, self.start + self.num_samples))

    def __len__(self):
        return self.num_samples
    
NUM_TRAIN = 10000
NUM_VAL = 1000

transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4,
                                          sampler=ChunkSampler(NUM_TRAIN, 0))

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=4)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

Files already downloaded and verified
Files already downloaded and verified


In [6]:
### Training
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_gpu.parameters())
running_loss = 0.0
for epoch in range(20):
    for i, data in enumerate(trainloader, 0): #i is a counter, start from 0, the tuple (i,data) 
                                          #is produced
        # get the inputs
        inputs, labels = data
        inputs_gpu = inputs.type(gpu_dtype)
        labels_gpu = labels.type(gpu_dtype).long()

        # wrap them in Variable
        inputs, labels = Variable(inputs_gpu), Variable(labels_gpu)
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model_gpu(inputs) # Forward -> score
        loss = criterion(outputs, labels) # Forward -> loss
        loss.backward() # Backward generate gradients
        optimizer.step() # Update Parameters

        # print statistics
        running_loss += loss.data[0]
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                    (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0
print("Finish Training")

[1,  2000] loss: 1.963
[2,  2000] loss: 2.031
[3,  2000] loss: 1.742
[4,  2000] loss: 1.510
[5,  2000] loss: 1.364
[6,  2000] loss: 1.219
[7,  2000] loss: 1.121
[8,  2000] loss: 1.018
[9,  2000] loss: 0.914
[10,  2000] loss: 0.874
[11,  2000] loss: 0.813
[12,  2000] loss: 0.725
[13,  2000] loss: 0.746
[14,  2000] loss: 0.656
[15,  2000] loss: 0.586
[16,  2000] loss: 0.634
[17,  2000] loss: 0.592
[18,  2000] loss: 0.519
[19,  2000] loss: 0.519
[20,  2000] loss: 0.481
Finish Training


In [7]:
### Compute Accuracy
correct = 0
total = 0
for data in testloader:
    images, labels = data
    images = images.type(gpu_dtype)
    labels = labels.type(gpu_dtype).long()
    outputs = model_gpu(Variable(images))
    _, predicted = torch.max(outputs.data, 1) # return (每一横行最大的那个数，这个数所在的index)
    total += labels.size(0) #will be added up to 10,000, 每次有4个 - batch size
    correct += (predicted == labels).sum()

print('Accuracy of the network on the 10000 test images: %d %%' % (
    100 * correct / total))

Accuracy of the network on the 10000 test images: 61 %


In [8]:
### Pruning Conv Layer
index_dict4d = dict([("conv1", []),("conv2", []),("conv3", []),("conv4", []),("conv5", []),("conv6", []),("conv7", []),("conv8", [])])
index_dict1d = dict([("conv1", []),("conv2", []),("conv3", []),("conv4", []),("conv5", []),("conv6", []),("conv7", []),("conv8", [])])
conv_counter = 0

# Freeze the parameters that is less than a threshold, Say -0.5
# When I say freeze up, I mean set the weight to 0
for child in model_gpu.children():
    for children_of_child in child.children(): # Going thru all layers of the network
        print(children_of_child)
        if "Conv2d" in str(children_of_child): #check if it is a conv layer
            conv_counter += 1
            #print("total parameters:",len(list(children_of_child.parameters())))
            for param in children_of_child.parameters():
                #print(type(param.data[0,0,0,0]))
                if len(param.data.size()) == 4:
                    #Loop through all the entries
                    for i in range(param.data.size()[0]):
                        for j in range(param.data.size()[1]):
                            for k in range(param.data.size()[2]):
                                for l in range(param.data.size()[3]):
                                    if param.data[i,j,k,l] < -0.5:
                                        param.data[i,j,k,l] = 0
                                        index_name = "conv" + str(conv_counter)
                                        index_dict4d[index_name].append([i,j,k,l])
                else:
                    for i in range(param.data.size()[0]):
                        if param.data[i] < -0.5:
                            param.data[i] = 0
                            index_name = "conv" + str(conv_counter)
                            index_dict1d[index_name].append(i)
                        

Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU (inplace)
MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU (inplace)
MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU (inplace)
Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU (inplace)
MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU (inplace)
Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU (inplace)
MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU (inplace)
Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU (inplace)
MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))


In [9]:
print(index_dict1d["conv1"])
#### Visualize a set of pruned parameters
for child in model_gpu.children():
    for children_of_child in child.children(): # Going thru all layers of the network
        for param in children_of_child.parameters():
            if len(param.data.size()) == 4:
                #Loop through all the entries
                print(param.data)
            break
        break
    break

[3, 7, 10, 14, 15, 17, 19, 21, 22, 23, 26, 28, 29, 30, 31, 32, 38, 41, 43, 50, 51, 52, 57, 58, 60, 63]

(0 ,0 ,.,.) = 
  0.1628  0.0532 -0.2291
  0.0334  0.4052 -0.4388
 -0.2271  0.3733 -0.1803

(0 ,1 ,.,.) = 
  0.2702 -0.0601  0.0000
  0.3376  0.4988  0.0000
  0.0164  0.5087 -0.3284

(0 ,2 ,.,.) = 
  0.1215 -0.0277 -0.3225
  0.0875  0.3706 -0.3914
 -0.0539  0.4169 -0.0967
     ⋮ 

(1 ,0 ,.,.) = 
 -0.2983 -0.2527  0.5494
  0.0000 -0.3279  0.7624
 -0.3069 -0.0826  0.6425

(1 ,1 ,.,.) = 
  0.0000 -0.3998  0.6207
  0.0000 -0.3210  1.0808
 -0.4434 -0.0339  0.8969

(1 ,2 ,.,.) = 
  0.0251 -0.2610  0.1148
 -0.1314 -0.2159  0.3815
 -0.0313 -0.0154  0.3842
     ⋮ 

(2 ,0 ,.,.) = 
  0.0814 -0.0218  0.2336
  0.1044  0.1847 -0.0872
 -0.4612  0.0197 -0.0651

(2 ,1 ,.,.) = 
 -0.0434 -0.3602 -0.1910
  0.2864  0.2199 -0.3160
 -0.2144  0.2654 -0.0338

(2 ,2 ,.,.) = 
  0.0228 -0.1605 -0.0345
  0.2336  0.2143 -0.1375
 -0.1072  0.2294  0.0480
...   
     ⋮ 

(61,0 ,.,.) = 
  0.3634  0.5034  0.2240
  0.00

In [10]:
### Check accuracy after pruning without retraining
correct = 0
total = 0
for data in testloader:
    images, labels = data
    images = images.type(gpu_dtype)
    labels = labels.type(gpu_dtype).long()
    outputs = model_gpu(Variable(images))
    _, predicted = torch.max(outputs.data, 1) # return (每一横行最大的那个数，这个数所在的index)
    total += labels.size(0) #will be added up to 10,000, 每次有4个 - batch size
    correct += (predicted == labels).sum()

print('Accuracy of the network on the 10000 test images: %d %%' % (
    100 * correct / total))

Accuracy of the network on the 10000 test images: 15 %


In [11]:
### Add up hook to gradient in order to avoid updating certain weights
def my_hook4d1(grad):
    grad_clone = grad.clone()
    for i in range(len(index_dict4d["conv1"])):
        a,b,c,d = index_dict4d["conv1"][i]
        grad_clone[a,b,c,d] = 0
    return grad_clone

def my_hook4d2(grad):
    grad_clone = grad.clone()
    for i in range(len(index_dict4d["conv2"])):
        a,b,c,d = index_dict4d["conv2"][i]
        grad_clone[a,b,c,d] = 0
    return grad_clone

def my_hook4d3(grad):
    grad_clone = grad.clone()
    for i in range(len(index_dict4d["conv3"])):
        a,b,c,d = index_dict4d["conv3"][i]
        grad_clone[a,b,c,d] = 0
    return grad_clone

def my_hook4d4(grad):
    grad_clone = grad.clone()
    for i in range(len(index_dict4d["conv4"])):
        a,b,c,d = index_dict4d["conv4"][i]
        grad_clone[a,b,c,d] = 0
    return grad_clone

def my_hook4d5(grad):
    grad_clone = grad.clone()
    for i in range(len(index_dict4d["conv5"])):
        a,b,c,d = index_dict4d["conv5"][i]
        grad_clone[a,b,c,d] = 0
    return grad_clone

def my_hook4d6(grad):
    grad_clone = grad.clone()
    for i in range(len(index_dict4d["conv6"])):
        a,b,c,d = index_dict4d["conv6"][i]
        grad_clone[a,b,c,d] = 0
    return grad_clone

def my_hook4d7(grad):
    grad_clone = grad.clone()
    for i in range(len(index_dict4d["conv7"])):
        a,b,c,d = index_dict4d["conv7"][i]
        grad_clone[a,b,c,d] = 0
    return grad_clone

def my_hook4d8(grad):
    grad_clone = grad.clone()
    for i in range(len(index_dict4d["conv8"])):
        a,b,c,d = index_dict4d["conv8"][i]
        grad_clone[a,b,c,d] = 0
    return grad_clone

hook_dict4d = dict([("hook1",my_hook4d1),("hook2",my_hook4d2),("hook3",my_hook4d3),("hook4",my_hook4d4),("hook5",my_hook4d5),("hook6",my_hook4d6),("hook7",my_hook4d7),("hook8",my_hook4d8)])

def my_hook1d1(grad):
    grad_clone = grad.clone()
    for i in range(len(index_dict1d["conv1"])):
        a = index_dict1d["conv1"][i]
        grad_clone[a] = 0
    return grad_clone

def my_hook1d2(grad):
    grad_clone = grad.clone()
    for i in range(len(index_dict1d["conv2"])):
        a = index_dict1d["conv2"][i]
        grad_clone[a] = 0
    return grad_clone

def my_hook1d3(grad):
    grad_clone = grad.clone()
    for i in range(len(index_dict1d["conv3"])):
        a = index_dict1d["conv3"][i]
        grad_clone[a] = 0
    return grad_clone

def my_hook1d4(grad):
    grad_clone = grad.clone()
    for i in range(len(index_dict1d["conv4"])):
        a = index_dict1d["conv4"][i]
        grad_clone[a] = 0
    return grad_clone

def my_hook1d5(grad):
    grad_clone = grad.clone()
    for i in range(len(index_dict1d["conv5"])):
        a = index_dict1d["conv5"][i]
        grad_clone[a] = 0
    return grad_clone

def my_hook1d6(grad):
    grad_clone = grad.clone()
    for i in range(len(index_dict1d["conv6"])):
        a = index_dict1d["conv6"][i]
        grad_clone[a] = 0
    return grad_clone

def my_hook1d7(grad):
    grad_clone = grad.clone()
    for i in range(len(index_dict1d["conv7"])):
        a = index_dict1d["conv7"][i]
        grad_clone[a] = 0
    return grad_clone

def my_hook1d8(grad):
    grad_clone = grad.clone()
    for i in range(len(index_dict1d["conv8"])):
        a = index_dict1d["conv8"][i]
        grad_clone[a] = 0
    return grad_clone

hook_dict1d = dict([("hook1",my_hook1d1),("hook2",my_hook1d2),("hook3",my_hook1d3),("hook4",my_hook1d4),("hook5",my_hook1d5),("hook6",my_hook1d6),("hook7",my_hook1d7),("hook8",my_hook1d8)])

conv_counter = 0
child_counter = 0
for child in model_gpu.children():
    children_of_child_counter = 0
    for children_of_child in child.children(): # Going thru all layers of the network
        if "Conv2d" in str(children_of_child): #check if it is a conv layer
            conv_counter += 1
            for param in children_of_child.parameters():
                if len(param.data.size()) == 4:
                    hook_name = "hook" + str(conv_counter)
                    param.register_hook(hook_dict4d[hook_name])
                    print('child ', children_of_child_counter, 'of child',child_counter,' requires gradient')
                else:
                    hook_name = "hook" + str(conv_counter)
                    param.register_hook(hook_dict1d[hook_name])
        else:
            for param in children_of_child.parameters():
                param.requires_grad = False
        children_of_child_counter += 1
    child_counter += 1

child  0 of child 0  requires gradient
child  3 of child 0  requires gradient
child  6 of child 0  requires gradient
child  8 of child 0  requires gradient
child  11 of child 0  requires gradient
child  13 of child 0  requires gradient
child  16 of child 0  requires gradient
child  18 of child 0  requires gradient


In [12]:
### Retraining
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_gpu.parameters())
running_loss = 0.0
for epoch in range(5):
    for i, data in enumerate(trainloader, 0): #i is a counter, start from 0, the tuple (i,data) 
                                          #is produced
        # get the inputs
        inputs, labels = data
        inputs_gpu = inputs.type(gpu_dtype)
        labels_gpu = labels.type(gpu_dtype).long()

        # wrap them in Variable
        inputs, labels = Variable(inputs_gpu), Variable(labels_gpu)
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model_gpu(inputs) # Forward -> score
        loss = criterion(outputs, labels) # Forward -> loss
        loss.backward() # Backward generate gradients
        optimizer.step() # Update Parameters

        # print statistics
        running_loss += loss.data[0]
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                    (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0
print("Finish Training")

[1,  2000] loss: 1.825
[2,  2000] loss: 1.723
[3,  2000] loss: 1.305
[4,  2000] loss: 1.047
[5,  2000] loss: 0.860
Finish Training


In [13]:
#### Checking if pruned parameters are set to be 0
for child in model_gpu.children():
    for children_of_child in child.children(): # Going thru all layers of the network
        for param in children_of_child.parameters():
            if len(param.data.size()) == 4:
                #Loop through all the entries
                print(param.data)
            break
        break
    break


(0 ,0 ,.,.) = 
  0.1815  0.0097 -0.3997
 -0.0021  0.3798 -0.6989
 -0.2638  0.4159 -0.3177

(0 ,1 ,.,.) = 
  0.2437 -0.1695  0.0000
  0.2635  0.4127  0.0000
 -0.0465  0.4961 -0.5362

(0 ,2 ,.,.) = 
  0.1580 -0.0457 -0.4509
  0.0729  0.3713 -0.5718
 -0.0717  0.4613 -0.1932
     ⋮ 

(1 ,0 ,.,.) = 
 -0.4471 -0.3161  0.5461
  0.0000 -0.4608  0.7493
 -0.4650 -0.1266  0.6495

(1 ,1 ,.,.) = 
  0.0000 -0.5272  0.5692
  0.0000 -0.5083  1.0261
 -0.6355 -0.1450  0.8409

(1 ,2 ,.,.) = 
 -0.0908 -0.3115  0.1389
 -0.3261 -0.3288  0.3939
 -0.1672 -0.0851  0.3785
     ⋮ 

(2 ,0 ,.,.) = 
  0.1070 -0.0997  0.1548
  0.1702  0.2074 -0.1850
 -0.4211  0.0529 -0.1714

(2 ,1 ,.,.) = 
 -0.0324 -0.4525 -0.2831
  0.3735  0.2597 -0.4072
 -0.1380  0.3340 -0.1186

(2 ,2 ,.,.) = 
  0.0306 -0.2275 -0.1110
  0.2961  0.2516 -0.2088
 -0.0426  0.2926 -0.0051
...   
     ⋮ 

(61,0 ,.,.) = 
  0.1343  0.2353  0.0022
  0.0000  0.0000 -0.8001
 -0.0093 -0.0114 -0.1382

(61,1 ,.,.) = 
  0.4346  0.5454  0.2508
  0.0000  0.0000  

In [14]:
### Check accuracy after pruning with 5 epochs retraining
correct = 0
total = 0
for data in testloader:
    images, labels = data
    images = images.type(gpu_dtype)
    labels = labels.type(gpu_dtype).long()
    outputs = model_gpu(Variable(images))
    _, predicted = torch.max(outputs.data, 1) # return (每一横行最大的那个数，这个数所在的index)
    total += labels.size(0) #will be added up to 10,000, 每次有4个 - batch size
    correct += (predicted == labels).sum()

print('Accuracy of the network on the 10000 test images: %d %%' % (
    100 * correct / total))

Accuracy of the network on the 10000 test images: 59 %


In [17]:
pruned_counter = 0
for child in model_gpu.children():
    for children_of_child in child.children(): # Going thru all layers of the network
        if "Conv2d" in str(children_of_child): #check if it is a conv layer
            for param in children_of_child.parameters():
                if len(param.data.size()) == 4:
                    #Loop through all the entries
                    for i in range(param.data.size()[0]):
                        for j in range(param.data.size()[1]):
                            for k in range(param.data.size()[2]):
                                for l in range(param.data.size()[3]):
                                    if param.data[i,j,k,l] == 0:
                                        pruned_counter += 1
                else:
                    for i in range(param.data.size()[0]):
                        if param.data[i] == 0:
                            pruned_counter += 1

print("we pruned a total of",pruned_counter,"weights")

we pruned a total of 9065 weights
