## Compressing a ResNet-18 trained on ImageNet

#### To replicate the results with pretrained models, please download the following models:

1. resnet18_imagenet_pytorch.pth
2. resnet18_imagenet_pytorch_small_cup_t_point_9.pth
3. resnet18_imagenet_pytorch_lr_point1_cupSS_K_point03_b_point3.pth

In [1]:
# Reset argv to a single empty entry so that the parser.parse_args() call
# further down does not pick up the notebook kernel's own arguments.
import sys; sys.argv=['']; 
# Put the parent directory first on the import path (presumably where the
# `src` package imported below lives -- confirm against the repo layout).
sys.path.insert(0, '../')
del sys
# IPython magics: reload edited modules automatically before running a cell.
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline

import os
# Expose only GPUs 0, 1 and 2 to this process. This is set before torch is
# imported below.
os.environ["CUDA_VISIBLE_DEVICES"] = '0,1,2'

import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torch.utils.model_zoo as model_zoo
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torchvision import datasets, transforms
from tensorboardX import SummaryWriter

import numpy as np
import random
import time
import copy

# os.environ["CUDA_VISIBLE_DEVICES"] = '1'

from src.imagenet_utils import train,validate,save_checkpoint,AverageMeter,ProgressMeter
from src.imagenet_utils import adjust_learning_rate,accuracy,adjust_learning_rate_pytorch_retrain
from src.utils import plot_tsne,fancy_dendrogram,save_obj,load_obj,weights_init
from src.model import VGG,load_model
from src.prune_model import prune_model
from src.cluster_model import cluster_model
from src.train_test import adjust_learning_rate_nips,adjust_learning_rate_iccv

### Specify imagenet data path

In [2]:
# Central configuration object for the notebook; the remaining options are
# registered on this same parser in the next cell.
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')

# Root folder of the dataset -- edit the default to match your machine.
# The data-loading code joins 'train' and 'val' onto this path.
parser.add_argument(
    '--data',
    metavar='S',
    type=str,
    default='/localscratch/mgh/Rahul_Imagenet/',
    help='path to dataset',
)

_StoreAction(option_strings=['--data'], dest='data', nargs=None, const=None, default='/localscratch/mgh/Rahul_Imagenet/', type=<class 'str'>, choices=None, help='path to dataset', metavar='S')

In [3]:

# Register the remaining options on the parser created in the previous cell
# and parse them into `args`. Help texts state the defaults actually in use.
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=256, type=int,
                    metavar='N',
                    help='mini-batch size (default: 256), this is the total '
                         'batch size of all GPUs on the current node when '
                         'using Data Parallel or Distributed Data Parallel')
parser.add_argument('-j', '--workers', default=6, type=int, metavar='N',
                    help='number of data loading workers (default: 6)')
parser.add_argument('--epochs', default=90, type=int, metavar='N',
                    help='number of total epochs to run')
# Several later cells overwrite args.lr with 0.1 before building an optimizer.
parser.add_argument('--lr', '--learning-rate', default=0.01, type=float,
                    metavar='LR', help='initial learning rate')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)')
parser.add_argument('-p', '--print-freq', default=500, type=int,
                    metavar='N', help='print frequency (default: 500)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                    help='use pre-trained model')
parser.add_argument('--world-size', default=-1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA training')
parser.add_argument('--seed', type=int, default=12346, metavar='S',
                    help='random seed (default: 12346)')
parser.add_argument('--num_output', type=int, default=10, metavar='S',
                    help='number of classes(default: 10)')
parser.add_argument('--log-interval', type=int, default=100, metavar='N',
                    help='how many batches to wait before logging training status')
# Location of the baseline checkpoint; the compressed-model checkpoint paths
# used later in the notebook are derived from this value.
parser.add_argument('--checkpoint_path', type=str, default='./checkpoints/resnet18_imagenet_pytorch.pth', metavar='S',
                    help='path to store model training checkpoints')
# NOTE(review): with nargs='+' a value supplied on the command line is parsed
# into a list of ints, whereas the default is the plain int 0. The notebook
# passes args.gpu straight to .cuda() / torch.cuda.set_device(), i.e. it
# relies on the default -- confirm before passing --gpu explicitly.
parser.add_argument('--gpu', type=int, default=0, nargs='+', help='used gpu')
# Adjacent string literals are concatenated into a single help text (no commas
# between them, otherwise they would become extra positional arguments).
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                         'N processes per node, which has N GPUs. This is the '
                         'fastest way to use PyTorch for either single node or '
                         'multi node data parallel training')

# parse_args() reads sys.argv, which the first cell reset to [''], so every
# option takes its default unless changed on `args` afterwards.
args = parser.parse_args()

# CUDA is used only when it is both requested and available; the availability
# probe is skipped entirely when --no-cuda is given.
# NOTE(review): use_cuda/device are computed here, but the criterion below and
# the models in later cells are moved with .cuda(args.gpu) regardless.
use_cuda = torch.cuda.is_available() if not args.no_cuda else False
device = torch.device("cuda") if use_cuda else torch.device("cpu")

# Dataset layout: <args.data>/train and <args.data>/val.
traindir, valdir = (os.path.join(args.data, split) for split in ('train', 'val'))

# Three-channel mean/std normalisation shared by both pipelines.
normalize = transforms.Normalize(std=[0.229, 0.224, 0.225],
                                 mean=[0.485, 0.456, 0.406])

# Training pipeline: random 224 crop + horizontal flip, then tensor + normalise.
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize,
])
train_dataset = datasets.ImageFolder(traindir, train_transform)

# No custom sampler is used, so the loader shuffles the training set itself.
train_sampler = None

train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=args.batch_size,
    shuffle=(train_sampler is None),
    sampler=train_sampler,
    num_workers=args.workers,
    pin_memory=True,
)

# Validation pipeline: resize to 256, centre-crop 224, no augmentation and no
# shuffling.
val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    normalize,
])
val_dataset = datasets.ImageFolder(valdir, val_transform)
val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=args.batch_size,
    shuffle=False,
    num_workers=args.workers,
    pin_memory=True,
)

# Loss shared by every train()/validate() call in the notebook.
criterion = nn.CrossEntropyLoss()
criterion = criterion.cuda(args.gpu)

# TensorBoard event files for all experiments go under this directory.
writer = SummaryWriter('logs/resnet18_imagenet/')

# Set all seeds for reproducibility.
def set_random_seed(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, NumPy, and PyTorch (CPU generator and
    all CUDA devices), exports ``PYTHONHASHSEED`` and switches cuDNN to
    deterministic mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Use the function argument here as well; the global `args.seed` would
    # ignore the caller's value and fail when `args` is not defined.
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Exported for child processes; the hash seed of the interpreter that is
    # already running was fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True
# Seed all generators once up front with the configured seed; the training
# cells below seed again right before they create a model.
set_random_seed(args.seed)

# Make args.gpu (default 0) the current CUDA device for this process.
torch.cuda.set_device(args.gpu)

### Train baseline model

In [9]:
set_random_seed(args.seed)

# The baseline is trained with an initial learning rate of 0.1, overriding
# the --lr default.
args.lr = 0.1

resnet18 = torchvision.models.resnet18(pretrained=False)
optimizer = torch.optim.SGD(resnet18.parameters(), args.lr,
                            momentum=args.momentum,
                            weight_decay=args.weight_decay)

resnet18 = torch.nn.DataParallel(resnet18)
resnet18.cuda(args.gpu)

best_val_acc = 0

# Train only when no checkpoint exists yet; otherwise the stored model is used.
if not os.path.isfile(args.checkpoint_path):
    # Create the checkpoint folder up front so the first torch.save() cannot
    # fail on a missing directory after an epoch of training.
    checkpoint_dir = os.path.dirname(args.checkpoint_path)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate_pytorch_retrain(optimizer, epoch, args)

        # train for one epoch
        train_loss,train_top1,train_top5 = train(train_loader, resnet18, criterion, optimizer, epoch, args)

        # evaluate on validation set
        val_loss,val_top1,val_top5 = validate(val_loader, resnet18, criterion, args)

        # Keep only the checkpoint with the best top-1 validation accuracy.
        # The whole DataParallel wrapper is pickled; later cells unwrap it
        # with `.module`.
        if val_top1 > best_val_acc:
            torch.save(resnet18, args.checkpoint_path, pickle_protocol=4)
            best_val_acc = val_top1

        writer.add_scalars('resnet18_imagenet_pytorch_schedule/loss',{'train_loss': train_loss,
                                        'val_loss' : val_loss}, epoch)
        writer.add_scalars('resnet18_imagenet_pytorch_schedule/accuracy',{'train_top1': train_top1,
                                                  'val_top1': val_top1,
                                                  'train_top5': train_top5,
                                                  'val_top5': val_top5}, epoch)

# Both branches end the same way: load the best stored checkpoint and measure
# it. After a fresh training run this makes the reported numbers (and the
# `resnet18` object) those of the saved best model instead of the last epoch.
resnet18 = torch.load(args.checkpoint_path)
_,val_top1,val_top5 = validate(val_loader, resnet18, criterion, args, verbose=False)

print('loaded model with top1 : {}, top5 : {}'.format(val_top1,val_top5))

 * Acc@1 69.876 Acc@5 89.256
loaded model with top1 : 69.8759994506836, top5 : 89.25599670410156


### Compress using CUP (T = 0.8)

- This section compresses the resnet model that we trained

#### Load pre-trained model

In [10]:
# The checkpoint stores the DataParallel-wrapped baseline; unwrap it so the
# clustering code receives the plain model.
baseline_wrapped = torch.load(args.checkpoint_path)
resnet18 = baseline_wrapped.module

# validate() yields three values; the first one is not needed here.
_loss, val_top1, val_top5 = validate(val_loader, resnet18, criterion, args, verbose=False)

# NOTE(review): this stores the top-5 accuracy, while the training cells
# compare best_val_acc against top-1; the next cell resets it to 0 before use.
best_val_acc = val_top5
print('loaded model with top1 : {}, top5 : {}'.format(val_top1, val_top5))

 * Acc@1 69.876 Acc@5 89.256
loaded model with top1 : 69.8759994506836, top5 : 89.25599670410156


In [11]:
# Settings handed to cluster_model; 'distance_threshold' is the T value named
# in this section's heading.
cluster_args = {
    'cluster_layers' : {4:0,9:0,14:0,21:0,26:0,33:0,38:0,45:0},
    'conv_feature_size' : 1,
    'features' : 'both',
    'channel_reduction' : 'fro',
    'use_bias' : False,
    'reshape_exists' : False,
    'linkage_method' : 'ward',
    'distance_metric' : 'euclidean',
    'cluster_criterion' : 'hierarchical',
    'distance_threshold' : 0.8,
    'merge_criterion' : 'max_l2_norm',
    'verbose' : False
}

# Compressed-model checkpoint sits next to the baseline one (extension
# stripped, suffix appended).
path = os.path.splitext(args.checkpoint_path)[0] + '_small_cup_t_point_8.pth'

# Accuracies of the uncompressed model, measured by the preceding cell. They
# are kept under separate names because the retraining loop rebinds
# val_top1/val_top5.
large_top1, large_top5 = val_top1, val_top5

model_modifier = cluster_model(resnet18,cluster_args)
resnet18_clustered = model_modifier.cluster_model()
optimizer = torch.optim.SGD(resnet18_clustered.parameters(), args.lr,
                            momentum=args.momentum,
                            weight_decay=args.weight_decay)

resnet18_clustered = torch.nn.DataParallel(resnet18_clustered)
resnet18_clustered.cuda(args.gpu)

# Accuracy of the compressed model before any retraining.
_,top1_acc_no_retrain,top5_acc_no_retrain = validate(val_loader, resnet18_clustered, criterion, args, verbose=False)

best_val_acc = 0

# Retrain only when no compressed checkpoint exists yet.
if not os.path.isfile(path):
    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate_pytorch_retrain(optimizer, epoch, args)

        # train for one epoch
        train_loss,train_top1,train_top5 = train(train_loader, resnet18_clustered, criterion, optimizer, epoch, args)

        # evaluate on validation set
        val_loss,val_top1,val_top5 = validate(val_loader, resnet18_clustered, criterion, args)

        # Keep only the checkpoint with the best top-1 validation accuracy.
        if val_top1 > best_val_acc:
            torch.save(resnet18_clustered, path, pickle_protocol=4)
            best_val_acc = val_top1

        writer.add_scalars('resnet18_imagenet_t_point_8_pytorch/loss',{'train_loss': train_loss,
                                        'val_loss' : val_loss}, epoch)
        writer.add_scalars('resnet18_imagenet_t_point_8_pytorch/accuracy',{'train_top1': train_top1,
                                                  'val_top1': val_top1,
                                                  'train_top5': train_top5,
                                                  'val_top5': val_top5}, epoch)

# Both branches evaluate the stored best checkpoint, so test_top1_acc and
# test_top5_acc are always defined when the summary is printed.
resnet18_clustered = torch.load(path)
_,test_top1_acc,test_top5_acc = validate(val_loader, resnet18_clustered, criterion, args, verbose=False)

print('large top-1 {:.2f}, small top-1 {:.2f}, top-1 drop {:.2f}, large top-5 {:.2f}, small top-5 {:.2f}, top-5 drop {:.2f}'.format(large_top1,test_top1_acc,large_top1-test_top1_acc,large_top5,test_top5_acc,large_top5-test_top5_acc))

 * Acc@1 0.294 Acc@5 0.938
 * Acc@1 68.876 Acc@5 88.466
large top-1 69.88, small top-1 68.88, top-1 drop 1.00, large top-5 89.26, small top-5 88.47, top-5 drop 0.79


In [12]:
# print_model_param_nums is imported but not used in this cell.
from src.compute_flops import print_model_param_nums,print_model_param_flops

# FLOPs of the uncompressed baseline with input_res=224.
# Note: Module.cpu() moves the model itself to the CPU (it is not a copy).
print_model_param_flops(resnet18.cpu(),input_res=224)
# FLOPs of the compressed model; `.module` unwraps the DataParallel wrapper.
# This is the cell's last expression, so the number echoed after the printed
# lines is presumably this call's return value.
print_model_param_flops(resnet18_clustered.module.cpu(),input_res=224)

  + Number of FLOPs: 3.64G
  + Number of FLOPs: 2.08G


2077024562.0

### Compress using CUP (T = 0.825)

- This section compresses the resnet model that we trained

#### Load pre-trained model

In [13]:
# Reload a fresh copy of the baseline: the checkpoint stores the
# DataParallel-wrapped model, so unwrap it before clustering.
baseline_wrapped = torch.load(args.checkpoint_path)
resnet18 = baseline_wrapped.module

# validate() yields three values; the first one is not needed here.
_loss, val_top1, val_top5 = validate(val_loader, resnet18, criterion, args, verbose=False)

# NOTE(review): this stores the top-5 accuracy, while the training cells
# compare best_val_acc against top-1; the next cell resets it to 0 before use.
best_val_acc = val_top5
print('loaded model with top1 : {}, top5 : {}'.format(val_top1, val_top5))

 * Acc@1 69.876 Acc@5 89.256
loaded model with top1 : 69.8759994506836, top5 : 89.25599670410156


In [14]:
# Settings handed to cluster_model; 'distance_threshold' is the T value named
# in this section's heading.
cluster_args = {
    'cluster_layers' : {4:0,9:0,14:0,21:0,26:0,33:0,38:0,45:0},
    'conv_feature_size' : 1,
    'features' : 'both',
    'channel_reduction' : 'fro',
    'use_bias' : False,
    'reshape_exists' : False,
    'linkage_method' : 'ward',
    'distance_metric' : 'euclidean',
    'cluster_criterion' : 'hierarchical',
    'distance_threshold' : 0.825,
    'merge_criterion' : 'max_l2_norm',
    'verbose' : False
}

# Compressed-model checkpoint sits next to the baseline one (extension
# stripped, suffix appended).
path = os.path.splitext(args.checkpoint_path)[0] + '_small_cup_t_point_825.pth'

# Accuracies of the uncompressed model, measured by the preceding cell. They
# are kept under separate names because the retraining loop rebinds
# val_top1/val_top5.
large_top1, large_top5 = val_top1, val_top5

model_modifier = cluster_model(resnet18,cluster_args)
resnet18_clustered = model_modifier.cluster_model()
optimizer = torch.optim.SGD(resnet18_clustered.parameters(), args.lr,
                            momentum=args.momentum,
                            weight_decay=args.weight_decay)

resnet18_clustered = torch.nn.DataParallel(resnet18_clustered)
resnet18_clustered.cuda(args.gpu)

# Accuracy of the compressed model before any retraining.
_,top1_acc_no_retrain,top5_acc_no_retrain = validate(val_loader, resnet18_clustered, criterion, args, verbose=False)

best_val_acc = 0

# Retrain only when no compressed checkpoint exists yet.
if not os.path.isfile(path):
    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate_pytorch_retrain(optimizer, epoch, args)

        # train for one epoch
        train_loss,train_top1,train_top5 = train(train_loader, resnet18_clustered, criterion, optimizer, epoch, args)

        # evaluate on validation set
        val_loss,val_top1,val_top5 = validate(val_loader, resnet18_clustered, criterion, args)

        # Keep only the checkpoint with the best top-1 validation accuracy.
        if val_top1 > best_val_acc:
            torch.save(resnet18_clustered, path, pickle_protocol=4)
            best_val_acc = val_top1

        writer.add_scalars('resnet18_imagenet_t_point_825_pytorch/loss',{'train_loss': train_loss,
                                        'val_loss' : val_loss}, epoch)
        writer.add_scalars('resnet18_imagenet_t_point_825_pytorch/accuracy',{'train_top1': train_top1,
                                                  'val_top1': val_top1,
                                                  'train_top5': train_top5,
                                                  'val_top5': val_top5}, epoch)

# Both branches evaluate the stored best checkpoint, so test_top1_acc and
# test_top5_acc are always defined when the summary is printed.
resnet18_clustered = torch.load(path)
_,test_top1_acc,test_top5_acc = validate(val_loader, resnet18_clustered, criterion, args, verbose=False)

print('large top-1 {:.2f}, small top-1 {:.2f}, top-1 drop {:.2f}, large top-5 {:.2f}, small top-5 {:.2f}, top-5 drop {:.2f}'.format(large_top1,test_top1_acc,large_top1-test_top1_acc,large_top5,test_top5_acc,large_top5-test_top5_acc))

 * Acc@1 0.284 Acc@5 0.916
 * Acc@1 68.292 Acc@5 88.146
large top-1 69.88, small top-1 68.29, top-1 drop 1.58, large top-5 89.26, small top-5 88.15, top-5 drop 1.11


In [15]:
# print_model_param_nums is imported but not used in this cell.
from src.compute_flops import print_model_param_nums,print_model_param_flops

# FLOPs of the uncompressed baseline with input_res=224.
# Note: Module.cpu() moves the model itself to the CPU (it is not a copy).
print_model_param_flops(resnet18.cpu(),input_res=224)
# FLOPs of the compressed model; `.module` unwraps the DataParallel wrapper.
# This is the cell's last expression, so the number echoed after the printed
# lines is presumably this call's return value.
print_model_param_flops(resnet18_clustered.module.cpu(),input_res=224)

  + Number of FLOPs: 3.64G
  + Number of FLOPs: 1.97G


1970388704.0

### Try pruning alongside training the model (CUP-SS)

#### Summary

1. K  = 0.03, b = 0.3 : small top-1 67.38,  small top-5 87.86
  
  + Number of FLOPs: 3.64G
  + Number of FLOPs: 2.09G

2. K  = 0.03, b = 0.4 : small top-1 66.86,  small top-5 87.37
  
  + Number of FLOPs: 3.64G
  + Number of FLOPs: 1.92G
    
3. K  = 0.03, b = 0.5 : small top-1 67.24,  small top-5 87.59
  
  + Number of FLOPs: 3.64G
  + Number of FLOPs: 1.99G

#### K  = 0.03, b = 0.3

In [4]:
from src.compute_flops import print_model_param_nums,print_model_param_flops

# Re-seed so the freshly initialised network is reproducible.
set_random_seed(args.seed)

# CUP-SS starts from an untrained ResNet-18; `resnet18` keeps a handle on the
# bare model while `resnet18_clustered` is its DataParallel wrapper on the GPU.
resnet18 = torchvision.models.resnet18(pretrained=False)
resnet18_clustered = torch.nn.DataParallel(resnet18).cuda(args.gpu)

# Checkpoint for this experiment, stored next to the baseline checkpoint.
path = args.checkpoint_path[:-4] + '_lr_point1_cupSS_K_point03_b_point3.pth'

# Clustering settings. 'distance_threshold' is only a starting value: the
# training cell overwrites it from T_values at the start of pruning epochs.
cluster_args = dict(
    cluster_layers=dict.fromkeys([4, 9, 14, 21, 26, 33, 38, 45], 0),
    conv_feature_size=1,
    features='both',
    channel_reduction='fro',
    use_bias=False,
    reshape_exists=False,
    linkage_method='ward',
    distance_metric='euclidean',
    cluster_criterion='hierarchical',
    distance_threshold=0.825,
    merge_criterion='max_l2_norm',
    verbose=False,
)

# Linear threshold schedule: T(epoch) = slope * epoch + b for every epoch from
# 0 up to and including args.epochs.
slope = 0.03
b = 0.3
T_values = {}
for epoch in range(args.epochs + 1):
    T_values[epoch] = slope * epoch + b

In [5]:
args.epochs = 90
args.lr = 0.1
best_val_acc = 0
# True while the model is still re-clustered (pruned) at the start of an epoch.
flag = True
# Pruning stops once the model needs at most this many GFLOPs.
target_gflops = 2.12


# Train only when no checkpoint for this experiment exists yet.
if not os.path.isfile(path):
    # Create the checkpoint folder up front so the first torch.save() cannot
    # fail on a missing directory after an epoch of training.
    checkpoint_dir = os.path.dirname(path)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    # NOTE(review): epochs are 1-based here, whereas the other training cells
    # pass 0-based epochs to adjust_learning_rate_pytorch_retrain -- confirm
    # the one-epoch offset in the schedule is intended.
    for epoch in range(1, args.epochs+1):
        if epoch in T_values and flag:
            print('changing T value to {}'.format(T_values[epoch]))
            cluster_args['distance_threshold'] = T_values[epoch]
            model_modifier = cluster_model(resnet18_clustered.module,cluster_args)
            resnet18_clustered = model_modifier.cluster_model()
            resnet18_clustered = torch.nn.DataParallel(resnet18_clustered)
            resnet18_clustered.cuda(args.gpu)

            # The model was rebuilt, so the optimizer must be recreated to
            # track the new parameters.
            optimizer = torch.optim.SGD(resnet18_clustered.parameters(), args.lr,
                                        momentum=args.momentum,
                                        weight_decay=args.weight_decay)
            T = T_values[epoch]
            # Count FLOPs on a CPU copy so the training model stays on the GPU.
            flops = print_model_param_flops(copy.deepcopy(resnet18_clustered.module).cpu(),input_res=224)
            if (flops/1e9 <= target_gflops):
                print('stop filter pruning')
                flag = False


        adjust_learning_rate_pytorch_retrain(optimizer, epoch, args)

        # train for one epoch
        train_loss,train_top1,train_top5 = train(train_loader, resnet18_clustered, criterion, optimizer, epoch, args)

        # evaluate on validation set
        val_loss,val_top1,val_top5 = validate(val_loader, resnet18_clustered, criterion, args)

        # Keep only the checkpoint with the best top-1 validation accuracy.
        if val_top1 > best_val_acc:
            torch.save(resnet18_clustered, path, pickle_protocol=4)
            best_val_acc = val_top1

        writer.add_scalars('resnet18_imagenet_lr_point1_k_point_03_b_point3/loss',{'train_loss': train_loss,
                                        'val_loss' : val_loss}, epoch)
        writer.add_scalars('resnet18_imagenet_lr_point1_k_point_03_b_point3/accuracy',{'train_top1': train_top1,
                                                  'val_top1': val_top1,
                                                  'train_top5': train_top5,
                                                  'val_top5': val_top5}, epoch)

        # flops and T keep their last values once pruning has stopped.
        writer.add_scalars('resnet18_imagenet_lr_point1_k_point_03_b_point3/flops',{'flops': flops}, epoch)
        writer.add_scalars('resnet18_imagenet_lr_point1_k_point_03_b_point3/T',{'T': T}, epoch)

# Both branches finish by loading the stored best checkpoint (unwrapped from
# DataParallel) and evaluating it, so test_top1_acc and test_top5_acc are
# always defined and resnet18_clustered is in the same state either way.
resnet18_clustered = torch.load(path).module
test_loss,test_top1_acc,test_top5_acc = validate(val_loader, resnet18_clustered, criterion, args, verbose=False)

print('small top-1 {:.2f},  small top-5 {:.2f}'.format(test_top1_acc,test_top5_acc))



 * Acc@1 67.384 Acc@5 87.864
small top-1 67.38,  small top-5 87.86


In [6]:
# print_model_param_nums is imported but not used in this cell.
from src.compute_flops import print_model_param_nums,print_model_param_flops

# FLOPs of the bare ResNet-18 created in the CUP-SS setup cell, input_res=224.
# Note: Module.cpu() moves the model itself to the CPU (it is not a copy).
print_model_param_flops(resnet18.cpu(),input_res=224)
# FLOPs of the compressed model. No `.module` here: the checkpoint-loading
# branch of the previous cell already unwrapped the DataParallel wrapper.
# This is the cell's last expression, so the number echoed after the printed
# lines is presumably this call's return value.
print_model_param_flops(resnet18_clustered.cpu(),input_res=224)

  + Number of FLOPs: 3.64G
  + Number of FLOPs: 2.09G


2088542600.0