## Compressing a ResNet-34 trained on ImageNet

#### To replicate the results with the pretrained models, download the following checkpoints into `./checkpoints/` (the directory of the default `--checkpoint_path`)

1. resnet34_imagenet_pytorch.pth
2. resnet34_imagenet_pytorch_small_cup_t_point_60.pth
3. resnet34_imagenet_pytorch_small_cup_t_point_65.pth
4. resnet34_imagenet_pytorch_lr_point1_cupSS_K_point03_b_point_4.pth

In [2]:
import sys; sys.argv=['']; 
sys.path.insert(0, '../')
del sys
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
%matplotlib inline

import os
os.environ["CUDA_VISIBLE_DEVICES"] = '3,4,5'

import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torch.utils.model_zoo as model_zoo
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torchvision import datasets, transforms
from tensorboardX import SummaryWriter

import numpy as np
import random
import time
import copy

# os.environ["CUDA_VISIBLE_DEVICES"] = '1'

from src.imagenet_utils import train,validate,save_checkpoint,AverageMeter,ProgressMeter
from src.imagenet_utils import adjust_learning_rate,accuracy,adjust_learning_rate_pytorch_retrain
from src.utils import plot_tsne,fancy_dendrogram,save_obj,load_obj
from src.model import VGG,load_model
from src.prune_model import prune_model
from src.cluster_model import cluster_model
from src.train_test import adjust_learning_rate_nips,adjust_learning_rate_iccv

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Specify imagenet data path

In [14]:
# Notebook-wide configuration is expressed as an argparse parser; the cells
# that follow register more options on it before parsing.
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')

# Dataset root: 'train' and 'val' sub-directories are joined onto this path
# later in the notebook. Edit the default to match your machine.
data_option = dict(
    type=str,
    default='/localscratch/mgh/Rahul_Imagenet/',
    metavar='S',
    help='path to dataset',
)
parser.add_argument('--data', **data_option)

_StoreAction(option_strings=['--data'], dest='data', nargs=None, const=None, default='/localscratch/mgh/Rahul_Imagenet/', type=<class 'str'>, choices=None, help='path to dataset', metavar='S')

In [3]:

# Register the remaining training / distributed / logging options on the
# parser created in the previous cell, then parse them into `args`.
# Help texts below state the defaults that are actually configured.
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=256, type=int,
                    metavar='N',
                    help='mini-batch size (default: 256), this is the total '
                         'batch size of all GPUs on the current node when '
                         'using Data Parallel or Distributed Data Parallel')
parser.add_argument('-j', '--workers', default=6, type=int, metavar='N',
                    help='number of data loading workers (default: 6)')
parser.add_argument('--epochs', default=90, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--lr', '--learning-rate', default=0.01, type=float,
                    metavar='LR', help='initial learning rate')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)')
parser.add_argument('-p', '--print-freq', default=500, type=int,
                    metavar='N', help='print frequency (default: 500)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                    help='use pre-trained model')
parser.add_argument('--world-size', default=-1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA training')
parser.add_argument('--seed', type=int, default=12346, metavar='S',
                    help='random seed (default: 12346)')
parser.add_argument('--num_output', type=int, default=10, metavar='S',
                    help='number of classes(default: 10)')
parser.add_argument('--log-interval', type=int, default=100, metavar='N',
                    help='how many batches to wait before logging training status')
parser.add_argument('--checkpoint_path', type=str, default='./checkpoints/resnet34_imagenet_pytorch.pth', metavar='S',
                    help='path to store model training checkpoints')
# NOTE(review): with nargs='+' a value given on the command line becomes a
# list, while the default is the int 0 and the rest of the notebook passes
# args.gpu to .cuda() / torch.cuda.set_device() -- confirm the intended type
# before supplying --gpu explicitly.
parser.add_argument('--gpu', type=int, default=0, nargs='+', help='used gpu')
# Adjacent string literals are concatenated, so the full help sentence can be
# kept without the stray commas that forced the original tail to be commented out.
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                         'N processes per node, which has N GPUs. This is the '
                         'fastest way to use PyTorch for either single node or '
                         'multi node data parallel training')

# The import cell resets sys.argv to [''], so in the notebook this call sees
# no command-line options and every option takes its default.
args = parser.parse_args()

# Device selection. NOTE(review): `use_cuda` / `device` are computed here, but
# the cells visible in this notebook move modules with .cuda(args.gpu) instead.
use_cuda = False if args.no_cuda else torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")

# Dataset locations: <data>/train and <data>/val.
traindir = os.path.join(args.data, 'train')
valdir = os.path.join(args.data, 'val')

# Normalisation shared by the training and validation pipelines.
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]
normalize = transforms.Normalize(mean=channel_mean, std=channel_std)

# Training images: random resized crop to 224 plus random horizontal flip.
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize,
])
train_dataset = datasets.ImageFolder(traindir, train_transform)

# No custom sampler is used, so the loader shuffles by itself.
train_sampler = None
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=args.batch_size,
    shuffle=(train_sampler is None),
    sampler=train_sampler,
    num_workers=args.workers,
    pin_memory=True,
)

# Validation images: resize to 256, centre crop to 224, fixed order.
val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    normalize,
])
val_dataset = datasets.ImageFolder(valdir, val_transform)
val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=args.batch_size,
    shuffle=False,
    num_workers=args.workers,
    pin_memory=True,
)

# Loss used for every training / evaluation run in the notebook.
criterion = nn.CrossEntropyLoss()
criterion = criterion.cuda(args.gpu)

# TensorBoard event files for all experiments are written under this folder.
writer = SummaryWriter('logs/resnet34_imagenet/')

# Set all seeds for reproducibility.
def set_random_seed(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, NumPy, and PyTorch (CPU and all CUDA
    devices) with the same value, exports it as ``PYTHONHASHSEED`` and asks
    cuDNN to use deterministic algorithms.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Use the function argument (not a global) so the helper honours the
    # value it is called with and also works outside the notebook.
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # PYTHONHASHSEED is read at interpreter start-up, so this only affects
    # Python processes launched after this point, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True
# Seed all generators once, using the --seed option, before any model is built.
set_random_seed(args.seed)

# Select args.gpu as the current CUDA device for this process.
torch.cuda.set_device(args.gpu)

### Train baseline model

In [4]:
# Train the uncompressed ResNet-34 baseline, or reuse an existing checkpoint.
set_random_seed(args.seed)

# The baseline starts from a learning rate of 0.1, overriding the parser
# default of 0.01.
args.lr = 0.1

resnet34 = torchvision.models.resnet34(pretrained=False)
resnet34 = torch.nn.DataParallel(resnet34)
resnet34.cuda(args.gpu)

# Build the optimizer only after the model has been moved to the GPU, as the
# torch.optim documentation recommends.
optimizer = torch.optim.SGD(resnet34.parameters(), args.lr,
                            momentum=args.momentum,
                            weight_decay=args.weight_decay)

best_val_acc = 0

if not os.path.isfile(args.checkpoint_path):
    # Create the checkpoint directory up front so the first torch.save cannot
    # fail after a full epoch of training.
    os.makedirs(os.path.dirname(os.path.abspath(args.checkpoint_path)), exist_ok=True)

    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate_pytorch_retrain(optimizer, epoch, args)

        # train for one epoch
        train_loss, train_top1, train_top5 = train(train_loader, resnet34, criterion, optimizer, epoch, args)

        # evaluate on validation set
        val_loss, val_top1, val_top5 = validate(val_loader, resnet34, criterion, args)

        # Keep only the checkpoint with the best top-1 validation accuracy.
        if val_top1 > best_val_acc:
            torch.save(resnet34, args.checkpoint_path, pickle_protocol=4)
            best_val_acc = val_top1

        writer.add_scalars('resnet34_imagenet_pytorch_schedule/loss',
                           {'train_loss': train_loss,
                            'val_loss': val_loss}, epoch)
        writer.add_scalars('resnet34_imagenet_pytorch_schedule/accuracy',
                           {'train_top1': train_top1,
                            'val_top1': val_top1,
                            'train_top5': train_top5,
                            'val_top5': val_top5}, epoch)

# Whether the checkpoint was just trained or already existed, reload it so the
# model in memory and the numbers printed below describe the saved (best)
# model rather than the last training epoch.
# NOTE: torch.load unpickles a whole model object; only open checkpoint files
# that come from a trusted source.
resnet34 = torch.load(args.checkpoint_path)
_, val_top1, val_top5 = validate(val_loader, resnet34, criterion, args, verbose=False)

print('loaded model with top1 : {}, top5 : {}'.format(val_top1, val_top5))

 * Acc@1 73.590 Acc@5 91.440
loaded model with top1 : 73.58999633789062, top5 : 91.43999481201172


### CUP Compression summary

1. T = 0.60 : large top-1 73.59, small top-1 72.73, top-1 drop 0.86, large top-5 91.44, small top-5 90.91, top-5 drop 0.53

    - Number of FLOPs (original model): 7.34G
    - Number of FLOPs (compressed model): 4.12G
    
2. T = 0.65 : large top-1 73.59, small top-1 71.99, top-1 drop 1.60, large top-5 91.44, small top-5 90.47, top-5 drop 0.97

    + Number of FLOPs (original model): 7.34G
    + Number of FLOPs (compressed model): 3.53G

3. T = 0.675 : large top-1 73.59, small top-1 71.65, top-1 drop 1.94, large top-5 91.44, small top-5 90.21, top-5 drop 1.23

    + Number of FLOPs (original model): 7.34G
    + Number of FLOPs (compressed model): 3.20G
    
4. T = 0.70 : large top-1 73.59, small top-1 71.15, top-1 drop 2.44, large top-5 91.44, small top-5 90.08, top-5 drop 1.36

    + Number of FLOPs (original model): 7.34G
    + Number of FLOPs (compressed model): 2.88G
    

### Compress using CUP (T = 0.60)

- This section compresses the resnet model that we trained

#### Load pre-trained model

In [5]:
# Reload the baseline checkpoint. It was saved as a DataParallel wrapper, so
# only the wrapped network (.module) is kept for clustering.
# NOTE: torch.load unpickles a whole model object; only open trusted files.
baseline_wrapper = torch.load(args.checkpoint_path)
resnet34 = baseline_wrapper.module

# Accuracy of the uncompressed model, used as the reference for the drop
# reported after compression.
_, val_top1, val_top5 = validate(val_loader, resnet34, criterion, args, verbose=False)

# NOTE(review): the next cell resets best_val_acc to 0 before it is read.
best_val_acc = val_top5

baseline_report = 'loaded model with top1 : {}, top5 : {}'
print(baseline_report.format(val_top1, val_top5))

 * Acc@1 73.590 Acc@5 91.440
loaded model with top1 : 73.58999633789062, top5 : 91.43999481201172


In [6]:
# Options handed to src.cluster_model.cluster_model. The CUP experiments in
# this notebook differ only in 'distance_threshold'.
cluster_args = {
    'cluster_layers' : {4:0,9:0,14:0,19:0,26:0,31:0,36:0,41:0,48:0,53:0,58:0,63:0,68:0,73:0,80:0,85:0},
    'conv_feature_size' : 1,
    'features' : 'both',
    'channel_reduction' : 'fro',
    'use_bias' : False,
    'reshape_exists' : False,
    'linkage_method' : 'ward',
    'distance_metric' : 'euclidean',
    'cluster_criterion' : 'hierarchical',
    'distance_threshold' : 0.60,
    'merge_criterion' : 'max_l2_norm',
    'verbose' : False
}

# Checkpoint of the compressed model: baseline path without its extension
# plus an experiment-specific suffix.
path = os.path.splitext(args.checkpoint_path)[0] + '_small_cup_t_point_60.pth'

# Remember the uncompressed model's accuracy (measured by the previous cell)
# so the retraining loop below cannot overwrite it before the final report.
large_top1, large_top5 = val_top1, val_top5

model_modifier = cluster_model(resnet34, cluster_args)
resnet34_clustered = model_modifier.cluster_model()

resnet34_clustered = torch.nn.DataParallel(resnet34_clustered)
resnet34_clustered.cuda(args.gpu)

# Build the optimizer only after the model has been moved to the GPU, as the
# torch.optim documentation recommends.
optimizer = torch.optim.SGD(resnet34_clustered.parameters(), args.lr,
                            momentum=args.momentum,
                            weight_decay=args.weight_decay)

# Accuracy right after clustering, before any retraining.
_, top1_acc_no_retrain, top5_acc_no_retrain = validate(val_loader, resnet34_clustered, criterion, args, verbose=False)

best_val_acc = 0

if not os.path.isfile(path):
    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate_pytorch_retrain(optimizer, epoch, args)

        # train for one epoch
        train_loss, train_top1, train_top5 = train(train_loader, resnet34_clustered, criterion, optimizer, epoch, args)

        # evaluate on validation set
        epoch_val_loss, epoch_val_top1, epoch_val_top5 = validate(val_loader, resnet34_clustered, criterion, args)

        # Keep only the checkpoint with the best top-1 validation accuracy.
        if epoch_val_top1 > best_val_acc:
            torch.save(resnet34_clustered, path, pickle_protocol=4)
            best_val_acc = epoch_val_top1

        writer.add_scalars('resnet34_imagenet_t_point_60_pytorch/loss',
                           {'train_loss': train_loss,
                            'val_loss': epoch_val_loss}, epoch)
        writer.add_scalars('resnet34_imagenet_t_point_60_pytorch/accuracy',
                           {'train_top1': train_top1,
                            'val_top1': epoch_val_top1,
                            'train_top5': train_top5,
                            'val_top5': epoch_val_top5}, epoch)

# Evaluate the saved (best) compressed model on both paths. Previously the
# training path never assigned test_top1_acc / test_top5_acc, so the report
# below raised NameError after retraining.
# NOTE: torch.load unpickles a whole model object; only open trusted files.
resnet34_clustered = torch.load(path)
_, test_top1_acc, test_top5_acc = validate(val_loader, resnet34_clustered, criterion, args, verbose=False)

print('large top-1 {:.2f}, small top-1 {:.2f}, top-1 drop {:.2f}, large top-5 {:.2f}, small top-5 {:.2f}, top-5 drop {:.2f}'.format(large_top1,test_top1_acc,large_top1-test_top1_acc,large_top5,test_top5_acc,large_top5-test_top5_acc))

 * Acc@1 0.616 Acc@5 2.580
 * Acc@1 72.730 Acc@5 90.910
large top-1 73.59, small top-1 72.73, top-1 drop 0.86, large top-5 91.44, small top-5 90.91, top-5 drop 0.53


In [7]:
from src.compute_flops import (
    print_model_param_nums,
    print_model_param_flops,
)

# FLOPs are counted on the CPU for a 224-pixel input. Module.cpu() moves the
# model itself, so both networks live on the CPU after this cell.
baseline_net = resnet34.cpu()
print_model_param_flops(baseline_net, input_res=224)

# The compressed model is still wrapped in DataParallel; measure the inner module.
compressed_net = resnet34_clustered.module.cpu()
print_model_param_flops(compressed_net, input_res=224)

  + Number of FLOPs: 7.34G
  + Number of FLOPs: 4.12G


4117474673.0

### Compress using CUP (T = 0.65)

- This section compresses the resnet model that we trained

#### Load pre-trained model

In [8]:
# Start again from the saved baseline. The checkpoint holds a DataParallel
# wrapper, so take the wrapped network (.module) for clustering.
# NOTE: torch.load unpickles a whole model object; only open trusted files.
saved_baseline = torch.load(args.checkpoint_path)
resnet34 = saved_baseline.module

# Reference accuracy of the uncompressed model for the report after compression.
_, val_top1, val_top5 = validate(val_loader, resnet34, criterion, args, verbose=False)

# NOTE(review): the next cell resets best_val_acc to 0 before it is read.
best_val_acc = val_top5

loaded_message = 'loaded model with top1 : {}, top5 : {}'
print(loaded_message.format(val_top1, val_top5))

 * Acc@1 73.590 Acc@5 91.440
loaded model with top1 : 73.58999633789062, top5 : 91.43999481201172


In [9]:
# Options handed to src.cluster_model.cluster_model. The CUP experiments in
# this notebook differ only in 'distance_threshold'.
cluster_args = {
    'cluster_layers' : {4:0,9:0,14:0,19:0,26:0,31:0,36:0,41:0,48:0,53:0,58:0,63:0,68:0,73:0,80:0,85:0},
    'conv_feature_size' : 1,
    'features' : 'both',
    'channel_reduction' : 'fro',
    'use_bias' : False,
    'reshape_exists' : False,
    'linkage_method' : 'ward',
    'distance_metric' : 'euclidean',
    'cluster_criterion' : 'hierarchical',
    'distance_threshold' : 0.65,
    'merge_criterion' : 'max_l2_norm',
    'verbose' : False
}

# Checkpoint of the compressed model: baseline path without its extension
# plus an experiment-specific suffix.
path = os.path.splitext(args.checkpoint_path)[0] + '_small_cup_t_point_65.pth'

# Remember the uncompressed model's accuracy (measured by the previous cell)
# so the retraining loop below cannot overwrite it before the final report.
large_top1, large_top5 = val_top1, val_top5

model_modifier = cluster_model(resnet34, cluster_args)
resnet34_clustered = model_modifier.cluster_model()

resnet34_clustered = torch.nn.DataParallel(resnet34_clustered)
resnet34_clustered.cuda(args.gpu)

# Build the optimizer only after the model has been moved to the GPU, as the
# torch.optim documentation recommends.
optimizer = torch.optim.SGD(resnet34_clustered.parameters(), args.lr,
                            momentum=args.momentum,
                            weight_decay=args.weight_decay)

# Accuracy right after clustering, before any retraining.
_, top1_acc_no_retrain, top5_acc_no_retrain = validate(val_loader, resnet34_clustered, criterion, args, verbose=False)

best_val_acc = 0

if not os.path.isfile(path):
    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate_pytorch_retrain(optimizer, epoch, args)

        # train for one epoch
        train_loss, train_top1, train_top5 = train(train_loader, resnet34_clustered, criterion, optimizer, epoch, args)

        # evaluate on validation set
        epoch_val_loss, epoch_val_top1, epoch_val_top5 = validate(val_loader, resnet34_clustered, criterion, args)

        # Keep only the checkpoint with the best top-1 validation accuracy.
        if epoch_val_top1 > best_val_acc:
            torch.save(resnet34_clustered, path, pickle_protocol=4)
            best_val_acc = epoch_val_top1

        writer.add_scalars('resnet34_imagenet_t_point_65_pytorch/loss',
                           {'train_loss': train_loss,
                            'val_loss': epoch_val_loss}, epoch)
        writer.add_scalars('resnet34_imagenet_t_point_65_pytorch/accuracy',
                           {'train_top1': train_top1,
                            'val_top1': epoch_val_top1,
                            'train_top5': train_top5,
                            'val_top5': epoch_val_top5}, epoch)

# Evaluate the saved (best) compressed model on both paths. Previously the
# training path never assigned test_top1_acc / test_top5_acc, so the report
# below raised NameError after retraining.
# NOTE: torch.load unpickles a whole model object; only open trusted files.
resnet34_clustered = torch.load(path)
_, test_top1_acc, test_top5_acc = validate(val_loader, resnet34_clustered, criterion, args, verbose=False)

print('large top-1 {:.2f}, small top-1 {:.2f}, top-1 drop {:.2f}, large top-5 {:.2f}, small top-5 {:.2f}, top-5 drop {:.2f}'.format(large_top1,test_top1_acc,large_top1-test_top1_acc,large_top5,test_top5_acc,large_top5-test_top5_acc))

 * Acc@1 0.298 Acc@5 2.016
 * Acc@1 71.992 Acc@5 90.472
large top-1 73.59, small top-1 71.99, top-1 drop 1.60, large top-5 91.44, small top-5 90.47, top-5 drop 0.97


In [10]:
from src.compute_flops import (
    print_model_param_nums,
    print_model_param_flops,
)

# Count FLOPs on the CPU for a 224-pixel input. Module.cpu() moves the model
# itself, so both networks are on the CPU once this cell has run.
original_net = resnet34.cpu()
print_model_param_flops(original_net, input_res=224)

# Unwrap the DataParallel container before measuring the compressed network.
clustered_net = resnet34_clustered.module.cpu()
print_model_param_flops(clustered_net, input_res=224)

  + Number of FLOPs: 7.34G
  + Number of FLOPs: 3.53G


3534334199.0

### Try pruning alongside training the model (CUP-SS)

### Summary

1. K=0.03, b = 0.3 : small top-1 71.98, small top-5 90.42

  + Number of FLOPs (original model): 7.34G
  + Number of FLOPs (compressed model): 4.30G


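2. K=0.03, b = 0.4 : small top-1 71.69, small top-5 90.28

  + Number of FLOPs (original model): 7.34G
  + Number of FLOPs (compressed model): 4.27G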

#### K=0.03, b = 0.4

In [11]:
from src.compute_flops import (
    print_model_param_nums,
    print_model_param_flops,
)

# CUP-SS starts from a freshly initialised (untrained) ResNet-34.
set_random_seed(args.seed)

resnet34 = torchvision.models.resnet34(pretrained=False)
resnet34_clustered = torch.nn.DataParallel(resnet34)
resnet34_clustered.cuda(args.gpu)

# Same clustering options as the CUP cells. 'distance_threshold' is only a
# starting value: the training cell overwrites it from T_values while pruning
# is active.
cluster_layer_keys = [4, 9, 14, 19, 26, 31, 36, 41, 48, 53, 58, 63, 68, 73, 80, 85]
cluster_args = dict(
    cluster_layers=dict.fromkeys(cluster_layer_keys, 0),
    conv_feature_size=1,
    features='both',
    channel_reduction='fro',
    use_bias=False,
    reshape_exists=False,
    linkage_method='ward',
    distance_metric='euclidean',
    cluster_criterion='hierarchical',
    distance_threshold=0.70,
    merge_criterion='max_l2_norm',
    verbose=False,
)

# Checkpoint for this experiment: baseline path minus its 4-character
# extension, plus an experiment-specific suffix.
checkpoint_stem = args.checkpoint_path[:-4]
path = checkpoint_stem + '_lr_point1_cupSS_K_point03_b_point_4.pth'

# Linear threshold schedule T(epoch) = slope * epoch + b, tabulated for
# epochs 0..args.epochs.
slope, b = 0.03, 0.4
T_values = {}
for epoch in range(args.epochs + 1):
    T_values[epoch] = slope * (epoch) + b

In [12]:
# CUP-SS: prune by clustering at the start of each epoch while training from
# scratch, until the FLOPs budget is met; then keep training the pruned model.
# NOTE(review): T_values was tabulated in the previous cell from the value
# args.epochs had at that time; epochs missing from the table are not pruned.
args.epochs = 90
args.lr = 0.1
best_val_acc = 0
flag = True  # True while pruning is still active (FLOPs budget not yet reached)


if not os.path.isfile(path):
    for epoch in range(1, args.epochs+1):
        if flag and epoch in T_values:
            print('changing T value to {}'.format(T_values[epoch]))
            cluster_args['distance_threshold'] = T_values[epoch]
            model_modifier = cluster_model(resnet34_clustered.module,cluster_args)
            resnet34_clustered = model_modifier.cluster_model()
            resnet34_clustered = torch.nn.DataParallel(resnet34_clustered)
            resnet34_clustered.cuda(args.gpu)

            # A new optimizer is built for the model returned by clustering
            # (presumably its parameters are new tensors -- confirm against
            # src.cluster_model); momentum therefore restarts after each step.
            optimizer = torch.optim.SGD(resnet34_clustered.parameters(), args.lr,
                                        momentum=args.momentum,
                                        weight_decay=args.weight_decay)
            T = T_values[epoch]
            # Measure FLOPs on a CPU copy so the training model stays on the GPU.
            flops = print_model_param_flops(copy.deepcopy(resnet34_clustered.module).cpu(),input_res=224)
            # Stop pruning once the model is at or below 4.34 GFLOPs.
            if (flops/1e9 <= 4.34):
                print('stop filter pruning')
                flag = False


        adjust_learning_rate_pytorch_retrain(optimizer, epoch, args)

        # train for one epoch
        train_loss,train_top1,train_top5 = train(train_loader, resnet34_clustered, criterion, optimizer, epoch, args)

        # evaluate on validation set
        val_loss,val_top1,val_top5 = validate(val_loader, resnet34_clustered, criterion, args)

        # Keep only the checkpoint with the best top-1 validation accuracy.
        if val_top1 > best_val_acc:
            torch.save(resnet34_clustered, path, pickle_protocol=4)
            best_val_acc = val_top1

        writer.add_scalars('resnet34_imagenet_lr_point1_k_point_3_b_point4/loss',{'train_loss': train_loss,
                                        'val_loss' : val_loss}, epoch)
        writer.add_scalars('resnet34_imagenet_lr_point1_k_point_3_b_point4/accuracy',{'train_top1': train_top1,
                                                  'val_top1': val_top1,
                                                  'train_top5': train_top5,
                                                  'val_top5': val_top5}, epoch)

        # flops / T hold the values from the most recent clustering step.
        writer.add_scalars('resnet34_imagenet_lr_point1_k_point_3_b_point4/flops',{'flops': flops}, epoch)
        writer.add_scalars('resnet34_imagenet_lr_point1_k_point_3_b_point4/T',{'T': T}, epoch)

# Reload the saved (best) checkpoint on both paths. This leaves
# resnet34_clustered as the unwrapped module whether or not training just ran
# (the FLOPs cell that follows calls .cpu() on it directly), and makes the
# reported accuracy describe the saved model rather than the last epoch.
# NOTE: torch.load unpickles a whole model object; only open trusted files.
resnet34_clustered = torch.load(path).module
val_loss,val_top1,val_top5 = validate(val_loader, resnet34_clustered, criterion, args, verbose=False)

print('small top-1 {:.2f}, small top-5 {:.2f}'.format(val_top1,val_top5))

 * Acc@1 71.688 Acc@5 90.284
small top-1 71.69, small top-5 90.28


In [13]:
from src.compute_flops import (
    print_model_param_nums,
    print_model_param_flops,
)

# Count FLOPs on the CPU for a 224-pixel input. Module.cpu() moves the model
# itself, so both networks are on the CPU once this cell has run.
reference_net = resnet34.cpu()
print_model_param_flops(reference_net, input_res=224)

# Here resnet34_clustered is used directly, without .module.
pruned_net = resnet34_clustered.cpu()
print_model_param_flops(pruned_net, input_res=224)

  + Number of FLOPs: 7.34G
  + Number of FLOPs: 4.27G


4273916483.0