In [1]:
import torch
import torch.utils.data as torchdata
import torch.nn as nn
import torch.nn.functional as F
import tqdm
from timeit import default_timer as time
import torch.optim as optim
from torch.distributions import Bernoulli
import utils
import torch.backends.cudnn as cudnn
cudnn.benchmark = True

# All variables 

In [2]:
model = "R110_C10"
parallel = False #set true if in notebook with multicore GPU
alpha = 0.8
beta = 1e-1
lr = 1e-4
penalty = -1
batch_size = 256 #recommended: 2048
max_epochs_training = 100 #original blockdrop: 10000
max_epochs_finetuning = 20 #original blockdrop: 2000
wd = 0.0
cl_step = 1
cv_dir_training = 'cv/trained_policy/'+ model 
cv_dir_finetuning = 'cv/finetuned/'+ model 
start_epoch = 0

load = None
data_dir = 'data/'
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)
    torch.cuda.manual_seed_all(42)

# Helper Functions

## Reward calculation
Reward calculation is done per batch (2048 images). All values are calculated within the tensor. However, the comments present the calculation for one image

Inputs: preds (predicted class), targets (real class labels), policy()

Really per complete batch!!!!!!!!!!!! len(match) = 2048

In [3]:
def get_reward(preds, targets, policy):
 
    block_use = policy.sum(1).float()/policy.size(1)   # no. of blocks used / all blocks for complete batch
    sparse_reward = 1.0-block_use**2     # reward multiplicator -> the less blocks used, the higher the multiplicator

    _, pred_idx = preds.max(1) #get predicted classes 
    match = (pred_idx==targets).data #get matrix of all correct predicted images
    reward = sparse_reward
    reward[torch.logical_not(match)] = penalty #all not correct predicted images get penalty
    reward = reward.unsqueeze(1) 
    
    #reward = performance-based reward for Policy Net; match = all correct predicted images 
    return reward, match.float()

## Testing at end of training steps:
At the end of the Policy training and finetuning, BlockDrop is tested once before saving the model for later use
1. input image in policy and calculate output (probability vector of keeping/ dropping each layer)
2. for each value of vector: if >0.5: 1 (keep), else : 0 (drop)
3. forward pass of ResNet with only layers which are kept 
4. evaluate result
5. save the model for future use

In [4]:
def test(epoch, cv_dir):

    agent.eval()

    matches, rewards, policies = [], [], []
    for batch_idx, (inputs, targets) in tqdm.tqdm(enumerate(testloader), total=len(testloader)):
        
        # 1. input image in policy and calculate output (probability vector of keeping/ dropping each layer)
        targets = targets.to(device, non_blocking=True) 

        if not parallel:
            inputs = inputs.cuda()

        probs, _ = agent(inputs)
        
        # 2. for each value of vector: if >0.5: 1 (keep), else : 0 (drop)
        policy = probs.data.clone()
        policy[policy<0.5] = 0.0
        policy[policy>=0.5] = 1.0
        
        # 3. if still in curriculum learning: first 1-x layers are 1 anyway
        if cl_step < num_blocks:
            policy[:, :-cl_step] = 1
        
        # 4. forward pass of ResNet with only layers which are kept
        preds = rnet.forward(inputs, policy)
        
        # preparation of evaluation  
        reward, match = get_reward(preds, targets, policy.data)

        matches.append(match)
        rewards.append(reward)
        policies.append(policy.data)

    # 5. evaluate result
    accuracy, reward, sparsity, variance, policy_set = utils.performance_stats(policies, rewards, matches)

    log_str = 'TS - Accuracy: %.3f | Reward: %.2E | Sparsity: %.3f | Variance: %.3f | #: %d'%(accuracy, reward, sparsity, variance, len(policy_set))
    print(log_str)
    
    # 6. save the model for future use
    agent_state_dict = agent.module.state_dict() if parallel else agent.state_dict()

    state = {
      'agent': agent_state_dict,
      'epoch': epoch,
      'reward': reward,
      'acc': accuracy
    }
    torch.save(state, cv_dir+'/ckpt_E_%d.t7'%(epoch))
    pretrained = cv_dir+'/ckpt_E_%d.t7'%(epoch)
    print("Model saved: ", pretrained)

# Policy Training

## Training script
For each batch:
1. run model on input data to get current stand of policy
2. for each value of vector: if >0.5: 1 (keep), else : 0 (drop)
3. create probs_new (= a new random sample) for the dropping strategy --> goal: compare if it is better or worse than current Policy
4. if still in curriculum learning: in first x (= num_blocks of ResNet) epochs: first 1-x layers are set to 1 of both, probs_new and probs
5. forward pass of ResNet with only layers which are kept for both dropping strategies
6. calculate rewards for both rnets and compare them (= advantage)
7. calculate loss and backpropagate

In [5]:
def train(epoch):
   
    agent.train()

    matches, rewards, policies = [], [], []
    
    # training for one one batch
    for batch_idx, (inputs, targets) in tqdm.tqdm(enumerate(trainloader), total=len(trainloader)):
        
        #move input and target data to the device
        targets = targets.to(device, non_blocking=True) 
        if not parallel:
            inputs = inputs.to(device)
        
        #run the model on the input data to get current stand of policy: probs = result of Policy, _ not used for this model
        probs, _ = agent(inputs)

        #---------------------------------------------------------------------#
        # results of Policy Net are taken to create dropping strategy for ResNet: <0.5 drop; >=0.5: keep
        policy_map = probs.data.clone()
        policy_map[policy_map<0.5] = 0.0
        policy_map[policy_map>=0.5] = 1.0
        
        #now create a sample: new dropping recommendations modified from the first one by using the alpha value  
        # alpha: parameter to bound the distribution and prevent it from saturating (Paper, p. 8820)
        probs_new = probs*alpha + (1-probs)*(1-alpha)
        # bernoulli distribution to set get the dropping strategy (0= drop or 1= keep)
        distr = Bernoulli(probs_new)
        policy_sample = distr.sample()

        #curriculum learning: in the first x (< than blocks of Resnet) iterations only the last x (num_blocks-iteration) layers
        # of the ResNet are used for training the PolicyNet
        # = only decisions for the last x blocks are saved; first 1-x layers are kept 
        if cl_step < num_blocks:
            policy_sample[:, :-cl_step] = 1
            policy_map[:, :-cl_step] = 1
            
            #policy_mask: set a new variable where only currently trained layers are set to 1; first 1-x are set to 0
            policy_mask = torch.ones(inputs.size(0), policy_sample.size(1)).to(device)
            policy_mask[:, :-cl_step] = 0
        else:
            policy_mask = None
        
        v_inputs = inputs.data      
        # rnet with blocks recommended by current Policy
        preds_map = rnet.forward(v_inputs, policy_map)
        
        # rnet blocks recommended by the sample (modified version based on Bernoulli)
        preds_sample = rnet.forward(v_inputs, policy_sample)

        # calculate reward for results of both rnets
        reward_map, _ = get_reward(preds_map, targets, policy_map.data) # = baseline estimate (result of dropping strategy of current policy network)

        reward_sample, match = get_reward(preds_sample, targets, policy_sample.data) #= reward (result of sample) 
        
        # advantage --> calculates if sample is better than current strategy (>0) or worse (<0)
        advantage = reward_sample - reward_map
        
        # if sample is better, the probabilites of the modified gradients are increased (as current*advantage >0 else: decreased
        loss = -distr.log_prob(policy_sample)
        loss = loss * advantage.expand_as(policy_sample) #x.expand_as(y): expand x to size of y
        
        # in case of curriculum learning stage: only loss for the currently modified x last layers is saved by discarding others
        if policy_mask is not None:
            loss = policy_mask * loss 

        loss = loss.sum()

        probs_new = probs_new.clamp(1e-15, 1-1e-15) #clamp(min, max) -> size values to fit between min and max
        entropy_loss = -probs_new*torch.log(probs_new)
        entropy_loss = beta*entropy_loss.sum()

        loss = (loss - entropy_loss)/inputs.size(0)

        #---------------------------------------------------------------------#
        #1. set gradients list to zero (delete gradients of former epoch), 2.do backpropagation, 3.update parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # save results in list for performance evaluation of this epoch 
        matches.append(match.cpu())
        rewards.append(reward_sample.cpu())
        policies.append(policy_sample.data.cpu())
    
    #calculate performance metrics for this epoch
    accuracy, reward, sparsity, variance, policy_set = utils.performance_stats(policies, rewards, matches)

    result_train = 'Epoch: %d - Accuracy: %.3f | Reward: %.2E | Sparsity: %.3f | Variance: %.3f | #: %d'%(epoch, accuracy, reward, sparsity, variance, len(policy_set))

    print(result_train)

## Run Policy Training:

In [6]:
trainset, testset = utils.get_dataset(model, data_dir)
num_workers = 4 if torch.device("cuda") else 1

trainloader = torchdata.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=num_workers) #50000 images
testloader = torchdata.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=num_workers) #10000 images
rnet, agent = utils.get_model(model, device)
num_blocks = sum(rnet.layer_config)

if load is not None:
    checkpoint = torch.load(load)
    agent.load_state_dict(checkpoint['agent'])
    start_epoch = checkpoint['epoch'] + 1
    print('loaded agent from', load)

if parallel:
    agent = nn.DataParallel(agent)
    rnet = nn.DataParallel(rnet)

if torch.device("cuda"):
    rnet.eval().cuda()
    agent.cuda()
elif torch.device("cpu"):
    rnet.eval().to(device)
    agent.to(device)

optimizer = optim.Adam(agent.parameters(), lr=lr, weight_decay=wd)

start_training = timer()

for epoch in range(start_epoch, start_epoch+max_epochs_training+1):

    if cl_step < num_blocks:
        cl_step = 1 + 1 * (epoch // 1)
    else:
        cl_step = num_blocks

    print('training the last %d blocks ...' % cl_step)

    train(epoch)

    # the testing is done after last epoch
    if epoch > 0 and epoch % max_epochs_training == 0:
        pretrained = test(epoch, cv_dir_training)

end_training = timer()
training_time = end_training -start_training
print("Training in s: %.2f"%(training_time))


training the last 1 blocks ...


100%|██████████| 3125/3125 [03:18<00:00, 15.77it/s]


Epoch: 0 - Accuracy: 0.993 | Reward: 1.89E-02 | Sparsity: 53.277 | Variance: 0.448 | #: 2
training the last 2 blocks ...


100%|██████████| 3125/3125 [03:12<00:00, 16.27it/s]


Epoch: 1 - Accuracy: 0.992 | Reward: 4.59E-02 | Sparsity: 52.514 | Variance: 0.618 | #: 4


100%|██████████| 625/625 [00:18<00:00, 33.86it/s]


TS - Accuracy: 0.930 | Reward: -2.17E-03 | Sparsity: 52.000 | Variance: 0.000 | #: 1
Model saved:  cv/trained_policy/R110_C10/ckpt_E_1.t7
Training in s: 414.60


# Joint Finetuning

## Training of finetuning
same as in train.ipynb, but also training of rnet:

For each batch:
1. run model on input data to get current stand of policy
2. for each value of vector: if >0.5: 1 (keep), else : 0 (drop)
3. create probs_new (= a new random sample) for the dropping strategy --> goal: compare if it is better or worse than current Policy
4. if still in curriculum learning: in first x (= num_blocks of ResNet) epochs: first 1-x layers are set to 1 of both, probs_new and probs
5. forward pass of ResNet with only layers which are kept for both dropping strategies
6. calculate rewards for both rnets and compare them (= advantage)
7. calculate loss and backpropagate

In [7]:
def finetune(epoch):

    agent.train()
    rnet.train()

    matches, rewards, policies = [], [], []

    # training for one one batch
    for batch_idx, (inputs, targets) in tqdm.tqdm(enumerate(trainloader), total=len(trainloader)):

        #move input and target data to the device
        targets = targets.to(device, non_blocking=True) 
        if not parallel:
            inputs = inputs.to(device) 
        
        #run the model on the input data to get current stand of policy: probs = result of Policy, _ not used for this model
        probs, _ = agent(inputs)

        #---------------------------------------------------------------------#

        # results of Policy Net are taken to create dropping strategy for ResNet: <0.5 drop; >=0.5: keep
        policy_map = probs.data.clone()
        policy_map[policy_map<0.5] = 0.0
        policy_map[policy_map>=0.5] = 1.0

        #now create a sample: new dropping recommendations modified from the first one by using the alpha value  
        # alpha: parameter to bound the distribution and prevent it from saturating (Paper, p. 8820)
        probs = probs*alpha + (1-probs)*(1-alpha)
        distr = Bernoulli(probs)
        policy = distr.sample()

        v_inputs = inputs.data

        # rnet with blocks recommended by current Policy
        preds_map = rnet.forward(v_inputs, policy_map)

        # rnet blocks recommended by the sample (modified version based on Bernoulli)
        preds_sample = rnet.forward(inputs, policy)

        # calculate reward for results of both rnets
        reward_map, _ = get_reward(preds_map, targets, policy_map.data)
        reward_sample, match = get_reward(preds_sample, targets, policy.data)

        # advantage --> calculates if sample is better than current strategy (>0) or worse (<0)
        advantage = reward_sample - reward_map

        # if sample is better, the probabilites of the modified gradients are increased (as current*advantage >0 else: decreased
        loss = -distr.log_prob(policy).sum(1, keepdim=True) * advantage
        loss = loss.sum()
        loss += F.cross_entropy(preds_sample, targets)


        #---------------------------------------------------------------------#
        #1. set gradients list to zero (delete gradients of former epoch), 2.do backpropagation, 3.update parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        matches.append(match.cpu())
        rewards.append(reward_sample.cpu())
        policies.append(policy.data.cpu())

    #calculate performance metrics for this epoch
    accuracy, reward, sparsity, variance, policy_set = utils.performance_stats(policies, rewards, matches)

    result_finetune = 'Epoch: %d - Accuracy: %.3f | Reward: %.2E | Sparsity: %.3f | Variance: %.3f | #: %d'%(epoch, accuracy, reward, sparsity, variance, len(policy_set))

    print(result_finetune)

## Run Joint Finetuning:

In [9]:
rnet, agent = utils.get_model(model, device)

if pretrained is not None:
    checkpoint = torch.load(pretrained)
    key = 'net' if 'net' in checkpoint else 'agent'
    agent.load_state_dict(checkpoint[key])
    print('loaded pretrained model from', pretrained)

if load is not None:
    checkpoint = torch.load(load)
    rnet.load_state_dict(checkpoint['resnet'])
    agent.load_state_dict(checkpoint['agent'])
    start_epoch = checkpoint['epoch'] + 1
    print('loaded agent from', load)


if parallel:
    agent = nn.DataParallel(agent)
    rnet = nn.DataParallel(rnet)

rnet.to(device)
agent.to(device)

optimizer = optim.Adam(list(agent.parameters())+list(rnet.parameters()), lr=lr, weight_decay=wd)

start_finetuning = timer()
for epoch in range(start_epoch, start_epoch+max_epochs_finetuning+1):
    
    finetune(epoch)
    # the testing is done after last epoch
    if epoch > 0 and epoch % max_epochs_finetuning == 0:
        test(epoch, cv_dir_finetuning)
        
end_finetuning = timer()
finetuning_time = end_finetuning -start_finetuning
print("Finetuning in s: %.2f"%(finetuning_time))

100%|██████████| 3125/3125 [06:29<00:00,  8.02it/s]


Epoch: 0 - Accuracy: 0.754 | Reward: 2.67E-01 | Sparsity: 29.854 | Variance: 3.714 | #: 50000


100%|██████████| 3125/3125 [06:23<00:00,  8.14it/s]


Epoch: 1 - Accuracy: 0.870 | Reward: 4.63E-01 | Sparsity: 30.099 | Variance: 3.558 | #: 50000


100%|██████████| 625/625 [00:21<00:00, 29.51it/s]


TS - Accuracy: 0.891 | Reward: -5.02E-02 | Sparsity: 52.179 | Variance: 0.383 | #: 2
Model saved:  cv/finetuned/R110_C10/ckpt_E_1.t7
Finetuning in s: 800.63


# Times of complete training

In [10]:
print("Policy Training: %.2f | Finetuning: %.2f" %(training_time, finetuning_time))

Policy Training: 414.60 | Finetuning: 800.63
