# Fine-tuning an $\alpha$-pooling model

We use a custom Caffe framework that implements a SignedPowerLayer. Please make sure to clone and build it before running this notebook, and add the path to its Python bindings in the following cell.

In [None]:
%load_ext autoreload
%autoreload 2
import sys
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
sys.path.append('/home/simon/Research/lib/caffe/python')
sys.path.append('/home/simon/Research/finegrained/src/part_model_layer/part_autoencoder')
sys.path.append('/home/simon/Research/generic/src/bilinear_logm/')

import caffe
import scipy.misc
import h5py
import scipy.io
import matplotlib.pyplot as plt
import numpy as np
import glob
import time
%matplotlib inline  
import os
import matplotlib
from sklearn.metrics import confusion_matrix
import google.protobuf
import uuid
import pyprind
import random
import google.protobuf.text_format
# Select device 0 and switch caffe to GPU mode for everything that follows.
caffe.set_device(0)
caffe.set_mode_gpu()

The following cell contains most of the settings you might want to adjust.

In [None]:
# Initial value for alpha, called gamma in this file.
# The SignedPower layer of each branch is initialised with power gamma - 1.
gamma = 2.0
# Name of the last base-network layer that is copied into each branch;
# the pooling layers are attached to its output.
chop_off_layer = 'relu5_3'
# Resize images to this size before cropping for data augmentation
resize_size = 640
# Actual crop size
crop_size = 560
# The resolutions to extract alpha-pooling features from
resolutions = [224,560]
# Format string producing the name prefix of all layers/blobs of one resolution branch
prefix_template = 'res%i/'
# Number of object classes (used as the output size of the final fc8_ft layer)
# NOTE(review): 201 rather than 200 — presumably the label files are 1-based; confirm.
num_classes = 201
# Pre-trained weights that are copied into every branch
init_model = './vgg16-training/vgg16_imagenet.caffemodel'

In [None]:
# Create parameter files
# Net: parse the base network definition (text-format prototxt) into a
# NetParameter message; the following cells read and modify it.
netparams_in = caffe.proto.caffe_pb2.NetParameter()
protofile = './vgg16-training/train_val.prototxt'
# Use a context manager so the file handle is closed as soon as it is read.
with open(protofile) as proto_fh:
    google.protobuf.text_format.Merge(proto_fh.read(), netparams_in)

# Solver: empty message for now; the actual solver settings are assigned
# when the solver files are written further below.
params = caffe.proto.caffe_pb2.SolverParameter()

In [None]:
# Change to working dir
# Every run gets its own directory (random UUID suffix) below 'finetuning/'.
working_dir = 'finetuning/finetuning_%s'%(str(uuid.uuid4()))
try:
    os.makedirs(working_dir)
except OSError:
    # An already existing directory is fine; anything else (permissions,
    # read-only file system, ...) is a real error and must not be hidden.
    if not os.path.isdir(working_dir):
        raise
os.chdir(working_dir)

### Add second branch

In this section, we take a prepared prototxt and adjust it for our needs. You might want to adjust the path to the image data here.

In [None]:
# Prepare data layer
# Layers 0 and 1 of the base prototxt are configured as ImageData layers:
# layer 0 reads the training list, layer 1 the test list.
lyr = netparams_in.layer
image_root = '/home/simon/Datasets/CUB_200_2011/images/'
# (layer index, image list file, batch size)
data_layer_settings = [
    (0, '/home/simon/Datasets/CUB_200_2011/train_images.txt', 8),
    (1, '/home/simon/Datasets/CUB_200_2011/test_images.txt', 1),
]
for layer_idx, list_file, images_per_batch in data_layer_settings:
    image_param = lyr[layer_idx].image_data_param
    image_param.source = list_file
    image_param.root_folder = image_root
    image_param.batch_size = images_per_batch
    # Assignment by index: the prototxt must already contain a first
    # smaller_side_size entry. A second entry is deliberately left untouched.
    image_param.smaller_side_size[0] = resize_size
    lyr[layer_idx].transform_param.crop_size = crop_size
    lyr[layer_idx].type = 'ImageData'

In [None]:
# Start a fresh, empty NetParameter for the fine-tuning network and carry over
# the name of the base network. (No BatchNorm layer is added in this cell.)
netparams = caffe.proto.caffe_pb2.NetParameter()
netparams.name = netparams_in.name

# Collects the top blob name of the bilinear layer of every resolution branch.
bilinear_outputs = []

In [None]:
# Input layers
input_layers = [l for l in netparams_in.layer if l.type in ['ImageData', 'Data']]

# First copy every data layer of the base network unchanged.
for l in input_layers:
    netparams.layer.add().MergeFrom(l)

# Then add one DummyData layer named 'zeros' per data layer. It has shape
# (batch_size, 1) and is restricted to the same phase as that data layer.
# It serves as second bottom of the resize layers.
for l in input_layers:
    zeros_layer = netparams.layer.add()
    zeros_layer.name = 'zeros'
    zeros_layer.type = 'DummyData'
    zeros_layer.top.append('zeros')
    zeros_shape = zeros_layer.dummy_data_param.shape.add()
    zeros_shape.dim.extend([l.image_data_param.batch_size,1])
    phase_rule = zeros_layer.include.add()
    phase_rule.phase = l.include[0].phase

In [None]:
# Resize layers
# One SpatialTransformer layer per resolution. It takes the first top of the
# first base layer plus the 'zeros' blob and outputs a res x res blob named
# '<prefix><input blob>'.
input_blob = netparams_in.layer[0].top[0]
for res in resolutions:
    prefix = prefix_template%res
    resize_layer = netparams.layer.add()
    resize_layer.name = prefix + input_blob
    resize_layer.type = 'SpatialTransformer'
    resize_layer.bottom.extend([input_blob, 'zeros'])
    resize_layer.top.append(resize_layer.name)
    st = resize_layer.st_param
    # Five of the six affine parameters are fixed; theta_2_3 is intentionally
    # not set (presumably it is the one value read from the 'zeros' bottom —
    # confirm against the SpatialTransformer layer implementation).
    st.theta_1_1 = 1
    st.theta_1_2 = 0
    st.theta_1_3 = 0
    st.theta_2_1 = 0
    st.theta_2_2 = 1
    st.to_compute_dU = False
    st.output_H = res
    st.output_W = res

In [None]:
# For each resolution, build one branch:
#   base layers up to chop_off_layer -> SignedPower (gamma) -> CompactBilinear
for res in resolutions:
    # All layer and blob names of this branch carry the resolution prefix.
    prefix = prefix_template%res

    # Copy all layers up to and including chop_off_layer (data layers excluded)
    for l in netparams_in.layer:
        if l.type in ['ImageData', 'Data']:
            continue
        branch_layer = netparams.layer.add()
        branch_layer.MergeFrom(l)
        branch_layer.name = prefix + l.name
        for i, top_name in enumerate(l.top):
            branch_layer.top[i] = prefix + top_name
        for i, bottom_name in enumerate(l.bottom):
            branch_layer.bottom[i] = prefix + bottom_name
        # Parameter names are built from the un-prefixed layer name, so the
        # same name is used in every resolution branch.
        for param_idx, p in enumerate(branch_layer.param):
            p.name = '%s_param%i'%(l.name,param_idx)

        if l.name == chop_off_layer:
            break
    else:
        # Without this check a misspelled chop_off_layer would silently copy
        # the complete base network (including its loss layers) into the branch.
        raise ValueError('chop_off_layer %r not found in %s'%(chop_off_layer, protofile))

    # Output blob of the chop-off layer: input of both pooling layers below
    feature_blob = branch_layer.top[0]

    # Add gamma layer: signed power with initial exponent gamma - 1. The
    # exponent parameter is named identically in all branches.
    gamma_layer = netparams.layer.add()
    gamma_layer.name = prefix + 'gamma_power'
    gamma_layer.type = 'SignedPower'
    gamma_layer.bottom.append(feature_blob)
    gamma_layer.top.append(gamma_layer.name)
    gamma_layer.power_param.power = gamma - 1
    gamma_param = gamma_layer.param.add()
    gamma_param.name = 'gamma_power'
    gamma_param.lr_mult = 10
    gamma_param.decay_mult = 0

    # Add bilinear layers: combines the plain features with their signed power
    bilinear_layer = netparams.layer.add()
    bilinear_layer.name = prefix + 'bilinear'
    bilinear_layer.type = 'CompactBilinear'
    bilinear_layer.bottom.append(feature_blob)
    bilinear_layer.bottom.append(gamma_layer.top[0])
    bilinear_layer.top.append(bilinear_layer.name)
    bilinear_layer.compact_bilinear_param.num_output = 8192

    bilinear_outputs.append(bilinear_layer.top[0])

In [None]:
# Normalization layers
# With more than one resolution, the per-branch bilinear outputs are first
# merged by a single Eltwise layer (no operation is set explicitly, so the
# layer's default applies).
if len(bilinear_outputs)>1:
    merge_layer = netparams.layer.add()
    merge_layer.name = 'bilinear_sum'
    merge_layer.type = 'Eltwise'
    merge_layer.bottom.extend(bilinear_outputs)
    merge_layer.top.append(merge_layer.name)

# Signed power with fixed exponent 0.5 on top of the layer added last.
# lr_mult = 0 keeps the exponent from being trained. The exponent was
# previously 1.0 / (gamma).
pooled_layer_name = netparams.layer[-1].name
root_layer = netparams.layer.add()
root_layer.name = 'bilinear_gamma_root'
root_layer.type = 'SignedPower'
root_layer.bottom.append(pooled_layer_name)
root_layer.top.append(root_layer.name)
root_layer.power_param.power = 0.5
root_param = root_layer.param.add()
root_param.lr_mult = 0
root_param.decay_mult = 0

# L2 normalization of the result
l2_layer = netparams.layer.add()
l2_layer.name = 'bilinear_l2'
l2_layer.type = 'L2Normalize'
l2_layer.bottom.append(root_layer.top[0])
l2_layer.top.append(l2_layer.name)

# fc8: linear classifier on top of the layer added last
classifier_input = netparams.layer[-1].top[0]
fc8_layer = netparams.layer.add()
fc8_layer.name = 'fc8_ft'
fc8_layer.type = 'InnerProduct'
fc8_layer.bottom.append(classifier_input)
fc8_layer.top.append(fc8_layer.name)
fc8_layer.inner_product_param.num_output = num_classes
# Two parameter specs: the first with multipliers 1, the second with 2
for multiplier in (1, 2):
    fc8_spec = fc8_layer.param.add()
    fc8_spec.lr_mult = multiplier
    fc8_spec.decay_mult = multiplier
scores_blob = fc8_layer.top[0]

# Loss: softmax loss between the fc8 scores and the labels
loss_layer = netparams.layer.add()
loss_layer.name = 'loss'
loss_layer.type = 'SoftmaxWithLoss'
loss_layer.bottom.extend([scores_blob, 'label'])
loss_layer.top.append(loss_layer.name)

# Accuracy: also computed from the fc8 scores, restricted to phase 1
# (TEST in upstream caffe.proto)
accuracy_layer = netparams.layer.add()
accuracy_layer.name = 'Accuracy'
accuracy_layer.type = 'Accuracy'
accuracy_layer.bottom.extend([scores_blob, 'label'])
accuracy_layer.top.append(accuracy_layer.name)
accuracy_phase = accuracy_layer.include.add()
accuracy_phase.phase = 1

In [None]:
# Learning rates and decays and so on
for l in netparams.layer:
    if l.type in ['InnerProduct','Convolution','Scale']:
        # Make sure two parameter specs exist, then use multipliers 1 for the
        # first and 2 for the second.
        while len(l.param) < 2:
            l.param.add()
        l.param[0].lr_mult = 1
        l.param[0].decay_mult = 1
        l.param[1].lr_mult = 2
        l.param[1].decay_mult = 2
    if l.type in ['InnerProduct']:
        l.inner_product_param.weight_filler.type = "gaussian"
        l.inner_product_param.weight_filler.ClearField('std')
        l.inner_product_param.weight_filler.std = 0.01
        l.inner_product_param.bias_filler.type = "constant"
        l.inner_product_param.bias_filler.value = 0.0
    if l.name in ['fc8_ft']:
        # The new classifier starts from (almost) zero weights; this overrides
        # the generic InnerProduct settings above.
        l.inner_product_param.weight_filler.type = "gaussian"
        l.inner_product_param.weight_filler.std = 0.000000001
        l.inner_product_param.bias_filler.type = "constant"
        l.inner_product_param.bias_filler.value = 0.01
    if l.type in ['Convolution']:
        l.convolution_param.weight_filler.type = "gaussian"
        l.convolution_param.weight_filler.ClearField('std')
        # Fixed: the std was previously written to inner_product_param, which
        # left the convolution filler's std unset and added a stray
        # inner_product_param to every convolution layer.
        l.convolution_param.weight_filler.std = 0.01
        l.convolution_param.bias_filler.type = "constant"
        l.convolution_param.bias_filler.value = 0.0
    if l.type == "BatchNorm":
        # Disable learning for the three BatchNorm parameter specs (they must
        # already exist in the prototxt) and drop any explicit
        # use_global_stats setting.
        l.param[0].lr_mult = 0
        l.param[1].lr_mult = 0
        l.param[2].lr_mult = 0
        l.batch_norm_param.ClearField('use_global_stats')
#    if l.name in ['fc6','fc7']:
#        l.inner_product_param.num_output = 2048

In [None]:
# Solver for fine-tuning
solverfile = 'ft.solver'
params = caffe.proto.caffe_pb2.SolverParameter()
params.net = u'ft.prototxt'

# One test run should cover the complete test list: number of test images
# divided by the batch size of the TEST data layer (layer 1). The train batch
# size was used here before, which covered only a fraction of the test set.
test_data_param = netparams.layer[1].image_data_param
with open(test_data_param.source,'rt') as test_list:
    num_test_images = sum(1 for _ in test_list)
params.test_iter.append(num_test_images // test_data_param.batch_size)

params.test_interval = 10000
params.test_initialization = True
params.base_lr = 0.001
params.display = 100
params.max_iter = 1000000
params.lr_policy = "fixed"
params.power = 1
#params.stepsize = 100000
#params.gamma = 0.1
#params.momentum = 0.9
params.weight_decay = 0.0005
params.snapshot = 10000
#params.random_seed = 0
params.snapshot_prefix = "ft"
# Chosen so that iter_size * train batch size is 8 (for batch sizes dividing 8)
params.iter_size = 8 // lyr[0].image_data_param.batch_size
#params.type = "Nesterov"
assert params.iter_size > 0

# Write solver and net definition; the context managers guarantee that both
# files are flushed and closed before caffe reads them.
with open(solverfile,'w') as solver_fh:
    solver_fh.write(google.protobuf.text_format.MessageToString(params))
with open(params.net,'w') as net_fh:
    net_fh.write(google.protobuf.text_format.MessageToString(netparams))

Copy the weights from the pre-trained model

In [None]:
# Display the current working directory (the per-run directory after the chdir above).
os.getcwd()

In [None]:
# Load the base network with its pre-trained weights. Both paths were given
# relative to the start directory, which is two levels above the working dir.
net_origin = caffe.Net('../../'+protofile, '../../'+init_model, caffe.TEST)

In [None]:
# Instantiate the generated fine-tuning net without loading any weights.
net_target = caffe.Net('ft.prototxt',caffe.TEST)

In [None]:
# Copy the parameter blobs of every pre-trained layer into the corresponding
# prefixed layer of each resolution branch. Layers without a counterpart in
# the target net are skipped.
for origin_param in net_origin.params.keys():
    source_blobs = net_origin.params[origin_param]
    for res in resolutions:
        prefix = prefix_template%res
        target_param = prefix + origin_param
        if target_param not in net_target.params:
            continue
        target_blobs = net_target.params[target_param]
        for idx in range(len(source_blobs)):
            # In-place assignment, so the target blob keeps its own memory
            target_blobs[idx].data[...] = source_blobs[idx].data

In [None]:
# Disabled alternative: let caffe load the weights from init_model directly.
if False: net_target.copy_from(init_model)

In [None]:
# Store the initialised fine-tuning net as 'model_init' in the working dir.
net_target.save('model_init')

In [None]:
# Drop both nets; they are not needed any more after 'model_init' was saved.
del net_origin
del net_target

### Caffe LR init

To speed everything up, we first compute the features of every image once and then train only the classifier on these pre-computed features.

In [None]:
#Calc the features
def calc_features(net, n_images, blobs):
    """Run `net` forward and collect the contents of `blobs` for n_images images.

    The batch size is taken from the first dimension of the net's 'data'
    blob. For every requested blob an array is allocated whose first
    dimension is n_images and whose remaining dimensions match the blob:
    int32 for the blob called 'label', float16 for everything else. The
    total memory requirement is printed before extraction starts.

    net      -- object providing `blobs[name].data` and `forward()`
    n_images -- number of images (rows) to collect
    blobs    -- names of the blobs to read after each forward pass

    Returns a list of arrays, one per entry of `blobs`, in the same order.
    """
    batchsize = net.blobs['data'].data.shape[0]

    feats = dict()
    for name in blobs:
        dims = list(net.blobs[name].data.shape)
        dims[0] = n_images
        if name == 'label':
            storage_type = np.int32
        else:
            storage_type = np.float16
        feats[name] = np.zeros(tuple(dims),dtype=storage_type)
    total_bytes = np.sum([arr.nbytes for arr in feats.values()])
    print('Need %.3f GiB'%(total_bytes/1024/1024/1024))

    for start in pyprind.prog_bar(range(0,n_images,batchsize),update_interval=10):
        net.forward()
        for name in blobs:
            # View on the rows of this batch; shorter than batchsize for the
            # last batch, in which case only the leading rows are copied.
            rows = feats[name][start:start+batchsize,...]
            rows[...] = net.blobs[name].data[:rows.shape[0],...]

    return [feats[name] for name in blobs]

In [None]:
# Number of lines (= images) in the image list of data layer 0 and data layer 1.
# The files are opened in a with-block so the handles are closed again.
num_images = []
for i in [0,1]:
    with open(netparams.layer[i].image_data_param.source,'r') as image_list:
        num_images.append(sum(1 for _ in image_list))

In [None]:
# Name of the blob feeding the last InnerProduct layer of the net, i.e. the
# features the final classifier is trained on.
last_blob = [l.bottom[0] for l in netparams.layer if l.type == 'InnerProduct'][-1]
# Displayed as cell output
last_blob

In [None]:
# Extract the classifier input features and the labels of num_images[0]
# images with the solver's training net, initialised from 'model_init'.
solver = caffe.get_solver('ft.solver')
solver.net.copy_from('model_init')
train_feats,train_labels = calc_features(solver.net,num_images[0],[last_blob,'label'])
# The solver is only needed for feature extraction here
del solver

In [None]:
# Disabled: the same feature extraction for the test images using the solver's
# first test net. val_feats / val_labels are therefore NOT defined unless this
# is switched on.
if False:
    solver = caffe.get_solver('ft.solver')
    solver.test_nets[0].copy_from('model_init')
    val_feats,val_labels = calc_features(solver.test_nets[0],num_images[1],[last_blob, 'label'])
    # NOTE(review): deleting an element of solver.test_nets may not be supported — confirm.
    del solver.test_nets[0]
    del solver

In [None]:
# Small net that trains only the classifier on pre-computed features: two
# Input layers (features and labels, batch size 32) followed by fc8_ft and
# every layer that comes after it in the full net.
netparams_fixed = caffe.proto.caffe_pb2.NetParameter()

feature_input = netparams_fixed.layer.add()
feature_input.name = 'data'
feature_input.type = 'Input'
feature_input.top.append(last_blob)
feature_shape = feature_input.input_param.shape.add()
feature_shape.dim.extend((32,) + train_feats.shape[1:])

label_input = netparams_fixed.layer.add()
label_input.name = 'label'
label_input.type = 'Input'
label_input.top.append('label')
label_shape = label_input.input_param.shape.add()
label_shape.dim.extend((32,))

# Add all layers after fc8
approached_fc8 = False
for l in netparams.layer:
    if l.name == 'fc8_ft':
        approached_fc8 = True
        # Note: this edits the fc8_ft layer of netparams itself, before it is copied
        for spec_idx in (0, 1):
            l.param[spec_idx].lr_mult = 1
            l.param[spec_idx].decay_mult = 1
        l.inner_product_param.weight_filler.std = 0.0001
        l.inner_product_param.bias_filler.value = 0
    if approached_fc8:
        netparams_fixed.layer.add().MergeFrom(l)

In [None]:
# Solver for training the classifier alone on the pre-computed features
solverfile = 'ft_fixed.solver'
params = caffe.proto.caffe_pb2.SolverParameter()
params.net = u'ft_fixed.prototxt'
#params.test_iter.append(1450)
#params.test_interval = 1000
params.test_initialization = False
params.base_lr = 1
params.display = 100
params.max_iter = 60000
# Multistep schedule: gamma 0.25 with step values at 20k, 30k, 40k and 50k
params.lr_policy = "multistep"
params.stepvalue.extend([20000,30000,40000,50000])
#params.power = 1
#params.stepsize = 100000
params.gamma = 0.25
params.momentum = 0.9
params.weight_decay = 0.000005
# Larger than max_iter, so no snapshot interval is reached during training
params.snapshot = 10000000
#params.random_seed = 0
params.snapshot_prefix = "ft_fixed"
params.iter_size = 1
assert params.iter_size > 0

# Write solver and net definition; the context managers guarantee that both
# files are flushed and closed before caffe reads them.
with open(solverfile,'w') as solver_fh:
    solver_fh.write(google.protobuf.text_format.MessageToString(params))
with open(params.net,'w') as net_fh:
    net_fh.write(google.protobuf.text_format.MessageToString(netparams_fixed))

In [None]:
# Instantiate the solver for the classifier-only net written above.
solver = caffe.get_solver('ft_fixed.solver')

In [None]:
# Train
# Every iteration draws 32 distinct training samples at random, writes their
# features and labels into the input blobs and performs one solver step.
num_train_samples = train_feats.shape[0]
input_blobs = solver.net.blobs
for it in pyprind.prog_bar(range(params.max_iter)):
    train_ids = random.sample(range(num_train_samples),32)
    input_blobs[last_blob].data[...] = train_feats[train_ids,...]
    input_blobs['label'].data[...] = train_labels[train_ids]
    solver.step(1)

In [None]:
# Store the trained classifier-only net as 'model_lr'.
solver.net.save('model_lr')

In [None]:
# Drop the classifier-only solver.
del solver

In [None]:
# Combine both weight files in the full net: first the initial weights from
# 'model_init', then the trained classifier weights from 'model_lr' on top
# (presumably matched by layer name — confirm against caffe's copy_from).
solver = caffe.get_solver('ft.solver')
solver.net.copy_from('model_init')
solver.net.copy_from('model_lr')
# Overwrites 'model_lr' with the weights of the complete net.
solver.net.save('model_lr')
del solver

In [None]:
# Disabled alternative: fit a multinomial logistic regression with scikit-learn
# on the flattened training features and report its accuracy on the validation
# features. Needs val_feats / val_labels, which are only computed when the
# (also disabled) validation feature extraction is enabled.
if False:
    import sklearn

    model = sklearn.linear_model.LogisticRegression(C=1000, solver='lbfgs',multi_class='multinomial', max_iter = 10000, tol = 1e-10)
    %time model.fit(train_feats.reshape(train_feats.shape[0],-1), train_labels)
    print("LR Accuracy is ")
    print(model.score(val_feats.reshape(val_feats.shape[0],-1), val_labels))