# Classification: Instant Recognition with Caffe

In this example we'll classify an image with the bundled CaffeNet model (which is based on the network architecture of Krizhevsky et al. for ImageNet).

We'll compare CPU and GPU modes and then dig into the model to inspect features and the output.

### 1. Setup

* First, set up Python, `numpy`, and `matplotlib`.

In [1]:
import os
os.getcwd()

'/root/shared/Documents/final_proj/code/src'

In [2]:
# set up Python environment: numpy for numerical routines, and matplotlib for plotting
import numpy as np
import matplotlib.pyplot as plt
# display plots in this notebook
%matplotlib inline

# set display defaults
plt.rcParams['figure.figsize'] = (10, 10)        # large images
plt.rcParams['image.interpolation'] = 'nearest'  # don't interpolate: show square pixels
plt.rcParams['image.cmap'] = 'gray'  # use grayscale output rather than a (potentially misleading) color heatmap

* Load `caffe`.

In [3]:
# The caffe module needs to be on the Python path;
#  we'll add it here explicitly.
import sys
caffe_root = '/root/caffe/'  # this file should be run from {caffe_root}/examples (otherwise change this line)
sys.path.insert(0, caffe_root + 'python')

import caffe
# If you get "No module named _caffe", either you have not built pycaffe or you have the wrong path.

In [4]:
# social relation models

model_switch = 'attributes'

if model_switch == 'caffe_net':
    models_dir = '../../models/trained_models/caffeNet_models/'
    model_def = models_dir + 'end_to_end_training_prototxt/deploy_net_5.prototxt'
    model_weights = models_dir + 'finetune_iter_3000.caffemodel'
elif model_switch == 'vgg_net': 
    models_dir = '../../models/trained_models/VGG_models/'
    model_def = models_dir + 'end_to_end_training_prototxt/deploy_net.prototxt'
    model_weights = models_dir + 'finetune_iter_10000.caffemodel'
elif model_switch == 'attributes':
    def_dir = '../../models/trained_models/caffeNet_models/'
    model_def = def_dir + 'end_to_end_training_prototxt/template_single_stream_net.prototxt'
    
    attribute_dir = 'head_age_trained_on_pipa/'    
    weights_dir = '../../models/trained_models/attribute_models/' + attribute_dir
    model_weights = weights_dir + 'finetune_iter_1000.caffemodel'

elif model_switch == 'caffe_reference':
    model_def = caffe_root + 'models/bvlc_reference_caffenet/deploy.prototxt'
    model_weights = caffe_root + 'models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel'
    
if model_switch == 'attributes':
    splits_dir = '../../datasets/splits/annotator_consistency3/'
    image_listing_1 = splits_dir + 'single_body1_eval_16.txt'
    image_listing_2 = splits_dir + 'single_body2_eval_16.txt'
else:
    splits_dir = '../../datasets/splits/relation_consistency3/'
    image_listing_1 = splits_dir + 'domain_single_body1_eval_5.txt'
    image_listing_2 = splits_dir + 'domain_single_body2_eval_5.txt'

image_listing_pair = [image_listing_1, image_listing_2]

In [5]:
import os
os.listdir(os.path.dirname(os.path.dirname(weights_dir)))

['face_appearance_trained_on_CelebAFaces',
 'head_age_trained_on_pipa',
 'imsitu_body_activity(relation_consistency3)',
 'body_clothing_trained_on_berkeleyBodyAttributes',
 'head_gender_trained_on_pipa',
 'body_gender_trained_on_pipa',
 'imsitu_body_activity(annotator_consistency3)',
 'body_immediacy(annotator_consistency3)',
 'localation_scale_data(annotator_consistency3)',
 'body_age_trained_on_pipa',
 'readme.txt',
 'face_pose_trained_on_IMFDB',
 'face_emotion_trained_on_IMFDB']

In [6]:
import os
if os.path.isfile(model_weights):
    print 'Weights found.'
if os.path.isfile(model_def):
    print 'Model Definition found.'
if os.path.isfile(image_listing_1) and os.path.isfile(image_listing_2):
    print 'Test datasets found.'

Weights found.
Model Definition found.
Test datasets found.


### 2. Load net and set up input preprocessing

* Set Caffe to CPU/GPU mode and load the net from disk.

In [7]:
use_gpu = True
if use_gpu:
    caffe.set_device(0)  # if we have multiple GPUs, pick the first one
    caffe.set_mode_gpu()
else:
    caffe.set_mode_cpu()

INPUT_DIMS = (256, 256)
CROP_DIMS = (227, 227)

net = caffe.Net(model_def,      # defines the structure of the model
                model_weights,  # contains the trained weights
                caffe.TEST)     # use test mode (e.g., don't perform dropout)

* Set up input preprocessing. (We'll use Caffe's `caffe.io.Transformer` to do this, but this step is independent of other parts of Caffe, so any custom preprocessing code may be used).

    Our default CaffeNet is configured to take images in BGR format. Values are expected to start in the range [0, 255] and then have the mean ImageNet pixel value subtracted from them. In addition, the channel dimension is expected as the first (_outermost_) dimension.
    
    As matplotlib will load images with values in the range [0, 1] in RGB format with the channel as the _innermost_ dimension, we are arranging for the needed transformations here.

In [31]:
# load the mean ImageNet image (as distributed with Caffe) for subtraction
mu = np.load(caffe_root + 'python/caffe/imagenet/ilsvrc_2012_mean.npy')
mu = mu.mean(1).mean(1)  # average over pixels to obtain the mean (BGR) pixel values
print 'mean-subtracted values:', zip('BGR', mu)

transformers = []
for data_field in ['data', 'data_1']:
    # create transformer for the input called 'data'
    transformer = caffe.io.Transformer({data_field: net.blobs[data_field].data.shape})

    transformer.set_transpose(data_field, (2,0,1))  # move image channels to outermost dimension
    transformer.set_channel_swap(data_field, (2,1,0))  # swap channels from RGB to BGR
    transformer.set_raw_scale(data_field, 255)      # rescale from [0, 1] to [0, 255]
    transformer.set_mean(data_field, mu)            # subtract the dataset-mean value in each channel
    transformers.append(transformer)

mean-subtracted values: [('B', 104.0069879317889), ('G', 116.66876761696767), ('R', 122.6789143406786)]


### 3. Classification

* Now we're ready to perform classification. Even though we'll only classify one image, we'll set a batch size of 50 to demonstrate batching.

In [32]:
# set the size of the input (we can skip this if we're happy
#  with the default; we can also change it later, e.g., for different batch sizes)
for data_field in ['data', 'data_1']:
    net.blobs[data_field].reshape(1,         # batch size
                                  3,         # 3-channel (BGR) images
                                  227, 227)  # image size is 227x227

* Load an image (that comes with Caffe) and perform the preprocessing we've set up.

In [33]:
with open(image_listing_1) as file_list:
    path_label_list_1 = [file_label.split() for file_label in file_list]

with open(image_listing_2) as file_list:
    path_label_list_2 = [file_label.split() for file_label in file_list]

In [34]:
image_idx = 0
image_path_1, label = path_label_list_1[image_idx]
image_path_2, label = path_label_list_2[image_idx]
image_paths = [image_path_1, image_path_2]
image_paths

['/root/shared/Documents/final_proj/datasets/images/all_single_body/f1_72157624551655535_4870147689.jpg',
 '/root/shared/Documents/final_proj/datasets/images/all_single_body/f2_72157624551655535_4870147689.jpg']

In [13]:
images = []
data_field = ['data', 'data_1']
for idx, data_field in enumerate(data_field):
    image = caffe.io.load_image(image_paths[idx])
    images.append(image)
    
    transformed_image = transformers[idx].preprocess(data_field, image)
    
    # copy the image data into the memory allocated for the net
    net.blobs[data_field].data[...] = transformed_image

ValueError: could not broadcast input array from shape (3,227,227) into shape (1,3,256,256)

In [None]:
cropped_img = resize_and_crop_image_2(images[0])
plt.imshow(cropped_img)

In [None]:
plt.subplot(1, 2, 1)
plt.imshow(images[0])
plt.subplot(1, 2, 2)
plt.imshow(images[1])

In [None]:
### perform classification
output = net.forward()

output_prob = output['probs'][0]  # the output probability vector for the first image in the batch

print 'predicted class is:', output_prob.argmax()

In [None]:
# sort top five predictions from softmax output
top_inds = output_prob.argsort()[::-1][:5]  # reverse sort and take five largest items

print 'probabilities and labels:'
zip(output_prob[top_inds], np.arange(16)[top_inds])

### 5. Examining intermediate output

* A net is not just a black box; let's take a look at some of the parameters and intermediate activations.

First we'll see how to read out the structure of the net in terms of activation and parameter shapes.

* For each layer, let's look at the activation shapes, which typically have the form `(batch_size, channel_dim, height, width)`.

    The activations are exposed as an `OrderedDict`, `net.blobs`.

In [35]:
# for each layer, show the output shape
for layer_name, blob in net.blobs.iteritems():
    print layer_name + '\t' + str(blob.data.shape)

data	(1, 3, 227, 227)
label	(1, 1)
label_data_1_split_0	(1, 1)
label_data_1_split_1	(1, 1)
data_1	(1, 3, 227, 227)
dummy_label_1	(1, 1)
conv1	(1, 96, 55, 55)
pool1	(1, 96, 27, 27)
norm1	(1, 96, 27, 27)
conv2	(1, 256, 27, 27)
pool2	(1, 256, 13, 13)
norm2	(1, 256, 13, 13)
conv3	(1, 384, 13, 13)
conv4	(1, 384, 13, 13)
conv5	(1, 256, 13, 13)
pool5	(1, 256, 6, 6)
pool5_reshape	(1, 9216, 1, 1)
conv1_1	(1, 96, 55, 55)
pool1_1	(1, 96, 27, 27)
norm1_1	(1, 96, 27, 27)
conv2_1	(1, 256, 27, 27)
pool2_1	(1, 256, 13, 13)
norm2_1	(1, 256, 13, 13)
conv3_1	(1, 384, 13, 13)
conv4_1	(1, 384, 13, 13)
conv5_1	(1, 256, 13, 13)
pool5_1	(1, 256, 6, 6)
pool5_reshape_1	(1, 9216, 1, 1)
cat_all	(1, 18432, 1, 1)
fc6	(1, 4096)
fc7	(1, 4096)
fc8	(1, 16)
fc8_fc8_RelationN_0_split_0	(1, 16)
fc8_fc8_RelationN_0_split_1	(1, 16)
fc8_fc8_RelationN_0_split_2	(1, 16)
probs	(1, 16)
loss	()
acc	()


* Now look at the parameter shapes. The parameters are exposed as another `OrderedDict`, `net.params`. We need to index the resulting values with either `[0]` for weights or `[1]` for biases.

    The param shapes typically have the form `(output_channels, input_channels, filter_height, filter_width)` (for the weights) and the 1-dimensional shape `(output_channels,)` (for the biases).

In [None]:
for layer_name, param in net.params.iteritems():
    print layer_name + '\t' + str(param[0].data.shape), str(param[1].data.shape)

* Since we're dealing with four-dimensional data here, we'll define a helper function for visualizing sets of rectangular heatmaps.

In [None]:
def vis_square(data):
    """Take an array of shape (n, height, width) or (n, height, width, 3)
       and visualize each (height, width) thing in a grid of size approx. sqrt(n) by sqrt(n)"""
    
    # normalize data for display
    data = (data - data.min()) / (data.max() - data.min())
    
    # force the number of filters to be square
    n = int(np.ceil(np.sqrt(data.shape[0])))
    padding = (((0, n ** 2 - data.shape[0]),
               (0, 1), (0, 1))                 # add some space between filters
               + ((0, 0),) * (data.ndim - 3))  # don't pad the last dimension (if there is one)
    data = np.pad(data, padding, mode='constant', constant_values=1)  # pad with ones (white)
    
    # tile the filters into an image
    data = data.reshape((n, n) + data.shape[1:]).transpose((0, 2, 1, 3) + tuple(range(4, data.ndim + 1)))
    data = data.reshape((n * data.shape[1], n * data.shape[3]) + data.shape[4:])
    
    plt.imshow(data); plt.axis('off')

* First we'll look at the first layer filters, `conv1`

In [None]:
# the parameters are a list of [weights, biases]
filters = net.params['conv1'][0].data
vis_square(filters.transpose(0, 2, 3, 1))

* The first layer output, `conv1` (rectified responses of the filters above, first 36 only)

In [None]:
feat = net.blobs['conv1'].data[0, :36]
vis_square(feat)

* The fifth layer after pooling, `pool5`

In [None]:
feat = net.blobs['pool5'].data[0]
vis_square(feat)

* The first fully connected layer, `fc6` (rectified)

    We show the output values and the histogram of the positive values

In [None]:
feat = net.blobs['fc7'].data[0]
plt.subplot(2, 1, 1)
plt.plot(feat.flat)
plt.subplot(2, 1, 2)
_ = plt.hist(feat.flat[feat.flat > 0], bins=100)

### 6. Try your own image

Now we'll grab an image from the web and classify it using the steps above.

* Try setting `my_image_url` to any JPEG image URL.

In [None]:
# download an image
#my_image_url = "..."  # paste your URL here
# for example:
my_image_url = "https://upload.wikimedia.org/wikipedia/commons/b/be/Orang_Utan%2C_Semenggok_Forest_Reserve%2C_Sarawak%2C_Borneo%2C_Malaysia.JPG"
!wget -O image.jpg $my_image_url

# transform it and copy it into the net
image = caffe.io.load_image('image.jpg')
net.blobs['data'].data[...] = transformer.preprocess('data', image)

# perform classification
net.forward()

# obtain the output probabilities
output_prob = net.blobs['prob'].data[0]

# sort top five predictions from softmax output
top_inds = output_prob.argsort()[::-1][:5]

plt.imshow(image)

print 'probabilities and labels:'
zip(output_prob[top_inds], labels[top_inds])

In [8]:
"""
Classifier is an image classifier specialization of Net.
"""

class MultiInputsClassifier(caffe.Net):
    """
    Classifier extends Net for image class prediction
    by scaling, center cropping, or oversampling.
    Parameters
    ----------
    image_dims : dimensions to scale input for cropping/sampling.
        Default is to scale to net input size for whole-image crop.
    mean, input_scale, raw_scale, channel_swap: params for
        preprocessing options.
    """
    def __init__(self, model_file, pretrained_file, image_dims=None,
                 mean=None, input_scale=None, raw_scale=None,
                 channel_swap=None):
        caffe.Net.__init__(self, model_file, pretrained_file, caffe.TEST)

        # configure pre-processing
        self.transformers = [caffe.io.Transformer({in_: self.blobs[in_].data.shape}) for in_ in self.inputs]
        
        for idx, in_ in enumerate(self.inputs):
            self.transformers[idx].set_transpose(in_, (2, 0, 1))
            if mean is not None:
                self.transformers[idx].set_mean(in_, mean)
            if input_scale is not None:
                self.transformers[idx].set_input_scale(in_, input_scale)
            if raw_scale is not None:
                self.transformers[idx].set_raw_scale(in_, raw_scale)
            if channel_swap is not None:
                self.transformers[idx].set_channel_swap(in_, channel_swap)
        
        in_ = self.inputs[0]
        self.crop_dims = np.array(self.blobs[in_].data.shape[2:])
        if not image_dims:
            image_dims = self.crop_dims
        self.image_dims = image_dims

    def predict(self, oversample=True, *multiple_inputs):
        """
        Predict classification probabilities of inputs.
        Parameters
        ----------
        inputs : iterable of (H x W x K) input ndarrays.
        oversample : boolean
            average predictions across center, corners, and mirrors
            when True (default). Center-only prediction when False.
        Returns
        -------
        predictions: (N x C) ndarray of class probabilities for N images and C
            classes.
        """
        caffe_in_list = []
        
        for input_idx, inputs in enumerate(multiple_inputs):
            # Scale to standardize input dimensions.
            input_ = np.zeros((len(inputs),
                               self.image_dims[0],
                               self.image_dims[1],
                               inputs[0].shape[2]),
                              dtype=np.float32)
            
            for ix, in_ in enumerate(inputs):
                input_[ix] = caffe.io.resize_image(in_, self.image_dims)

            if oversample:
                # Generate center, corner, and mirrored crops.
                input_ = caffe.io.oversample(input_, self.crop_dims)
            else:
                # Take center crop.
                center = np.array(self.image_dims) / 2.0
                crop = np.tile(center, (1, 2))[0] + np.concatenate([
                    -self.crop_dims / 2.0,
                    self.crop_dims / 2.0
                ])
                crop = crop.astype(int)
                input_ = input_[:, crop[0]:crop[2], crop[1]:crop[3], :]

            # Classify
            caffe_in = np.zeros(np.array(input_.shape)[[0, 3, 1, 2]],
                                dtype=np.float32)
        
            for ix, in_ in enumerate(input_):
                caffe_in[ix] = self.transformers[input_idx].preprocess(self.inputs[input_idx], in_)
            
            caffe_in_list.append(caffe_in)
        
        out = self.forward_all(**{self.inputs[input_idx] : caffe_in_list[input_idx] for input_idx in range(len(self.inputs))})
        predictions = out[self.outputs[0]]

        # For oversampling, average predictions across crops.
        if oversample:
            predictions = predictions.reshape((len(predictions) // 10, 10, -1))
            predictions = predictions.mean(1)

        return predictions

In [9]:
imagenet_mean = np.load(caffe_root + 'python/caffe/imagenet/ilsvrc_2012_mean.npy')
imagenet_mean = imagenet_mean.mean(1).mean(1)  # average over pixels to obtain the mean (BGR) pixel values

net = MultiInputsClassifier(model_def,      # defines the structure of the model
                            model_weights,  # contains the trained weights
                            image_dims=INPUT_DIMS,
                            mean=imagenet_mean,
                            raw_scale=255,
                            channel_swap=(2,0,1))

Exception: Transpose order needs to have the same number of dimensions as the input.