# Transfer Learning with TensorFlow

*Transfer learning* is the practice of starting with a network that has already been trained, and then applying that network to your own problem.

Because neural networks can often take days or even weeks to train, transfer learning (i.e. starting with a network that somebody else has already spent a lot of time training) can greatly shorten training time.

### Setup
In order to complete this lab, install Python 3, tensorflow, numpy, scipy, matplotlib, and pillow.

## AlexNet
Here, you're going to practice transfer learning with [AlexNet](https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf).

AlexNet is a popular base network for transfer learning because its structure is relatively straightforward, it's not too big, and it performs well empirically.

Here is a TensorFlow implementation of AlexNet (adapted from [Michael Guerzhoy and Davi Frossard](http://www.cs.toronto.edu/~guerzhoy/tf_alexnet/)):

In [1]:
from numpy import *
import os
from pylab import *
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cbook as cbook
import time
from scipy.misc import imread
from scipy.misc import imresize
import matplotlib.image as mpimg
from scipy.ndimage import filters
import urllib
from numpy import random

import tensorflow as tf

# Dummy arrays that exist only to derive the network's input and output
# dimensions (227x227x3 images in, 1000 class scores out).
train_x = zeros((1, 227,227,3)).astype(float32)
train_y = zeros((1, 1000))
xdim = train_x.shape[1:]
ydim = train_y.shape[1]

# The weights file stores a pickled dict (hence the trailing .item()), so
# pickle loading has to be enabled explicitly: NumPy >= 1.16.3 defaults
# allow_pickle to False and would raise here otherwise.
# NOTE(review): unpickling executes arbitrary code; only load a weights file
# obtained from a trusted source.
net_data = load("bvlc-alexnet.npy", encoding="latin1", allow_pickle=True).item()

def conv(input, kernel, biases, k_h, k_w, c_o, s_h, s_w,  padding="VALID", group=1):
    '''Convolve `input` with `kernel`, optionally in channel groups, and add `biases`.

    From https://github.com/ethereon/caffe-tensorflow

    `s_h` / `s_w` are the strides passed to tf.nn.conv2d and `c_o` is the
    number of output channels; both the input channel count and `c_o` must be
    divisible by `group`. `k_h` and `k_w` are accepted but not read here.

    With `group` > 1 the input and the kernel are each split into `group`
    pieces, every pair is convolved on its own, and the results are
    concatenated again.
    NOTE(review): the tf.split / tf.concat calls pass the axis as the first
    argument, which looks like the pre-1.0 TensorFlow signature -- confirm
    against the installed version.
    '''
    in_channels = input.get_shape()[-1]
    assert in_channels % group == 0
    assert c_o % group == 0

    strides = [1, s_h, s_w, 1]

    def apply_conv(tensor, weights):
        return tf.nn.conv2d(tensor, weights, strides, padding=padding)

    if group != 1:
        input_slices = tf.split(3, group, input)
        kernel_slices = tf.split(3, group, kernel)
        partial_outputs = [apply_conv(t, w) for t, w in zip(input_slices, kernel_slices)]
        result = tf.concat(3, partial_outputs)
    else:
        result = apply_conv(input, kernel)

    biased = tf.nn.bias_add(result, biases)
    # Keep the static spatial/channel dimensions and leave the batch size free.
    out_shape = [-1] + result.get_shape().as_list()[1:]
    return tf.reshape(biased, out_shape)

# Batch of input images (batch size left open) and the same batch resized to
# the 227x227 resolution the network is built on.
x = tf.placeholder(dtype=tf.float32, shape=(None,) + xdim)
resized = tf.image.resize_images(images=x, size=(227, 227))

def features():
    """Build the pretrained AlexNet feature extractor up to and including fc7.

    Reads the module-level ``resized`` tensor as input and takes every weight
    and bias from ``net_data``. All of these pretrained parameters are created
    with ``trainable=False`` so that an optimizer built on top of this graph
    leaves the AlexNet weights frozen and only updates layers added by the
    caller (e.g. a new classification layer).

    Every call creates a fresh set of variables in the default graph.

    Returns:
        The fc7 activation tensor (4096 features per image according to the
        layer spec in the comments below).
    """

    def pretrained(layer, index):
        # index 0 holds the layer's weights, index 1 its biases.
        return tf.Variable(net_data[layer][index], trainable=False)

    # conv1
    # conv(11, 11, 96, 4, 4, padding='VALID', name='conv1')
    # NOTE(review): the spec above says VALID but "SAME" is what is passed;
    # kept as-is.
    k_h, k_w, c_o, s_h, s_w = 11, 11, 96, 4, 4
    conv1W = pretrained("conv1", 0)
    conv1b = pretrained("conv1", 1)
    conv1_in = conv(resized, conv1W, conv1b, k_h, k_w, c_o, s_h, s_w, padding="SAME", group=1)
    conv1 = tf.nn.relu(conv1_in)

    # lrn1
    # lrn(2, 2e-05, 0.75, name='norm1')
    radius, alpha, beta, bias = 2, 2e-05, 0.75, 1.0
    lrn1 = tf.nn.local_response_normalization(conv1,
                                              depth_radius=radius,
                                              alpha=alpha,
                                              beta=beta,
                                              bias=bias)

    # maxpool1
    # max_pool(3, 3, 2, 2, padding='VALID', name='pool1')
    k_h, k_w, s_h, s_w, padding = 3, 3, 2, 2, 'VALID'
    maxpool1 = tf.nn.max_pool(lrn1, ksize=[1, k_h, k_w, 1], strides=[1, s_h, s_w, 1], padding=padding)

    # conv2
    # conv(5, 5, 256, 1, 1, group=2, name='conv2')
    k_h, k_w, c_o, s_h, s_w, group = 5, 5, 256, 1, 1, 2
    conv2W = pretrained("conv2", 0)
    conv2b = pretrained("conv2", 1)
    conv2_in = conv(maxpool1, conv2W, conv2b, k_h, k_w, c_o, s_h, s_w, padding="SAME", group=group)
    conv2 = tf.nn.relu(conv2_in)

    # lrn2
    # lrn(2, 2e-05, 0.75, name='norm2')
    radius, alpha, beta, bias = 2, 2e-05, 0.75, 1.0
    lrn2 = tf.nn.local_response_normalization(conv2,
                                              depth_radius=radius,
                                              alpha=alpha,
                                              beta=beta,
                                              bias=bias)

    # maxpool2
    # max_pool(3, 3, 2, 2, padding='VALID', name='pool2')
    k_h, k_w, s_h, s_w, padding = 3, 3, 2, 2, 'VALID'
    maxpool2 = tf.nn.max_pool(lrn2, ksize=[1, k_h, k_w, 1], strides=[1, s_h, s_w, 1], padding=padding)

    # conv3
    # conv(3, 3, 384, 1, 1, name='conv3')
    k_h, k_w, c_o, s_h, s_w, group = 3, 3, 384, 1, 1, 1
    conv3W = pretrained("conv3", 0)
    conv3b = pretrained("conv3", 1)
    conv3_in = conv(maxpool2, conv3W, conv3b, k_h, k_w, c_o, s_h, s_w, padding="SAME", group=group)
    conv3 = tf.nn.relu(conv3_in)

    # conv4
    # conv(3, 3, 384, 1, 1, group=2, name='conv4')
    k_h, k_w, c_o, s_h, s_w, group = 3, 3, 384, 1, 1, 2
    conv4W = pretrained("conv4", 0)
    conv4b = pretrained("conv4", 1)
    conv4_in = conv(conv3, conv4W, conv4b, k_h, k_w, c_o, s_h, s_w, padding="SAME", group=group)
    conv4 = tf.nn.relu(conv4_in)

    # conv5
    # conv(3, 3, 256, 1, 1, group=2, name='conv5')
    k_h, k_w, c_o, s_h, s_w, group = 3, 3, 256, 1, 1, 2
    conv5W = pretrained("conv5", 0)
    conv5b = pretrained("conv5", 1)
    conv5_in = conv(conv4, conv5W, conv5b, k_h, k_w, c_o, s_h, s_w, padding="SAME", group=group)
    conv5 = tf.nn.relu(conv5_in)

    # maxpool5
    # max_pool(3, 3, 2, 2, padding='VALID', name='pool5')
    k_h, k_w, s_h, s_w, padding = 3, 3, 2, 2, 'VALID'
    maxpool5 = tf.nn.max_pool(conv5, ksize=[1, k_h, k_w, 1], strides=[1, s_h, s_w, 1], padding=padding)

    # fc6
    # fc(4096, name='fc6')
    fc6W = pretrained("fc6", 0)
    fc6b = pretrained("fc6", 1)
    # Flatten everything except the batch dimension before the dense layer.
    fc6 = tf.nn.relu_layer(tf.reshape(maxpool5, [-1, int(prod(maxpool5.get_shape()[1:]))]), fc6W, fc6b)

    # fc7
    # fc(4096, name='fc7')
    fc7W = pretrained("fc7", 0)
    fc7b = pretrained("fc7", 1)
    fc7 = tf.nn.relu_layer(fc6, fc7W, fc7b)
    return fc7

def logits():
    """Apply the pretrained fc8 layer to the output of features().

    Layer spec: fc(1000, relu=False, name='fc8'). Returns the result of
    xw_plus_b without any activation applied.
    """
    fc8_params = net_data["fc8"]
    weights = tf.Variable(fc8_params[0])
    biases = tf.Variable(fc8_params[1])
    return tf.nn.xw_plus_b(features(), weights, biases)

def probabilities():
    """Return the softmax of logits() (layer spec: softmax(name='prob'))."""
    class_scores = logits()
    return tf.nn.softmax(class_scores)

## ImageNet Inference

![alt text](poodle.png "Poodle")
![alt text](weasel.png "Weasel")

To start, run a few ImageNet images through the network, and verify that the network classifies them correctly.

In [2]:
"""# NOTE: You don't need to edit this code.

from caffe_classes import class_names

# Initialize the Model
prob = probabilities()
init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)

# Read Images
im1 = (imread("poodle.png")[:,:,:3]).astype(float32)
im1 = im1 - mean(im1)

im2 = (imread("weasel.png")[:,:,:3]).astype(float32)
im2 = im2 - mean(im2)

# Run Inference
t = time.time()
output = sess.run(prob, feed_dict = {x:[im1,im2]})

# Print Output
for input_im_ind in range(output.shape[0]):
    inds = argsort(output)[input_im_ind,:]
    print("Image", input_im_ind)
    for i in range(5):
        print("%s: %.3f" % (class_names[inds[-1-i]], output[input_im_ind, inds[-1-i]]))
    print()

print("Time: %.3f seconds" % (time.time()-t))
"""

'# NOTE: You don\'t need to edit this code.\n\nfrom caffe_classes import class_names\n\n# Initialize the Model\nprob = probabilities()\ninit = tf.initialize_all_variables()\nsess = tf.Session()\nsess.run(init)\n\n# Read Images\nim1 = (imread("poodle.png")[:,:,:3]).astype(float32)\nim1 = im1 - mean(im1)\n\nim2 = (imread("weasel.png")[:,:,:3]).astype(float32)\nim2 = im2 - mean(im2)\n\n# Run Inference\nt = time.time()\noutput = sess.run(prob, feed_dict = {x:[im1,im2]})\n\n# Print Output\nfor input_im_ind in range(output.shape[0]):\n    inds = argsort(output)[input_im_ind,:]\n    print("Image", input_im_ind)\n    for i in range(5):\n        print("%s: %.3f" % (class_names[inds[-1-i]], output[input_im_ind, inds[-1-i]]))\n    print()\n\nprint("Time: %.3f seconds" % (time.time()-t))\n'

## Traffic Sign Inference
![alt text](construction.jpg "Construction Sign")
![alt text](stop.jpg "Stop Sign")

Next, run two of the traffic sign images through the network, and see how well the classifier performs.

You'll notice, however, that the AlexNet model expects a 227x227x3 pixel image, whereas the traffic sign images are 32x32x3 pixels.

In order to feed the traffic sign images into AlexNet, you'll need to resize the images to the dimensions that AlexNet expects.

You could resize the images outside of this program, but that would make for a huge collection of images. Instead, use the `tf.image.resize_images()` method to resize the images within the model.

In [3]:
"""
from caffe_classes import class_names

# TODO: Update the xdim, x, and resized variables to accomodate 32x32x3 pixel images.
train_x = zeros((1, 32,32,3)).astype(float32)
xdim = train_x.shape[1:]
x = tf.placeholder(tf.float32, (None,) + xdim)
resized = tf.image.resize_images(x, (227, 227))

# NOTE: You don't need to edit the code below.
# Initialize the Model
prob = probabilities()
init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)

# Read Images
im1 = (imread("construction.jpg")[:,:,:3]).astype(float32)
im1 = im1 - mean(im1)

im2 = (imread("stop.jpg")[:,:,:3]).astype(float32)
im2 = im2 - mean(im2)

# Run Inference
t = time.time()
output = sess.run(prob, feed_dict = {x:[im1,im2]})

# Print Output
for input_im_ind in range(output.shape[0]):
    inds = argsort(output)[input_im_ind,:]
    print("Image", input_im_ind)
    for i in range(5):
        print("%s: %.3f" % (class_names[inds[-1-i]], output[input_im_ind, inds[-1-i]]))
    print()

print("Time: %.3f seconds" % (time.time()-t))
"""

'\nfrom caffe_classes import class_names\n\n# TODO: Update the xdim, x, and resized variables to accomodate 32x32x3 pixel images.\ntrain_x = zeros((1, 32,32,3)).astype(float32)\nxdim = train_x.shape[1:]\nx = tf.placeholder(tf.float32, (None,) + xdim)\nresized = tf.image.resize_images(x, (227, 227))\n\n# NOTE: You don\'t need to edit the code below.\n# Initialize the Model\nprob = probabilities()\ninit = tf.initialize_all_variables()\nsess = tf.Session()\nsess.run(init)\n\n# Read Images\nim1 = (imread("construction.jpg")[:,:,:3]).astype(float32)\nim1 = im1 - mean(im1)\n\nim2 = (imread("stop.jpg")[:,:,:3]).astype(float32)\nim2 = im2 - mean(im2)\n\n# Run Inference\nt = time.time()\noutput = sess.run(prob, feed_dict = {x:[im1,im2]})\n\n# Print Output\nfor input_im_ind in range(output.shape[0]):\n    inds = argsort(output)[input_im_ind,:]\n    print("Image", input_im_ind)\n    for i in range(5):\n        print("%s: %.3f" % (class_names[inds[-1-i]], output[input_im_ind, inds[-1-i]]))\n    pr

## Feature Extraction
The problem is that AlexNet was trained on the [ImageNet](http://www.image-net.org/) database, which has 1000 classes of images. You can see the classes in the `caffe_classes.py` file. None of those classes involves traffic signs.

In order to successfully classify our traffic sign images, you need to remove the final, 1000-neuron classification layer and replace it with a new, 43-neuron classification layer.

This is called feature extraction, because you're basically extracting the image features captured by the penultimate layer, and passing them to a new classification layer.

In [4]:
# TODO: Redefine the logits() function to create a new fully-connected layer.
def logits(num_outputs=43):
    """Build a new, randomly initialised classification layer on top of features().

    This replaces the pretrained 1000-way fc8 layer. The weights are drawn
    from a truncated normal (stddev 0.05) and the biases start at 0.05, so the
    layer has to be trained before its outputs are meaningful.

    Args:
        num_outputs: Number of classes the new layer scores. Defaults to 43,
            the value that was previously hard-coded.

    Returns:
        The result of xw_plus_b on the features() output, with no activation
        applied.
    """
    # Width of the feature vector returned by features() (fc7 is specified
    # as fc(4096) in the network definition).
    num_inputs = 4096
    shape = [num_inputs, num_outputs]
    weights = tf.Variable(tf.truncated_normal(shape, stddev=0.05))
    biases = tf.Variable(tf.constant(0.05, shape=[num_outputs]))
    return tf.nn.xw_plus_b(features(), weights, biases)
"""
# NOTE: You don't need to edit the code below.
# Initialize the Model
prob = probabilities()
init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)

# Read Images
im1 = (imread("construction.jpg")[:,:,:3]).astype(float32)
im1 = im1 - mean(im1)

im2 = (imread("stop.jpg")[:,:,:3]).astype(float32)
im2 = im2 - mean(im2)

# Run Inference
t = time.time()
output = sess.run(prob, feed_dict = {x:[im1,im2]})

# Print Output
for input_im_ind in range(output.shape[0]):
    inds = argsort(output)[input_im_ind,:]
    print("Image", input_im_ind)
    for i in range(5):
        print("%s: %.3f" % (inds[-1-i], output[input_im_ind, inds[-1-i]]))
    print()

print("Time: %.3f seconds" % (time.time()-t))
"""

'\n# NOTE: You don\'t need to edit the code below.\n# Initialize the Model\nprob = probabilities()\ninit = tf.initialize_all_variables()\nsess = tf.Session()\nsess.run(init)\n\n# Read Images\nim1 = (imread("construction.jpg")[:,:,:3]).astype(float32)\nim1 = im1 - mean(im1)\n\nim2 = (imread("stop.jpg")[:,:,:3]).astype(float32)\nim2 = im2 - mean(im2)\n\n# Run Inference\nt = time.time()\noutput = sess.run(prob, feed_dict = {x:[im1,im2]})\n\n# Print Output\nfor input_im_ind in range(output.shape[0]):\n    inds = argsort(output)[input_im_ind,:]\n    print("Image", input_im_ind)\n    for i in range(5):\n        print("%s: %.3f" % (inds[-1-i], output[input_im_ind, inds[-1-i]]))\n    print()\n\nprint("Time: %.3f seconds" % (time.time()-t))\n'

## Training the Feature Extractor
The feature extractor you just created works, in the sense that data will flow through the network and result in predictions.

But the predictions aren't accurate, because you haven't yet trained the new classification layer.

In order to do that, you'll need to read in the training dataset and train the network with cross entropy.

Notice that in the network definition (look in the `features()` function), all of the layers are set to `trainable=False`. This freezes the weights of those layers, so you keep the trained AlexNet features and only train the final classification layer. This also makes training faster.

Training AlexNet (even just the final layer!) can take a little while, so it can be helpful to try out your code using only a small portion of the training set. Once you're confident your implementation works, you can use the entire training dataset to train the network.

In [5]:
# TODO: Load the training dataset.

# Load pickled data
import pickle

# TODO: fill this in based on where you saved the training and testing data
# Pickled training set; it is expected to hold 'features' and 'labels' entries.
# NOTE(review): pickle.load runs arbitrary code from the file -- only use a
# dataset file from a trusted source.
training_file = 'train.p'
with open(training_file, mode='rb') as f:
    train = pickle.load(f)

X_train = train['features']
y_train = train['labels']

# Sanity checks: one label per image, and every image is 32x32 with 3 channels.
assert X_train.shape[0] == y_train.shape[0], "The number of images is not equal to the number of labels."
assert X_train.shape[1:] == (32,32,3), "The dimensions of the images are not 32 x 32 x 3."

In [6]:
# TODO: Pre-process the input data.
# TODO: Implement data normalization here.
import random
import numpy as np

def normalize_greyscale(image_data, a=-0.5, b=0.5):
    """Min-max scale ``image_data`` linearly into the range ``[a, b]``.

    The smallest value of the input maps to ``a`` and the largest to ``b``.

    Args:
        image_data: Array-like of pixel values. Non-floating input is
            converted to float64 before any arithmetic.
        a: Lower bound of the output range (default -0.5).
        b: Upper bound of the output range (default 0.5).

    Returns:
        An array with the same shape as ``image_data``. If every value in the
        input is identical there is no range to stretch, so the result is
        filled with ``a`` instead of dividing by zero.
    """
    data = np.asarray(image_data)
    if not np.issubdtype(data.dtype, np.floating):
        # Integer subtraction can wrap around (e.g. int8: 127 - (-128)),
        # so do all arithmetic in floating point.
        data = data.astype(np.float64)

    x_min = np.min(data)
    x_max = np.max(data)
    span = x_max - x_min
    if span == 0:
        # Constant image: avoid 0/0 producing NaN.
        return np.full(data.shape, a, dtype=data.dtype)
    return a + (data - x_min) * (b - a) / span

def preprocess_all_images(X_train, img_size):
    """Normalise every image in `X_train` with normalize_greyscale().

    The output buffer is allocated up front with shape
    (number of images, img_size, img_size, 3); each image is normalised on
    its own and written into its slot. Returns the filled buffer.
    """
    image_count = X_train.shape[0]
    normalized = np.zeros([image_count, img_size, img_size, 3])
    for index in range(image_count):
        normalized[index] = normalize_greyscale(X_train[index])
    return normalized

# Normalise the whole training set, then check that the result is roughly
# zero-centred and spans exactly -0.5 .. 0.5.
X_train = preprocess_all_images(X_train, 32)
assert round(np.mean(X_train)) == 0, "The mean of the input data is: %f" % np.mean(X_train)
assert (
    np.min(X_train) == -0.5 and np.max(X_train) == 0.5
), "The range of the input data is: %.1f to %.1f" % (np.min(X_train), np.max(X_train))

In [7]:
# TODO: Once you are confident that the training works, update the training set to use all of the data.
# TODO: Compile and train the model to measure validation accuracy.
def random_batch(Image_train, labels_train, batch_size):
    """Draw `batch_size` distinct random examples from the training data.

    The same random indices (sampled without replacement) select rows from
    both `Image_train` and `labels_train`, so images and labels stay paired.
    Returns the tuple (features_batch, labels_batch).
    """
    picks = np.random.choice(len(Image_train), size=batch_size, replace=False)
    return Image_train[picks, :], labels_train[picks]

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix

# Binarize the integer class labels and store them as float32 so they can be
# fed to the float32 label placeholder.
encoder = LabelBinarizer()
Y_train = encoder.fit(y_train).transform(y_train).astype(np.float32)

In [8]:
# Input: batches of 32x32x3 images, resized inside the graph to 227x227.
train_x = zeros((1, 32, 32, 3)).astype(float32)
xdim = train_x.shape[1:]
x = tf.placeholder(tf.float32, (None,) + xdim)
resized = tf.image.resize_images(x, (227, 227))

# Labels: one row of class indicators per image, plus the class index.
num_classes = 43
y_true = tf.placeholder(tf.float32, shape=[None, num_classes], name='y_true')
y_true_cls = tf.argmax(y_true, 1)

# Predictions from the network.
layer_logits = logits()
y_pred = tf.nn.softmax(layer_logits)
y_pred_cls = tf.argmax(y_pred, 1)

# Loss and optimizer: mean softmax cross entropy minimised with Adam.
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=layer_logits, labels=y_true)
cost = tf.reduce_mean(cross_entropy)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(cost)

# Accuracy: fraction of images whose predicted class equals the true class.
correct_prediction = tf.equal(y_pred_cls, y_true_cls)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [9]:
# Saver for ease of retraining
# Saver that keeps at most the 10 most recent checkpoints.
saver = tf.train.Saver(max_to_keep=10)
save_dir = 'checkpoints/'

# Create the checkpoint directory if needed. exist_ok avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs(save_dir, exist_ok=True)

# Path prefix handed to saver.save().
save_path = os.path.join(save_dir, 'alexnet_transfer')

In [10]:
# Build the op that initialises every variable in the graph, open a session
# and run that op so the variables hold their initial values.
# NOTE(review): tf.initialize_all_variables appears to be deprecated in later
# TensorFlow releases in favour of tf.global_variables_initializer -- confirm
# against the installed version before changing it.
init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)

In [None]:
# TODO: Train the network.
# Taken from Hvass Labs, with minor modifications, as it is vastly 
# easier to define the network this way
# https://github.com/Hvass-Labs/TensorFlow-Tutorials

from datetime import timedelta
import time

# Number of training examples drawn per optimisation step in optimize().
train_batch_size = 2

def optimize(num_iterations, save_checkpoints=False):
    """Run `num_iterations` optimiser steps on random training batches.

    Uses the module-level session (`sess`), graph nodes (`x`, `y_true`,
    `optimizer`, `accuracy`), training data (`X_train`, `Y_train`) and
    `train_batch_size`.

    Every 100 iterations, and after the last one, the accuracy on the current
    batch and the time elapsed since the previous report are printed. The
    total run time is printed at the end.

    Args:
        num_iterations: Number of optimisation steps to run.
        save_checkpoints: When True, write a checkpoint through the
            module-level `saver` to `save_path` every 1000 iterations and
            after the last one. Off by default, matching the previous
            behaviour where saving was commented out.
    """
    # Start-time used for printing time-usage below.
    start_time = time.time()
    end_batch_time = start_time

    for i in range(num_iterations):
        # Time of the previous status report (or of the start of the run).
        start_batch_time = end_batch_time

        # Get a batch of training images and their true labels.
        x_batch, y_true_batch = random_batch(X_train, Y_train, train_batch_size)

        # Map the batch onto the placeholders of the TensorFlow graph.
        feed_dict_train = {x: x_batch,
                           y_true: y_true_batch}

        # Run one optimisation step on this batch.
        sess.run(optimizer, feed_dict=feed_dict_train)

        is_last = i == num_iterations - 1

        # Print status every 100 iterations and after the last iteration.
        if i % 100 == 0 or is_last:
            end_batch_time = time.time()

            # Accuracy on the batch that was just trained on.
            acc_train = sess.run(accuracy, feed_dict=feed_dict_train)

            time_batch_dif = end_batch_time - start_batch_time

            msg = "Global Step: {0:>6}, Training Batch Accuracy: {1:>6.1%}"
            print(msg.format(i, acc_train))
            print("Steps time usage: " + str(timedelta(seconds=int(round(time_batch_dif)))))

        # Optionally save a checkpoint every 1000 iterations (and at the end).
        if save_checkpoints and (i % 1000 == 0 or is_last):
            saver.save(sess,
                       save_path=save_path)
            print("Saved checkpoint.")

    # Total time for the whole run.
    time_dif = time.time() - start_time
    print("Total time usage: " + str(timedelta(seconds=int(round(time_dif)))))

In [None]:
# Train for 1000 steps; optimize() prints progress while it runs.
optimize(num_iterations=1000)

Global Step:      0, Training Batch Accuracy: 100.0%
Steps time usage: 0:00:01
Global Step:    100, Training Batch Accuracy:   0.0%
Steps time usage: 0:00:58


**Validation Accuracy:** (fill in here)

## Congratulations!
You've trained AlexNet as a feature extractor!

Don't be discouraged if your validation accuracy still isn't as high as you'd like.

Coming up, you'll explore other networks to use for transfer learning, as well as approaches to improve accuracy.