# Transfer Learning using Keras
In this notebook I will train a dog breed classifier using Keras and transfer learning techniques.  In total there are 20K images from 119 dog breeds.  The steps in the notebook are:
 * Preprocess images: normalization, uniform scaling
 * Train the classifier
 * Evaluate and validate the model
 * Improvements using Data Augmentation to help low stat classes

In [1]:
# import dependencies
%matplotlib inline
import csv as csv
import numpy as np
import pandas as pd
import pylab as py
import operator, re, progressbar, sys
import multiprocessing
from collections import Counter
import matplotlib.pyplot as plt
from operator import itemgetter
import pickle, logging
from skimage import color, exposure, transform, io
from time import time
import codecs, glob
from tempfile import TemporaryFile
import os

from keras.utils import to_categorical
from keras import applications
from keras.preprocessing.image import ImageDataGenerator
from keras import optimizers
from keras.models import Sequential, Model 
from keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D
from keras import backend as k 
from keras.engine.topology import Input
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, TensorBoard, EarlyStopping

# My modules
sys.path.insert(0,"/Users/HAL3000/Dropbox/coding/my_modules/")
import keras_modules as my_keras_modules
import misc_modules as misc

Using TensorFlow backend.
  return f(*args, **kwds)


In [12]:
# Filesystem locations of the dataset.
# root_dir already ends in '/', so the two derived paths contain a doubled slash.
root_dir = '/Users/HAL3000/Dropbox/coding/Insight/Tinder/data/dog_breeds/Images/'
train_data_dir = root_dir + '/train/'
test_data_dir = root_dir + '/test/'

# Dataset constants
# NOTE(review): the data generators further down report 122 classes -- confirm
# that 120 is the right size for the softmax layer.
NUM_CLASSES = 120   # width of the final softmax layer
IMG_SIZE = 48       # images are resized to IMG_SIZE x IMG_SIZE pixels

# Network inputs
#nb_train_samples = 4125
#nb_validation_samples = 466
k.set_image_dim_ordering('tf')
batch_size = 120
epochs = 1

# Scratch check: 1000 zeros followed by 1000 ones; index 999 is the last zero.
# train_labels is rebound to an empty list in a later cell.
train_labels = np.repeat([0, 1], 1000)
print(train_labels[999])

0


## Import the Data
Images vary in size and brightness.  Let's crop, scale, and normalize each image (the color information is kept).

In [3]:
def preprocess_img(img):
    '''Equalise brightness, centre-crop to a square and resize one image.

    Parameters
    ----------
    img : ndarray
        Image array as returned by ``io.imread``.  An (H, W, 3) RGB array is
        the expected input; (H, W) greyscale and (H, W, 4) RGBA arrays are
        converted to RGB first so they no longer crash ``rgb2hsv``.

    Returns
    -------
    ndarray
        Float image of shape (3, IMG_SIZE, IMG_SIZE), colour axis first.
    '''
    # rgb2hsv needs exactly three channels: expand greyscale, drop alpha
    if img.ndim == 2:
        img = color.gray2rgb(img)
    elif img.shape[-1] == 4:
        img = img[..., :3]

    # Histogram normalization in v channel
    # This normalizes the intensity, but leaves color alone
    hsv = color.rgb2hsv(img)
    hsv[:, :, 2] = exposure.equalize_hist(hsv[:, :, 2])
    img = color.hsv2rgb(hsv)

    # central square crop, side length taken from the shorter image edge
    half_side = min(img.shape[:-1]) // 2
    centre_row, centre_col = img.shape[0] // 2, img.shape[1] // 2
    img = img[centre_row - half_side:centre_row + half_side,
              centre_col - half_side:centre_col + half_side,
              :]

    # rescale to standard size
    img = transform.resize(img, (IMG_SIZE, IMG_SIZE))

    # roll color axis to axis 0
    # NOTE(review): this yields channels-first data, while the notebook sets
    # 'tf' dim ordering and builds VGG16 with input_shape
    # (IMG_SIZE, IMG_SIZE, 3) -- confirm before feeding these arrays to the model.
    img = np.rollaxis(img, -1)

    return img

### Now we will get each class with its label from the path and store into numpy arrays

In [4]:
def get_class(img_path):
    '''Return the integer class label encoded in an image's parent directory.

    The parent directory name (e.g. ``30-Norwich_terrier``) is stripped of
    every non-digit character and the remaining digits are parsed as an int.
    '''
    # '.../train/30-Norwich_terrier/img.jpg' -> '30-Norwich_terrier'
    parent_dir = img_path.rsplit('/', 2)[-2]
    digits_only = re.sub('[^0-9]', '', parent_dir)
    return int(digits_only)

# Module-level accumulators that process_all_images() appends to in place.
train_imgs   = []
train_labels = []
test_imgs    = []
test_labels  = []  # was misspelled `tets_labels`, leaving `test_labels` undefined


def _load_images(img_paths, imgs_out, labels_out):
    '''Preprocess each image in img_paths, appending image and label to the given lists.'''
    for i, img_path in enumerate(img_paths):
        img = preprocess_img(io.imread(img_path))
        label = get_class(img_path)
        # Report progress on every 100th image only
        if i % 100 == 0:
            print('---> Processing Image:', img_path)
            print('Saving Class Label:', label)
        imgs_out.append(img)
        labels_out.append(label)


def process_all_images():
    ''' Fills the module-level image and label lists for the train and test sets.

    Globs ``*/*.jpg`` under train_data_dir and test_data_dir, preprocesses
    every file and appends the results to train_imgs / train_labels and
    test_imgs / test_labels.  Returns nothing.
    '''
    train_path = glob.glob(os.path.join(train_data_dir, '*/*.jpg'))
    test_path  = glob.glob(os.path.join(test_data_dir, '*/*.jpg'))

    # Print counts rather than the full path lists: dumping every path
    # floods the notebook output.
    print('Train images found:', len(train_path))
    print('Test images found:', len(test_path))

    _load_images(train_path, train_imgs, train_labels)
    _load_images(test_path, test_imgs, test_labels)
        

In [5]:
#process_all_images()

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.
  warn("The default mode, 'constant', will be changed to 'reflect' in "


---> Processing Image: /Users/HAL3000/Dropbox/coding/Insight/Tinder/data/dog_breeds/Images//train/30-Norwich_terrier/n02094258_3165.jpg
Saving Class Label: 30
---> Processing Image: /Users/HAL3000/Dropbox/coding/Insight/Tinder/data/dog_breeds/Images//train/30-Norwich_terrier/n02094258_312.jpg
Saving Class Label: 30
---> Processing Image: /Users/HAL3000/Dropbox/coding/Insight/Tinder/data/dog_breeds/Images//train/77-English_setter/n02100735_634.jpg
Saving Class Label: 77
---> Processing Image: /Users/HAL3000/Dropbox/coding/Insight/Tinder/data/dog_breeds/Images//train/91-Samoyed/n02111889_5267.jpg
Saving Class Label: 91
---> Processing Image: /Users/HAL3000/Dropbox/coding/Insight/Tinder/data/dog_breeds/Images//train/112-French_bulldog/n02108915_311.jpg
Saving Class Label: 112
---> Processing Image: /Users/HAL3000/Dropbox/coding/Insight/Tinder/data/dog_breeds/Images//train/112-French_bulldog/n02108915_350.jpg
Saving Class Label: 112
---> Processing Image: /Users/HAL3000/Dropbox/coding/Insi

KeyboardInterrupt: 

In [None]:
# Build the training arrays once and cache them on disk; later runs load the cache.
if not os.path.exists('weights/x_train.npz'):
    if not train_imgs:
        raise RuntimeError('train_imgs is empty: run process_all_images() first')
    print('Number of training labels:', len(train_labels), '| NUM_CLASSES:', NUM_CLASSES)

    # Now make the x,y train arrays and save
    X = np.array(train_imgs, dtype='float32')

    # Make one hot targets
    # NOTE(review): without num_classes, to_categorical sizes the encoding
    # from the largest label, which may not equal NUM_CLASSES -- confirm.
    Y = to_categorical(train_labels)

    # save the train features and labels (np.savez appends the .npz suffix)
    os.makedirs('weights', exist_ok=True)
    np.savez("weights/x_train", X)
    np.savez("weights/y_train", Y)

    # Bind the names the rest of the notebook reads, as the cached branch does
    x_train, y_train = X, Y
else:
    x_train = np.load("weights/x_train.npz")['arr_0']
    y_train = np.load("weights/y_train.npz")['arr_0']

# Check the shapes
print('X train shape:', x_train.shape)
print('Y train shape:', y_train.shape)

### Initialize the frozen model and my fully connected layers

In [None]:
# Convolutional base: VGG16 with ImageNet weights, without its top classifier layers
model = applications.VGG16(
    include_top=False,
    weights='imagenet',
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
)

In [None]:
# Freeze every layer of the pre-trained base; only the new head below is trainable
for layer in model.layers:
    layer.trainable = False

# New classifier head: two 150-unit ReLU layers with dropout in between,
# followed by a softmax over NUM_CLASSES outputs
x = Flatten()(model.output)
x = Dense(150, activation="relu")(x)
x = Dropout(0.5)(x)
x = Dense(150, activation="relu")(x)
predictions = Dense(NUM_CLASSES, activation="softmax")(x)

# Join the frozen base and the new head into a single model
model_final = Model(inputs=model.input, outputs=predictions)

# Compile with SGD + momentum and report accuracy during training
model_final.compile(
    loss="categorical_crossentropy",
    optimizer=optimizers.SGD(lr=0.01, momentum=0.9),
    metrics=["accuracy"],
)

In [6]:
# Options shared by both directory iterators
flow_options = dict(
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=batch_size,
    class_mode="categorical",
)

# Training images: scale pixel values by 1/255 and augment on the fly
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=30,
    width_shift_range=0.3,
    height_shift_range=0.3,
    zoom_range=0.3,
    horizontal_flip=True,
    fill_mode="nearest",
)
train_generator = train_datagen.flow_from_directory(train_data_dir, **flow_options)

# Test images: scaling only, no augmentation
test_datagen = ImageDataGenerator(rescale=1./255)
test_generator = test_datagen.flow_from_directory(test_data_dir, **flow_options)


Found 14458 images belonging to 122 classes.
Found 6122 images belonging to 122 classes.


In [None]:
# Keep the best model seen so far (by validation accuracy), checked every epoch
checkpoint = ModelCheckpoint("vgg16_12.h5", monitor='val_acc', verbose=1, save_best_only=True, save_weights_only=False, mode='auto', period=1)
# Stop when validation accuracy has not improved for 10 consecutive epochs
early = EarlyStopping(monitor='val_acc', min_delta=0, patience=10, verbose=1, mode='auto')

# Keras 2 counts *batches* per epoch, not samples.  The legacy keywords
# samples_per_epoch / nb_val_samples are only renamed, not divided by the
# batch size, so passing sample counts would run batch_size times too many steps.
steps_per_epoch = int(np.ceil(train_generator.samples / float(batch_size)))
validation_steps = int(np.ceil(test_generator.samples / float(batch_size)))

# Re-train our layers (the test generator doubles as validation data here)
model_final.fit_generator(
    train_generator,
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    validation_data=test_generator,
    validation_steps=validation_steps,
    callbacks=[checkpoint, early],
)