In [1]:
from contextlib import suppress
import numpy as np
import os
import shutil
from zipfile import ZipFile

from PIL import Image
from skimage.external.tifffile import imread

from keras import applications
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dropout, Flatten, Dense

Using TensorFlow backend.


# Data Preparation

### Download the [UC Merced Land Use dataset](http://vision.ucmerced.edu/datasets/landuse.html)

### Extract image files from the zipped archive, if necessary

In [2]:
# Location of the extracted class folders; later cells read images from here.
source_dir = os.path.join('data', 'UCMerced_LandUse', 'Images')

# Create the top-level data folder; one that already exists is fine.
try:
    os.mkdir('data')
except FileExistsError:
    pass

# Unpack the archive only when the image tree is not already on disk.
if not os.path.isdir(source_dir):
    with ZipFile('UCMerced_LandUse.zip') as archive:
        archive.extractall(path='data')

#### NOTE: I discovered that not all image files are 256x256 pixels, as claimed on the UC Merced Land Use Dataset site. To work with the Keras models, all images must have the same dimensions, so I resize them to a common shape below.

In [3]:
def get_image_filepaths(start_dir):
    """
    Helper function to walk a directory structure, collecting
    file pathnames for all TIFF images.

    Input:
        start_dir: directory where walking starts

    Returns:
        Sorted list of the pathnames of every file below start_dir whose
        name ends with '.tif' (empty list when nothing matches)
    """
    # os.walk reports entries in whatever order the filesystem returns them.
    # Sorting gives callers a stable starting order, which a seeded shuffle
    # needs in order to be reproducible from one machine to the next.
    return sorted(
        os.path.join(root, name)
        for root, _, names in os.walk(start_dir)
        for name in names
        if name.endswith('.tif')
    )

In [4]:
def class_name_from_path(filepath):
    """
    Return an image's class name, i.e. the name of the directory
    that directly contains the image file.
    """
    containing_dir = os.path.dirname(filepath)
    return os.path.basename(containing_dir)

### Transform each image to a common shape; place in train or validate folder

In [5]:
out_dir = os.path.join('data', 'transformed')
train_dir = os.path.join(out_dir, 'train')
validate_dir = os.path.join(out_dir, 'validate')

# Probability that an image is routed to the training folder, and the upper
# bound on the side length of the resized (square) images.
TRAIN_FRACTION = 0.80
MAX_DIM = 256

# Get the source image file pathnames from "data/UCMerced_LandUse/Images".
# They are sorted before shuffling because directory-walk order depends on
# the filesystem; without a fixed starting order the seed below would not
# make the shuffle (and therefore the split) reproducible.
filepaths = sorted(get_image_filepaths(source_dir))
if not filepaths:
    # Stop before wiping any previous output when there is nothing to process
    raise FileNotFoundError(f'No .tif images found under {source_dir}')

# Delete the existing "transformed" directory (and all subdirectories and files)
with suppress(FileNotFoundError):
    shutil.rmtree(out_dir)

# Make new, empty directories (makedirs also creates out_dir itself)
os.makedirs(train_dir)
os.makedirs(validate_dir)

# Create train and validate label lists
train_labels = []
validate_labels = []

# Randomly shuffle the image file pathnames (seed for reproducibility)
np.random.seed(8)
np.random.shuffle(filepaths)

# Pass 1:
# 1) Collect image classes {name: number} in the "class_num_by_class_name" dictionary
# 2) Determine the smallest image dimension (never larger than MAX_DIM)

class_num_by_class_name = dict()
class_num_by_filepath = dict()

new_dim = MAX_DIM
for filepath in filepaths:
    # Derive image file's class name from the file pathname
    class_name = class_name_from_path(filepath)

    # Add (potentially) new class to "class_num_by_class_name" dictionary;
    # class numbers are handed out in order of first appearance
    class_num_by_class_name.setdefault(class_name, len(class_num_by_class_name))

    # Store class number for future reference
    class_num_by_filepath[filepath] = class_num_by_class_name[class_name]

    # Find the minimum height or width dimension of all images
    with Image.open(filepath) as img:
        new_dim = min(new_dim, min(img.size))

# Pass 2:
# 1) Randomly split (roughly 80/20) images between the train and validate directories
# 2) Resize all images to a common (new_dim, new_dim) size
# 3) Save class label information for each image

# Zero-pad the numeric file names to a fixed width (at least 4 digits) so that
# sorting the file names lexicographically reproduces the order in which the
# labels are appended below.
name_width = max(4, len(str(len(filepaths) - 1)))

for image_num, filepath in enumerate(filepaths):
    # Separate images between train/validate directories
    if np.random.rand() < TRAIN_FRACTION:
        target, target_labels = train_dir, train_labels
    else:
        target, target_labels = validate_dir, validate_labels

    # Name images in numbered format nnnn.tif
    _, ext = os.path.splitext(filepath)
    path = os.path.join(target, str(image_num).zfill(name_width) + ext)

    # Resize image to common shape and save to target directory
    with Image.open(filepath) as img:
        img.resize((new_dim, new_dim)).save(path)

    # Capture class label to "train_labels" or "validate_labels" list
    target_labels.append(class_num_by_filepath[filepath])

# Compute bottleneck features with a pretrained CNN

In [6]:
def image_generator(image_dir, batch_size=64):
    """
    Yield batches of scaled images read from a directory.

    Files are visited in sorted file-name order. os.listdir on its own
    returns names in arbitrary order, which would make the row order of the
    yielded batches unpredictable and impossible to match against a label
    list built in file-name order.

    Input:
        image_dir: directory containing the image files to read
        batch_size: number of images per batch; the last batch may be smaller

    Yields:
        Array stacking up to batch_size images, each divided by 255.0
    """
    images = []
    for filename in sorted(os.listdir(image_dir)):
        image = imread(os.path.join(image_dir, filename))
        # Scale pixel values to the range 0.0 - 1.0
        # (assumes 8-bit image data, i.e. values in 0..255)
        images.append(image / 255.0)
        if len(images) == batch_size:
            batch = np.array(images)
            # Start a fresh list so the yielded batch is not mutated later
            images = []
            yield batch
    # Emit whatever is left over when the file count is not a multiple
    # of batch_size
    if images:
        yield np.array(images)

In [9]:
# Load Xception with ImageNet weights and without its top (classification)
# layers, so that its predictions can be used as bottleneck features.
# applications.VGG16 takes the same two arguments and can be swapped in here.
model = applications.Xception(
    weights='imagenet',
    include_top=False,
)

In [10]:
# Run every image through the pretrained network once and keep the outputs
# both in memory (bn_features) and on disk (bn_<job_name>.npy).
bn_features = dict()
for job_name, image_dir in [('training', train_dir), ('validation', validate_dir)]:
    print(f'Generating {job_name} bottleneck predictions')
    pred_batches = []
    for n, X in enumerate(image_generator(image_dir, batch_size=64)):
        pred = model.predict_on_batch(X)
        pred_batches.append(pred)
        # '\r' returns the cursor to the line start so the progress line
        # overwrites itself instead of scrolling
        print(f'Batch {n+1}, shape {pred.shape}', end='\r')

    if not pred_batches:
        # np.concatenate would raise the same exception type with a message
        # that does not say which directory was empty
        raise ValueError(f'No images found in {image_dir}')

    # Stack the per-batch predictions and store them in the bn_features dictionary
    bn_features[job_name] = np.concatenate(pred_batches)

    filename = 'bn_' + job_name + '.npy'
    np.save(filename, bn_features[job_name])
    # Report only once the file has actually been written
    print(f'\nFeature weights saved to {filename}')

Generating training bottleneck predictions
Batch 27, shape (39, 8, 8, 2048)
Feature weights saved to bn_training.npy
Generating validation bottleneck predictions
Batch 7, shape (13, 8, 8, 2048)
Feature weights saved to bn_validation.npy


# Train a new fully-connected model, using bottleneck features as input

In [11]:
# One-hot encode the integer class labels; each encoded array gets one column
# per entry in class_num_by_class_name.
num_classes = len(class_num_by_class_name)
Y_train, Y_valid = (
    to_categorical(label_list, num_classes)
    for label_list in (train_labels, validate_labels)
)

In [14]:
def train_top_model(epochs=25, batch_size=64,
                    top_model_weights_path='bottleneck_model.h5'):
    """
    Train a small fully-connected classifier on the saved bottleneck features
    and write its weights to disk.

    Features are loaded from 'bn_training.npy' and 'bn_validation.npy' in the
    working directory. The one-hot label arrays Y_train / Y_valid and the
    class count num_classes are read from the notebook's global namespace.

    Input:
        epochs: number of passes over the training features
        batch_size: number of samples per gradient update
        top_model_weights_path: file the trained weights are saved to

    Raises:
        ValueError: if the number of cached feature rows differs from the
            number of labels (e.g. the .npy files are left over from an
            earlier run with a different train/validate split)
    """
    train_data = np.load('bn_training.npy')
    validation_data = np.load('bn_validation.npy')

    # The features come from disk while the labels come from kernel memory,
    # so make sure the two actually belong together before fitting.
    if len(train_data) != len(Y_train) or len(validation_data) != len(Y_valid):
        raise ValueError(
            'Bottleneck features and labels differ in length: '
            f'train {len(train_data)} vs {len(Y_train)}, '
            f'validation {len(validation_data)} vs {len(Y_valid)}')

    # Named top_model so the pretrained network held in the notebook-level
    # "model" variable is not shadowed inside this function.
    top_model = Sequential()
    # Input shape is one sample's feature block (batch axis dropped)
    top_model.add(Flatten(input_shape=train_data.shape[1:]))
    top_model.add(Dense(256, activation='relu'))
    top_model.add(Dropout(0.5))
    top_model.add(Dense(num_classes, activation='softmax'))

    top_model.compile(optimizer='adam',
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])

    top_model.fit(train_data, Y_train,
                  epochs=epochs,
                  batch_size=batch_size,
                  validation_data=(validation_data, Y_valid))

    top_model.save_weights(top_model_weights_path)

In [15]:
# Fit the fully-connected classifier on the saved bottleneck features and
# save its weights
train_top_model()

Train on 1703 samples, validate on 397 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
