In [1]:
from collections import defaultdict
from contextlib import suppress
import numpy as np
import os
from zipfile import ZipFile

from PIL import Image
from skimage.external.tifffile import imread

from keras import applications
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dropout, Flatten, Dense

Using TensorFlow backend.


# Data Preparation

### Download the [UC Merced Land Use dataset](http://vision.ucmerced.edu/datasets/landuse.html)

### Extract image files from the zipped archive, if necessary

In [2]:
# Make sure the top-level "data" directory is present; an already
# existing one is left untouched.
try:
    os.mkdir('data')
except FileExistsError:
    pass

# Directory in which the source images are looked up. The archive is only
# unpacked (into "data") when this directory is not there yet.
source_dir = os.path.join('data', 'UCMerced_LandUse', 'Images')
if not os.path.isdir(source_dir):
    with ZipFile('UCMerced_LandUse.zip') as archive:
        archive.extractall('data')

#### NOTE: Contrary to the claim on the UC Merced Land Use Dataset site, not all image files are 256x256 pixels. To work with the Keras models, all images must have the same dimensions, so I resize them to a common shape below.

In [3]:
def get_image_filepaths(start_dir):
    """
    Helper function to walk a directory structure, collecting
    file pathnames for all TIFF images.

    Input:
        start_dir: directory where walking starts

    Returns:
        List of TIFF file pathnames (".tif" or ".tiff", any letter case),
        sorted so that the result does not depend on the order in which
        the filesystem happens to list directory entries. An empty list
        is returned when nothing matches.
    """
    tiff_suffixes = ('.tif', '.tiff')
    # os.walk yields entries in filesystem order; sorting makes any seeded
    # shuffle applied to the result repeatable from one machine to the next.
    return sorted(
        os.path.join(root, file)
        for root, _, files in os.walk(start_dir)
        for file in files
        if file.lower().endswith(tiff_suffixes)
    )

In [4]:
def class_name_from_path(filepath):
    """
    Helper function returning an image's class name, which is taken to be
    the name of the directory that directly contains the image file.
    """
    containing_dir = os.path.dirname(filepath)
    return os.path.basename(containing_dir)

In [5]:
def choose_target():
    """
    Helper function that randomly picks the 'train', 'validate' or 'test'
    directory for one image, using NumPy's global random generator.
    """
    # Roughly 80% of the draws go to the training split
    train_fraction = 0.8
    if np.random.rand() >= train_fraction:
        # A second draw divides the remainder evenly: validation vs. test
        if np.random.rand() < 0.5:
            return 'validate'
        return 'test'
    return 'train'

In [6]:
def make_image_filename(image_num, class_num):
    """
    Helper function to build a "<image#>_<class#>.tif" file name, with the
    image number zero-padded to 4 characters and the class number to 2.
    """
    image_part = str(image_num).zfill(4)
    class_part = str(class_num).zfill(2)
    return '{}_{}.tif'.format(image_part, class_part)

In [7]:
def classnum_from_filename(filename):
    """
    Helper function to recover the class number from a file name: the last
    2 characters of the text preceding the first '.' are parsed as an int.
    """
    stem, _, _ = filename.partition('.')
    return int(stem[-2:])

### Define transformed-image target directory paths

In [8]:
"""
Create image directory hierarchy that looks like this:
data/transformed/
                 train/
                       ...
                 validate/
                       ...
                 test/
                       ...
"""
# Root of the transformed-image tree, plus one sub-directory per data split
out_dir = os.path.join('data', 'transformed')
target_dirs = dict(
    (split_name, os.path.join(out_dir, split_name))
    for split_name in ('train', 'validate', 'test')
)

### Transform each image to a common shape; place in train, validate, or test folder

In [9]:
# Seed NumPy's global random generator so that repeated runs make the
# same random choices
SEED = 2017
np.random.seed(SEED)

In [10]:
# Class label numbers per split ('train' / 'validate' / 'test'); filled in
# at the end of this cell from the file names in each target directory.
labels = defaultdict(list)

# Assume if directory "transformed" exists, it contains all the processed images.
if not os.path.isdir(out_dir):
    # Make new directories
    os.mkdir(out_dir)
    for target in target_dirs:
        os.mkdir(target_dirs[target])

    # Get a list of the source image file pathnames from "data/UCMerced_LandUse/Images"
    filepaths = get_image_filepaths(source_dir)

    # Randomly shuffle the image file pathnames
    np.random.shuffle(filepaths)

    # Pass 1:
    # 1) Collect image classes {name: number} in the "class_num_by_class_name" dictionary
    # 2) Determine the smallest image dimension

    class_num_by_class_name = dict()
    class_num_by_filepath = dict()

    # Start from the nominal 256 pixels; only ever shrinks below
    new_dim = 256
    for filepath in filepaths:
        # Derive image file's class name from the file pathname
        class_name = class_name_from_path(filepath)

        # Add (potentially) new class to "class_num_by_class_name" dictionary;
        # classes are numbered 0, 1, 2, ... in order of first appearance
        class_num_by_class_name.setdefault(class_name, len(class_num_by_class_name))

        # Store class number for future reference
        class_num_by_filepath[filepath] = class_num_by_class_name[class_name]

        # Find the minimum height or width dimension of all images
        with Image.open(filepath) as img:
            new_dim = min(new_dim, min(img.size))

    # Pass 2:
    # 1) Randomly split images between the train, validate, and test directories
    # 2) Resize all images to a common (new_dim, new_dim) size
    # 3) Encode each image's class number in its file name

    for image_num, filepath in enumerate(filepaths):
        with Image.open(filepath) as img:
            # Separate images between train/validate/test directories
            target = choose_target()

            # Name images in numbered format <image#>_<class#>.tif
            class_num = class_num_by_filepath[filepath]
            path = os.path.join(target_dirs[target], make_image_filename(image_num, class_num))

            # Resize image to common shape and save to target directory
            img.resize((new_dim, new_dim)).save(path)

# Whether the images were just written or already existed, read the labels
# back from the file names, one directory listing per split.
#
# image_generator walks each directory with os.listdir, so the labels are
# gathered with the very same call: label i then belongs to image i.
# Appending labels in image-number order while transforming (as was done
# before) only lines up when the listing happens to come back sorted.
# NOTE: os.listdir order is arbitrary; this relies on it being the same
# for repeated listings of an unchanged directory.
for target_name, target_dir in target_dirs.items():
    for filename in os.listdir(target_dir):
        labels[target_name].append(classnum_from_filename(filename))

# Compute bottleneck features with a pretrained CNN

In [11]:
def image_generator(image_dir, batch_size=64):
    """
    Generator yielding the images found in a directory as batched arrays.

    Files are read in the order os.listdir returns them. Every image is
    divided by 255.0 (scaling 8-bit pixel values to the range 0.0 - 1.0)
    before being added to the current batch.

    Input:
        image_dir: directory holding the image files
        batch_size: number of images per yielded batch

    Yields:
        NumPy arrays stacking batch_size images each; the final array is
        smaller when the file count is not a multiple of batch_size.
    """
    batch = []
    for name in os.listdir(image_dir):
        pixels = imread(os.path.join(image_dir, name))
        batch.append(pixels / 255.0)
        if len(batch) < batch_size:
            continue
        # Batch is full: hand it over, then start collecting the next one
        yield np.array(batch)
        batch = []
    # Leftover images that did not fill a complete batch
    if batch:
        yield np.array(batch)

In [12]:
# Pretrained ImageNet network used as a fixed feature extractor;
# include_top=False leaves off its final classification layers.
# Alternative backbone: applications.VGG16(include_top=False, weights='imagenet')
model = applications.Xception(weights='imagenet', include_top=False)

In [13]:
# Bottleneck features per split, keyed by 'train' / 'validate'
bn_features = dict()
for task in ('train', 'validate'):
    print(f'Generating "{task}" bottleneck predictions:')

    # Run every batch of this split's images through the pretrained network
    pred_batches = []
    batches = image_generator(target_dirs[task], batch_size=64)
    for batch_num, X in enumerate(batches, start=1):
        pred = model.predict_on_batch(X)
        pred_batches.append(pred)
        # '\r' keeps the progress report on a single, overwritten line
        print(f'\tBatch {batch_num}, shape {pred.shape}', end='\r')

    # Stack the per-batch predictions, keep them in bn_features, and
    # write them to "bn_<task>.npy"
    bn_features[task] = features = np.concatenate(pred_batches)
    filename = f'bn_{task}.npy'
    with open(filename, 'wb') as f:
        print(f'\n\tFeature weights, shape {features.shape}, saved to {filename}\n')
        np.save(f, features)

Generating "train" bottleneck predictions:
	Batch 27, shape (11, 8, 8, 2048)
	Feature weights, shape (1675, 8, 8, 2048), saved to bn_train.npy

Generating "validate" bottleneck predictions:
	Batch 4, shape (24, 8, 8, 2048)
	Feature weights, shape (216, 8, 8, 2048), saved to bn_validate.npy



# Train a fully-connected model using bottleneck features

In [14]:
# Use bottleneck features as fully-connected model input data
X_train = bn_features['train']
X_validate = bn_features['validate']

# Fail fast if features and labels got out of step: every feature row
# needs exactly one label.
assert len(X_train) == len(labels['train']), 'train features/labels differ in length'
assert len(X_validate) == len(labels['validate']), 'validate features/labels differ in length'

# Convert class label vectors to categorical one-hot arrays.
# Class numbers start at 0, so the number of classes is the largest label
# seen in any split plus one. Counting only the unique labels of the train
# split would come out too small whenever a class is missing from it.
num_classes = max(max(split_labels) for split_labels in labels.values() if split_labels) + 1
Y_train = to_categorical(labels['train'], num_classes)
Y_validate = to_categorical(labels['validate'], num_classes)

# Build, compile, and fit the model: a small fully-connected classifier
# on top of the flattened bottleneck features.
# NOTE: this rebinds `model`, which until now referred to the pretrained
# network used to generate the bottleneck features; re-create that network
# before generating bottleneck features again.
model = Sequential()
model.add(Flatten(input_shape=X_train.shape[1:]))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.fit(X_train, Y_train, batch_size=64, epochs=10, verbose=2, validation_data=(X_validate, Y_validate))

# Save model weights for future use
model.save_weights('bottleneck_model.h5')

Train on 1675 samples, validate on 216 samples
Epoch 1/10
17s - loss: 7.0606 - acc: 0.4406 - val_loss: 3.4446 - val_acc: 0.7176
Epoch 2/10
17s - loss: 3.8867 - acc: 0.6854 - val_loss: 2.7050 - val_acc: 0.7685
Epoch 3/10
17s - loss: 2.3886 - acc: 0.7630 - val_loss: 1.4027 - val_acc: 0.8194
Epoch 4/10
16s - loss: 1.1102 - acc: 0.8287 - val_loss: 0.7074 - val_acc: 0.8426
Epoch 5/10
17s - loss: 0.6422 - acc: 0.8531 - val_loss: 0.5172 - val_acc: 0.8704
Epoch 6/10
17s - loss: 0.4154 - acc: 0.8812 - val_loss: 0.4317 - val_acc: 0.9028
Epoch 7/10
17s - loss: 0.3655 - acc: 0.9039 - val_loss: 0.4419 - val_acc: 0.9213
Epoch 8/10
16s - loss: 0.2635 - acc: 0.9278 - val_loss: 0.4228 - val_acc: 0.9120
Epoch 9/10
17s - loss: 0.1978 - acc: 0.9421 - val_loss: 0.4969 - val_acc: 0.9028
Epoch 10/10
17s - loss: 0.2031 - acc: 0.9415 - val_loss: 0.5811 - val_acc: 0.9120


#### The fully-connected model achieves ~90% validation accuracy!

### Up next, testing this model on the images set aside in the test directory (coming soon) ...