In [1]:
from collections import defaultdict
from contextlib import suppress
import numpy as np
import os
from zipfile import ZipFile

from PIL import Image
from skimage.external.tifffile import imread

from keras import applications
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dropout, Flatten, Dense

from sklearn.metrics import accuracy_score, classification_report

Using TensorFlow backend.


# Data Preparation

### Download the [UC Merced Land Use dataset](http://vision.ucmerced.edu/datasets/landuse.html)

### Extract image files from the zipped archive, if necessary

In [2]:
# Directory that holds the extracted source images
source_dir = os.path.join('data', 'UCMerced_LandUse', 'Images')

# Create the top-level data directory; an already existing one is left as is
with suppress(FileExistsError):
    os.mkdir('data')

# Unpack the archive only when the image directory is not in place yet.
# UCMerced_LandUse.zip must first be downloaded into the working directory
# from http://vision.ucmerced.edu/datasets/landuse.html
if not os.path.isdir(source_dir):
    with ZipFile('UCMerced_LandUse.zip') as archive:
        archive.extractall(path='data')

#### NOTE:  I discovered that not all image files are 256x256 pixels, contrary to what the UC Merced Land Use Dataset site states. To work with the Keras models, all images must have the same dimensions, so I resize them to a common shape below.

### Helper functions

In [3]:
def get_image_filepaths(start_dir):
    """
    Helper function to walk a directory structure, collecting
    file pathnames for all TIFF images.

    Input:
        start_dir: directory where walking starts

    Returns:
        Sorted list of pathnames of every file below start_dir whose name
        ends with '.tif'. The list is empty when no such file is found.
    """
    # os.walk() visits entries in a filesystem-dependent order. Sorting makes
    # the result (and therefore any seeded shuffle applied to it afterwards)
    # identical from one machine to the next.
    return sorted(
        os.path.join(root, file)
        for root, _, files in os.walk(start_dir)
        for file in files
        if file.endswith('.tif')
    )

In [4]:
def class_name_from_path(filepath):
    """
    Return an image's class name, which is the name of the directory
    that directly contains the image file.
    """
    containing_dir = os.path.dirname(filepath)
    return os.path.basename(containing_dir)

In [5]:
def choose_target():
    """
    Randomly pick the dataset ('train', 'validate' or 'test') an image goes to.

    A first random draw sends 80% of the images to 'train'. Only when that
    draw falls outside the training share is a second draw made, which
    divides the remainder 50/50 between 'validate' and 'test'.
    """
    train_share = 0.8
    validate_share = 0.5

    if np.random.rand() >= train_share:
        # Not a training image: a second draw decides validate vs. test
        second_draw = np.random.rand()
        return 'test' if second_draw >= validate_share else 'validate'
    return 'train'

In [6]:
def make_image_filename(image_num, class_num):
    """
    Build a TIF file name string in the format <image_num>_<class_num>.tif,
    with the image number zero-padded to 4 characters and the class number
    zero-padded to 2 characters.
    """
    image_part = str(image_num).zfill(4)
    class_part = str(class_num).zfill(2)
    return f'{image_part}_{class_part}.tif'

In [7]:
def classnum_from_filename(filename):
    """
    Extract and return the int class number from an image filename
    formatted by make_image_filename(), i.e. <image_num>_<class_num>.tif.

    Input:
        filename: image file name; a leading directory path is ignored

    Returns:
        The class number as an int

    Raises:
        ValueError: if the text after the last underscore is not an integer
    """
    stem = os.path.basename(filename).split('.')[0]
    # Read everything after the last underscore rather than a fixed two
    # characters, so class numbers with more than two digits are not truncated.
    return int(stem.rpartition('_')[2])

In [8]:
def image_generator(image_dir, batch_size=64):
    """
    Generate batches of scaled images read from image_dir.

    Every file in image_dir is read with imread() and divided by 255.0.
    Images are visited in os.listdir(image_dir) order and yielded as a
    numpy.array of batch_size images; the final batch holds whatever images
    are left over and may be smaller. Nothing is yielded for an empty directory.
    """
    batch = []
    for name in os.listdir(image_dir):
        pixels = imread(os.path.join(image_dir, name))
        # Scale pixel values by 1/255 (0.0 - 1.0 for 8-bit images)
        batch.append(pixels / 255.0)
        if len(batch) != batch_size:
            continue
        # Batch is full: hand it out and start collecting the next one
        full_batch, batch = np.array(batch), []
        yield full_batch

    # Hand out the leftover images that did not fill a whole batch
    if batch:
        yield np.array(batch)

### Define transformed-image target directory paths

In [9]:
# Transformed images are stored in a directory hierarchy that looks like this:
# data/transformed/
#                  train/
#                        ...
#                  validate/
#                        ...
#                  test/
#                        ...
target_base = os.path.join('data', 'transformed')

# Maps each dataset name to the directory holding its images
target_dirs = {
    'train': os.path.join(target_base, 'train'),
    'validate': os.path.join(target_base, 'validate'),
    'test': os.path.join(target_base, 'test'),
}

### Transform each image to a common shape; place in train, validate, or test folder

In [10]:
# Set random seed for reproducibility: this seeds NumPy's global random number
# generator, which np.random.shuffle() and choose_target() draw from when the
# images are shuffled and split between the train, validate, and test directories.
np.random.seed(2017)

In [11]:
# Class label numbers per dataset: {'train': [...], 'validate': [...], 'test': [...]}
labels = defaultdict(list)

# Assume if directory "transformed" exists, it contains all the processed images.
if not os.path.isdir(target_base):
    # Make new directories
    os.mkdir(target_base)
    for target in target_dirs:
        os.mkdir(target_dirs[target])

    # Get a list of the source image file pathnames from "data/UCMerced_LandUse/Images"
    filepaths = get_image_filepaths(source_dir)

    # Randomly shuffle the image file pathnames
    np.random.shuffle(filepaths)

    # Pass 1:
    # 1) Collect image classes {name: number} in the "class_num_by_class_name" dictionary
    # 2) Determine the smallest image dimension (never larger than 256)

    class_num_by_class_name = dict()
    class_num_by_filepath = dict()

    new_dim = 256
    for filepath in filepaths:
        # Derive image file's class name from the file pathname
        class_name = class_name_from_path(filepath)

        # Add (potentially) new class to "class_num_by_class_name" dictionary;
        # classes are numbered 0, 1, 2, ... in order of first appearance
        class_num_by_class_name.setdefault(class_name, len(class_num_by_class_name))

        # Store class number for future reference
        class_num_by_filepath[filepath] = class_num_by_class_name[class_name]

        # Find the minimum height or width dimension of all images
        with Image.open(filepath) as img:
            new_dim = min(new_dim, min(img.size))

    # Pass 2:
    # 1) Randomly split images between the train, validate, and test directories
    # 2) Resize all images to a common (new_dim, new_dim) size
    # 3) Encode the class label number of each image in its file name

    for image_num, filepath in enumerate(filepaths):
        with Image.open(filepath) as img:
            # Separate images between train/validate/test directories
            target = choose_target()

            # Look up class label number
            class_num = class_num_by_filepath[filepath]

            # Name images in numbered format <image#>_<class#>.tif
            path = os.path.join(target_dirs[target], make_image_filename(image_num, class_num))

            # Resize image to common shape and save to target directory
            img.resize((new_dim, new_dim)).save(path)

# Get labels from the saved file names, whether the images were just written
# or already existed. image_generator() reads each directory with os.listdir(),
# so the labels are listed with os.listdir() as well: appending them in
# image-number order while writing (as was done before) only lines up with the
# extracted features if os.listdir() happens to return sorted names.
# NOTE: os.listdir() order is arbitrary; this relies on repeated calls returning
# the same order for an unchanged directory.
for target_name, target_dir in target_dirs.items():
    for filename in os.listdir(target_dir):
        labels[target_name].append(classnum_from_filename(filename))

# Get training set bottleneck features from pretrained CNN

In [12]:
def extract_bottleneck_features(model, dataset='train', batch_size=64):
    """
    Extract bottleneck features for the input dataset (train/validate/test)
    by predicting on the convolutional portion only of a pretrained model.

    Inputs:
        model: Pre-trained deep learning model, excluding fully-connected top model
               e.g. applications.VGG16(include_top=False, weights='imagenet')
        dataset: string label for dataset image directory ['train', 'validate', 'test']
        batch_size: number of images passed to the model per prediction call

    Return:
        Bottleneck features for all images of the dataset as one numpy.array
    """
    print(f'Generating "{dataset}" bottleneck predictions')

    # Predict batch by batch over the images in the dataset's directory
    batch_predictions = []
    for image_batch in image_generator(target_dirs[dataset], batch_size=batch_size):
        batch_predictions.append(model.predict_on_batch(image_batch))

    # Join the per-batch predictions into a single numpy.array
    bn_features = np.concatenate(batch_predictions)
    print(f'   Features of shape {bn_features.shape} extracted for model {model.name}')

    return bn_features

### Select a pre-trained model from the `keras.applications` module, e.g. Xception, VGG16, ...

In [13]:
# Xception V1 is a smaller-footprint model with high accuracy.
# include_top=False leaves out the fully-connected top model, so predictions
# are convolutional feature maps (the "bottleneck features") rather than class
# scores; weights='imagenet' loads weights pre-trained on ImageNet.
pretrained_model = applications.Xception(include_top=False, weights='imagenet')

### Extract bottleneck features for each dataset: train, validate, and test

In [14]:
# Class numbers are assigned 0, 1, 2, ..., so the one-hot width has to be the
# largest class number in ANY split plus one. Counting only the distinct classes
# of the training split would be too small if a class were missing from it.
num_classes = 1 + max(
    max(labels[dataset]) for dataset in ['train', 'validate', 'test'] if labels[dataset]
)
X, Y = dict(), dict()
for dataset in ['train', 'validate', 'test']:
    # Extract bottleneck features from pretrained model, predicting on images from "dataset" directory
    X[dataset] = extract_bottleneck_features(pretrained_model, dataset)
    # Convert class label vectors to categorical one-hot arrays
    Y[dataset] = to_categorical(labels[dataset], num_classes)
    # Every extracted feature array must be paired with exactly one label
    assert len(X[dataset]) == len(Y[dataset]), (
        f'"{dataset}": {len(X[dataset])} feature arrays but {len(Y[dataset])} labels'
    )

Generating "train" bottleneck predictions
   Features of shape (1675, 8, 8, 2048) extracted for model xception
Generating "validate" bottleneck predictions
   Features of shape (216, 8, 8, 2048) extracted for model xception
Generating "test" bottleneck predictions
   Features of shape (209, 8, 8, 2048) extracted for model xception


# Train a fully-connected model using bottleneck features

In [15]:
def build_fully_connected(input_shape, num_classes):
    """
    Create a fully-connected model to train or test on UC Merced dataset.

    Inputs:
        input_shape: shape of one sample fed to the model (no batch axis)
        num_classes: number of units in the softmax output layer

    Return:
        Uncompiled Sequential model: Flatten -> Dense(256, relu) ->
        Dropout(0.5) -> Dense(num_classes, softmax)
    """
    return Sequential([
        Flatten(input_shape=input_shape),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ])

In [16]:
# Build the fully-connected model; its input is one sample of the training
# bottleneck features (feature shape without the leading sample axis)
model = build_fully_connected(
    input_shape=X['train'].shape[1:],
    num_classes=num_classes,
)

# Compile for multi-class classification on one-hot labels
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Fit on the training features, reporting validation metrics after each epoch
model.fit(
    X['train'], Y['train'],
    validation_data=(X['validate'], Y['validate']),
    epochs=10,
    batch_size=64,
    verbose=2,
)

# Save model weights for test dataset predictions
fit_model_weights = model.get_weights()

Train on 1675 samples, validate on 216 samples
Epoch 1/10
17s - loss: 6.2914 - acc: 0.4764 - val_loss: 3.3246 - val_acc: 0.7361
Epoch 2/10
17s - loss: 3.4602 - acc: 0.7069 - val_loss: 1.6056 - val_acc: 0.8056
Epoch 3/10
17s - loss: 1.6102 - acc: 0.7767 - val_loss: 0.7901 - val_acc: 0.8148
Epoch 4/10
17s - loss: 0.7913 - acc: 0.8066 - val_loss: 0.6921 - val_acc: 0.8657
Epoch 5/10
17s - loss: 0.5441 - acc: 0.8555 - val_loss: 0.5959 - val_acc: 0.8657
Epoch 6/10
17s - loss: 0.4473 - acc: 0.8878 - val_loss: 0.6793 - val_acc: 0.8380
Epoch 7/10
17s - loss: 0.3845 - acc: 0.9045 - val_loss: 0.4596 - val_acc: 0.8796
Epoch 8/10
17s - loss: 0.2460 - acc: 0.9284 - val_loss: 0.4562 - val_acc: 0.8889
Epoch 9/10
17s - loss: 0.2659 - acc: 0.9319 - val_loss: 0.5339 - val_acc: 0.8889
Epoch 10/10
17s - loss: 0.3110 - acc: 0.9260 - val_loss: 0.5375 - val_acc: 0.9074


###  Validation accuracy ~90% !

### Evaluate the model on the test images

In [17]:
# Start by building the same fully-connected model
model = build_fully_connected(input_shape=X['test'].shape[1:], num_classes=num_classes)

# Load weights from the model fit on the training data
model.set_weights(fit_model_weights)

# Predict on the test images: the predicted class is the index of the largest
# softmax output. Taking the argmax of predict() gives the same result as
# Sequential.predict_classes() for a multi-class softmax output, without
# relying on that deprecated method (it is absent from newer Keras releases).
y_pred = np.argmax(model.predict(X['test'], verbose=0), axis=-1)

### Print classification results

In [18]:
# Overall fraction of test images whose predicted class matches the label
accuracy = accuracy_score(labels['test'], y_pred)
print(f'Model predication accuracy: {accuracy:.3f}')

# Per-class precision, recall, f1-score and support
report = classification_report(labels["test"], y_pred)
print(f'\nClassification report:\n {report}')

Model predication accuracy: 0.866

Classification report:
              precision    recall  f1-score   support

          0       0.90      0.60      0.72        15
          1       1.00      1.00      1.00         9
          2       1.00      0.75      0.86         8
          3       0.87      0.87      0.87        15
          4       1.00      0.38      0.55         8
          5       1.00      1.00      1.00        11
          6       0.77      1.00      0.87        10
          7       1.00      0.70      0.82        10
          8       0.38      1.00      0.55         3
          9       0.88      1.00      0.93        14
         10       0.36      0.80      0.50         5
         11       1.00      0.88      0.93         8
         12       1.00      0.92      0.96        12
         13       1.00      1.00      1.00        10
         14       1.00      1.00      1.00         7
         15       1.00      0.89      0.94         9
         16       1.00      1.00      1

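###  Test accuracy of ~87% vs. ~90% validation accuracy suggests some overfitting in the model and room for improvement, although with only about 210 images in each held-out set part of the gap may be sampling noise. Nonetheless, 87% test accuracy is impressive for such a small dataset.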