In [1]:
from collections import OrderedDict
from contextlib import suppress

import numpy as np
import pandas as pd
import os
from zipfile import ZipFile
from PIL import Image

from skimage.transform import rescale, resize
from skimage.external.tifffile import imread, imsave

from sklearn.model_selection import train_test_split

from keras import applications
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dropout, Flatten, Dense

Using TensorFlow backend.


# Data Preparation

### Download the [UC Merced Land Use dataset](http://vision.ucmerced.edu/datasets/landuse.html)

### Extract image files from the zipped archive, if necessary

In [2]:
# Make sure the data directory exists; an already-existing one is fine.
try:
    os.mkdir('data')
except FileExistsError:
    pass

# Unpack the archive only when the images folder is not there yet.
start_dir = os.path.join('data', 'UCMerced_LandUse', 'Images')
if not os.path.isdir(start_dir):
    with ZipFile('UCMerced_LandUse.zip') as archive:
        archive.extractall(path='data')

### Get list of all TIFF image files

In [3]:
# Save a {number: name} class dictionary for later reference.
# Class numbers are assigned consecutively (0, 1, 2, ...) only to directories
# that actually contain files, so every label is < len(classes). Directories
# and files are visited in sorted order because os.walk/os.listdir make no
# ordering guarantee; sorting keeps the numbering and the file order the same
# on every run and every machine.
classes = OrderedDict()
labels = []
filenames = []
for root, _, files in sorted(os.walk(start_dir)):
    if not files:
        continue
    index = len(classes)
    classes[index] = os.path.basename(root)
    filenames.extend(os.path.join(root, file) for file in sorted(files))
    labels.extend([index] * len(files))

### Resize all images to common shape; output to "transformed" directory

In [4]:
# Directory that receives the resized copies of the images
transformed_dir = os.path.join('data', 'transformed')

In [5]:
def transform_images(image_files, outdir):
    """Write square, equally sized copies of the given images into outdir.

    The common edge length is the smallest width or height found among the
    images, capped at 256. Outputs are named by their position in
    image_files, zero-padded to four digits, keeping the source extension.
    Nothing is written when outdir already contains any entry.

    Args:
        image_files: sequence of image file paths (iterated twice).
        outdir: directory to create (if missing) and fill.
    """
    with suppress(FileExistsError):
        os.mkdir(outdir)

    # Any existing content is taken to mean the work was already done.
    if os.listdir(outdir):
        return

    # First pass: find the smallest edge across all images.
    target = 256
    for source_path in image_files:
        with Image.open(source_path) as picture:
            target = min(target, *picture.size)

    # Second pass: resize everything to target x target and save.
    for position, source_path in enumerate(image_files):
        with Image.open(source_path) as picture:
            extension = os.path.splitext(source_path)[1]
            destination = os.path.join(outdir, f'{position:04d}{extension}')
            picture.resize((target, target)).save(destination)

In [6]:
# Write resized copies of every image into the transformed directory
# (transform_images does nothing if that directory already has content)
transform_images(filenames, transformed_dir)

# Extract bottleneck features from a pretrained CNN

### Randomly split images into training and validation sets

In [7]:
# Hold out 500 images for validation, stratified by class label.
# The seed is fixed so the split is identical on every run; the bottleneck
# feature files saved further down are derived from this split and must stay
# aligned with y_train / y_valid after a kernel restart.
image_idx = np.arange(len(labels))
X_train_idx, X_valid_idx, y_train, y_valid = train_test_split(
    image_idx, labels, test_size=500, stratify=labels, random_state=42)

In [60]:
def image_generator(image_dir, indexes, batch_size=32):
    """Yield batches of images from image_dir, selected by position.

    Args:
        image_dir: directory holding the image files.
        indexes: iterable of integer positions into the sorted directory
            listing; images are read in this order.
        batch_size: number of images per yielded batch.

    Yields:
        numpy arrays stacking up to batch_size images, each divided by 255.0.
        The final batch may hold fewer images.
    """
    # os.listdir returns entries in arbitrary order. The files written by
    # transform_images are named by zero-padded image number, so sorting makes
    # position i correspond to image i (and therefore to labels[i]).
    filenames = sorted(os.listdir(image_dir))
    images = []
    for index in indexes:
        image = imread(os.path.join(image_dir, filenames[index]))
        images.append(image / 255.0)
        if len(images) == batch_size:
            yield np.array(images)
            images = []
    # Emit whatever is left over as a final, smaller batch.
    if images:
        yield np.array(images)

In [62]:
# VGG16 with ImageNet weights; include_top=False leaves out the network's
# fully connected classifier layers
model = applications.VGG16(include_top=False, weights='imagenet')

In [65]:
# Each job pairs an output file with the image indexes whose model
# predictions are stored in it.
predict_jobs = [('bottleneck_features_train.npy', X_train_idx),
                ('bottleneck_features_valid.npy', X_valid_idx)]

for filename, indexes in predict_jobs:
    print(f'Generating bottleneck predictions for {filename}')
    batches = image_generator(transformed_dir, indexes, batch_size=64)
    pred_batches = []
    for batch_number, X in enumerate(batches, start=1):
        pred = model.predict_on_batch(X)
        pred_batches.append(pred)
        print('Batch', batch_number, pred.shape)

    # Stack the per-batch predictions into one array and save it.
    stacked = np.concatenate(pred_batches)
    with open(filename, 'wb') as out_file:
        np.save(out_file, stacked)

Generating bottleneck predictions for bottleneck_features_train.npy
Batch 1 (64, 7, 7, 512)
Batch 2 (64, 7, 7, 512)
Batch 3 (64, 7, 7, 512)
Batch 4 (64, 7, 7, 512)
Batch 5 (64, 7, 7, 512)
Batch 6 (64, 7, 7, 512)
Batch 7 (64, 7, 7, 512)
Batch 8 (64, 7, 7, 512)
Batch 9 (64, 7, 7, 512)
Batch 10 (64, 7, 7, 512)
Batch 11 (64, 7, 7, 512)
Batch 12 (64, 7, 7, 512)
Batch 13 (64, 7, 7, 512)
Batch 14 (64, 7, 7, 512)
Batch 15 (64, 7, 7, 512)
Batch 16 (64, 7, 7, 512)
Batch 17 (64, 7, 7, 512)
Batch 18 (64, 7, 7, 512)
Batch 19 (64, 7, 7, 512)
Batch 20 (64, 7, 7, 512)
Batch 21 (64, 7, 7, 512)
Batch 22 (64, 7, 7, 512)
Batch 23 (64, 7, 7, 512)
Batch 24 (64, 7, 7, 512)
Batch 25 (64, 7, 7, 512)
Generating bottleneck predictions for bottleneck_features_valid.npy
Batch 1 (64, 7, 7, 512)
Batch 2 (64, 7, 7, 512)
Batch 3 (64, 7, 7, 512)
Batch 4 (64, 7, 7, 512)
Batch 5 (64, 7, 7, 512)
Batch 6 (64, 7, 7, 512)
Batch 7 (64, 7, 7, 512)
Batch 8 (52, 7, 7, 512)


In [66]:
# Turn the integer class labels into one-hot rows, one column per class
num_classes = len(classes)
Y_train, Y_valid = (to_categorical(split_labels, num_classes)
                    for split_labels in (y_train, y_valid))

In [71]:
def train_top_model(epochs=25, batch_size=64,
                    top_model_weights_path='bottleneck_model.h5'):
    """Train a small dense classifier on the saved bottleneck features.

    Loads 'bottleneck_features_train.npy' and 'bottleneck_features_valid.npy'
    from the working directory and uses the module-level Y_train, Y_valid and
    num_classes as labels and output width.

    Args:
        epochs: number of training epochs passed to fit.
        batch_size: batch size passed to fit.
        top_model_weights_path: file the trained weights are saved to.
    """
    with open('bottleneck_features_train.npy', 'rb') as f:
        train_data = np.load(f)
    train_labels = Y_train

    with open('bottleneck_features_valid.npy', 'rb') as f:
        validation_data = np.load(f)
    validation_labels = Y_valid

    # Flatten each feature map, then one hidden layer with dropout and a
    # softmax output with one unit per class.
    model = Sequential()
    model.add(Flatten(input_shape=train_data.shape[1:]))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    model.fit(train_data, train_labels,
              epochs=epochs,
              batch_size=batch_size,
              validation_data=(validation_data, validation_labels))

    model.save_weights(top_model_weights_path)

In [72]:
# Fit the top model on the saved bottleneck features and save its weights
train_top_model()

Train on 1600 samples, validate on 500 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
