### Download the UC Merced Land Use dataset archive from http://vision.ucmerced.edu/datasets/landuse.html

In [1]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from zipfile import ZipFile

from skimage.transform import resize
from skimage.external.tifffile import imread

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline

### Extract the image files from the zip archive, if necessary

In [2]:
# Folder that holds one sub-directory of images per class once the archive
# has been unpacked under data/.
start_dir = os.path.join('data', 'UCMerced_LandUse', 'Images')

# Only unpack when the image folder is not already present.
if not os.path.isdir(start_dir):
    with ZipFile('UCMerced_LandUse.zip') as archive:
        archive.extractall(path='data')

### Get list of all TIFF image files

In [3]:
# Build two parallel lists: the path of every TIFF file under start_dir and
# the class label (its parent directory name) for that file.
#
# os.listdir returns entries in arbitrary order, so both listings are sorted
# to make the file order identical across machines and runs.
image_labels = []
image_names = []
for class_name in sorted(os.listdir(start_dir)):
    path = os.path.join(start_dir, class_name)
    if not os.path.isdir(path):
        # Skip stray files (e.g. .DS_Store) sitting next to the class folders.
        continue
    # Keep TIFF images only; anything else in the folder is ignored.
    files = sorted(
        file for file in os.listdir(path)
        if file.lower().endswith(('.tif', '.tiff'))
    )
    image_names.extend(os.path.join(path, file) for file in files)
    image_labels.extend([class_name] * len(files))

### Read in all images from files

In [4]:
# Keep the raw images in a plain Python list rather than np.array(...):
# the files do not all share one shape (see the shape check below), and
# building an ndarray from differently-shaped arrays raises ValueError on
# recent NumPy versions. The images are stacked into a single array only
# after they have been resized to a common shape.
images = [imread(image_name) for image_name in image_names]

### We're told these are 256x256x3 images. Let's check to be sure...

In [5]:
# Distinct array shapes found among the loaded images
{img.shape for img in images}

{(247, 247, 3),
 (247, 256, 3),
 (249, 256, 3),
 (251, 256, 3),
 (253, 256, 3),
 (254, 256, 3),
 (255, 256, 3),
 (256, 242, 3),
 (256, 247, 3),
 (256, 249, 3),
 (256, 250, 3),
 (256, 252, 3),
 (256, 253, 3),
 (256, 254, 3),
 (256, 255, 3),
 (256, 256, 3),
 (257, 257, 3)}

### Nope! Need to resize the images to the same shape.

In [6]:
# Overall pixel range across all raw images, before any resizing:
# take each image's own extreme, then reduce over those.
(np.min([img.min() for img in images]),
 np.max([img.max() for img in images]))

(0, 255)

#### Note that skimage.transform.resize converts the images to floating point and rescales pixel values into the range 0.0 - 1.0

In [7]:
# Target (height, width) for every image.
# NOTE: min() over tuples compares lexicographically (height first, width
# second), so this picks the smallest-height shape, not a per-axis minimum.
# Images narrower than the chosen width are therefore upsampled on that axis.
new_shape = min(img.shape[:-1] for img in images)

# Resize every image to the common shape and stack them into one array.
images = np.stack([resize(img, new_shape, mode='constant') for img in images])
images.shape

(2100, 247, 247, 3)

In [8]:
# After resizing, `images` is a single ndarray, so its global extremes can be
# read directly; expect the range [0.0, 1.0].
images.min(), images.max()

(0.0, 1.0)

### Randomly shuffle image list

In [9]:
# Shuffle images and labels with the same permutation so they stay aligned.
# A fixed seed makes the ordering reproducible under Restart & Run All.
#
# NOTE: this cell is not safe to re-run on its own: `images` is overwritten
# with its shuffled version while `labels` is rebuilt from the unshuffled
# `image_labels`, so a second run would misalign images and labels.
# Re-run from the resize cell instead.
SHUFFLE_SEED = 42
shuffle_index = np.random.RandomState(SHUFFLE_SEED).permutation(len(images))
images = images[shuffle_index]
labels = np.array(image_labels)[shuffle_index]

### Split data into train and test sets

In [10]:
# Stratified split (default 75% / 25%) so every class keeps the same
# proportion in both sets. random_state pins the split so the train/test
# membership is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    images, labels, stratify=labels, random_state=42
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1575, 247, 247, 3), (525, 247, 247, 3), (1575,), (525,))