In [1]:
import os

from pathlib import Path, PurePath

In [2]:
import matplotlib.pyplot as plt
import numpy as np

from skimage import color, io
from skimage.transform import rescale, resize, downscale_local_mean

### Input and output directories (../data/raw and ../data/processed, respectively). 

In [3]:
# Root folders: raw images are read from ../data/raw, resized copies are
# written to ../data/processed.
raw_data_dir = Path(r'../data/raw')
processed_data_dir = Path(r'../data/processed')

# One sub-folder per house class, named identically under both roots.
craftsman_dir, victorian_dir = (
    raw_data_dir / class_folder
    for class_folder in ('Craftsman House', 'Victorian House')
)
craftsman_output_dir, victorian_output_dir = (
    processed_data_dir / class_folder
    for class_folder in ('Craftsman House', 'Victorian House')
)

In [4]:
# Input and output folders for the 'Testing Images' set, under the raw and
# processed roots respectively.
oos_data_dir, oos_data_dir_out = (
    root / 'Testing Images' for root in (raw_data_dir, processed_data_dir)
)

In [None]:
# Preview: load the first file listed in the Victorian folder, resize it to
# 128x128, and show the original and resized versions side by side.
first_name = os.listdir(victorian_dir)[0]
image = io.imread(victorian_dir / first_name)
image_resized = resize(image, (128, 128))

fig, axes = plt.subplots(nrows=1, ncols=2)
ax = axes.ravel()

for panel, picture, caption in zip(ax,
                                   (image, image_resized),
                                   ("Original image", "New image")):
    panel.imshow(picture, cmap='gray')
    panel.set_title(caption)

In [27]:
def resize_directory(directory, output_directory, size, display_step=100):
    """Resize every image in a directory and save the results as JPEGs.

    Each file in `directory` is read, resized to `size`, and written to
    `output_directory` as ``<original file name><size[0]>x<size[1]>.jpg``.
    The original extension is kept inside the output name, so inputs that
    differ only by extension (e.g. ``victorian3.jpg`` and ``victorian3.PNG``)
    do not overwrite each other.  Files whose output name already exists in
    `output_directory` are skipped.

    Args:
        directory (`pathlib.Path`): A directory to preprocess.  Should
            contain images.
        output_directory (`pathlib.Path`): Where the resized images are
            written.  Created (including parents) if it does not exist.
        size (`list`): The target size; a sequence of length 2 that is
            passed to `resize` as the output shape.
        display_step (:obj:`int`, optional): How many images to process before
            displaying a before/after figure.  A falsy value (0 or None)
            disables the figures.

    Raises:
        ValueError: If `size` is the wrong shape (not 1x2).
        ValueError: If `output_directory` is not a pathlib.Path object.
    """

    if len(size) != 2:
        raise ValueError('`size` should be of length 2.  '
                         'Found: {0}'.format(len(size)))
    # Require a concrete Path rather than a PurePath: mkdir() below needs
    # filesystem access, which PurePath objects do not provide.
    if not isinstance(output_directory, Path):
        raise ValueError('output_directory should be a pathlib.Path object.  '
                         'Found: {0}'.format(type(output_directory)))

    output_directory.mkdir(parents=True, exist_ok=True)

    target_shape = (size[0], size[1])
    name_suffix = str(size[0]) + 'x' + str(size[1]) + '.jpg'
    # Snapshot taken once before the loop; a set gives O(1) membership tests.
    existing_files = set(os.listdir(output_directory))

    # The step counter also advances for skipped files.
    for i, image_name in enumerate(os.listdir(directory), start=1):
        image_out_name = image_name + name_suffix

        if image_out_name in existing_files:
            print(f'Skipped: {image_name} because it is already in {output_directory}')
            continue

        image = io.imread(directory / image_name)
        image_resized = resize(image, target_shape)
        # Images with an alpha channel caused errors when saved as JPG, so
        # RGBA is flattened to RGB first.  The ndim guard keeps 2-D
        # (grayscale) images from raising IndexError on shape[2].
        # TODO: Research formats.
        if image_resized.ndim == 3 and image_resized.shape[2] == 4:
            image_resized = color.rgba2rgb(image_resized)
        try:
            io.imsave(output_directory / image_out_name, image_resized)
        except OSError as e:
            # Best effort: report the failure and move on to the next file
            # instead of logging this one as successfully resized.
            print(f'Failed to save {image_out_name}: {e}')
            continue
        print(f'Step: {i}.  Resized: {image_name}.  '
              f'Old size: {image.shape[0]}x{image.shape[1]}.  '
              f'New size: {image_resized.shape[0]}x{image_resized.shape[1]}.')

        # Periodically show a before/after pair as a visual sanity check.
        if display_step and i % display_step == 0:
            fig, axes = plt.subplots(nrows=1, ncols=2)

            ax = axes.ravel()
            ax[0].imshow(image, cmap='gray')
            ax[0].set_title("Original image")

            ax[1].imshow(image_resized, cmap='gray')
            ax[1].set_title("New image")

In [28]:
#resize_directory(victorian_dir, victorian_output_dir, size=(128, 128))

In [29]:
# Resize the 'Testing Images' set to 128x128.
resize_directory(directory=oos_data_dir,
                 output_directory=oos_data_dir_out,
                 size=(128, 128))



Skipped: craftsman1.PNG because it is already in ..\data\processed\Testing Images
Skipped: craftsman2.PNG because it is already in ..\data\processed\Testing Images
Skipped: victorian1.PNG because it is already in ..\data\processed\Testing Images
Skipped: victorian2.PNG because it is already in ..\data\processed\Testing Images
Step: 5.  Resized: victorian3.jpg.  Old size: 177x284.  New size: 128x128.




Step: 6.  Resized: victorian3.PNG.  Old size: 710x819.  New size: 128x128.
Step: 7.  Resized: victorian4.PNG.  Old size: 772x757.  New size: 128x128.


In [None]:
#resize_directory(craftsman_dir, craftsman_output_dir, size=(128, 128))

In [None]:
import shutil
import pdb

def _reset_directory(path):
    """Delete `path` and its contents if present, then recreate it empty."""
    if path.exists():
        shutil.rmtree(str(path))
    else:
        print(f'{str(path)} does not exist.')
    path.mkdir(parents=True, exist_ok=True)


def split_directory(directory, class_name, train_percent, seed=None):
    """Randomly split the files of one class into train and test folders.

    Every file in `directory` is independently assigned to the training set
    with probability `train_percent`, otherwise to the test set, so the
    realised proportions are only approximately `train_percent`.  The files
    are copied to ``directory.parent / 'train' / class_name`` and
    ``directory.parent / 'test' / class_name``.  Both target folders are
    emptied first, so any previous split for this class is discarded.

    Args:
        directory (`pathlib.Path`): Folder holding the files of one class.
            Sub-directories inside it are ignored.
        class_name (`str`): Name of the per-class folder created under
            ``train`` and ``test``.
        train_percent (`float`): Fraction of files to put in the training
            set, in the range [0, 1).
        seed (:obj:`int`, optional): Seed for a dedicated random generator,
            making the split reproducible.  When None, numpy's global random
            state is used.

    Raises:
        ValueError: If `train_percent` is not in [0, 1).
        AssertionError: If a file name ends up in both target folders.
    """
    if not 0 <= train_percent < 1:
        raise ValueError(f"train_percent should be >= 0 and < 1.0, as a fraction.  "
                         f"Received: {train_percent}")

    # Sorted so that a given seed yields the same split whatever order
    # os.listdir returns; sub-directories are dropped because shutil.copy
    # only handles regular files.
    files = sorted(name for name in os.listdir(directory)
                   if (directory / name).is_file())

    rng = np.random if seed is None else np.random.RandomState(seed)
    train_use = rng.rand(len(files)) < train_percent

    train = [name for name, use in zip(files, train_use) if use]
    test = [name for name, use in zip(files, train_use) if not use]

    train_dir = directory.parent / 'train' / class_name
    test_dir = directory.parent / 'test' / class_name

    for target_dir, names in ((train_dir, train), (test_dir, test)):
        _reset_directory(target_dir)
        for name in names:
            shutil.copy(str(directory / name), str(target_dir / name))

    # Sanity check on what actually landed on disk: no file in both sets.
    overlap = set(os.listdir(train_dir)) & set(os.listdir(test_dir))
    assert not overlap

In [None]:
# 70% of the resized Craftsman images go to train, the rest to test.
split_directory(directory=craftsman_output_dir,
                class_name='Craftsman',
                train_percent=0.7)

In [None]:
# 70% of the resized Victorian images go to train, the rest to test.
split_directory(directory=victorian_output_dir,
                class_name='Victorian',
                train_percent=0.7)

### Experimental (not really used): an attempt at automatic data sifting to remove pictures of floor plans, lamps, etc.

In [None]:
from keras.applications.resnet50 import ResNet50
from keras.preprocessing import image
from keras.applications.resnet50 import preprocess_input, decode_predictions
import numpy as np

In [None]:
# ResNet50 instantiated with its 'imagenet' weights; `is_house` below reads
# this module-level `model` when it calls `model.predict`.
model = ResNet50(weights='imagenet')

In [None]:
def is_house(img_path):
    """Check whether the model's top-5 labels for an image mention a house.

    The image is loaded at 224x224, run through the module-level `model`,
    and the five highest-ranked predictions are decoded.  The image counts
    as a house when any decoded description contains the substring
    ``'house'``.  When it does not, the decoded predictions are printed and
    the image is shown under the title "Not House".

    Args:
        img_path: Path of the image file to classify.

    Returns:
        tuple: ``(house, labels)`` where `house` is a bool and `labels` is
        the list of the top-5 decoded descriptions.
    """
    img = image.load_img(img_path, target_size=(224, 224))
    # The model predicts on batches, so add a leading batch axis of size 1.
    batch = np.expand_dims(image.img_to_array(img), axis=0)
    batch = preprocess_input(batch)

    # Decode once and reuse: a list of (class, description, probability)
    # tuples for the single image in the batch.
    top_predictions = decode_predictions(model.predict(batch), top=5)[0]
    labels = [entry[1] for entry in top_predictions]
    house = any('house' in label for label in labels)

    if not house:
        print('Predicted:', top_predictions)

        fig = plt.figure()
        ax = fig.add_subplot(111)

        ax.imshow(img)
        ax.set_title("Not House")
        plt.show()

    return house, labels

In [None]:
# Run the house check over (at most) the first 1000 resized Craftsman images
# and pool every top-5 label that was predicted.
preds = []
for name in os.listdir(craftsman_output_dir)[0:1000]:
    file_name = craftsman_output_dir / name
    house, pred = is_house(file_name)
    # extend() appends in place; `preds = preds + pred` copied the whole
    # accumulated list on every iteration.
    preds.extend(pred)