In [2]:
'''
DATA PREPROCESSING/EXPLORATION STEP
'''

'''
Implement data augmentation on Waldo images using Keras
'''
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from sklearn.datasets import load_files
from glob import glob
import numpy as np
import cv2
from keras.models import load_model
from keras.preprocessing import image                  
from tqdm import tqdm # Used for the progress bar visualization
import random

# Declare the data generation object for data augmentation
datagen = ImageDataGenerator(
        rotation_range=40,
        width_shift_range=0.2,
        height_shift_range=0.2,
        rescale=1./255, # rescale the image before feeding it to the cnn
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest')

Using TensorFlow backend.


In [102]:
'''
Generate augmented images for waldo (do it only once)
'''
# waldo_filenames = np.array(glob("waldo_dataset/original_waldo/*"))
# for filename in waldo_filenames:
#     img = load_img(filename)
#     x = img_to_array(img)
#     x = x.reshape((1,) + x.shape)
#     i = 0
#     for batch in datagen.flow(x, batch_size=1,
#                               save_to_dir='waldo_dataset/train/waldo', 
#                               save_prefix='new_waldo', 
#                               save_format='jpg'):
#         i += 1
#         if i > 20:
#             break  # the flow may yield batches indefinitely

'\nGenerate augmented images for waldo (do it only once)\n'

In [6]:
# The following function loads a data set along with the targets 
# Note that the directory has to have at least two folders containing different classes
# The following function was taken from dog breed classification project
def load_dataset(path):
    data = load_files(path)
    files = np.array(data['filenames'])
    targets = np.array(data['target'])
    return files, targets

# Declare the training (all_files) and test datasets along with their labels
all_files, all_targets = load_dataset('waldo_dataset/train_32')
test_files, test_targets = load_dataset('waldo_dataset/test')


# Shuffle the data to avoid a bias
rand = np.random.RandomState(10)
shuffle = rand.permutation(len(all_files))
all_files, all_targets = all_files[shuffle], all_targets[shuffle]


# Show statistics about the data
waldoFiles = np.array(glob("waldo_dataset/train_32/waldo/*"))
notWaldoFiles = np.array(glob("waldo_dataset/train_32/notwaldo/*"))
print('Waldo dataset contains ' + str(len(waldoFiles)) + ' images')
print('Notwaldo dataset contains ' + str(len(notWaldoFiles)) + ' images')

Waldo dataset contains 609 images
Notwaldo dataset contains 5337 images


In [104]:
# Use this function to resize an image into 32x32 size
def resize_img(filename, key):
    image = cv2.imread(filename)
    r = 100.0 / image.shape[1]
    dim = (100, int(image.shape[0] *r))
    imageresized = cv2.resize(image,(32,32),dim,interpolation = cv2.INTER_AREA)
    cv2.imwrite('imageresized_{}.jpg'.format(key), imageresized)
    
# Use this code to resize all the images in the directory (use it only once)
# waldoFilesnew = np.array(glob("waldo_dataset/test/waldo/*"))
# key = 0    
# for image_file in waldoFilesnew:
#     resize_img(image_file, key)
#     key += 1

In [34]:
'''
CREATE THE SVM CLASSIFIER
'''
from skimage.feature import hog
import skimage as skimage

# Calculate HOGs for all the images in the training set
hog_descriptors = []
for img in all_files:
    timg = skimage.color.rgb2grey(cv2.imread(img))
    hog_descriptors.append(skimage.feature.hog(timg, orientations=9, pixels_per_cell=(8, 8), 
                                      cells_per_block=(3, 3), 
                                      visualise=False, 
                                      transform_sqrt=False, 
                                      feature_vector=True, 
                                      normalise=None))

# Calculate HOGs for all the images in the testing set
hog_descriptors_test = []
for img in test_files:
    timg = skimage.color.rgb2grey(cv2.imread(img))
    hog_descriptors_test.append(skimage.feature.hog(timg, orientations=9, pixels_per_cell=(8, 8), 
                                      cells_per_block=(3, 3), 
                                      visualise=False, 
                                      transform_sqrt=False, 
                                      feature_vector=True, 
                                      normalise=None))  

/Users/ekaterina/anaconda/lib/python3.6/site-packages/skimage/feature/_hog.py:119: skimage_deprecation: Default value of `block_norm`==`L1` is deprecated and will be changed to `L2-Hys` in v0.15
  'be changed to `L2-Hys` in v0.15', skimage_deprecation)


In [96]:
# Train SVM model
from sklearn.svm import SVC
from time import time

t0=time()
clf = SVC(probability=True, random_state=169)
clf.fit(hog_descriptors, all_targets)
t1=time()
print('The model was trained in ' + str(t1 - t0) + ' seconds')

The model was trained in 15.020182132720947 seconds


In [97]:
'''
TEST THE SVM CLASSIFIER
'''
count_correct = 0
threshold = 0.5
for i in range(len(hog_descriptors_test)):
    # waldo probability
    waldo_prob = np.squeeze(clf.predict_proba(hog_descriptors_test[i].reshape(1, -1)))[1]
    if ((waldo_prob < threshold and test_targets[i] == 0) or (waldo_prob >= threshold and test_targets[i] == 1)):
        count_correct += 1
print('the accuracy score of the SVM classifier is ', count_correct / len(test_files))

the accuracy score of the SVM classifier is  0.8


/Users/ekaterina/anaconda/lib/python3.6/site-packages/skimage/feature/_hog.py:119: skimage_deprecation: Default value of `block_norm`==`L1` is deprecated and will be changed to `L2-Hys` in v0.15
  'be changed to `L2-Hys` in v0.15', skimage_deprecation)


In [105]:
'''
CREATE THE CNN CLASSIFIER
'''

# Preprocess the images in order to use Keras with tensorflow backend
# (I took this code from the dog breed classification project)

def path_to_tensor(img_path):
    # loads RGB image as PIL.Image.Image type
    img = image.load_img(img_path, target_size=(32, 32))
    # convert PIL.Image.Image type to 3D tensor with shape (64, 64, 3)
    x = image.img_to_array(img)
    # convert 3D tensor to 4D tensor with shape (1, 64, 64, 3) and return 4D tensor
    result = np.expand_dims(x, axis=0)
    return result

def paths_to_tensor(img_paths):
    list_of_tensors = [path_to_tensor(img_path) for img_path in tqdm(img_paths)]
    return np.vstack(list_of_tensors)

train_tensors = paths_to_tensor(all_files).astype('float32')/255
test_tensors = paths_to_tensor(test_files).astype('float32')/255


  0%|          | 0/5946 [00:00<?, ?it/s][A
  6%|▌         | 345/5946 [00:00<00:01, 3402.96it/s][A
 12%|█▏        | 702/5946 [00:00<00:01, 3490.02it/s][A
 18%|█▊        | 1074/5946 [00:00<00:01, 3562.64it/s][A
 24%|██▍       | 1446/5946 [00:00<00:01, 3600.39it/s][A
 31%|███       | 1817/5946 [00:00<00:01, 3621.64it/s][A
 36%|███▋      | 2158/5946 [00:00<00:01, 3585.63it/s][A
 42%|████▏     | 2512/5946 [00:00<00:00, 3578.97it/s][A
 49%|████▊     | 2887/5946 [00:00<00:00, 3599.46it/s][A
 55%|█████▍    | 3250/5946 [00:00<00:00, 3603.10it/s][A
 61%|██████    | 3603/5946 [00:01<00:00, 3594.29it/s][A
 67%|██████▋   | 3973/5946 [00:01<00:00, 3604.88it/s][A
 73%|███████▎  | 4331/5946 [00:01<00:00, 3601.83it/s][A
 79%|███████▉  | 4694/5946 [00:01<00:00, 3603.53it/s][A
 85%|████████▍ | 5054/5946 [00:01<00:00, 3602.87it/s][A
 91%|█████████▏| 5438/5946 [00:01<00:00, 3618.25it/s][A
 98%|█████████▊| 5822/5946 [00:01<00:00, 3631.73it/s][A
100%|██████████| 5946/5946 [00:01<00:00, 3627

In [106]:
# Declare the convolutional neural network
from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D
from keras.layers import Dropout, Flatten, Dense
from keras.models import Sequential

model = Sequential()

model.add(Conv2D(filters=16, kernel_size=4, strides=1, padding='same', activation='relu', input_shape=(32, 32, 3)))
model.add(MaxPooling2D(pool_size=2))
model.add(Conv2D(filters=32, kernel_size=4, padding='same', activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Conv2D(filters=64, kernel_size=4, padding='same', activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_19 (Conv2D)           (None, 32, 32, 16)        784       
_________________________________________________________________
max_pooling2d_19 (MaxPooling (None, 16, 16, 16)        0         
_________________________________________________________________
conv2d_20 (Conv2D)           (None, 16, 16, 32)        8224      
_________________________________________________________________
max_pooling2d_20 (MaxPooling (None, 8, 8, 32)          0         
_________________________________________________________________
conv2d_21 (Conv2D)           (None, 8, 8, 64)          32832     
_________________________________________________________________
max_pooling2d_21 (MaxPooling (None, 4, 4, 64)          0         
_________________________________________________________________
flatten_7 (Flatten)          (None, 1024)              0         
__________

In [107]:
# Compile the model
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

In [108]:
from keras.callbacks import ModelCheckpoint
# Use this to save the model with the best score for future retraining
checkpointer = ModelCheckpoint(filepath='saved_models/best.hdf5', 
                               verbose=1, save_best_only=True)

# Train the model
model.fit(train_tensors, all_targets, 
          validation_split=0.2,
          epochs=4, 
          batch_size=20, 
          callbacks=[checkpointer], 
          verbose=1)

Train on 4756 samples, validate on 1190 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x12e09cba8>

In [11]:
'''
Test the CNN Classifier
'''
def img_to_tensor(img):
    result = np.expand_dims(img, axis=0)
    return result

def predict_waldo(img):
    x = img_to_array(img)
    x = x.reshape((1,) + x.shape)
    return np.squeeze(model.predict(x))

In [None]:
count_correct = 0
threshold = 0.6
for i in range(len(test_files)):
    file_path = test_files[i]
    file = load_img(file_path, target_size=(32, 32))
    file = image.img_to_array(file).astype('float32')/255
    score = predict_waldo(file)
    if ((score < threshold and test_targets[i] == 0) or (score >= threshold and test_targets[i] == 1)):
        count_correct += 1
        
print('the accuracy score of the CNN classifier is ', count_correct / len(test_files))

In [3]:
'''
OBJECT DETECTION PART
'''
# Note: since CNN showed better results than SVM with HOG I'm going to use CNN for the object detection part

# Create a sliding window
def sliding_window(image, stepSize, windowSize):
    # slide a window across the image
    for y in range(0, image.shape[0], stepSize):
        for x in range(0, image.shape[1], stepSize):
            # yield the current window
            yield (x, y, image[y:y + windowSize[1], x:x + windowSize[0]])

# Create the image pyramid for more accurate results
def pyramid(image, scale, minSize=(30, 30)):
    # yield the original image
    yield(image)
 
    # keep looping over the pyramid
    while True:
        # compute the new dimensions of the image and resize it
        w = int(image.shape[1] / scale)
        image = resize_im(image, width=w)
 
        # if the resized image does not meet the supplied minimum
        # size, then stop constructing the pyramid
        if image.shape[0] < minSize[1] or image.shape[1] < minSize[0]:
            break
 
        # yield the next image in the pyramid
        yield(image)

In [4]:
''' Save image from a window for hard negative mining'''
img = image.load_img('test.jpg', target_size=(32, 32))
img = image.img_to_array(img).astype('float32')/255
def save_img(img, key):    
#     cv2.imwrite('waldo_dataset/hard_neg/hardmin_{}.jpg'.format(key), cv2.cvtColor(img.astype('float32')*255, cv2.COLOR_RGB2BGR))
    cv2.imwrite('waldo_dataset/hard_neg9/notwaldo/hardmin9_{}.jpg'.format(key), img.astype('float32')*255)

# save_img(img)

In [None]:
def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall))

In [None]:
'''
TEST THE OBJECT DETECTION PART
'''
originalFiles = np.array(glob("waldo_dataset/original_images/*"))
threshold = 0.5

from time import sleep
image = cv2.imread(originalFiles[9])
winW = 32
winH = 32
y_pred = []
key = 0

''' CHOOSE THE MODEL HERE '''
model = load_model('saved_models/best.hdf5')
# Probability at which images are going to be saved for hard negative mining
percentage_chance = 0.6
for resized in pyramid(image, scale=0.5):
    for (x, y, window) in sliding_window(resized, stepSize=16, windowSize=(winW, winH)):
            # if the window does not meet our desired window size, ignore it
            window = window.astype('float32')/255
            if window.shape[0] != winH or window.shape[1] != winW:
                continue
            if predict_waldo(window) > threshold:
                if random.random() < percentage_chance:
                    save_img(window, key)
                cv2.rectangle(image, (x, y), (x + winW, y + winH), (255, 0, 0), 2)
                key += 1
                
            # since we do not have a classifier, we'll just draw the window
            clone = resized.copy()
            cv2.rectangle(clone, (x, y), (x + winW, y + winH), (0, 255, 0), 2)
            cv2.imshow("Window", clone)
            cv2.waitKey(33)
            sleep(0.025)

In [110]:
'''
Implement Hard Negative Mining
'''
# Upload the images generated during hard negative miining
hard_neg9_files, hard_neg9_targets = load_dataset('waldo_dataset/hard_neg9')
hard_neg9_train_tensors = paths_to_tensor(hard_neg9_files).astype('float32')/255

# Load the pretrained model
model = load_model('saved_models/best.hdf5')

checkpointer = ModelCheckpoint(filepath='saved_models/hard_neg9.hdf5', 
                               verbose=1, save_best_only=True)

# Retrain the model
model.fit(hard_neg9_train_tensors, hard_neg9_targets, 
          validation_split=0.2,
          epochs=4, 
          batch_size=20, 
          callbacks=[checkpointer], 
          verbose=1)


  0%|          | 0/424 [00:00<?, ?it/s][A
 84%|████████▍ | 358/424 [00:00<00:00, 3552.11it/s][A
100%|██████████| 424/424 [00:00<00:00, 3507.78it/s][A

Train on 339 samples, validate on 85 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x12da8b128>