In [14]:
from keras.applications.resnet50 import ResNet50
from keras.applications.imagenet_utils import preprocess_input
from multiprocessing import cpu_count
from multiprocessing.pool import ThreadPool
from scipy.misc import imread, imresize
import numpy as np
import os

# directory prefix prepended to every dataset entry when reading images
dataset_path = 'dataset/'
# directory prefix under which the extracted feature vectors are saved
vectors_path = 'vectors/'
# shape passed both to the image resize step and to the network as input_shape
image_shape = (224, 224, 3)

In [15]:
# load the dataset listing: one entry per line, surrounding whitespace removed
with open('dataset.txt') as f:
    dataset = [line.strip() for line in f]

In [16]:
# number of entries read from the dataset listing
len(dataset)

679688

In [18]:
# make sure the output directory of every vector file exists;
# the directory layout of the dataset entries is mirrored under vectors_path
vector_dirs = {vectors_path + os.path.dirname(s) for s in dataset}
for d in vector_dirs:
    if os.path.isdir(d):
        continue
    os.makedirs(d)

In [19]:
# ResNet50 with ImageNet weights and without its top layers, used as a feature extractor
feature_extractor = ResNet50(
    weights='imagenet',
    include_top=False,
    input_shape=image_shape,
)

In [22]:
def read_resize(fn):
    """Read the image `dataset_path + fn` in RGB mode and resize it to `image_shape`.

    fn: dataset entry (path relative to `dataset_path`).
    Returns the resized image as produced by `imresize`.
    """
    return imresize(imread(dataset_path + fn, mode='RGB'), image_shape)

# thread pool for loading images, one worker thread per reported CPU
pool = ThreadPool(processes=cpu_count())

In [23]:
batch_size = 64

# Walk the dataset in batches: load the images, run them through the network
# and save one feature vector per image under vectors_path, mirroring the
# dataset entry name (np.save appends '.npy' when the name lacks it).
for i in range(0, len(dataset), batch_size):
    batch = dataset[i: i+batch_size]
    # load/resize images in parallel threads
    x = pool.map(read_resize, batch)
    # preprocess
    x = np.array(x, dtype='float32')
    x = preprocess_input(x)
    # extract feature vectors
    y = feature_extractor.predict(x)
    # Reduce the network output to one vector per image by averaging over
    # all spatial positions. Indexing [0, 0] is only correct when the output
    # is a 1x1 map; on Keras versions where ResNet50(include_top=False)
    # returns a larger map (e.g. 7x7) it would silently keep just the
    # top-left cell. For a 1x1 map the mean equals the original value, and
    # an already pooled 2-D output passes through unchanged.
    # NOTE(review): assumes channels-last output (channels on the last axis),
    # consistent with image_shape = (224, 224, 3) - confirm the backend config.
    y = y.reshape(y.shape[0], -1, y.shape[-1]).mean(axis=1)
    # save vectors
    for j, fn in enumerate(batch):
        np.save(vectors_path + fn, y[j])

    # report progress every 16 batches
    if not i % (batch_size*16):
        print(i)

0
1024
2048
3072
4096
5120


KeyboardInterrupt: 