In [1]:
import numpy as np # linear algebra
import pandas as pd # csv I/O
import os # file operations
import h5py # handle data that doesn't fit in memory

# Folders containing the raw photos; image paths are built as <dir>\<photo_id>.jpg.
# NOTE(review): these are absolute, machine-specific paths - adjust before running elsewhere.
TRAIN_DIR = r'D:\LICENTA\train_photos'
TEST_DIR = r'D:\LICENTA\test_photos'
# Side length (in pixels) that every photo is resized to when it is loaded.
IMG_SIZE = 224

# Folder where the extracted-feature HDF5 files are written.
DATA_DIR = r'D:\LICENTA\processed_data\size_224x224'

# Read from the notebook's working directory; not referenced again in the cells shown here.
train_labels = pd.read_csv('train.csv')

In [None]:
%%time

from keras.applications.vgg16 import VGG16 # predefined CNN Model
from keras.preprocessing import image # get the array representation of the image
from keras.applications.vgg16 import preprocess_input # get the array in a format compatible with the model

# takes around 45 seconds
model = VGG16(weights='imagenet', include_top=False, pooling='max')

model.summary()

In [2]:
%%time

from keras.applications.resnet50 import ResNet50
from keras.preprocessing import image
from keras.applications.resnet50 import preprocess_input

model = ResNet50(weights='imagenet', include_top=False, pooling='max')

model.summary()

Using TensorFlow backend.


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, None, None, 3) 0                                            
____________________________________________________________________________________________________
conv1 (Conv2D)                   (None, None, None, 64 9472        input_1[0][0]                    
____________________________________________________________________________________________________
bn_conv1 (BatchNormalization)    (None, None, None, 64 256         conv1[0][0]                      
____________________________________________________________________________________________________
activation_1 (Activation)        (None, None, None, 64 0           bn_conv1[0][0]                   
___________________________________________________________________________________________

res4c_branch2a (Conv2D)          (None, None, None, 25 262400      activation_28[0][0]              
____________________________________________________________________________________________________
bn4c_branch2a (BatchNormalizatio (None, None, None, 25 1024        res4c_branch2a[0][0]             
____________________________________________________________________________________________________
activation_29 (Activation)       (None, None, None, 25 0           bn4c_branch2a[0][0]              
____________________________________________________________________________________________________
res4c_branch2b (Conv2D)          (None, None, None, 25 590080      activation_29[0][0]              
____________________________________________________________________________________________________
bn4c_branch2b (BatchNormalizatio (None, None, None, 25 1024        res4c_branch2b[0][0]             
___________________________________________________________________________________________

In [3]:
# Tag embedded in the names of the HDF5 feature files written by the cells below.
MODEL_NAME = 'resnet50'
# Length of the flat feature vector stored per image (row width of the 'feature' dataset).
# Presumably the channel count of the pooled ResNet50 output - TODO confirm against
# model.summary(); it must be changed together with the model.
FEATURE_VECTOR_SIZE = 2048

In [4]:
def extract_features(imgs_path, img_size):
    """Compute one feature vector per image with the global ``model``.

    Each image is loaded from disk, resized, converted to an array,
    normalised with ``preprocess_input`` and passed through ``model`` on its
    own (batch of one).

    Parameters
    ----------
    imgs_path : iterable of str
        Paths of the image files to process.
    img_size : int
        Side length in pixels; every image is resized to
        ``img_size x img_size`` when loaded.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(number of images, FEATURE_VECTOR_SIZE)`` with one
        row per image, in input order. An empty input yields an array of
        shape ``(0, FEATURE_VECTOR_SIZE)``.
    """
    features = []

    for path in imgs_path:
        # Use the caller-supplied size rather than the global IMG_SIZE so the
        # parameter actually controls the resize.
        img = image.load_img(path, target_size=(img_size, img_size))
        img_array = image.img_to_array(img)
        # The model expects a leading batch axis.
        img_array = np.expand_dims(img_array, axis=0)
        img_array = preprocess_input(img_array)
        # Flatten the single-image prediction to a 1-D feature vector.
        features.append(model.predict(img_array).reshape(FEATURE_VECTOR_SIZE,))

    # reshape keeps the result 2-D even when no images were given, so callers
    # can always rely on `.shape[1]`.
    return np.array(features).reshape(-1, FEATURE_VECTOR_SIZE)

In [5]:
# TRAIN PHOTOS
#
# Extract a feature vector for every training photo and store the results in
# an HDF5 file, one batch at a time so only a single batch is held in memory.

import time
from datetime import datetime

train_h5_path = DATA_DIR + r'\train_images_{name}_features.h5'.format(name=MODEL_NAME)

# Opening with 'w' starts from an empty file; both datasets are created with
# zero rows and an unlimited first dimension so they can grow batch by batch.
with h5py.File(train_h5_path, 'w') as f:
    img_names = f.create_dataset('photo_id', shape=(0,), maxshape=(None,), dtype='|S54')
    feature = f.create_dataset('feature', shape=(0, FEATURE_VECTOR_SIZE), maxshape=(None, FEATURE_VECTOR_SIZE))

# One image path per row of the photo -> business mapping.
train_photos = pd.read_csv('train_photo_to_biz_ids.csv')
train_images = [os.path.join(TRAIN_DIR, '{}.jpg'.format(photo_id)) for photo_id in train_photos['photo_id']]

num_train = len(train_images)
print('Number of train images:', num_train, 'started at', str(datetime.now()))
batch_size = 4000
num_done = 0
t = time.time()

for i in range(0, num_train, batch_size):
    # Slicing clamps at the end of the list, so the last batch may be shorter.
    images = train_images[i:i + batch_size]
    features = extract_features(images, IMG_SIZE)
    num_done = i + len(features)

    # Re-open the file for each batch: grow both datasets to the new total,
    # then fill in the rows belonging to this batch.
    with h5py.File(train_h5_path, 'r+') as f:
        ids_ds, feat_ds = f['photo_id'], f['feature']
        ids_ds.resize((num_done,))
        ids_ds[i:num_done] = np.array(images).astype('|S54')
        feat_ds.resize((num_done, features.shape[1]))
        feat_ds[i:num_done, :] = features

    # Progress report after each full batch and after the final one.
    if num_done % batch_size == 0 or num_done == num_train:
        elapsed = '{0:.1f}'.format(time.time() - t)
        print('Train images proccesed:', num_done, 'time passed: ', elapsed, 'sec hour:', str(datetime.now()))


Number of train images: 234842 started at 2017-12-04 18:06:11.122812
Train images proccesed: 4000 time passed:  556.6 sec hour: 2017-12-04 18:15:27.693568
Train images proccesed: 8000 time passed:  1041.8 sec hour: 2017-12-04 18:23:32.892652
Train images proccesed: 12000 time passed:  1629.6 sec hour: 2017-12-04 18:33:20.686626
Train images proccesed: 16000 time passed:  2219.5 sec hour: 2017-12-04 18:43:10.605864
Train images proccesed: 20000 time passed:  2795.1 sec hour: 2017-12-04 18:52:46.235141
Train images proccesed: 24000 time passed:  3368.0 sec hour: 2017-12-04 19:02:19.082030
Train images proccesed: 28000 time passed:  3941.5 sec hour: 2017-12-04 19:11:52.574851
Train images proccesed: 32000 time passed:  4508.8 sec hour: 2017-12-04 19:21:19.931749
Train images proccesed: 36000 time passed:  5077.9 sec hour: 2017-12-04 19:30:48.994136
Train images proccesed: 40000 time passed:  5642.0 sec hour: 2017-12-04 19:40:13.101046
Train images proccesed: 44000 time passed:  6206.0 sec

In [None]:
# Check the file content
#
# Sanity check: list every dataset in the train feature file with its shape,
# then show the first stored photo path and the start of its feature vector.

train_h5_name = r'\train_images_{name}_features.h5'.format(name=MODEL_NAME)

with h5py.File(DATA_DIR + train_h5_name, 'r') as f:
    print(train_h5_name)
    for key in f:
        print(key, f[key].shape)

    first_photo = f['photo_id'][0]
    first_vector = f['feature'][0]
    print("\nA photo:", first_photo)
    print("Its feature vector (first 10-dim): ", first_vector[0:10], " ...")

In [None]:
# TEST PHOTOS
#
# Extract a feature vector for every distinct test photo and store the results
# in an HDF5 file, one batch at a time so only a single batch is held in memory.

test_h5_path = DATA_DIR + r'\test_images_{name}_features.h5'.format(name=MODEL_NAME)

# Opening with 'w' starts from an empty file; both datasets are created with
# zero rows and an unlimited first dimension so they can grow batch by batch.
with h5py.File(test_h5_path, 'w') as f:
    img_names = f.create_dataset('photo_id', (0,), maxshape=(None,), dtype='|S54')
    feature = f.create_dataset('feature', (0, FEATURE_VECTOR_SIZE), maxshape=(None, FEATURE_VECTOR_SIZE))

# .unique() drops repeated photo ids so each photo is processed only once.
test_photos = pd.read_csv('test_photo_to_biz.csv')
test_images = [os.path.join(TEST_DIR, str(x) + '.jpg') for x in test_photos['photo_id'].unique()]

num_test = len(test_images)
print('Number of test images: ', num_test)
batch_size = 4000

for i in range(0, num_test, batch_size):
    # The last batch may be shorter than batch_size.
    images = test_images[i:min(i + batch_size, num_test)]
    features = extract_features(images, IMG_SIZE)
    num_done = i + features.shape[0]

    # Grow both datasets to the new total, then fill in this batch's rows.
    with h5py.File(test_h5_path, 'r+') as f:
        f['photo_id'].resize((num_done,))
        f['photo_id'][i:num_done] = np.array(images).astype('|S54')
        f['feature'].resize((num_done, features.shape[1]))
        f['feature'][i:num_done,:] = features

    # Compare against num_test (not the train count) so the final, possibly
    # partial, batch is reported and the cell does not depend on the train cell.
    if num_done % batch_size == 0 or num_done == num_test:
        print('Test images proccesed:', num_done)

In [None]:
# Check the file content
#
# Sanity check: list every dataset in the test feature file with its shape,
# then show the first stored photo path and the start of its feature vector.

with h5py.File(DATA_DIR + r'\test_images_{name}_features.h5'.format(name=MODEL_NAME),'r') as f:
    for key in f.keys():
        print(key, f[key].shape)

    # print() must be called as a function: the statement form is a
    # SyntaxError on the Python 3 kernel this notebook runs on.
    print("\nA photo:", f['photo_id'][0])
    print("feature vector: (first 10-dim)", f['feature'][0][0:10], " ...")