In [None]:
import numpy as np # liniear algebra
import pandas as pd # csv I/O
import os # file operations
import h5py # handle data that doesn't fit in memory

# Absolute Windows directories from which the train / test .jpg photos are read.
TRAIN_DIR = r'D:\LICENTA\train_photos'
TEST_DIR = r'D:\LICENTA\test_photos'
# Side length in pixels: photos are loaded as IMG_SIZE x IMG_SIZE before feature extraction.
IMG_SIZE = 224

# Directory in which the extracted-feature .h5 files are created.
DATA_DIR = r'D:\LICENTA\processed_data\size_224x224'

# Loaded from 'train.csv', resolved relative to the current working directory.
# NOTE(review): presumably the training labels table; not used by the cells visible here.
train_labels = pd.read_csv('train.csv')

In [None]:
%%time

# VGG16 (the predefined CNN) and the preprocessing that puts an image array in a
# format compatible with it come from the same module; `image` provides the
# helpers that load a photo and return its array representation.
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing import image

# ImageNet weights, no classifier head on top, max pooling on the output.
# The original author noted that building the model takes around 45 seconds.
vgg16_options = dict(weights='imagenet', include_top=False, pooling='max')
model = VGG16(**vgg16_options)

model.summary()

In [None]:
def extract_features(imgs_path, img_size):
    """Compute one feature vector per image using the module-level ``model``.

    Each image is loaded from disk resized to ``img_size`` x ``img_size``,
    converted to an array, given a leading batch axis, passed through
    ``preprocess_input`` and finally through ``model.predict``.

    Parameters
    ----------
    imgs_path : iterable of str
        Paths of the image files to process.
    img_size : int
        Side length (in pixels) the images are resized to when they are loaded.

    Returns
    -------
    numpy.ndarray
        One row per image, in the order of ``imgs_path``; each row is the
        flattened model output for that image. An empty ``imgs_path`` yields
        an empty array of shape ``(0,)``.
    """
    # Honour the caller-supplied size instead of silently using the global IMG_SIZE.
    target_size = (img_size, img_size)
    features = []

    for path in imgs_path:
        img = image.load_img(path, target_size=target_size)
        # The model is fed a batch, so add a leading axis of length 1.
        img_array = np.expand_dims(image.img_to_array(img), axis=0)
        img_array = preprocess_input(img_array)
        # Flatten the single-image prediction rather than hard-coding its length.
        features.append(model.predict(img_array).reshape(-1))

    return np.array(features)

In [None]:
# TRAIN PHOTOS

import time

# One definition of the output path, used both to create the file and to append to it.
train_h5_path = DATA_DIR + r'\train_images_vgg16_features.h5'

# Create (or truncate) the file with two empty datasets that can grow along axis 0.
with h5py.File(train_h5_path, 'w') as f:
    img_names = f.create_dataset('photo_id', shape=(0,), maxshape=(None,), dtype='|S54')
    feature = f.create_dataset('feature', shape=(0, 512), maxshape=(None, 512))

train_photos = pd.read_csv('train_photo_to_biz_ids.csv')
train_images = [
    os.path.join(TRAIN_DIR, '{}.jpg'.format(pid))
    for pid in train_photos['photo_id']
]

num_train = len(train_images)
print('Number of train images:', num_train)
batch_size = 2000
num_done = 0
t = time.time()

for i in range(0, num_train, batch_size):
    # Slicing clamps at the end of the list, so the last batch may be shorter.
    images = train_images[i:i + batch_size]
    features = extract_features(images, IMG_SIZE)
    num_done = i + len(features)

    # The file is reopened and closed for every batch; both datasets are grown
    # to num_done rows and the new rows are written at [i:num_done].
    with h5py.File(train_h5_path, 'r+') as f:
        ids_ds, feat_ds = f['photo_id'], f['feature']
        ids_ds.resize((num_done,))
        ids_ds[i:num_done] = np.array(images).astype('|S54')
        feat_ds.resize((num_done, features.shape[1]))
        feat_ds[i:num_done, :] = features

    if num_done == num_train or num_done % 2000 == 0:
        elapsed = '{0:.1f}'.format(time.time() - t)
        print('Train images proccesed:', num_done, 'time passed: ', elapsed, 'sec')


In [None]:
# Check the file content

# Define the file name once so the printed label always matches the file opened
# (the label previously read 'train_image_...' while the file is 'train_images_...').
train_h5_name = r'\train_images_vgg16_features.h5'

with h5py.File(DATA_DIR + train_h5_name, 'r') as f:
    print(train_h5_name)
    # List every top-level key in the file together with its shape.
    for key in f.keys():
        print(key, f[key].shape)

    # Show the first stored photo id and the start of its feature vector.
    print("\nA photo:", f['photo_id'][0])
    print("Its feature vector (first 10-dim): ", f['feature'][0][0:10], " ...")

In [None]:
# TEST PHOTOS

# One definition of the output path, used both to create the file and to append to it.
test_h5_path = DATA_DIR + r'\test_images_vgg16_features.h5'

# Create (or truncate) the file with two empty datasets that can grow along axis 0.
with h5py.File(test_h5_path, 'w') as f:
    img_names = f.create_dataset('photo_id', (0,), maxshape=(None,), dtype='|S54')
    feature = f.create_dataset('feature', (0, 512), maxshape=(None, 512))

test_photos = pd.read_csv('test_photo_to_biz.csv')
# .unique() so each photo id is processed only once.
test_images = [os.path.join(TEST_DIR, str(x) + '.jpg') for x in test_photos['photo_id'].unique()]

num_test = len(test_images)
print('Number of test images: ', num_test)
batch_size = 2000

for i in range(0, num_test, batch_size):
    images = test_images[i:min(i + batch_size, num_test)]
    features = extract_features(images, IMG_SIZE)
    num_done = i + features.shape[0]

    # Grow both datasets to num_done rows and write the new rows at [i:num_done].
    with h5py.File(test_h5_path, 'r+') as f:
        f['photo_id'].resize((num_done,))
        f['photo_id'][i:num_done] = np.array(images).astype('|S54')
        f['feature'].resize((num_done, features.shape[1]))
        f['feature'][i:num_done, :] = features

    # Compare against num_test (not num_train) so that the final batch is
    # reported and this cell does not depend on the train cell having run.
    if num_done % 20000 == 0 or num_done == num_test:
        print('Test images proccesed:', num_done)

In [None]:
# Check the file content

with h5py.File(DATA_DIR + r'\test_images_vgg16_features.h5', 'r') as f:
    # List every top-level key in the file together with its shape.
    for key in f.keys():
        print(key, f[key].shape)

    # print() function call: the Python 2 print statement used here before is a
    # SyntaxError under Python 3, which the rest of the notebook targets.
    print("\nA photo:", f['photo_id'][0])
    print("feature vector: (first 10-dim)", f['feature'][0][0:10], " ...")