In [2]:
import pandas as pd
import os
import time

import numpy as np
import matplotlib.pyplot as plt
import h5py
# import cv2
from skimage.feature import hog
from skimage.transform import resize
from skimage import io, data, color, exposure
# hog = cv2.HOGDescriptor()

In [5]:
def get_imlist(paths):
    return [os.path.join(path,f) for path in paths for f in os.listdir(path) if (f.endswith('.jpg') or f.endswith('.png'))]

In [6]:
def extract_hog_features(image_path):
    image = io.imread(image_path)
    image = color.rgb2gray(image)
    image_resized = resize(image, (256, 256))
    return hog(image_resized, orientations=8,
        pixels_per_cell=(16, 16), cells_per_block=(1, 1))

In [10]:
# extract image features and save it to .h5

# Initialize files
data_root = 'data/'

f = h5py.File(data_root+'train_image_HOGfeatures.h5','w')
filenames = f.create_dataset('photo_id',(0,), maxshape=(None,),dtype='|S54')
feature = f.create_dataset('feature',(0,2048), maxshape = (None,2048))
f.close()


train_photos = pd.read_csv(data_root + 'train_photo_to_biz_ids.csv')
train_folder = data_root+'train_photos/'
# train_images = [os.path.join(train_folder, str(x)+'.jpg') for x in train_photos['photo_id']]  # get full filename

train_images = get_imlist([train_folder])
num_train = len(train_images)
print "Number of training images: ", num_train

tic = time.time()

# Training Images
for i in range(0, num_train):
    feature = extract_hog_features(train_images[i])
    num_done = i+1
    f= h5py.File(data_root+'train_image_HOGfeatures.h5','r+')
    f['photo_id'].resize((num_done,))
    f['photo_id'][i] = train_images[i]
    f['feature'].resize((num_done,feature.shape[0]))
    f['feature'][i, :] = feature
    f.close()
    if num_done%10000==0 or num_done==num_train:
        print "Train images processed: ", num_done

toc = time.time()
print '\nFeatures extracted in %fs' % (toc - tic)

Number of training images:  8701
Train images processed:  8701

Features extracted in 276.779347s


In [11]:
f = h5py.File(data_root+'train_image_HOGfeatures.h5','r')
print 'train_image_features.h5:'
for key in f.keys():
    print key, f[key].shape
    
print "\nA photo:", f['photo_id'][0]
print "Its feature vector (first 10-dim): ", f['feature'][0][0:10], " ..."
f.close()

train_image_features.h5:
feature (8701, 2048)
photo_id (8701,)

A photo: data/train_photos/118032.jpg
Its feature vector (first 10-dim):  [ 0.11053389  0.17010261  0.17975943  0.09998228  0.1717571   0.07637832
  0.09547274  0.08248253  0.10122443  0.10356585]  ...


In [12]:
f = h5py.File(data_root+'test_image_HOGfeatures.h5','w')
filenames = f.create_dataset('photo_id',(0,), maxshape=(None,),dtype='|S54')
feature = f.create_dataset('feature',(0,2048), maxshape = (None,2048))
f.close()

test_photos = pd.read_csv(data_root+'test_photo_to_biz.csv')
test_folder = data_root+'test_photos/'
# test_images = [os.path.join(test_folder, str(x)+'.jpg') for x in test_photos['photo_id'].unique()]

test_images = get_imlist([test_folder])
num_test = len(test_images)
print "Number of test images: ", num_test

tic = time.time()

# Test Images
for i in range(0, num_test):
    feature = extract_hog_features(test_images[i])
    num_done = i+1
    f= h5py.File(data_root+'test_image_HOGfeatures.h5','r+')
    f['photo_id'].resize((num_done,))
    f['photo_id'][i] = test_images[i]
    f['feature'].resize((num_done,feature.shape[0]))
    f['feature'][i, :] = feature
    f.close()
    if num_done%20000==0 or num_done==num_test:
        print "Test images processed: ", num_done

toc = time.time()
print '\nFeatures extracted in %fs' % (toc - tic)  

Number of test images:  3807
Test images processed:  3807

Features extracted in 128.981083s
