# Training images using scikit-learn 

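Extracting HOG (Histogram of Oriented Gradients) features from the Yelp photos with scikit-image, and saving them to HDF5 files for training.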

Reference: 
* http://scikit-image.org/docs/dev/auto_examples/plot_hog.html
* http://www.vlfeat.org/overview/hog.html
* http://nbviewer.jupyter.org/github/BVLC/caffe/blob/master/examples/00-classification.ipynb



In [8]:
import os
import time 
import numpy as np 
import pandas as pd
import h5py

import matplotlib.pyplot as plt

from skimage.feature import hog
from skimage.transform import resize
from skimage import io, data, color, exposure

In [2]:
# Root directory of the Yelp project data (CSV files and photo folders).
# The default is the original machine-specific location; set the
# YELP_DATA_DIR environment variable to run the notebook somewhere else.
DATA = os.environ.get('YELP_DATA_DIR', '/Users/edison/Desktop/Yelp/project/')
# Paths are built elsewhere by plain string concatenation (DATA + 'name'),
# so the root must end with a separator.
if not DATA.endswith('/'):
    DATA += '/'
# Folder holding the training photos, stored as <photo_id>.jpg.
PHOTOS_PATH = DATA + 'train_photos/'

* Code to demonstrate what the HOG features look like. This portion does not affect the results of the image classification.

In [2]:
# Load one sample photo and convert it to grayscale for the HOG demo.
image = io.imread(PHOTOS_PATH + '10.jpg')
image = color.rgb2gray(image)
# visualise=True makes hog() return a second value, used below as an image
# rendering of the descriptor alongside the feature vector `fd`.
fd, hog_image = hog(image, orientations=8, pixels_per_cell=(16, 16), cells_per_block=(1, 1), visualise=True)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4), sharex=True, sharey=True)


#input image
ax1.axis('off')
ax1.imshow(image, cmap=plt.cm.gray)
ax1.set_title('Input image')
ax1.set_adjustable('box-forced')


#output image
# Stretch the [0, 0.02] intensity range of the HOG rendering before display.
hog_image = exposure.rescale_intensity(hog_image, in_range=(0, 0.02))
ax2.axis('off')
ax2.imshow(hog_image, cmap=plt.cm.gray)
ax2.set_title('Histogram of Oriented Gradients')
# Apply the adjustable setting to the right-hand panel as well, so both
# shared-axis panels are configured the same way.
ax2.set_adjustable('box-forced')
plt.show()

Given the path to an image file, this function converts the image to grayscale, resizes it to 256 x 256, and returns its HOG feature vector.

In [3]:
def extract_features(image_path):
    """Return the HOG descriptor of the image stored at ``image_path``.

    The image is read from disk, converted to grayscale and resized to
    256 x 256 (to run faster) before the descriptor is computed with
    8 orientations, 16 x 16 pixel cells and 1 x 1 cell blocks.

    Parameters
    ----------
    image_path : path of the image file to read.

    Returns
    -------
    The value returned by ``hog`` for the resized grayscale image (no
    visualisation image is requested).
    """
    grayscale = color.rgb2gray(io.imread(image_path))
    resized = resize(grayscale, (256, 256))  # resize to run faster
    descriptor = hog(
        resized,
        orientations=8,
        pixels_per_cell=(16, 16),
        cells_per_block=(1, 1),
    )
    return descriptor


* Initialise h5py files

In [13]:
# Create (or overwrite) the HDF5 output file with two empty, growable datasets:
#   'photo_id' - fixed-width 54-byte strings, one entry per image
#   'feature'  - one row of 2048 values per image
# NOTE(review): the training image paths built in this notebook are 57-58
# characters long, i.e. longer than 54 bytes, so they are presumably truncated
# when stored - confirm that downstream readers expect this.
# The context manager closes the file even if dataset creation fails.
with h5py.File(DATA+'train_image_HOGfeatures.h5','w') as f:
    f.create_dataset('photo_id',(0,), maxshape=(None,),dtype='|S54')
    f.create_dataset('feature',(0,2048), maxshape = (None,2048))

In [14]:
# Load the photo -> business mapping of the training set and preview it.
train_csv_path = DATA + 'train_photo_to_biz_ids.csv'
train_photos_csv = pd.read_csv(train_csv_path)
train_photos_csv.head()

Unnamed: 0,photo_id,business_id
0,204149,3034
1,52779,2805
2,278973,485
3,195284,485
4,19992,485


In [6]:
# Full path of every training photo, in the same order as the CSV rows.
train_images = [
    os.path.join(PHOTOS_PATH, str(photo_id) + '.jpg')
    for photo_id in train_photos_csv['photo_id']
]

In [15]:
# Spot-check a few of the generated image paths.
train_images[1:5]

['/Users/edison/Desktop/Yelp/project/train_photos/52779.jpg',
 '/Users/edison/Desktop/Yelp/project/train_photos/278973.jpg',
 '/Users/edison/Desktop/Yelp/project/train_photos/195284.jpg',
 '/Users/edison/Desktop/Yelp/project/train_photos/19992.jpg']

In [8]:
# Spot-check a single generated image path.
train_images[1]

'/Users/edison/Desktop/Yelp/project/train_photos/52779.jpg'

In [9]:
# Training Images
# Compute the HOG descriptor of every training photo and append it, together
# with the photo's path, to the existing 'train_image_HOGfeatures.h5' file.
# The single-argument print(...) form gives the same output under Python 2 and 3.
print("Number of training images:  %d" % len(train_images))
start = time.time()
counter = 0
# Open the output file once for the whole run rather than once per image; the
# context manager guarantees it is closed even if an image fails to process.
with h5py.File(DATA + 'train_image_HOGfeatures.h5', 'r+') as f:
    photo_ids = f['photo_id']
    features = f['feature']
    for i, image_path in enumerate(train_images):
        feature = extract_features(image_path)
        counter = i + 1
        # Grow each dataset by one row, then fill the newly added last row.
        photo_ids.resize((counter,))
        photo_ids[i] = image_path
        features.resize((counter, feature.shape[0]))
        features[i, :] = feature
        if counter%10000==0 or counter==len(train_images):
            # Flush at every checkpoint so an interrupted run loses at most
            # the rows written since the last progress message.
            f.flush()
            print("Images Processed  %d" % counter)


end = time.time()
print('\nTime taken for feature extraction (HOG): %fs' % (end - start))

Number of training images:  234842
Images Processed  10000
Images Processed  20000
Images Processed  30000
Images Processed  40000
Images Processed  50000
Images Processed  60000
Images Processed  70000
Images Processed  80000
Images Processed  90000
Images Processed  100000
Images Processed  110000
Images Processed  120000
Images Processed  130000
Images Processed  140000
Images Processed  150000
Images Processed  160000
Images Processed  170000
Images Processed  180000
Images Processed  190000
Images Processed  200000
Images Processed  210000
Images Processed  220000
Images Processed  230000
Images Processed  234842

Time taken for feature extraction (HOG): 7207.196549s


### Extract feature function 
* photo_to_biz: photo ID to business ID (csv file) 
* directory (e.g. train_photos/) 
* output filename for the h5 file

Generic function that extracts the HOG features of every listed photo and writes them to an output HDF5 (.h5) file.

In [29]:
def extract_feature_function(photo_to_biz, directory, output_file):
    """Extract HOG features for every photo listed in a CSV into an HDF5 file.

    Parameters
    ----------
    photo_to_biz : name of the CSV file (relative to ``DATA``) that maps photo
        IDs to business IDs; only its ``photo_id`` column is read.
    directory : photo folder relative to ``DATA`` (e.g. ``'train_photos/'``);
        images are expected at ``<directory>/<photo_id>.jpg``.
    output_file : name of the HDF5 file (relative to ``DATA``) to create. An
        existing file with that name is overwritten.

    The output file holds two datasets with one entry per photo, in CSV order:
    ``'photo_id'`` (the image path as a fixed-width 54-byte string) and
    ``'feature'`` (the 2048-value row returned by ``extract_features``).

    Returns None; progress and timing are printed.
    """
    # Read the photo list before touching the output file, so a missing or
    # malformed CSV does not wipe the results of a previous run.
    photos = pd.read_csv(DATA + photo_to_biz)
    folder = DATA + directory
    images = [os.path.join(folder, str(x)+'.jpg') for x in photos['photo_id']]

    total_count = len(images)

    # NOTE(review): the message says "training" even when the function is
    # used for other sets; kept unchanged to match previously recorded output.
    # The single-argument print(...) form prints identically on Python 2 and 3.
    print("Number of training images:  %d" % total_count)

    # Open the output file once for the whole run (instead of once per image);
    # the context manager closes it even if an image fails to process.
    with h5py.File(DATA + output_file, 'w') as f:
        # NOTE(review): paths longer than 54 bytes are presumably truncated by
        # the '|S54' dtype - confirm downstream readers expect this.
        photo_ids = f.create_dataset('photo_id',(0,), maxshape=(None,),dtype='|S54')
        features = f.create_dataset('feature',(0,2048), maxshape = (None,2048))

        start = time.time()
        for i, image_path in enumerate(images):
            feature = extract_features(image_path)
            counter = i + 1
            # Grow each dataset by one row, then fill the newly added last row.
            photo_ids.resize((counter,))
            photo_ids[i] = image_path
            features.resize((counter, feature.shape[0]))
            features[i, :] = feature
            if counter%20000==0 or counter==total_count:
                # Flush at every checkpoint so an interrupted run loses at
                # most the rows written since the last progress message.
                f.flush()
                print("Images Processed  %d" % counter)

    end = time.time()
    print('\nTime taken for feature extraction (HOG): %fs' % (end - start))

In [30]:
# Test data: extract HOG features for every photo listed in the test CSV.
extract_feature_function(
    photo_to_biz='test_photo_to_biz.csv',
    directory='test_photos/',
    output_file='test_image_HOGfeatures.h5',
)

Number of training images:  1190225
Images Processed  20000
Images Processed  40000
Images Processed  60000
Images Processed  80000
Images Processed  100000
Images Processed  120000
Images Processed  140000
Images Processed  160000
Images Processed  180000
Images Processed  200000
Images Processed  220000
Images Processed  240000
Images Processed  260000
Images Processed  280000
Images Processed  300000
Images Processed  320000
Images Processed  340000
Images Processed  360000
Images Processed  380000
Images Processed  400000
Images Processed  420000
Images Processed  440000
Images Processed  460000
Images Processed  480000
Images Processed  500000
Images Processed  520000
Images Processed  540000
Images Processed  560000
Images Processed  580000
Images Processed  600000
Images Processed  620000
Images Processed  640000
Images Processed  660000
Images Processed  680000
Images Processed  700000
Images Processed  720000
Images Processed  740000
Images Processed  760000
Images Processed  