In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from os import listdir
from os.path import isfile, join
from lxml import etree

from PIL import Image
from PIL import ImageOps as IO
import cv2

from sklearn.cluster import KMeans

In [2]:
# Define data directories
my_data = "../data"
annotations = my_data + '/Annotation'        # BO file to be merged with additional information
images = my_data + '/Images' # SAP file with order fulfilment dates
cropped = my_data + '/Cropped'

In [3]:
# Convert all pictures into cropped and autocontrasted version
# Explore all picture folders
folders = [f for f in listdir(annotations) if (~isfile(join(annotations, f)) &
                                                f.lower().startswith('n02'))]

for folder in folders:
    print(folder)
    current_folder = join(annotations, folder)
    files = [f for f in listdir(current_folder) if (isfile(join(current_folder, f)))]
    for file in files:
        # Read each picture
        current_file = join(current_folder, file)      
        picture_file = join(images, folder, file + '.jpg')
        img = Image.open(picture_file) 
        
        # Get crop region for each picture
        tree = etree.parse(current_file)
        for coord in tree.xpath("/annotation/object/bndbox"):
            xmin_el = int(coord.xpath("xmin")[0].text)
            xmax_el = int(coord.xpath("xmax")[0].text)
            ymin_el = int(coord.xpath("ymin")[0].text)
            ymax_el = int(coord.xpath("ymax")[0].text)
        
        img_cropped = img.crop((xmin_el,ymin_el,xmax_el,ymax_el))
        try:
            img_cropped = IO.grayscale(img_cropped)
            img_cropped = IO.equalize(img_cropped)
            # img_cropped = IO.autocontrast(img_cropped, 1)
        except 'OSError':
            pass
       
        out_picture_file = join(cropped, folder, file + '.jpg')
        img_cropped.save(out_picture_file)


n02085620-Chihuahua
n02085782-Japanese_spaniel
n02085936-Maltese_dog
n02086079-Pekinese
n02086240-Shih-Tzu
n02086646-Blenheim_spaniel
n02086910-papillon
n02087046-toy_terrier
n02087394-Rhodesian_ridgeback
n02088094-Afghan_hound
n02088238-basset
n02088364-beagle
n02088466-bloodhound
n02088632-bluetick
n02089078-black-and-tan_coonhound
n02089867-Walker_hound
n02089973-English_foxhound
n02090379-redbone
n02090622-borzoi
n02090721-Irish_wolfhound
n02091032-Italian_greyhound
n02091134-whippet
n02091244-Ibizan_hound
n02091467-Norwegian_elkhound
n02091635-otterhound
n02091831-Saluki
n02092002-Scottish_deerhound
n02092339-Weimaraner
n02093256-Staffordshire_bullterrier
n02093428-American_Staffordshire_terrier
n02093647-Bedlington_terrier
n02093754-Border_terrier
n02093859-Kerry_blue_terrier
n02093991-Irish_terrier
n02094114-Norfolk_terrier
n02094258-Norwich_terrier
n02094433-Yorkshire_terrier
n02095314-wire-haired_fox_terrier
n02095570-Lakeland_terrier
n02095889-Sealyham_terrier
n02096051-Aired

In [40]:
# List number of pictures per folder

folders = [f for f in listdir(annotations) if (~isfile(join(annotations, f)) &
                                                f.lower().startswith('n02'))]

for folder in folders:
    current_folder = join(annotations, folder)
    files = [f for f in listdir(current_folder) if (isfile(join(current_folder, f)))]
    print("{0} images for {1}".format(len(files), folder))
    

152 images for n02085620-Chihuahua
185 images for n02085782-Japanese_spaniel
252 images for n02085936-Maltese_dog
149 images for n02086079-Pekinese
214 images for n02086240-Shih-Tzu
188 images for n02086646-Blenheim_spaniel
196 images for n02086910-papillon
172 images for n02087046-toy_terrier
172 images for n02087394-Rhodesian_ridgeback
239 images for n02088094-Afghan_hound
175 images for n02088238-basset
195 images for n02088364-beagle
187 images for n02088466-bloodhound
171 images for n02088632-bluetick
159 images for n02089078-black-and-tan_coonhound
153 images for n02089867-Walker_hound
157 images for n02089973-English_foxhound
148 images for n02090379-redbone
151 images for n02090622-borzoi
218 images for n02090721-Irish_wolfhound
182 images for n02091032-Italian_greyhound
187 images for n02091134-whippet
188 images for n02091244-Ibizan_hound
196 images for n02091467-Norwegian_elkhound
151 images for n02091635-otterhound
200 images for n02091831-Saluki
232 images for n02092002-Sc

In [50]:
def gen_sift_features(gray_img):
    sift = cv2.xfeatures2d.SIFT_create()
    # kp is the keypoints
    #
    # desc is the SIFT descriptors, they're 128-dimensional vectors
    # that we can use for our final features
    kp, desc = sift.detectAndCompute(gray_img, None)
    return kp, desc

def show_sift_features(gray_img, color_img, kp):
    return plt.imshow(cv2.drawKeypoints(gray_img, kp, color_img.copy()))

def load_img(img_folder, img_file):
    picture_file = join(cropped, img_folder, img_file)
    img = Image.open(picture_file)
    img_sift = np.array(img)
    return img_sift

folder = "n02085782-Japanese_spaniel"
dog_file = "n02085782_23.jpg"
# generate SIFT keypoints and descriptors
dog1_img = load_img(folder, dog_file)
dog1_kp, dog1_desc = gen_sift_features(dog1_img)

dog_file = "n02085782_3030.jpg"
dog2_img = load_img(folder, dog_file)
dog2_kp, dog2_desc = gen_sift_features(dog2_img)

print ('SIFT Features:')
show_sift_features(dog1_img, dog1_desc, dog1_kp)



SIFT Features:


<matplotlib.image.AxesImage at 0xd2bd1d0>

In [51]:
len(dog1_desc[0])


128

In [52]:

n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(dog1_desc)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=10, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [53]:
clusters = kmeans.labels_

In [54]:
len(clusters)

317

In [66]:
np.unique(clusters, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([32, 35, 35, 32, 36, 20, 30, 17, 41, 39], dtype=int64))

In [67]:
img_histogram_values, img_histogram_count  = np.unique(clusters, return_counts=True)
norm_histogram = np.divide(img_histogram_count, float(len(clusters)))
norm_histogram

array([ 0.10094637,  0.11041009,  0.11041009,  0.10094637,  0.11356467,
        0.06309148,  0.09463722,  0.05362776,  0.12933754,  0.12302839])

In [68]:
img_histogram_values

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [62]:
img_histogram

(array([32, 35, 35, 32, 36, 20, 30, 17, 41, 39], dtype=int64),
 array([  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.]))

In [61]:
img_histogram = np.histogram(clusters, bins=n_clusters, range=(0,n_clusters))
norm_histogram = np.divide(img_histogram, np.array([len(clusters), 1.0]))
norm_histogram

array([ array([ 0.10094637,  0.11041009,  0.11041009,  0.10094637,  0.11356467,
        0.06309148,  0.09463722,  0.05362776,  0.12933754,  0.12302839]),
       array([  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.])], dtype=object)

In [15]:
np.divide(img_histogram, np.array([27, 1.0]))

array([ array([ 0.85185185,  1.07407407,  1.59259259,  1.18518519,  1.18518519,
        0.88888889,  0.96296296,  1.22222222,  1.03703704,  1.74074074]),
       array([  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.])], dtype=object)

In [16]:
kmeans.cluster_centers_

array([[ 26.39130592,  27.17391205,  21.04347801, ...,  17.86956406,
         28.47826004,  19.60869598],
       [ 14.17241383,   9.00000477,   5.10344315, ...,  15.44827461,
          3.62069178,   5.37931061],
       [ 12.37208843,  22.93023491,  26.65116501, ...,  25.62790489,
         22.79069519,  17.1860466 ],
       ..., 
       [ 20.42424393,  10.00000381,  10.96969604, ...,   8.30302906,
         10.39393902,  16.39393997],
       [ 33.6071434 ,  41.28572083,  33.99999619, ...,   9.        ,
         12.21428394,  24.7142868 ],
       [ 19.72340584,   9.68085575,   8.12766361, ...,   3.85106659,
          5.1914897 ,  11.95744705]], dtype=float32)

In [17]:
show_sift_features(dog2_img, dog2_desc, dog2_kp)

<matplotlib.image.AxesImage at 0x9955470>

In [8]:
import scipy.io
mat_Train = scipy.io.loadmat(my_data + '/train_list.mat')

In [9]:
mat_Train


{'__globals__': [],
 '__header__': 'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sun Oct  9 08:36:13 2011',
 '__version__': '1.0',
 'annotation_list': array([[array([u'n02085620-Chihuahua/n02085620_5927'],
       dtype='<U34')],
        [array([u'n02085620-Chihuahua/n02085620_4441'],
       dtype='<U34')],
        [array([u'n02085620-Chihuahua/n02085620_1502'],
       dtype='<U34')],
        ..., 
        [ array([u'n02116738-African_hunting_dog/n02116738_6754'],
       dtype='<U44')],
        [ array([u'n02116738-African_hunting_dog/n02116738_9333'],
       dtype='<U44')],
        [ array([u'n02116738-African_hunting_dog/n02116738_2503'],
       dtype='<U44')]], dtype=object),
 'file_list': array([[array([u'n02085620-Chihuahua/n02085620_5927.jpg'],
       dtype='<U38')],
        [array([u'n02085620-Chihuahua/n02085620_4441.jpg'],
       dtype='<U38')],
        [array([u'n02085620-Chihuahua/n02085620_1502.jpg'],
       dtype='<U38')],
        ..., 
        [ array([u'n02116738-A

In [10]:
dogs_annotation = []

for img_annotation in mat_Train['annotation_list']:
    dog_annotation = str(img_annotation[0][0])
    dogs_annotation.append(dog_annotation)

In [11]:
dogs_full_path = []
dogs_folder = []
dogs_file = []

for img_file in mat_Train['file_list']:
    dog_full_path = str(img_file[0][0])
    folder, file =  dog_full_path.split('/')
    dogs_full_path.append(dog_full_path)
    dogs_folder.append(folder)
    dogs_file.append(file)

In [76]:
train_df = pd.DataFrame({'file' : dogs_file,
                         'folder' : dogs_folder,
                         'full_path' : dogs_full_path,
                         'annotation' : dogs_annotation,
                         'centroid' : np.NaN,
                         'histogram' : np.NaN})

In [13]:
train_df_labels = pd.DataFrame(mat_Train['labels'], columns = ['label'])

In [85]:
def load_img(img_folder, img_file):
    picture_file = join(images, img_folder, img_file)
    img = Image.open(picture_file)
    img = IO.grayscale(img)
    img = IO.equalize(img)
    img_sift = np.array(img)
    return img_sift

In [96]:
descriptors_list = []
img_index = 0

sift = cv2.xfeatures2d.SIFT_create()

for dog_folder, dog_file in np.array(train_df[['folder', 'file']]):
    print(dog_file)
    gray_img = load_img(dog_folder, dog_file)

    # kp is the keypoints
    #
    # desc is the SIFT descriptors, they're 128-dimensional vectors
    keypoints, descriptors = sift.detectAndCompute(gray_img, None)
    descriptors_list.append(descriptors)
    img_index += 1
    
print("{0} images processed".format(img_index))


n02085620_5927.jpg
n02085620_4441.jpg
n02085620_1502.jpg
n02085620_1916.jpg
n02085620_13151.jpg
n02085620_1569.jpg
n02085620_9654.jpg
n02085620_3975.jpg
n02085620_3942.jpg
n02085620_9351.jpg
n02085620_574.jpg
n02085620_4998.jpg
n02085620_7.jpg
n02085620_1617.jpg
n02085620_11140.jpg
n02085620_5771.jpg
n02085620_275.jpg
n02085620_9399.jpg
n02085620_10621.jpg
n02085620_4266.jpg
n02085620_7738.jpg
n02085620_3110.jpg
n02085620_12101.jpg
n02085620_8558.jpg
n02085620_7700.jpg
n02085620_3208.jpg
n02085620_5312.jpg
n02085620_1152.jpg
n02085620_4572.jpg
n02085620_2706.jpg
n02085620_473.jpg
n02085620_1455.jpg
n02085620_7440.jpg
n02085620_9414.jpg
n02085620_2208.jpg
n02085620_3593.jpg
n02085620_2921.jpg
n02085620_500.jpg
n02085620_9357.jpg
n02085620_949.jpg
n02085620_3423.jpg
n02085620_3651.jpg
n02085620_3407.jpg
n02085620_2973.jpg
n02085620_326.jpg
n02085620_7292.jpg
n02085620_4814.jpg
n02085620_7613.jpg
n02085620_2614.jpg
n02085620_1558.jpg
n02085620_11238.jpg
n02085620_3838.jpg
n02085620_4673.j

KeyboardInterrupt: 

In [101]:
descriptors_list

89

In [None]:
n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters)
# Number of descriptors should be more than n_clusters
    if len(desc) > 3 * n_clusters:
        # Do a clustering of the feature descriptors
        kmeans.fit(desc)
        clusters = kmeans.labels_
        centroids.append(kmeans.cluster_centers_)
        # train_df.loc[train_df['file'] == dog_file, 'centroid'] = [kmeans.cluster_centers_]
        # train_df.set_value(img_index, 'centroid', [1, 2, 3])

        img_histogram_values, img_histogram_count  = np.unique(clusters, return_counts=True)
        norm_histogram = np.divide(img_histogram_count, float(len(clusters)))
        histograms.append(norm_histogram)
        # train_df.loc[train_df['file'] == dog_file, 'histogram'] = norm_histogram