In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import time
from os import listdir
from os.path import isfile, join
from lxml import etree

from PIL import Image
from PIL import ImageOps as IO
import cv2

import scipy.io
from scipy.cluster.vq import vq, kmeans2, whiten

# remove this ?
# from sklearn.cluster import KMeans

In [2]:
# Define data directories
my_data = "../data"
annotations = my_data + '/Annotation'        # BO file to be merged with additional information
images = my_data + '/Images' # SAP file with order fulfilment dates
cropped = my_data + '/Cropped'

In [3]:
nb_breeds = 30

In [4]:
# method to read picture information from a matlab file
def load_img_from_mat(mat_file):

    mat_data = scipy.io.loadmat(mat_file)
    # load annotations for each picture
    dogs_annotation = []
    
    for img_annotation in mat_data['annotation_list']:
        dog_annotation = str(img_annotation[0][0])
        dogs_annotation.append(dog_annotation)

    # load picture filenames and path for each picture contained in mat file
    dogs_full_path = []
    dogs_folder = []
    dogs_file = []

    for img_file in mat_data['file_list']:
        dog_full_path = str(img_file[0][0])
        folder, filen =  dog_full_path.split('/')
        dogs_full_path.append(dog_full_path)
        dogs_folder.append(folder)
        dogs_file.append(filen)

    # Create dataframe with these informations
    file_df = pd.DataFrame({'file' : dogs_file,
                             'folder' : dogs_folder,
                             'full_path' : dogs_full_path,
                             'annotation' : dogs_annotation})
    
    # Load label information and create a complete dataframe
    labels_df = pd.DataFrame(mat_data['labels'], columns = ['label'])
    file_df = file_df.join(labels_df)
    
    return file_df

In [5]:
# method to read a file picture
def load_img(img_folder, img_file):
    picture_file = join(cropped, img_folder, img_file)
    img = Image.open(picture_file)
    img = IO.grayscale(img)
    img = IO.equalize(img)
    img_sift = np.array(img)
    return img_sift

In [6]:
def sift_extract(pict_df):
    start_time = time.time()
    lap_time = start_time
    
    descriptors_list = []
    descriptors_number = []
    img_index = 0

    sift = cv2.xfeatures2d.SIFT_create()

    for dog_folder, dog_file in np.array(pict_df[['folder', 'file']]):
        gray_img = load_img(dog_folder, dog_file)

        # Get SIFT keypoints and descriptors for each image
        keypoints, descriptors = sift.detectAndCompute(gray_img, None)

        # Add this image descriptors to the list
        descriptors_number.append(len(descriptors))
        for descriptor in descriptors:
            descriptors_list.append(descriptor)
        img_index += 1

        if (img_index % 500) == 0:
            print("{0} images processed in {1} seconds per batch of 500".format(img_index, time.time() - lap_time))
            lap_time = time.time()

    end_time = time.time()
    print("temps de traitement SIFT features: %15.2f secondes" % (end_time - start_time))
    
    return descriptors_number, descriptors_list

In [8]:
# Load train data and get SIFT descriptors
train_df = load_img_from_mat(my_data + '/train_list.mat')
sample_train_df = train_df[train_df['label'] < nb_breeds]
train_descriptors_number, train_descriptors_list = sift_extract(sample_train_df)

print 'Number of pictures of train data:', len(train_descriptors_number)

print 'Number of SIFT descriptors for train data:', len(train_descriptors_list)

Number of pictures of train data: 2900
Number of SIFT descriptors for train data: 2304719


In [9]:
from scipy.cluster.vq import vq, kmeans2, whiten
n_clusters = 50

# Do a clustering of the feature descriptors
centroids, clusters = kmeans2(train_descriptors_list, n_clusters)


In [10]:
len(clusters)

2304719

In [11]:
# Build matrix of histograms.
# Each line is a picture, each columns is a cluster ID
# Values are the number of times a picture belongs to a cluster, i.e. a visual word
desc_idx = 0
im_histos = np.zeros((len(train_descriptors_number), n_clusters), "float64")
for img_idx in range(len(train_descriptors_number)):
    img_desc_nb = train_descriptors_number[img_idx]
    for cluster_idx in range(desc_idx, img_desc_nb + desc_idx):
        c = clusters[cluster_idx]
        im_histos[img_idx][c] += 1.0/img_desc_nb
    desc_idx = img_desc_nb + desc_idx

In [12]:
img_desc_nb

1758

In [13]:
train_descriptors_number[1500]

509

In [14]:
sum(im_histos[500])

1.0000000000000004

In [15]:
im_histos[500]

array([0.01317957, 0.0214168 , 0.01317957, 0.01482702, 0.0247117 ,
       0.01812191, 0.02635914, 0.0247117 , 0.01976936, 0.01812191,
       0.0247117 , 0.00494234, 0.01976936, 0.02306425, 0.00823723,
       0.01976936, 0.0247117 , 0.0214168 , 0.0247117 , 0.01647446,
       0.01976936, 0.02306425, 0.01812191, 0.02965404, 0.0247117 ,
       0.01317957, 0.00494234, 0.01647446, 0.0247117 , 0.02635914,
       0.02965404, 0.0247117 , 0.02306425, 0.01153213, 0.02800659,
       0.01482702, 0.00988468, 0.02635914, 0.02635914, 0.01482702,
       0.0214168 , 0.02635914, 0.00988468, 0.01482702, 0.03130148,
       0.01812191, 0.01647446, 0.03294893, 0.00658979, 0.02965404])

In [16]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
# Do a multi label classification
X = im_histos
Y = sample_train_df['label']

MultiLabelClassif = OneVsRestClassifier(SVC(kernel='linear', probability=True))
start =  time.time()
MultiLabelClassif.fit(X, Y)
end =  time.time()
print("Fitting duration {0} seconds".format(end - start) )

Fitting duration 4.95799994469 seconds


In [17]:
np.shape(Y)

(2900L,)

In [19]:
# Load test data and get SIFT descriptors
test_df = load_img_from_mat(my_data + '/test_list.mat')
sample_test_df = test_df[test_df['label'] < nb_breeds]
test_descriptors_number, test_descriptors_list = sift_extract(sample_test_df)

print 'Number of pictures of test data:', len(test_descriptors_number)

print 'Number of SIFT descriptors for test data:', len(test_descriptors_list)

Number of pictures of test data: 2384
Number of SIFT descriptors for test data: 1927652


In [20]:
# Build matrix of histograms for test pictures
# Each line is a picture, each columns is a cluster ID
# Values are the number of times a picture belongs to a cluster, i.e. a visual word
desc_idx_test = 0
im_histos_test = np.zeros((len(test_descriptors_number), n_clusters), "float64")
for img_idx in range(len(test_descriptors_number)):
    img_desc_nb = test_descriptors_number[img_idx]
    my_descriptors = test_descriptors_list[desc_idx_test: img_desc_nb + desc_idx_test]
    cvect, dist = vq(my_descriptors, centroids)
    for c in cvect:
        im_histos_test[img_idx][c] += 1.0/img_desc_nb
    desc_idx_test = img_desc_nb + desc_idx_test

In [21]:
im_histos_test[0]

array([0.00880282, 0.01584507, 0.02464789, 0.01056338, 0.01408451,
       0.01584507, 0.02992958, 0.02288732, 0.02816901, 0.0193662 ,
       0.02112676, 0.01760563, 0.01760563, 0.00352113, 0.03697183,
       0.00704225, 0.01408451, 0.02992958, 0.00880282, 0.01760563,
       0.03169014, 0.02464789, 0.00528169, 0.01584507, 0.01760563,
       0.01056338, 0.01056338, 0.00880282, 0.00704225, 0.0334507 ,
       0.01408451, 0.01232394, 0.01056338, 0.05105634, 0.01056338,
       0.02112676, 0.0193662 , 0.0193662 , 0.02992958, 0.01408451,
       0.01408451, 0.0334507 , 0.03169014, 0.01584507, 0.01408451,
       0.03873239, 0.02816901, 0.02464789, 0.04753521, 0.0193662 ])

In [22]:
np.shape(im_histos_test)

(2384L, 50L)

In [23]:
dogs_labels = train_df[['folder', 'label']].drop_duplicates().set_index('label')
#dogs_labels[dogs_labels['label']==2]
dogs_labels.loc[2, ['folder']]


folder    n02085782-Japanese_spaniel
Name: 2, dtype: object

In [24]:
# Predic proba for each tag to be relevant for the question
test_predict_proba = MultiLabelClassif.predict_proba(im_histos_test)

In [25]:
proposed_type = ''
for i in test_predict_proba[10].argsort()[:-10-1:-1]:
    proposed_type = dogs_labels.loc[i, ['folder']]
    print(proposed_type, test_predict_proba[10][i])

(folder    n02089078-black-and-tan_coonhound
Name: 15, dtype: object, 0.04924153831950025)
(folder    n02085620-Chihuahua
Name: 1, dtype: object, 0.04632587539960555)
(folder    n02086240-Shih-Tzu
Name: 5, dtype: object, 0.04482853558014607)
(folder    n02088238-basset
Name: 11, dtype: object, 0.04439512584814764)
(folder    n02092339-Weimaraner
Name: 28, dtype: object, 0.044018910218815724)
(folder    n02089867-Walker_hound
Name: 16, dtype: object, 0.04395922975797525)
(folder    n02091831-Saluki
Name: 26, dtype: object, 0.041961646893488455)
(folder    n02091134-whippet
Name: 22, dtype: object, 0.04015838097472137)
(folder    n02086910-papillon
Name: 7, dtype: object, 0.03866206386887495)
(folder    n02088632-bluetick
Name: 14, dtype: object, 0.0384037253244496)


In [26]:
sample_test_df.loc[2]

annotation        n02085620-Chihuahua/n02085620_1765
file                              n02085620_1765.jpg
folder                           n02085620-Chihuahua
full_path     n02085620-Chihuahua/n02085620_1765.jpg
label                                              1
Name: 2, dtype: object

In [27]:
np.shape(test_predict_proba)

(2384L, 29L)

In [28]:
test_predict_proba[0]

array([0.03044937, 0.0585624 , 0.00302788, 0.03588743, 0.03117359,
       0.04368595, 0.02395971, 0.03875954, 0.03487004, 0.03716219,
       0.03209668, 0.03944756, 0.03255982, 0.03286392, 0.03988588,
       0.04358016, 0.03812752, 0.02908883, 0.0330573 , 0.02962716,
       0.03633774, 0.03707579, 0.03495838, 0.02407168, 0.03575281,
       0.03572604, 0.03424537, 0.04396085, 0.0299984 ])

In [29]:
test_predict_proba[10]

array([0.03839787, 0.04632588, 0.00945957, 0.0285218 , 0.03416962,
       0.04482854, 0.02137135, 0.03866206, 0.0372451 , 0.03233565,
       0.03521128, 0.04439513, 0.03138595, 0.03679752, 0.03840373,
       0.04924154, 0.04395923, 0.02848204, 0.03530734, 0.01903651,
       0.03684424, 0.02950556, 0.04015838, 0.02176741, 0.03432336,
       0.0301878 , 0.04196165, 0.02769501, 0.04401891])

In [30]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X, Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [31]:
rfc_predict = rfc.predict(im_histos_test)

In [32]:
len(rfc_predict)

2384

In [33]:
len(im_histos_test)

2384

In [34]:
rfc.score(im_histos_test,sample_test_df['label'])

0.09437919463087248