In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import time
from os import listdir
from os.path import isfile, join
from lxml import etree

from PIL import Image
from PIL import ImageOps as IO
import cv2

import scipy.io
from scipy.cluster.vq import vq, kmeans2, whiten

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier


In [2]:
# Define data directories
my_data = "../data"
annotations = my_data + '/Annotation'        # BO file to be merged with additional information
images = my_data + '/Images' # SAP file with order fulfilment dates
cropped = my_data + '/Cropped'

In [3]:
# Parameters

# Number of dogs breeds to include
nb_breeds = 3

# Number of clusters for the kMeans clustering of image descriptors
n_clusters = 1000


In [4]:
# method to read picture information from a matlab file
def load_img_from_mat(mat_file):

    mat_data = scipy.io.loadmat(mat_file)
    # load annotations for each picture
    dogs_annotation = []
    
    for img_annotation in mat_data['annotation_list']:
        dog_annotation = str(img_annotation[0][0])
        dogs_annotation.append(dog_annotation)

    # load picture filenames and path for each picture contained in mat file
    dogs_full_path = []
    dogs_folder = []
    dogs_file = []

    for img_file in mat_data['file_list']:
        dog_full_path = str(img_file[0][0])
        folder, filen =  dog_full_path.split('/')
        dogs_full_path.append(dog_full_path)
        dogs_folder.append(folder)
        dogs_file.append(filen)

    # Create dataframe with these informations
    file_df = pd.DataFrame({'file' : dogs_file,
                             'folder' : dogs_folder,
                             'full_path' : dogs_full_path,
                             'annotation' : dogs_annotation})
    
    # Load label information and create a complete dataframe
    labels_df = pd.DataFrame(mat_data['labels'], columns = ['label'])
    file_df = file_df.join(labels_df)
    
    return file_df

In [5]:
# method to read a file picture
def load_img(img_folder, img_file):
    picture_file = join(cropped, img_folder, img_file)
    img = Image.open(picture_file)
    img = IO.grayscale(img)
    img = IO.equalize(img)
    img_sift = np.array(img)
    return img_sift

In [6]:
# Extract SIFT features from a dataframe of pictures
# Passed dataframe must have 'folder' and 'file columns'
def sift_extract(pict_df):
    start_time = time.time()
    lap_time = start_time
    
    # Variable to hold all descriptors
    descriptors_list = []
    
    # Variable to hold for each picture the number of descriptors
    descriptors_number = []
    img_index = 0

    sift = cv2.xfeatures2d.SIFT_create()

    for dog_folder, dog_file in np.array(pict_df[['folder', 'file']]):
        gray_img = load_img(dog_folder, dog_file)

        # Get SIFT keypoints and descriptors for each image
        keypoints, descriptors = sift.detectAndCompute(gray_img, None)

        # Add this image descriptors to the list
        descriptors_number.append(len(descriptors))
        for descriptor in descriptors:
            descriptors_list.append(descriptor)
        img_index += 1

        if (img_index % 500) == 0:
            print("{0} images processed in {1} seconds per batch of 500".format(img_index, time.time() - lap_time))
            lap_time = time.time()

    end_time = time.time()
    print("temps de traitement SIFT features: %15.2f secondes" % (end_time - start_time))
    
    return descriptors_number, descriptors_list

In [7]:
def build_histograms(descriptors_number, clusters):
    # Build matrix of histograms.
    # Each line is a picture, each columns is a cluster ID
    # Values are the number of times a picture belongs to a cluster, 
    # i.e. a visual word, divided by the number of pictures for this image
    
    desc_idx = 0
    im_histos = np.zeros((len(descriptors_number), n_clusters), "float64")
    
    for img_idx in range(len(descriptors_number)):
        img_desc_nb = descriptors_number[img_idx]
        for cluster_idx in range(desc_idx, img_desc_nb + desc_idx):
            c = clusters[cluster_idx]
            im_histos[img_idx][c] += 1.0/img_desc_nb
        desc_idx = img_desc_nb + desc_idx
    return im_histos

In [8]:
def build_histograms_2(descriptors_number, descriptors_list, centroids):
    # Build matrix of histograms for test pictures
    # Each line is a picture, each columns is a cluster ID
    # Values are the number of times a picture belongs to a cluster, i.e. a visual word
    
    desc_idx = 0
    im_histos = np.zeros((len(descriptors_number), n_clusters), "float64")
    
    for img_idx in range(len(descriptors_number)):
        # Get array of descriptors for this image
        img_desc_nb = descriptors_number[img_idx]
        my_descriptors = descriptors_list[desc_idx: img_desc_nb + desc_idx]
        
        # Find the nearest centroid for each descriptor
        cvect, dist = vq(my_descriptors, centroids)
        for c in cvect:
            im_histos[img_idx][c] += 1.0/img_desc_nb
        desc_idx = img_desc_nb + desc_idx
    return im_histos

In [9]:
# Load train data
train_df = load_img_from_mat(my_data + '/train_list.mat')
sample_train_df = train_df[train_df['label'] < nb_breeds]

print 'Number of pictures of train data:', len(sample_train_df)

Number of pictures of train data: 200


In [10]:
# Extract SIFT Features from train Data and create histograms
train_descriptors_number, train_descriptors_list = sift_extract(sample_train_df)

print 'Number of SIFT descriptors for train data:', len(train_descriptors_list)
print("Maximum number of descriptors for one picture {0}".format(np.max(train_descriptors_number)))
print("Minimum number of descriptors for one picture {0}".format(np.min(train_descriptors_number)))

temps de traitement SIFT features:           12.01 secondes
Number of SIFT descriptors for train data: 129327
Maximum number of descriptors for one picture 4191
Minimum number of descriptors for one picture 28


In [11]:
# Load test data and get SIFT descriptors
test_df = load_img_from_mat(my_data + '/test_list.mat')
sample_test_df = test_df[test_df['label'] < nb_breeds]

print 'Number of pictures of test data:', len(sample_test_df)

test_descriptors_number, test_descriptors_list = sift_extract(sample_test_df)

print 'Number of SIFT descriptors for test data:', len(test_descriptors_list)

Number of pictures of test data: 137
temps de traitement SIFT features:            9.61 secondes
Number of SIFT descriptors for test data: 105352


In [12]:
def eval_score(n_clusters):
    # Do a clustering of the feature descriptors
    train_centroids, train_clusters = kmeans2(whiten(train_descriptors_list), n_clusters)

    train_histos = build_histograms_2(train_descriptors_number, train_descriptors_list, train_centroids)

    test_histos = build_histograms_2(test_descriptors_number, test_descriptors_list, train_centroids)

    # Define X array and Y vector for the supervised classification
    X = train_histos
    Y = sample_train_df['label']

    # Fit a Random Forest
    rdc = RidgeClassifier()
    rdc.fit(X, Y)

    score = rdc.score(test_histos,sample_test_df['label'])
    
    print("Score for {0} clusters : {1:.2%}".format(n_clusters, score))

In [13]:
for cluster_nb in range (nb_breeds, nb_breeds*10, nb_breeds):
    eval_score(cluster_nb)

Score for 3 clusters : 54.01%
Score for 6 clusters : 57.66%
Score for 9 clusters : 62.77%
Score for 12 clusters : 67.15%
Score for 15 clusters : 64.96%
Score for 18 clusters : 67.88%
Score for 21 clusters : 66.42%
Score for 24 clusters : 67.15%
Score for 27 clusters : 72.99%


In [14]:
train_histos[50]

NameError: name 'train_histos' is not defined