In [1]:
import cv2
import os
import matplotlib.pyplot as plt 
import numpy as np
from sklearn.svm import LinearSVC, SVC
from sklearn.externals import joblib
from scipy.cluster.vq import *
from sklearn.preprocessing import StandardScaler

# image function

In [2]:
def imlist(path):
    """
    The function imlist returns all the names of the files in 
    the directory path supplied as argument to the function.
    """
    return [os.path.join(path, f) for f in os.listdir(path)]

# bag-of-word (BOW) function

In [11]:
# Create feature extraction and keypoint detector objects
def BOW(image_paths, sift):
    des_list = []     # save descriptor list
    for image_path in image_paths:
        im = cv2.imread(image_path)
        gray= cv2.cvtColor(im,cv2.COLOR_BGR2GRAY)
        (kps, des) = sift.detectAndCompute(gray, None)
        des_list.append((image_path, des))  
    return des_list

# histogram of BOW function

In [4]:
def hist_bow(des_list, image_paths):
    # Stack all the descriptors vertically in a numpy array
    descriptors = des_list[0][1]
    for image_path, descriptor in des_list[1:]:
        descriptors = np.vstack((descriptors, descriptor))  
    
    # Perform k-means clustering
    k = 15
    voc, variance = kmeans(descriptors, k, 1) 
    # Calculate the histogram of features
    im_features = np.zeros((len(image_paths), k), "float32")
    for i in range(len(image_paths)):
        words, distance = vq(des_list[i][1],voc)
        for w in words:
            im_features[i][w] += 1
    return im_features

# Load train data and test data

In [5]:
# Get the path of the training set
train_path = r'dataset\data\traindata'
train_names = os.listdir(train_path)

# Get the path of the test set
test_path = r'dataset\data\testdata'
test_names = os.listdir(test_path)

In [6]:
# Get all the path to the images and save them in a list; save the image label to a list
train_image_paths = []
train_image_classes = []
class_id = 0
for train_name in train_names:
    dir = os.path.join(train_path, train_name)
    class_path = imlist(dir)
    train_image_paths+=class_path
    train_image_classes+=[class_id]*len(class_path)
    class_id+=1

# Get the path of the testing image(s) and store them in a list
test_image_paths = []
test_image_classes = []
class_id = 0
for test_name in test_names:
    test_dir = os.path.join(test_path, test_name)
    test_class_path = imlist(test_dir)
    test_image_paths += test_class_path
    test_image_classes += [class_id]*len(test_class_path)
    class_id+=1

In [7]:
# initiate the sift algorithm
sift = cv2.xfeatures2d.SURF_create()

In [8]:
# train data
train_descriptor = BOW(train_image_paths, sift)
train_im_features = hist_bow(train_descriptor, train_image_paths)

# Scaling the words
stdSlr = StandardScaler().fit(train_im_features)
train_im_features = stdSlr.transform(train_im_features)

In [None]:
len(test_descriptor[0][1])

In [9]:
# apply model to test data 
test_descriptor = BOW(test_image_paths, sift)
test_im_features = hist_bow(test_descriptor, test_image_paths)

# Scaling the words
stdSlr = StandardScaler().fit(test_im_features)
test_im_features = stdSlr.transform(test_im_features)

In [10]:
# Train the Linear SVM
clf = SVC(C=50, gamma=0.001)
clf.fit(train_im_features, np.array(train_image_classes))

# Perform the predictions
predictions =  clf.predict(test_im_features)
clf.score(test_im_features, test_image_classes)

0.18

In [36]:
# play with parameters

from sklearn.grid_search import GridSearchCV
param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}

grid = GridSearchCV(clf, param_grid)

%time grid.fit(test_im_features, test_image_classes)
print(grid.best_params_)

Wall time: 865 ms
{'C': 50, 'gamma': 0.001}


In [60]:
len(test_image_classes)

400

In [None]:
clf.score(test_im_features, test_image_classes)

In [39]:
predictions

array([1, 7, 1, 6, 0, 6, 6, 3, 0, 3, 7, 0, 3, 3, 7, 3, 6, 7, 6, 6, 6, 6,
       7, 1, 6, 6, 7, 6, 0, 7, 6, 3, 6, 6, 4, 0, 6, 3, 3, 7, 6, 0, 1, 3,
       4, 1, 6, 2, 6, 6, 1, 1, 6, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 6,
       1, 1, 1, 1, 6, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 6, 1, 3, 6, 7, 1, 3,
       5, 4, 1, 6, 3, 4, 1, 1, 1, 1, 6, 4, 4, 1, 6, 7, 1, 6, 1, 6, 1, 1,
       6, 6, 1, 6, 1, 1, 7, 3, 0, 3, 7, 1, 6, 3, 1, 7, 6, 6, 3, 3, 6, 4,
       3, 0, 4, 4, 4, 3, 4, 4, 1, 4, 3, 3, 5, 3, 6, 5, 3, 7, 3, 4, 4, 3,
       4, 5, 4, 4, 4, 3, 3, 4, 4, 3, 4, 4, 3, 4, 3, 0, 7, 5, 3, 4, 3, 5,
       5, 4, 3, 4, 6, 6, 5, 6, 6, 2, 3, 4, 3, 0, 6, 6, 3, 6, 6, 3, 3, 4,
       3, 3, 4, 2, 4, 3, 1, 5, 6, 0, 6, 3, 4, 3, 0, 6, 6, 0, 5, 0, 0, 6,
       0, 6, 3, 6, 6, 6, 3, 4, 4, 1, 6, 4, 6, 1, 1, 6, 1, 7, 6, 6, 4, 6,
       1, 1, 0, 4, 4, 1, 5, 5, 1, 6, 1, 1, 1, 1, 6, 3, 4, 6, 3, 6, 6, 5,
       5, 6, 6, 6, 6, 0, 6, 3, 3, 6, 0, 4, 4, 1, 1,

# KNN algorithm

In [44]:
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier()
clf.fit(train_im_features, np.array(train_image_classes))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [45]:
predictions =  clf.predict(test_im_features)

In [46]:
clf.score(test_im_features, test_image_classes)

0.25

In [25]:
from sklearn.metrics import confusion_matrix

In [43]:
confusion_matrix(test_image_classes, predictions)

array([[11,  4,  9,  2,  3,  8, 12,  1],
       [ 5, 39,  1,  0,  1,  0,  4,  0],
       [ 1,  5,  2,  9,  4, 18, 10,  1],
       [ 8,  4,  6, 14,  1, 11,  5,  1],
       [ 9,  2,  1, 19,  1,  0, 18,  0],
       [ 1, 19,  2,  5,  3, 10,  5,  5],
       [ 4,  6,  3,  5,  1,  8, 22,  1],
       [15,  7,  4,  9,  2,  3,  5,  5]])