# Dogs vs Cats - features

[Kaggle](https://www.kaggle.com/c/dogs-vs-cats)

1 = dog

0 = cat

In [6]:
import cv2
from matplotlib import pyplot as plt
import sklearn
import numpy as np
import pickle as pk
from os import listdir

plt.style.use('ggplot')
%matplotlib inline

### Load training dataset

In [7]:
# Folder holding the training images. The trailing slash is required because
# image paths are built by plain string concatenation with this prefix.
train_folder = 'data/train/'

In [8]:
# Prefix every entry of the training folder with the folder path.
# os.listdir returns bare names in arbitrary order.
imgs_paths = [train_folder + filepath for filepath in listdir(train_folder)]

In [9]:
# select a subset: keep only the first 100 paths.
# NOTE(review): listdir order is arbitrary, so the dog/cat balance of this
# subset is not guaranteed -- check the counts printed below.
imgs_paths = imgs_paths[:100]

In [10]:
from os import listdir

def load_images(imgs_paths, gray=False):
    """Lazily load images from disk, one per path.

    Parameters
    ----------
    imgs_paths : iterable of str
        Paths of the image files to read.
    gray : bool, optional
        When true, images are read with ``cv2.IMREAD_GRAYSCALE``;
        otherwise ``cv2.imread`` is called with its default flags.

    Yields
    ------
    The value returned by ``cv2.imread`` for each path, in input order.
    Unreadable files are not skipped, so the output stays aligned with
    ``imgs_paths``.
    """
    for path in imgs_paths:
        # Decode each file exactly once, in the requested mode.
        if gray:
            yield cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        else:
            yield cv2.imread(path)

In [11]:
# Label per training path: 1 when the path contains "dog", otherwise 0.
labels = [int("dog" in path) for path in imgs_paths]

In [None]:
# Dogs are labelled 1, so counting the 1s gives the number of dog images.
print('Nr dogs:', labels.count(1))

In [None]:
# Cats are labelled 0, so counting the 0s gives the number of cat images.
print('Nr cats:', labels.count(0))

### Feature Extraction

In [None]:
# SIFT feature detector and descriptor extractor.
# OpenCV >= 4.4 exposes SIFT in the main module; older builds only provide it
# through the contrib xfeatures2d module, so fall back to that when needed.
if hasattr(cv2, 'SIFT_create'):
    sift = cv2.SIFT_create()
else:
    sift = cv2.xfeatures2d.SIFT_create()

In [None]:
# SURF feature detector and descriptor extractor
# (created from the contrib xfeatures2d module of OpenCV).
surf = cv2.xfeatures2d.SURF_create()

In [None]:
# FAST keypoint detector. It is used for detection only in this notebook;
# descriptors for its keypoints are computed separately with BRISK.
fast = cv2.FastFeatureDetector_create()

In [None]:
# BRISK instance, used in this notebook as the descriptor extractor
# for FAST keypoints (through br.compute).
br = cv2.BRISK_create()

In [None]:
# FLANN matcher
# FLANN algorithm ids: 0 selects the linear (brute-force) index and 1 the
# randomized KD-tree index, so the KD-tree constant must be 1.
FLANN_INDEX_KDTREE = 1
index_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=5)
search_params = dict(checks=50)   # or pass empty dictionary

# NOTE(review): KD-tree indexes expect floating-point descriptors (SIFT/SURF).
# Binary descriptors such as BRISK are normally matched with the LSH index --
# confirm before using this matcher with the FAST+BRISK pipeline.
flann = cv2.FlannBasedMatcher(index_params, search_params)

In [None]:
def get_descriptors(imgs, detector, extractor=None):
    """Lazily run a feature pipeline over a stream of images.

    Parameters
    ----------
    imgs : iterable
        Images to process.
    detector : callable
        Called as ``detector(img, None)``. Without an ``extractor`` its
        return value is yielded unchanged (e.g. ``sift.detectAndCompute``).
    extractor : callable, optional
        When given, called as ``extractor(img, kp)`` where ``kp`` is the
        value returned by ``detector`` (e.g. ``fast.detect`` followed by
        ``br.compute``).

    Yields
    ------
    One result per image, in input order: whatever the last callable in
    the pipeline returned. Nothing is unpacked or converted here.
    """
    for img in imgs:
        # Identity check: "== None" can be overridden by an object's __eq__.
        if extractor is None:
            yield detector(img, None)
        else:
            keypoints = detector(img, None)
            yield extractor(img, keypoints)

In [None]:
# SIFT pipeline over the grayscale training images. Both calls return
# generators, so nothing is read or computed until imgs_sift_des is iterated.
imgs = load_images(imgs_paths, gray=True)
imgs_sift_des = get_descriptors(imgs, sift.detectAndCompute)

In [None]:
# SURF pipeline. A fresh image generator is created because generators are
# single-use; evaluation is deferred until imgs_surf_des is iterated.
imgs = load_images(imgs_paths, gray=True)
imgs_surf_des = get_descriptors(imgs, surf.detectAndCompute)

In [None]:
# FAST + BRISK pipeline: FAST detects the keypoints and BRISK computes their
# descriptors. Evaluation is deferred until imgs_fast_des is iterated.
imgs = load_images(imgs_paths, gray=True)
imgs_fast_des = get_descriptors(imgs, detector=fast.detect, extractor=br.compute)

### Output tools

In [None]:
# Folder holding the test images. The trailing slash is required because
# image paths are built by plain string concatenation with this prefix.
test_folder = 'data/test1/'

In [None]:
# Prefix every entry of the test folder with the folder path.
# os.listdir returns bare names in arbitrary order.
imgs_paths_test = [test_folder + filepath for filepath in listdir(test_folder)]

### Classifiers

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
# function(y_true, y_pred)

In [None]:
# sklearn.cross_validation was removed in scikit-learn 0.20; the splitters
# now live in sklearn.model_selection and take the data in .split().
from sklearn.model_selection import KFold
# kf = KFold(n_splits=2)
# kf.split(X) yields (train_indexes, test_indexes) pairs

In [None]:
# sklearn.cross_validation was removed in scikit-learn 0.20; ShuffleSplit now
# lives in sklearn.model_selection, n_iter became n_splits, and the sample
# count is no longer a constructor argument (pass the data to .split()).
from sklearn.model_selection import ShuffleSplit
# splitter = ShuffleSplit(n_splits=1, test_size=.3, random_state=0)
# for train, test in splitter.split(X): ...

#### Nearest Neighbors

In [None]:
from queue import PriorityQueue

def _count_good_matches(query_des, train_des, ratio=0.7):
    """Count query descriptors whose best match passes Lowe's ratio test."""
    # No descriptors on either side means there is nothing to match.
    if query_des is None or train_des is None:
        return 0
    if len(query_des) == 0 or len(train_des) == 0:
        return 0

    nr_matches = 0
    for neighbours in flann.knnMatch(query_des, train_des, k=2):
        # The ratio test needs both the best and the second-best match.
        if len(neighbours) < 2:
            continue
        best, second = neighbours
        if best.distance < ratio * second.distance:
            nr_matches += 1
    return nr_matches


def knn(train_paths, test_paths, descriptor, extractor=None, k=5):
    """Classify test images as dog (1) or cat (0) by k-nearest-neighbour voting.

    Similarity between two images is the number of descriptor matches that
    pass Lowe's ratio test (threshold 0.7) using the module-level ``flann``
    matcher. The ``k`` most similar training images vote with their labels.

    Parameters
    ----------
    train_paths : iterable of str
        Paths of the training images; a path containing "dog" is labelled 1,
        any other path 0.
    test_paths : iterable of str
        Paths of the images to classify.
    descriptor : callable
        Passed to ``get_descriptors`` as the detector, e.g.
        ``sift.detectAndCompute``.
    extractor : callable, optional
        Passed to ``get_descriptors`` as the extractor, e.g. ``br.compute``.
    k : int, optional
        Number of nearest training images that vote.

    Returns
    -------
    list of int
        One predicted label per test path, in order. A sample is predicted
        as dog only when dogs hold a strict majority of the votes cast.

    Notes
    -----
    The feature pipeline is expected to yield ``(keypoints, descriptors)``
    pairs, which is what OpenCV's ``detectAndCompute`` and ``compute``
    return; only the descriptors are used.
    """
    # labels
    labels_train = [1 if "dog" in path else 0 for path in train_paths]

    # array to store predictions
    pred_labels = []

    imgs_test = load_images(test_paths, gray=True)
    for test_sample in get_descriptors(imgs_test, descriptor, extractor):
        test_des = test_sample[1]

        # load_images/get_descriptors return single-use generators, so the
        # training stream is rebuilt (and recomputed) for every test image
        # instead of keeping all training descriptors in memory.
        imgs_train = load_images(train_paths, gray=True)
        imgs_train_des = get_descriptors(imgs_train, descriptor, extractor)

        # Negated match count so that an ascending sort puts the most
        # similar training images first.
        votes = [
            (-_count_good_matches(test_des, train_sample[1]), train_label)
            for train_sample, train_label in zip(imgs_train_des, labels_train)
        ]

        # There may be fewer than k training images; vote with what exists.
        nearest = sorted(votes)[:k]
        dog_votes = sum(label for _, label in nearest)

        pred_labels.append(1 if dog_votes > len(nearest) / 2 else 0)

    return pred_labels

In [None]:
#imgs_paths_np = np.array(imgs_paths)

#for train, test in ShuffleSplit(100, n_iter=2, test_size=.3, random_state=0):
#    y_pred = knn(imgs_paths_np[train], imgs_paths_np[test], sift.detectAndCompute)
#    
#    y_true = [1 if "dog" in path else 0 for path in imgs_paths_np[test]]
#    
#    score = accuracy_score(y_true, y_pred)
#    conf_mat = confusion_matrix(y_true, y_pred)
#    
#    print('Accuracy:', score)
#    print('Confusion matrix:', conf_mat, sep='\n')