# Dogs vs Cats - features

[Kaggle](https://www.kaggle.com/c/dogs-vs-cats)

1 = dog

0 = cat

Notes for report:
    analyse how the variability of nr_features affects
    try different detectors

In [1]:
import cv2
from matplotlib import pyplot as plt
import sklearn
import numpy as np
import pickle as pk
from os import listdir

plt.style.use('ggplot')
%matplotlib inline

In [2]:
NR_WORDS = 1000

In [3]:
from os import listdir

def load_images(imgs_paths, gray=False):
    for path in imgs_paths:
        img = cv2.imread(path)
        
        if gray:
            yield cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        else:
            yield cv2.imread(path)

## Features Extraction

#### Features detectors, descriptors and matcher

In [7]:
# SIFT features detector and extractor
sift = cv2.xfeatures2d.SIFT_create()

In [8]:
# FLANN matcher
FLANN_INDEX_KDTREE = 0
index_params = dict(algorithm = FLANN_INDEX_KDTREE, trees = 5)
search_params = dict(checks=50)   # or pass empty dictionary

flann = cv2.FlannBasedMatcher(index_params,search_params)

#### Bag of Words

In [9]:
def train_bow(detector, matcher, extractor=None):
    if extractor == None:
        extractor = detector
    
    bow_extractor = cv2.BOWImgDescriptorExtractor(extractor, matcher)
    
    vocabulary = pk.load(open('vocabulary.p', 'rb'))
    
    bow_extractor.setVocabulary(vocabulary)
    
    return bow_extractor

In [10]:
detector = sift
extractor = sift

In [11]:
sift_bow_extractor = train_bow(detector, flann, extractor=extractor)

In [12]:
train_folder = 'data/train/'

In [13]:
imgs_paths = [train_folder + filepath for filepath in listdir(train_folder)]

In [54]:
features = np.empty((0, NR_WORDS))
imgs = load_images(imgs_paths, gray=True)

for img in imgs:
    kp = detector.detect(img)
    
    img_features = sift_bow_extractor.compute(img, kp)
    
    features = np.concatenate((features, img_features), axis=0)

In [55]:
labels = [1 if "dog" in path else 0 for path in imgs_paths]

In [56]:
labels = np.asarray(labels)

In [57]:
target_names = ['dog', 'cat']

In [64]:
pk.dump((features, labels), open('features_labels_diogo.p', 'wb'))

## Prediction

In [65]:
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold
from sklearn.base import clone as skl_clone
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

def k_fold_model_select(features, labels, raw_classifiers, n_folds=10, weigh_samples_fn=None): 
    # weigh_samples_fn is explained below
    # assumes that the raw_classifier output is in probability
    
    # split into training and test data
    X_train, X_test, y_train, y_test = train_test_split(features, 
                                                        labels,
                                                        test_size=0.3,
                                                        stratify=labels,
                                                        random_state=0)
    
    
    # use stratified k-fold cross validation to select the model
    skf = StratifiedKFold(y_train, n_folds=n_folds)

    best_classifier = None
    best_score = float('-inf')

    for train_index, validation_index in skf:
        for raw_classifier in raw_classifiers:
            classifier = skl_clone(raw_classifier)
            classifier = classifier.fit(X_train[train_index], y_train[train_index])

            if weigh_samples_fn != None:
                y_pred = classifier.predict(X_train[validation_index])
                sample_weight = weigh_samples_fn(y_train[validation_index], y_pred)
            else:
                sample_weight = None

            score = accuracy_score(classifier.predict(X_train[validation_index]), y_train[validation_index],
                                     sample_weight=sample_weight)

            if score > best_score:
                best_classifier = classifier
                best_score = score
    
    # compute the confusion matrix
    y_pred = best_classifier.predict(X_test)
    conf_mat = confusion_matrix(y_test, y_pred)
    
    # now compute the score for the test data of the best found classifier
    if weigh_samples_fn != None:
        sample_weight = weigh_samples_fn(y_test, y_pred)
    else:
        sample_weight = None
    test_score = accuracy_score(best_classifier.predict(X_test), y_test, sample_weight=sample_weight)
    
    # obtain the classification report
    report = classification_report(y_test, y_pred, target_names=['cat', 'dog'], sample_weight=sample_weight)
    
    # obtain ROC curve
    y_test_bin = label_binarize(y_test, classes=[0, 1])
    y_prob = best_classifier.predict_proba(X_test)
    
    #fpr, tpr, _ = roc_curve(y_test_bin[:, 1], y_prob[:, 1])
    fpr, tpr, _ = roc_curve(y_test_bin, y_prob[:, 1])
    roc_info = (best_classifier.__class__.__name__, (fpr, tpr))
    
    return (test_score, report, conf_mat, roc_info, best_classifier)

#### Nearest Neighbors

In [59]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(weights='distance', algorithm='auto')
knn_score, knn_rep, knn_cm, knn_roc, knn_clf = k_fold_model_select(features, labels, [knn])

print("Nearest Neighbors")
print("Score:", knn_score)
print("Confusion matrix:", knn_cm, sep='\n')
print("Classification report:", knn_rep, sep='\n')

Nearest Neighbors
Score: 0.604
Confusion matrix:
[[2062 1688]
 [1282 2468]]
Classification report:
             precision    recall  f1-score   support

        cat       0.62      0.55      0.58      3750
        dog       0.59      0.66      0.62      3750

avg / total       0.61      0.60      0.60      7500



In [66]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb_score, nb_rep, nb_cm, nb_roc, nb_clf = k_fold_model_select(features, labels, [nb])

print("Gaussian Naive Bayes")
print("Score:", nb_score)
print("Confusion matrix:", nb_cm, sep='\n')
print("Classification report:", nb_rep, sep='\n')

Gaussian Naive Bayes
Score: 0.575066666667
Confusion matrix:
[[1914 1836]
 [1351 2399]]
Classification report:
             precision    recall  f1-score   support

        cat       0.59      0.51      0.55      3750
        dog       0.57      0.64      0.60      3750

avg / total       0.58      0.58      0.57      7500



In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

svc = SVC(kernel='linear', random_state=0, probability=True, max_iter=100)

pipeline = Pipeline([
        ('min/max scaler', MinMaxScaler(feature_range=(0.0, 1.0))),
        ('svc linear', svc)])

svc_score, svc_rep, svc_cm, svc_roc, svc_clf = \
    k_fold_model_select(features, labels, [pipeline])

print("SVM")
print("Score:", svc_score)
print("Confusion matrix:", svc_cm, sep='\n')
print("Classification report:", svc_rep, sep='\n')

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(min_samples_split=15, random_state=0, min_samples_leaf=5)

ab = AdaBoostClassifier(base_estimator=dt, random_state=0)
ab_score, ab_rep, ab_cm, ab_roc, ab_clf = k_fold_model_select( features, labels, [ab])

print("AdaBoos")
print("Score:", ab_score)
print("Confusion matrix:", ab_cm, sep='\n')
print("Classification report:", ab_rep, sep='\n')

In [68]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=0)

rf_score, rf_rep, rf_cm, rf_roc, rf_clf = k_fold_model_select(features, labels, [rf])

print("Random Forest")
print("Score:", rf_score)
print("Confusion matrix:", rf_cm, sep='\n')
print("Classification report:", rf_rep, sep='\n')

Random Forest
Score: 0.6636
Confusion matrix:
[[2686 1064]
 [1459 2291]]
Classification report:
             precision    recall  f1-score   support

        cat       0.65      0.72      0.68      3750
        dog       0.68      0.61      0.64      3750

avg / total       0.67      0.66      0.66      7500



In [None]:
#classifiers = [knn_clf, nb_clf, svc_clf, ab_clf, rf_clf]
classifiers = [knn_clf, nb_clf, rf_clf]

In [None]:
best_score, best_rep, best_cm, best_roc, best_clf = k_fold_model_select(features, labels, classifiers)

print("Classifier:", best_clf.__class__.__name__)
print("Score:", best_score)
print("Confusion matrix:", best_cm, sep='\n')
print("Classification report:", best_rep, sep='\n')

In [70]:
import pickle as pk

best_clf = rf_clf

best_clf = best_clf.fit(features, labels)

pk.dump(best_clf, open('best_clf.p', 'wb'))

In [14]:
best_clf = pk.load(open('best_clf.p', 'rb'))

In [72]:
def plot_roc_curves(roc_curves):
    for name, (fpr, tpr) in roc_curves:
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='ROC for {} (area = {:0.2f})'.format(name, roc_auc))
        
    plt.legend(bbox_to_anchor=(2.1, 1.05))
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

In [73]:
roc_curves = [knn_roc, nb_roc, svc_roc, ab_roc, rf_roc]

plot_roc_curves(roc_curves)

NameError: name 'svc_roc' is not defined

## Testing

In [4]:
def save_labels_csv(labels):
    indexed_labels = np.concatenate((np.asmatrix(range(1, len(labels) + 1)).transpose(), np.asmatrix(labels)), axis=1)
    
    np.savetxt('result.csv', 
               indexed_labels,
               fmt='%d',
               delimiter=',',
               header='id,label',
               comments='')

In [5]:
test_folder = 'data/test1/'

In [6]:
test_imgs_paths = [test_folder + filepath for filepath in listdir(test_folder)]

In [83]:
vocabulary
#test_imgs_paths = test_imgs_paths[:10]

In [18]:
pred = []

#test_imgs_paths = test_imgs_paths[6384:]

test_imgs = load_images(test_imgs_paths, gray=True)

for i, img in enumerate(test_imgs):
    
    if( i is not 6383 ):
        print(i)
        kp = detector.detect(img)
        img_features = sift_bow_extractor.compute(img, kp)

        p = best_clf.predict(img_features)

        pred.append(p)
    else:
        pred.append(0)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108


KeyboardInterrupt: 

In [101]:
save_labels_csv(pred)

In [90]:
print(len(pred))

12499


In [94]:
pred2 = pred

In [96]:
pred2.insert(6383, 0)

In [100]:
pred[6383] = np.array([0])