### Some functions for preprocessing images

In [116]:
from imutils import paths
import numpy as np
import os
import imutils
import cv2
import random
from math import sqrt
from operator import itemgetter

In [117]:
def img_to_feature_vector(image, size=(320, 240)):
    """Resize ``image`` to ``size`` and return its raw values as a flat vector.

    Args:
        image: Image array accepted by ``cv2.resize``.
        size: Target size passed straight to ``cv2.resize``.

    Returns:
        The resized image flattened to one dimension.
    """
    resized = cv2.resize(image, size)
    return resized.flatten()

In [118]:
def img_color_histogram(image, bins=(8, 8, 8)):
    """Return a flattened, normalized HSV colour histogram of ``image``.

    Args:
        image: BGR image array (it is converted with ``cv2.COLOR_BGR2HSV``).
        bins: Number of histogram bins for each of the three HSV channels.

    Returns:
        The normalized 3-channel histogram flattened to one dimension.
    """
    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    channels = [0, 1, 2]
    value_ranges = [0, 180, 0, 256, 0, 256]
    hist = cv2.calcHist([hsv_image], channels, None, bins, value_ranges)

    # The normalize call differs by OpenCV generation: on OpenCV 2.x the
    # result is returned, otherwise the histogram is normalized in place.
    if not imutils.is_cv2():
        cv2.normalize(hist, hist)
    else:
        hist = cv2.normalize(hist)

    return hist.flatten()

In [119]:
def load_images(path_to_data):
    """List the image files found under ``path_to_data``.

    Uses ``imutils.paths.list_images`` to find the files and prints how many
    were found.

    Args:
        path_to_data: Directory to look for images in.

    Returns:
        A list with the path of every image found.
    """
    found_paths = [*paths.list_images(path_to_data)]
    print(len(found_paths), " images are collected")
    return found_paths

In [120]:
def img_to_vectors(imagePaths):
    """Load every image and build its feature vectors and label.

    For each path the image is read with ``cv2.imread`` and described twice:
    as a flattened resized pixel vector (``img_to_feature_vector``) and as a
    flattened colour histogram (``img_color_histogram``). The label is the
    name of the directory that directly contains the image file.

    Args:
        imagePaths: Sequence of image file paths.

    Returns:
        A tuple ``(pix_images, hist_images, labels)`` of three lists that are
        index-aligned with ``imagePaths``.
    """
    pix_images = []
    hist_images = []
    labels = []

    for i, imagePath in enumerate(imagePaths):
        image = cv2.imread(imagePath)
        # Parent directory name is the class label.
        label = imagePath.split(os.path.sep)[-2].split("/")[0]

        pix_images.append(img_to_feature_vector(image))
        hist_images.append(img_color_histogram(image))
        labels.append(label)

        # Show an update every 1,000 images. This check has to live inside
        # the loop: placed after it, it ran at most once and raised
        # NameError on `i` when imagePaths was empty.
        if i > 0 and i % 1000 == 0:
            print(i, "/", len(imagePaths), "of images are proccessed")

    return (pix_images, hist_images, labels)


### Functions for test and train splitting

In [121]:
def join_set_labels(dataset, labels):
    """Append each label to the end of its row, modifying ``dataset`` in place.

    Args:
        dataset: List of rows (each a list); row ``i`` receives ``labels[i]``.
        labels: Labels, indexable by row position.

    Returns:
        The same ``dataset`` object, now with one extra trailing element per
        row.
    """
    for position, row in enumerate(dataset):
        row.append(labels[position])
    return dataset

In [122]:
def train_test_split(dataset, labels, test_size = 0.25):
    """Randomly split ``dataset`` and ``labels`` into training and test parts.

    The samples are visited in a shuffled order and each one is independently
    sent to the test part when a fresh random draw is not greater than
    ``test_size``; otherwise it goes to the training part. The resulting
    sizes are therefore only approximately proportional to ``test_size``.

    Args:
        dataset: Indexable collection of samples.
        labels: Labels, index-aligned with ``dataset``.
        test_size: Per-sample probability of landing in the test part.

    Returns:
        A tuple ``(training_set, training_labels, test_set, test_labels)``.
    """
    training_set, training_labels = [], []
    test_set, test_labels = [], []

    random.seed()

    order = np.arange(len(dataset))
    np.random.shuffle(order)

    for idx in order:
        if random.random() > test_size:
            samples, sample_labels = training_set, training_labels
        else:
            samples, sample_labels = test_set, test_labels
        samples.append(dataset[idx])
        sample_labels.append(labels[idx])

    return (training_set, training_labels, test_set, test_labels)

### Accuracy counting

In [123]:
def get_accuracy(predicted_classes, testLabels):
    """Print and return the fraction of predictions that match the labels.

    Args:
        predicted_classes: Predicted labels, indexable by test position.
        testLabels: True labels; its length defines how many positions are
            compared.

    Returns:
        The accuracy as a float in ``[0, 1]``. The original version only
        printed the value, so callers could not reuse it.

    Raises:
        ZeroDivisionError: If ``testLabels`` is empty.
    """
    correct = sum(
        1
        for position, expected in enumerate(testLabels)
        if expected == predicted_classes[position]
    )

    accuracy = correct / len(testLabels)
    print("Accuracy: ", accuracy)
    return accuracy

### KNN Implementation

In [124]:
def get_labels(training_set):
    """Return the distinct labels stored as the last element of each row.

    The order of the returned list is whatever order the underlying set
    yields; it is not sorted.
    """
    unique_labels = set()
    for row in training_set:
        unique_labels.add(row[-1])
    return list(unique_labels)

def find_neighbors(distances, k):
    """Return the first ``k`` entries of ``distances``.

    No sorting happens here; the entries are taken in the order given.
    """
    nearest = distances[:k]
    return nearest

def predict_label(neighbors, classes):
    """Pick the class that collects the most votes among ``neighbors``.

    Each neighbor votes with its second-to-last element, which is compared
    against every entry of ``classes``.

    Args:
        neighbors: Rows whose second-to-last element is the row's label.
        classes: Candidate labels.

    Returns:
        A tuple ``(index, votes)``: the position in ``classes`` of the
        winning label and its vote count. On a tie the earliest position
        wins.
    """
    neighbor_labels = [neighbor[-2] for neighbor in neighbors]
    votes = [
        sum(1 for label in neighbor_labels if label == candidate)
        for candidate in classes
    ]
    return max(enumerate(votes), key=itemgetter(1))

def dataset_to_float(data_set, mode):
    """Convert the numeric values of every row in ``data_set`` to ``float``.

    Args:
        data_set: Iterable of rows.
        mode: ``'training'`` converts all but the last element of each row and
            keeps the last element unchanged; ``'test'`` converts every
            element.

    Returns:
        A new list of converted rows.

    An unknown ``mode``, or a ``ValueError`` raised during conversion, prints
    a message and calls ``exit()``.
    """
    converted = []
    try:
        if mode == 'training':
            converted.extend(
                [float(value) for value in row[:-1]] + [row[-1]]
                for row in data_set
            )
        elif mode == 'test':
            converted.extend(
                [float(value) for value in row] for row in data_set
            )
        else:
            print('Invalid mode, program will exit.')
            exit()

        return converted

    except ValueError as err:
        print(err)
        print('Invalid data set format, program will exit.')
        exit()

In [125]:
def knn(training_set, test_set, k):
    """Classify every test row by majority vote of its ``k`` nearest rows.

    Distance is Euclidean, computed over the feature part of each training
    row against the test row.

    Args:
        training_set: Non-empty list of rows (each a list) whose last element
            is the row's label and whose other elements are numeric features.
        test_set: Iterable of feature rows without a label.
        k: Number of nearest training rows that vote.

    Returns:
        A list with one predicted label per row of ``test_set``.

    Raises:
        IndexError: If ``training_set`` is empty.

    Errors raised while classifying now propagate to the caller. Previously
    they were printed and the function silently returned ``None``, which only
    surfaced later as an unrelated failure.
    """
    # Number of feature columns: everything except the trailing label.
    limit = len(training_set[0]) - 1

    # Find the labels present in the training set.
    labels = get_labels(training_set)

    predicted_labels = []

    for test_instance in test_set:
        # Each candidate is the training row with its distance appended, so
        # the label ends up second-to-last, where predict_label reads it.
        distances = [
            row + [sqrt(sum((x - y) * (x - y)
                            for x, y in zip(row[:limit], test_instance)))]
            for row in training_set
        ]
        distances.sort(key=itemgetter(-1))

        neighbors = find_neighbors(distances, k)
        index, _votes = predict_label(neighbors, labels)
        predicted_labels.append(labels[index])

    return predicted_labels

### Experiments

#### Experiment 1. Flowers dataset (https://www.kaggle.com/alxmamaev/flowers-recognition)
##### Step 1. Preprocessing

In [126]:
# Folder holding the flowers dataset; load_images lists the images in it.
path_to_data = "flowers"

imagePaths = load_images(path_to_data)

4323  images are collected


In [127]:
# Build pixel vectors, colour histograms and labels for every image path.
pix_images, hist_images, labels = img_to_vectors(imagePaths)

##### Step 2. Split to TRAIN and TEST

In [129]:
# Random split on the histogram features, using the default test_size of 0.25.
training_set, training_labels, test_set, test_labels = train_test_split(hist_images, labels)

##### Step 3. Apply KNN

In [130]:
# Convert the rows to lists of floats for the hand-written knn.
# NOTE: 'training' mode leaves the last element of each row unconverted.
training = dataset_to_float(training_set, 'training')
test = dataset_to_float(test_set, 'test')

In [131]:
# Append each label to the end of its feature row, the layout knn reads.
training = join_set_labels(training, training_labels)

In [148]:
# Classify the test rows with the hand-written knn using k = 17.
predicted_classes_by_hist = knn(training, test, 17)
print('Finished')

Finished


#### Calculate accuracy

In [149]:
# Compare the predictions with the held-out test labels.
get_accuracy(predicted_classes_by_hist, test_labels)

Accuracy:  0.4516423357664234


##### Step 4. Apply KNN from SKlearn

In [147]:
# Reference run: scikit-learn's KNeighborsClassifier with the same k = 17,
# fitted and scored on the same split as the hand-written knn.
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=17, n_jobs=-1)

model.fit(training_set, training_labels)
acc = model.score(test_set, test_labels)

print("Accuracy: ", acc)

Accuracy:  0.460766423358


#### Experiment 2.1. Fruits dataset (https://www.kaggle.com/moltean/fruits )
##### Step 1. Preprocessing

In [13]:
# Training folder of the fruits dataset; load_images lists the images in it.
path_to_data = "fruits/Training"

imagePaths = load_images(path_to_data)

5376  images are collected


In [16]:
# Build pixel vectors, colour histograms and labels for every image path.
pix_images, hist_images, labels = img_to_vectors(imagePaths)

##### Step 2. Split to TRAIN and TEST

In [17]:
# Random split on the histogram features, using the default test_size of 0.25.
training_set, training_labels, test_set, test_labels = train_test_split(hist_images, labels)

##### Step 3. Apply KNN

In [18]:
# Convert the rows to lists of floats for the hand-written knn.
# NOTE: 'training' mode leaves the last element of each row unconverted.
training = dataset_to_float(training_set, 'training')
test = dataset_to_float(test_set, 'test')

In [19]:
# Append each label to the end of its feature row, the layout knn reads.
training = join_set_labels(training, training_labels)

In [22]:
# Classify the test rows with the hand-written knn using k = 8.
predicted_classes_by_hist = knn(training, test, 8)
print('Finished')

Finished


#### Calculate accuracy

In [24]:
# Compare the predictions with the held-out test labels.
get_accuracy(predicted_classes_by_hist, test_labels)

Accuracy:  1.0


##### Step 4. Apply KNN from SKlearn

In [25]:
# Reference run: scikit-learn's KNeighborsClassifier with the same k = 8,
# fitted and scored on the same split as the hand-written knn.
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=8, n_jobs=-1)

model.fit(training_set, training_labels)
acc = model.score(test_set, test_labels)

print("Accuracy: ", acc)

Accuracy:  1.0


#### Experiment 2.2. Fruits dataset (http://www.vicos.si/Downloads/FIDS30)
##### Step 1. Preprocessing

In [165]:
# Folder holding the FIDS30 images; load_images lists the images in it.
path_to_data = "FIDS301"

imagePaths = load_images(path_to_data)

369  images are collected


In [166]:
# Build pixel vectors, colour histograms and labels for every image path.
pix_images, hist_images, labels = img_to_vectors(imagePaths)

##### Step 2. Split to TRAIN and TEST

In [167]:
# Random split on the histogram features, using the default test_size of 0.25.
training_set, training_labels, test_set, test_labels = train_test_split(hist_images, labels)

##### Step 3. Apply KNN

In [168]:
# Convert the rows to lists of floats for the hand-written knn.
# NOTE: 'training' mode leaves the last element of each row unconverted.
training = dataset_to_float(training_set, 'training')
test = dataset_to_float(test_set, 'test')

In [169]:
# Append each label to the end of its feature row, the layout knn reads.
training = join_set_labels(training, training_labels)

In [182]:
# Classify the test rows with the hand-written knn using k = 6.
predicted_classes_by_hist = knn(training, test, 6)
print('Finished')

Finished


#### Calculate accuracy

In [185]:
# Compare the predictions with the held-out test labels.
get_accuracy(predicted_classes_by_hist, test_labels)

Accuracy:  0.5959595959595959


##### Step 4. Apply KNN from SKlearn

In [186]:
# Reference run: scikit-learn's KNeighborsClassifier with the same k = 6,
# fitted and scored on the same split as the hand-written knn.
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=6)

model.fit(training_set, training_labels)
acc = model.score(test_set, test_labels)

print("Accuracy: ", acc)

Accuracy:  0.565656565657


#### Experiment 3. Audio Cats and Dogs (https://www.kaggle.com/mmoreaux/audio-cats-and-dogs )
##### Step 1. Preprocessing

In [84]:
from scipy.io.wavfile import read
import numpy as np
import wave, os, glob
import csv

In [85]:
# Folder that holds the cat/dog .wav recordings.
path = 'cats_dogs'

# Every .wav file directly inside `path`.
cat_dog_files = glob.glob(os.path.join(path, '*.wav'))

data = []
labels = []

for file in cat_dog_files:

    # Decide the label from the file name alone. The previous substring test
    # ("cats_dogs/cat_") hard-coded a "/" separator, so on platforms where
    # glob returns a different separator every file was labelled "dog".
    if os.path.basename(file).startswith("cat_"):
        label = "cat"
    else:
        label = "dog"

    # read() returns (sample_rate, samples); only the samples are used.
    a = read(file)
    sound = np.array(a[1], dtype=float)

    # Describe each recording by five summary statistics of its samples.
    data_example = [
        np.mean(sound),
        np.median(sound),
        np.min(sound),
        np.max(sound),
        np.std(sound),
    ]
    labels.append(label)

    data.append(data_example)

In [86]:
# Dump the extracted feature rows to CSV. The csv module requires files to be
# opened with newline=''; without it, extra blank rows appear on platforms
# that translate "\n" on write.
with open('cat_dog_file.csv', 'w', newline='') as cat_dog_file:
    writer = csv.writer(cat_dog_file)
    writer.writerows(data)

##### Step 2. Split to TRAIN and TEST

In [87]:
# Random split of the audio feature rows, using the default test_size of 0.25.
training_set, training_labels, test_set, test_labels = train_test_split(data, labels)

In [89]:
# Convert the rows to lists of floats for the hand-written knn.
# NOTE: 'training' mode leaves the last element of each row unconverted.
training = dataset_to_float(training_set, 'training')
test = dataset_to_float(test_set, 'test')

In [90]:
# Append each label to the end of its feature row, the layout knn reads.
training = join_set_labels(training, training_labels)

In [105]:
# Classify the test rows with the hand-written knn using k = 25.
predicted_classes = knn(training, test, 25)
print('Finished')

Finished


In [106]:
# Compare the predictions with the held-out test labels.
get_accuracy(predicted_classes, test_labels)

Accuracy:  0.6956521739130435


In [113]:
# Reference run: scikit-learn's KNeighborsClassifier with the same k = 25,
# fitted and scored on the same split as the hand-written knn.
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=25, n_jobs=-1)

model.fit(training_set, training_labels)
acc = model.score(test_set, test_labels)

print("Accuracy: ", acc)

Accuracy:  0.695652173913
