In [3]:
# import the necessary packages
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import RandomizedSearchCV
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.cross_validation import train_test_split
from imutils import paths
import numpy as np
import argparse
import imutils
import time
import cv2
import os

In [4]:
def image_to_feature_vector(image, size=(32, 32)):
    # resize the image to a fixed size, then flatten the image into
    # a list of raw pixel intensities
    return cv2.resize(image, size).flatten()

In [6]:
def extract_color_histogram(image, bins=(8, 8, 8)):
    # extract a 3D color histogram from the HSV color space using
    # the supplied number of `bins` per channel
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    hist = cv2.calcHist([hsv], [0, 1, 2], None, bins, [0, 180, 0, 256, 0, 256])
 
    # handle normalizing the histogram if we are using OpenCV 2.4.X
    if imutils.is_cv2():
        hist = cv2.normalize(hist)
 
    # otherwise, perform "in place" normalization in OpenCV 3 (I
    # personally hate the way this is done
    else:
        cv2.normalize(hist, hist)
 
    # return the flattened histogram as the feature vector
    return hist.flatten()

In [7]:
# grab the list of images that we'll be describing
print("[INFO] describing images...")
relevant_path = "data/train/"
imagePaths = list(paths.list_images(relevant_path))
 
# initialize the data matrix and labels list
rawImages = []
data = []
labels = []

[INFO] describing images...


In [8]:
# loop over the input images
for (i, imagePath) in enumerate(imagePaths):
    # load the image and extract the class label (assuming that our
    image = cv2.imread(imagePath)
    # extract the class label from the image path and update the
    # labels list
    label = imagePath.split(os.path.sep)[-2]
    label = 1 if (label.endswith("1")) else 0
    
    pixels = image_to_feature_vector(image)
    # extract a color histogram from the image, then update the
    # data matrix and labels list
    hist = extract_color_histogram(image)
    rawImages.append(pixels)
    data.append(hist)
    labels.append(label)
 
    # show an update every 50 images
    if i > 0 and i % 50 == 0:
        print("[INFO] processed {}/{}".format(i, len(imagePaths)))

[INFO] processed 50/248
[INFO] processed 100/248
[INFO] processed 150/248
[INFO] processed 200/248


In [9]:
import image_data as imdg

print("[INFO] Leyendo imagenes ...")
# initialize the data matrix and labels list
data = []
labels = []

data_train = []
label_train = []
data_valid = []
label_valid = []

relevant_path = "data/train/"
data_train, labels_train = imdg.get_data_flatten(relevant_path)
print('Cantidad imagenes training : ', len(data_train))

relevant_path = "data/valid/"
data_valid, labels_valid = imdg.get_data_flatten(relevant_path)
print('Cantidad imagenes validacion : ', len(data_valid))

data_train += data_valid
labels_train += labels_valid
print('Total imagenes : ', len(data_train))

Using TensorFlow backend.


[INFO] Leyendo imagenes ...
Cantidad imagenes training :  248
Cantidad imagenes validacion :  90
Total imagenes :  338


In [10]:
# partition the data into training and testing splits, using 75%
# of the data for training and the remaining 25% for testing
print("[INFO] constructing training/testing split...")
(trainData, testData, trainLabels, testLabels) = train_test_split(data_train, labels_train, test_size=0.25, random_state=42)
print('trainData {}, testData {}, trainLabels {}, testLabels {}'.format(len(trainData), len(testData), len(trainLabels), len(testLabels)))

[INFO] constructing training/testing split...
trainData 253, testData 85, trainLabels 253, testLabels 85


In [11]:
# construct the set of hyperparameters to tune
params = {"n_neighbors": np.arange(1, 31, 2),"metric": ["euclidean", "cityblock"]}


In [12]:
# tune the hyperparameters via a cross-validated grid search
print("[INFO] tuning hyperparameters via grid search")
model_KNN = KNeighborsClassifier(-1)
grid = GridSearchCV(model_KNN, params)
start = time.time()
grid.fit(trainData, trainLabels)
 
# evaluate the best grid searched model on the testing data
print("[INFO] grid search took {:.2f} seconds".format(time.time() - start))
acc = grid.score(testData, testLabels)
print("[INFO] grid search accuracy: {:.2f}%".format(acc * 100))
print("[INFO] grid search best parameters: {}".format(grid.best_params_))

[INFO] tuning hyperparameters via grid search
[INFO] grid search took 1.20 seconds
[INFO] grid search accuracy: 95.29%
[INFO] grid search best parameters: {'metric': 'euclidean', 'n_neighbors': 1}


In [13]:
# tune the hyperparameters via a randomized search
grid = RandomizedSearchCV(model_KNN, params)
start = time.time()
grid.fit(trainData, trainLabels)
 
# evaluate the best randomized searched model on the testing
# data
print("[INFO] randomized search took {:.2f} seconds".format(time.time() - start))

# Calculo el score (Exactitud) con el dataset de testeo
score_test_KNN = grid.score(testData, testLabels)

# Calculo tambien el score del dataset de entrenamiento para comparar
score_train_KNN = grid.score(trainData, trainLabels)

# Imprimir score de testing y de training
print("Test Accuracy (Exactitud): {:.2f}% ",format(score_test_KNN * 100)) 
print("Train Accuracy (Exactitud): {:.2f}% ",format(score_train_KNN * 100))
print("[INFO] randomized search best parameters: {}".format(grid.best_params_))


[INFO] randomized search took 0.46 seconds
Test Accuracy (Exactitud): {:.2f}%  88.23529411764706
Train Accuracy (Exactitud): {:.2f}%  91.699604743083
[INFO] randomized search best parameters: {'n_neighbors': 5, 'metric': 'cityblock'}


In [14]:
# instantiate lerning model (k = 3)
knn = KNeighborsClassifier(n_neighbors=4)

In [15]:
# fitting the model
knn.fit(trainData, trainLabels)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=4, p=2,
           weights='uniform')

In [16]:
# predict the response
pred = knn.predict(testData)

print(accuracy_score(testLabels, pred))

0.941176470588


In [17]:
print('Dimensión dataset:',trainData[0].shape)
# Imprimo primeros 10 elementos
print(trainData[:10])
A = model_KNN.kneighbors_graph(trainData[:50])
A.toarray()

Dimensión dataset: (512,)
[array([ 0.15535022,  0.04964944,  0.05185879,  0.06735902,  0.36534268,
        0.32966256,  0.85089475,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0. 

AttributeError: 'NoneType' object has no attribute 'shape'

In [18]:
# creating odd list of K for KNN
k_list = list(range(1,64))

# subsetting just the odd ones
neighbors = list(filter(lambda x: x % 2 != 0, k_list))

# creating list of cv scores
cv_scores = []
cv_scores_std = []

# perform 10-fold cross validation
for k in k_list:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, trainData, trainLabels, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())
    cv_scores_std.append(scores.std())

In [19]:
import seaborn as sns

# changing to misclassification error
MSE = [1 - x for x in cv_scores]

# determining best k
optimal_k = neighbors[MSE.index(min(MSE))]
print ("the optimal number of neighbors is {} ".format(optimal_k))  
                      

the optimal number of neighbors is 1 


In [22]:
# finding best k
best_k = k_list[MSE.index(min(MSE))]
print("The best number of neighbors is %d." % best_k)

The best number of neighbors is 1.


In [36]:
def predict(X_train, y_train, x_test, k):
    # create list for distances and targets
    distances = []
    targets = []

    for i in range(len(X_train)):
        # first we compute the euclidean distance
        distance = np.sqrt(np.sum(np.square(x_test - X_train[i, :])))
        # add it to list of distances
        distances.append([distance, i])

    # sort the list
    distances = sorted(distances)

    # make a list of the k neighbors' targets
    for i in range(k):
        index = distances[i][1]
        targets.append(y_train[index])

    # return most common target
    return Counter(targets).most_common(1)[0][0]