In [7]:
# Setup code for this notebook
import sys
import numpy as np
sys.path.insert(0, 'utils')
from utils import *

"""
Input:      
      in_train: a [number of samples x number of features] matrix with the training data
      out_train: a [number of samples x labels] matrix with the labels of the training data
      in_test: a [number of test samples x number of features] matrix with the testing data
      k: the number of nearest neighbors (K) consulted for each test sample
Output:
      labels: a [number of test samples x labels] matrix holding, for each sample of in_test, how many of its k nearest neighbors carry each label
"""

# Euclidean distance
def euclidean_distance (x,y):
      """Return the Euclidean distance between corresponding rows of x and y.

      The element-wise difference ``x - y`` is squared, summed along axis 1
      and square-rooted, so the difference must have at least two dimensions
      (ordinary NumPy broadcasting between x and y applies).
      """
      delta = x - y
      squared_sum = np.square(delta).sum(axis=1)
      return np.sqrt(squared_sum)

def run_knn (in_train, out_train, in_test, k):
      """Collect the label votes of the k nearest training samples for each test sample.

      Parameters
      ----------
      in_train : ndarray, [number of training samples x number of features]
            Training data.
      out_train : ndarray, [number of training samples x number of labels]
            Label rows of the training data; the rows of the selected
            neighbors are summed to form the result.
      in_test : ndarray, [number of test samples x number of features]
            Samples to classify.
      k : int
            Number of nearest neighbors to consult. Must be at least 1.

      Returns
      -------
      labels : ndarray, [number of test samples x number of labels]
            Row i is the sum of the out_train rows of the k training samples
            closest (Euclidean distance) to in_test[i].

      Raises
      ------
      ValueError
            If k is smaller than 1. A non-positive k would otherwise be used
            as a slice bound and silently select the wrong neighbors
            (none for 0, all-but-the-farthest for negative values).
      """
      if k < 1:
           raise ValueError('k must be a positive integer, got {0}'.format(k))

      n_test = in_test.shape[0]
      n_labels = out_train.shape[1]

      # The labels array that will be returned
      labels = np.zeros ([n_test, n_labels])

      for i in range(n_test):
           # Distance from this test sample to every training sample. The
           # single row broadcasts against in_train, so no tiled copy of the
           # row has to be allocated.
           dists = euclidean_distance (in_test[i,:], in_train)

           # Indices of the k closest training samples
           nearest = np.argsort (dists)[:k]

           # Sum the neighbors' label rows; the column with the highest
           # value is the winning label for this sample
           labels[i] = out_train[nearest].sum(axis=0)

      return labels

def main(data_path='documents/data.csv', k=7, train_fraction=0.7, seed=None):
      """Run a shuffled train/test k-NN experiment and print the error count.

      Parameters
      ----------
      data_path : str
            Comma-separated file to load. The last column is the class label,
            every other column is a feature.
      k : int
            Number of neighbors passed to run_knn.
      train_fraction : float
            Share of the (shuffled) rows used for training; the remaining
            rows are used for testing.
      seed : int or None
            When given, the shuffle uses a private RandomState seeded with
            this value so the split is reproducible. When None (default) the
            global NumPy random state is used, as before.

      Called with no arguments this behaves like the original hard-coded
      version (documents/data.csv, k=7, 70/30 split, unseeded shuffle).
      """
      # loading the data set
      dataset = np.genfromtxt(data_path, delimiter=',')

      # Number of samples and features + label (the last position of the array is the class label)
      [nsp, feat] = dataset.shape

      # Shuffling the dataset (in place, along the first axis)
      rng = np.random if seed is None else np.random.RandomState(seed)
      rng.shuffle(dataset)

      # Splitting into training and test rows
      sli = int(round(nsp*train_fraction))
      in_train = dataset[0:sli,0:feat-1]
      # ind2vec comes from utils; presumably it turns the (shifted, zero-based)
      # class indices into one row per sample -- confirm against utils.
      out_train = ind2vec((dataset[0:sli,feat-1])-1)
      in_test = dataset[sli:nsp,0:feat-1]
      out_test = ind2vec(dataset[sli:nsp,feat-1]-1)

      res = run_knn (in_train, out_train, in_test, k)

      # cont_error comes from utils; its result is reported as the error count.
      print('number of missclassification: {0}'.format(cont_error (out_test, res)))

# Run the experiment. By default the data is shuffled without a fixed seed,
# so the count printed below can differ from one run to the next.
main()

number of missclassification: 5977


In [None]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Load the CSV as a float matrix (same file as the k-NN experiment above).
_dataset = np.genfromtxt('documents/data.csv', delimiter=',')

# Fit on `_dataset`: no variable named `X` is defined in this notebook, so
# fitting on `X` raises NameError on a fresh kernel.
# NOTE(review): every column is used here, including the last one, which the
# earlier cell treats as the class label -- confirm that is intended.
nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(_dataset)

# The query set equals the fitted set, so for each row the two neighbors
# returned normally are the row itself and its closest other sample.
distances, indices = nbrs.kneighbors(_dataset)
indices