In [3]:
import math

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Iris Dataset

In [4]:
# Load the Iris dataset; the 'data' and 'target' entries are read in the next cell.
# (load_iris is imported in the notebook's import cell.)
iris = load_iris()

In [5]:
# Feature matrix and class labels of the Iris samples
X, y = iris['data'], iris['target']

In [6]:
# Hold out part of the samples for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=100,
)

### Function for 1 Nearest Neighbour Algorithm

In [7]:
def nn(X_train, y_train, X_test):
    """Classify each test sample with the 1-nearest-neighbour rule.

    Every test sample receives the label of the training sample that is
    closest to it in Euclidean distance. If several training samples are
    equally close, the one that appears first in ``X_train`` wins.

    Parameters
    ----------
    X_train : array-like of shape (n_train, n_features)
        Training samples.
    y_train : sequence of length n_train
        Label of each training sample.
    X_test : array-like of shape (n_test, n_features)
        Samples to classify.

    Returns
    -------
    list
        One predicted label (an element of ``y_train``) per test sample.

    Raises
    ------
    ValueError
        If the training set is empty, if ``X_train`` and ``y_train`` have
        different lengths, or if every distance for a test sample is NaN.
    """
    X_train = np.asarray(X_train, dtype=float)

    if len(X_train) == 0:
        raise ValueError("X_train must contain at least one sample")
    if len(X_train) != len(y_train):
        raise ValueError("X_train and y_train must have the same length")

    y_pred = []

    for x_i in np.asarray(X_test, dtype=float):
        # Squared Euclidean distance to every training sample at once.
        # The square root is skipped: it is monotonic, so the nearest
        # sample is the same with or without it.
        sq_dists = ((X_train - x_i) ** 2).sum(axis=1)

        # nanargmin ignores NaN distances and returns the first index of
        # the smallest remaining one.
        nearest = np.nanargmin(sq_dists)
        y_pred.append(y_train[nearest])

    return y_pred


### Function to calculate the number of errors (n_errors) and error rate (r_errors)

In [8]:
def compute_errors(y_pred, y_test):
    """Count misclassifications and compute the error rate.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels.
    y_test : array-like
        True labels, same shape as ``y_pred``.

    Returns
    -------
    n_errors : numpy integer
        Number of positions where ``y_pred`` and ``y_test`` differ.
    r_errors : numpy float
        ``n_errors`` divided by the number of labels.

    Raises
    ------
    ValueError
        If the two inputs have different shapes or are empty.
    """
    # Convert both inputs to arrays so that `!=` is always element-wise.
    # Comparing two plain lists yields a single bool, which would cap the
    # error count at 1.
    y_pred = np.asarray(y_pred)
    y_test = np.asarray(y_test)

    if y_pred.shape != y_test.shape:
        raise ValueError(
            "y_pred and y_test must have the same shape, got "
            "{} and {}".format(y_pred.shape, y_test.shape)
        )
    if y_test.size == 0:
        raise ValueError("cannot compute an error rate for empty input")

    n_errors = np.sum(y_pred != y_test)

    r_errors = n_errors / y_test.size

    return n_errors, r_errors

### Creating predictions for test dataset and computing the number of errors and error rate

In [9]:
# Predict a label for every test sample with the 1-nearest-neighbour function
y_pred_nn = nn(X_train, y_train, X_test)

# Count the misclassified test samples and derive the error rate
n_test_errors, r_test_errors = compute_errors(y_pred_nn, y_test)
print('Number of errors:', n_test_errors)
print('Error rate:', r_test_errors)

Number of errors: 1
Error rate: 0.0263157894737


The results show that there is only 1 error and the error rate is 0.0263.

### Function for K Nearest Neighbours

In [10]:
def knn(X_train, y_train, X_test, k=1):
    """Classify each test sample by majority vote among its k nearest neighbours.

    Neighbours are ranked by Euclidean distance. Training samples at equal
    distance are ranked in their ``X_train`` order, and every training
    sample is counted at most once. If several classes receive the same
    number of votes, the class of the closest neighbour among them wins.

    Parameters
    ----------
    X_train : array-like of shape (n_train, n_features)
        Training samples.
    y_train : sequence of length n_train
        Label of each training sample (labels must be hashable).
    X_test : array-like of shape (n_test, n_features)
        Samples to classify.
    k : int, default 1
        Number of neighbours that vote. If ``k`` exceeds the number of
        training samples, all training samples vote.

    Returns
    -------
    list
        One predicted label (an element of ``y_train``) per test sample.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, the training set is empty, or
        ``X_train`` and ``y_train`` have different lengths.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    X_train = np.asarray(X_train, dtype=float)
    n_train = len(X_train)

    if n_train == 0:
        raise ValueError("X_train must contain at least one sample")
    if n_train != len(y_train):
        raise ValueError("X_train and y_train must have the same length")

    # Never ask for more neighbours than there are training samples.
    k = min(k, n_train)

    y_pred = []

    for x_i in np.asarray(X_test, dtype=float):
        # Squared Euclidean distance to every training sample at once.
        # The square root is skipped because it does not change the ranking.
        sq_dists = ((X_train - x_i) ** 2).sum(axis=1)

        # Select neighbours by *index*, not by distance value. Looking a
        # label up from its distance returns the same training sample for
        # every neighbour that shares that distance. Mergesort is stable,
        # so equidistant samples keep their training order.
        neighbours = np.argsort(sq_dists, kind='mergesort')[:k]

        votes = {}
        for idx in neighbours:
            label = y_train[idx]
            votes[label] = votes.get(label, 0) + 1

        # Walk the neighbours nearest-first and keep the first label with
        # the strictly highest vote count, so ties go to the closer class.
        top_class = None
        num_votes = 0
        for idx in neighbours:
            label = y_train[idx]
            if votes[label] > num_votes:
                num_votes = votes[label]
                top_class = label

        y_pred.append(top_class)

    return y_pred

In [11]:
# 3-nearest-neighbour predictions on the Iris test split, then their error statistics
y_pred_knn = knn(X_train, y_train, X_test, k=3)
n_errors, r_errors = compute_errors(y_pred_knn, y_test)
print('Number of errors:', n_errors)
print('Error rate:', r_errors)

Number of errors: 1
Error rate: 0.0263157894737


With k=3, the KNN algorithm makes the same number of errors and has the same error rate as the 1-nearest-neighbour algorithm.

### Testing results against scikit-learn method

In [12]:
# Reference check: scikit-learn's 3-nearest-neighbour classifier on the same Iris split.
# (KNeighborsClassifier is imported in the notebook's import cell.)
knn_test = KNeighborsClassifier(n_neighbors=3)
knn_test.fit(X_train, y_train)
prediction = knn_test.predict(X_test)
n_errors_test, r_errors_test = compute_errors(prediction, y_test)
print('Number of errors:', n_errors_test)
print('Error rate:', r_errors_test)

Number of errors: 1
Error rate: 0.0263157894737


The scikit-learn classifier gives the same results.

### Ionosphere Dataset

In [13]:
# Columns 0-33 of the comma-separated file are read as features, column 34 as the integer label
X2 = np.genfromtxt('ionosphere.txt', delimiter=',', usecols=range(34))
y2 = np.genfromtxt('ionosphere.txt', delimiter=',', usecols=34, dtype='int')

# Same fixed seed as for the Iris split, so the partition is repeatable
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    random_state=100,
)

For this new dataset, I will use the `knn` function defined earlier. Since it handles a general k, it also covers the 1-nearest-neighbour case (k=1).

### One Nearest Neighbour

In [14]:
# k=1 asks knn for a single neighbour per test sample (the 1-nearest-neighbour case)
y2_pred_nn = knn(X2_train, y2_train, X2_test, k=1)
# Count the misclassified ionosphere test samples and derive the error rate
n2_errors, r2_errors = compute_errors(y2_pred_nn, y2_test)
print('Number of errors:', n2_errors)
print('Error rate:', r2_errors)

Number of errors: 10
Error rate: 0.113636363636


For this dataset, there are 10 errors and an error rate of 0.114.

### Three Nearest Neighbours

In [15]:
# 3-nearest-neighbour predictions on the ionosphere test split, then their error statistics
y2_pred_knn = knn(X2_train, y2_train, X2_test, k=3)
n2_errors, r2_errors = compute_errors(y2_pred_knn, y2_test)
print('Number of errors:', n2_errors)
print('Error rate:', r2_errors)

Number of errors: 12
Error rate: 0.136363636364


When using 3 Nearest Neighbours, there are 12 errors and an error rate of 0.136. 

### Testing results against scikit-learn method

In [18]:
# Reference check: scikit-learn's 3-nearest-neighbour classifier on the ionosphere split.
# (KNeighborsClassifier is imported in the notebook's import cell; the
# duplicate import that used to sit here was removed.)
knn_test = KNeighborsClassifier(n_neighbors=3)
knn_test.fit(X2_train, y2_train)
prediction = knn_test.predict(X2_test)
n2_errors, r2_errors = compute_errors(prediction, y2_test)
print(n2_errors)
print(r2_errors)

12
0.136363636364


When using the scikit-learn method, the results are the same.