Relevant imports and loading the datasets.

In [158]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the iris dataset; a single call is enough.
iris = load_iris()
X = iris['data']  # Array of data values
y = iris['target']  # Array of corresponding labels

In [159]:
# Load the ionosphere dataset from a comma-separated text file.
# The file is read twice: once for the feature columns and once for the label column.
A = np.genfromtxt("ionosphere.txt", delimiter=",",
usecols=np.arange(34)) # Columns 0-33: the data values

B = np.genfromtxt("ionosphere.txt", delimiter=",",
usecols=34, dtype='int') # Column 34: the labels, parsed as integers

Splitting the datasets into training and test sets.

In [160]:
# Splitting iris dataset into training and test parts.
# random_state is fixed so the same split is produced on every run; no test_size
# is passed, so the library's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X,
y, random_state=1202)

# Splitting ionosphere dataset the same way, with the same seed.
A_train, A_test, B_train, B_test = train_test_split(A,
B, random_state=1202)

###### The Euclidean distance function - calculates the distance between the given point and every row (sample) of a data array. Returns a list of all the distances, in the same order as the rows.


In [161]:
def euclidean_distance(point, data):
    """Compute the Euclidean distance from ``point`` to each entry of ``data``.

    Parameters:
        point: the query point (array-like supporting subtraction with an
            entry of ``data``).
        data: an indexable collection of points with a length.

    Returns:
        A list with one distance per entry of ``data``; position ``i`` of the
        list is the distance between ``point`` and ``data[i]``.
    """
    # One L2 norm per entry, kept in the original order so that list positions
    # can later be used as indexes back into ``data``.
    return [np.linalg.norm(point - data[row]) for row in range(len(data))]

#### A simple bubble sort to sort the array of distances in ascending order.

In [162]:
def bubble_sort_ascending(distances_and_indexes):
    """Sort a list in ascending order, in place, using bubble sort.

    Parameters:
        distances_and_indexes: a mutable list of mutually comparable items.
            In this notebook the items are ``[distance, index]`` pairs, which
            compare by distance first and by index on equal distances.

    Returns:
        The same list object that was passed in, now sorted ascending.
    """
    n = len(distances_and_indexes)

    for completed_passes in range(n - 1):
        swapped = False

        # Each pass bubbles the largest remaining item to the end, so the last
        # ``completed_passes`` positions are already final and are skipped.
        for j in range(n - 1 - completed_passes):
            if distances_and_indexes[j] > distances_and_indexes[j + 1]:
                distances_and_indexes[j], distances_and_indexes[j + 1] = (
                    distances_and_indexes[j + 1],
                    distances_and_indexes[j],
                )
                swapped = True

        # A pass without any swap means the list is already sorted.
        if not swapped:
            break

    return distances_and_indexes

###### The k-least-distances function - pairs each distance with its index, sorts the pairs in ascending order of distance, and returns the indexes of the k closest points.

In [163]:
def k_least_distances(distances, k):
    """Return the positions of the ``k`` smallest values in ``distances``.

    Parameters:
        distances: a sequence of distances; position ``i`` belongs to point ``i``.
        k: how many of the smallest distances to report.

    Returns:
        A list of ``k`` indexes into ``distances``, ordered from the smallest
        distance upwards.
    """
    # Tag every distance with its original position so the position survives
    # the sort and can be mapped back to the point it came from.
    tagged = [[value, position] for position, value in enumerate(distances)]

    # Sort the [distance, index] pairs in ascending order with the bubble sort.
    ranked = bubble_sort_ascending(tagged)

    # Keep only the index part of the first k pairs.
    return [ranked[rank][1] for rank in range(k)]

###### Function that finds the labels and values of the nearest neighbours by their indexes. Returns 2 arrays: an array of labels and an array of values

In [164]:
def nearest_neighbours(neighbour_indexes, data, labels):
    """Look up the labels and values of the neighbours at the given indexes.

    Parameters:
        neighbour_indexes: list of positions into ``data`` / ``labels``.
        data: indexable collection of points.
        labels: indexable collection of labels, aligned with ``data``.

    Returns:
        A tuple ``(neighbour_labels, neighbour_values)`` of two lists, both in
        the order given by ``neighbour_indexes``. Note the labels come first.
    """
    picked_labels = [labels[position] for position in neighbour_indexes]
    picked_values = [data[position] for position in neighbour_indexes]
    return picked_labels, picked_values

###### The majority vote function - If K > 1 a vote is needed: the most popular label of the nearest neighbours will be the prediction label of the new point.
Returns a single value.

In [165]:
def majority_vote(nearest_labels):
    """Return the most frequent label among ``nearest_labels``.

    Parameters:
        nearest_labels: an iterable of labels (list, tuple, NumPy array, ...),
            expected to be ordered from the nearest neighbour to the farthest.

    Returns:
        The label with the highest count. When several labels share the
        highest count, the one whose first occurrence comes last in
        ``nearest_labels`` wins. An empty input yields the empty string ``''``.
    """
    # Materialise the input so that .count() is available for any iterable.
    nearest_labels = list(nearest_labels)

    winner = ''
    winner_count = None  # No label has been counted yet
    seen = []  # Distinct labels, in order of first appearance

    for label in nearest_labels:
        if label in seen:
            continue
        seen.append(label)

        counted = nearest_labels.count(label)
        # ">=" (not ">") keeps the tie rule: on equal counts the label that was
        # encountered later replaces the current winner.
        if winner_count is None or counted >= winner_count:
            winner = label
            winner_count = counted

    return winner

##### The K Nearest neighbours algorithm - pieces together all of the above functions.

Finds k nearest neighbours of point input, returns single value: the predicted label of the new point.

In [166]:
def k_nearest_neighbours(point, data, labels, k):
    """Predict the label of ``point`` from its ``k`` nearest neighbours.

    Parameters:
        point: the sample to classify.
        data: the training samples.
        labels: the training labels, aligned with ``data``.
        k: number of neighbours to consult; must satisfy 1 <= k <= len(data).

    Returns:
        The predicted label (majority vote over the k nearest neighbours).
        For an invalid ``k`` a message is printed and ``0`` is returned.
        NOTE(review): ``0`` can also be a genuine label (e.g. an iris class),
        so callers should validate ``k`` themselves rather than rely on it.
    """
    # Reject zero and negative K: a negative K would otherwise select no
    # neighbours at all and produce an empty-string "prediction".
    if k <= 0:
        print("Expecting K > 0 input.")
        return 0

    # K == len(data) is allowed, so the strict upper bound is len(data) + 1.
    if k > len(data):
        print("Expecting K < ", len(data) + 1, " input.")
        return 0

    # Indexes of the k training samples closest to the point.
    neighbour_indexes = k_least_distances(euclidean_distance(point, data), k)

    # Only the neighbours' labels are needed for the vote.
    neighbour_labels, _ = nearest_neighbours(neighbour_indexes, data, labels)

    # The predicted label is the majority label among the neighbours.
    return majority_vote(neighbour_labels)

## Testing for the iris dataset

Made the testing sequence into a function - easier to call it multiple times with different values of K.

In [167]:
def iris_predictions(X_test, X_train, y_test, y_train, k):
    """Classify every test sample with k-NN and print an error report.

    Parameters:
        X_test: test samples.
        X_train: training samples.
        y_test: true labels of the test samples, aligned with ``X_test``.
        y_train: labels of the training samples, aligned with ``X_train``.
        k: number of neighbours; must satisfy 1 <= k <= len(X_train).

    Returns:
        ``0`` after printing a message when ``k`` is invalid; otherwise
        ``None`` after printing the predictions, a per-sample comparison with
        the true labels, the number of errors and the test error rate.
    """
    # Zero and negative K are both invalid.
    if k <= 0:
        print("Expecting K > 0 input.")
        return 0
    if k > len(X_train):
        print("Expecting K < ", len(X_train)+1, " input.")
        return 0

    # Predict a label for each test sample.
    X_test_predicted_labels = [
        k_nearest_neighbours(X_test[i], X_train, y_train, k)
        for i in range(len(X_test))
    ]

    # "True" where the prediction matches the true label, "False" otherwise.
    X_test_predicted_compare = [
        "True" if X_test_predicted_labels[j] == y_test[j] else "False"
        for j in range(len(X_test_predicted_labels))
    ]
    n_test_errors = X_test_predicted_compare.count("False")

    # An empty test set has no errors; avoid dividing by zero.
    if len(X_test):
        test_error_rate = round(n_test_errors/len(X_test),5)
    else:
        test_error_rate = 0.0

    print("For K = ", k, "\n")
    print("The predicted labels: \n", X_test_predicted_labels, "\n")
    print("The predicted labels compared to the true labels: \n", X_test_predicted_compare, "\n")
    print("Number of test errors: ", n_test_errors, "\n")
    print("Test error rate: ", test_error_rate)

### a) iris for K = 1

In [168]:
# Part (a): evaluate the classifier on the iris split with K = 1
iris_predictions(X_test, X_train, y_test, y_train, 1)

For K =  1 

The predicted labels: 
 [1, 2, 1, 0, 0, 2, 0, 1, 2, 0, 0, 1, 2, 1, 1, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 1, 2, 1, 0, 1, 1, 0, 0, 1, 0, 1, 2, 2] 

The predicted labels compared to the true labels: 
 ['True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'] 

Number of test errors:  2 

Test error rate:  0.05263


### b) iris for K = 3

In [169]:
# Part (b): evaluate the classifier on the iris split with K = 3
iris_predictions(X_test, X_train, y_test, y_train, 3)

For K =  3 

The predicted labels: 
 [1, 2, 1, 0, 0, 2, 0, 2, 2, 0, 0, 1, 2, 1, 1, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 1, 2, 1, 0, 1, 1, 0, 0, 1, 0, 1, 2, 2] 

The predicted labels compared to the true labels: 
 ['True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'] 

Number of test errors:  1 

Test error rate:  0.02632


### iris for general K (any K) and handling invalid K inputs

In [170]:
iris_predictions(X_test, X_train, y_test, y_train, 0) # Handling K = 0 --> invalid input, prints a message and returns 0

Expecting K > 0 input.


0

In [171]:
iris_predictions(X_test, X_train, y_test, y_train, 12) # Works for any general K between 1 and len(X_train)

For K =  12 

The predicted labels: 
 [1, 2, 1, 0, 0, 2, 0, 2, 2, 0, 0, 1, 2, 1, 1, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 1, 2, 1, 0, 1, 1, 0, 0, 1, 0, 1, 2, 2] 

The predicted labels compared to the true labels: 
 ['True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'] 

Number of test errors:  1 

Test error rate:  0.02632


In [172]:
iris_predictions(X_test, X_train, y_test, y_train, len(X_train)) # Testing for maximum valid value of K (every training point is a neighbour)

For K =  112 

The predicted labels: 
 [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] 

The predicted labels compared to the true labels: 
 ['False', 'True', 'False', 'False', 'False', 'True', 'False', 'True', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'True', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'True', 'True'] 

Number of test errors:  28 

Test error rate:  0.73684


In [173]:
iris_predictions(X_test, X_train, y_test, y_train, len(X_train)+1) # Handling K > length of training data array --> invalid input

Expecting K <  113  input.


0

## Testing for the ionosphere dataset

Made the testing sequence into a function - easier to call it multiple times with different values of K.

In [174]:
def ionosphere_predictions(A_test, A_train, B_test, B_train, k):
    """Classify every test sample with k-NN and print an error report.

    Parameters:
        A_test: test samples.
        A_train: training samples.
        B_test: true labels of the test samples, aligned with ``A_test``.
        B_train: labels of the training samples, aligned with ``A_train``.
        k: number of neighbours; must satisfy 1 <= k <= len(A_train).

    Returns:
        ``0`` after printing a message when ``k`` is invalid; otherwise
        ``None`` after printing the predictions, a per-sample comparison with
        the true labels, the number of errors and the test error rate.
    """
    # Zero and negative K are both invalid.
    if k <= 0:
        print("Expecting K > 0 input.")
        return 0
    if k > len(A_train):
        print("Expecting K < ", len(A_train)+1, " input.")
        return 0

    # Predict a label for each test sample.
    A_test_predicted_labels = [
        k_nearest_neighbours(A_test[i], A_train, B_train, k)
        for i in range(len(A_test))
    ]

    # "True" where the prediction matches the true label, "False" otherwise.
    A_test_predicted_compare = [
        "True" if A_test_predicted_labels[j] == B_test[j] else "False"
        for j in range(len(A_test_predicted_labels))
    ]
    n_test_errors = A_test_predicted_compare.count("False")

    # An empty test set has no errors; avoid dividing by zero.
    if len(A_test):
        test_error_rate = round(n_test_errors/len(A_test),5)
    else:
        test_error_rate = 0.0

    print("For K = ", k, "\n")
    print("The predicted labels: \n", A_test_predicted_labels, "\n")
    print("The predicted labels compared to the true labels: \n", A_test_predicted_compare, "\n")
    print("Number of test errors: ", n_test_errors, "\n")
    print("Test error rate: ", test_error_rate)

### c) ionosphere for K = 1

In [175]:
# Part (c): evaluate the classifier on the ionosphere split with K = 1
ionosphere_predictions(A_test, A_train, B_test, B_train, 1)

For K =  1 

The predicted labels: 
 [-1, 1, 1, 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, -1, 1, 1, 1] 

The predicted labels compared to the true labels: 
 ['True', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'True', 'True', 'True', 'False', 'True', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'True', 'False', 'True', 'False', 'True', 'True', 'True', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'True', 'True', 'Fal

### d) ionosphere for K = 3

In [176]:
# Part (d): evaluate the classifier on the ionosphere split with K = 3
ionosphere_predictions(A_test, A_train, B_test, B_train, 3)

For K =  3 

The predicted labels: 
 [-1, 1, 1, 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1] 

The predicted labels compared to the true labels: 
 ['True', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'False', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'True', 'True', 'False'

### ionosphere for general K (any K) and handling invalid K inputs

In [177]:
ionosphere_predictions(A_test, A_train, B_test, B_train, 0) # Handling K = 0 --> invalid input, prints a message and returns 0

Expecting K > 0 input.


0

In [178]:
ionosphere_predictions(A_test, A_train, B_test, B_train, 18) # General K input (any K between 1 and len(A_train))

For K =  18 

The predicted labels: 
 [-1, 1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 

The predicted labels compared to the true labels: 
 ['True', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'True', 'False', 'False', 'False', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'False', 'True', 'True', 'False', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'True', 'False', 'True', 'True', 'True', 'True', 'False',

In [179]:
ionosphere_predictions(A_test, A_train, B_test, B_train, len(A_train)) # Testing for maximum valid K input (every training point is a neighbour)

For K =  263 

The predicted labels: 
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 

The predicted labels compared to the true labels: 
 ['False', 'False', 'True', 'True', 'True', 'False', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'True', 'True', 'False', 'False', 'False', 'True', 'True', 'True', 'False', 'True', 'False', 'True', 'True', 'True', 'False', 'True', 'True', 'True', 'False', 'False', 'True', 'True', 'False', 'True', 'False', 'True', 'True', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'False', 'False', 'True', 'True', 'False', 'False', 'True', 'True', 'True', 'False', 'True', 'True', 'True', 'False', 'False', 'False', 'True', 'True', 'False'

In [180]:
ionosphere_predictions(A_test, A_train, B_test, B_train, len(A_train)+1) # Handling K > length of training data array --> invalid input

Expecting K <  264  input.


0