## Classification using kNN

**Import the required libraries**

In [1]:
import re
import math
import numpy as np
import pandas as pd
import string
import scipy as sp
import nltk
import time
import operator
from random import randrange
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

import warnings
warnings.filterwarnings('ignore')

**The following cell contains a class of methods to calculate distance between two points using various techniques**

**Formula to calculate Eucledian distance:**

<math>\begin{align}D(x, y) = \sqrt{ \sum_i (x_i - y_i) ^ 2 }\end{align}</math>

**Formula to calculate Manhattan Distance:**

<math>\begin{align}D(x, y) = \sum_i |x_i - y_i|\end{align}</math>

**Formula to calculate Hamming Distance:**

<math>\begin{align}D(x, y) = \frac{1}{N} \sum_i \delta_{x_i, y_i}\end{align}</math>

In [2]:
class distanceMetrics:
    '''
    Description:
        This class contains methods to calculate various distance metrics
    '''
    def __init__(self):
        '''
        Description:
            Initialization/Constructor function
        '''
        pass
        
    def euclideanDistance(self, vector1, vector2):
        '''
        Description:
            Function to calculate Euclidean Distance
                
        Inputs:
            vector1, vector2: input vectors for which the distance is to be calculated
        Output:
            Calculated euclidean distance of two vectors
        '''
        self.vectorA, self.vectorB = vector1, vector2
        if len(self.vectorA) != len(self.vectorB):
            raise ValueError("Undefined for sequences of unequal length.")
        distance = 0.0
        for i in range(len(self.vectorA)-1):
            distance += (self.vectorA[i] - self.vectorB[i])**2
        return (distance)**0.5
    
    def manhattanDistance(self, vector1, vector2):
        """
        Desription:
            Takes 2 vectors a, b and returns the manhattan distance
        Inputs:
            vector1, vector2: two vectors for which the distance is to be calculated
        Output:
            Manhattan Distance of two input vectors
        """
        self.vectorA, self.vectorB = vector1, vector2
        if len(self.vectorA) != len(self.vectorB):
            raise ValueError("Undefined for sequences of unequal length.")
        return np.abs(np.array(self.vectorA) - np.array(self.vectorB)).sum()
    
    def hammingDistance(self, vector1, vector2):
        """
        Desription:
            Takes 2 vectors a, b and returns the hamming distance
            Hamming distance is meant for discrete-valued vectors, though it is a 
            valid metric for real-valued vectors.
        Inputs:
            vector1, vector2: two vectors for which the distance is to be calculated
        Output:
           Hamming Distance of two input vectors 
        """
        self.vectorA, self.vectorB = vector1, vector2
        if len(self.vectorA) != len(self.vectorB):
            raise ValueError("Undefined for sequences of unequal length.")
        return sum(el1 != el2 for el1, el2 in zip(self.vectorA, self.vectorB))


In [3]:
class kNNClassifier:
    '''
    Description:
        This class contains the functions to calculate distances
    '''
    def __init__(self,k = 3, distanceMetric = 'euclidean'):
        '''
        Description:
            KNearestNeighbors constructor
        Input    
            k: total of neighbors. Defaulted to 3
            distanceMetric: type of distance metric to be used. Defaulted to euclidean distance.
        '''
        pass
    
    def fit(self, xTrain, yTrain):
        '''
        Description:
            Train kNN model with x data
        Input:
            xTrain: training data with coordinates
            yTrain: labels of training data set
        Output:
            None
        '''
        assert len(xTrain) == len(yTrain)
        self.trainData = xTrain
        self.trainLabels = yTrain

    def getNeighbors(self, testRow):
        '''
        Description:
            Train kNN model with x data
        Input:
            testRow: testing data with coordinates
        Output:
            k-nearest neighbors to the test data
        '''
        
        calcDM = distanceMetrics()
        distances = []
        for i, trainRow in enumerate(self.trainData):
            if self.distanceMetric == 'euclidean':
                distances.append([trainRow, calcDM.euclideanDistance(testRow, trainRow), self.trainLabels[i]])
            elif self.distanceMetric == 'manhattan':
                distances.append([trainRow, calcDM.manhattanDistance(testRow, trainRow), self.trainLabels[i]])
            elif self.distanceMetric == 'hamming':
                distances.append([trainRow, calcDM.hammingDistance(testRow, trainRow), self.trainLabels[i]])
            distances.sort(key=operator.itemgetter(1))

        neighbors = []
        for index in range(self.k):
            neighbors.append(distances[index])
        return neighbors
        
    def predict(self, xTest, k, distanceMetric):
        '''
        Description:
            Apply kNN model on test data
        Input:
            xTest: testing data with coordinates
            k: number of neighbors
            distanceMetric: technique to calculate distance metric
        Output:
            predicted label 
        '''
        self.testData = xTest
        self.k = k
        self.distanceMetric = distanceMetric
        predictions = []
        
        for i, testCase in enumerate(self.testData):
            neighbors = self.getNeighbors(testCase)
            output= [row[-1] for row in neighbors]
            prediction = max(set(output), key=output.count)
            predictions.append(prediction)
        
        return predictions

In [4]:
def printMetrics(actual, predictions):
    '''
    Description:
        This method calculates the accuracy of predictions
    '''
    assert len(actual) == len(predictions)
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predictions[i]:
            correct += 1
    print("Acuracy of kNN model: ",correct / float(len(actual)) * 100.0)

In [5]:
def readData(fileName):
    '''
    Description:
        This method is to read the data from a given file
    '''
    data = []
    labels = []

    with open(fileName, "r") as file:
        lines = file.readlines() 
    for line in lines:
        splitline = line.strip().split(',')
        data.append(splitline[:-1])
        labels.append(splitline[-1])
    return data, labels

### Hayes-Roth Data

In [6]:
trainFile = 'Datasets/HayesRoth/hayes-roth.data'

trainData, trainLabel = readData(trainFile)

trainFeatures = []
for row in trainData:
    index = row[1:]
    temp = [int(item) for item in index]
    trainFeatures.append(temp)
    
trainLabels = [int(label) for label in trainLabel]

In [7]:
testFile = 'Datasets/HayesRoth/hayes-roth.test'

testData, testLabel = readData(testFile)

testFeatures = []
for row in testData:
    index = row[0:]
    temp = [int(item) for item in index]
    testFeatures.append(temp)
    
testLabels = [int(label) for label in testLabel]

In [8]:
knn = kNNClassifier()
knn.fit(trainFeatures, trainLabels)

In [9]:
eucPredictions = knn.predict(testFeatures, 3, 'euclidean')

In [10]:
printMetrics(testLabels, eucPredictions)

Acuracy of kNN model:  75.0


In [11]:
manPredictions = knn.predict(testFeatures, 3, 'manhattan')

In [12]:
printMetrics(testLabels, manPredictions)

Acuracy of kNN model:  67.85714285714286


In [13]:
hamPred = knn.predict(testFeatures, 3, 'hamming')

In [14]:
printMetrics(testLabels, hamPred)

Acuracy of kNN model:  75.0


### Car Evaluation Data

In [15]:
carFile = 'Datasets/CarEvaluation/car.data'

carData, carLabel = readData(carFile)
df = pd.DataFrame(carData)
df = df.apply(preprocessing.LabelEncoder().fit_transform)
carFeatures = df.values.tolist()
carLabels = preprocessing.LabelEncoder().fit_transform(carLabel)

In [16]:
xTrain, xTest, yTrain, yTest = train_test_split(carFeatures,carLabels,test_size = 0.2)

In [17]:
knn.fit(xTrain, yTrain)

In [18]:
carEucPred = knn.predict(xTest, 3, 'euclidean')

In [19]:
printMetrics(carEucPred, yTest)

Acuracy of kNN model:  62.138728323699425


In [20]:
carManPred = knn.predict(xTest, 3, 'manhattan')

In [21]:
printMetrics(carManPred, yTest)

Acuracy of kNN model:  88.15028901734104


In [22]:
carHamPred = knn.predict(xTest, 3, 'hamming')

In [23]:
printMetrics(carHamPred, yTest)

Acuracy of kNN model:  88.4393063583815


### Breast Cancer Data

In [24]:
cancerFile = 'Datasets/BreastCancer/breast-cancer.data'

cancerData, cancerLabel = readData(cancerFile)
cdf = pd.DataFrame(cancerData)
cdf[9] = cancerLabel

In [25]:
for i in range(len(cdf)) : 
    if '?' in list(cdf.loc[i]):
        cdf.drop(index = i, inplace = True)
cdf.reset_index(inplace=True)

In [26]:
cdf = cdf.apply(preprocessing.LabelEncoder().fit_transform)
cancerLabels = cdf[9].tolist()
cdf.drop(columns=9, inplace = True)
cancerFeatures = cdf.values.tolist()

In [27]:
cxTrain, cxTest, cyTrain, cyTest = train_test_split(cancerFeatures,cancerLabels,test_size = 0.2)

In [28]:
knn.fit(cxTrain, cyTrain)

In [29]:
canEucPred = knn.predict(cxTest, 3, 'euclidean')

In [30]:
printMetrics(canEucPred, cyTest)

Acuracy of kNN model:  78.57142857142857


In [31]:
canManPred = knn.predict(cxTest, 3, 'manhattan')

In [32]:
printMetrics(canManPred, cyTest)

Acuracy of kNN model:  75.0


In [33]:
canHamPred = knn.predict(cxTest, 3, 'hamming')

In [34]:
printMetrics(canHamPred, cyTest)

Acuracy of kNN model:  71.42857142857143
