In [0]:
#latest
class KNN:
    def __init__(self, dataset):
        self.data = dataset
        self.dist_metric = {0: 'euclidean', 1: 'manhattan'}
        self.kernel_type = {0: self.epanechnikov, 1: self.gaussian}
        self.kerneltype = ['epanechnikov', 'gaussian']

    def get_accuracy(self, prediction, label):
        correct = 0
        for i in range(len(prediction)):
            if prediction[i] == label[i]:
                correct += 1
        return correct / float(len(label)) * 100

    def epanechnikov(self, distance):  # Kernel functions that return the transformed distance
        kernel = []
        for x in distance:
            kernel.append(3 / 4 * (1 - x ** 2))
        return kernel

    def gaussian(self, distance):
        import math
        kernel = []
        for x in distance:
            kernel.append(1 / (math.sqrt(2 * math.pi)) * (math.e ** (-1 / 2 * (x ** 2))))
        return kernel

    def euclidean_distance(self, train_set, test_value):
        import math
        euclidean_distance = []
        for i in range(len(train_set)):
            distance = 0
            for j in range(len(test_value) - 1):
                distance = distance + ((train_set[i][j] - test_value[j]) ** 2)
            euclidean_distance.append(math.sqrt(distance))
        return euclidean_distance

    def manhattan_distance(self, train_set, test_value):
        import math
        manhattan_distance = []
        for i in range(len(train_set)):
            distance = 0
            for j in range(len(test_value) - 1):
                distance = distance + (abs(train_set[i][j] - test_value[j]))
            manhattan_distance.append(math.sqrt(distance))
        return manhattan_distance

    def predict_euclidean(self, train_set, test_value, k):
        import numpy as np
        import operator
        distances = np.array(self.euclidean_distance(train_set, test_value))
        index = np.argpartition(distances, k)
        count = {}
        for idx in range(k):
            if train_set[index[idx]][-1] in count:
                count[train_set[index[idx]][-1]] += 1
            else:
                count[train_set[index[idx]][-1]] = 1
        result_euc = max(count.items(), key=operator.itemgetter(1))[0]
        return result_euc

    def predict_manhattan(self, train_set, test_value, k):
        import numpy as np
        import operator
        distances = np.array(self.manhattan_distance(train_set, test_value))
        index = np.argpartition(distances, k)
        count = {}
        for idx in range(k):
            if train_set[index[idx]][-1] in count:
                count[train_set[index[idx]][-1]] += 1
            else:
                count[train_set[index[idx]][-1]] = 1
        result_man = max(count.items(), key=operator.itemgetter(1))[0]
        return result_man

    def predict_kernel(self, train_set, test_value, k, kernel):
        import numpy as np
        import operator
        distance = self.euclidean_distance(train_set, test_value)
        index = np.argpartition(distance, k)
        kernel_value = kernel(distance)
        count = {}
        for idx in range(k):
            if train_set[index[idx]][-1] in count:
                count[train_set[index[idx]][-1]] += kernel_value[index[idx]]
            else:
                count[train_set[index[idx]][-1]] = kernel_value[index[idx]]
        result_ker = max(count.items(), key=operator.itemgetter(1))[0]
        return result_ker

    def predict_parzen_window(self, train_set, test_value, R):
        import numpy as np
        import operator
        import random
        index_in_window = []
        count = {}
        distance = self.euclidean_distance(train_set, test_value)
        for i in range(len(distance)):
            if distance[i] <= R:
                index_in_window.append(i)
        if bool(index_in_window) == True:
            for index in index_in_window:
                if train_set[index][-1] in count:
                    count[train_set[index][-1]] += 1
                else:
                    count[train_set[index][-1]] = 1

            result_par = max(count.items(), key=operator.itemgetter(1))[0]
            return result_par
        else:
            result_par = random.choice(train_set)
            return result_par[1]

    def score_parzen_window(self, R):
        import numpy as np
        best_result = []
        for x in range(len(self.data)):
            result = self.predict_parzen_window(np.concatenate((self.data[:x], self.data[x + 1:]), axis=0),
                                                self.data[x], R, )

            best_result.append(result)
        target = []
        for y in range(len(self.data)):
            target.append(self.data[y][-1])
        score = self.get_accuracy(best_result, target)
        return score

    def best_parzenwindow_size(self):
        final_output_parzen = {'size_window': [], 'score': []}
        R = 0.1
        r_step = 0.01
        rstep = int(R / r_step)
        for i in range(1, rstep + 1):
            final_output_parzen['size_window'].append(i * r_step)
            score = self.score_parzen_window(i * r_step)
            final_output_parzen['score'].append(score)
        return final_output_parzen

    def predict_parzen_kernel(self, train_set, test_value, R, kernel):
        import numpy as np
        import operator
        distance = self.euclidean_distance(train_set, test_value)
        distance = np.divide(distance, R)
        kernel_value = kernel(distance)
        count = {}
        for index in range(len(kernel_value)):
            if train_set[index][-1] in count:
                count[train_set[index][-1]] += kernel_value[index]
            else:
                count[train_set[index][-1]] = kernel_value[index]
        result = max(count.items(), key = operator.itemgetter(1))[0]
        return  result

    def score_parzen_kernel(self, R, kernel):
        import numpy as np
        best_result = []
        for x in range(len(self.data)):
            result = self.predict_parzen_kernel(np.concatenate((self.data[:x], self.data[x + 1:]), axis=0),
                                                self.data[x], R, kernel)

            best_result.append(result)
        target = []
        for y in range(len(self.data)):
            target.append(self.data[y][-1])
        score = self.get_accuracy(best_result, target)
        return score

    def best_parzen_kernel(self):
        final_output_kernel_parzen = {'Kernel':[], 'Window_size':[], 'score':[]}
        R = 0.1
        r_step = 0.01
        rstep = int(R / r_step)
        for i in range(len(self.kerneltype)):
            for j in range(1, rstep+1):
                final_output_kernel_parzen['Kernel'].append(self.kerneltype[i])
                r = j*r_step
                final_output_kernel_parzen['Window_size'].append(r)
                score = self.score_parzen_kernel(r, kernel=self.kernel_type[i])
                final_output_kernel_parzen['score'].append(score)
        return final_output_kernel_parzen
            
            


    def predict_distance(self, k, dist_metric):
        import numpy as np
        best_result = []
        if dist_metric == 'euclidean':
            for x in range(len(self.data)):
                result = self.predict_euclidean(np.concatenate((self.data[:x], self.data[x + 1:]), axis=0),
                                                self.data[x], k)
                best_result.append(result)
        elif dist_metric == 'manhattan':
            for x in range(len(self.data)):
                result = self.predict_manhattan(np.concatenate((self.data[:x], self.data[x + 1:]), axis=0),
                                                self.data[x], k)
                best_result.append(result)
        target = []
        for y in range(len(self.data)):
            target.append(self.data[y][-1])
        score = self.get_accuracy(best_result, target)
        # print((best_result))
        # print((target))
        # print(score)
        return score

    def predict_kernel_knn(self, k, kernel):
        import numpy as np
        best_result = []
        for x in range(len(self.data)):
            result = self.predict_kernel(np.concatenate((self.data[:x], self.data[x + 1:]), axis=0), self.data[x], k,
                                         kernel)
            best_result.append(result)
        target = []
        for y in range(len(self.data)):
            target.append(self.data[y][-1])
        score = self.get_accuracy(best_result, target)
        return score

    def best_kernel_knn(self):
        final_output_kernel_knn = {'Kernel': [], 'K': [], 'score': []}
        K = [2, 3, 5, 7]
        for i in range(len(self.kerneltype)):
            for j in range(len(K)):
                final_output_kernel_knn['Kernel'].append(self.kerneltype[i])
                final_output_kernel_knn['K'].append(K[j])
                score_kernel_knn = self.predict_kernel_knn(K[j], kernel=self.kernel_type[i])
                final_output_kernel_knn['score'].append(score_kernel_knn)
        return final_output_kernel_knn

    def find_best_kanddistance(self):
        # import numpy as np
        final_output = {'Distance': [], 'K': [], 'Score': []}
        K = [2, 3, 5, 7]
        # scores = np.zeros(len(K))
        for i in range(len(self.dist_metric)):
            # final_output['Distance'].append(self.dist_metric[i])
            for j in range(len(K)):
                final_output['Distance'].append(self.dist_metric[i])
                final_output['K'].append(K[j])
                score = self.predict_distance(K[j], dist_metric=self.dist_metric[i])
                final_output['Score'].append(score)
        # print(final_output)
        return final_output

    def __display__(self, Version):
        import pandas as pd
        if Version == 'distance':
            ans = self.find_best_kanddistance()
            df = pd.DataFrame.from_dict(ans)
            display(df)
        elif Version == 'kernel_knn':
            ans = self.best_kernel_knn()
            df = pd.DataFrame.from_dict(ans)
            display(df)
        elif Version == 'parzen_window':
            ans = self.best_parzenwindow_size()
            df = pd.DataFrame.from_dict(ans)
            display(df)
        elif Version == 'parzen_kernel':
            ans = self.best_parzen_kernel()
            df = pd.DataFrame.from_dict(ans)
            display(df)
            














In [0]:
import pandas as pd
import numpy as np
data = np.array(pd.read_csv('iris.csv'))
Model = KNN(data)

In [40]:
Model.__display__('distance')

Unnamed: 0,Distance,K,Score
0,euclidean,2,96.0
1,euclidean,3,96.0
2,euclidean,5,96.666667
3,euclidean,7,96.666667
4,manhattan,2,95.333333
5,manhattan,3,96.0
6,manhattan,5,95.333333
7,manhattan,7,94.666667


In [41]:
Model.__display__('kernel_knn')

Unnamed: 0,Kernel,K,score
0,epanechnikov,2,96.0
1,epanechnikov,3,96.0
2,epanechnikov,5,96.666667
3,epanechnikov,7,96.666667
4,gaussian,2,96.0
5,gaussian,3,96.0
6,gaussian,5,96.666667
7,gaussian,7,96.666667


In [42]:
Model.__display__('parzen_window')

Unnamed: 0,size_window,score
0,0.01,3.333333
1,0.02,3.333333
2,0.03,3.333333
3,0.04,3.333333
4,0.05,3.333333
5,0.06,3.333333
6,0.07,3.333333
7,0.08,3.333333
8,0.09,3.333333
9,0.1,6.0


In [43]:
Model.__display__('parzen_kernel')

Unnamed: 0,Kernel,Window_size,score
0,epanechnikov,0.01,91.333333
1,epanechnikov,0.02,91.333333
2,epanechnikov,0.03,91.333333
3,epanechnikov,0.04,91.333333
4,epanechnikov,0.05,91.333333
5,epanechnikov,0.06,91.333333
6,epanechnikov,0.07,91.333333
7,epanechnikov,0.08,91.333333
8,epanechnikov,0.09,91.333333
9,epanechnikov,0.1,91.333333
