In [5]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [7]:
# Load the wine dataset that ships with scikit-learn.
wine = datasets.load_wine()

# Show the names of the feature columns held in `wine.data`.
print(wine["feature_names"])


['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


In [9]:
# Hold out 30% of the samples for testing.
# - random_state pins the shuffle so Restart & Run All reproduces the same
#   split (and therefore the same accuracy figures) every time.
# - stratify keeps the class proportions equal in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    wine.data,
    wine.target,
    test_size=0.3,
    random_state=42,
    stratify=wine.target,
)

In [10]:
class MyKNNClassifier:
    """Minimal k-nearest-neighbours classifier using Euclidean distance.

    A sample is assigned the most common label among its ``k`` closest
    training samples. Distances are computed on the raw feature values, so
    features with a large numeric range dominate the result unless the caller
    scales the data beforehand.

    Parameters
    ----------
    k : int, default 3
        Number of neighbours that vote. Must be an integer >= 1. If ``k``
        exceeds the number of training samples, all training samples vote.
    """

    def __init__(self, k=3):
        self.k = k

    def fit(self, X_train, y_train):
        """Store the training data (KNN does no work at fit time).

        Parameters
        ----------
        X_train : array-like
            Training samples, one per row.
        y_train : array-like
            Label of each training sample, in the same order as ``X_train``.

        Raises
        ------
        ValueError
            If the inputs are empty or differ in length.
        """
        # Convert once so that plain lists work and labels are always indexed
        # by position (a pandas Series would otherwise be indexed by label).
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        if len(X_train) != len(y_train):
            raise ValueError(
                f"X_train and y_train must have the same length, "
                f"got {len(X_train)} and {len(y_train)}"
            )
        if len(X_train) == 0:
            raise ValueError("cannot fit on an empty training set")
        self.X_train = X_train
        self.y_train = y_train

    def euclidean_distance(self, x1, x2):
        """Return the Euclidean (L2) distance between ``x1`` and ``x2``."""
        return np.sqrt(np.sum((x1 - x2) ** 2))

    def _validated_k(self):
        """Return ``self.k`` as an int, rejecting non-integers and values < 1."""
        k = self.k
        if not isinstance(k, (int, np.integer)):
            raise TypeError(f"k must be an integer, got {type(k).__name__}")
        if k < 1:
            # A negative k would otherwise slice "[:k]" and silently use
            # n - |k| neighbours; k == 0 would leave nobody to vote.
            raise ValueError(f"k must be at least 1, got {k}")
        return int(k)

    def predict(self, X_test):
        """Predict a label for every row of ``X_test``.

        Ties are resolved deterministically: equidistant neighbours are taken
        in training order, and when several labels receive the same number of
        votes the smallest label (in sorted order) is returned.

        Parameters
        ----------
        X_test : array-like
            Samples to classify, one per row.

        Returns
        -------
        numpy.ndarray
            One predicted label per row of ``X_test``.

        Raises
        ------
        AttributeError
            If called before ``fit``.
        TypeError
            If ``k`` is not an integer.
        ValueError
            If ``k`` is smaller than 1.
        """
        if getattr(self, "X_train", None) is None or getattr(self, "y_train", None) is None:
            raise AttributeError("this classifier is not fitted yet; call fit() before predict()")
        k = self._validated_k()

        # Attributes may have been assigned directly, so normalise here too.
        train_samples = np.asarray(self.X_train)
        train_labels = np.asarray(self.y_train)

        predictions = []
        for x in np.asarray(X_test):
            # Distance from x to every training point. Goes through the public
            # method so subclasses can plug in another metric.
            distances = [self.euclidean_distance(x, row) for row in train_samples]

            # Indices of the k nearest neighbours; a stable sort keeps
            # equidistant points in training order.
            nearest = np.argsort(distances, kind="stable")[:k]

            # Majority vote. np.unique returns labels sorted, and argmax picks
            # the first maximum, so ties go to the smallest label.
            labels, counts = np.unique(train_labels[nearest], return_counts=True)
            predictions.append(labels[np.argmax(counts)])
        return np.array(predictions)

In [None]:
# Good-fit example: a moderate neighbourhood of k = 7.
knn = MyKNNClassifier(k=7)
knn.fit(X_train, y_train)

# Predictions for the held-out samples.
y_pred = knn.predict(X_test)

# Compare accuracy on the data the model has stored with accuracy on unseen data.
print("Train Accuracy:", accuracy_score(y_true=y_train, y_pred=knn.predict(X_train)))
print("Test Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

Train Accuracy: 0.782258064516129
Test Accuracy: 0.7222222222222222


In [15]:
# Underfit example: with k = 100 almost every training sample votes, so the
# prediction is pulled towards the overall majority class.
knn_underfit = MyKNNClassifier(k=100)
knn_underfit.fit(X_train, y_train)

# Predictions for the held-out samples.
y_pred = knn_underfit.predict(X_test)

# Compare accuracy on the data the model has stored with accuracy on unseen data.
print("Train Accuracy:", accuracy_score(y_true=y_train, y_pred=knn_underfit.predict(X_train)))
print("Test Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

Train Accuracy: 0.6290322580645161
Test Accuracy: 0.7222222222222222
