# Class definition

In [1]:
class KNearestNeighbors:
    """
    The k-nearest neighbors algorithm is a supervised learning classifier, using proximity
    to make predictions about the grouping of an individual data point. A class label is
    assigned on the basis of a majority vote: the label that is most frequently represented
    among the k training points closest to a given data point is predicted.

    Needed packages:
    import numpy as np

    Parameters
    ----------
    k_neighbors : int, default = 5
        Number of neighbors to use while predicting a new data point.

    Attributes
    ----------
    n_training_samples : int
        Number of training samples seen while fitting the estimator.

    n_features : int
        Number of features seen while fitting the estimator.

    n_classes : int
        Number of distinct classes seen while fitting the estimator.

    X_train : array of shape (n_training_samples, n_features)
        Training data feature values.

    y_train : array of shape (n_training_samples,)
        Training data target values (flattened to one dimension by ``fit``).

    knneighbors : list of n_test_samples lists
        Labels of the nearest training points for each test row of the last
        ``predict`` call, ordered from nearest to farthest.

    list_distances : array of shape (n_test_samples, n_training_samples)
        Distance from each test row of the last ``predict`` call to every
        training observation.

    Methods
    ----------
    fit(X_train, y_train):
        Fit K-NN classifier with training data.
    predict(X_test):
        Predict class of test data with fitted K-NN classifier.
    score(X_test, y_test):
        Accuracy of the fitted K-NN classifier on labelled test data.

    Examples
    --------
    >>> import numpy as np
    >>> X_train = np.array([[2, 0, 3],
    ...                     [5, 8, 0],
    ...                     [9, 5, 3],
    ...                     [3, 5, 8],
    ...                     [3, 9, 1],
    ...                     [1, 7, 9],
    ...                     [0, 1, 4],
    ...                     [5, 9, 5],
    ...                     [2, 6, 3],
    ...                     [3, 8, 3]])
    >>> y_train = np.array(['Red', 'Green', 'Red', 'Green', 'Blue',
    ...                     'Blue', 'Green', 'Red', 'Green', 'Green'])
    >>> X_test = np.array([[5, 2, 6],
    ...                    [7, 8, 4],
    ...                    [6, 0, 2]])
    >>> y_test = np.array(['Green', 'Blue', 'Red'])
    >>> KNNfit = KNearestNeighbors(k_neighbors=3).fit(X_train, y_train)
    >>> KNNfit.n_training_samples
    10
    >>> KNNfit.n_features
    3
    >>> KNNfit.n_classes
    3
    >>> KNNfit.predict(X_test)
    array(['Green', 'Red', 'Red'], dtype='<U5')
    >>> KNNfit.knneighbors
    [['Green', 'Red', 'Green'], ['Red', 'Red', 'Green'], ['Red', 'Red', 'Green']]
    >>> KNNfit.list_distances.T
    array([[ 4.69041576,  9.48683298,  4.12310563],
           [ 8.48528137,  4.47213595,  8.30662386],
           [ 5.83095189,  3.74165739,  5.91607978],
           [ 4.12310563,  6.40312424,  8.36660027],
           [ 8.83176087,  5.09901951,  9.53939201],
           [ 7.07106781,  7.87400787, 11.09053651],
           [ 5.47722558,  9.89949494,  6.40312424],
           [ 7.07106781,  2.44948974,  9.53939201],
           [ 5.83095189,  5.47722558,  7.28010989],
           [ 7.        ,  4.12310563,  8.60232527]])
    >>> KNNfit.score(X_test, y_test)
    0.6666666666666666

    """
    def __init__(
        self,
        k_neighbors = 5,
    ):
        self.k_neighbors = k_neighbors

    @staticmethod
    def euclidean_distance(point, dataset):
        """
        Euclidean distance from one point to every row of a dataset.

        Parameters
        ----------
        point : array of shape (n_features,)
            Single observation.
        dataset : array of shape (n_samples, n_features)
            Observations to measure the distance to.

        Returns
        -------
        distances : array of shape (n_samples,)
            Distance between ``point`` and each row of ``dataset``.
        """
        # Cast to float so unsigned-integer features cannot wrap around
        # when the difference is negative.
        point = np.asarray(point, dtype=float)
        dataset = np.asarray(dataset, dtype=float)
        return np.sqrt(np.sum((point - dataset)**2, axis=1))

    @staticmethod
    def most_common(classes_list):
        """
        Return the most frequent label in ``classes_list``.

        Ties are resolved in favour of the label that appears first in the
        list. Because ``predict`` passes neighbors ordered from nearest to
        farthest, a tied vote goes to the class of the closest neighbor.

        Parameters
        ----------
        classes_list : iterable of hashable labels
            Labels to vote over.

        Returns
        -------
        label : object
            The winning label.

        Raises
        ------
        ValueError
            If ``classes_list`` is empty.
        """
        # A dict keeps insertion order and max() returns the first maximal
        # key, so the result does not depend on hash ordering.
        counts = {}
        for label in classes_list:
            counts[label] = counts.get(label, 0) + 1
        return max(counts, key=counts.get)

    def _check_is_fitted(self, caller):
        """Raise ValueError when ``fit`` has not been called before ``caller``."""
        if not hasattr(self, 'X_train'):
            raise ValueError(
                " This KNearestNeighbors instance is not fitted yet."
                f" Call 'fit' method with appropriate X and y before using this {caller} function."
            )

    def fit(self, X_train, y_train):
        """
        Fit K-NN classifier with training data.

        Parameters
        ----------
        X_train : array-like of shape (n_training_samples, n_features)
            Training data X-values.
        y_train : array-like of shape (n_training_samples,) or (n_training_samples, 1)
            Training data y-values.

        Returns
        -------
        self : object
            Fitted Estimator.

        Raises
        ------
        ValueError
            If ``X_train`` is not a non-empty 2-D array, or if ``X_train`` and
            ``y_train`` do not contain the same number of samples.
        """
        X_train = np.asarray(X_train)
        # Flatten so that both (n,) and (n, 1) label layouts are accepted.
        y_train = np.asarray(y_train).ravel()

        if X_train.ndim != 2 or X_train.shape[0] == 0:
            raise ValueError(
                "X_train must be a non-empty 2-D array of shape"
                " (n_training_samples, n_features)."
            )
        if len(y_train) != len(X_train):
            raise ValueError(
                "X_train and y_train must contain the same number of samples."
            )

        # Attributes are only set once validation has passed, so a failed
        # fit never leaves the estimator half-fitted.
        self.n_training_samples, self.n_features = X_train.shape
        self.n_classes = len(np.unique(y_train))
        self.X_train = X_train
        self.y_train = y_train

        return self

    def predict(self, X_test):
        """
        Predict class of test data with fitted K-NN classifier.

        Also stores ``list_distances`` and ``knneighbors`` for the given
        test data on the instance.

        Parameters
        ----------
        X_test : array-like of shape (M, n_features)
            Feature values of the new data whose class is to be predicted.

        Returns
        -------
        y_predicted : array of shape (M)
            Predicted y classes.

        Raises
        ------
        ValueError
            If the estimator is not fitted, if ``k_neighbors`` is smaller
            than 1, or if ``X_test`` does not have ``n_features`` columns.
        """
        self._check_is_fitted('predict')

        if self.k_neighbors < 1:
            raise ValueError("k_neighbors must be at least 1.")

        X_test = np.asarray(X_test)
        if len(X_test) > 0 and (X_test.ndim != 2 or X_test.shape[1] != self.n_features):
            raise ValueError(
                "X_test must be a 2-D array with %d feature columns." % self.n_features
            )

        distance_rows = []
        neighbors = []

        for x_test in X_test:
            distance = self.euclidean_distance(x_test, self.X_train)
            distance_rows.append(distance)

            # A stable sort keeps equidistant training points in training
            # order instead of ordering them by their label.
            nearest = np.argsort(distance, kind='stable')[:self.k_neighbors]
            neighbors.append(self.y_train[nearest].tolist())

        # Stack once at the end rather than re-copying the matrix per row.
        if distance_rows:
            self.list_distances = np.vstack(distance_rows)
        else:
            self.list_distances = np.empty((0, self.n_training_samples))
        self.knneighbors = neighbors

        return np.array([self.most_common(labels) for labels in neighbors])

    def score(self, X_test, y_test):
        """
        Check accuracy of fitted K-NN classifier.

        Parameters
        ----------
        X_test : array-like of shape (M, n_features)
            Feature values of the test data.
        y_test : array-like of shape (M,) or (M, 1)
            Target values of the test data.

        Returns
        -------
        accuracy : float
            Fraction of test observations whose predicted class equals ``y_test``.

        Raises
        ------
        ValueError
            If the estimator is not fitted, or if ``X_test`` and ``y_test``
            do not contain the same number of samples.
        ZeroDivisionError
            If the test data is empty.
        """
        self._check_is_fitted('score')

        y_test = np.asarray(y_test).ravel()
        y_predicted = self.predict(X_test)
        if len(y_predicted) != len(y_test):
            raise ValueError(
                "X_test and y_test must contain the same number of samples."
            )

        return float(np.sum(y_predicted == y_test)) / len(y_test)

## Examples

In [2]:
import numpy as np
# Training set: ten observations with three features each.
X_train = np.array([
    [2, 0, 3],
    [5, 8, 0],
    [9, 5, 3],
    [3, 5, 8],
    [3, 9, 1],
    [1, 7, 9],
    [0, 1, 4],
    [5, 9, 5],
    [2, 6, 3],
    [3, 8, 3],
])

# One class label per training observation.
y_train = np.array([
    'Red', 'Green', 'Red', 'Green', 'Blue',
    'Blue', 'Green', 'Red', 'Green', 'Green',
])

# Three unseen observations together with their true labels.
X_test = np.array([
    [5, 2, 6],
    [7, 8, 4],
    [6, 0, 2],
])
y_test = np.array(['Green', 'Blue', 'Red'])

# Build a 3-nearest-neighbors classifier and fit it on the training set
# (fit returns the estimator itself, so the two-step form is equivalent).
KNNfit = KNearestNeighbors(k_neighbors=3)
KNNfit.fit(X_train, y_train)

# Number of training observations recorded by fit.
KNNfit.n_training_samples

10

In [3]:
# Number of feature columns recorded by fit.
KNNfit.n_features

3

In [4]:
# Number of distinct class labels found in y_train by fit.
KNNfit.n_classes

3

In [5]:
# Predict one label per row of X_test by majority vote among its 3 nearest training points.
KNNfit.predict(X_test)

array(['Green', 'Red', 'Red'], dtype='<U5')

In [6]:
# Labels of the nearest training points for each test row, as stored by the last predict call.
KNNfit.knneighbors

[['Green', 'Red', 'Green'], ['Red', 'Red', 'Green'], ['Red', 'Red', 'Green']]

In [7]:
# Fraction of test rows whose predicted label equals the label in y_test.
KNNfit.score(X_test, y_test)

0.6666666666666666

In [8]:
# Distances stored by the last predict call, transposed so that rows are
# training points and columns are test points.
KNNfit.list_distances.T

array([[ 4.69041576,  9.48683298,  4.12310563],
       [ 8.48528137,  4.47213595,  8.30662386],
       [ 5.83095189,  3.74165739,  5.91607978],
       [ 4.12310563,  6.40312424,  8.36660027],
       [ 8.83176087,  5.09901951,  9.53939201],
       [ 7.07106781,  7.87400787, 11.09053651],
       [ 5.47722558,  9.89949494,  6.40312424],
       [ 7.07106781,  2.44948974,  9.53939201],
       [ 5.83095189,  5.47722558,  7.28010989],
       [ 7.        ,  4.12310563,  8.60232527]])