In [7]:
import random
import numpy as np
from lib.data_utils import load_CIFAR10
import matplotlib.pyplot as plt
import math
from past.builtins import xrange

In [2]:
cifar10_dir = 'lib/datasets/cifar-10-batches-py'
X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

In [3]:
num_training = 5000
mask = range(num_training)
X_train = X_train[mask]
y_train = y_train[mask]

num_test = 500
mask = list(range(num_test))
X_test = X_test[mask]
y_test = y_test[mask]

In [4]:
X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))
print(X_train.shape, X_test.shape)

(5000, 3072) (500, 3072)


In [5]:
def compute_distances_two_loops(X_train, X):
    """
    Compute the distance between each test point in X and each training point
    in self.X_train using a nested loop over both the training data and the 
    test data.

    Inputs:
    - X: A numpy array of shape (num_test, D) containing test data.

    Returns:
    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
      is the Euclidean distance between the ith test point and the jth training
      point.
    """
    num_test = X.shape[0]
    num_train = X_train.shape[0]
    dists = np.zeros((num_test, num_train))
    for i in xrange(num_test):
          for j in xrange(num_train):
            #####################################################################
            # TODO:                                                             #
            # Compute the l2 distance between the ith test point and the jth    #
            # training point, and store the result in dists[i, j]. You should   #
            # not use a loop over dimension.                                    #
            #####################################################################
            #L2 sqrt(sum((i1-i2)^2))
            dists[i,j] = math.sqrt(np.sum((X[i] - X_train[j])**2))


            #####################################################################
            #                       END OF YOUR CODE                            #
            #####################################################################
    return dists

In [8]:
compute_distances_two_loops(X_train,X_test)

array([[3803.92350081, 4210.59603857, 5504.0544147 , ..., 4007.64756434,
        4203.28086142, 4354.20256764],
       [6336.83367306, 5270.28006846, 4040.63608854, ..., 4829.15334194,
        4694.09767687, 7768.33347636],
       [5224.83913628, 4250.64289255, 3773.94581307, ..., 3766.81549853,
        4464.99921613, 6353.57190878],
       ...,
       [5366.93534524, 5062.8772452 , 6361.85774755, ..., 5126.56824786,
        4537.30613911, 5920.94156364],
       [3671.92919322, 3858.60765044, 4846.88157479, ..., 3521.04515734,
        3182.3673578 , 4448.65305458],
       [6960.92443573, 6083.71366848, 6338.13442584, ..., 6083.55504619,
        4128.24744898, 8041.05223214]])

In [21]:
def compute_distances_one_loop(X_train, X):
    """
    Compute the distance between each test point in X and each training point
    in self.X_train using a single loop over the test data.

    Input / Output: Same as compute_distances_two_loops
    """
    num_test = X.shape[0]
    num_train = X_train.shape[0]
    dists = np.zeros((num_test, num_train))
    print(dists.shape)
    for i in xrange(num_test):
        tmp =  (X_train-X[i])**2
        dists[i] = np.sqrt(tmp.sum(axis=1))
    print(dists)
    return dists

In [71]:
def compute_distances_no_loops(X_train, X):
    """
    Compute the distance between each test point in X and each training point
    in self.X_train using no explicit loops.

    Input / Output: Same as compute_distances_two_loops
    """
    num_test = X.shape[0]
    num_train = X_train.shape[0]
    dists = np.zeros((num_test, num_train))
    print(len(dists),len(dists[0]))
    train2 = X_train**2
    train2 = train2.sum(axis=1)
    #np.tile(np.array(idk),(2,1))
    tile = np.tile(np.array(train2),(num_test,1))
    
    test2 = X**2
    test2 = test2.sum(axis=1)
    
    dist_add = tile + np.vstack(test2)
    print(len(dist_add),len(dist_add[0]))
    
    dist_mul = np.dot(X,X_train.transpose())
    print(len(dist_mul),len(dist_mul[0]))
    
    dists = dist_add - 2 * dist_mul
    return np.sqrt(dists)

In [72]:
compute_distances_no_loops(X_train, X_test)

500 5000
500 5000
500 5000


array([[3803.92350081, 4210.59603857, 5504.0544147 , ..., 4007.64756434,
        4203.28086142, 4354.20256764],
       [6336.83367306, 5270.28006846, 4040.63608854, ..., 4829.15334194,
        4694.09767687, 7768.33347636],
       [5224.83913628, 4250.64289255, 3773.94581307, ..., 3766.81549853,
        4464.99921613, 6353.57190878],
       ...,
       [5366.93534524, 5062.8772452 , 6361.85774755, ..., 5126.56824786,
        4537.30613911, 5920.94156364],
       [3671.92919322, 3858.60765044, 4846.88157479, ..., 3521.04515734,
        3182.3673578 , 4448.65305458],
       [6960.92443573, 6083.71366848, 6338.13442584, ..., 6083.55504619,
        4128.24744898, 8041.05223214]])

In [14]:
idk = np.array([[1,2,3,4,5],[6,7,8,9,10]])
test = np.array([0,1,2,3,4])
tmp = idk-test
tmp.sum(axis=1)

array([ 5, 30])

In [15]:
tmp

array([[1, 1, 1, 1, 1],
       [6, 6, 6, 6, 6]])

In [16]:
dists = np.zeros((test.shape[0],idk.shape[0]))

In [18]:
dists[1]

array([0., 0.])

In [23]:
idk = np.array([[1,2],[3,4]])
idk = idk**2
idk = idk.sum(axis=1)
idk

array([ 5, 25], dtype=int32)

In [52]:
idk.transpose()

array([ 5, 25], dtype=int32)

In [24]:
tmp = np.array([[5,6],[7,8]])
tmp = tmp**2
tmp = tmp.sum(axis=1)
tmp

array([ 61, 113], dtype=int32)

In [47]:
np.tile(np.array(idk),(2,1))

array([[ 5, 25],
       [ 5, 25]], dtype=int32)

In [None]:
def predict_labels(self, dists, k=1):
    """
    Given a matrix of distances between test points and training points,
    predict a label for each test point.

    Inputs:
    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
      gives the distance betwen the ith test point and the jth training point.

    Returns:
    - y: A numpy array of shape (num_test,) containing predicted labels for the
      test data, where y[i] is the predicted label for the test point X[i].  
    """
    num_test = dists.shape[0]
    y_pred = np.zeros(num_test)
    for i in xrange(num_test):

        # A list of length k storing the labels of the k nearest neighbors to
        # the ith test point.
        closest_y = []
        #########################################################################
        # TODO:                                                                 #
        # Use the distance matrix to find the k nearest neighbors of the ith    #
        # testing point, and use self.y_train to find the labels of these       #
        # neighbors. Store these labels in closest_y.                           #
        # Hint: Look up the function numpy.argsort.                             #
        #########################################################################
        tmp = np.argsort(dists[i])[:k]
        closest_y = np.take(self.y_train,tmp)




        #########################################################################
        # TODO:                                                                 #
        # Now that you have found the labels of the k nearest neighbors, you    #
        # need to find the most common label in the list closest_y of labels.   #
        # Store this label in y_pred[i]. Break ties by choosing the smaller     #
        # label.                                                                #
        #########################################################################
        values, counts = np.unique(closest_y, return_counts=True)
        
        y_pred[i] = closest_y[np.argmax(counts)]



        #########################################################################
        #                           END OF YOUR CODE                            # 
        #########################################################################

    return y_pred

In [75]:
tmp = np.array([1,1,2,1,3])
np.take(tmp,[1,2,3])
values, counts = np.unique(tmp, return_counts=True)
tmp[np.argmax(counts)]


1

In [8]:
import numpy as np
test = [2,3,4,1,5,2,1,2,1]
values, counts = np.unique(test, return_counts=True)
print(values,counts)
values[np.argmax(counts)]

[1 2 3 4 5] [3 3 1 1 1]


1