In [1]:
from numpy import *
import operator

def createDataSet():
    """Return the toy training set used throughout the kNN walkthrough.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 array holding one
        2-D point per row and ``labels`` is a list with the class label
        ('A' or 'B') of each row, in the same order.
    """
    points = [
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ]
    classes = list('AABB')
    return array(points), classes

In [2]:
# Build the toy training set: four 2-D points with class labels 'A' / 'B'
group, labels = createDataSet()

In [3]:
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Distances are Euclidean, computed between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: Point to classify; a sequence with one value per column of
            ``dataSet``.
        dataSet: 2-D NumPy array of training examples, one example per row.
        labels: Sequence of class labels; ``labels[i]`` is the label of row
            ``i`` of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that received the most votes among the ``k`` closest rows.
    """
    dataSetSize = dataSet.shape[0]
    # Repeat inX once per training row so the subtraction is element-wise.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDiffMat = diffMat ** 2
    sqDistances = sqDiffMat.sum(axis=1)  # one squared distance per row
    distances = sqDistances ** 0.5
    sortedDistIndicies = distances.argsort()  # row indices, nearest first
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # items() works on both Python 2 and 3; iteritems() exists only on Python 2.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

In [4]:
# Classify the point (0.5, 0.1) by a vote among its 3 nearest training points
classify0([0.5,0.1], group, labels, 3)

'B'

Let's analyze the code line by line

In [5]:
inX = [0.5,0.1] # input data to classify
dataSet = group # training examples
k = 3 # number of nearest neighbors to use in the voting

In [6]:
dataSetSize = dataSet.shape[0]

In [7]:
dataSetSize

4

"dataSet.shape[0]" returns the number of rows of "dataSet"; similarly, "dataSet.shape[1]" returns the number of columns of "dataSet". In general, "shape" returns a tuple of $N$ positive integers that specify the size of each dimension for a $N$ dimensional array.

In [8]:
diffMat = tile(inX, (dataSetSize, 1)) - dataSet

In [9]:
diffMat

array([[-0.5, -1. ],
       [-0.5, -0.9],
       [ 0.5,  0.1],
       [ 0.5,  0. ]])

"tile" makes "dataSetSize" copies of the "inX":

In [10]:
tile(inX, (dataSetSize, 1))

array([[ 0.5,  0.1],
       [ 0.5,  0.1],
       [ 0.5,  0.1],
       [ 0.5,  0.1]])

"diffMat" finds all the differences between "inX" and "dataSet".

In [11]:
sqDiffMat = diffMat ** 2
sqDistances = sqDiffMat.sum(axis=1)

In [12]:
sqDistances

array([ 1.25,  1.06,  0.26,  0.25])

"sqDistances" computes the distances between "inX" and the training examples "dataSet". 

Here "axis=1" means to sum on axis $1$. For example,

In [13]:
tile(inX, (dataSetSize, 1)).sum(axis=0)

array([ 2. ,  0.4])

In [14]:
tile(inX, (dataSetSize, 1)).sum(axis=1)

array([ 0.6,  0.6,  0.6,  0.6])

Since "tile(inX, (dataSetSize, 1))" is a two dimensional array or $2\times 4$ matrix, "sum(axis=0)" means to sum up all rows; and "sum(axis=1)" means to sum up all columns.

In [15]:
distances = sqDistances ** 0.5

In [16]:
sortedDistIndicies = distances.argsort()

In [17]:
distances.argsort()

array([3, 2, 1, 0])

"argsort()" by default returns the indices whose elements are sorted from least to greatest. There is at least a way to sort from greatest to least:

In [18]:
(-distances).argsort()

array([0, 1, 2, 3])

Ok, now let's use first $k$ or lowest $k$ distances to vote on the class of "inX".

We first create an empty dict:

In [19]:
classCount = {}

In [20]:
voteIlabel = labels[sortedDistIndicies[0]]
voteIlabel

'B'

In [21]:
classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

"classCount.get(voteIlabel, 0)" returens $0$ since we search the empty list "classCount" for the key "B". Then after adding $1$, we pass the value $1$ to the key "B" of the updated dict "classCount", which is

In [22]:
classCount

{'B': 1}

Let's repeat this procedure twice (run three times in total) for the rest of indices.

In [23]:
# Second-nearest neighbour casts its vote
voteIlabel = labels[sortedDistIndicies[1]]
classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
# Third-nearest neighbour casts its vote (k = 3, so this is the last one)
voteIlabel = labels[sortedDistIndicies[2]]
classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

So the dict "classCount" now is

In [24]:
classCount

{'A': 1, 'B': 2}

In [25]:
# items() works on both Python 2 and 3 (iteritems() exists only on Python 2)
sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)

In [26]:
sortedClassCount

[('B', 2), ('A', 1)]

Here "classCount.iteritems()" (Python 2; use "classCount.items()" in Python 3) yields $2$-tuples, like [(key, value), (key, value), ...]. Then we use "operator.itemgetter(1)" as the sort key, so the tuples are sorted by their second item, which is the value (the vote count) for the corresponding key. Finally, the sort is done in reverse, so the result is ordered from largest to smallest.

Last step, we need to return the first tuple's first item, which is

In [27]:
# Label of the class with the most votes
sortedClassCount[0][0]

'B'