In [1]:
import numpy as np, pandas as pd

In [27]:
def pairwiseDist(x, y=None):
    """Euclidean distance between every row of ``x`` and every row of ``y``.

    Parameters
    ----------
    x : 2-D array, one point per row.
    y : 2-D array with the same number of columns as ``x``; when omitted,
        ``x`` is compared against itself.

    Returns
    -------
    2-D array whose entry ``[i, j]`` is the distance between ``x[i]`` and
    ``y[j]``.
    """
    other = x if y is None else y
    # x[:, None] inserts an axis so the subtraction broadcasts over all pairs
    diffs = x[:, None] - other
    sqDists = (diffs ** 2).sum(axis=2)  # collapse the feature axis
    return sqDists ** 0.5

def prepData(dataPathDir, fieldNames, featSlices, sep=',', standardize=False):
    """Read a header-less delimited file and split it into features and labels.

    Parameters
    ----------
    dataPathDir : path or file-like object handed straight to ``pd.read_csv``.
    fieldNames : list of column names assigned to the file's columns; it must
        contain a ``'class'`` entry, which is used as the label column.
    featSlices : slice applied to ``fieldNames`` to pick the feature columns.
    sep : field delimiter passed to ``pd.read_csv``.
    standardize : when True, each feature column is centred on its mean and
        divided by its standard deviation (pandas default, i.e. ``ddof=1``).

    Returns
    -------
    (dataMat, dataFeats, labels) : the feature matrix, the list of feature
    names, and the values of the ``'class'`` column.
    """
    raw = pd.read_csv(dataPathDir, sep=sep, names=fieldNames) # read dlm file
    dataFeats = fieldNames[featSlices] # list of feature names
    featFrame = raw[dataFeats] # select the feature columns once
    dataMat = featFrame.values # original values
    if standardize: # if choose to standardize data
        meanVals = featFrame.mean().values # mean of all features
        stdVals = featFrame.std().values # standard deviations
        # A constant column has zero spread; dividing by it would turn the
        # whole column into NaN, so divide by 1 and leave it at exactly 0.
        stdVals = np.where(stdVals == 0, 1.0, stdVals)
        dataMat = (dataMat - meanVals) / stdVals # [x-E(x)]/S(X)
    return dataMat,dataFeats,raw['class'].values

In [47]:
################################################################################
def getCrossValidFolds(dataMat, classVec, nFolds=5, stratify=False):
    """Partition the row indices of a data set into cross-validation folds.

    Parameters
    ----------
    dataMat : array whose first dimension is the number of samples.
    classVec : class label of every sample; only read when ``stratify`` is set.
    nFolds : number of folds to produce.
    stratify : when False the row indices are randomly shuffled (global numpy
        RNG) and cut into ``nFolds`` nearly equal pieces. When True every
        class is cut into ``nFolds`` pieces separately, in original row order
        and without shuffling, and the n-th pieces of all classes are joined
        to form fold n.

    Returns
    -------
    List of index arrays, one per fold.
    """
    rowIdx = np.arange(dataMat.shape[0])

    if not stratify:
        np.random.shuffle(rowIdx)  # in-place random permutation
        return np.array_split(rowIdx, nFolds)

    labels, labelPos = np.unique(classVec, return_inverse=True)
    # one list of nFolds index pieces per distinct class
    perClass = [np.array_split(rowIdx[labelPos == c], nFolds)
                for c in range(len(labels))]
    # zip regroups the pieces by fold number before joining them
    return [np.hstack(pieces) for pieces in zip(*perClass)]

In [49]:
# Path of the data file and the names given to its columns; the first column
# ('seq') and the last one ('class') are excluded from the feature matrix by
# the slice(1,-1) below.
ecoliPD = './data/ecoli.data'
ecoliVars = ['seq', 'mcg', 'gvh', 'lip', 'chg', 'aac', 'alm1', 'alm2', 'class']
################################################################################
# The separator is a regular expression, so it is written as a raw string:
# '\s' in a normal string literal is an invalid escape sequence.
ecoliMat,ecoliFeat,ecoliY = prepData(ecoliPD, ecoliVars, slice(1,-1), r'\s+')
# Stratified split: every fold receives a share of each class.
folds = getCrossValidFolds(ecoliMat, ecoliY, stratify=True)

In [286]:
def mostCommonElem(elems):
    """Return the most frequent element of ``elems``.

    When several elements share the highest frequency, one of them is drawn
    at random with ``np.random.choice``.
    """
    tally = {}  # element -> number of occurrences, in first-seen order
    for item in elems:
        if item in tally:
            tally[item] += 1
        else:
            tally[item] = 1
    # highest frequency seen (-1 when nothing was counted)
    topCount = max(tally.values(), default=-1)
    # every element that reaches the highest frequency
    winners = [item for item in tally if tally[item] == topCount]
    return np.random.choice(winners)  # random tie-break among the winners

def kMinValIdx(mat, k):
    """Find the ``k`` smallest values in every row of a 2-D array.

    Runs ``k`` bubble passes per row, each pass pushing the smallest
    remaining value to the right-hand end, so the result columns are ordered
    from the k-th smallest down to the smallest. The input is not modified.

    Parameters
    ----------
    mat : 2-D array of comparable values.
    k : number of smallest values wanted per row (expected to be >= 0); if it
        exceeds the number of columns, all columns are returned.

    Returns
    -------
    (idx, vals) : the column indices of the selected values in the original
    ``mat`` and the values themselves, both with ``min(k, nCols)`` columns.
    """
    mat = np.copy(mat) # create copy of variable
    nCols = mat.shape[1]
    # Built from the shape of `mat` itself, not from an outside variable.
    idx = np.ones(mat.shape,int).cumsum(axis=1)-1 # rows of idx: 0,1,...,nCol

    for it in range(k): # perform k bubbles to get k smallest
        for col in range(nCols-it-1):
            toSwap = mat[:,col] < mat[:,col+1] # if elem smaller than next elem
            # swap cols of data matrix and matrix of indices
            mat[np.ix_(toSwap,[col,col+1])] = mat[np.ix_(toSwap,[col+1,col])]
            idx[np.ix_(toSwap,[col,col+1])] = idx[np.ix_(toSwap,[col+1,col])]
    # Explicit start column: a plain `-k:` slice would return every column
    # when k == 0.
    start = max(nCols-k, 0)
    return idx[:,start:],mat[:,start:] # idxs first, then the smallest values

################################################################################
def KNN(trainX, trainY, testX, K, regression=False):
    """Predict labels for ``testX`` with a K-nearest-neighbours vote.

    Parameters
    ----------
    trainX : 2-D array of training points, one per row.
    trainY : numpy array holding the label (or target value) of every
        training point.
    testX : 2-D array of points to predict, same number of columns as
        ``trainX``.
    K : number of neighbours consulted for every test point.
    regression : when True the prediction is the mean of the K neighbours'
        values; otherwise it is their most common label, with ties broken at
        random by ``mostCommonElem``.

    Returns
    -------
    1-D array with one prediction per row of ``testX``.
    """
    dists = pairwiseDist(testX,trainX) # all pairwise dist of two datasets
    knnIdx,_ = kMinValIdx(dists, K) # idx of K closest data pts in training set
    knnLabels = trainY[knnIdx] # labels of these closest data points

    if regression: # regression, calculate mean
        return knnLabels.mean(axis=1) # mean of k-closest label values
    # classification, get most common class label
    return np.array([mostCommonElem(lab) for lab in knnLabels])

In [282]:
# Toy two-cluster data for a quick check of the distance / KNN helpers.
# Reference set: 8 points scattered around 9 followed by 5 points scattered
# around 2, each with 5 features, labelled 'B' and 'S' respectively.
testX = np.vstack([9 + np.random.randn(8, 5),
                   2 + np.random.randn(5, 5)])
testY = np.repeat(['B', 'S'], [8, 5])
# Query set: 5 points near the first cluster, then 2 near the second.
testPred = np.vstack([9 + np.random.randn(5, 5),
                      2 + np.random.randn(2, 5)])
# Rows are query points, columns are reference points.
pairwiseDist(testPred, testX)

array([[ 2.35968568,  2.64737534,  1.62612899,  2.21499893,  3.33869582,
         3.47065923,  3.01280921,  3.09664356, 14.84349249, 18.52315473,
        15.90429987, 15.71910361, 16.13707107],
       [ 3.91166784,  1.47297886,  3.83122604,  4.40969213,  4.42262601,
         2.53066954,  3.47358719,  2.7803554 , 12.29147976, 15.85377886,
        13.14361229, 13.21176856, 13.30184389],
       [ 2.18384144,  1.48345118,  1.50232591,  2.45115853,  2.49878792,
         2.16701931,  2.48616783,  2.27381092, 14.22601711, 17.85282483,
        15.18519974, 15.15361428, 15.44737518],
       [ 2.93110888,  1.44298489,  2.18709245,  2.94293404,  3.39103615,
         2.52378416,  2.72465258,  2.3079126 , 13.82707269, 17.45133668,
        14.76625728, 14.76440045, 15.04431087],
       [ 2.74992211,  1.39073655,  1.97807805,  2.77326018,  2.61970962,
         1.85966333,  2.42844157,  2.01559566, 14.12705641, 17.70461843,
        15.02076642, 15.06618522, 15.30803819],
       [15.35694262, 13.834474

In [287]:
# testX / testY act as the labelled reference set here; testPred holds the
# points being classified, using the 9 nearest neighbours.
KNN(trainX=testX, trainY=testY, testX=testPred, K=9)

array(['B', 'B', 'B', 'B', 'B', 'S', 'S'], dtype='<U1')