In [1]:
import numpy as np, pandas as pd

In [87]:
def pairwiseDist(x, y=None):
    '''Euclidean distance between every row of ``x`` and every row of ``y``.

    Parameters
    ----------
    x : ndarray, shape (n, d)
        First set of points, one point per row.
    y : ndarray, shape (m, d), optional
        Second set of points. When omitted, distances are computed between
        the rows of ``x`` itself.

    Returns
    -------
    ndarray, shape (n, m)
        Entry ``[i, j]`` is the distance between ``x[i]`` and ``y[j]``.
    '''
    other = x if y is None else y
    # x[:, None] is (n, 1, d); broadcasting against (m, d) gives (n, m, d)
    diffs = x[:, None] - other
    sqDists = np.sum(diffs ** 2, axis=2)
    return sqDists ** 0.5

def prepData(dataPathDir, fieldNames, featSlices, sep=',', transf=None):
    '''Read a delimited data file and return its feature matrix and labels.

    Parameters
    ----------
    dataPathDir : str
        Path of the delimited text file (read without a header row).
    fieldNames : list of str
        Column names assigned to the file's fields. Must contain ``'class'``,
        which is the column returned as the label vector.
    featSlices : slice
        Slice applied to ``fieldNames`` to select the feature columns.
    sep : str, optional
        Field separator passed to ``pandas.read_csv``.
    transf : {None, 'std', 'rescale'}, optional
        ``None`` keeps the raw values, ``'std'`` standardises each feature to
        ``(x - mean) / std`` and ``'rescale'`` maps each feature to
        ``(x - min) / (max - min)``. Matching is case-insensitive.

    Returns
    -------
    dataMat : ndarray
        Feature matrix, one row per record.
    dataFeats : list of str
        Names of the selected feature columns.
    labels : ndarray
        Values of the ``'class'`` column.

    Raises
    ------
    ValueError
        If ``transf`` is not one of the supported transformations.
    '''
    raw = pd.read_csv(dataPathDir, sep=sep, names=fieldNames)  # read dlm file
    dataFeats = fieldNames[featSlices]  # list of feature names
    feats = raw[dataFeats]
    featVals = feats.values

    if transf is None:  # no transformation
        dataMat = featVals
    else:
        mode = transf.lower()
        if mode == 'std':  # standardise: [x - E(x)] / S(x)
            centre = feats.mean().values
            scale = feats.std().values
        elif mode == 'rescale':  # rescale to [0, 1]: (x - min) / range(x)
            centre = feats.min().values
            scale = feats.max().values - centre
        else:
            raise ValueError('No such transformation available')
        # A constant feature has zero spread; dividing by it would fill the
        # column with NaN/inf, so divide by 1 instead (the column becomes 0).
        scale = np.where(scale == 0, 1.0, scale)
        dataMat = (featVals - centre) / scale
    return dataMat, dataFeats, raw['class'].values

def errRate(pred, actual, categorical=True):
    '''Score predictions against the true values.

    Parameters
    ----------
    pred, actual : ndarray
        Predicted and true values, element-wise comparable.
    categorical : bool, optional
        When True (default) return the misclassification rate; otherwise
        return the root-mean-square error.
    '''
    if not categorical:
        # numeric targets: RMSE = ||pred - actual|| / sqrt(n)
        residual = pred - actual
        return np.linalg.norm(residual) / np.sqrt(pred.size)
    # class labels: fraction of entries that disagree
    nWrong = sum(pred != actual)
    return nWrong / pred.size

In [209]:
def mostCommonElem(elems):
    '''Return the most frequent element of ``elems``.

    When several elements share the highest count, one of them is drawn at
    random with ``np.random.choice``.
    '''
    tally = {}  # element -> number of occurrences, in first-seen order
    for item in elems:
        if item in tally:
            tally[item] += 1
        else:
            tally[item] = 1
    topCount = max(tally.values(), default=-1)
    # every element that reaches the top count is a candidate
    winners = [item for item, n in tally.items() if n == topCount]
    return np.random.choice(winners)

def kMinValIdx(mat, k):
    '''Find the k smallest entries in each row of ``mat``.

    Parameters
    ----------
    mat : ndarray, shape (n, m) or (m,)
        Values to search; a 1-D input is treated as a single row. The input
        is not modified.
    k : int
        Number of smallest entries to return per row. Values larger than the
        number of columns are capped at the number of columns.

    Returns
    -------
    idx : ndarray of int, shape (n, k)
        Column indices of the k smallest entries of each row.
    vals : ndarray, shape (n, k)
        The corresponding values.

    Both outputs are ordered from the k-th smallest (first column) down to
    the smallest (last column); equal values keep their original left-to-right
    order.

    Raises
    ------
    ValueError
        If ``k`` is negative.
    '''
    mat = np.asarray(mat)
    if mat.ndim == 1:
        mat = mat.reshape([1, -1])
    k = int(k)
    if k < 0:
        raise ValueError('k must be non-negative')
    nCol = mat.shape[1]
    k = min(k, nCol)

    # Stable ascending sort of the column-reversed rows, flipped back, gives a
    # descending order in which ties keep their original relative order.
    revOrder = np.argsort(mat[:, ::-1], axis=1, kind='stable')
    order = (nCol - 1) - revOrder[:, ::-1]  # map back to original columns

    # slice from nCol - k (not -k) so that k == 0 yields an empty result
    idx = order[:, nCol - k:]
    return idx, np.take_along_axis(mat, idx, axis=1)

################################################################################
def KNN(trainX, trainY, testX, K, regression=False):
    '''Predict labels for ``testX`` with K-nearest-neighbours.

    Parameters
    ----------
    trainX : ndarray, shape (nTrain, d)
        Training points, one per row.
    trainY : ndarray, shape (nTrain,)
        Label (or numeric target) of each training point.
    testX : ndarray, shape (nTest, d)
        Points to predict.
    K : int
        Number of neighbours to consult.
    regression : bool, optional
        When True, predict the mean of the K neighbour targets; otherwise
        predict the most common neighbour label (ties broken at random by
        ``mostCommonElem``).

    Returns
    -------
    ndarray, shape (nTest,)
        One prediction per row of ``testX``.
    '''
    dists = pairwiseDist(testX, trainX)  # (nTest, nTrain) distance matrix
    knnIdx, _ = kMinValIdx(dists, K)  # idx of K closest training pts per test pt
    knnLabels = trainY[knnIdx]  # labels of these closest data points

    if regression:  # regression: mean of the K closest target values
        return knnLabels.mean(axis=1)
    # classification: most common class label among the K neighbours
    return np.array([mostCommonElem(lab) for lab in knnLabels])

In [213]:
def pickAndRemove(arr):
    '''Draw one element of ``arr`` at random.

    Returns the drawn element together with a copy of ``arr`` that no longer
    contains the drawn position.
    '''
    pos = np.random.randint(arr.size)
    picked = arr[pos]
    remaining = np.delete(arr, pos)
    return picked, remaining
    

def connsistentSubset(trainX, trainY, K=1):
    '''Build a consistent subset of the training set for 1-NN classification.

    Starting from one randomly chosen sample, every remaining sample is
    classified by its nearest neighbour inside the current subset ``Z``;
    misclassified samples are added to ``Z``. Passes over the remaining
    samples repeat until a full pass adds nothing.

    Parameters
    ----------
    trainX : ndarray, shape (n, d)
        Training points, one per row. Must contain at least one row.
    trainY : ndarray, shape (n,)
        Class label of each training point.
    K : int, optional
        Accepted for interface compatibility; the nearest-neighbour check
        always uses a single neighbour.

    Returns
    -------
    Z : ndarray of int
        Row indices of the samples in the consistent subset.
    idx : ndarray of int
        Row indices of the samples left out of the subset.
    '''
    dists = pairwiseDist(trainX)  # all pairwise distances within the train set
    idx = np.arange(trainX.shape[0])  # construct index of data rows
    first, idx = pickAndRemove(idx)  # randomly pick 1st pt of the subset
    Z = np.array([first])  # keep Z 1-D so it can index and be appended to

    converged = False
    while not converged:
        converged = True  # stop unless a misclassification occurs
        np.random.shuffle(idx)  # visit the remaining samples in random order
        for x in idx:
            # position (within Z) of the member of Z closest to sample x
            nnPos = kMinValIdx(dists[x, Z], 1)[0]
            nnLabel = trainY[Z[nnPos[0, 0]]]  # translate back to a train row
            if nnLabel != trainY[x]:  # misclassified by the current subset
                Z = np.hstack((Z, x))  # add to consistent subset
                converged = False  # another pass is needed
        idx = np.setdiff1d(idx, Z)  # drop samples that were moved into Z

    return Z, idx

In [198]:
# Scratch check of kMinValIdx: index np.arange(10) with the returned column
# indices of the 2 smallest entries per row of `j`.
# NOTE(review): `j` is not defined in any visible cell, so this cell fails on
# a fresh kernel — define it above or drop the cell.
np.arange(10)[kMinValIdx(j, 2)[0]]

array([[2, 0],
       [2, 6],
       [5, 6],
       [4, 6],
       [3, 8]])

In [4]:
################################################################################
def getCrossValidFolds(dataMat, classVec, nFolds=5, categorical=False):
    '''Split the row indices of ``dataMat`` into ``nFolds`` random folds.

    Parameters
    ----------
    dataMat : ndarray
        Data matrix; only its number of rows is used.
    classVec : ndarray
        Class label per row; only consulted when ``categorical`` is True.
    nFolds : int, optional
        Number of folds to cut.
    categorical : bool, optional
        When True, each class is shuffled and split separately and the
        pieces are concatenated per fold, so every fold receives a share of
        every class. When False, all rows are shuffled and split together.

    Returns
    -------
    list of ndarray
        One array of row indices per fold.
    '''
    rowIdx = np.arange(dataMat.shape[0])
    if not categorical:
        np.random.shuffle(rowIdx)  # random order, then cut into nFolds pieces
        return np.array_split(rowIdx, nFolds)

    perClass = []  # one list of nFolds index pieces for each class
    for label in np.unique(classVec):
        members = rowIdx[classVec == label]  # rows belonging to this class
        np.random.shuffle(members)
        perClass.append(np.array_split(members, nFolds))
    # fold f is the concatenation of piece f of every class
    return [np.hstack(parts) for parts in zip(*perClass)]

In [91]:
# Path and field names of the ecoli data file.
ecoliPD = './data/ecoli.data'
ecoliVars = ['seq', 'mcg', 'gvh', 'lip', 'chg', 'aac', 'alm1', 'alm2', 'class']
################################################################################
# slice(1,-1) keeps every field between 'seq' and 'class' as a feature.
# The separator is a regex, so it is written as a raw string: '\s' inside a
# normal string literal is an invalid escape sequence.
ecoliMat,ecoliFeat,ecoliY = prepData(ecoliPD, ecoliVars, slice(1,-1), r'\s+')
# class-stratified folds (default nFolds=5)
folds = getCrossValidFolds(ecoliMat, ecoliY, categorical=True)

In [31]:
################################################################################
def crossValidate(dataMat, labels, chunks, k, categ):
    '''Run K-NN cross-validation over the given folds.

    Parameters
    ----------
    dataMat : ndarray, shape (n, d)
        Feature matrix.
    labels : ndarray, shape (n,)
        Label of each row of ``dataMat``.
    chunks : list of ndarray
        Row indices of each fold (at least two folds).
    k : int
        Number of neighbours used by ``KNN``.
    categ : bool
        Passed to ``errRate``: True scores by misclassification rate, False
        by RMSE. ``KNN`` itself is always called in its default
        (classification) mode.

    Returns
    -------
    ndarray, shape (len(chunks),)
        Error of the model validated on each fold in turn.
    '''
    err = np.empty(len(chunks))

    for ck, vldIdx in enumerate(chunks):
        # train on every fold of `chunks` except the one being validated
        trnIdx = np.hstack([c for n, c in enumerate(chunks) if n != ck])
        dataTrain, labelTrain = dataMat[trnIdx, :], labels[trnIdx]  # training
        dataTest, labelTest = dataMat[vldIdx, :], labels[vldIdx]  # validation

        pred = KNN(dataTrain, labelTrain, dataTest, k)
        err[ck] = errRate(pred, labelTest, categorical=categ)
    return err

In [50]:
################################################################################
def tuneK(dataMat, labels, chunks, categ, Ks=None):
    '''Choose the number of neighbours K using one random validation fold.

    Parameters
    ----------
    dataMat : ndarray, shape (n, d)
        Feature matrix.
    labels : ndarray, shape (n,)
        Label of each row of ``dataMat``.
    chunks : list of ndarray
        Row indices of each fold (at least two folds).
    categ : bool
        Passed to ``errRate``: True scores by misclassification rate, False
        by RMSE.
    Ks : sequence of int, optional
        Candidate values of K. Defaults to 1 through 14.

    Returns
    -------
    bestK : int
        Candidate with the smallest validation error (first one on ties).
    err : ndarray
        Validation error of every candidate, in the order of ``Ks``.
    '''
    Ks = np.arange(1, 15) if Ks is None else np.asarray(Ks)
    err = np.empty(len(Ks))

    pick = np.random.randint(len(chunks))  # randomly pick fold as validation set
    # use the folds passed in as `chunks`, not a notebook-level global
    trnIdx = np.hstack([c for n, c in enumerate(chunks) if n != pick])  # train
    vldIdx = chunks[pick]  # validate
    dataTrain, labelTrain = dataMat[trnIdx, :], labels[trnIdx]  # training
    dataTest, labelTest = dataMat[vldIdx, :], labels[vldIdx]  # validation

    for n, k in enumerate(Ks):
        pred = KNN(dataTrain, labelTrain, dataTest, k)
        err[n] = errRate(pred, labelTest, categorical=categ)

    return Ks[np.argmin(err)], err  # return K with smallest error

In [92]:
# Pick K for the ecoli data; errs holds the validation error of every K tried.
minK,errs = tuneK(ecoliMat, ecoliY, folds, categ=True)

In [93]:
# Per-fold misclassification rate of K-NN on the ecoli data with the tuned K.
crossValidate(ecoliMat, ecoliY, folds, minK, categ=True)

array([0.12857143, 0.12857143, 0.15151515, 0.12307692, 0.12307692])

In [20]:
# Manual spot check: validate on fold 1, train on all the other folds.
trnIdx = np.hstack(folds[:1] + folds[2:])
vldIdx = np.hstack(folds[1:2])

# 3-nearest-neighbour predictions for the held-out fold and their error rate
pred = KNN(ecoliMat[trnIdx], ecoliY[trnIdx], ecoliMat[vldIdx], 3)
errRate(pred, ecoliY[vldIdx])

0.15714285714285714

In [76]:
# Synthetic sanity data: two 5-D Gaussian clusters, shifted by +9 and by +2.
# testX/testY are the labelled points (8 'B' rows from the +9 cluster, then
# 5 'S' rows from the +2 cluster); testPred holds the points to classify
# (5 rows from the +9 cluster, then 2 rows from the +2 cluster).
testX = np.vstack([np.random.randn(8,5)+9, np.random.randn(5,5)+2])
testY = np.array(['B']*8 + ['S']*5)
testPred = np.vstack([np.random.randn(5,5)+9, np.random.randn(2,5)+2])
# distance from every point to classify (rows) to every labelled point (cols)
z = pairwiseDist(testPred, testX)
print(z)

[[ 3.7531095   1.90570435  2.35356582  2.78923996  2.75831607  5.09819285
   2.59454517  4.09085054 15.14762401 16.15038255 17.29086775 15.53368289
  15.89654084]
 [ 2.89632479  2.42758659  1.78430884  3.09164191  3.69661392  4.88415513
   2.20077493  2.93689178 16.01803308 16.85705599 18.08449835 16.2482904
  16.6680666 ]
 [ 2.48400306  4.20228236  4.83424079  2.97053469  5.98471444  7.4226921
   3.24846557  2.34355008 13.96268961 14.42267718 15.76245646 13.98130953
  14.53907182]
 [ 4.05646107  3.21798525  3.57839308  4.03044589  6.29779001  6.9612301
   4.18227002  3.3522941  16.50386363 17.35561302 18.59506191 16.63545198
  17.17488197]
 [ 3.77493685  3.13927818  1.78213933  3.03847036  3.19118653  4.1723466
   2.54653494  3.8257698  14.89907238 15.7502842  17.20118056 15.18789626
  15.53537594]
 [15.62919977 14.65630948 16.17242659 14.01604968 14.99672879 17.4843315
  15.2171927  15.29248839  2.15042781  2.21585641  4.3617274   1.96923112
   2.08483425]
 [18.12475458 17.07915063 1

In [287]:
# Classify the testPred rows with 9-NN, using testX/testY as the training set.
KNN(testX, testY, testPred, 9)

array(['B', 'B', 'B', 'B', 'B', 'S', 'S'], dtype='<U1')