In [115]:
import numpy as np
import pandas as pd
import os

In [4]:
x = np.random.randint(5, size=[10,4]) +10
y = np.random.randint(5, size=[10,4])

In [5]:
def pairwiseDist(x, y=None):
    if y is None:
        y = x
    return np.sum((x[:,None]-y)**2,axis=2)**0.5

def arrayDist(x,y):
    return np.sum((x-y)**2,axis=1)**0.5

def errRates(pred, actual):
    return (actual!=pred).sum() / pred.size # return error rate

In [448]:
def initMeans(data, n, algo=1):
    if algo==1: # choose n random points
        idx = np.random.choice(range(data.shape[0]), n, False) # no replace
        out = data[idx,]
    if algo==2: # random means points (0,1)
        out = np.random.random([n,data.shape[1]])
    if algo==3: # always take first n point as centroid
        out = data[:n,]
    return out

def shortestCentroid(centr, mat):
    tmpDist = pairwiseDist(centr,mat) # dist between means and all data pts
    return tmpDist.argmin(axis=0) # find group where distance is smallest

def updateMeans(data, means):
    ## Assign each pt to the mean for which it has the shortest distance
    tmpDist = pairwiseDist(means,data) # dist between means and all data pts
    minDist = tmpDist.argmin(axis=0) # find group where distance is smallest

    ## Calculate new means to be centroid of all the points in the group
    newMeans = np.zeros([len(means),data.shape[1]]) # new mean points
    for n,x in enumerate(means): # loop over all clusters
        tmp = np.vstack( (data[minDist==n,],x) ) # concat data pt and centroid
        newMeans[n] = tmp.mean(axis=0) # new mean = centroid of all pts 
    
    return newMeans,minDist

################################################################################
def kMeans(data, k, trace=False, initAlgo=1):
    means = initMeans(data, k, initAlgo) # initialize mean points
    converged = False
    while not converged:
        newMeans,grpIdx = updateMeans(data, means)
        converged = np.allclose(means,newMeans)
        if trace:
            print(means)
        means = newMeans
        
    return means,grpIdx # return final centroids and labels

In [112]:
updateMeans(x, x[:3])

(array([[13.33333333, 14.        , 14.        , 13.        ],
        [11.        , 12.5       , 10.375     , 11.875     ],
        [10.        , 11.        , 14.        , 12.        ]]),
 array([0, 1, 2, 1, 1, 1, 1, 1, 0, 1], dtype=int64))

In [113]:
z = np.vstack([x,y])

kMeans(z, 2, True, 2)

[[0.37447605 0.12174801 0.34172695 0.38724677]
 [0.29682263 0.21363869 0.96782684 0.33721441]]
[[2.34361901 2.530437   0.08543174 1.59681169]
 [6.73871237 7.95631326 7.38710149 7.6854008 ]]
[[2.34361901 2.530437   0.08543174 1.59681169]
 [6.73871237 7.95631326 7.38710149 7.6854008 ]]
[[ 1.57669264  2.68458518  1.55322107  2.23607379]
 [11.06715567 12.17784666 11.12610014 11.6986728 ]]
[[ 1.57669264  2.68458518  1.55322107  2.23607379]
 [11.06715567 12.17784666 11.12610014 11.6986728 ]]
[[ 1.50697206  2.69859865  1.68665646  2.29418853]
 [11.46065052 12.56162242 11.4660091  12.06351571]]
[[ 1.50697206  2.69859865  1.68665646  2.29418853]
 [11.46065052 12.56162242 11.4660091  12.06351571]]
[[ 1.50063382  2.6998726   1.69878695  2.29947168]
 [11.49642277 12.59651113 11.49690992 12.09668325]]
[[ 1.50063382  2.6998726   1.69878695  2.29947168]
 [11.49642277 12.59651113 11.49690992 12.09668325]]
[[ 1.50005762  2.69998842  1.69988972  2.29995197]
 [11.4996748  12.59968283 11.49971908 12.09969

(array([[ 1.50000048,  2.6999999 ,  1.69999909,  2.2999996 ],
        [11.49999731, 12.59999738, 11.49999768, 12.09999751]]),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       dtype=int64))

In [446]:
def SelectBestFeature(dataMat, selected, Nk):
    # get list of index of currently unselected features
    unselect = np.where(~np.isin(np.arange(dataMat.shape[1]),selected))[0]
    bestCoeff = -1-1e-9 # worst possible coefficient value is -1
    for n,j in enumerate(unselect): # loop over unselected features
        testSet = np.hstack([selected,j]) # add curr feature to selected ones
        means,labels = kMeans(dataMat[:,testSet], Nk) # cluster w/ test features
        coeff = Silhouette(dataMat,labels).mean() # mean silhouette coeff
        #print((coeff,bestCoeff))
        if coeff > bestCoeff: # if this feature produce better coeff
            bestCoeff = coeff # record new best coeff
            outs = (j,coeff,means,labels) # record output variables
    #print(unselect)
    return outs # output: the feature, best coeff, means, and labels
################################################################################

def ForwardSelect(data, k):
    selected = np.zeros(0, int) # idx of selected features, start w/ empty
    baseCoeff = -1-1e-9 # -1 is worst possible performance
    converged = False
    while not converged: # loop until convergence
        bestFeat,bestCoeff,means,labels = SelectBestFeature(data, selected, k) 
        if bestCoeff <= baseCoeff: # if new feature doesn't improve performance
            converged = True
        else: # if new feature improves performance
            print(bestCoeff-baseCoeff)
            selected = np.hstack([selected,bestFeat]) # add feature to selection
            baseCoeff = bestCoeff # set new coeff as baseline performance
            outs = (means,labels) # save output vars
            if len(selected) == data.shape[1]: 
                converged = True # algo converged if all features selected
    return (selected,)+outs # return selected features, means, cluster labels

In [336]:
testData = np.random.randint(0,10,[100,8])

ForwardSelect(testData,2)

1.1044337798534718
0.0027974021309899316


(array([3, 4], dtype=int64), array([[2.03225794, 4.59677437],
        [7.39473531, 3.02631531]]), array([1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
        1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1,
        1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0], dtype=int64))

In [447]:
################################################################################
def Silhouette(data, labels):
    distMat = pairwiseDist(data) # pre-calc all pairwise dist for memoization
    grpIdx = pd.Series(labels).groupby(labels).groups.items() # idx for each grp
    
    aVals = np.zeros(data.shape[0]) # pre-allocate a and b-values for data
    bVals = np.zeros(data.shape[0])
    for grp,idx in grpIdx: # loop over all groups
        aVals[idx] = distMat[np.ix_(idx,idx)].mean(axis=1) # a's for curr grp
        
        # loop over all groups that's not the current gruop
        tmp = np.zeros([len(grpIdx)-1,len(idx)]) # tmp for all b's for curr grp
        for n,(_,outIdx) in enumerate([x for x in grpIdx if x[0]!=grp]):
            # calculate mean dist of points within cluster to out of cluster
            tmp[n,] = distMat[np.ix_(idx,outIdx)].mean(axis=1) 
        #print(tmp.shape)
        #print(idx.shape)
        bVals[idx] = tmp.min(axis=0) # pick min b of all out-groups

    return (bVals-aVals)/np.maximum(aVals,bVals) # return silhouette coeff

1.218264548496111
0.10202167454565866
0.011917492844527922
0.0915066380507315
0.010737041792362234
0.013112969092185567


(array([5, 4, 7, 6, 3, 2], dtype=int64),
 array([[2.44444449e-01, 7.25177778e+01, 1.74999996e-01, 1.25800000e+01,
         1.30888889e+00, 1.69444481e-01],
        [1.65925926e-01, 7.17748148e+01, 2.96296296e-02, 9.58481481e+00,
         8.69259259e-01, 3.69074074e+00],
        [3.33999997e-01, 7.21999999e+01, 1.96666675e-01, 9.68799987e+00,
         1.65866667e+00, 1.97333337e+00],
        [5.66260163e-01, 7.28155285e+01, 5.20325203e-03, 8.38772358e+00,
         1.37284553e+00, 3.50658537e+00],
        [3.46399994e+00, 7.10260000e+01, 1.00399998e+00, 6.19600003e+00,
         2.71799998e+00, 1.82800003e+00],
        [2.11923077e-01, 7.34469231e+01, 9.57692308e-01, 8.59884615e+00,
         2.10961538e+00, 6.69230771e-02]]),
 array([1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 3, 3, 1,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 1, 1, 3, 3, 3, 1,
        3, 1, 3, 1, 1, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3

In [129]:
irisFile = os.path.join('./data/', 'iris.data')
irisName = ['sepalLen', 'sepalWth', 'petalLen', 'petalWth', 'class']
raw = pd.read_csv(irisFile , names=irisName)  # read CSV file
irisFeats = irisName[:-1]
irisMat = raw[irisFeats].values
irisK = len(raw['class'].unique())

In [351]:
ForwardSelect(irisMat, irisK)

1.5459229763687348


(array([2], dtype=int64), array([[4.43181827],
        [5.82647044],
        [1.46400021]]), array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
        0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
        1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0], dtype=int64))

In [424]:
glassData = os.path.join('./data/', 'glass.data')
glassNames = ['id','RI','Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'class']
raw = pd.read_csv(glassData , names=glassNames)  # read CSV file

glassFeats = glassNames[1:-1] # list of feature names
glassMat = raw[glassFeats].values # 2d-array of feature values
glassK = len(raw['class'].unique()) # number of classes

In [425]:
ForwardSelect(glassMat, glassK)

1.280279874484533
0.05061651564669223


(array([5, 2], dtype=int64), array([[5.04      , 0.        ],
        [0.78533314, 2.91400041],
        [0.11724138, 3.71275862],
        [0.32625   , 1.91      ],
        [0.15365854, 0.02707317],
        [0.58681818, 3.55854545]]), array([2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 5, 5, 2,
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 5, 5, 5, 2,
        5, 5, 5, 2, 2, 5, 2, 5, 1, 1, 1, 1, 5, 5, 5, 5, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 5, 5,
        5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1, 2, 1, 4, 4, 4, 4, 4,
        4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 3, 4,
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 2, 5, 5, 5, 5, 2, 2, 5,
        5, 5, 5, 2, 5, 5, 5, 2, 2, 1, 3, 3, 3, 4, 4, 4, 4, 0, 0, 4, 3, 4,
        3, 3, 3, 3, 3, 4, 4, 4, 4, 1, 1, 5, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4], dtype=int64))

In [440]:
spamData = os.path.join('./data/', 'spambase.data')
spamNames = ['make', 'address', 'all', '3d', 'our', 'over', 'remove',
	'internet', 'order', 'mail', 'receive', 'will', 'people', 'report',
	'addresses', 'free', 'business', 'email', 'you', 'credit', 'your', 'font',
	'0', 'money', 'hp', 'hpl', 'george', '650', 'lab', 'labs', 'telnet', '857',
	'data', '415', '85', 'technology', '1999', 'parts', 'pm', 'direct', 'cs',
	'meeting', 'original', 'project', 're', 'edu', 'table', 'conference',
	'semicolon', 'paren', 'bracket', 'exclaim', 'dollar', 'pound', 'capsAvg',
	'capsMax', 'capsTotal', 'class']
raw = pd.read_csv(spamData , names=spamNames)  # read CSV file

spamFeats = spamNames[:-1] # list of feature names
spamMat = raw[spamFeats].values # 2d-array of feature values
spamK = len(raw['class'].unique()) # number of classes

In [None]:
ForwardSelect(spamMat, spamK) # run algorithm

1.842031262105331


In [50]:
testX = np.array([[1,1,1],[10,10,10],[11,11,11],[11,10,11],
                  [2,2,1],[1.5,1.5,1.5],[12,12,12]]) + np.random.random([7,3])/10
testY = np.array([1,2,2,2,1,1,2])
testY_bad = np.array([2,2,1,2,1,2,1])
print(testX)

print(Silhouette(testX,testY))
print(Silhouette(testX,testY_bad))

[[ 1.04280722  1.07598526  1.055384  ]
 [10.04850651 10.07301026 10.0161049 ]
 [11.09731025 11.05113568 11.09967267]
 [11.05970864 10.00012083 11.01400918]
 [ 2.04633279  2.09427173  1.06731713]
 [ 1.51476797  1.57534432  1.59872265]
 [12.09157277 12.03561811 12.02184439]]


## Sources

- [A Modified k-means Algorithm to Avoid Empty Clusters](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.379.3148&rep=rep1&type=pdf)