In [567]:
import numpy as np
import pandas as pd
import os
from scipy.spatial import distance
from itertools import product as binCombo

In [2]:
def pairwiseDist(x, y=None):
    if y is None:
        y = x
    return np.sum((x[:,None]-y)**2,axis=2)**0.5

In [457]:
def SelectBestFeature(dataMat, selected, Nk, dists):
    # get list of index of currently unselected features
    unselect = np.where(~np.isin(np.arange(dataMat.shape[1]),selected))[0]
    bestCoeff = -1-1e-9 # worst possible coefficient value is -1
    for n,j in enumerate(unselect): # loop over unselected features
        testSet = np.hstack([selected,j]) # add curr feature to selected ones
        means,labels = kMeans(dataMat[:,testSet], Nk) # cluster w/ test features
        coeff = Silhouette(dataMat,labels,dists).mean() # mean silhouette coeff
        #print((coeff,bestCoeff))
        if coeff > bestCoeff: # if this feature produce better coeff
            bestCoeff = coeff # record new best coeff
            outs = (j,coeff,means,labels) # record output variables
    #print(unselect)
    return outs # output: the feature, best coeff, means, and labels
################################################################################

def ForwardSelect(data, k, trace=False):
    selected = np.zeros(0, int) # idx of selected features, start w/ empty
    baseCoeff = -1-1e-9 # -1 is worst possible performance
    dM = pairwiseDist(data) # pre-calc distance matrix for memoization
    
    converged,nRound = False,1
    while not converged: # loop until convergence
        bestFeat,bestCoeff,means,labels = SelectBestFeature(data,selected,k,dM) 
        if bestCoeff <= baseCoeff: # if new feature doesn't improve performance
            converged = True
        else: # if new feature improves performance
            selected = np.hstack([selected,bestFeat]) # add feature to selection
            baseCoeff = bestCoeff # set new coeff as baseline performance
            outs = (means,labels) # save output vars
            if len(selected) == data.shape[1]: 
                converged = True # algo converged if all features selected
        if trace: # print iteration info if requesed
            tmplate = "[%02d] Best coeff=%f, set:%s"
            print( tmplate%(nRound,bestCoeff,str(selected)) )
        nRound += 1
    return (selected,)+outs # return selected features, means, cluster labels

In [603]:
np.where(np.array([True,False,False,True]))

(array([0, 3], dtype=int64),)

In [529]:
np.empty([10,2,5])[0]

array([[4.9, 4.9, 3. , 4.9, 4.7],
       [4.7, 3.2, 4.7, 4.6, 4.6]])

In [504]:
np.array(list(itertools.product([0, 1], repeat=3)))

array([[0, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 1, 1],
       [1, 0, 0],
       [1, 0, 1],
       [1, 1, 0],
       [1, 1, 1]])

In [668]:
def initMeans(data, n, algo=1):
    gen = np.random.RandomState() # use independent stream for reproducibility
    gen.seed(42) # set initial seed
    
    if algo==1: # choose n random points
        idx = gen.choice(range(data.shape[0]), n, False) # no replace
        out = data[idx,]
    if algo==2: # random means points (0,1)
        out = gen.random([n,data.shape[1]])
    if algo==3: # always take first n point as centroid
        out = data[:n,]
    return out

def shortestCentroid(centr, mat):
    tmpDist = pairwiseDist(centr,mat) # dist between means and all data pts
    return tmpDist.argmin(axis=0) # find group where distance is smallest

def updateMeans(data, means):
    ## Assign each pt to the mean for which it has the shortest distance
    tmpDist = pairwiseDist(means,data) # dist between means and all data pts
    minClus = tmpDist.argmin(axis=0) # find group where distance is smallest

    ## Calculate new means to be centroid of all the points in the group
    newMeans = np.zeros([len(means),data.shape[1]]) # new mean points
    for n,x in enumerate(means): # loop over all clusters
        tmp = np.vstack( (data[minClus==n,],x) ) # concat data pt and centroid
        newMeans[n] = tmp.mean(axis=0) # new mean = centroid of all pts 
    
    return newMeans,minClus

################################################################################
def kMeans(data, k, trace=False, initAlgo=1):
    means = initMeans(data, k, initAlgo) # initialize mean points
    converged = False
    while not converged:
        newMeans,grpIdx = updateMeans(data, means)
        converged = np.allclose(means,newMeans)
        if trace:
            print(means)
        means = newMeans
        
    return means,grpIdx # return final centroids and labels

In [112]:
updateMeans(x, x[:3])

(array([[13.33333333, 14.        , 14.        , 13.        ],
        [11.        , 12.5       , 10.375     , 11.875     ],
        [10.        , 11.        , 14.        , 12.        ]]),
 array([0, 1, 2, 1, 1, 1, 1, 1, 0, 1], dtype=int64))

[[0.37447605 0.12174801 0.34172695 0.38724677]
 [0.29682263 0.21363869 0.96782684 0.33721441]]
[[2.34361901 2.530437   0.08543174 1.59681169]
 [6.73871237 7.95631326 7.38710149 7.6854008 ]]
[[2.34361901 2.530437   0.08543174 1.59681169]
 [6.73871237 7.95631326 7.38710149 7.6854008 ]]
[[ 1.57669264  2.68458518  1.55322107  2.23607379]
 [11.06715567 12.17784666 11.12610014 11.6986728 ]]
[[ 1.57669264  2.68458518  1.55322107  2.23607379]
 [11.06715567 12.17784666 11.12610014 11.6986728 ]]
[[ 1.50697206  2.69859865  1.68665646  2.29418853]
 [11.46065052 12.56162242 11.4660091  12.06351571]]
[[ 1.50697206  2.69859865  1.68665646  2.29418853]
 [11.46065052 12.56162242 11.4660091  12.06351571]]
[[ 1.50063382  2.6998726   1.69878695  2.29947168]
 [11.49642277 12.59651113 11.49690992 12.09668325]]
[[ 1.50063382  2.6998726   1.69878695  2.29947168]
 [11.49642277 12.59651113 11.49690992 12.09668325]]
[[ 1.50005762  2.69998842  1.69988972  2.29995197]
 [11.4996748  12.59968283 11.49971908 12.09969

(array([[ 1.50000048,  2.6999999 ,  1.69999909,  2.2999996 ],
        [11.49999731, 12.59999738, 11.49999768, 12.09999751]]),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       dtype=int64))

In [5]:
################################################################################
def Silhouette(data, labels, distMat=None):
    if distMat is None:
        distMat = pairwiseDist(data) # calc pairwise dist if not provided
    grpIdx = pd.Series(labels).groupby(labels).groups.items() # idx for each grp
    
    aVals = np.zeros(data.shape[0]) # pre-allocate a and b-values for data
    bVals = np.zeros(data.shape[0])
    for grp,idx in grpIdx: # loop over all groups
        aVals[idx] = distMat[np.ix_(idx,idx)].mean(axis=1) # a's for curr grp
        
        # loop over all groups that's not the current gruop
        tmp = np.zeros([len(grpIdx)-1,len(idx)]) # tmp for all b's for curr grp
        for n,(_,outIdx) in enumerate([x for x in grpIdx if x[0]!=grp]):
            # calculate mean dist of points within cluster to out of cluster
            tmp[n,] = distMat[np.ix_(idx,outIdx)].mean(axis=1) 
        bVals[idx] = tmp.min(axis=0) # pick min b of all out-groups

    return (bVals-aVals)/np.maximum(aVals,bVals) # return silhouette coeff

In [651]:
testData = np.random.randint(0,10,[100,8])

ForwardSelect(testData, 2, trace=True)

[01] Best coeff=0.118785, set:[3]
[02] Best coeff=0.119855, set:[3 5]
[03] Best coeff=0.128725, set:[3 5 4]
[04] Best coeff=0.130233, set:[3 5 4 0]
[05] Best coeff=0.127747, set:[3 5 4 0]


(array([3, 5, 4, 0], dtype=int64),
 array([[3.15789472, 3.39473683, 6.89473678, 5.57894738],
        [5.08064516, 5.5967742 , 2.4516129 , 3.64516129]]),
 array([1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0,
        1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1,
        0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0,
        0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1], dtype=int64))

In [543]:
def printResults(outputLoad, featNames):
    selected,centroids,labels = outputLoad[:3]
    counts = np.zeros(len(centroids))
    for cl in labels:
        counts[cl] += 1
    
    out = pd.DataFrame(np.vstack([centroids.T,counts]))
    selectedFeats = [featNames[n] for n in selected] + ['Counts']
    out = out.rename(index={n:x for n,x in enumerate(selectedFeats)})
    print("Features and cluster centroids")
    print(out)

In [496]:
def prepData(dataPathDir, fieldNames, featSlices):
    raw = pd.read_csv(dataPathDir , names=fieldNames)  # read CSV file
    dataFeats = fieldNames[featSlices] # list of feature names
    meanVals = raw[dataFeats].mean().values # mean of all features
    #dataMat = raw[dataFeats].values/meanVals # standardized array of vals
    dataMat = raw[dataFeats].values # standardized array of vals
    nK = len(raw['class'].unique()) # number of classes
    return dataMat,dataFeats,nK,meanVals

In [722]:
irisData = os.path.join('./data/', 'iris.data')
irisName = ['sepalLen', 'sepalWth', 'petalLen', 'petalWth', 'class']
#raw = pd.read_csv(irisFile , names=irisName)  # read CSV file
#irisFeats = irisName[:-1]
#irisMat = raw[irisFeats].values
#irisK = len(raw['class'].unique())

irisMat,irisFeats,irisK,irisMeans = prepData(irisData,irisName,slice(0,-1))

In [727]:
irisOut = ForwardSelect(irisMat, irisK, trace=True)
print()
printResults(irisOut, irisFeats)

[01] Best coeff=0.554728, set:[2]
[02] Best coeff=0.556750, set:[2 1]
[03] Best coeff=0.556793, set:[2 1 0]
[04] Best coeff=0.561486, set:[2 1 0 3]

Features and cluster centroids
                  0       1          2
petalLen   4.393548   1.464   5.742105
sepalWth   2.748387   3.418   3.073684
sepalLen   5.901613   5.006   6.850000
petalWth   1.433871   0.244   2.071053
Counts    62.000000  50.000  38.000000


In [673]:
irisBrute = BruteForceSelect(irisMat, irisK)
print("Best coeff: %f"%irisBrute[-1])
printResults(irisBrute, irisFeats)

Best coeff: 0.561486
Features and cluster centroids
                  0       1          2
sepalLen   5.901613   5.006   6.850000
sepalWth   2.748387   3.418   3.073684
petalLen   4.393548   1.464   5.742105
petalWth   1.433871   0.244   2.071053
Counts    62.000000  50.000  38.000000


In [None]:
################################################################################
def BruteForceSelect(data, k):
    if data.shape[1] > 15: # error out if no hope of algorithm finishing
        raise ValueError("Too many combinations to try.")
    
    # get all binary combination of features (e.g. whether to include)
    combos = np.array( list(binCombo([True,False],repeat=data.shape[1])) )
    combos = combos[(combos==True).any(axis=1)] # remove combo w/ no features
    
    dist = pairwiseDist(data) # pre-calc distance matrix for memoization
    coeffs = np.empty(combos.shape[0]) # store Silhouette coeff of combos
    means = [None]*combos.shape[0] # store centroids of all combos
    groups = [None]*combos.shape[0] # to store labels of all combinations
    
    for n,featIdx in enumerate(combos):
        means[n],groups[n] = kMeans(data[:,featIdx], k) # cluster w/ features
        coeffs[n] = Silhouette(data,groups[n],dist).mean() # mean coeffs
    
    idx = np.argmax(coeffs) 
    return np.where(combos[idx])[0],means[idx],groups[idx],coeffs[idx]

In [665]:
glassData = os.path.join('./data/', 'glass.data')
glassNames = ['id','RI','Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'class']
#raw = pd.read_csv(glassData , names=glassNames)  # read CSV file

#glassFeats = glassNames[1:-1] # list of feature names
#glassMat = raw[glassFeats].values # 2d-array of feature values
#glassK = len(raw['class'].unique()) # number of classes

glassMat,glassFeats,glassK,glassMeans = prepData(glassData,glassNames,slice(1,-1))

In [674]:
glassOut = ForwardSelect(glassMat, glassK, trace=True)
printResults(glassOut, glassFeats)

[01] Best coeff=0.280280, set:[5]
[02] Best coeff=0.350447, set:[5 2]
[03] Best coeff=0.383476, set:[5 2 3]
[04] Best coeff=0.409510, set:[5 2 3 7]
[05] Best coeff=0.409510, set:[5 2 3 7]
Features and cluster centroids
                0             1          2             3          4        5
K         0.58272  1.052174e-01   0.117308  6.209999e+00   0.485000   0.3455
Mg        3.49720  1.581473e-22   3.731154  1.690569e-09   2.122778   0.1060
Al        1.38616  2.263043e+00   0.802692  3.030000e+00   1.775556   1.2500
Ba        0.01984  1.219565e+00   0.005769  2.221190e-09   0.376667   0.0000
Counts  125.00000  2.300000e+01  26.000000  2.000000e+00  18.000000  20.0000


In [675]:
glassBrute = BruteForceSelect(glassMat, glassK)
print("Best coeff: %f"%glassBrute[-1])
printResults(glassBrute, glassFeats)

Best coeff: 0.483080
Features and cluster centroids
                 0          1          2          3             4          5
RI        1.517430   1.516997   1.521391   1.514567  1.513837e+00   1.528267
Na       13.117287  14.604231  13.596429  14.755714  1.334334e+01  11.867143
Al        1.358062   2.163462   1.145952   1.650000  3.186662e+00   1.218571
Si       72.805659  73.031538  72.208095  73.004286  7.035667e+01  71.672857
K         0.571550   0.028462   0.241667   0.841429  4.699986e+00   0.251429
Ca        8.438992   8.829615  10.291667   6.624286  6.586663e+00  14.315714
Fe        0.062016   0.015000   0.065000   0.017143  9.295126e-17   0.137143
Counts  129.000000  26.000000  42.000000   7.000000  3.000000e+00   7.000000


In [497]:
spamData = os.path.join('./data/', 'spambase.data')
spamNames = ['make', 'address', 'all', '3d', 'our', 'over', 'remove',
	'internet', 'order', 'mail', 'receive', 'will', 'people', 'report',
	'addresses', 'free', 'business', 'email', 'you', 'credit', 'your', 'font',
	'0', 'money', 'hp', 'hpl', 'george', '650', 'lab', 'labs', 'telnet', '857',
	'data', '415', '85', 'technology', '1999', 'parts', 'pm', 'direct', 'cs',
	'meeting', 'original', 'project', 're', 'edu', 'table', 'conference',
	'semicolon', 'paren', 'bracket', 'exclaim', 'dollar', 'pound', 'capsAvg',
	'capsMax', 'capsTotal', 'class']
#raw = pd.read_csv(spamData , names=spamNames)  # read CSV file

#spamFeats = spamNames[:-1] # list of feature names
#spamMat = raw[spamFeats].values # 2d-array of feature values
#spamK = len(raw['class'].unique()) # number of classes

spamMat,spamFeats,spamK,spamMeans = prepData(spamData,spamNames,slice(-1))

In [705]:
spamOut = ForwardSelect(spamMat, spamK, trace=True) # run algorithm
printResults(spamOut, spamFeats)

[01] Best coeff=0.842031, set:[56]
[02] Best coeff=0.847709, set:[56 55]
[03] Best coeff=0.847709, set:[56 55]
Features and cluster centroids
                     0            1
capsTotal   178.854717  2148.131147
capsMax      32.760845   398.803279
Counts     4357.000000   244.000000


In [746]:
################################################################################
def evalFitness(dataMat, k, pop, preEval, dist): # eval fitness of individuals
    fitness = np.empty(pop.shape[0]) # store fitness of individuals
    for n,indv in enumerate(pop): # loop over populations one by one
        gene = ''.join(['1' if x else '0' for x in indv]) # string repr of DNA
        if gene in preEval: # combo of features previously evaluated
            fitness[n] = preEval[gene] # recall from dict
        else: # never evaluated before
            means,labels = kMeans(dataMat[:,indv], k) # cluster w/ features
            fitness[n] = Silhouette(dataMat,labels,dist).mean()+1 # fit > 0
            preEval[gene] = fitness[n] # store into dict for memoization
    return fitness,preEval

def crossOver(pop, parentIdx):
    popN = pop.shape[0]
    idxDad = parentIdx[:len(parentIdx)//2] # first half of selected
    idxMom = parentIdx[len(parentIdx)//2:] # second half of selected
    breakPts = np.random.randint(1,pop.shape[1],popN//2) # x-over points
    
    out = np.empty(pop.shape, bool) # pre-allocate array for next gen
    for n,(d,m) in enumerate(zip(idxDad,idxMom)): # loop over parents and cross
        out[n] = np.hstack([ pop[d,:breakPts[n]],pop[m,breakPts[n]:] ])
        out[popN-n-1] = np.hstack([ pop[m,:breakPts[n]],pop[d,breakPts[n]:] ])
    out = minOneFeature(out)
    return out

def selectParents(fitness, popSize):
    cumFit = np.cumsum((fitness+1)**3)
    probVec = cumFit/cumFit[-1] # cum array of normalized fitness
    rands = np.random.rand(popSize) # uniform random between 0,1
    outInd = np.searchsorted(probVec, rands) # higher prob of select high fitness
    return outInd

def mutate(pop, prob):
    toMutate = np.where(np.random.rand(pop.shape[0])<prob)[0] # idx of pop to mutate
    mutatePts = np.random.randint(0,pop.shape[1],len(toMutate)) # where to mutate
    for idx,n in zip(toMutate,mutatePts): # mutate selected individuals
        pop[idx,n] = ~pop[idx,n] # flip the selection bit
    pop = minOneFeature(pop) # all individuals must have 1 feature chosen
    return pop

def minOneFeature(pop): 
    noFeatIdx = pop.sum(axis=1)==0 # data pts with no features selected
    for n in np.where(noFeatIdx)[0]: # loop over all data with no features
        pop[n, np.random.randint(pop.shape[1])] = True # randomly select 1
    return pop
    
################################################################################
def geneticAlgoSelect(data, k, prm, trace=False):
    pop = np.random.rand(prm['popSize'],data.shape[1]) < prm['onProb']
    pop = minOneFeature(pop) # at least 1 feature must be selected
    memo = dict() # dict of result for memoization
    dMat = pairwiseDist(data) # pre-calc distance matrix for memoization
    
    baseFit = 0 # worst possible fitenss score
    converged,gen,stagnGens = False,1,0 # initialize loop vars
    while not converged: # loop until GA has converged
        #print(np.asanyarray(pop,int))
        fit,memo = evalFitness(data, k, pop, memo, dMat) # evaluate fitness
        bestIdx = np.argmax(fit) # keep track of best indiviaul
        bestFit,bestIndv = fit[bestIdx],pop[bestIdx] # best fit and features
        #print((bestFit,np.where(bestIndv)[0]))

        if (bestFit-baseFit < prm['minImprove']) and stagnGens>prm['stagnLim']:
            converged = True
            out = baseFit-1,np.where(baseIndv)[0] # silhouette coeff and list
        else: # not converged, selection + crossover + mutation
            if (bestFit-baseFit < prm['minImprove']):
                stagnGens += 1
            else:
                baseFit,baseIndv = bestFit,bestIndv # record long-run best
            parentInd = selectParents(fit, pop.shape[0]) # select parents
            pop = crossOver(pop, parentInd) # cross-over to get next gen
            pop = mutate(pop,prm['mutateProb']) # mutate

        if trace:
            print('Generation %d: best fitness = %.10f'%(gen,bestFit))
            print('\tBest set: %s'%str(np.where(bestIndv)[0]))
        gen += 1
    return out

In [759]:
################################################################################
glassPrm = {'popSize':50, 'minImprove':0.01, 'mutateProb':0.1, 'onProb':0.1,
          'stagnLim': 3}
out = geneticAlgoSelect(glassMat, glassK, glassPrm, trace=True)
print("Best coeff: %f"%out[0])
out2 = kMeans(glassMat[:,out[1]], glassK)
printResults((out[1],out2[0],out2[1]), glassFeats)

Generation 1: best fitness = 1.3957205447
	Best set: [1 6 8]
Generation 2: best fitness = 1.3504467709
	Best set: [2 5]
Generation 3: best fitness = 1.3504467709
	Best set: [2 5]
Generation 4: best fitness = 1.3504467709
	Best set: [2 5]
Generation 5: best fitness = 1.3957205447
	Best set: [1 6]
Generation 6: best fitness = 1.4304823898
	Best set: [3 5 6 7 8]
Generation 7: best fitness = 1.4304823898
	Best set: [3 5 6 7]
Best coeff: 0.430482
Features and cluster centroids
                 0          1             2       3             4          5
Al        1.349259   2.301667  1.103611e+00   1.898  2.568333e+00   1.278889
K         0.555926   0.137778  2.052778e-01   0.031  3.121667e+00   0.267778
Ca        8.408741   9.082778  1.029333e+01   8.516  6.241667e+00  13.883333
Ba        0.010667   0.501111  2.077491e-15   1.595  1.316667e+00   0.350000
Fe        0.061407   0.051111  5.222222e-02   0.015  2.564399e-10   0.106667
Counts  135.000000  18.000000  3.600000e+01  10.000  6.000000

In [761]:
################################################################################
spamPrm = {'popSize':100, 'minImprove':0.01, 'mutateProb':0.05, 'onProb':0.05,
          'stagnLim': 3}
geneticAlgoSelect(spamMat, spamK, spamPrm, trace=True)

Generation 1: best fitness = 1.8420312611
	Best set: [23 27 29 42 46 56]
Generation 2: best fitness = 1.8420312611
	Best set: [ 3 32 56]
Generation 3: best fitness = 1.8477090602
	Best set: [21 27 48 55 56]
Generation 4: best fitness = 1.8477090602
	Best set: [ 6 18 19 21 27 48 55 56]
Generation 5: best fitness = 1.8420312611
	Best set: [ 2 27 56]
Generation 6: best fitness = 1.8477090602
	Best set: [55 56]


(0.8420312611053309, array([23, 27, 29, 42, 46, 56], dtype=int64))

In [721]:
testY_bad = np.random.randint(1,3,15)
print(testY_bad)
print(Silhouette(testX,testY_bad))

[2 2 2 1 1 1 2 1 2 1 2 1 1 2 2]
[-0.1704883  -0.14702439 -0.1482036   0.21342786  0.20650958  0.22240796
 -0.15010846  0.21393639 -0.17083812  0.22790979  0.14201912 -0.11624433
 -0.1109911   0.13400671  0.1363605 ]


In [695]:
testX = np.array([[1,1,1],[10,10,10],[11,11,11],[11,10,11],
                  [2,2,1],[1.5,1.5,1.5],[12,12,12]]) + np.random.random([7,3])/10
testY = np.array([1,2,2,2,1,1,2])
testY_bad = np.array([2,2,1,2,1,2,1])
print(testX)

print(Silhouette(testX,testY))
print(Silhouette(testX,testY_bad))

[[ 1.09358709  1.03347603  1.0959687 ]
 [10.01969228 10.06036641 10.00798611]
 [11.03935893 11.06572294 11.04024076]
 [11.0749028  10.05744201 11.00251991]
 [ 2.04587169  2.01376558  1.02323262]
 [ 1.54765144  1.59813284  1.55391589]
 [12.01736099 12.07408518 12.0918329 ]]
[0.95669206 0.88681422 0.93194897 0.92324479 0.95426446 0.96528731
 0.89463319]
[ 0.3415154  -0.16742914  0.34283323 -0.24885527 -0.29067762  0.33595025
  0.39107926]


In [732]:
print(np.empty([1000,50]))

[[3.17723658e-312 3.17723102e-312 4.32700440e+006 ... 7.20032292e+003
  8.18615233e+006 5.17169939e+005]
 [2.12735983e+004 2.11072850e-002 3.26206935e+005 ... 1.86151549e+003
  1.47507910e+002 4.20613426e+003]
 [2.21577266e+004 1.28018552e+004 1.27361870e+004 ... 6.75922867e+005
  3.11974936e+003 2.29007389e+003]
 ...
 [0.00000000e+000 0.00000000e+000 0.00000000e+000 ... 0.00000000e+000
  0.00000000e+000 0.00000000e+000]
 [0.00000000e+000 0.00000000e+000 0.00000000e+000 ... 0.00000000e+000
  0.00000000e+000 0.00000000e+000]
 [0.00000000e+000 0.00000000e+000 0.00000000e+000 ... 0.00000000e+000
  0.00000000e+000 0.00000000e+000]]


## Sources

- [A Modified k-means Algorithm to Avoid Empty Clusters](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.379.3148&rep=rep1&type=pdf)