In [1]:
import os
import numpy as np, pandas as pd
from kmeans import kMeans
from geneticAlgorithm import geneticAlgoSelect
from functions import Silhouette, pairwiseDist,prepData,printResults

## Demo of Silhouette Coefficients 
Good grouping (labels match the two generated clusters):

In [2]:
# Build a toy data set made of two well-separated blocks of rows and score the
# labelling that matches those blocks.
# A private, seeded generator keeps the demo reproducible on Restart & Run All
# and leaves NumPy's global random state untouched.
demoRng = np.random.RandomState(0)
testX = np.vstack((demoRng.random_sample((10, 10))*2 + 10,  # center around 11
                   demoRng.random_sample((5, 10))*2 + 1))    # center around 2
testY_good = np.array([1]*10 + [2]*5)  # first 10 rows -> group 1, last 5 -> group 2
print(testX)
print(testY_good)
print(Silhouette(testX, testY_good))

[[10.96005431 10.05623629 10.43671211 10.99260141 10.33610321 11.03506016
  10.10151861 11.90763509 10.15305178 11.71353649]
 [11.90987118 10.38196872 10.77858371 10.56866381 10.18197088 10.86943699
  11.91679383 10.73463274 11.82252443 11.06522282]
 [10.59487343 11.5968444  10.72679695 10.74766561 11.94535337 11.04809606
  10.93591857 11.47281318 10.21192972 10.37238787]
 [10.40401873 10.77623294 11.92175193 11.66430007 10.85698505 11.63040303
  10.18478435 10.11952904 10.33676399 11.21437842]
 [11.85013128 11.66352128 10.50835579 11.80708931 10.89600354 11.29491977
  10.32827046 10.58460029 11.5537624  10.26024315]
 [10.93774379 11.93994413 10.08866117 10.02508459 10.70410744 11.53241552
  10.53947126 11.94888381 11.15629039 11.88551583]
 [11.72296169 10.7313729  10.04011087 11.23620166 10.31198613 10.0838416
  10.70265162 10.33778304 10.80801518 10.44802794]
 [11.13619376 10.51792928 10.5045847  10.29171729 10.47167603 10.91079723
  10.47953035 10.19702372 11.48334779 11.33116233]
 

Bad grouping (labels assigned at random):

In [3]:
# Label every row of testX with group 1 or 2 at random and score that labelling.
# The generator is seeded so the same "bad" labelling is produced on every run,
# and the label count follows testX instead of a hard-coded 15.
badRng = np.random.RandomState(1)
testY_bad = badRng.randint(1, 3, testX.shape[0])  # randomly generate groups (upper bound is exclusive)
print(testY_bad)
print(Silhouette(testX, testY_bad))

[2 1 1 2 1 1 1 1 2 1 2 2 2 2 2]
[-0.84542212  0.88812992  0.8738713  -0.841805    0.89037426  0.87171431
  0.88954117  0.89778848 -0.85210595  0.89944284  0.58093422  0.58749991
  0.57985857  0.58623764  0.5908527 ]


## Demo of K-Means

In [4]:
# Cluster the toy data into 2 groups; the returned value is shown as the cell output.
# NOTE(review): the positional True presumably turns on the per-iteration centroid
# printout seen in the recorded output, and initAlgo=2 presumably picks an
# initialisation scheme -- confirm both against the kMeans signature.
kMeans(testX, 2, True, initAlgo=2)

[[0.37454012 0.95071431 0.73199394 0.59865848 0.15601864 0.15599452
  0.05808361 0.86617615 0.60111501 0.70807258]
 [0.02058449 0.96990985 0.83244264 0.21233911 0.18182497 0.18340451
  0.30424224 0.52475643 0.43194502 0.29122914]]
[[7.74125175 7.53787338 7.41850737 7.41174834 7.37784304 7.48342282
  7.27479027 7.42467179 7.35491852 7.44458809]
 [0.02058449 0.96990985 0.83244264 0.21233911 0.18182497 0.18340451
  0.30424224 0.52475643 0.43194502 0.29122914]]
[[10.86036934 10.62622164 10.52302646 10.47479328 10.49990651 10.68328611
  10.36025002 10.46606588 10.70202208 10.61519884]
 [ 1.96387692  1.87910081  1.74363048  1.7317794   1.65836108  1.62157512
   1.6591405   1.79187933  1.19036699  1.5623278 ]]
[[11.14392548 10.90698058 10.80525547 10.75325191 10.78373046 10.97418278
  10.64074637 10.74255625 11.00630422 10.90343618]
 [ 2.28775899  2.03063263  1.89549512  1.98501945  1.90445043  1.86127022
   1.88495688  2.00306648  1.31677065  1.77417758]]
[[11.16970331 10.93250412 10.8309126

(array([[11.17228108, 10.93505646, 10.83347836, 10.78109776, 10.81211284,
         11.00327243, 10.66879598, 10.77020528, 11.03673242, 10.9322599 ],
        [ 2.35253401,  2.06093835,  1.9258674 ,  2.03566637,  1.95366725,
          1.90920822,  1.93011918,  2.04530301,  1.34205084,  1.81654662]]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], dtype=int64))

## Genetic Algorithm Selection Demo

In [5]:
## Load data
# Column names of the glass file, in file order.
glassNames = [
    'id', 'RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'class',
]
glassData = os.path.join('./data/', 'glass.data')

# slice(1, -1) covers every name except the first ('id') and the last ('class').
# Presumably these are the columns prepData keeps as features -- confirm
# against functions.prepData.
glassMat, glassFeats, glassK, glassMeans = prepData(
    glassData, glassNames, slice(1, -1)
)

In [6]:
# Settings handed to geneticAlgoSelect for the glass data.
# (The meaning of each key is defined in the geneticAlgorithm module.)
glassPrm = {
    'popSize': 50,
    'minImprove': 0.01,
    'mutateProb': 0.1,
    'onProb': 0.1,
    'stagnLim': 3,
}

# Run the selection; element 0 is reported as the best coefficient and
# element 1 is used below to index the columns of glassMat.
out = geneticAlgoSelect(glassMat, glassK, glassPrm, trace=True)
glassBestCoeff = out[0]
glassBestCols = out[1]
print("Best coeff: %f" % glassBestCoeff)

# Re-cluster on the selected columns only and print the summary table.
out2 = kMeans(glassMat[:, glassBestCols], glassK)
printResults((glassBestCols, out2[0], out2[1]), glassFeats)

Generation 1: best fitness = 1.4200162806
	Best set: [1 4 5 6]
Generation 2: best fitness = 1.4200162806
	Best set: [1 4 5 6]
Generation 3: best fitness = 1.4200162806
	Best set: [1 4 5 6]
Generation 4: best fitness = 1.4200162806
	Best set: [1 4 5 6]
Generation 5: best fitness = 1.4200162806
	Best set: [1 4 5 6]
Generation 6: best fitness = 1.4595424823
	Best set: [1 2 4 5 6]
Generation 7: best fitness = 1.4595424823
	Best set: [1 2 4 5 6]
Best coeff: 0.459542
Features and cluster centroids
                 0          1          2          3          4             5
Na       13.112562  13.133529  13.861579  14.674615  13.608001  1.186714e+01
Mg        3.505702   0.572941   3.345263   0.165385   1.828001  1.401454e-09
Si       72.837355  73.068235  71.811053  73.313846  71.026001  7.167286e+01
K         0.567686   0.501765   0.216579   0.070769   3.463998  2.514286e-01
Ca        8.384711  11.005294   9.496579   8.580385   6.196001  1.431571e+01
Counts  121.000000  17.000000  38.000000 

In [10]:
# Column names of the spambase file, in file order.
spamNames = [
    'make', 'address', 'all', '3d', 'our', 'over', 'remove', 'internet',
    'order', 'mail', 'receive', 'will', 'people', 'report', 'addresses',
    'free', 'business', 'email', 'you', 'credit', 'your', 'font', '0',
    'money', 'hp', 'hpl', 'george', '650', 'lab', 'labs', 'telnet', '857',
    'data', '415', '85', 'technology', '1999', 'parts', 'pm', 'direct', 'cs',
    'meeting', 'original', 'project', 're', 'edu', 'table', 'conference',
    'semicolon', 'paren', 'bracket', 'exclaim', 'dollar', 'pound',
    'capsAvg', 'capsMax', 'capsTotal', 'class',
]
spamData = os.path.join('./data/', 'spambase.data')

# slice(None, -4) covers every name except the last four
# ('capsAvg', 'capsMax', 'capsTotal', 'class').
# Presumably these are the columns prepData keeps as features -- confirm
# against functions.prepData.
spamMat, spamFeats, spamK, spamMeans = prepData(
    spamData, spamNames, slice(None, -4)
)

In [12]:
# Settings handed to geneticAlgoSelect for the spam data.
# (The meaning of each key is defined in the geneticAlgorithm module.)
spamPrm = {
    'popSize': 200,
    'minImprove': 0.01,
    'mutateProb': 0.05,
    'onProb': 0.10,
    'stagnLim': 3,
}

# Run the selection; element 0 is reported as the best coefficient and
# element 1 is used below to index the columns of spamMat.
out = geneticAlgoSelect(spamMat, spamK, spamPrm, trace=True)
spamBestCoeff = out[0]
spamBestCols = out[1]
print("Best coeff: %f" % spamBestCoeff)

# Re-cluster on the selected columns only and print the summary table.
out2 = kMeans(spamMat[:, spamBestCols], spamK)
printResults((spamBestCols, out2[0], out2[1]), spamFeats)

Generation 1: best fitness = 1.8106858311
	Best set: [3 5 8]
Generation 2: best fitness = 1.8106858311
	Best set: [3 5 8]
Generation 3: best fitness = 1.8106858311
	Best set: [3 5 8]
Generation 4: best fitness = 1.8106858311
	Best set: [3 5 8]
Generation 5: best fitness = 1.8106858311
	Best set: [3 5 8]
Generation 6: best fitness = 1.8106858311
	Best set: [3 5 8]
Best coeff: 0.810686
Features and cluster centroids
                   0            1
3d      3.364000e+01     0.014266
over    6.269728e-10     0.096047
order   1.268288e-09     0.090205
Counts  7.000000e+00  4594.000000
